# Warm-up: numpy

Before introducing PyTorch, we will first implement the network using numpy.

In [1]:
import numpy as np

In [2]:
N, D_in, H, D_out = 64,1000,100,10 

In [3]:
x = np.random.randn(N,D_in)
y = np.random.randn(N,D_out)

In [5]:
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [6]:
learning_rate = 1e-6

for t in range(500):
    #Forward 
    h = x.dot(w1)
    h_relu = np.maximum(h,0)
    y_pred = h_relu.dot(w2)
    
    #compute and print loss
    loss = np.square(y_pred-y).sum()
    print(t,loss)
    
    #Backprop to compute gradients of w1 and w2 with respect to Loss
    grad_y_pred = 2.0*(y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 27963076.44068835
1 21295043.393260494
2 18053064.44767566
3 15619214.911756258
4 13069616.917102002
5 10322822.213021966
6 7681754.305795101
7 5470240.271312732
8 3802709.490344409
9 2641924.4887740067
10 1864613.3518792375
11 1354448.272367428
12 1017231.4761303323
13 790465.6800270253
14 633324.796506019
15 520460.8775139504
16 436476.81199878186
17 371895.72801946383
18 320786.81277967955
19 279317.288994082
20 244992.79799326623
21 216193.99591007526
22 191749.70040833316
23 170799.1557093635
24 152673.6213893742
25 136894.40931602224
26 123088.27600840185
27 110957.66859662782
28 100260.3855789992
29 90782.4869346497
30 82371.55843909417
31 74882.54359412617
32 68197.21116215376
33 62209.49478698279
34 56836.20878798122
35 52003.63034218796
36 47647.95778745902
37 43720.92429631528
38 40169.25546203662
39 36950.473139365524
40 34024.92041949415
41 31360.565399876854
42 28935.26041379159
43 26723.511692057735
44 24704.642097925112
45 22858.79164648919
46 21170.028501574292
47 19

374 0.0012667516712172927
375 0.0012113895614375792
376 0.0011584438787927789
377 0.0011078198921144829
378 0.001059422787392268
379 0.001013149062496838
380 0.0009688984864327648
381 0.0009265891554999595
382 0.0008861327447200104
383 0.0008474515934231908
384 0.0008104605749050182
385 0.0007750971749043304
386 0.0007412725374766464
387 0.000708935424246791
388 0.0006780208634573125
389 0.0006484527346310956
390 0.0006201723388933785
391 0.0005931337435953834
392 0.0005672788360688414
393 0.0005425505609081405
394 0.0005189031839001013
395 0.000496288850789555
396 0.0004746662420181725
397 0.00045399340086603836
398 0.0004342196541372254
399 0.0004153086074167077
400 0.0003972222311369184
401 0.0003799290570318187
402 0.0003633872624975735
403 0.00034756970807266245
404 0.00033244695354453394
405 0.0003179831502660421
406 0.0003041468475243659
407 0.00029091457911512724
408 0.00027825993572792266
409 0.00026615799673961526
410 0.00025458621081850036
411 0.00024351656349009038
412 0.00

# PyTorch: Tensors

Here we introduce the most fundamental PyTorch concept: the Tensor. A PyTorch Tensor is conceptually identical to a numpy array: a Tensor is an n-dimensional array, and PyTorch provides many functions for operating on these Tensors. 

In [7]:
import torch

In [12]:
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)


learning_rate = 1e-6
for t in range(500):
    #Forward
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    
    # Compute and print Loss
    loss = (y_pred - y).pow(2).sum().item()
    print(t,loss)
    
    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 37614376.0
1 36631964.0
2 40496824.0
3 40320832.0
4 31555508.0
5 18433702.0
6 8694897.0
7 3983026.75
8 2101296.25
9 1340042.0
10 980937.3125
11 773426.875
12 632317.4375
13 526568.3125
14 443362.0
15 376240.03125
16 321324.71875
17 275927.09375
18 238082.71875
19 206365.90625
20 179592.828125
21 156905.453125
22 137618.15625
23 121106.546875
24 106922.0859375
25 94698.7578125
26 84131.921875
27 74957.2578125
28 66977.125
29 59998.69140625
30 53881.515625
31 48511.46875
32 43776.1953125
33 39594.59375
34 35888.34765625
35 32590.40234375
36 29650.46875
37 27024.115234375
38 24676.228515625
39 22570.77734375
40 20678.361328125
41 18973.443359375
42 17434.51171875
43 16041.84375
44 14780.11328125
45 13634.8486328125
46 12593.4794921875
47 11644.7529296875
48 10779.734375
49 9989.7001953125
50 9266.5771484375
51 8604.6220703125
52 7997.03076171875
53 7439.16064453125
54 6926.1337890625
55 6453.70556640625
56 6018.27734375
57 5616.18994140625
58 5244.798828125
59 4901.39892578125
60 4583.5

450 0.0002422257384750992
451 0.00023728080850560218
452 0.00023228385543916374
453 0.00022770007490180433
454 0.0002231158723589033
455 0.00021885379101149738
456 0.00021418332471512258
457 0.0002106172905769199
458 0.00020584628509823233
459 0.00020142375433351845
460 0.0001981275709113106
461 0.0001941457885550335
462 0.00019064315711148083
463 0.00018674248713068664
464 0.00018327400903217494
465 0.00017957518866751343
466 0.00017601023137103766
467 0.00017248981748707592
468 0.0001695102546364069
469 0.0001666831667535007
470 0.0001635611552046612
471 0.00016082463844213635
472 0.00015765588614158332
473 0.000155398971401155
474 0.0001522754319012165
475 0.00014910902245901525
476 0.00014676649880129844
477 0.00014420313527807593
478 0.00014168121560942382
479 0.0001393060083501041
480 0.0001370741956634447
481 0.00013495745952241123
482 0.0001321775489486754
483 0.00012990206596441567
484 0.00012827209138777107
485 0.00012577755842357874
486 0.00012374458310659975
487 0.000121986

# PyTorch: Tensors and autograd

In the above examples, we had to manually implement both the forward and backward passes of our neural network. Manually implementing the backward pass is not a big deal for a small two-layer network, but can quickly get very hairy for large complex networks.

In [13]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the a scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())
    
     # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()
    
    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()


0 33290020.0
1 26750886.0
2 21619474.0
3 16251302.0
4 11251258.0
5 7339624.0
6 4709411.0
7 3078007.25
8 2100011.75
9 1506487.625
10 1133901.625
11 887679.875
12 716231.5625
13 590875.75
14 495405.90625
15 420341.125
16 359920.875
17 310401.90625
18 269225.21875
19 234628.375
20 205356.09375
21 180396.0
22 158992.578125
23 140560.265625
24 124613.0625
25 110754.6640625
26 98672.3671875
27 88099.046875
28 78822.9296875
29 70672.84375
30 63483.8046875
31 57126.2890625
32 51488.234375
33 46478.5390625
34 42018.0
35 38041.8203125
36 34491.1484375
37 31313.583984375
38 28460.591796875
39 25897.34375
40 23595.787109375
41 21526.9609375
42 19658.4921875
43 17969.3984375
44 16440.484375
45 15054.7021484375
46 13797.44140625
47 12655.3818359375
48 11616.236328125
49 10670.130859375
50 9807.6220703125
51 9021.369140625
52 8303.0302734375
53 7646.39404296875
54 7045.87255859375
55 6495.876953125
56 5992.25244140625
57 5531.7275390625
58 5109.615234375
59 4722.234375
60 4366.38427734375
61 4039.230

405 7.094546890584752e-05
406 6.98403746355325e-05
407 6.840649439254776e-05
408 6.734942144248635e-05
409 6.643315282417461e-05
410 6.530217797262594e-05
411 6.408782064681873e-05
412 6.268858123803511e-05
413 6.157578900456429e-05
414 6.027942436048761e-05
415 5.9168542065890506e-05
416 5.834144030814059e-05
417 5.743013389292173e-05
418 5.6456890888512135e-05
419 5.565127867157571e-05
420 5.474650606629439e-05
421 5.3887066314928234e-05
422 5.287804378895089e-05
423 5.184760084375739e-05
424 5.1150207582395524e-05
425 5.02779075759463e-05
426 4.933922173222527e-05
427 4.8730475100455806e-05
428 4.8197965952567756e-05
429 4.7380930482177064e-05
430 4.678289042203687e-05
431 4.5951524953125045e-05
432 4.5149459765525535e-05
433 4.438994437805377e-05
434 4.3853888200828806e-05
435 4.3153846490895376e-05
436 4.228567922837101e-05
437 4.1648061596788466e-05
438 4.1076888010138646e-05
439 4.05149366997648e-05
440 4.003062713309191e-05
441 3.942866896977648e-05
442 3.888002174790017e-05
44

# PyTorch: nn

 The nn package defines a set of Modules, which are roughly equivalent to neural network layers

In [14]:
import torch

In [15]:
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

In [16]:
# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

In [17]:
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

loss_fn = torch.nn.MSELoss(reduction='sum')

In [18]:
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 663.844970703125
1 663.3399047851562
2 662.8355712890625
3 662.331787109375
4 661.82861328125
5 661.3262939453125
6 660.824951171875
7 660.3244018554688
8 659.824462890625
9 659.3251342773438
10 658.8271484375
11 658.3298950195312
12 657.8335571289062
13 657.3380126953125
14 656.8430786132812
15 656.3487548828125
16 655.8550415039062
17 655.3618774414062
18 654.869384765625
19 654.3775024414062
20 653.8861083984375
21 653.3953247070312
22 652.9054565429688
23 652.4161376953125
24 651.92724609375
25 651.4389038085938
26 650.9511108398438
27 650.4638671875
28 649.9771728515625
29 649.490966796875
30 649.0057983398438
31 648.521240234375
32 648.0372924804688
33 647.553955078125
34 647.071533203125
35 646.5897827148438
36 646.1085205078125
37 645.6280517578125
38 645.1484985351562
39 644.6697387695312
40 644.1917114257812
41 643.7141723632812
42 643.2371826171875
43 642.7606811523438
44 642.2849731445312
45 641.809814453125
46 641.335205078125
47 640.8612670898438
48 640.388427734375
49 

401 506.767333984375
402 506.4620666503906
403 506.1570129394531
404 505.85198974609375
405 505.5472412109375
406 505.2431640625
407 504.9395751953125
408 504.63616943359375
409 504.3331298828125
410 504.0303039550781
411 503.727783203125
412 503.4255065917969
413 503.1233215332031
414 502.8216857910156
415 502.5202331542969
416 502.21881103515625
417 501.9179382324219
418 501.6173400878906
419 501.3170166015625
420 501.01690673828125
421 500.717041015625
422 500.4173583984375
423 500.11785888671875
424 499.81890869140625
425 499.520263671875
426 499.2218017578125
427 498.9236755371094
428 498.6261291503906
429 498.328857421875
430 498.0318298339844
431 497.7350769042969
432 497.43853759765625
433 497.1420593261719
434 496.84588623046875
435 496.5499267578125
436 496.254150390625
437 495.95880126953125
438 495.6636962890625
439 495.3686218261719
440 495.0740661621094
441 494.7796936035156
442 494.4857177734375
443 494.1917419433594
444 493.8982849121094
445 493.6054992675781
446 493.31

# PyTorch: optim

The optim package in PyTorch abstracts the idea of an optimization algorithm and provides implementations of commonly used optimization algorithms.

In [20]:
# -*- coding: utf-8 -*-
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algoriths. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

for t in range(500):
    # Forward
    y_pred = model(x)
    
    #Compute and print Loss
    loss = loss_fn(y_pred,y)
    print(t,loss.item())
    
    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()
    
    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()
    
    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 706.3470458984375
1 689.0985107421875
2 672.3837890625
3 656.1159057617188
4 640.2396240234375
5 624.8063354492188
6 609.8533325195312
7 595.4679565429688
8 581.5108642578125
9 567.9423828125
10 554.7247314453125
11 541.8636474609375
12 529.3876953125
13 517.32861328125
14 505.6119689941406
15 494.2054138183594
16 483.1181640625
17 472.3131103515625
18 461.7609558105469
19 451.47198486328125
20 441.4869079589844
21 431.76849365234375
22 422.33380126953125
23 413.1043395996094
24 404.0680236816406
25 395.23504638671875
26 386.58135986328125
27 378.12799072265625
28 369.8689270019531
29 361.83770751953125
30 354.0144348144531
31 346.3866271972656
32 338.9457092285156
33 331.6524658203125
34 324.5095520019531
35 317.54412841796875
36 310.7486877441406
37 304.0796813964844
38 297.5653991699219
39 291.22125244140625
40 285.0050964355469
41 278.8966369628906
42 272.8809814453125
43 266.9606628417969
44 261.1448669433594
45 255.44061279296875
46 249.85491943359375
47 244.3641357421875
48 23

419 3.000811830133898e-06
420 2.7999492431263207e-06
421 2.610873025332694e-06
422 2.4350747480639257e-06
423 2.2707351945427945e-06
424 2.116807081620209e-06
425 1.9732117380044656e-06
426 1.8386999727226794e-06
427 1.7141485386673594e-06
428 1.596827701177972e-06
429 1.4871113762637833e-06
430 1.385473524351255e-06
431 1.2900951560368412e-06
432 1.2015558468192467e-06
433 1.1185667290192214e-06
434 1.0415064934932161e-06
435 9.691631248642807e-07
436 9.019134381560434e-07
437 8.386313083974528e-07
438 7.801532433404645e-07
439 7.258535674736777e-07
440 6.750322540938214e-07
441 6.27590736712591e-07
442 5.83619907956745e-07
443 5.428063332146849e-07
444 5.042499537921685e-07
445 4.6907899786674534e-07
446 4.3546737060751184e-07
447 4.042682633098593e-07
448 3.7533675367740216e-07
449 3.486624109427794e-07
450 3.2366878599532356e-07
451 3.006008171269059e-07
452 2.7866070695381495e-07
453 2.588507186374045e-07
454 2.4013107235987263e-07
455 2.2272347166563122e-07
456 2.0683624768480513

# PyTorch: Custom nn Modules

Sometimes you will want to specify models that are more complex than a sequence of existing Modules; for these cases you can define your own Modules by subclassing nn.Module and defining a forward which receives input Tensors and produces output Tensors using other modules or other autograd operations on Tensors.

In [26]:
import torch.nn as nn
import torch.nn.functional as F

class TwoLayerNet(nn.Module):
    def __init__(self,D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = nn.Linear(D_in, H)
        self.linear2 = nn.Linear(H, D_out)
        
    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        h_relu = self.linear1(x).clamp(0)
        y_pred = self.linear2(h_relu)
        return y_pred
    
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)


# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

criterion = torch.nn.MSELoss(reduction="sum")

optimizer = torch.optim.SGD(model.parameters(),lr=1e-4)

for t in range(500):
    #Forward
    y_pred = model(x)
    
    #Computer and print loss
    loss = criterion(y_pred,y)
    print(t,loss.item())
    
    # Zero grad, Backward, update params
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 647.6163940429688
1 599.6649780273438
2 558.7977905273438
3 523.4417724609375
4 492.290283203125
5 464.2566223144531
6 438.56536865234375
7 414.883544921875
8 392.98016357421875
9 372.5049133300781
10 353.401611328125
11 335.5721740722656
12 318.6746826171875
13 302.54278564453125
14 287.2809143066406
15 272.8375549316406
16 259.0872802734375
17 245.92727661132812
18 233.35009765625
19 221.31410217285156
20 209.7565155029297
21 198.70050048828125
22 188.1429901123047
23 178.0684051513672
24 168.403564453125
25 159.15855407714844
26 150.32565307617188
27 141.90419006347656
28 133.89552307128906
29 126.27449035644531
30 119.03556823730469
31 112.18098449707031
32 105.66665649414062
33 99.49286651611328
34 93.65101623535156
35 88.12578582763672
36 82.91619873046875
37 77.9955062866211
38 73.35179901123047
39 68.96204376220703
40 64.82278442382812
41 60.915897369384766
42 57.24262237548828
43 53.793983459472656
44 50.549537658691406
45 47.500633239746094
46 44.63739776611328
47 41.952007

389 2.6828134650713764e-05
390 2.595947080408223e-05
391 2.5120094505837187e-05
392 2.4308244974236004e-05
393 2.352200863242615e-05
394 2.276069972140249e-05
395 2.2027425075066276e-05
396 2.1316549464245327e-05
397 2.0628373022191226e-05
398 1.996512037294451e-05
399 1.9321212676004507e-05
400 1.870268170023337e-05
401 1.8101258319802582e-05
402 1.752090065565426e-05
403 1.6959815184236504e-05
404 1.6415917343692854e-05
405 1.5890964277787134e-05
406 1.5382052879431285e-05
407 1.4891335013089702e-05
408 1.4416178601095453e-05
409 1.3955590475234203e-05
410 1.3512461009668186e-05
411 1.3081921679258812e-05
412 1.266556773771299e-05
413 1.2264693395991344e-05
414 1.1873818039020989e-05
415 1.149763738794718e-05
416 1.1132770850963425e-05
417 1.0780794582387898e-05
418 1.0440019650559407e-05
419 1.0108369679073803e-05
420 9.78919069893891e-06
421 9.481081178819295e-06
422 9.18105160963023e-06
423 8.891818652045913e-06
424 8.612175406597089e-06
425 8.34150705486536e-06
426 8.079071449174

# why super?

super() lets you avoid referring to the base class explicitly, which can be nice. But the main advantage comes with multiple inheritance, where all sorts of fun stuff can happen. See the standard docs on super if you haven't already.

example:

In [30]:
class Foo(object):
     def __init__(self, frob, frotz):
          self.frobnicate = frob
          self.frotz = frotz

class Bar(Foo):
     def __init__(self, frob, frizzle):
          super(Bar, self).__init__(frob, frizzle)
          self.frotz = 34
          self.frazzle = frizzle


bar = Bar(1,2)
print ("frobnicate:", bar.frobnicate)
print ("frotz:", bar.frotz)
print ("frazzle:", bar.frazzle)

frobnicate: 1
frotz: 34
frazzle: 2


You can see that super is calling the base class (the class that the current class inherits), followed by an access modifier, accessing the base class' .__init__() method. It's like self, but for the base class.