# <font color = 'pickle'> **Import Libraries**

In [1]:
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split

# <font color = 'pickle'>**Understanding Dropout, Batchnorm1d, model.state_dict**



## <font color = 'pickle'>**Dropout**

<img src ="https://drive.google.com/uc?export=view&id=1f7KmsmF1TXZFUNOJpWBH2P4WkawnVH3Z" width =500>


In [2]:
# With p=0 nothing is ever dropped, so the layer hands its input back unchanged.
inp = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])
model = nn.Dropout(p=0)
output = model(inp)

# Show input and output side by side; they should match element for element.
print(inp)
print(output)

tensor([1., 2., 3., 4., 5.])
tensor([1., 2., 3., 4., 5.])


In [None]:
# Rescale factor 1 / (1 - p) that Dropout applies to the kept elements in training mode, here for p = 0.4
1/0.6

1.6666666666666667

### <font color = 'pickle'>**Dropout with model.train()**

In [7]:
# Training mode: every element is zeroed with probability p = 0.4 and the
# survivors are multiplied by 1 / (1 - p) = 1 / 0.6, so the printed result
# varies from run to run.
inp = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])
print(inp)

model = nn.Dropout(p=0.4).train()  # .train() returns the module itself
output = model(inp)
print(output)

tensor([1., 2., 3., 4., 5.])
tensor([1.6667, 3.3333, 5.0000, 6.6667, 0.0000])


### <font color = 'pickle'>**Dropout with model.eval()**

In [10]:
# model.eval() switches Dropout off, so the layer becomes the identity.
# BatchNorm layers are not skipped in eval mode: they normalize with their
# stored running statistics instead of the current batch statistics.

inp = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])
print(inp)

model = nn.Dropout(p=0.4).eval()  # .eval() returns the module itself
output = model(inp)
print(output)

tensor([1., 2., 3., 4., 5.])
tensor([1., 2., 3., 4., 5.])


## <font color = 'pickle'>**Model.eval vs torch.no_grad()**

In [11]:
N = 10

# Local seeded generator so the synthetic data set is identical on every run
# (the seed mirrors the random_state used for the split below) without
# touching NumPy's global random state.
rng = np.random.default_rng(41)

# two input features per sample, each drawn uniformly from [-5, +5)
X = rng.random((N, 2)) * 10 - 5

# targets: a linear function of both features plus standard-normal noise
Y = 0.5 * X[:, 0] + 0.2 * X[:, 1] - 1 + rng.standard_normal(N)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=41)

# Convert to float32 tensors; targets are reshaped to column vectors (n, 1)
# so they have the same shape as the output of a single-unit linear layer.
X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.astype(np.float32).reshape(-1, 1))
y_test = torch.from_numpy(y_test.astype(np.float32).reshape(-1, 1))


In [12]:
# Dropout on the raw inputs (index 0) followed by a single linear unit (index 1).
model = nn.Sequential(
    nn.Dropout(p=0.4),
    nn.Linear(in_features=2, out_features=1),
)
criterion = nn.MSELoss()  # mean squared error for the regression target
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.05)

In [13]:
n_epochs = 1
train_losses, test_losses = np.zeros(n_epochs), np.zeros(n_epochs)

for i in range(n_epochs):
  # ---- training step: dropout is active ----
  model.train()
  optimizer.zero_grad()  # clear gradients before the backward pass

  out_train = model(X_train)          # whole pipeline: Dropout -> Linear
  # Dropout layer on its own; this is a separate forward call, so it draws
  # its own random mask rather than reusing the one behind out_train.
  out_train_drop = model[0](X_train)
  loss_train = criterion(out_train, y_train)

  # backpropagate and update the weights
  loss_train.backward()
  optimizer.step()

  # ---- evaluation step: dropout is switched off, no graph is recorded ----
  model.eval()
  with torch.no_grad():
    out_test = model(X_test)
    out_test_drop = model[0](X_test)
    loss_test = criterion(out_test, y_test)

  # keep the scalar losses for this epoch
  train_losses[i] = loss_train.item()
  test_losses[i] = loss_test.item()
  


In [None]:
# Results from the training step carry gradient information; results
# produced inside torch.no_grad() do not.
for result in (out_train, loss_train, out_test, loss_test):
  print(result.requires_grad)


# Dropout output next to its raw input, first for the training-mode call,
# then for the eval-mode call.
print('\n',out_train_drop)
print('\nX_train',X_train)
print('\n',out_test_drop)
print('\nX_test',X_test)


True
True
False
False

 tensor([[-0.0000,  0.0000],
        [ 7.8531, -4.7530],
        [-0.0000,  0.2579],
        [ 0.0000, -0.0000],
        [ 6.6333,  0.0000],
        [ 2.2999, -0.0000]])

X_train tensor([[-0.4541,  0.8299],
        [ 4.7118, -2.8518],
        [-4.4421,  0.1547],
        [ 2.1214, -2.9844],
        [ 3.9800,  2.5901],
        [ 1.3799, -3.5499]])

 tensor([[-1.2287,  4.5403],
        [-3.3914, -0.7042],
        [-4.8668,  0.7100],
        [-3.4052, -3.5125]])

X_test tensor([[-1.2287,  4.5403],
        [-3.3914, -0.7042],
        [-4.8668,  0.7100],
        [-3.4052, -3.5125]])


In [None]:
# Only the Linear layer (index 1) appears here: its weight and bias are the only entries listed
model.state_dict()

OrderedDict([('1.weight', tensor([[0.3849, 0.4534]])),
             ('1.bias', tensor([-0.3008]))])

## <font color = 'pickle'>**Batchnorm1d**

<img src ="https://drive.google.com/uc?export=view&id=1f6TJdYfRJdQ10GVO6Q2ZX7biejwwykkq" width =300>


In [14]:
X = torch.randn(3, 2) * 5 + 10

# affine=False: no learnable scale/shift, so the layer output is the plain
# normalized input and can be compared directly with a hand computation.
B = nn.BatchNorm1d(2, affine=False)
y = B(X)

# Same normalization by hand, using per-feature statistics over the batch.
mu = X.mean(dim=0)
var_ = torch.var(X, dim=0, unbiased=False)  # biased variance over the batch
sigma = (var_ + B.eps).sqrt()               # the layer's own eps (1e-5 by default)
z = (X - mu) / sigma

# the ratio below should be equal to one
print(z / y)

tensor([[1.0000, 1.0000],
        [1.0000, 1.0000],
        [1.0000, 1.0000]])


### <font color = 'pickle'>**Batchnorm with model.train() and model.eval()**
- During training, this layer keeps a running estimate of its computed mean and variance. The estimate is an exponential moving average, updated with a default momentum of 0.1: `running = (1 - momentum) * running + momentum * batch_statistic`.

- During evaluation, this running mean/variance is used for normalization.

In [None]:
torch.manual_seed(0)  # fixed seed so the printed numbers are repeatable
X1 = torch.randn(3, 2) * 5 + 10
model = nn.Sequential(nn.BatchNorm1d(2))

print(X1)
print(X1.mean(dim=0))  # per-feature batch mean

# A freshly built module is in training mode, so this forward pass
# normalizes with the batch statistics and updates the running estimates once.
y = model(X1)
print(y)

# After a single update the running mean is 0.1 * batch mean.
print(model[0].running_mean)

tensor([[17.7050,  8.5329],
        [-0.8939, 12.8422],
        [ 4.5774,  3.0070]])
tensor([7.1295, 8.1273])
tensor([[ 1.3551,  0.1007],
        [-1.0281,  1.1713],
        [-0.3270, -1.2720]], grad_fn=<NativeBatchNormBackward0>)
tensor([0.7129, 0.8127])


In [None]:
# Training mode: each forward pass moves the running mean a step closer
# to the batch mean.
model.train()
for i in range(100):
  y1 = model(X1)
  if i % 10 == 0:  # report every tenth pass
    print(model[0].running_mean)

tensor([1.3546, 1.5442])
tensor([5.1159, 5.8319])
tensor([6.4274, 7.3270])
tensor([6.8847, 7.8483])
tensor([7.0441, 8.0300])
tensor([7.0997, 8.0934])
tensor([7.1191, 8.1155])
tensor([7.1259, 8.1232])
tensor([7.1282, 8.1259])
tensor([7.1290, 8.1268])


In [None]:
# Eval mode: forward passes use the stored running statistics and leave
# them untouched, so the printed value stays constant.
model.eval()
for i in range(100):
  y1 = model(X1)
  if i % 10 == 0:  # report every tenth pass
    print(model[0].running_mean)

tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])


In [None]:
# BatchNorm1d entries: affine weight and bias, plus running_mean, running_var and num_batches_tracked
model.state_dict()

OrderedDict([('0.weight', tensor([1., 1.])),
             ('0.bias', tensor([0., 0.])),
             ('0.running_mean', tensor([7.1293, 8.1272])),
             ('0.running_var', tensor([91.3627, 24.3052])),
             ('0.num_batches_tracked', tensor(101))])