# The problem with overfitting

In [None]:
# Train/validation split with torch.utils.data.SubsetRandomSampler.
# CIFAR-10's training set has 50,000 images: shuffle the indices once, then
# give each loader a disjoint slice (45,000 train / 5,000 validation).
indices = np.arange(50000)
np.random.shuffle(indices)

# Convert images to tensors and normalise each of the three channels with
# mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

# Fixes: the dataset class is `CIFAR10` (there is no `torchvision.datasets.CIFAR`),
# and `DataLoader` lives in `torch.utils.data`, not `torchvision.utils.data`.
# The root is a relative './data' directory instead of the filesystem-root
# path '/date', which looked like a typo and is normally not writable.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

# Both loaders wrap the same dataset; the samplers decide which indices each
# one may draw from. `shuffle` must stay False whenever a sampler is supplied.
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=1, shuffle=False,
    sampler=torch.utils.data.SubsetRandomSampler(indices[:45000]))
val_loader = torch.utils.data.DataLoader(
    trainset, batch_size=1, shuffle=False,
    sampler=torch.utils.data.SubsetRandomSampler(indices[45000:50000]))

# Held-out test set (kept disabled, as in the original notes):
# testset = torchvision.datasets.CIFAR10(root='./data', train=False,
#                                        download=True, transform=transform)
# testloader = torch.utils.data.DataLoader(testset, batch_size=128,
#                                          shuffle=False, num_workers=2)

In [None]:
# Example: carve a validation split out of the MNIST training set.
# Shuffle the 60,000 training indices once so the two subsets are disjoint.
indices = np.arange(60000)
np.random.shuffle(indices)

# Build the transform and the dataset once and share them between both
# loaders (the original constructed identical copies for each loader).
# The Normalize constants are presumably the MNIST pixel mean/std — confirm.
mnist_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)
mnist_train = datasets.MNIST('mnist', download=True, train=True,
                             transform=mnist_transform)

# Build the train loader: draws only from the first 55,000 shuffled indices
train_loader = torch.utils.data.DataLoader(
    mnist_train, batch_size=64, shuffle=False,
    sampler=torch.utils.data.SubsetRandomSampler(indices[:55000]))

# Build the validation loader: draws only from the remaining 5,000 indices
val_loader = torch.utils.data.DataLoader(
    mnist_train, batch_size=64, shuffle=False,
    sampler=torch.utils.data.SubsetRandomSampler(indices[55000:60000]))

# Neural Network Toolset
1. Regularization Techniques - L1,L2 regularization
2. Dropout
3. Batch-normalization
4. Early Stopping
5. Transfer Learning
6. Finetuning CNNs
7. Torchvision module

Question: How to choose all hyperparameters?
- l2 reg, dropout, optim (adam, grad desc), batch-norm momentum & epsilon, num epochs for early stopping etc.?

Answer: Train many networks with different hyperparameters and evaluate each of them on the validation set. Pick the network that performs best on the validation set, then evaluate it once on the held-out test set to estimate its expected accuracy on new data.

```
# Sets model in train mode
model.train()

# Sets net in eval mode
model.eval()
```

## 1. Regularization

### L2 Regularization

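Penalises large weights ($w$) by adding a term proportional to the sum of squared weights to the cost, scaled by the coefficient $\lambda$. In PyTorch, the optimizer's `weight_decay` argument plays the role of $\lambda$.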

$C=-\frac{1}{n} \sum_{x j}\left[y_j \ln a_j^L+\left(1-y_j\right) \ln \left(1-a_j^L\right)\right]+\frac{\lambda}{2 n} \sum_w w^2$

```
optimizer = optim.Adam(net.parameters(), lr=3e-4, weight_decay=0.0001)
```

## 2. Dropouts
```
self.classifier = nn.Sequential(
    nn.Dropout(p=0.5),
    nn.Linear(256*6*6, 4096),
    nn.ReLU(inplace=True),
    nn.Dropout(p=0.5),
    nn.Linear(4096, 4096),
    nn.ReLU(inplace=True),
    nn.Linear(4096, num_classes),
)
```

## 3. Batch Normalization

$$
\begin{aligned}
& \text{Input: values of } x \text{ over a mini-batch: } \mathcal{B}=\left\{x_{1 \ldots m}\right\}; \\
& \text{Parameters to be learned: } \gamma, \beta \\
& \text{Output: } \left\{y_i=\mathrm{BN}_{\gamma, \beta}\left(x_i\right)\right\} \\[6pt]
& \mu_{\mathcal{B}} \leftarrow \frac{1}{m} \sum_{i=1}^m x_i && \text{// mini-batch mean} \\
& \sigma_{\mathcal{B}}^2 \leftarrow \frac{1}{m} \sum_{i=1}^m\left(x_i-\mu_{\mathcal{B}}\right)^2 && \text{// mini-batch variance} \\
& \widehat{x}_i \leftarrow \frac{x_i-\mu_{\mathcal{B}}}{\sqrt{\sigma_{\mathcal{B}}^2+\epsilon}} && \text{// normalize} \\
& y_i \leftarrow \gamma \widehat{x}_i+\beta \equiv \mathrm{BN}_{\gamma, \beta}\left(x_i\right) && \text{// scale and shift}
\end{aligned}
$$


```
self.bn = nn.BatchNorm2d(num_features=64, eps = 1e-05, momentum=0.9)
```

In [None]:
class Net(nn.Module):
    """Small CNN with batch normalisation for 1-channel 28x28 images.

    Two conv -> max-pool -> ReLU -> batch-norm stages keep the spatial size in
    the convolutions (3x3 kernel, padding 1) and halve it in each pooling
    step (28 -> 14 -> 7), leaving 20 feature maps of 7x7 that a single linear
    layer maps to 10 class scores.
    """

    def __init__(self):
        super().__init__()

        # Feature extractor. The layer order is kept exactly as before so the
        # state-dict keys (features.0, features.3, ...) remain unchanged.
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=10, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(10),
            nn.Conv2d(in_channels=10, out_channels=20, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(20),
        )

        # Classifier: flattened 20 x 7 x 7 feature maps -> 10 outputs
        self.fc = nn.Linear(in_features=7 * 7 * 20, out_features=10)

    def forward(self, x):
        """Return raw class scores of shape (batch, 10).

        Args:
            x: input batch of shape (batch, 1, 28, 28); the 28x28 size is what
               the 7*7*20 input width of ``self.fc`` implies.

        The original class defined no ``forward``, so calling an instance
        raised ``NotImplementedError``.
        """
        x = self.features(x)
        # Collapse (channels, height, width) into one feature vector per sample
        x = x.flatten(start_dim=1)
        return self.fc(x)

## 4. Early Stopping

## 5. Transfer Learning

**Features in CNNs:**
1. First few layers > `Edges`
2. Next few conv layers > `Simple geometrical shapes`
3. Last layers > `Parts of an object. e.g. Wheel, window`
4. Decision layers (fully connected layers)

In practice, instead of randomly initialising the net, we would use a pre-trained network to reduce the training time. You do not need large datasets for training now. You can now fine-tune the network with a smaller dataset. 


**2 ways of Fine-tuning**
- Freezing all layers except last few. Only tuning the last fully connected layer. 
- Fine tune everything

*Note: it is typically a good idea to freeze most layers when the dataset is small.*

## 6. Finetuning CNNs

In [None]:
# Fine-tuning in PyTorch: start from saved weights rather than a random
# initialisation, then swap in a new output layer.

# Build the architecture first; the checkpoint only holds parameter values
model = Net()

# Read the saved parameters from disk and copy them into the model
pretrained_state = torch.load('cifar10_net.pth')
model.load_state_dict(pretrained_state)

# Replace the final fully connected layer with a fresh one that has 100
# output units; its weights are newly initialised, not loaded.
model.fc = nn.Linear(4 * 4 * 1024, 100)

## 7. Torchvision

torchvision comes with many pre-trained models such as resnets

In [None]:
import torchvision

# Load ResNet-18 together with its pretrained weights
model = torchvision.models.resnet18(pretrained=True)

# Replace the classification head with one sized for our task. The input
# width is read from the existing head rather than hard-coded (it is 512 for
# resnet18), so the same lines keep working if the backbone is swapped.
# `num_classes` is expected to be defined elsewhere in the notebook.
model.fc = nn.Linear(model.fc.in_features, num_classes)

In [None]:
# Example: fine-tune the whole model (no layers frozen).

# Instantiate the architecture
model = Net()

# Load the parameters from the old model
model.load_state_dict(torch.load('my_net.pth'))

# Replace the output layer: 26 output units for the new task
model.fc = nn.Linear(7 * 7 * 512, 26)

# NOTE(review): `optimizer` and `criterion` come from outside this cell. The
# optimizer has to be built from model.parameters() *after* the head swap
# above, otherwise the new layer's weights are never updated — confirm.
model.train()
train_net(model, optimizer, criterion)

# Fix: `model.eval()` only switches the network to inference mode and returns
# the module itself, so `str(model.eval())` printed the model's repr rather
# than an accuracy. Switch modes first, then actually measure accuracy.
# NOTE(review): assumes `val_loader` yields (inputs, labels) batches that
# match this model's input size and label set — confirm.
model.eval()
correct, total = 0, 0
with torch.no_grad():  # no gradients needed while evaluating
    for inputs, labels in val_loader:
        predictions = model(inputs).argmax(dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

# Guard against an empty loader instead of dividing by zero
accuracy = correct / total if total else float('nan')
print("Accuracy of the net is: " + str(accuracy))

In [None]:
# Example: use a pretrained network as a frozen feature extractor
import torchvision

# Fetch ResNet-18 with its pretrained weights
model = torchvision.models.resnet18(pretrained=True)

# Turn off gradient tracking for every parameter currently in the model,
# so none of the existing layers is updated during training.
for weight in model.parameters():
    weight.requires_grad = False

# Install a new output layer with 7 units (one per target class). It is
# created after the freeze loop, so its parameters are not frozen and it is
# the only part of the model that will be trained.
model.fc = nn.Linear(512, 7)