# Deep Learning
## Exercise 7 - Regularization


### Effect of Regularization

Below you find the data setup for a binary classification problem in 2D input space. We will train two neural networks to gain some intuition about the effect of regularization. In particular, you should train the same model twice, while only adding L2 regularization to one of them.

In [None]:
import torch
import torch.nn as nn
from matplotlib import pyplot as plt

# Seed the global RNG so the uniformly sampled data generated below is reproducible.
torch.manual_seed(2)


def generate_data(center, spread, samples, test=False):
    """Sample 2D points uniformly from a square and assign binary labels.

    Each coordinate is drawn independently from
    ``[center[i] - spread, center[i] + spread)``. A point is labelled 0 when
    its score ``(center[0] - x0) + (center[1] + spread - x1)`` exceeds 5 and
    1 otherwise. Unless ``test`` is set, ``5 * cos(10 * x0)`` is added to the
    score first, which makes the class boundary oscillate.

    Args:
        center: Pair ``(c0, c1)`` giving the centre of the sampling square.
        spread: Half the side length of the sampling square.
        samples: Number of points to generate.
        test: If true, use the plain linear rule without the cosine term.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(samples, 2)`` and ``y`` has
        shape ``(samples,)`` and holds the integer labels 0/1.
    """
    # Draw x0 before x1 so the RNG stream is consumed in a fixed order.
    x0 = torch.empty(samples, 1).uniform_(center[0] - spread, center[0] + spread)
    x1 = torch.empty(samples, 1).uniform_(center[1] - spread, center[1] + spread)

    score = (center[0] - x0) + ((center[1] + spread) - x1)
    if not test:
        score = score + 5 * torch.cos(x0 * 10)

    # Squeeze only the column dimension: a bare squeeze() would collapse the
    # labels to a 0-dim tensor when samples == 1.
    labels = torch.where(score > 5, 0, 1).squeeze(1)

    return torch.cat([x0, x1], dim=1), labels


# Training set: 500 points around (10, 10), labelled with the cosine-perturbed rule.
X_train, y_train = generate_data(center=(10,10), spread=5, samples=500)
# Test set: 50 points from the same square, labelled with the plain linear rule (test=True).
X_test, y_test = generate_data(center=(10,10), spread=5, samples=50, test=True)


def visualize(model=None):
    """Plot the training and test data, optionally over the model's decision regions.

    Two figures are shown, each preceded by a printed heading
    ('Training data:' and 'Test data:'). Points of class 1 are orange, all
    others blue. When ``model`` is given it is evaluated on a regular grid
    over ``[5, 15) x [5, 15)`` with step 0.1, and every grid point is drawn
    faintly in the colour of its predicted class (threshold 0.5).

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        model: Optional callable that maps an ``(N, 2)`` tensor to one score
            per row. If ``None``, only the data points are drawn.
    """
    def class_colors(is_positive):
        # One colour name per point, as expected by plt.scatter(c=...).
        return ['orange' if flag else 'blue' for flag in is_positive]

    grid_points = None
    grid_colors = None
    if model is not None:
        axis = torch.arange(5, 15, 0.1)
        # 'ij' is the historical default; passing it explicitly avoids the
        # deprecation warning raised when `indexing` is omitted.
        grid_x, grid_y = torch.meshgrid(axis, axis, indexing='ij')
        grid_points = torch.stack([grid_x.reshape(-1), grid_y.reshape(-1)], dim=1)
        # Plotting needs no gradients, so skip building an autograd graph.
        with torch.no_grad():
            preds = model(grid_points)
        grid_colors = class_colors((preds > 0.5).flatten().tolist())

    def draw(title, points, labels):
        print(title)
        if grid_points is not None:
            plt.scatter(grid_points[:, 0], grid_points[:, 1], s=10, alpha=0.1, c=grid_colors)
        plt.scatter(points[:, 0], points[:, 1], marker='.', s=100,
                    c=class_colors((labels == 1).tolist()))
        plt.show()

    draw('Training data:', X_train, y_train)
    draw('Test data:', X_test, y_test)


# Show the raw training and test data (no model, so no decision regions are drawn).
visualize()

# Wrap the tensors as (input, label) datasets so they can be batched by a DataLoader.
ds_train = torch.utils.data.TensorDataset(X_train, y_train)
ds_test = torch.utils.data.TensorDataset(X_test, y_test)

#### 1. Set up the model

Use the following network architecture:
- 3 fully connected hidden layers consisting of 64 neurons each.
- A ReLU activation after every layer except the output layer
- A fully connected output layer, mapping its 64 inputs to a single output neuron and using a Sigmoid activation to push values between 0 and 1.


In [None]:
#ToDo: Set up the model

In [None]:
def get_model():
    """Build the classifier: 2 inputs, three 64-unit ReLU hidden layers, one sigmoid output.

    Returns:
        A freshly initialised ``nn.Sequential`` mapping ``(N, 2)`` inputs to
        ``(N, 1)`` outputs in the open interval (0, 1).
    """
    hidden_units = 64
    widths = [2, hidden_units, hidden_units, hidden_units]

    layers = []
    # Hidden stack: every Linear layer is followed by a ReLU.
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
        layers.append(nn.Linear(in_features=fan_in, out_features=fan_out))
        layers.append(nn.ReLU())

    # Output head: a single neuron squashed into (0, 1).
    layers.append(nn.Linear(in_features=hidden_units, out_features=1))
    layers.append(nn.Sigmoid())

    return nn.Sequential(*layers)

#### 2. Training without Regularization
Train the network using the following hyperparameters:
- Train for 1000 epochs.
- Use the Adam optimizer with a learning rate of 0.001.
- Use a batch size of 32.
- Use a binary cross entropy loss.

Compute the accuracy on both the training and the test set. Use a threshold of 0.5 to binarize the model's output. How do your training and test accuracies evolve?

Visualize your model's decision boundary using the given `visualize` function.

In [None]:
#ToDo: Set up the training and evaluation

In [None]:
def train(epochs, model, opt, train_dl, test_dl):
    """Train ``model`` with binary cross entropy and record metrics per epoch.

    After every epoch the model is evaluated on ``test_dl``; a progress line
    is printed every 100 epochs.

    Args:
        epochs: Number of passes over ``train_dl``.
        model: Network whose flattened output is one probability per sample.
        opt: Optimizer already bound to ``model``'s parameters.
        train_dl: Iterable yielding ``(points, labels)`` training batches.
        test_dl: Iterable yielding ``(points, labels)`` evaluation batches.

    Returns:
        ``(train_loss, train_acc, val_loss, val_acc)``: four lists holding one
        Python float per epoch.
    """
    train_loss = []
    val_loss = []
    train_acc = []
    val_acc = []
    loss_function = nn.BCELoss()
    for epoch in range(epochs):
        model.train()
        cum_loss = 0.0
        total_matches = 0
        train_entries = 0
        for points, labels in train_dl:
            output = model(points).flatten()
            loss = loss_function(output, labels.float())
            loss.backward()
            opt.step()
            opt.zero_grad()

            batch_size = len(points)
            # BCELoss returns the batch mean; weight it by the batch size so a
            # smaller final batch is not over-represented in the epoch loss.
            cum_loss += loss.item() * batch_size
            # Accumulate as a Python int so the stored accuracy is a plain
            # float, matching the values collected in val_acc.
            total_matches += int(accuracy(output, labels))
            train_entries += batch_size
        epoch_loss = cum_loss / train_entries
        train_loss.append(epoch_loss)
        train_acc.append(total_matches / train_entries)

        acc, test_loss = evaluation(model, loss_function, test_dl)
        val_loss.append(test_loss)
        val_acc.append(acc)

        if epoch % 100 == 0:
            print(f"Epoch {epoch} \t ----> \t Loss {epoch_loss:.5f} \t ----- \t Val Loss {test_loss:.5f} \t ----- \t Accuracy {acc:.5f}")
    return train_loss, train_acc, val_loss, val_acc
    
    
    
    
def accuracy(output, labels):
    """Count how many thresholded predictions agree with the labels.

    Args:
        output: Tensor of scores; values above 0.5 are predicted as class 1.
        labels: Tensor of integer class labels, one per score.

    Returns:
        A 0-dim tensor holding the number of matching entries (not a ratio).
    """
    # An (N, 1) output compared with (N,) labels would broadcast to an (N, N)
    # matrix and silently inflate the count, so align the shapes first.
    if output.shape != labels.shape and output.numel() == labels.numel():
        output = output.reshape(labels.shape)
    prediction = (output > 0.5).int()
    return (prediction == labels).sum()
    
        
def evaluation(model, loss_funct, val_dl):
    """Compute accuracy and mean loss of ``model`` over ``val_dl``.

    The model is switched to eval mode and left in it; gradients are not
    tracked during the pass.

    Args:
        model: Network whose flattened output is one probability per sample.
        loss_funct: Loss callable taking ``(output, float_labels)``. It is
            expected to return the mean loss of the batch (the default
            reduction of torch losses such as ``nn.BCELoss``).
        val_dl: Iterable yielding ``(points, labels)`` batches.

    Returns:
        ``(accuracy, loss)`` as Python floats: the fraction of correctly
        classified samples and the per-sample mean loss.
    """
    model.eval()
    total_matches = 0
    val_entries = 0
    cum_loss = 0.0
    with torch.no_grad():
        for points, labels in val_dl:
            output = model(points).flatten()
            batch_size = len(points)
            # Weight each batch mean by its size; averaging the batch means
            # directly would over-weight a smaller final batch.
            cum_loss += loss_funct(output, labels.float()).item() * batch_size
            # int() accepts the 0-dim tensor returned by accuracy() and keeps
            # the running count, and therefore the ratio, in plain Python.
            total_matches += int(accuracy(output, labels))
            val_entries += batch_size
    acc = total_matches / val_entries
    loss = cum_loss / val_entries
    return acc, loss

In [None]:
#ToDo: Run training.

In [None]:
# Re-seed so weight initialisation and batch shuffling are repeatable for this run.
torch.manual_seed(0)
train_dl = torch.utils.data.DataLoader(dataset=ds_train, batch_size=32, shuffle=True)
test_dl = torch.utils.data.DataLoader(dataset=ds_test, batch_size=32)
num_epochs = 1000

# Baseline: plain Adam without any weight penalty.
model = get_model()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

train_loss, train_acc, val_loss, val_acc = train(
    epochs=num_epochs,
    model=model,
    opt=optimizer,
    train_dl=train_dl,
    test_dl=test_dl,
)

# Left panel: loss curves; right panel: accuracy curves.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

ax1.plot(range(num_epochs), train_loss, label='Train')
ax1.plot(range(num_epochs), val_loss, label='Test')
ax1.set(xlabel='Epoch', ylabel='Loss')
ax1.legend()

ax2.plot(range(num_epochs), train_acc, label='Train')
ax2.plot(range(num_epochs), val_acc, label='Test')
ax2.set(xlabel='Epoch', ylabel='Accuracy')
ax2.legend()
plt.show()

# Decision regions of the unregularized model.
visualize(model)

#### 3. Training with Regularization
Train the same model with the same settings again, only this time add an L2 regularization penalty of 0.01 to the model (Hint: weight decay in the optimizer).

Again, visualize your model's decision boundary using the given `visualize` function.

In [None]:
#ToDo: Run training with regularization

In [None]:
# Same seed as the unregularized run so both start from identical weights and batch order.
torch.manual_seed(0)
train_dl = torch.utils.data.DataLoader(dataset=ds_train, batch_size=32, shuffle=True)
test_dl = torch.utils.data.DataLoader(dataset=ds_test, batch_size=32)
num_epochs = 1000

# Identical setup, except Adam now applies an L2 penalty through weight_decay.
model = get_model()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3, weight_decay=1e-2)

train_loss, train_acc, val_loss, val_acc = train(
    epochs=num_epochs,
    model=model,
    opt=optimizer,
    train_dl=train_dl,
    test_dl=test_dl,
)

# Left panel: loss curves; right panel: accuracy curves.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

ax1.plot(range(num_epochs), train_loss, label='Train')
ax1.plot(range(num_epochs), val_loss, label='Test')
ax1.set(xlabel='Epoch', ylabel='Loss')
ax1.legend()

ax2.plot(range(num_epochs), train_acc, label='Train')
ax2.plot(range(num_epochs), val_acc, label='Test')
ax2.set(xlabel='Epoch', ylabel='Accuracy')
ax2.legend()
plt.show()

# Decision regions of the regularized model.
visualize(model)