# Ejercicio 1

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import time


## Task 1

In [2]:
iris = load_iris()

X = iris.data
y = iris.target

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Task 2

In [3]:
class SimpleFeedforwardNN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, dropout_rate=0.0):
        super(SimpleFeedforwardNN, self).__init__()
        
        #layers
        self.input_layer = nn.Linear(input_size, hidden_size1)
        self.hidden_layer1 = nn.Linear(hidden_size1, hidden_size2)
        self.hidden_layer2 = nn.Linear(hidden_size2, output_size)

        #dropout
        self.dropout = nn.Dropout(dropout_rate)
    
    def forward(self, x):
        #relu activation function for input layer
        x = F.relu(self.input_layer(x))

        x = self.dropout(x)
        
        #relu activation function for hidden layer 1
        x = F.relu(self.hidden_layer1(x))

        x = self.dropout(x)
        
        #log softmax activation function for hidden layer 2
        x = F.log_softmax(self.hidden_layer2(x), dim=1)
        
        return x

## Task 3

In [4]:
def one_hot_encode(labels, num_classes):
    return F.one_hot(labels, num_classes).float()

In [5]:
def train_model(model, criterion, optimizer, train_loader, test_loader, epochs=50, loss_name=None, num_classes=3, l1_lambda=0.0):
    train_losses = []
    test_losses = []

    start_time = time.time()

    for _ in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)

            # if loss function is MSELoss, convert labels to one-hot
            if loss_name == 'MSELoss':
                labels = F.one_hot(labels, num_classes).float()

            #compute loss
            loss = criterion(outputs, labels)

            if l1_lambda > 0:
                l1_penalty = sum(torch.sum(torch.abs(param)) for param in model.parameters())
                loss += l1_lambda * l1_penalty

            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        train_losses.append(running_loss / len(train_loader))

        # Evaluate the model on the test set
        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for inputs, labels in test_loader:
                outputs = model(inputs)

                # if loss function is MSELoss, convert labels to one-hot
                if loss_name == 'MSELoss':
                    labels = F.one_hot(labels, num_classes).float()

                loss = criterion(outputs, labels)

                if l1_lambda > 0:
                    l1_penalty = sum(torch.sum(torch.abs(param)) for param in model.parameters())
                    loss += l1_lambda * l1_penalty

                test_loss += loss.item()

        test_losses.append(test_loss / len(test_loader))

    training_time = time.time() - start_time

    return train_losses, test_losses, training_time


In [6]:
#Converting the data into tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

In [7]:
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [8]:
input_size = 4
hidden_size1 = 10
hidden_size2 = 8
output_size = 3

#model
model = SimpleFeedforwardNN(input_size, hidden_size1, hidden_size2, output_size)

In [9]:
# set the different loss functions
loss_functions = {
    'CrossEntropyLoss': nn.CrossEntropyLoss(),
    'MSELoss': nn.MSELoss(),
    'NLLLoss': nn.NLLLoss()
}

#loss records
loss_records = {}

In [10]:
#training the model with different loss functions
for loss_name, criterion in loss_functions.items():
    print(f"Training with {loss_name}...")
    
    # optimizer
    model = SimpleFeedforwardNN(input_size, hidden_size1, hidden_size2, output_size)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # train the model
    train_losses, test_losses, _ = train_model(model, criterion, optimizer, train_loader, test_loader, loss_name=loss_name)
    
    # save the loss records
    loss_records[loss_name] = {
        'train_losses': train_losses,
        'test_losses': test_losses
    }

Training with CrossEntropyLoss...
Training with MSELoss...
Training with NLLLoss...


In [11]:
for loss_name, losses in loss_records.items():
    print(f"Loss function: {loss_name}")
    print(f"Train loss: {losses['train_losses'][-1]:.4f}")
    print(f"Test loss: {losses['test_losses'][-1]:.4f}")
    print()

Loss function: CrossEntropyLoss
Train loss: 0.2302
Test loss: 0.1789

Loss function: MSELoss
Train loss: 2.1917
Test loss: 2.1895

Loss function: NLLLoss
Train loss: 0.2240
Test loss: 0.1603



## Task 4

In [12]:
regularization_techniques = {
    'No Regularization': {'weight_decay': 0.0, 'dropout_rate': 0.0, 'l1_lambda': 0.0},
    'L2 Regularization': {'weight_decay': 0.01, 'dropout_rate': 0.0, 'l1_lambda': 0.0},
    'L1 Regularization': {'weight_decay': 0.0, 'dropout_rate': 0.0, 'l1_lambda': 0.01},
    'Dropout': {'weight_decay': 0.0, 'dropout_rate': 0.5, 'l1_lambda': 0.0},
}

In [13]:
loss_records_regularization = {}

# train the model with different regularization techniques
for reg_name, reg_params in regularization_techniques.items():
    print(f"Training with {reg_name}...")
    
    model = SimpleFeedforwardNN(input_size, hidden_size1, hidden_size2, output_size, dropout_rate=reg_params['dropout_rate'])
    
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=reg_params['weight_decay'])
    
    train_losses, test_losses, _ = train_model(
        model, 
        criterion, 
        optimizer, 
        train_loader, 
        test_loader, 
        epochs=50, 
        loss_name='CrossEntropyLoss',
        l1_lambda=reg_params['l1_lambda']
    )
    
    loss_records_regularization[reg_name] = {
        'train_losses': train_losses,
        'test_losses': test_losses
    }

Training with No Regularization...
Training with L2 Regularization...
Training with L1 Regularization...
Training with Dropout...


In [14]:
for reg_name, losses in loss_records_regularization.items():
    print(f"Regularization technique: {reg_name}")
    print(f"Train loss: {losses['train_losses'][-1]:.4f}")
    print(f"Test loss: {losses['test_losses'][-1]:.4f}")
    print()

Regularization technique: No Regularization
Train loss: 0.2886
Test loss: 0.2204

Regularization technique: L2 Regularization
Train loss: 0.3703
Test loss: 0.2974

Regularization technique: L1 Regularization
Train loss: 0.6292
Test loss: 0.5707

Regularization technique: Dropout
Train loss: 0.6772
Test loss: 0.4485



## Task 5

In [15]:
optimization_techniques = {
    'SGD': {'optimizer': optim.SGD, 'batch_size': 1, 'params': {'lr': 0.01}},
    'Batch GD': {'optimizer': optim.SGD, 'batch_size': len(train_loader.dataset), 'params': {'lr': 0.01}},
    'Mini-Batch GD': {'optimizer': optim.SGD, 'batch_size': 32, 'params': {'lr': 0.01}},
}

In [16]:
loss_records_optimization = {}

for opt_name, opt_config in optimization_techniques.items():
    print(f"Training with {opt_name}...")
    
    #update the batch size
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=opt_config['batch_size'], shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=opt_config['batch_size'], shuffle=False)
    
    model = SimpleFeedforwardNN(input_size, hidden_size1, hidden_size2, output_size)
    optimizer = opt_config['optimizer'](model.parameters(), **opt_config['params'])
    
    train_losses, test_losses, training_time = train_model(
        model, 
        criterion, 
        optimizer, 
        train_loader, 
        test_loader,
        epochs=50, 
        loss_name='CrossEntropyLoss'
    )
    
    loss_records_optimization[opt_name] = {
        'train_losses': train_losses,
        'test_losses': test_losses,
        'training_time': training_time
    }

Training with SGD...
Training with Batch GD...
Training with Mini-Batch GD...


In [17]:
for opt_name, losses in loss_records_optimization.items():
    print(f"Optimization technique: {opt_name}")
    print(f"Train loss: {losses['train_losses'][-1]:.4f}")
    print(f"Test loss: {losses['test_losses'][-1]:.4f}")
    print(f"Training time: {losses['training_time']:.2f} seconds")
    print()

Optimization technique: SGD
Train loss: 0.0622
Test loss: 0.0386
Training time: 3.83 seconds

Optimization technique: Batch GD
Train loss: 1.0594
Test loss: 1.0627
Training time: 0.13 seconds

Optimization technique: Mini-Batch GD
Train loss: 0.7414
Test loss: 0.7111
Training time: 0.33 seconds



# Ejercicio 2

1. ¿Cuál es la principal innovación de la arquitectura Transformer?

A diferencia de los modelos RNN o CNN, los cuales usaban una artquitectura de encoder-decoder y tienen muchas limitantes de rendimiento y dependencia, la arquitectura transformer busca obtener una atención mayor al contexto dado por medio del Self-Attention. Esto permite que todas las posiciones o tokens de la secuencia puedan atender a cualquier otra posición de la misma, obteniendo así un mejor alcance y mayor comprensión del contexto. Esto permite que sea altamente paralelizadle, por lo que este tipo de arquitecturas proveen una mayor eficiencia computacional.

2. ¿Cómo funciona el mecanismo de atención del scaled dot-product?

Este mecanismo funciona por medio de tres matrices de entrada: Kyes, Values y Queries. Dadas estas matrices, se obtiene el producto punto de Queries y de Keys, para luego dividirlo entre la raíz cuadrada de la dimensionalidad de Keys. Luego de aplicar softmax al resultado, se obtiene un conjunto de ponderaciones que, combinadas con su respectivo Values, devuelve un vector el cual indica la importancia o relevancia de cada Value.

3. ¿Por qué se utiliza la atención de múltiples cabezales en Transformer?

La razón de utilizar múltiples cabezales para la atención es debido a que, de esta forma es posible paralelizar diferentes perspectivas de el mismo conjunto de información. Debido a este tipo de arquitectura, los transformers pueden explorar diferentes perspectivas de la información al mismo tiempo. 

4. ¿Cómo se incorporan los positional encodings en el modelo Transformer?
5. ¿Cuáles son algunas aplicaciones de la arquitectura Transformer más allá de la machine translation?