# Task 4
This serves as a template which will guide you through the implementation of this task. It is advised to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps.
This is the Jupyter notebook version of the template. For the Python file version, please refer to the file `template_solution.py`.

First, we import necessary libraries:

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# Add any other imports you need here
from transformers import DistilBertTokenizer, DistilBertModel
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.cuda.amp import GradScaler, autocast

In [None]:
# Colab only: mount Google Drive at /content/drive so the data files stored
# there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Depending on your approach, you might need to adapt the structure of this template or parts not marked by TODOs.
It is not necessary to completely follow this template. Feel free to add more code and delete any parts that are not required.

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 16  # Samples per batch; tune against training performance and available memory
NUM_EPOCHS = 15  # Number of full passes over the training data

# Pretrained uncased DistilBERT tokenizer, shared as a module-level object.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Training data and test data (the latter without scores, per its file name),
# read from the mounted Google Drive.
train_val = pd.read_csv("/content/drive/MyDrive/Machine_learning/task4_hr35z9/train.csv")
test_val = pd.read_csv("/content/drive/MyDrive/Machine_learning/task4_hr35z9/test_no_score.csv")

In [None]:
# Sanity check: tokenize one sentence, truncated/padded to 512 tokens, and
# print the resulting encoding.
text = "Hello, how are you?"
encoded_input = tokenizer(text, truncation=True, padding='max_length', max_length=512)
print(encoded_input)

{'input_ids': [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
# TODO: Fill out ReviewDataset
class ReviewDataset(Dataset):
    """Dataset of reviews that are tokenized on the fly.

    Each item is built from the ``title`` and ``sentence`` columns of one row,
    joined by a single space and encoded with the module-level ``tokenizer``.

    Args:
        data_frame: DataFrame with ``title`` and ``sentence`` columns, plus a
            ``score`` column when ``train`` is true.
        train: When true, every item also carries the row's ``score`` as label.
        max_length: Number of tokens every sequence is truncated/padded to.
    """

    def __init__(self, data_frame, train=True, max_length=512):
        self.data_frame = data_frame
        self.train = train
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data_frame)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask)`` for row ``index``.

        When the dataset was created with ``train=True`` the tuple has a third
        element, the row's ``score``.
        """
        # Look the row up once instead of once per column access.
        row = self.data_frame.iloc[index]
        text = row['title'] + " " + row['sentence']

        encoded = tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" yields a leading batch dimension of size 1; drop
        # it so that the DataLoader can stack the items itself.
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)

        if self.train:
            return input_ids, attention_mask, row['score']
        return input_ids, attention_mask

In [None]:
# Wrap both DataFrames; only the training set yields labels.
train_dataset = ReviewDataset(train_val, train=True)
test_dataset = ReviewDataset(test_val, train=False)

# Training batches are shuffled, test batches keep the row order of the
# input file so that predictions can be written out in that same order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)
# Additional code if needed

In [None]:
# TODO: Fill out MyModule
class MyModule(nn.Module):
    """Pretrained DistilBERT encoder followed by a small regression head.

    The hidden state at sequence position 0 is fed through
    Linear -> ReLU -> Dropout -> Linear, and the single resulting value is
    mapped into the open interval (0, 10) by a scaled sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # 768 is taken to be the encoder's hidden size for this checkpoint.
        self.pre_classifier = nn.Linear(768, 768)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        """Predict one value in (0, 10) per input sequence.

        Args:
            input_ids: Token ids, one row per sequence.
            attention_mask: Mask with the same layout as ``input_ids``.

        Returns:
            Tensor with one column holding the prediction for each sequence.
        """
        encoder_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = encoder_output[0]
        # Use the first token's hidden state as the summary of the sequence.
        pooled_output = hidden_state[:, 0]
        x = self.pre_classifier(pooled_output)
        # Functional ReLU: no need to allocate an nn.ReLU module on every call.
        x = torch.relu(x)
        x = self.dropout(x)
        output = self.classifier(x)
        return 10 * torch.sigmoid(output)

# Build the model and move its parameters to the selected device.
model = MyModule().to(DEVICE)

In [None]:
# TODO: Setup loss function, optimiser, and scheduler
# Regression on the score with a mean-squared-error objective.
criterion = nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=5e-5)
# Multiply the learning rate by 0.1 after every 10 epochs.
scheduler = StepLR(optimiser, step_size=10, gamma=0.1)
# Create the gradient scaler once for the whole run: it adapts its loss-scale
# factor from step to step, and re-creating it every epoch would discard that
# state and restart from the default scale.
scaler = GradScaler()

model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for batch in tqdm(train_loader, total=len(train_loader)):
        input_ids, attention_mask, labels = batch

        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE).float()
        labels = labels.to(DEVICE).float()
        optimiser.zero_grad()

        with autocast():  # Run the forward pass in mixed precision
            outputs = model(input_ids, attention_mask)
            outputs = outputs.float()
            # Add a trailing axis to the labels so they line up with the
            # model's single output column.
            loss = criterion(outputs, labels.unsqueeze(1))

        scaler.scale(loss).backward()  # Scale the loss to compensate for reduced precision
        scaler.step(optimiser)
        scaler.update()

        running_loss += loss.item()

    # Advance the learning-rate schedule once per epoch
    scheduler.step()

    # Print average loss for the epoch
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}, Loss: {running_loss / len(train_loader)}')

  self.pid = os.fork()
100%|██████████| 782/782 [03:36<00:00,  3.62it/s]


Epoch 1/8, Loss: 0.397436794193695


100%|██████████| 782/782 [03:36<00:00,  3.62it/s]


Epoch 2/8, Loss: 0.3087817595180724


100%|██████████| 782/782 [03:35<00:00,  3.63it/s]


Epoch 3/8, Loss: 0.3108519027652719


100%|██████████| 782/782 [03:35<00:00,  3.63it/s]


Epoch 4/8, Loss: 0.41446485233200175


100%|██████████| 782/782 [03:35<00:00,  3.62it/s]


Epoch 5/8, Loss: 0.21451863356153755


100%|██████████| 782/782 [03:35<00:00,  3.62it/s]


Epoch 6/8, Loss: 0.1472221712946244


100%|██████████| 782/782 [03:35<00:00,  3.62it/s]


Epoch 7/8, Loss: 0.10414763433558633


100%|██████████| 782/782 [03:35<00:00,  3.62it/s]

Epoch 8/8, Loss: 0.0921176433029687





In [None]:
# Predict a value for every test sample and write one prediction per line.
model.eval()
with torch.no_grad():
    results = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        input_ids, attention_mask = batch
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE).float()
        outputs = model(input_ids, attention_mask)
        # Drop only the trailing size-1 axis. A bare squeeze() would also
        # collapse the batch axis of a single-sample batch into a 0-d array,
        # which np.concatenate below cannot handle.
        outputs = outputs.squeeze(1).cpu().numpy()
        results.append(outputs)

    with open("/content/drive/MyDrive/Machine_learning/task4_hr35z9/result.txt", "w") as f:
        for val in np.concatenate(results):
            f.write(f"{val}\n")

  self.pid = os.fork()
  self.pid = os.fork()
100%|██████████| 63/63 [00:17<00:00,  3.69it/s]
