# Task 4
This serves as a template which will guide you through the implementation of this task. It is advised to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps.
This is the jupyter notebook version of the template. For the python file version, please refer to the file `template_solution.py`.

First, we import necessary libraries:

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# Add any other imports you need here
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import GPT2Tokenizer, GPT2Model
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F

Depending on your approach, you might need to adapt the structure of this template or parts not marked by TODOs.
It is not necessary to completely follow this template. Feel free to add more code and delete any parts that are not required.

In [2]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 64  # TODO: Set the batch size according to both training performance and available memory
NUM_EPOCHS = 64  # TODO: Set the number of epochs

train_val = pd.read_csv("train.csv")
test_val = pd.read_csv("test_no_score.csv")

In [3]:
class ReviewDataset(Dataset):
    def __init__(self, embeddings_path, scores = []): 
        self.embeddings = torch.load(embeddings_path)
        self.scores = scores
    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, index):
        embeddings = self.embeddings[index]
        if len(self.scores) > 0 :
            score = self.scores[index]
            return {
                'embeddings': embeddings,
                'score': score
            }
        else:
            return {
                'embeddings': embeddings
            }

In [4]:
# Precompute embeddings
def compute_embeddings_distilbert(data_frame, tokenizer_name, max_length):
    tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_name)
    transformer = DistilBertModel.from_pretrained(tokenizer_name)
    
    embeddings = []
    for index, row in data_frame.iterrows():
        title = row[0] 
        review = row[1] 
        
        inputs = tokenizer(title + " " + review, 
                           max_length=max_length,
                           padding="max_length",
                           truncation=True,
                           return_tensors="pt")
        
        with torch.no_grad():
            outputs = transformer(**inputs)
            hidden_states = outputs.last_hidden_state
        embeddings.append(hidden_states.squeeze().cpu().numpy())
    
    return torch.tensor(embeddings)

def compute_embeddings_GPT2(data_frame, tokenizer_name, max_length):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    tokenizer.pad_token = tokenizer.eos_token
    transformer = GPT2Model.from_pretrained(tokenizer_name).to(DEVICE)
    
    embeddings = []
    for index, row in data_frame.iterrows():
        title = row[0]  
        review = row[1]  

        input = tokenizer(
            title + " " + review,
            return_tensors='pt',
            padding=True,
            truncation=True
        ).to(DEVICE)
        input_ids = input['input_ids']

        with torch.no_grad():
            outputs = transformer(input_ids)

        word_embeddings = outputs.last_hidden_state

        masked_word_embeddings = word_embeddings * input.attention_mask.unsqueeze(-1).float()

        sentence_embeddings = masked_word_embeddings.sum(dim=1)

        sentence_embeddings /= input.attention_mask.sum(dim=1, keepdim=True).float()
        
        embeddings.append(sentence_embeddings.squeeze().cpu().numpy())
    return torch.tensor(embeddings)



In [5]:
# tokenizer_name = "distilbert-base-uncased"
# max_length = 128

# train_embeddings = compute_embeddings_distilbert(train_val, tokenizer_name, max_length)
# torch.save(train_embeddings, "train_distilbert_embeddings.pt")

# test_embeddings = compute_embeddings_distilbert(test_val, tokenizer_name, max_length)
# torch.save(test_embeddings, "test_distilbert_embeddings.pt")

In [5]:
tokenizer_name = "gpt2"
max_length = 128

train_embeddings = compute_embeddings_GPT2(train_val, tokenizer_name, max_length)
torch.save(train_embeddings, "train_gpt2_embeddings.pt")

test_embeddings = compute_embeddings_GPT2(test_val, tokenizer_name, max_length)
torch.save(test_embeddings, "test_gpt2_embeddings.pt")


  return torch.tensor(embeddings)


In [5]:
train_dataset = ReviewDataset("train_gpt2_embeddings.pt", train_val['score'])
test_dataset = ReviewDataset("test_gpt2_embeddings.pt")

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True, num_workers=16, pin_memory=True)
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=BATCH_SIZE,
                         shuffle=False, num_workers=16, pin_memory=True)


In [6]:
class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(768, 1536)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(1536, 512)
        self.sigmoid = nn.Sigmoid()
        self.fc3 = nn.Linear(512, 64)
        self.fc4 = nn.Linear(64, 1)

 
    def forward(self, x):
        
        
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.relu(x)
        x = self.fc4(x)
        x = self.sigmoid(x) * 10.0 
        return x



In [32]:
model = MyModule().to(DEVICE)
LEARNING_RATE = 0.001
NUM_EPOCHS = 80

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = StepLR(optimizer, step_size=1, gamma=0.88)  

model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for batch in tqdm(train_loader, total=len(train_loader)):

        embeddings = batch['embeddings'].to(DEVICE)
        scores = batch['score'].view(-1, 1).to(DEVICE)

        optimizer.zero_grad()

        outputs = model(embeddings.float()) 
        # print(outputs.shape)
        # print(scores.shape)
        loss = criterion(outputs, scores.float())

        loss.backward()
        optimizer.step()

        running_loss += loss.item() * batch['embeddings'].size(0)

    scheduler.step()

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {epoch_loss:.4f}")

100%|██████████| 196/196 [00:00<00:00, 313.69it/s]


Epoch 1/80, Loss: 5.9670


100%|██████████| 196/196 [00:00<00:00, 305.25it/s]


Epoch 2/80, Loss: 3.3179


100%|██████████| 196/196 [00:00<00:00, 325.15it/s]


Epoch 3/80, Loss: 2.8547


100%|██████████| 196/196 [00:00<00:00, 305.23it/s]


Epoch 4/80, Loss: 2.6209


100%|██████████| 196/196 [00:00<00:00, 305.80it/s]


Epoch 5/80, Loss: 2.5332


100%|██████████| 196/196 [00:00<00:00, 313.34it/s]


Epoch 6/80, Loss: 2.4927


100%|██████████| 196/196 [00:00<00:00, 306.82it/s]


Epoch 7/80, Loss: 2.3633


100%|██████████| 196/196 [00:00<00:00, 284.90it/s]


Epoch 8/80, Loss: 2.2973


100%|██████████| 196/196 [00:00<00:00, 302.08it/s]


Epoch 9/80, Loss: 2.2798


100%|██████████| 196/196 [00:00<00:00, 266.19it/s]


Epoch 10/80, Loss: 2.1978


100%|██████████| 196/196 [00:00<00:00, 262.43it/s]


Epoch 11/80, Loss: 2.0795


100%|██████████| 196/196 [00:00<00:00, 281.27it/s]


Epoch 12/80, Loss: 2.0796


100%|██████████| 196/196 [00:00<00:00, 274.28it/s]


Epoch 13/80, Loss: 2.0366


100%|██████████| 196/196 [00:00<00:00, 290.22it/s]


Epoch 14/80, Loss: 2.0265


100%|██████████| 196/196 [00:00<00:00, 316.39it/s]


Epoch 15/80, Loss: 1.9362


100%|██████████| 196/196 [00:00<00:00, 308.04it/s]


Epoch 16/80, Loss: 1.9305


100%|██████████| 196/196 [00:00<00:00, 318.87it/s]


Epoch 17/80, Loss: 1.8693


100%|██████████| 196/196 [00:00<00:00, 301.74it/s]


Epoch 18/80, Loss: 1.8019


100%|██████████| 196/196 [00:00<00:00, 294.79it/s]


Epoch 19/80, Loss: 1.7864


100%|██████████| 196/196 [00:00<00:00, 311.05it/s]


Epoch 20/80, Loss: 1.7448


100%|██████████| 196/196 [00:00<00:00, 325.14it/s]


Epoch 21/80, Loss: 1.7158


100%|██████████| 196/196 [00:00<00:00, 320.93it/s]


Epoch 22/80, Loss: 1.6800


100%|██████████| 196/196 [00:00<00:00, 313.07it/s]


Epoch 23/80, Loss: 1.6461


100%|██████████| 196/196 [00:00<00:00, 324.76it/s]


Epoch 24/80, Loss: 1.6144


100%|██████████| 196/196 [00:00<00:00, 331.15it/s]


Epoch 25/80, Loss: 1.6036


100%|██████████| 196/196 [00:00<00:00, 328.29it/s]


Epoch 26/80, Loss: 1.5715


100%|██████████| 196/196 [00:00<00:00, 323.67it/s]


Epoch 27/80, Loss: 1.5503


100%|██████████| 196/196 [00:00<00:00, 308.78it/s]


Epoch 28/80, Loss: 1.5429


100%|██████████| 196/196 [00:00<00:00, 293.31it/s]


Epoch 29/80, Loss: 1.5202


100%|██████████| 196/196 [00:00<00:00, 313.15it/s]


Epoch 30/80, Loss: 1.5135


100%|██████████| 196/196 [00:00<00:00, 317.18it/s]


Epoch 31/80, Loss: 1.4957


100%|██████████| 196/196 [00:00<00:00, 276.18it/s]


Epoch 32/80, Loss: 1.4865


100%|██████████| 196/196 [00:00<00:00, 261.59it/s]


Epoch 33/80, Loss: 1.4697


100%|██████████| 196/196 [00:00<00:00, 277.36it/s]


Epoch 34/80, Loss: 1.4641


100%|██████████| 196/196 [00:00<00:00, 298.30it/s]


Epoch 35/80, Loss: 1.4527


100%|██████████| 196/196 [00:00<00:00, 297.12it/s]


Epoch 36/80, Loss: 1.4439


100%|██████████| 196/196 [00:00<00:00, 299.22it/s]


Epoch 37/80, Loss: 1.4358


100%|██████████| 196/196 [00:00<00:00, 298.70it/s]


Epoch 38/80, Loss: 1.4304


100%|██████████| 196/196 [00:00<00:00, 332.44it/s]


Epoch 39/80, Loss: 1.4243


100%|██████████| 196/196 [00:00<00:00, 308.12it/s]


Epoch 40/80, Loss: 1.4194


100%|██████████| 196/196 [00:00<00:00, 302.06it/s]


Epoch 41/80, Loss: 1.4159


100%|██████████| 196/196 [00:00<00:00, 321.09it/s]


Epoch 42/80, Loss: 1.4112


100%|██████████| 196/196 [00:00<00:00, 312.17it/s]


Epoch 43/80, Loss: 1.4081


100%|██████████| 196/196 [00:00<00:00, 304.27it/s]


Epoch 44/80, Loss: 1.4052


100%|██████████| 196/196 [00:00<00:00, 304.42it/s]


Epoch 45/80, Loss: 1.4029


100%|██████████| 196/196 [00:00<00:00, 315.83it/s]


Epoch 46/80, Loss: 1.4006


100%|██████████| 196/196 [00:00<00:00, 306.65it/s]


Epoch 47/80, Loss: 1.3972


100%|██████████| 196/196 [00:00<00:00, 311.66it/s]


Epoch 48/80, Loss: 1.3959


100%|██████████| 196/196 [00:00<00:00, 291.62it/s]


Epoch 49/80, Loss: 1.3941


100%|██████████| 196/196 [00:00<00:00, 311.33it/s]


Epoch 50/80, Loss: 1.3922


100%|██████████| 196/196 [00:00<00:00, 294.52it/s]


Epoch 51/80, Loss: 1.3906


100%|██████████| 196/196 [00:00<00:00, 306.75it/s]


Epoch 52/80, Loss: 1.3902


100%|██████████| 196/196 [00:00<00:00, 305.09it/s]


Epoch 53/80, Loss: 1.3889


100%|██████████| 196/196 [00:00<00:00, 322.41it/s]


Epoch 54/80, Loss: 1.3875


100%|██████████| 196/196 [00:00<00:00, 349.62it/s]


Epoch 55/80, Loss: 1.3872


100%|██████████| 196/196 [00:00<00:00, 307.33it/s]


Epoch 56/80, Loss: 1.3864


100%|██████████| 196/196 [00:00<00:00, 316.50it/s]


Epoch 57/80, Loss: 1.3856


100%|██████████| 196/196 [00:00<00:00, 319.43it/s]


Epoch 58/80, Loss: 1.3851


100%|██████████| 196/196 [00:00<00:00, 321.75it/s]


Epoch 59/80, Loss: 1.3847


100%|██████████| 196/196 [00:00<00:00, 301.48it/s]


Epoch 60/80, Loss: 1.3840


100%|██████████| 196/196 [00:00<00:00, 300.57it/s]


Epoch 61/80, Loss: 1.3838


100%|██████████| 196/196 [00:00<00:00, 319.83it/s]


Epoch 62/80, Loss: 1.3834


100%|██████████| 196/196 [00:00<00:00, 311.88it/s]


Epoch 63/80, Loss: 1.3830


100%|██████████| 196/196 [00:00<00:00, 309.94it/s]


Epoch 64/80, Loss: 1.3828


100%|██████████| 196/196 [00:00<00:00, 300.97it/s]


Epoch 65/80, Loss: 1.3826


100%|██████████| 196/196 [00:00<00:00, 319.13it/s]


Epoch 66/80, Loss: 1.3823


100%|██████████| 196/196 [00:00<00:00, 320.25it/s]


Epoch 67/80, Loss: 1.3821


100%|██████████| 196/196 [00:00<00:00, 324.63it/s]


Epoch 68/80, Loss: 1.3820


100%|██████████| 196/196 [00:00<00:00, 337.70it/s]


Epoch 69/80, Loss: 1.3819


100%|██████████| 196/196 [00:00<00:00, 304.66it/s]


Epoch 70/80, Loss: 1.3817


100%|██████████| 196/196 [00:00<00:00, 313.78it/s]


Epoch 71/80, Loss: 1.3816


100%|██████████| 196/196 [00:00<00:00, 314.74it/s]


Epoch 72/80, Loss: 1.3815


100%|██████████| 196/196 [00:00<00:00, 307.65it/s]


Epoch 73/80, Loss: 1.3814


100%|██████████| 196/196 [00:00<00:00, 300.71it/s]


Epoch 74/80, Loss: 1.3813


100%|██████████| 196/196 [00:00<00:00, 295.71it/s]


Epoch 75/80, Loss: 1.3813


100%|██████████| 196/196 [00:00<00:00, 296.42it/s]


Epoch 76/80, Loss: 1.3812


100%|██████████| 196/196 [00:00<00:00, 286.06it/s]


Epoch 77/80, Loss: 1.3812


100%|██████████| 196/196 [00:00<00:00, 306.32it/s]


Epoch 78/80, Loss: 1.3811


100%|██████████| 196/196 [00:00<00:00, 305.15it/s]


Epoch 79/80, Loss: 1.3811


100%|██████████| 196/196 [00:00<00:00, 325.85it/s]

Epoch 80/80, Loss: 1.3810





In [33]:
model.eval()
with torch.no_grad():
    results = []
    for batch in tqdm(test_loader, total=len(test_loader)):

        embeddings = batch['embeddings'].to(DEVICE)


        outputs = model(embeddings.float()) 
        results.append(outputs.cpu().numpy())

    with open("result.txt", "w") as f:
        for val in np.concatenate(results):
            f.write(f"{val[0]}\n")

100%|██████████| 16/16 [00:00<00:00, 72.19it/s]
