## 1 Fine-Tunning Section
Due to the limitation of computing resources and the time constrain, Initially, I tried to use partial dataset to fine-tune the model. In reality, cloud computing resources or a laptop with GPU should be used to accelerate the training process. <br>
When actually starting training the model, it took too long to even barely finish training epoches when reaching to the end of the exam time. However, there was no error countered during the compiling and while running the preparation data codes. Therefore, the codes should all work but just need more time to train and get the evaluation metrics.

In [1]:
# import libararies
import pandas as pd
import numpy as np

import torch
from torch import nn
from torch.optim import Adam

# from GPUtil import showUtilization as gpu_usage
# !pip install numba
# from numba import cuda

from tqdm import tqdm

import pyarrow.parquet as pq

# !pip install transformers
from transformers import RobertaTokenizer, RobertaModel, Pipeline
from transformers.pipelines import PIPELINE_REGISTRY


In [4]:
# Open dataset for test and train

# Open the Parquet file
train_file = pq.ParquetFile('./train-sentiment.parquet')
test_file = pq.ParquetFile('./test-sentiment.parquet')

# Get the schema of the file
schema = train_file.schema

# Read the data into a Pandas DataFrame
train_raw = train_file.read().to_pandas()
test = test_file.read().to_pandas()

# Print the schema and data
print(schema)
print(train_raw.head())
print(test.head())


<pyarrow._parquet.ParquetSchema object at 0x000001EEB01770C0>
required group field_id=-1 schema {
  optional binary field_id=-1 sentence (String);
  optional int64 field_id=-1 label;
}

                                            sentence  label
0  Altia 's operating profit jumped to EUR 47 mil...      2
1  The agreement was signed with Biohit Healthcar...      2
2  Kesko pursues a strategy of healthy , focused ...      2
3  Vaisala , headquartered in Helsinki in Finland...      1
4  Also , a six-year historic analysis is provide...      1
                                            sentence  label
0  TeliaSonera TLSN said the offer is in line wit...      2
1  STORA ENSO , NORSKE SKOG , M-REAL , UPM-KYMMEN...      2
2  Clothing retail chain Sepp+ñl+ñ 's sales incre...      2
3  Lifetree was founded in 2000 , and its revenue...      2
4  Nordea Group 's operating profit increased in ...      2


In [5]:
# Generate dataset for fine-tunning
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = list(df['label'].values)
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['sentence']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y


In [6]:
# Split raw train dataset into train and validation datasets
np.random.seed(112)

# Define the proportion of data to use for validating
val_size = 0.2

# Calculate the number of samples to use for testing
num_val_samples = int(len(train_raw) * val_size)

# Generate a random permutation of the data indices
indices = np.random.permutation(len(train_raw))

# Split the indices into training and testing indices
val_indices = indices[:num_val_samples]
train_indices = indices[num_val_samples:]

# Split the data into training and testing sets
train_data = train_raw.iloc[train_indices]
val_data = train_raw.iloc[val_indices]

# Print the shapes of the training and testing sets
print('Training set shape:', train_data.shape)
print('Testing set shape:', val_data.shape)

Training set shape: (3102, 2)
Testing set shape: (775, 2)


### 1.1 Freeze all the layers expect of classifier and tune head only

In [25]:
# Construct the classifier for sentiment analysis
class SentimentCalssify(nn.Module):

    def __init__(self, dropout=0.5):

        super(SentimentCalssify, self).__init__()

        self.roberta = RobertaModel.from_pretrained('roberta-base')

        # Freeze all layers except the classifier (the last layer)
        for name, param in self.roberta.named_parameters():
            param.requires_grad = False
        self.l1 = nn.Linear(768, 768)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.LeakyReLU()
        self.linear = nn.Linear(768, 3)
        

    def forward(self, input_id, mask):

        _, pooled_output = self.roberta(input_ids= input_id, attention_mask=mask,return_dict=False)
        pre_classify = self.l1(pooled_output)
        dropout_output = self.dropout(pre_classify)
        activate_output = self.activation(dropout_output)
        linear_output = self.linear(activate_output)

        return linear_output

In [41]:
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=8, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=8)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.empty_cache()
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    criterion = nn.CrossEntropyLoss()
    non_frozen_params = [p for p in model.parameters() if p.requires_grad]
    print(f"tunable parameters: {[n for n, p in model.named_parameters() if p.requires_grad]}")
    optimizer = Adam(non_frozen_params, lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):
            # for train_input, train_label in train_dataloader:
                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)
                model.zero_grad()
                
                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')

In [39]:
EPOCHS = 5
model = SentimentCalssify(dropout=0.3)
LR = 1e-5

train(model, train_data, val_data, LR, EPOCHS)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tunable parameters: ['l1.weight', 'l1.bias', 'linear.weight', 'linear.bias']


100%|██████████| 311/311 [01:41<00:00,  3.07it/s]


Epochs: 1 | Train Loss:  0.097                 | Train Accuracy:  0.569                 | Val Loss:  0.094                 | Val Accuracy:  0.588


100%|██████████| 311/311 [01:41<00:00,  3.06it/s]


Epochs: 2 | Train Loss:  0.092                 | Train Accuracy:  0.601                 | Val Loss:  0.095                 | Val Accuracy:  0.588


100%|██████████| 311/311 [01:42<00:00,  3.03it/s]


Epochs: 3 | Train Loss:  0.092                 | Train Accuracy:  0.601                 | Val Loss:  0.094                 | Val Accuracy:  0.588


100%|██████████| 311/311 [01:43<00:00,  3.00it/s]


Epochs: 4 | Train Loss:  0.092                 | Train Accuracy:  0.601                 | Val Loss:  0.094                 | Val Accuracy:  0.588


100%|██████████| 311/311 [01:44<00:00,  2.97it/s]


Epochs: 5 | Train Loss:  0.092                 | Train Accuracy:  0.601                 | Val Loss:  0.094                 | Val Accuracy:  0.588


### 1.2 Unfreeze last 3 layers of the model and tune them

In [42]:
# Unfreeze the last 3 layers and retrain the model
EPOCHS = 5
LR = 1e-5

for name, param in model.roberta.named_parameters():
  temp = name.split('.')
  if temp[2].isdigit() and int(temp[2]) >= 9 or temp[0] == "pooler" : 
    param.requires_grad = True
              
train(model, train_data, val_data, LR, EPOCHS)

tunable parameters: ['roberta.encoder.layer.9.attention.self.query.weight', 'roberta.encoder.layer.9.attention.self.query.bias', 'roberta.encoder.layer.9.attention.self.key.weight', 'roberta.encoder.layer.9.attention.self.key.bias', 'roberta.encoder.layer.9.attention.self.value.weight', 'roberta.encoder.layer.9.attention.self.value.bias', 'roberta.encoder.layer.9.attention.output.dense.weight', 'roberta.encoder.layer.9.attention.output.dense.bias', 'roberta.encoder.layer.9.attention.output.LayerNorm.weight', 'roberta.encoder.layer.9.attention.output.LayerNorm.bias', 'roberta.encoder.layer.9.intermediate.dense.weight', 'roberta.encoder.layer.9.intermediate.dense.bias', 'roberta.encoder.layer.9.output.dense.weight', 'roberta.encoder.layer.9.output.dense.bias', 'roberta.encoder.layer.9.output.LayerNorm.weight', 'roberta.encoder.layer.9.output.LayerNorm.bias', 'roberta.encoder.layer.10.attention.self.query.weight', 'roberta.encoder.layer.10.attention.self.query.bias', 'roberta.encoder.laye

100%|██████████| 388/388 [03:28<00:00,  1.86it/s]


Epochs: 1 | Train Loss:  0.074                 | Train Accuracy:  0.753                 | Val Loss:  0.045                 | Val Accuracy:  0.861


100%|██████████| 388/388 [03:31<00:00,  1.83it/s]


Epochs: 2 | Train Loss:  0.038                 | Train Accuracy:  0.874                 | Val Loss:  0.042                 | Val Accuracy:  0.863


100%|██████████| 388/388 [03:35<00:00,  1.80it/s]


Epochs: 3 | Train Loss:  0.027                 | Train Accuracy:  0.915                 | Val Loss:  0.046                 | Val Accuracy:  0.855


100%|██████████| 388/388 [03:35<00:00,  1.80it/s]


Epochs: 4 | Train Loss:  0.019                 | Train Accuracy:  0.943                 | Val Loss:  0.051                 | Val Accuracy:  0.837


100%|██████████| 388/388 [03:32<00:00,  1.83it/s]


Epochs: 5 | Train Loss:  0.011                 | Train Accuracy:  0.971                 | Val Loss:  0.056                 | Val Accuracy:  0.854


### 1.3 Evaluate finetuned model

In [43]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=10)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

idx = int(0.8 * test.shape[0])
evaluate(model, test.iloc[:idx, :])

Test Accuracy:  0.867


### 1.4 Use Pipelines to create prediction pipline

In [44]:
# Create a new pipeline for re-trained RoBerta model
from transformers import Pipeline


class MyPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        if "maybe_arg" in kwargs:
            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs, maybe_arg=2):
        model_input = Tensor(inputs["input_ids"])
        return {"model_input": model_input}

    def _forward(self, model_inputs):
        # model_inputs == {"model_input": model_input}
        outputs = self.model(**model_inputs)
        # Maybe {"logits": Tensor(...)}
        return outputs

    def postprocess(self, model_outputs):
        best_class = model_outputs["logits"].softmax(-1)
        return best_class

KeyError: "Unknown task sentiment-classification, available tasks are ['audio-classification', 'automatic-speech-recognition', 'conversational', 'depth-estimation', 'document-question-answering', 'feature-extraction', 'fill-mask', 'image-classification', 'image-segmentation', 'image-to-text', 'mask-generation', 'ner', 'object-detection', 'question-answering', 'sentiment-analysis', 'summarization', 'table-question-answering', 'text-classification', 'text-generation', 'text2text-generation', 'token-classification', 'translation', 'video-classification', 'visual-question-answering', 'vqa', 'zero-shot-audio-classification', 'zero-shot-classification', 'zero-shot-image-classification', 'zero-shot-object-detection', 'translation_XX_to_YY']"