In [None]:
# Mount Google Drive at /content/drive so the project files stored there are
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the project's requirements file from Drive. %pip (rather than !pip)
# installs into the environment of the running kernel.
%pip install -r "/content/drive/MyDrive/ml_projects/tdt13_nlp_sentiment/requirements.txt"

In [None]:
# Required imports

# Data manipulation
import pandas as pd
import numpy as np
import datasets
import re

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import wandb

# ML
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.clip_grad import clip_grad_norm
from transformers import (AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel,
    AutoModelForMaskedLM, 
    AutoTokenizer,
    LineByLineTextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    AdamW,
    get_linear_schedule_with_warmup)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Global setup: plot style, compute device, project root, experiment tracking
plt.style.use("ggplot")

# Run on the first CUDA GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Root folder on Drive; the dataset CSVs and model checkpoints live below it.
PROJECT = "/content/drive/MyDrive/ml_projects/tdt13_nlp_sentiment"

# Authenticate with Weights & Biases (the training run reports to it).
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33molejlia[0m ([33mcgo-testing[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Load the three CSV splits into a single DatasetDict.
# The files use "█" as the column separator.
ds = datasets.load_dataset(
    "csv",
    data_files=dict(
        train=f"{PROJECT}/huggingface_dataset/train.csv",
        test=f"{PROJECT}/huggingface_dataset/test1.csv",
        validation=f"{PROJECT}/huggingface_dataset/val.csv",
    ),
    delimiter="█",
)

ds



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-ae7081b0e086ea7c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

   

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #2:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return func(*args, **kwargs)


Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-ae7081b0e086ea7c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'labels', 'text'],
        num_rows: 56618
    })
    test: Dataset({
        features: ['id', 'labels', 'text'],
        num_rows: 9908
    })
    validation: Dataset({
        features: ['id', 'labels', 'text'],
        num_rows: 4246
    })
})

In [None]:
# Presets and constants

BASE_MODEL = "roberta-base"  # checkpoint name loaded for both the tokenizer and the model
LEARNING_RATE = 5e-5  # learning rate handed to TrainingArguments
MAX_LENGTH = 512  # every text is truncated / padded to this many tokens
BATCH_SIZE = 32  # per-device batch size for training and evaluation
EPOCHS = 5  # number of training epochs

In [None]:
# Load the pretrained tokeniser and base model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# num_labels=1 requests a single-output head; its one logit is trained with an
# MSE loss and used as the predicted rating.
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)
model = model.to(device)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [None]:
# Show the dataset splits again (raw columns, before preprocessing).
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'labels', 'text'],
        num_rows: 56618
    })
    test: Dataset({
        features: ['id', 'labels', 'text'],
        num_rows: 9908
    })
    validation: Dataset({
        features: ['id', 'labels', 'text'],
        num_rows: 4246
    })
})

In [None]:
def preprocess_function(dataset):
    """Clean one example's text, tokenize it and attach a float label.

    Applied per example by ``ds.map`` (non-batched), so ``dataset`` is a
    single row.

    Args:
        dataset: Mapping with a ``"text"`` string and a ``"labels"`` value
            convertible with ``float``.

    Returns:
        The tokenizer output for the cleaned text, truncated/padded to
        ``MAX_LENGTH``, with an extra ``"label"`` entry holding the rating
        as a ``float``.
    """
    text = dataset["text"]

    # Remove html tags first, leaving a space behind, so that words separated
    # only by markup (e.g. "end.<br /><br />Next") are not fused together.
    text = re.sub(r'<[^<]+?>', ' ', text)

    # Remove @mentions: an '@' up to and including the next whitespace character
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Collapse whitespace runs and trim the ends; done last so the
    # substitutions above cannot leave double or leading/trailing spaces.
    text = re.sub(r'\s+', ' ', text).strip()

    encoded = tokenizer(text, truncation=True, padding="max_length", max_length=MAX_LENGTH)

    # The rating is stored as a real number because it is a regression target.
    encoded["label"] = float(dataset["labels"])
    return encoded


# Apply the preprocessing to every split and drop the raw columns, leaving the
# tokenizer outputs plus "label".
# NOTE(review): this rebinds `ds`, so re-running the cell on the already
# mapped dataset is expected to fail ("id", "text" and "labels" are gone) —
# reload the dataset first.
ds = ds.map(preprocess_function, remove_columns=["id", "text", "labels"])

  0%|          | 0/56618 [00:00<?, ?ex/s]

  0%|          | 0/9908 [00:00<?, ?ex/s]

  0%|          | 0/4246 [00:00<?, ?ex/s]

In [None]:
#small_train = ds["train"].shuffle(seed=42).select(range(3000))
#small_test = ds["test"].shuffle(seed=42).select(range(500))

In [None]:
# Evaluation metrics
# Fetched from:
#   https://lajavaness.medium.com/regression-with-text-input-using-bert-and-transformers-71c155034b13

def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, targets)``. The targets are reshaped
            to a single column before being compared with the predictions.

    Returns:
        dict with the keys ``"mse"``, ``"mae"``, ``"r2"`` and ``"accuracy"``.
    """
    predictions, targets = eval_pred
    targets = targets.reshape(-1, 1)

    metrics = {
        "mse": mean_squared_error(targets, predictions),
        "mae": mean_absolute_error(targets, predictions),
        "r2": r2_score(targets, predictions),
    }

    # Accuracy: a prediction counts as correct when its absolute error is
    # below 0.5 (it rounds to the true score), i.e. squared error below 0.25.
    squared_errors = (predictions - targets).reshape(-1) ** 2
    n_correct = int(np.count_nonzero(squared_errors < 0.25))
    metrics["accuracy"] = n_correct / squared_errors.size

    return metrics

In [None]:
# Training configuration handed to the Trainer

training_args = TrainingArguments(
    # Checkpoints are written below the project folder
    output_dir=f"{PROJECT}/models/roberta_model_v2",
    # Optimisation
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Reload the checkpoint with the best "accuracy" once training ends
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Experiment tracking
    report_to="wandb",
)

In [None]:
# Override transformers Trainer loss() function for use on regressor

class RegressionTrainer(Trainer):
    """Trainer whose loss is the mean-squared error on the model's single output."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the MSE between the model's first logit and the labels.

        Args:
            model: Model invoked as ``model(**inputs)``.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted and ignored. Recent ``transformers``
                releases pass this keyword to ``compute_loss``; an override
                without it raises ``TypeError`` there.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # With the labels removed, the first output is the logits; column 0 is
        # the single regression value per example.
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Fine-tune on the training split, evaluating on the validation split

trainer = RegressionTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics_for_regression,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
)

trainer.train()

***** Running training *****
  Num examples = 56618
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 8850
  Number of trainable parameters = 124646401
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Mse,Mae,R2,Accuracy
1,2.4013,2.246503,2.246503,1.030272,0.780007,0.347621
2,1.6344,2.38551,2.38551,1.038969,0.766394,0.374941
3,1.2031,1.862939,1.862939,0.885512,0.817568,0.45431
4,0.8805,1.883312,1.883312,0.87903,0.815573,0.465379
5,0.6625,1.836324,1.836325,0.851984,0.820174,0.489637


***** Running Evaluation *****
  Num examples = 4246
  Batch size = 32
Saving model checkpoint to /content/drive/MyDrive/ml_projects/tdt13_nlp_sentiment/models/roberta_model_v2/checkpoint-1770
Configuration saved in /content/drive/MyDrive/ml_projects/tdt13_nlp_sentiment/models/roberta_model_v2/checkpoint-1770/config.json
Model weights saved in /content/drive/MyDrive/ml_projects/tdt13_nlp_sentiment/models/roberta_model_v2/checkpoint-1770/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4246
  Batch size = 32
Saving model checkpoint to /content/drive/MyDrive/ml_projects/tdt13_nlp_sentiment/models/roberta_model_v2/checkpoint-3540
Configuration saved in /content/drive/MyDrive/ml_projects/tdt13_nlp_sentiment/models/roberta_model_v2/checkpoint-3540/config.json
Model weights saved in /content/drive/MyDrive/ml_projects/tdt13_nlp_sentiment/models/roberta_model_v2/checkpoint-3540/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4246
  Batch size = 32
Saving mode

TrainOutput(global_step=8850, training_loss=1.500251897607146, metrics={'train_runtime': 6600.6217, 'train_samples_per_second': 42.888, 'train_steps_per_second': 1.341, 'total_flos': 7.448343989910528e+16, 'train_loss': 1.500251897607146, 'epoch': 5.0})

In [None]:
import math

# Raw test split as a DataFrame; the error analysis needs the review text and
# the true rating next to each prediction.
test_df = pd.read_csv(f"{PROJECT}/huggingface_dataset/test1.csv", delimiter="█")

# Predict on the *preprocessed* test split rather than re-tokenizing the raw
# CSV text: this way inference sees exactly the same text cleaning and
# tokenization as training. Trainer.predict also handles batching, device
# placement, eval mode and disabling gradient tracking, which the previous
# hand-written loop did not (it hard-coded "cuda" and never left train mode
# explicitly).
y_preds = trainer.predict(ds["test"]).predictions.reshape(-1).tolist()

# ds["test"] was loaded from the same CSV, so predictions and test_df rows
# must line up one-to-one.
assert len(y_preds) == len(test_df.index), "prediction count does not match test rows"

  return func(*args, **kwargs)


In [None]:
import numpy


# Let pandas render up to 500 rows when a frame is displayed
pd.set_option("display.max_rows", 500)

# One row per test review: text, true rating, raw and rounded prediction
d = {"Text": test_df["text"], "Rating": test_df["labels"], "Prediction": y_preds}
df = pd.DataFrame(d)
df["Rounded Prediction"] = df["Prediction"].apply(round)

# Rows whose rounded prediction misses the rating at all / by more than one
is_wrong = df["Rating"].ne(df["Rounded Prediction"])
is_far_off = (df["Rating"] - df["Rounded Prediction"]).abs().gt(1)
incorrect_cases = df.loc[is_wrong]
incorrect_cases_tolerance = df.loc[is_far_off]

# Share of test rows that are exactly right / within one rating point
n_test_rows = len(test_df.index)
accuracy = 1 - len(incorrect_cases.index) / n_test_rows
accuracy_w_tolerance = 1 - len(incorrect_cases_tolerance.index) / n_test_rows

Unnamed: 0,Text,Rating,Prediction,Rounded Prediction
3,The acting- fantastic. The story- amazing. The...,8,9.928410,10
6,STAR RATING: ***** Saturday Night **** Friday ...,9,6.811824,7
8,Every time whenever i expect something from Ma...,6,7.904014,8
11,A very carelessly written film. Poor character...,1,2.914773,3
12,I only voted excellent because this film took ...,10,8.459162,8
...,...,...,...,...
9896,This movie was so dumb and slow was it ever sl...,3,1.193085,1
9898,"""Written on the Wind"" is a Douglas Sirk's melo...",7,3.627724,4
9899,Thomas Edison had no other reason to make this...,1,3.781042,4
9902,Pros: Phoenix shows you a very gripping fall i...,3,4.742036,5


In [None]:

incorrect_cases
# TODO: add a table showing which rating was misclassified most often
# TODO: add a table showing which rating has the largest deviation on average

Unnamed: 0,Text,Rating,Prediction,Rounded Prediction
0,Le meilleur film que j'ai regards.Joaquin Phoe...,9,7.907640,8
2,We've all been around that guy; the guy who dr...,6,4.505195,5
3,The acting- fantastic. The story- amazing. The...,8,9.928410,10
5,"I have to ask myself, do movies like this get ...",2,1.387017,1
6,STAR RATING: ***** Saturday Night **** Friday ...,9,6.811824,7
...,...,...,...,...
9901,This is a nice little horror flick that fans o...,9,7.566714,8
9902,Pros: Phoenix shows you a very gripping fall i...,3,4.742036,5
9903,The story of how the (communist) leader who fr...,8,9.768699,10
9905,The monster from Enemy Mine somehow made his w...,2,2.613206,3
