In [None]:
# Mount to drive
# Exposes Google Drive under /content/drive. The requirements file and the
# PROJECT directory referenced later in this notebook both sit below that path,
# so this cell has to run before them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install requirements
# The requirements file is read from the mounted Drive, so the Drive mount must
# already have succeeded. %pip (rather than !pip) is an IPython magic.
%pip install -r "/content/drive/MyDrive/ml_projects/tdt13_nlp_sentiment/requirements.txt"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 11.7 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 64.9 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.13.5-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 53.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 70.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 46.8 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Do

In [None]:
# Required imports

# Data manipulation
import pandas as pd
import numpy as np
import datasets

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# ML
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.clip_grad import clip_grad_norm
from transformers import (AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel,
    AutoModelForMaskedLM, 
    AutoTokenizer,
    LineByLineTextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    AdamW,
    get_linear_schedule_with_warmup)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Setup
# Use matplotlib's 'ggplot' style sheet
plt.style.use('ggplot')

# Transfer work over to the first CUDA GPU if one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Project root on the mounted Google Drive
PROJECT = "/content/drive/MyDrive/ml_projects/tdt13_nlp_sentiment"

In [None]:
# Hugging Face model name; used for both the tokenizer and the model architecture
BASE_MODEL = "roberta-base"
# Not referenced by the inference cells visible here — presumably mirrors the training run; TODO confirm
LEARNING_RATE = 5e-5
# Token length that inputs are truncated / padded to when tokenizing
MAX_LENGTH = 512
# Number of reviews per inference batch
BATCH_SIZE = 16
# Not referenced by the inference cells visible here — presumably mirrors the training run; TODO confirm
EPOCHS = 20

In [None]:
# Load the raw test split as a DatasetDict with a single "test" entry.
# The CSV uses "█" as its column delimiter, and the path is relative to the
# notebook's current working directory (not to PROJECT).
raw_test_files = {"test": "dataset_raw/test.csv"}
ds = datasets.load_dataset("csv", delimiter="█", data_files=raw_test_files)
ds

Using custom data configuration default-1bd89c23fc909d1a
Found cached dataset csv (C:/Users/Jonas/.cache/huggingface/datasets/csv/default-1bd89c23fc909d1a/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 58.76it/s]


DatasetDict({
    test: Dataset({
        features: ['id', 'labels', 'text'],
        num_rows: 9908
    })
})

In [None]:
# Initialise the tokeniser that belongs to BASE_MODEL (the model itself is loaded in a separate cell)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Build the BASE_MODEL architecture with a single output logit (used further
# down as the predicted rating) and place it on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1).to(device)

# Overwrite the freshly initialised weights with the fine-tuned checkpoint.
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved from a GPU can still be loaded when the notebook falls back to the CPU.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
checkpoint = torch.load(
    f"{PROJECT}/models/roberta_model_v2/checkpoint-7080/pytorch_model.bin",
    map_location=device,
)

# Last expression on purpose: the returned key-matching report is the cell output.
model.load_state_dict(checkpoint)

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

<All keys matched successfully>

In [None]:
import re


def preprocess_function(text):
    """Clean one raw review string before it is handed to the tokenizer.

    The steps run in this fixed order:

    1. Every ``@`` plus the shortest run of characters up to and including the
       next whitespace character is replaced by a single space. An ``@...``
       token at the very end of the text (no whitespace after it) is kept.
    2. ``&amp;`` is turned back into ``&``.
    3. Runs of whitespace are collapsed to one space and the ends are stripped.
    4. ``<...>`` tags are deleted.

    NOTE(review): tags are removed *after* whitespace has been collapsed, so a
    removed tag can leave two adjacent spaces (or a leading/trailing space)
    behind. Kept as-is so inference inputs stay identical to what this function
    has produced so far.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The cleaned review as a ``str``.
    """
    without_mentions = re.sub(r'(@.*?)[\s]', ' ', text)
    unescaped = re.sub(r'&amp;', '&', without_mentions)
    single_spaced = re.sub(r'\s+', ' ', unescaped).strip()
    return re.sub('<[^<]+?>', '', single_spaced)

In [None]:
import math
DATA = f"{PROJECT}/huggingface_dataset/"

# Test split used for evaluation; the file uses "█" as its column delimiter.
test_df = pd.read_csv(f"{DATA}/test1.csv", delimiter="█")
# Hand-written sample review used as a quick single-input sanity check.
test_string = "This movie would have been better if they had been so much more than $5 million to get to the hotel. I was struck by how bad this movie is. I couldn't say it was the worst movie I've ever seen. I've seen better movies. I will have to mention that I did not like it either. I will never see it again. I gave it a 5.5 because it's a movie that's too much for a horror movie. I can't blame the actors for their performances. They did a very good job. They had enough time to do a great job. I wasn't sure who was the killer. It wasn't clear who was who the killer. Who knows?"

# One forward pass per BATCH_SIZE rows; ceil() keeps the final, partial batch.
nb_batches = math.ceil(len(test_df.index)/BATCH_SIZE)
# Filled below with one prediction per row of test_df, in row order. The
# evaluation cell builds a DataFrame from it, so it must not stay empty.
y_preds = []

def change_val(x):
    """Return ``x`` converted to ``float``."""
    x = float(x)
    return x

# Inference only: make sure dropout is off and skip autograd bookkeeping so the
# full test set can be scored without accumulating graphs in memory.
model.eval()

with torch.no_grad():
    # Single-review sanity check. Inputs go to `device` (the same device the
    # model was moved to) instead of a hard-coded "cuda", so the cell also runs
    # on a CPU-only machine.
    test_string = preprocess_function(test_string)
    encoded = tokenizer(test_string, truncation=True, padding="max_length", max_length=MAX_LENGTH, return_tensors="pt").to(device)
    pred = model(**encoded).logits.reshape(-1).tolist()

    # Score the whole test set batch by batch.
    for i in range(nb_batches):
        batch_rows = test_df[i * BATCH_SIZE: (i+1) * BATCH_SIZE]
        text_batch = list(batch_rows["text"].apply(preprocess_function))

        encoded_batch = tokenizer(text_batch, truncation=True, padding="max_length", max_length=MAX_LENGTH, return_tensors="pt").to(device)
        # logits has one value per review; flatten to a plain list of floats.
        y_preds += model(**encoded_batch).logits.reshape(-1).tolist()

pred

[3.476654529571533]

In [None]:
import numpy


# Set pandas' display row limit to 500
pd.set_option('display.max_rows', 500)

# One row per test review: the text, its true rating and the raw model output
d = {'Text': test_df["text"], 'Rating': test_df["labels"], "Prediction": y_preds}
df = pd.DataFrame(data=d)
df["Rounded Prediction"] = df["Prediction"].apply(round)

# Rows where the rounded prediction misses the rating at all, and rows where it
# misses by more than one rating point
exact_miss = df["Rating"].ne(df["Rounded Prediction"])
miss_beyond_tolerance = (df["Rating"] - df["Rounded Prediction"]).abs().gt(1)
incorrect_cases = df.loc[exact_miss]
incorrect_cases_tolerance = df.loc[miss_beyond_tolerance]

# Accuracy = 1 - (share of rows counted as incorrect)
n_reviews = len(test_df.index)
accuracy = 1 - len(incorrect_cases.index) / n_reviews
accuracy_w_tolerance = 1 - len(incorrect_cases_tolerance.index) / n_reviews

incorrect_cases
# TODO: add a table showing which rating was misclassified most often
# TODO: add a table showing which rating has the largest deviation on average

Unnamed: 0,Text,Rating,Prediction,Rounded Prediction
0,Le meilleur film que j'ai regards.Joaquin Phoe...,9,7.911590,8
1,"The story is seen before, but that does'n matt...",1,1.527801,2
2,We've all been around that guy; the guy who dr...,6,5.139964,5
3,The acting- fantastic. The story- amazing. The...,8,9.892565,10
6,STAR RATING: ***** Saturday Night **** Friday ...,9,6.984258,7
...,...,...,...,...
9901,This is a nice little horror flick that fans o...,9,7.655769,8
9902,Pros: Phoenix shows you a very gripping fall i...,3,4.727397,5
9903,The story of how the (communist) leader who fr...,8,9.813627,10
9905,The monster from Enemy Mine somehow made his w...,2,2.835598,3


In [None]:
# Exact-match accuracy, then accuracy when a miss of at most one rating point is tolerated
print(accuracy, accuracy_w_tolerance)

0.4615462252725071 0.8221639079531692
