In [None]:
# Install the packages listed in requirements.txt. The %pip magic (rather than
# !pip) is used so the install targets the environment the kernel runs in.
%pip install -r "requirements.txt"

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Select matplotlib's 'ggplot' style sheet for the figures in this notebook.
plt.style.use('ggplot')

In [None]:
# Dataset locations. The base directory can be overridden through the
# ACLIMDB_DIR environment variable; when it is unset the original
# machine-specific path is used, so existing setups keep working.
ACLIMDB_DIR = os.environ.get("ACLIMDB_DIR", "C:/Users/Jonas/ntnu_5/tdt13_nlp/aclImdb")
RAW_DATA = f"{ACLIMDB_DIR}/dataset_raw"
SENTIMENT_DATA = f"{ACLIMDB_DIR}/dataset_sentiment"

# The CSV files are delimited by "█" (U+2588). That separator is longer than
# one byte in UTF-8, which pandas' C parser does not support, so pandas would
# fall back to the Python parser and emit a ParserWarning. Asking for the
# Python engine explicitly gives the same result without the warning.
train_df, test_df, val_df = (
    pd.read_csv(f"{RAW_DATA}/{split}.csv", delimiter="█", engine="python")
    for split in ("train", "test", "validation")
)

In [5]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import torch

In [6]:
# Checkpoint name handed to both `from_pretrained` calls below.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model = model.to(device)

In [8]:
import re
# Preprocess data
def preprocess_function(text):
    """Clean one raw text string before it is tokenized.

    Steps, in order:
      1. Replace HTML tags (e.g. ``<br />``) with a space, so the words on
         either side of a tag are not glued together.
      2. Remove ``@mention`` tokens, including one at the very end of the text.
      3. Replace the ``&amp;`` entity with ``&``.
      4. Collapse every run of whitespace into a single space and strip both
         ends.

    Whitespace is normalised last so that the gaps left behind by the earlier
    removals are cleaned up as well.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # Tags become a space rather than an empty string: "end.<br />Next" must
    # not turn into "end.Next".
    text = re.sub(r'<[^<]+?>', ' ', text)

    # "@" plus any following non-whitespace, up to whitespace or end of text.
    text = re.sub(r'@\S*(?:\s|$)', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Collapse whitespace runs and drop leading/trailing whitespace.
    return re.sub(r'\s+', ' ', text).strip()

# Clean the "text" column of every split with preprocess_function, writing the
# result back into the same column.
for split_df in (train_df, test_df, val_df):
    split_df["text"] = split_df["text"].apply(preprocess_function)

In [9]:
def polarity_scores_roberta(example):
    """Score one text with the sentiment model and return class probabilities.

    The text is tokenized (truncated to at most 512 tokens), moved to
    ``device``, run through the module-level ``model``, and the scores of the
    single input are turned into probabilities with a softmax.

    Args:
        example: The text to score.

    Returns:
        dict with the keys ``"roberta_neg"``, ``"roberta_neu"`` and
        ``"roberta_pos"``, holding the softmax outputs at index 0, 1 and 2.
    """
    encoded_text = tokenizer(example, truncation=True, return_tensors="pt", max_length=512).to(device)

    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every scored text.
    with torch.no_grad():
        output = model(**encoded_text)

    # First element of the model output (presumably the logits -- confirm
    # against the model class), first row of the one-item batch.
    scores = output[0][0].cpu().detach().numpy()
    scores = softmax(scores)

    return {
        "roberta_neg": scores[0],
        "roberta_neu": scores[1],
        "roberta_pos": scores[2],
    }

In [None]:
from tqdm import tqdm

def polarity_score_dataset(df):
    """Run ``polarity_scores_roberta`` over every row of ``df``.

    Args:
        df: DataFrame with a ``"text"`` column and an ``"id"`` column.

    Returns:
        dict mapping each row's ``"id"`` to the score dict computed from its
        ``"text"``. A row whose scoring raises ``RuntimeError`` is reported on
        stdout and left out of the result.
    """
    scores_by_id = {}
    row_count = len(df.index)
    progress = tqdm(df.iterrows(), total=row_count)

    for _, record in progress:
        try:
            text = record["text"]
            text_id = record["id"]
            scores_by_id[text_id] = polarity_scores_roberta(text)
        except RuntimeError:
            # Best effort: report the failing id and continue with the next row.
            print(f"[ERROR] Broke for id {text_id}")

    return scores_by_id

# Score every split. Each result is bound to its name as soon as that split
# finishes, so an interruption during a later split keeps the earlier results.
train_polarity = polarity_score_dataset(train_df)
test_polarity = polarity_score_dataset(test_df)
val_polarity = polarity_score_dataset(val_df)

In [None]:
# The score dicts are keyed by id, so the constructor yields one column per id;
# transpose to get one row per id with the roberta_* scores as columns.
train_polarity_df = pd.DataFrame(train_polarity).transpose()
test_polarity_df = pd.DataFrame(test_polarity).transpose()
val_polarity_df = pd.DataFrame(val_polarity).transpose()

# Number of scored rows per split (train, test, validation).
for polarity_df in (train_polarity_df, test_polarity_df, val_polarity_df):
    print(len(polarity_df.index))

# Single-column "labels" frames indexed by the "id" column of each split.
train_labels = train_df[["labels"]].set_index(train_df["id"])
test_labels = test_df[["labels"]].set_index(test_df["id"])
val_labels = val_df[["labels"]].set_index(val_df["id"])

In [None]:
# Put each split's labels and sentiment scores side by side (column-wise
# concat, aligned on the index).
merged_train, merged_test, merged_val = (
    pd.concat([labels, polarity], axis=1)
    for labels, polarity in (
        (train_labels, train_polarity_df),
        (test_labels, test_polarity_df),
        (val_labels, val_polarity_df),
    )
)

merged_val

In [51]:
# Write every merged split to the sentiment dataset directory, using the same
# "█" separator as the input files.
outputs = {
    f"{SENTIMENT_DATA}/train.csv": merged_train,
    f"{SENTIMENT_DATA}/test.csv": merged_test,
    f"{SENTIMENT_DATA}/validation.csv": merged_val,
}
for out_path, merged in outputs.items():
    merged.to_csv(out_path, sep="█")