In [None]:
import os
from google.colab import drive
# Mount Google Drive into the Colab filesystem, then make the project folder
# the working directory for the rest of the session.
# NOTE(review): relative paths used further down (e.g. "./results", "./logs")
# are resolved against this folder once the chdir has run.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/PLP Proj')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd

# Load the review dataset from Google Drive into a DataFrame.
csv_path = '/content/drive/MyDrive/PLP Proj/final_data_v3.csv'
df = pd.read_csv(csv_path)

# Bare last expression: the notebook renders the (truncated) frame as output.
df

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,review_text_clean,review_sentiment
0,10,Counter-Strike,This will be more of a ''my experience with th...,1,1,This will be more of a ''my experience with th...,Positive
1,10,Counter-Strike,"Easy to learn, hard to master.",1,1,"Easy to learn, hard to master.",Positive
2,10,Counter-Strike,"No r8 revolver, 10/10 will play again.",1,1,"No r8 revolver, 10/10 will play again.",Positive
3,10,Counter-Strike,Still better than Call of Duty: Ghosts...,1,1,Still better than Call of Duty: Ghosts...,Positive
4,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1,"cant buy skins, cases, keys, stickers - gaben ...",Positive
...,...,...,...,...,...,...,...
684102,99910,Puzzle Pirates,"Too addictive, spent way too much time on this...",-1,1,"Too addictive, spent way too much time on this...",Negative
684103,99910,Puzzle Pirates,Thought i was putting the age of my character ...,-1,1,Thought i was putting the age of my character ...,Negative
684104,99910,Puzzle Pirates,I care not for a godforsaken deckhand. Just be...,-1,1,I care not for a godforsaken deckhand. Just be...,Negative
684105,99910,Puzzle Pirates,"1.no tutorial 2.gameplay looks to much casual,...",-1,1,"1.no tutorial 2.gameplay looks to much casual,...",Negative


In [4]:
# Map the textual sentiment to the numeric class id used for training.
label2id = {'Negative': 0, 'Positive': 1}
df['label'] = df['review_sentiment'].map(label2id)

# Series.map yields NaN for any value that is not a key of label2id (including
# missing sentiments). That would silently turn the column into floats, so stop
# here with the offending values instead of failing later during training.
unmapped = df.loc[df['label'].isna(), 'review_sentiment'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"review_sentiment contains values missing from label2id: {list(unmapped)!r}"
    )

# With every row mapped, pin the column to an integer dtype.
df['label'] = df['label'].astype('int64')

In [5]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of the reviews, stratified on df['label'] so both
# parts keep the same class balance; the seed is fixed for repeatability.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['review_text_clean'],
    df['label'],
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)

# Second cut: divide the held-out part evenly into validation and test sets,
# again stratified on the labels.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    stratify=temp_labels,
    test_size=0.5,
    random_state=42,
)

In [9]:
from transformers import AutoTokenizer

# Identifier of the pretrained checkpoint; the model-loading cell reuses this
# name, so tokenizer and model come from the same checkpoint.
model_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [6]:
# The tokenizer needs real strings. Missing reviews (NaN) are replaced with an
# empty string first: astype(str) on its own would turn them into the literal
# text "nan", which would then be tokenized as if it were review content.
# Rows are kept (not dropped) so texts stay aligned with their labels.
train_texts = train_texts.fillna('').astype(str)
val_texts = val_texts.fillna('').astype(str)
test_texts = test_texts.fillna('').astype(str)

In [7]:
import torch
class SteamDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one review per lookup.

    Tokenization happens lazily in ``__getitem__`` and applies truncation but
    no padding, so items can have different lengths.

    Args:
        texts: Iterable of review strings.
        labels: Iterable of integer class ids, aligned one-to-one with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=max_length)`` that
            returns a mapping from field name to a list of values.
        max_length: Maximum number of tokens kept per review. Defaults to 512,
            the value that was previously hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Materialize both inputs so positional indexing works regardless of
        # the input container (e.g. a pandas Series with a shuffled index).
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return a dict of tensors.

        The dict holds one tensor per tokenizer output field plus a ``labels``
        entry with the example's class id.
        """
        encoding = self.tokenizer(
            self.texts[idx], truncation=True, max_length=self.max_length
        )
        item = {key: torch.tensor(val) for key, val in encoding.items()}
        # Class ids are stored as int64 regardless of the Python/NumPy type the
        # label arrived as.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a 2-class sequence-classification head.
# The checkpoint's own classifier has 3 outputs (the load log for this cell
# reports classifier.out_proj.weight with shape [3, 768]), so with
# ignore_mismatched_sizes=True the mismatched classifier weights are newly
# initialized, as that log states, and must be learned during fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,  # matches label2id: Negative / Positive
    ignore_mismatched_sizes=True
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

In [10]:
from transformers import DataCollatorWithPadding

# Wrap each split in a SteamDataset; examples are tokenized when accessed.
train_dataset = SteamDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = SteamDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)
test_dataset = SteamDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

# Dynamic padding: the collator pads examples when a batch is assembled,
# since the datasets above return unpadded encodings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        # Macro average: every class contributes equally to the score.
        'f1': f1_score(y_true, y_pred, average="macro"),
    }

In [None]:
import os
# Switch off the Weights & Biases logging integration via its environment
# variable before the Trainer is created.
# NOTE(review): the training cell's log reports that WANDB_DISABLED is
# deprecated and will be removed in v5; it recommends `report_to` ("none")
# instead.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration: one epoch, mixed precision, with evaluation and
# checkpointing on the same 1000-step cadence so the best checkpoint (by the
# "f1" value returned from compute_metrics) can be reloaded at the end.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    logging_strategy="steps",
    fp16=True,
    num_train_epochs=1,
    # Disable external logging integrations explicitly; this is the supported
    # replacement for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` supersedes the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1
1000,0.2354,0.219069,0.9119,0.89401
2000,0.206,0.217256,0.917586,0.899576
3000,0.2109,0.197371,0.920758,0.90502
4000,0.1966,0.193757,0.922922,0.908373


TrainOutput(global_step=4276, training_loss=0.22290253817644823, metrics={'train_runtime': 2538.4918, 'train_samples_per_second': 215.595, 'train_steps_per_second': 1.684, 'total_flos': 1.4395996416271104e+17, 'train_loss': 0.22290253817644823, 'epoch': 1.0})