# Import Libraries

In [14]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load Dataset     

In [9]:
class SentimentDataset(Dataset):
    def __init__(self, tokenizer, filepath, max_len=512):
        self.df = pd.read_csv(filepath)
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.texts = self.df['text'].tolist()  # The line of text column
        self.labels = self.df['label'].tolist()  # The line of label column

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Create datasets
# Load the dataset
# Since the original dataset was too large so I only use 1/10
df = pd.read_csv('Train.csv')
df1 = pd.read_csv('Test.csv')
# Sample 10% of the dataframe without replacement
df_sampled = df.sample(frac=0.1, random_state=42)  
df1_sampled = df1.sample(frac=0.1, random_state=42) 

# Save the sampled dataframe to a new CSV file
df_sampled.to_csv('train_sampled.csv', index=False)
df1_sampled.to_csv('test_sampled.csv', index=False)

train_dataset = SentimentDataset(tokenizer, 'train_sampled.csv', max_len=128)
test_dataset = SentimentDataset(tokenizer, 'test_sampled.csv', max_len=128)

# RoBERT Model

In [10]:
# RoBERT model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# This function used to calculate the performance
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs we set 3 since dataset was too large
    per_device_train_batch_size=32,   # batch size for training set 32 to have a faster speed to train the data
    per_device_eval_batch_size=16,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
)

# Trainer
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,        
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,    
)

trainer.train()

Step,Training Loss
10,0.1584
20,0.1251
30,0.2237
40,0.1546
50,0.1515
60,0.1436
70,0.135
80,0.1475
90,0.1413
100,0.0877


TrainOutput(global_step=375, training_loss=0.12863772416114808, metrics={'train_runtime': 2720.4022, 'train_samples_per_second': 4.411, 'train_steps_per_second': 0.138, 'total_flos': 789333166080000.0, 'train_loss': 0.12863772416114808, 'epoch': 3.0})

# Evaluation Performance

In [18]:
# Fine-tune the model
trainer.evaluate()

{'eval_loss': 0.39188623428344727,
 'eval_accuracy': 0.886,
 'eval_f1': 0.8866799204771372,
 'eval_precision': 0.8955823293172691,
 'eval_recall': 0.8779527559055118,
 'eval_runtime': 28.1449,
 'eval_samples_per_second': 17.765,
 'eval_steps_per_second': 1.137,
 'epoch': 3.0}

In [15]:
# Show the result
results = trainer.evaluate()
print(results)



{'eval_loss': 0.3410271108150482, 'eval_runtime': 49.8574, 'eval_samples_per_second': 10.029, 'eval_steps_per_second': 0.642, 'epoch': 3.0}
