# Importing necessary modules

In [14]:
import torch
import torch.nn
import torch.optim
import torch.profiler
import torch.utils.data
import torchvision.datasets
import torchvision.models
import torchvision.transforms as T

In [12]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('logs')

In [2]:
#Ivan Bibat
# L00167791
# 27/02/2024
#
# Reddit Scraper for data collection for sentiment analysis
# pip install praw

import json
import re

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Loading dataset then splitting into training, testing, and validation sets.

In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Loading dataset 
file = 'data/augmentedredditdata_labelled.csv'

df = pd.read_csv(file)

X = df['cleaned_text'].values
y = df['text_sentiment_label'].values

# Split data into training and validation sets; 64% training, 16% validation, 20% testing.
train_texts, test_texts, train_labels, test_labels = train_test_split(X, y, test_size=0.2, random_state=42)

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)


                                               title  \
0                 Is it worth getting the iPhone 15?   
1  Why is the demand for the iPhone 15 series so ...   
2  Has anyone got the base model of the iPhone 15...   
3    15 Plus thoughts - back to iPhone after 3 years   
4                        Anyone bought iphone 15 pro   

                                                text  score  comments  \
0  I've seen a ton of negative reviews:  \n\-Easi...     14        42   
1  I thought iPhone 13 Pro and 14 Pro series alre...    458       689   
2  Also, how's the camera and battery life?\n\nfe...     30        92   
3  After three long years and handful of android ...    776       401   
4  Anyone here with iphone 15 pro facing absolute...     97       298   

       submission_date                        cleaned_title  \
0  2023-09-30 15:23:12                     worth get iPhone   
1  2023-09-23 11:49:19     demand iPhone   series high year   
2  2023-10-09 06:46:41  get base mo

# Encoding the training labels

In [4]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
encoded_labels_train = label_encoder.fit_transform(train_labels)

# Save the mapping between original labels and encoded labels
label_mapping = {original_label: int_label for original_label, int_label in zip(train_labels, encoded_labels_train)}
label_mapping

{'Positive': 2, 'Negative': 0, 'Neutral': 1}

# Encoding Validation Labels

In [5]:
# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
encoded_labels_valid = label_encoder.fit_transform(val_labels)

# Save the mapping between original labels and encoded labels
label_mapping = {original_label: int_label for original_label, int_label in zip(val_labels, encoded_labels_valid)}
label_mapping

{'Positive': 2, 'Neutral': 1, 'Negative': 0}

# Encoding Test Labels

In [6]:
# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
encoded_labels_test = label_encoder.fit_transform(test_labels)

# Save the mapping between original labels and encoded labels
label_mapping = {original_label: int_label for original_label, int_label in zip(test_labels, encoded_labels_test)}
label_mapping

{'Positive': 2, 'Neutral': 1, 'Negative': 0}

# Model Training

I've decided to use the roberta-base model for training and testing.

In [7]:
import transformers
transformers.__version__

'4.40.1'

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# initalise the tokenizer
roberta = "roberta-base"

# initalise the model for sequence classification from cardiffnlp / setting the num_labels to 3 for positive, negative, neutral.
model = AutoModelForSequenceClassification.from_pretrained(roberta, num_labels = 3)

# initialise the tokenizer using roberta-base
tokenizer = AutoTokenizer.from_pretrained(roberta)



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Sentiment Class

In [9]:
from torch.utils.data import Dataset, DataLoader

# Define a custom Dataset class for sentiment classification
class Sentiment(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # Tokenize and encode the text using the provided tokenizer
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        # Return a dictionary which contains the tokenised data and labels
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# labels are defined using the encoded values
train_labels = encoded_labels_train
val_labels = encoded_labels_valid
test_labels = encoded_labels_test

# Initalise Sentiment instances for the training, testing and validation data, with a max_length of 128. 
train_data = Sentiment(train_texts, train_labels, tokenizer, max_len=128)
val_data = Sentiment(val_texts, val_labels, tokenizer, max_len=128)
test_data = Sentiment(test_texts, test_labels, tokenizer, max_len=128)

# Initialise DataLoaders for training, testing and validation data.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)
test_loader = DataLoader(test_data, batch_size=32)

# AdamW Optimizer


In [10]:
# Import the AdamW optimizer class from the transformers library
from transformers import AdamW

# Initialize the AdamW optimizer with the parameters of the model
# learning rate (lr) put at 1e-6 using the AdamW model.
optimizer = AdamW(model.parameters(), lr=1e-6)



# Performing Sentiment Analysis


In [11]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=FutureWarning)


    import torch
    from sklearn.metrics import accuracy_score

    epoch_list = []
    val_loss_list = []
    val_accuracy_list = []
    # Check if CUDA (GPU) is available, and move the model to the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # For loop for the 10 epochs
    for epoch in range(10):

        model.train()  # Training 
        for batch in train_loader:
            optimizer.zero_grad() 
            input_ids = batch['input_ids'].to(device)  #moves input ids to the device
            attention_mask = batch['attention_mask'].to(device)  #moves attentionmask to the device
            labels = batch['labels'].to(device)  #moves labels from batch to device
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)  # Forward pass
            loss = outputs[0]
            loss.backward() 
            optimizer.step() 

        print(f'Epoch {epoch} completed')  # prints whenever an epoch is completed in training
        
        
        # Validation
        model.eval()  # Evaluation mode to validate the model


        val_loss = 0
        val_accuracy = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs[0]
                
                val_loss += loss.item()

                

                # calculating validation accuracy
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=1)
                accuracy = accuracy_score(labels.cpu(), predictions.cpu())
                val_accuracy += accuracy
                

        val_loss /= len(val_loader)  # calculating average validation loss
        val_accuracy /= len(val_loader)  # calculating average validation accuracy

        writer.add_scalar('Loss/validation', val_loss, epoch)
        writer.add_scalar('Accuracy/validation', val_accuracy, epoch)

        val_loss_list.append(val_loss)
        val_accuracy_list.append(val_accuracy)
        epoch_list.append(epoch)
        
        print(f'Validation Loss: {val_loss} and Validation Accuracy: {val_accuracy}')

Epoch 0 completed
Validation Loss: 0.7992403092591659 and Validation Accuracy: 0.6440217391304348
Epoch 1 completed
Validation Loss: 0.6144055788931639 and Validation Accuracy: 0.7051630434782609
Epoch 2 completed
Validation Loss: 0.5433196658673494 and Validation Accuracy: 0.7984601449275361
Epoch 3 completed
Validation Loss: 0.4886280479638473 and Validation Accuracy: 0.8220108695652174
Epoch 4 completed
Validation Loss: 0.4456567673579506 and Validation Accuracy: 0.8206521739130435
Epoch 5 completed
Validation Loss: 0.43109207632748975 and Validation Accuracy: 0.8274456521739131
Epoch 6 completed
Validation Loss: 0.41340146829252655 and Validation Accuracy: 0.8410326086956522
Epoch 7 completed
Validation Loss: 0.400819749287937 and Validation Accuracy: 0.8559782608695652
Epoch 8 completed
Validation Loss: 0.3828676640987396 and Validation Accuracy: 0.8777173913043478
Epoch 9 completed
Validation Loss: 0.3739435918953108 and Validation Accuracy: 0.8654891304347826


In [None]:
import matplotlib.pyplot as plt

plt.plot(epoch_list, val_accuracy_list, label='Validation Accuracy', marker = 'o')
plt.plot(epoch_list, val_loss_list, label='Validation Loss', marker = 'x')

plt.xlabel('Epoch')
plt.ylabel('Value')
plt.title('Validation Metrics per Epoch')

plt.legend()

plt.grid(True)
plt.show()


# Testing the Model after fine-tuning

In [None]:
model.eval()  # Set the model to evaluation mode

# Lists to store predicted and true labels
all_predictions = []
all_true_labels = []

# Disable gradient calculation for inference
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Perform forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        # Extend the lists with predicted and true labels
        all_predictions.extend(predictions.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

# Convert encoded predictions back to original labels using the label encoder
predicted_labels = label_encoder.inverse_transform(all_predictions)
true_labels = label_encoder.inverse_transform(all_true_labels)



In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy using true_labels and predicted_labels
accuracy = accuracy_score(true_labels, predicted_labels)
print(f'Accuracy: {accuracy}')

# Generate a classification report using true_labels and predicted_labels
report = classification_report(true_labels, predicted_labels)
print('Classification Report: ')
print(report)

In [None]:
# Save the state dictionary of the model to the specified file
torch.save(model.state_dict(), 'models/roBERTa.pth')