# Implementation of a Pre-Trained Model (RoBERTa)

In [3]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, pipeline
import torch
from torch.utils.data import Dataset, DataLoader




# Preprocess Data and Training

In [4]:
df = pd.read_csv('ML_Models/Propietary_Models/token_datasets.csv')

# String Conversion
df['Posts'] = df['Posts'].astype(str)

# Label Mapping
label_mapping = {'Positive': 0, 'Depression/Suicidal Thoughts': 1, 'Neutral': 2}
df['label'] = df['label'].map(label_mapping)

# Load pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', clean_up_tokenization_spaces=True)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Move model to GPU if available
if torch.cuda.is_available():
    model = model.to('cuda')

# Tokenize the input texts
inputs = tokenizer(list(df['Posts']), padding=True, truncation=True, return_tensors='pt')

# Get labels
labels = torch.tensor(df['label'].values, dtype=torch.long)

# Create a PyTorch Dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx].clone().detach()
        return item

    def __len__(self):
        return len(self.labels)

dataset = CustomDataset(inputs, labels)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    fp16=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Train the model
try:
    trainer.train()
except Exception as e:
    print("Error during training:", e)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/390 [00:00<?, ?it/s]

{'train_runtime': 12041.3618, 'train_samples_per_second': 0.517, 'train_steps_per_second': 0.032, 'train_loss': 0.648263432429387, 'epoch': 3.0}


# Model Evaluation

In [5]:
from sklearn.model_selection import train_test_split

# Split dataset into training and validation
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Posts'], df['label'], test_size=0.2)

# Tokenize validation set
val_inputs = tokenizer(list(val_texts), padding=True, truncation=True, return_tensors='pt')
val_labels = torch.tensor(val_labels.values, dtype=torch.long)

# Create validation dataset
val_dataset = CustomDataset(val_inputs, val_labels)


In [10]:
# Evaluate the model
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics

)

eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/26 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.23036621510982513, 'eval_model_preparation_time': 0.006, 'eval_accuracy': 0.908433734939759, 'eval_f1': 0.9093068745570517, 'eval_precision': 0.9168163046523811, 'eval_recall': 0.908433734939759, 'eval_runtime': 464.3046, 'eval_samples_per_second': 0.894, 'eval_steps_per_second': 0.056}


In [11]:
from sklearn.metrics import classification_report
import numpy as np

# Get predictions from the model
preds_output = trainer.predict(val_dataset)
preds = np.argmax(preds_output.predictions, axis=1)

# Generate the classification report
print(classification_report(val_labels, preds, target_names=['Positive', 'Depression/Suicidal Thoughts', 'Neutral']))

  0%|          | 0/26 [00:00<?, ?it/s]

                              precision    recall  f1-score   support

                    Positive       0.98      0.74      0.84        57
Depression/Suicidal Thoughts       0.73      0.86      0.79        78
                     Neutral       0.96      0.96      0.96       280

                    accuracy                           0.91       415
                   macro avg       0.89      0.85      0.86       415
                weighted avg       0.92      0.91      0.91       415



# Saved the trained model and tokenizer 

In [12]:
model.save_pretrained('./sentiment_analysis_model')
tokenizer.save_pretrained('./sentiment_analysis_model')


('./sentiment_analysis_model\\tokenizer_config.json',
 './sentiment_analysis_model\\special_tokens_map.json',
 './sentiment_analysis_model\\vocab.json',
 './sentiment_analysis_model\\merges.txt',
 './sentiment_analysis_model\\added_tokens.json')

# Testing the Model: Making a simple prediction

In [30]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import torch

# Load the saved model and tokenizer
model = RobertaForSequenceClassification.from_pretrained('sentiment_analysis_model')
tokenizer = RobertaTokenizer.from_pretrained('sentiment_analysis_model')

# List of input texts
texts = [
    "I am feeling great today!",
    "I feel very depressed and hopeless.",
    "This is a neutral statement.",
    "Nice try. I am dying slow and painfully",
    "I hate my birthday not just because I wish it never happened. But also because, even if I set up a party. No one would be there.",
    "I'm happy",
]  

# Tokenize the input texts
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Determine the device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move the model to the appropriate device
model = model.to(device)

# Move the input tensors to the same device as the model
inputs = {key: val.to(device) for key, val in inputs.items()}

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Get predicted classes (indices of the highest logits)
predicted_classes = torch.argmax(logits, dim=1).tolist()

# Define label mapping
label_mapping = {0: 'Positive', 1: 'Depression/Suicidal Thoughts', 2: 'Neutral'}

# Get the predicted labels
predicted_labels = [label_mapping[predicted_class] for predicted_class in predicted_classes]

# Print results
for text, label in zip(texts, predicted_labels):
    print(f"Text: {text}\nPredicted label: {label}\n")



Text: I am feeling great today!
Predicted label: Neutral

Text: I feel very depressed and hopeless.
Predicted label: Depression/Suicidal Thoughts

Text: This is a neutral statement.
Predicted label: Neutral

Text: Nice try. I am dying slow and painfully
Predicted label: Neutral

Text: I hate my birthday not just because I wish it never happened. But also because, even if I set up a party. No one would be there.
Predicted label: Neutral

Text: I'm happy
Predicted label: Positive

