<a href="https://colab.research.google.com/github/INTERDICTOR1/SentilystAI/blob/main/sentilyst_model_1_0_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
import torch
import os

# Read the resampled dataset from its CSV on Google Drive
RESAMPLED_DATASET_CSV = '/content/drive/MyDrive/resampled_dataset.csv'
df = pd.read_csv(RESAMPLED_DATASET_CSV)

In [None]:
# Clean text (body) column
def clean_text(text):
    """Normalise one raw text value before tokenisation.

    Steps, in order: strip URLs, strip HTML tags, strip every character that
    is not an ASCII letter or whitespace, then lowercase the result.

    Args:
        text: The raw value. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned, lowercased text ('' for missing input).
    """
    # Missing cells arrive as None/NaN; re.sub would raise TypeError on them.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and digits
    return text.lower()

# Derive the 'cleaned_body' column by running clean_text over every 'body' entry
df['cleaned_body'] = df['body'].map(clean_text)

In [None]:
# Two-stage stratified split: hold out 40% of the rows, then halve that
# hold-out into validation (20%) and test (20%), leaving 60% for training.
SPLIT_SEED = 42
train_df, temp_df = train_test_split(
    df,
    test_size=0.4,
    random_state=SPLIT_SEED,
    stratify=df['simplified_sentiment'],
)
cv_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=SPLIT_SEED,
    stratify=temp_df['simplified_sentiment'],
)

# Persist each split to Google Drive (destination path -> frame to write)
split_outputs = {
    '/content/drive/MyDrive/train_dataset.csv': train_df,
    '/content/drive/MyDrive/cv_dataset.csv': cv_df,
    '/content/drive/MyDrive/test_dataset.csv': test_df,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)

In [None]:
# Load the tokenizer that ships with the FinBERT tone checkpoint
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]



In [None]:
# Custom dataset class for PyTorch
class FinancialSentimentDataset(Dataset):
    """Dataset yielding tokenised text together with an integer sentiment label.

    Texts come from the ``cleaned_body`` column and labels from the
    ``simplified_sentiment`` column of the dataframe passed in. Labels are
    encoded as 'neutral' -> 0, 'positive' -> 1 and every other value -> 2
    (presumably 'negative'; the fallback is not checked explicitly).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the tokenizer settings and materialise texts and labels as lists.

        Args:
            dataframe: Frame with ``cleaned_body`` and ``simplified_sentiment``
                columns.
            tokenizer: Object exposing ``encode_plus``; called once per item.
            max_len: Value forwarded as ``max_length`` when encoding.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = dataframe['cleaned_body'].tolist()
        self.labels = [
            self._encode_label(sentiment)
            for sentiment in dataframe['simplified_sentiment'].tolist()
        ]

    @staticmethod
    def _encode_label(sentiment):
        """Translate a sentiment value into its integer class index."""
        if sentiment == 'positive':
            return 1
        if sentiment == 'neutral':
            return 0
        # Anything that is neither 'positive' nor 'neutral' falls through to 2.
        return 2

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Encode the sample at ``index``.

        Returns:
            dict: ``text`` (the raw string), ``input_ids`` and
            ``attention_mask`` (tokenizer outputs flattened to one dimension)
            and ``label`` (a ``torch.long`` scalar tensor).
        """
        sample_text = self.texts[index]
        sample_label = self.labels[index]

        encoded = self.tokenizer.encode_plus(
            sample_text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        item = {'text': sample_text}
        for key in ('input_ids', 'attention_mask'):
            item[key] = encoded[key].flatten()
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Shared settings for the dataset wrappers and the standalone loaders below
MAX_SEQ_LEN = 128
LOADER_BATCH_SIZE = 16

# Wrap each split (train, CV, test) in the PyTorch dataset
train_dataset, cv_dataset, test_dataset = (
    FinancialSentimentDataset(split_frame, tokenizer, max_len=MAX_SEQ_LEN)
    for split_frame in (train_df, cv_df, test_df)
)

# DataLoaders: only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=LOADER_BATCH_SIZE, shuffle=True)
cv_loader, test_loader = (
    DataLoader(eval_split, batch_size=LOADER_BATCH_SIZE, shuffle=False)
    for eval_split in (cv_dataset, test_dataset)
)

# Load the FinBERT checkpoint with a 3-class classification head
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [None]:
# Fine-tuning settings (adjustments to reduce overfitting).
# Checkpoints and logs use absolute Drive paths, matching the other Drive
# paths in this notebook, so they do not depend on the current working
# directory.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/model_output',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,  # Reduced warmup steps
    weight_decay=0.05,  # Increased weight decay to reduce overfitting
    logging_dir='/content/drive/MyDrive/logs',
    logging_steps=100,  # Log every 100 steps
    eval_steps=500,  # Evaluate every 500 steps to catch overfitting earlier
    save_steps=500,  # Checkpoint on the same cadence as evaluation
    save_total_limit=2,  # Keep only the last 2 checkpoints
    learning_rate=1e-5,  # Reduced learning rate to slow down training
    load_best_model_at_end=True,
    # NOTE(review): newer transformers releases may name this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="steps",  # Evaluate after certain steps
    save_strategy="steps",
    metric_for_best_model="accuracy",  # Save the best model based on accuracy
)



In [None]:
def _accuracy_metric(eval_pred):
    """Return ``{'accuracy': ...}``: the share of argmax predictions equal to the labels."""
    predicted_classes = eval_pred.predictions.argmax(-1)
    return {'accuracy': (predicted_classes == eval_pred.label_ids).mean()}


# Build the Trainer: train on the training split, evaluate on the CV split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=cv_dataset,
    tokenizer=tokenizer,
    compute_metrics=_accuracy_metric,
)

# Run fine-tuning
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
500,0.7239,0.586093,0.757408
1000,0.3953,0.466295,0.827603
1500,0.2926,0.296739,0.888607
2000,0.3133,0.223572,0.914435
2500,0.1654,0.250771,0.92727


KeyboardInterrupt: 

In [None]:
# Save the trainer's current model to Google Drive.
# The path is absolute, matching the other Drive paths in this notebook, so
# the save location does not depend on the current working directory.
output_dir = '/content/drive/MyDrive/finetuned_finbert_2'
trainer.save_model(output_dir)


In [None]:
# Score the model on the test split
test_results = trainer.evaluate(test_dataset)

# Report the accuracy entry of the returned metrics
print(f"Test Accuracy: {test_results['eval_accuracy']:.4f}")

Step,Training Loss,Validation Loss,Accuracy
500,0.7239,0.586093,0.757408
1000,0.3953,0.466295,0.827603
1500,0.2926,0.296739,0.888607
2000,0.3133,0.223572,0.914435
2500,0.1654,0.250771,0.92727
2539,0.1654,0.22236,0.931242


Test Accuracy: 0.9312


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Run the model over the test set and take the argmax class per sample
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(-1)

# Class names listed by integer label. This must mirror the encoding in
# FinancialSentimentDataset: 'neutral' -> 0, 'positive' -> 1, everything
# else -> 2 (assumed to be 'negative'). The previous order
# ['negative', 'neutral', 'positive'] attached the wrong name to every row.
class_ids = [0, 1, 2]
class_names = ['neutral', 'positive', 'negative']

# Classification report (Precision, Recall, F1-Score for each class).
# Passing `labels` pins each name to its id even if a class is absent.
print("Classification Report:")
print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

# Confusion Matrix (rows = true class, columns = predicted class, in class_ids order)
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred, labels=class_ids))


Step,Training Loss,Validation Loss,Accuracy
500,0.7239,0.586093,0.757408
1000,0.3953,0.466295,0.827603
1500,0.2926,0.296739,0.888607
2000,0.3133,0.223572,0.914435
2500,0.1654,0.250771,0.92727
2539,0.1654,0.22236,0.931242


Classification Report:
              precision    recall  f1-score   support

    negative       0.98      0.99      0.98      2104
     neutral       0.93      0.88      0.91      2104
    positive       0.89      0.92      0.90      2104

    accuracy                           0.93      6312
   macro avg       0.93      0.93      0.93      6312
weighted avg       0.93      0.93      0.93      6312

Confusion Matrix:
[[2090    4   10]
 [  12 1860  232]
 [  39  137 1928]]


In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np
import torch

# Softmax function to convert logits to probabilities
def softmax(logits):
    """Apply a row-wise softmax to a 2-D array of logits.

    Each row's maximum is subtracted before exponentiating so large logits
    do not overflow.

    Args:
        logits: 2-D array-like; the softmax is taken along axis 1.

    Returns:
        Array of the same shape whose rows each sum to 1.
    """
    row_max = np.max(logits, axis=1, keepdims=True)
    weights = np.exp(logits - row_max)
    normaliser = weights.sum(axis=1, keepdims=True)
    return weights / normaliser

# Run the trainer over the test dataset and pull out raw logits and true labels
predictions_output = trainer.predict(test_dataset)
logits, true_labels = predictions_output.predictions, predictions_output.label_ids

# Turn the logits into per-class probabilities
probabilities = softmax(logits)

# ROC AUC over all classes using the one-vs-rest (OvR) scheme
roc_auc = roc_auc_score(
    true_labels,
    probabilities,
    multi_class='ovr',
)

print(f"ROC AUC Score: {roc_auc:.4f}")


Step,Training Loss,Validation Loss,Accuracy
500,0.7239,0.586093,0.757408
1000,0.3953,0.466295,0.827603
1500,0.2926,0.296739,0.888607
2000,0.3133,0.223572,0.914435
2500,0.1654,0.250771,0.92727
2539,0.1654,0.22236,0.931242


ROC AUC Score: 0.9896
