In [18]:
!pip install openpyxl
# ============================================================================
# CONFIGURATION
# ============================================================================
# Checkpoint to fine-tune, output directory for artifacts, and the shared seed.
MODEL_NAME = "j-hartmann/emotion-english-distilroberta-base"
OUTPUT_DIR = "./emotion_classifier_jhartmann"
SEED = 42

# The checkpoint's own label set is: anger, disgust, fear, joy, neutral,
# sadness, surprise. This project works with four classes instead.
# Class id -> display name.
EMOTION_LABELS = dict(enumerate(("Joy", "Sadness", "Neutral", "Anger")))

# Checkpoint label name -> project class id (a key of EMOTION_LABELS).
JHARTMANN_TO_OUR_LABELS = dict(
    joy=0,       # Joy
    sadness=1,   # Sadness
    neutral=2,   # Neutral
    anger=3,     # Anger
    disgust=3,   # folded into Anger (similar negative emotion)
    fear=1,      # folded into Sadness (negative emotion)
    surprise=2,  # folded into Neutral (often ambiguous)
)

import torch
import numpy as np
import pandas as pd
import re
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding,
    EarlyStoppingCallback
)
from datasets import Dataset, ClassLabel
import evaluate
from transformers import EvalPrediction
import torch.nn.functional as F
from sklearn.metrics import classification_report, confusion_matrix
import random
import json
import os

# Switch tokenizer parallelism off through its environment flag and seed the
# torch and NumPy global random generators.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.manual_seed(SEED)
np.random.seed(SEED)

# ============================================================================
# LOAD DATA
# ============================================================================
# Read the CSV, shuffle all rows, renumber them 0..n-1, then drop rows whose
# 'content' is missing (this leaves gaps in the index).
df = (
    pd.read_csv('/kaggle/input/emotions1/emotions-dataset.csv')
    .copy()
    .sample(frac=1)
    .reset_index(drop=True)
)
df.dropna(subset=['content'], inplace=True)

# Report how many rows carry each class id in the 'sentiment' column.
print("=" * 80)
print("EMOTION DISTRIBUTION")
print("=" * 80)
for label_id, emotion_name in EMOTION_LABELS.items():
    is_this_label = df['sentiment'] == label_id
    count = is_this_label.sum()
    pct = (count / len(df)) * 100
    print(f"{label_id}: {emotion_name:10s} - {count:5d} samples ({pct:5.2f}%)")
print(f"\nTotal: {len(df)} samples")
print("=" * 80)

# ============================================================================
# SIMPLE TEXT CLEANING - Minimal processing for pre-trained model
# ============================================================================
def clean_text_simple(text):
    """
    Return ``text`` as a string with URLs removed and whitespace normalised.

    The input is converted with ``str()``, every token starting with
    ``http`` or ``www`` is deleted, each run of whitespace becomes a single
    space, and leading/trailing whitespace is stripped. Nothing else is
    altered.
    """
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', str(text), flags=re.MULTILINE)
    collapsed = re.sub(r'\s+', ' ', without_urls)
    return collapsed.strip()

# Add the cleaned text column and rename the target column to 'label'.
df['cleaned_content'] = df['content'].apply(clean_text_simple)
df = df.rename(columns={'sentiment': 'label'})

# Preview: the first three cleaned texts of every class, cut to 80 characters.
print("\n" + "=" * 80)
print("SAMPLE TEXTS PER EMOTION")
print("=" * 80)
for label_id, emotion_name in EMOTION_LABELS.items():
    print(f"\n{emotion_name.upper()} (Label {label_id}):")
    class_texts = df.loc[df['label'] == label_id, 'cleaned_content']
    for i, sample in enumerate(class_texts.head(3).tolist(), 1):
        print(f" {i}. {sample[:80]}...")
print("=" * 80)

# ============================================================================
# CLASS WEIGHT COMPUTATION
# ============================================================================
num_labels = len(EMOTION_LABELS)

# scikit-learn's 'balanced' mode, computed over every labelled row in df.
# The result holds one weight per distinct label value, in sorted order.
label_column = df['label']
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(label_column),
    y=label_column,
)

print("\n" + "=" * 80)
print("CLASS WEIGHTS (for imbalanced data)")
print("=" * 80)
for i, weight in enumerate(class_weights):
    emotion_name = EMOTION_LABELS[i]
    print(f"{emotion_name:10s} (Label {i}): {weight:.3f}")
print("=" * 80)

# ============================================================================
# TOKENIZATION - Store test set texts for later analysis
# ============================================================================

# Create a copy of the dataframe with original indices for test set reconstruction
# Move the DataFrame index into an 'original_index' column so that rows of
# the split datasets can be traced back to df later on.
df_with_index = df.reset_index().rename(columns={'index': 'original_index'})

dataset_columns = ['cleaned_content', 'label', 'original_index']
hf_dataset = Dataset.from_pandas(df_with_index[dataset_columns], preserve_index=False)

# stratify_by_column needs a ClassLabel feature; names are listed in id order.
label_feature = ClassLabel(names=[EMOTION_LABELS[i] for i in range(num_labels)])
hf_dataset = hf_dataset.cast_column('label', label_feature)

# Tokenizer that belongs to the j-hartmann checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize the ``cleaned_content`` entries of ``examples``.

    Sequences are truncated to 128 tokens and left unpadded; padding is
    applied per batch by the data collator.
    """
    texts = examples["cleaned_content"]
    encoding_options = {
        "truncation": True,
        "max_length": 128,
        "padding": False,  # Dynamic padding via data collator
    }
    return tokenizer(texts, **encoding_options)

# Tokenize every row once; the raw text column is dropped here and recovered
# for error analysis through 'original_index' further down.
tokenized_dataset = hf_dataset.map(tokenize_function, batched=True, remove_columns=['cleaned_content'])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Stratified 80/20 split, then the 20% hold-out is halved (again stratified)
# into validation and test sets.
split_dataset = tokenized_dataset.train_test_split(
    test_size=0.2,
    seed=SEED,
    stratify_by_column='label'
)

test_valid_split = split_dataset['test'].train_test_split(
    test_size=0.5,
    seed=SEED,
    stratify_by_column='label'
)

train_dataset = split_dataset['train']
validation_dataset = test_valid_split['train']
test_dataset = test_valid_split['test']

print(f"\n✓ Dataset sizes - Train: {len(train_dataset)} | Val: {len(validation_dataset)} | Test: {len(test_dataset)}")

# Verify stratification. The label column is read once and counted with
# bincount instead of iterating over (and decoding) every row once per class.
print("\nValidation set distribution:")
validation_label_counts = np.bincount(
    np.asarray(list(validation_dataset['label']), dtype=np.int64),
    minlength=num_labels,
)
for label in range(num_labels):
    count = int(validation_label_counts[label])
    pct = (count / len(validation_dataset)) * 100
    print(f" {EMOTION_LABELS[label]:10s}: {count:4d} ({pct:5.2f}%)")

# Store test set texts for error analysis. 'original_index' holds df index
# labels, so .loc returns the texts in test-set order. list() hands pandas a
# plain list whatever container type the dataset column is returned as.
test_indices = list(test_dataset['original_index'])
test_texts = df.loc[test_indices, 'cleaned_content'].tolist()

# ============================================================================
# MODEL LOADING - Using j-hartmann pre-trained emotion model
# ============================================================================
print(f"\nLoading pre-trained emotion model: {MODEL_NAME}")
print("This model is already fine-tuned on emotion classification!")

# Register the project's class names on the model config so that the saved
# fine-tuned model reports emotion names rather than generic label
# placeholders.
id2label = dict(EMOTION_LABELS)
label2id = {emotion_name: label_id for label_id, emotion_name in EMOTION_LABELS.items()}

# The checkpoint ships a 7-way classification head. Requesting num_labels
# classes with ignore_mismatched_sizes=True keeps the encoder weights and
# re-initialises the head (see the "newly initialized" notice on load).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True # Important: allows loading with different number of labels
)

print("✓ Model loaded successfully!")
print(f"Original model has 7 emotions, adapted to our {num_labels} emotions")

# ============================================================================
# WEIGHTED TRAINER
# ============================================================================
class WeightedTrainer(Trainer):
    """
    Trainer whose loss is class-weighted cross-entropy.

    Args:
        class_weights: Optional sequence/array with one weight per class, or
            ``None`` for unweighted cross-entropy.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        # (source object, tensor) pair so the weight tensor is built once per
        # device instead of on every training/evaluation step.
        self._weight_cache = (None, None)

    def _loss_weight(self, device):
        """Return the class-weight tensor on ``device`` (cached), or None."""
        if self.class_weights is None:
            return None
        source, cached = self._weight_cache
        # Rebuild when first used, when class_weights was reassigned, or when
        # the target device changed.
        if cached is None or source is not self.class_weights or cached.device != device:
            cached = torch.tensor(self.class_weights, dtype=torch.float, device=device)
            self._weight_cache = (self.class_weights, cached)
        return cached

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Run the model on ``inputs`` and return the (weighted) cross-entropy.

        ``"labels"` is popped from ``inputs`` before the forward pass, so the
        loss comes from this method and not from the model. Extra keyword
        arguments passed by the Trainer are accepted and ignored.

        Returns:
            ``loss`` or, when ``return_outputs`` is true, ``(loss, outputs)``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # weight=None is plain cross-entropy, so both cases share one call.
        loss = F.cross_entropy(logits, labels, weight=self._loss_weight(labels.device))

        return (loss, outputs) if return_outputs else loss

# ============================================================================
# METRICS
# ============================================================================
# Metric objects from the `evaluate` library, loaded in this order.
accuracy_metric, f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(p: EvalPrediction):
    """
    Compute accuracy, macro/weighted F1 and one F1 score per emotion.

    Args:
        p: Evaluation output; ``p.predictions`` are per-class scores (argmax
            over axis 1 gives the predicted class) and ``p.label_ids`` the
            reference class ids.

    Returns:
        dict with ``accuracy``, ``f1_macro``, ``f1_weighted`` and
        ``f1_<emotion>`` for every entry of EMOTION_LABELS.
    """
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    class_ids = sorted(EMOTION_LABELS)

    # Overall metrics
    accuracy = accuracy_metric.compute(predictions=preds, references=labels)['accuracy']
    f1_macro = f1_metric.compute(predictions=preds, references=labels, average='macro')['f1']
    f1_weighted = f1_metric.compute(predictions=preds, references=labels, average='weighted')['f1']

    # Per-class F1 scores. Passing labels= fixes the length and order of the
    # result; without it a class missing from both predictions and references
    # is dropped and the positions no longer match the class ids.
    f1_per_class = f1_metric.compute(
        predictions=preds, references=labels, average=None, labels=class_ids
    )['f1']

    metrics = {
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
    }

    # Add per-emotion F1 scores as plain floats.
    for position, class_id in enumerate(class_ids):
        metrics[f"f1_{EMOTION_LABELS[class_id].lower()}"] = float(f1_per_class[position])

    return metrics

# ============================================================================
# TRAINING ARGUMENTS - Optimized for fine-tuning pre-trained model
# ============================================================================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,

    # Training hyperparameters - fewer epochs needed since model is pre-trained on emotions
    num_train_epochs=4, # Reduced because we're fine-tuning an already emotion-trained model
    per_device_train_batch_size=32, # Increased batch size
    per_device_eval_batch_size=64,
    learning_rate=2e-5, # Standard fine-tuning LR
    warmup_ratio=0.1,
    weight_decay=0.01,

    # Evaluation strategy - evaluation and checkpointing both happen per
    # epoch, which load_best_model_at_end requires to line up.
    eval_strategy="epoch", # Evaluate every epoch
    save_strategy="epoch",
    save_total_limit=2,

    # Model selection - "f1_macro" is the key returned by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,

    # Logging
    logging_dir='./logs',
    logging_steps=50,

    # Performance - request fp16 mixed precision only when a CUDA device is
    # present, so the script also runs on CPU-only machines (the inference
    # code further down already falls back to CPU).
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    seed=SEED,

    report_to="none"
)

# ============================================================================
# INITIALIZE TRAINER
# ============================================================================
# class_weights is consumed by WeightedTrainer; every other argument is
# forwarded to the base Trainer.
trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated on Trainer (FutureWarning, slated for removal
    # in transformers 5.0); `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    # Stop when f1_macro has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)] # Less patience needed
)

# ============================================================================
# TRAIN
# ============================================================================
# Announce the run, then fine-tune.
banner_lines = (
    "=" * 80,
    "FINE-TUNING PRE-TRAINED EMOTION CLASSIFIER",
    f"Model: {MODEL_NAME}",
    "Strategy: Transfer Learning + Class Weights",
    "=" * 80,
)
print("\n" + "\n".join(banner_lines))

trainer.train()

# ============================================================================
# FINAL EVALUATION
# ============================================================================
print("\n" + "=" * 80)
print("FINAL EVALUATION ON TEST SET")
print("=" * 80)

# Metric keys returned by trainer.evaluate carry an "eval_" prefix.
test_results = trainer.evaluate(test_dataset)

print("\n--- Test Set Performance ---")
for caption, metric_key in (
    ("Accuracy", "eval_accuracy"),
    ("F1 (Macro)", "eval_f1_macro"),
    ("F1 (Weight)", "eval_f1_weighted"),
):
    print(f"{caption}: {test_results[metric_key]:.4f}")

print("\nPer-Emotion F1 Scores:")
for emotion in EMOTION_LABELS.values():
    emotion_f1 = test_results["eval_f1_" + emotion.lower()]
    print(f" {emotion:10s}: {emotion_f1:.4f}")

# ============================================================================
# CONFUSION MATRIX & CLASSIFICATION REPORT
# ============================================================================
# Raw scores for the test set; argmax over axis 1 gives the predicted class.
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

# Pin the class list so rows/columns always line up with EMOTION_LABELS, even
# if a class never occurs in true_labels or preds.
class_ids = list(range(num_labels))
emotion_names = [EMOTION_LABELS[i] for i in class_ids]

print("\n" + "=" * 80)
print("CONFUSION MATRIX")
print("=" * 80)
# Row prefix is a 10-wide name + ": " + print's separator = 13 characters;
# the header is indented to match and right-aligned like the counts.
print(" " * 12, " ".join([f"{name:>8s}" for name in emotion_names]))
cm = confusion_matrix(true_labels, preds, labels=class_ids)
for i, row in enumerate(cm):
    print(f"{EMOTION_LABELS[i]:10s}: ", " ".join([f"{val:8d}" for val in row]))

print("\n" + "=" * 80)
print("DETAILED CLASSIFICATION REPORT")
print("=" * 80)
print(classification_report(true_labels, preds, labels=class_ids, target_names=emotion_names, digits=4))

# ============================================================================
# ERROR ANALYSIS - FIXED VERSION
# ============================================================================
def analyze_errors(true_labels, preds, test_texts):
    """Analyze misclassifications to understand model weaknesses.

    Args:
        true_labels: Reference class ids, one per test example.
        preds: Predicted class ids, aligned with ``true_labels``.
        test_texts: Texts aligned with ``true_labels`` (indexed by position).

    Returns:
        ``(misclassified, error_matrix)``: a list with one dict per wrong
        prediction (keys ``text``, ``true_label``, ``pred_label``,
        ``true_emotion``, ``pred_emotion``) and a ``num_labels x num_labels``
        float array where ``[t, p]`` counts examples of class ``t`` predicted
        as ``p`` (the diagonal stays zero).

    Prints a summary, the ten most frequent error pairs and up to five
    example errors.
    """
    misclassified = [
        {
            'text': test_texts[i],  # Use stored test texts
            'true_label': true,
            'pred_label': pred,
            'true_emotion': EMOTION_LABELS[true],
            'pred_emotion': EMOTION_LABELS[pred],
        }
        for i, (true, pred) in enumerate(zip(true_labels, preds))
        if true != pred
    ]

    # Count error types
    error_matrix = np.zeros((num_labels, num_labels))
    for error in misclassified:
        error_matrix[error['true_label'], error['pred_label']] += 1

    # Guard against an empty evaluation set instead of dividing by zero.
    total = len(true_labels)
    error_rate = len(misclassified) / total if total else 0.0

    print("\n" + "=" * 80)
    print("ERROR ANALYSIS")
    print("=" * 80)
    print(f"Total misclassified: {len(misclassified)} ({error_rate:.1%})")
    print("\nMost common misclassifications:")

    # A non-zero cell implies at least one misclassification, so the share
    # below never divides by zero.
    error_counts = []
    for true_idx in range(num_labels):
        for pred_idx in range(num_labels):
            if true_idx != pred_idx and error_matrix[true_idx, pred_idx] > 0:
                count = error_matrix[true_idx, pred_idx]
                percentage = (count / len(misclassified)) * 100
                error_counts.append((true_idx, pred_idx, count, percentage))

    # Sort by count descending
    error_counts.sort(key=lambda x: x[2], reverse=True)

    for true_idx, pred_idx, count, percentage in error_counts[:10]:  # Show top 10
        print(f" {EMOTION_LABELS[true_idx]} → {EMOTION_LABELS[pred_idx]}: {count:.0f} errors ({percentage:.1f}%)")

    # Show some example errors
    print(f"\nExample misclassifications:")
    for i, error in enumerate(misclassified[:5]):
        print(f"\n {i+1}. '{error['text'][:100]}...'")
        print(f"    True: {error['true_emotion']} → Pred: {error['pred_emotion']}")

    return misclassified, error_matrix

error_report = analyze_errors(true_labels, preds, test_texts)
misclassified, error_matrix = error_report

# ============================================================================
# COMPARE WITH BASELINE (Zero-shot with original j-hartmann model) - FIXED
# ============================================================================
for banner_line in ("\n" + "=" * 80, "BASELINE COMPARISON: Zero-shot with original model", "=" * 80):
    print(banner_line)

# Untouched checkpoint (original classification head) plus its tokenizer,
# switched to evaluation mode for the comparison.
original_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
original_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
original_model.eval()

def predict_with_original_model(texts):
    """Run ``original_model`` on ``texts`` in a single padded batch.

    Returns ``(preds, probs)`` as NumPy arrays: the argmax class index per
    text and the softmax probabilities over the checkpoint's own labels.
    """
    encoded = original_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        logits = original_model(**encoded).logits
    probabilities = F.softmax(logits, dim=-1)
    predicted_ids = probabilities.argmax(dim=1)
    return predicted_ids.numpy(), probabilities.numpy()

# Test on actual test set samples (first 100 for speed)
# Test on actual test set samples (first 100 for speed)
sample_texts = test_texts[:100]
sample_true_labels = true_labels[:100]

original_preds, original_probs = predict_with_original_model(sample_texts)

# Map original predictions to our labels. The label order is read from the
# checkpoint's own config rather than a hand-maintained list, and is built
# once instead of once per prediction.
original_id2label = original_model.config.id2label
original_labels = [str(original_id2label[i]).lower() for i in range(len(original_id2label))]
mapped_preds = [
    JHARTMANN_TO_OUR_LABELS.get(original_labels[int(pred)], 2)  # default to Neutral
    for pred in original_preds
]

# Score both models on the SAME samples so the difference is meaningful;
# `preds` holds the fine-tuned model's predictions for the whole test set in
# the same order as test_texts.
sample_true_array = np.array(sample_true_labels)
baseline_accuracy = np.mean(np.array(mapped_preds) == sample_true_array)
finetuned_sample_accuracy = np.mean(np.array(preds[:len(sample_texts)]) == sample_true_array)

print(f"Baseline (zero-shot) accuracy on test samples: {baseline_accuracy:.2%}")
print(f"Our fine-tuned model accuracy on the same samples: {finetuned_sample_accuracy:.2%}")
print(f"Improvement (same samples): {finetuned_sample_accuracy - baseline_accuracy:+.2%}")
print(f"Our fine-tuned model accuracy (full test set): {test_results['eval_accuracy']:.2%}")

# ============================================================================
# INFERENCE WITH EXPLANATIONS
# ============================================================================
# Prefer the GPU when one is available; put the fine-tuned model there in
# evaluation mode.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
model.to(device)
model.eval()

def predict_with_confidence(text, show_all_probs=False):
    """Classify one text with the fine-tuned model.

    Args:
        text: Input text; truncated to 128 tokens.
        show_all_probs: When true, print the text, the probability of every
            emotion and the predicted emotion.

    Returns:
        ``(pred_label, confidence)``: the class id with the highest softmax
        probability and that probability as a float.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    encoded = encoded.to(device)

    with torch.no_grad():
        logits = model(**encoded).logits

    # Single input, so take row 0 of the batch.
    probs = F.softmax(logits, dim=-1)[0]
    pred_label = torch.argmax(probs).item()

    if show_all_probs:
        print(f"\nText: {text}")
        print("Probabilities:")
        for class_id, probability in enumerate(probs.tolist()):
            print(f" {EMOTION_LABELS[class_id]:10s}: {probability:.2%}")
        print(f"‚Üí Predicted: {EMOTION_LABELS[pred_label]}")

    confidence = probs[pred_label].item()
    return pred_label, confidence

# Test on diverse samples from test set
# Test on diverse samples from test set
print("\n" + "=" * 80)
print("INFERENCE ON DIVERSE TEST SAMPLES")
print("=" * 80)

# Take the first two test examples of every emotion; emotions with fewer than
# two test examples are skipped.
test_samples = []
for emotion_id in range(num_labels):
    # Positions in the test set whose true label is this emotion
    emotion_indices = np.where(true_labels == emotion_id)[0]
    if len(emotion_indices) >= 2:
        test_samples.extend((test_texts[idx], emotion_id) for idx in emotion_indices[:2])

correct_predictions = 0
total_predictions = len(test_samples)

print(f"Testing on {total_predictions} diverse samples from test set:")
for text, true_label in test_samples:
    pred_label, confidence = predict_with_confidence(text, show_all_probs=False)

    is_correct = pred_label == true_label
    status = "✅" if is_correct else "❌"
    if is_correct:
        correct_predictions += 1

    true_emotion = EMOTION_LABELS[true_label]
    pred_emotion = EMOTION_LABELS[pred_label]

    print(f"\n{status} '{text[:70]}...'")
    print(f" True: {true_emotion:10s} | Pred: {pred_emotion:10s} | Conf: {confidence:.1%}")

# Avoid a ZeroDivisionError when no emotion contributed any samples.
if total_predictions:
    print(f"\nDiverse sample accuracy: {correct_predictions}/{total_predictions} = {correct_predictions/total_predictions:.1%}")
else:
    print("\nDiverse sample accuracy: n/a (no samples selected)")

# ============================================================================
# SAVE MODEL
# ============================================================================
print("\n" + "=" * 80)
print("SAVING FINE-TUNED MODEL")
print("=" * 80)

# Model weights and tokenizer files go into the same directory.
final_model_dir = f"{OUTPUT_DIR}/final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# Side-car JSON holding the label names, the base checkpoint and the headline
# test metrics.
config_info = {
    "emotion_labels": EMOTION_LABELS,
    "base_model": MODEL_NAME,
    "fine_tuned_on": "4-emotion dataset",
    "performance": {
        metric_name: test_results[f"eval_{metric_name}"]
        for metric_name in ("accuracy", "f1_macro", "f1_weighted")
    },
}

with open(f"{final_model_dir}/model_config.json", 'w') as f:
    json.dump(config_info, f, indent=2)

print(f"‚úì Model saved to: {final_model_dir}")
print(f"‚úì Model configuration saved")
print("=" * 80)
print("\nüéâ FINE-TUNING COMPLETE!")
print(f"Final Test Accuracy: {test_results['eval_accuracy']:.2%}")
print(f"Final Test F1 (Macro): {test_results['eval_f1_macro']:.2%}")

# ============================================================================
# PERFORMANCE SUMMARY
# ============================================================================
print("\n" + "=" * 80)
print("PERFORMANCE SUMMARY")
print("=" * 80)
print(f"✓ Achieved performance: {test_results['eval_accuracy']:.2%} accuracy")

# Rank the emotions by their test F1 so the summary reflects this run's
# results instead of fixed claims about particular emotions.
emotion_f1 = {
    emotion: test_results[f"eval_f1_{emotion.lower()}"]
    for emotion in EMOTION_LABELS.values()
}
ranked_emotions = sorted(emotion_f1, key=emotion_f1.get, reverse=True)
best_emotion, weakest_emotion = ranked_emotions[0], ranked_emotions[-1]

print("\nModel Strengths:")
print(f" - Best at detecting {best_emotion} (F1: {emotion_f1[best_emotion]:.2%})")
if len(ranked_emotions) > 2:
    runner_up = ranked_emotions[1]
    print(f" - Good {runner_up} detection (F1: {emotion_f1[runner_up]:.2%})")

print("\nAreas for Improvement:")
print(f" - {weakest_emotion} detection could be improved (F1: {emotion_f1[weakest_emotion]:.2%})")

# Largest off-diagonal cell of the test confusion matrix = most frequent mix-up.
summary_cm = confusion_matrix(true_labels, preds, labels=list(range(num_labels)))
np.fill_diagonal(summary_cm, 0)
if summary_cm.any():
    worst_true, worst_pred = np.unravel_index(summary_cm.argmax(), summary_cm.shape)
    print(
        f" - Most frequent confusion: {EMOTION_LABELS[int(worst_true)]} predicted as "
        f"{EMOTION_LABELS[int(worst_pred)]} ({summary_cm[worst_true, worst_pred]} cases)"
    )
print("=" * 80)

EMOTION DISTRIBUTION
0: Joy        -  9211 samples (26.27%)
1: Sadness    -  9530 samples (27.18%)
2: Neutral    -  6412 samples (18.29%)
3: Anger      -  9913 samples (28.27%)

Total: 35066 samples

SAMPLE TEXTS PER EMOTION

JOY (Label 0):
 1. vene2ia great...
 2. mrcartersnurse congratulations for your mom for tomorrow buenas noches...
 3. ddlovato oh i see thanks for replying anyway how are you...

SADNESS (Label 1):
 1. just finished watching quotmarley and mequot...
 2. msignorile weather sucks up here...
 3. sasss09 hahaha sadly this ones supposed to be done individually so no chance of ...

NEUTRAL (Label 2):
 1. sadknob right now if be happy to win a packet of salt n vinegar crisps or a new ...
 2. ok im out of here for now just popped in to say hi and check on things ill proba...
 3. just finished my 1st new song soon on youtube keeping you updated...

ANGER (Label 3):
 1. tanisha found herself in an outrageous situation...
 2. ok free people skirt hide and seeks over...
 3. t

Casting the dataset:   0%|          | 0/35066 [00:00<?, ? examples/s]

Map:   0%|          | 0/35066 [00:00<?, ? examples/s]


✓ Dataset sizes - Train: 28052 | Val: 3507 | Test: 3507

Validation set distribution:
 Joy       :  921 (26.26%)
 Sadness   :  953 (27.17%)
 Neutral   :  642 (18.31%)
 Anger     :  991 (28.26%)

Loading pre-trained emotion model: j-hartmann/emotion-english-distilroberta-base
This model is already fine-tuned on emotion classification!


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at j-hartmann/emotion-english-distilroberta-base and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model loaded successfully!
Original model has 7 emotions, adapted to our 4 emotions


  super().__init__(*args, **kwargs)



PRE-TRAINED MODEL CAPABILITIES
The j-hartmann model is already trained on:
 - 7 emotions: anger, disgust, fear, joy, neutral, sadness, surprise
 - Large emotional dataset
 - Good understanding of emotional language patterns
We are adapting it to our 4 emotion categories

FINE-TUNING PRE-TRAINED EMOTION CLASSIFIER
Model: j-hartmann/emotion-english-distilroberta-base
Strategy: Transfer Learning + Class Weights


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted,F1 Joy,F1 Sadness,F1 Neutral,F1 Anger
1,0.6472,0.612753,0.765041,0.761396,0.764135,0.697124,0.753933,0.747573,0.846954
2,0.5473,0.592762,0.776732,0.773817,0.777002,0.714609,0.766355,0.7545,0.859803
3,0.4928,0.592416,0.775307,0.772303,0.775411,0.706607,0.765027,0.755031,0.862545
4,0.4409,0.601648,0.776732,0.774041,0.777021,0.712206,0.766385,0.757337,0.860237



FINAL EVALUATION ON TEST SET



--- Test Set Performance ---
Accuracy: 0.7821
F1 (Macro): 0.7796
F1 (Weight): 0.7823

Per-Emotion F1 Scores:
 Joy       : 0.7230
 Sadness   : 0.7564
 Neutral   : 0.7662
 Anger     : 0.8728

CONFUSION MATRIX
  Joy      Sadness  Neutral  Anger   
Joy       :       646      122      116       37
Sadness   :       115      725       56       57
Neutral   :        64       43      521       13
Anger     :        41       74       26      851

DETAILED CLASSIFICATION REPORT
              precision    recall  f1-score   support

         Joy     0.7460    0.7014    0.7230       921
     Sadness     0.7521    0.7608    0.7564       953
     Neutral     0.7246    0.8128    0.7662       641
       Anger     0.8883    0.8579    0.8728       992

    accuracy                         0.7821      3507
   macro avg     0.7777    0.7832    0.7796      3507
weighted avg     0.7840    0.7821    0.7823      3507


ERROR ANALYSIS
Total misclassified: 764 (21.8%)

Most common misclassifications:
 Joy → 

In [19]:
!pip install openpyxl
# ============================================================================
# CONFIGURATION
# ============================================================================
# Checkpoint to fine-tune, output directory for artifacts, and the shared seed.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
OUTPUT_DIR = "./sentiment_classifier_cardiffnlp"
SEED = 42

# The checkpoint predicts negative / neutral / positive sentiment; this
# project works with four emotion classes. Class id -> display name.
EMOTION_LABELS = dict(enumerate(("Joy", "Sadness", "Neutral", "Anger")))

# Checkpoint sentiment name -> project class id (a key of EMOTION_LABELS).
CARDIFFNLP_TO_OUR_LABELS = dict(
    positive=0,  # Joy
    neutral=2,   # Neutral
    negative=1,  # Sadness (we'll handle Anger separately)
)

import torch
import numpy as np
import pandas as pd
import re
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding,
    EarlyStoppingCallback
)
from datasets import Dataset, ClassLabel
import evaluate
from transformers import EvalPrediction
import torch.nn.functional as F
from sklearn.metrics import classification_report, confusion_matrix
import random
import json
import os

# Switch tokenizer parallelism off through its environment flag and seed the
# torch and NumPy global random generators.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.manual_seed(SEED)
np.random.seed(SEED)

# ============================================================================
# LOAD DATA
# ============================================================================
# Read the CSV, shuffle all rows, renumber them 0..n-1, then drop rows whose
# 'content' is missing (this leaves gaps in the index).
df = (
    pd.read_csv('/kaggle/input/emotions1/emotions-dataset.csv')
    .copy()
    .sample(frac=1)
    .reset_index(drop=True)
)
df.dropna(subset=['content'], inplace=True)

# Report how many rows carry each class id in the 'sentiment' column.
print("=" * 80)
print("EMOTION DISTRIBUTION")
print("=" * 80)
for label_id, emotion_name in EMOTION_LABELS.items():
    is_this_label = df['sentiment'] == label_id
    count = is_this_label.sum()
    pct = (count / len(df)) * 100
    print(f"{label_id}: {emotion_name:10s} - {count:5d} samples ({pct:5.2f}%)")
print(f"\nTotal: {len(df)} samples")
print("=" * 80)

# ============================================================================
# TEXT CLEANING - Optimized for Twitter model
# ============================================================================
def clean_text_twitter(text):
    """
    Clean a tweet-like text: drop URLs and @mentions, normalise whitespace.

    The input is converted with ``str()``. Tokens starting with ``http`` or
    ``www`` and ``@name`` mentions are removed; hashtags, emojis and all
    other characters are kept. Whitespace is collapsed last, so removing a
    URL or mention in the middle of the text does not leave a double space.

    Returns:
        The cleaned string without leading or trailing whitespace.
    """
    text = str(text)

    # Remove URLs but keep other Twitter elements
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove user mentions (every @name token is stripped)
    text = re.sub(r'@\w+', '', text)

    # Collapse whitespace only after all removals, so the gaps they leave
    # are squeezed to a single space as well.
    return re.sub(r'\s+', ' ', text).strip()

# Add the cleaned text column and rename the target column to 'label'.
df['cleaned_content'] = df['content'].apply(clean_text_twitter)
df = df.rename(columns={'sentiment': 'label'})

# Preview: the first three cleaned texts of every class, cut to 80 characters.
print("\n" + "=" * 80)
print("SAMPLE TEXTS PER EMOTION")
print("=" * 80)
for label_id, emotion_name in EMOTION_LABELS.items():
    print(f"\n{emotion_name.upper()} (Label {label_id}):")
    class_texts = df.loc[df['label'] == label_id, 'cleaned_content']
    for i, sample in enumerate(class_texts.head(3).tolist(), 1):
        print(f" {i}. {sample[:80]}...")
print("=" * 80)

# ============================================================================
# CLASS WEIGHT COMPUTATION
# ============================================================================
num_labels = len(EMOTION_LABELS)

# scikit-learn's 'balanced' mode, computed over every labelled row in df.
# The result holds one weight per distinct label value, in sorted order.
label_column = df['label']
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(label_column),
    y=label_column,
)

print("\n" + "=" * 80)
print("CLASS WEIGHTS (for imbalanced data)")
print("=" * 80)
for i, weight in enumerate(class_weights):
    emotion_name = EMOTION_LABELS[i]
    print(f"{emotion_name:10s} (Label {i}): {weight:.3f}")
print("=" * 80)

# ============================================================================
# TOKENIZATION - Store test set texts for later analysis
# ============================================================================

# Create a copy of the dataframe with original indices for test set reconstruction
# Move the DataFrame index into an 'original_index' column so that rows of
# the split datasets can be traced back to df later on.
df_with_index = df.reset_index().rename(columns={'index': 'original_index'})

dataset_columns = ['cleaned_content', 'label', 'original_index']
hf_dataset = Dataset.from_pandas(df_with_index[dataset_columns], preserve_index=False)

# stratify_by_column needs a ClassLabel feature; names are listed in id order.
label_feature = ClassLabel(names=[EMOTION_LABELS[i] for i in range(num_labels)])
hf_dataset = hf_dataset.cast_column('label', label_feature)

# Tokenizer that belongs to the cardiffnlp checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize the ``cleaned_content`` entries of ``examples``.

    Sequences are truncated to 128 tokens and left unpadded; padding is
    applied per batch by the data collator.
    """
    texts = examples["cleaned_content"]
    encoding_options = {
        "truncation": True,
        "max_length": 128,
        "padding": False,  # Dynamic padding via data collator
    }
    return tokenizer(texts, **encoding_options)

# Tokenize every row once; the raw text column is dropped here and recovered
# for error analysis through 'original_index' further down.
tokenized_dataset = hf_dataset.map(tokenize_function, batched=True, remove_columns=['cleaned_content'])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Stratified 80/20 split, then the 20% hold-out is halved (again stratified)
# into validation and test sets.
split_dataset = tokenized_dataset.train_test_split(
    test_size=0.2,
    seed=SEED,
    stratify_by_column='label'
)

test_valid_split = split_dataset['test'].train_test_split(
    test_size=0.5,
    seed=SEED,
    stratify_by_column='label'
)

train_dataset = split_dataset['train']
validation_dataset = test_valid_split['train']
test_dataset = test_valid_split['test']

print(f"\n✓ Dataset sizes - Train: {len(train_dataset)} | Val: {len(validation_dataset)} | Test: {len(test_dataset)}")

# Verify stratification. The label column is read once and counted with
# bincount instead of iterating over (and decoding) every row once per class.
print("\nValidation set distribution:")
validation_label_counts = np.bincount(
    np.asarray(list(validation_dataset['label']), dtype=np.int64),
    minlength=num_labels,
)
for label in range(num_labels):
    count = int(validation_label_counts[label])
    pct = (count / len(validation_dataset)) * 100
    print(f" {EMOTION_LABELS[label]:10s}: {count:4d} ({pct:5.2f}%)")

# Store test set texts for error analysis. 'original_index' holds df index
# labels, so .loc returns the texts in test-set order. list() hands pandas a
# plain list whatever container type the dataset column is returned as.
test_indices = list(test_dataset['original_index'])
test_texts = df.loc[test_indices, 'cleaned_content'].tolist()

# ============================================================================
# MODEL LOADING - Using cardiffnlp Twitter sentiment model
# ============================================================================
print(f"\nLoading Twitter sentiment model: {MODEL_NAME}")
print("This model is pre-trained on Twitter data for sentiment analysis!")

# Register the project's class names on the model config so that the saved
# fine-tuned model reports emotion names rather than generic label
# placeholders.
id2label = dict(EMOTION_LABELS)
label2id = {emotion_name: label_id for label_id, emotion_name in EMOTION_LABELS.items()}

# The checkpoint ships a 3-way sentiment head. Requesting num_labels classes
# with ignore_mismatched_sizes=True keeps the encoder weights and
# re-initialises the classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True # Important: allows loading with different number of labels
)

print("✓ Twitter sentiment model loaded successfully!")
print(f"Original model has 3 sentiment classes, adapted to our {num_labels} emotion classes")

# ============================================================================
# WEIGHTED TRAINER
# ============================================================================
class WeightedTrainer(Trainer):
    """
    Trainer that applies per-class weights to the cross-entropy loss to
    counter class imbalance.

    Args:
        class_weights: Optional list / array / tensor with one weight per
            class, indexed by label id. ``None`` gives the plain,
            unweighted cross-entropy.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute the (optionally class-weighted) cross-entropy loss.

        ``inputs`` must contain a "labels" entry; it is removed before the
        forward pass so the model does not compute its own loss. Extra
        keyword arguments are accepted and ignored (presumably to stay
        compatible with Trainer versions that pass additional arguments).

        Returns:
            ``loss`` or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        weight_tensor = None
        if self.class_weights is not None:
            # as_tensor accepts lists, numpy arrays and tensors alike without
            # the copy-construct warning torch.tensor() emits for tensors, and
            # places the weights on the same device as the logits.
            weight_tensor = torch.as_tensor(
                self.class_weights, dtype=torch.float, device=logits.device
            )

        # weight=None is cross_entropy's default, so one call covers both cases.
        loss = F.cross_entropy(logits, labels.to(logits.device), weight=weight_tensor)

        return (loss, outputs) if return_outputs else loss

# ============================================================================
# METRICS
# ============================================================================
# Load the four `evaluate` metrics in one pass (same order as before).
accuracy_metric, f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(p: EvalPrediction):
    """
    Compute accuracy, macro/weighted F1 and one F1 score per emotion.

    Args:
        p: Evaluation output; ``p.predictions`` holds the logits (argmax is
            taken over axis 1) and ``p.label_ids`` the reference labels.

    Returns:
        dict with "accuracy", "f1_macro", "f1_weighted" and one
        "f1_<emotion>" entry for every class in EMOTION_LABELS.
    """
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids

    # Overall metrics
    accuracy = accuracy_metric.compute(predictions=preds, references=labels)['accuracy']
    f1_macro = f1_metric.compute(predictions=preds, references=labels, average='macro')['f1']
    f1_weighted = f1_metric.compute(predictions=preds, references=labels, average='weighted')['f1']

    # Per-class F1 scores. The class ids are passed explicitly so the result
    # always has one entry per emotion, in a known order, even when a class
    # is absent from both predictions and references.
    class_ids = sorted(EMOTION_LABELS)
    f1_per_class = f1_metric.compute(
        predictions=preds,
        references=labels,
        average=None,
        labels=class_ids,
    )['f1']

    metrics = {
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
    }

    # Add per-emotion F1 scores (cast to plain float for clean logging/JSON)
    for position, class_id in enumerate(class_ids):
        emotion = EMOTION_LABELS[class_id]
        metrics[f"f1_{emotion.lower()}"] = float(f1_per_class[position])

    return metrics

# ============================================================================
# TRAINING ARGUMENTS - Optimized for Twitter model
# ============================================================================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,

    # Training hyperparameters
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,

    # Evaluation strategy: evaluate and checkpoint once per epoch, keeping
    # at most two checkpoints on disk
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,

    # Model selection: reload the checkpoint with the highest macro F1
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,

    # Logging
    logging_dir='./logs',
    logging_steps=50,

    # Performance
    # Mixed precision needs a CUDA device; enable it only when one is
    # available so the script still runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    seed=SEED,

    report_to="none"
)

# ============================================================================
# INITIALIZE TRAINER
# ============================================================================
# Stop when the monitored metric has not improved for two evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# NOTE(review): recent transformers releases appear to deprecate the
# `tokenizer=` argument in favour of `processing_class=` -- confirm against
# the installed version before switching.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    class_weights=class_weights,
)

# ============================================================================
# TRAIN
# ============================================================================
# Print the training banner, then start fine-tuning.
separator = "=" * 80
for banner_line in (
    "\n" + separator,
    "FINE-TUNING TWITTER SENTIMENT MODEL FOR EMOTION CLASSIFICATION",
    f"Model: {MODEL_NAME}",
    "Strategy: Transfer Learning from Twitter Sentiment + Class Weights",
    separator,
):
    print(banner_line)

trainer.train()

# ============================================================================
# FINAL EVALUATION
# ============================================================================
separator = "=" * 80
print("\n" + separator)
print("FINAL EVALUATION ON TEST SET")
print(separator)

# Evaluate on the held-out test split; metric keys carry the "eval_" prefix.
test_results = trainer.evaluate(test_dataset)

print("\n--- Test Set Performance ---")
headline_metrics = (
    ("Accuracy", "eval_accuracy"),
    ("F1 (Macro)", "eval_f1_macro"),
    ("F1 (Weight)", "eval_f1_weighted"),
)
for caption, metric_key in headline_metrics:
    print(f"{caption}: {test_results[metric_key]:.4f}")

print("\nPer-Emotion F1 Scores:")
for emotion in EMOTION_LABELS.values():
    per_emotion_score = test_results[f"eval_f1_{emotion.lower()}"]
    print(f" {emotion:10s}: {per_emotion_score:.4f}")

# ============================================================================
# CONFUSION MATRIX & CLASSIFICATION REPORT
# ============================================================================
# Raw test-set predictions: argmax over the logits gives the predicted class.
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

# Explicit class ids keep the matrix rows/columns and the report aligned with
# EMOTION_LABELS even if a class never occurs in the test labels/predictions.
class_ids = list(range(num_labels))

print("\n" + "=" * 80)
print("CONFUSION MATRIX")
print("=" * 80)
print(" ", " ".join([f"{EMOTION_LABELS[i]:8s}" for i in class_ids]))
cm = confusion_matrix(true_labels, preds, labels=class_ids)
for i, row in enumerate(cm):
    # Rows are true classes, columns are predicted classes.
    print(f"{EMOTION_LABELS[i]:10s}: ", " ".join([f"{val:8d}" for val in row]))

print("\n" + "=" * 80)
print("DETAILED CLASSIFICATION REPORT")
print("=" * 80)
emotion_names = [EMOTION_LABELS[i] for i in class_ids]
print(classification_report(
    true_labels,
    preds,
    labels=class_ids,
    target_names=emotion_names,
    digits=4,
))

# ============================================================================
# ERROR ANALYSIS
# ============================================================================
def analyze_errors(true_labels, preds, test_texts):
    """
    Analyze misclassifications to understand model weaknesses.

    Prints the overall error rate, the ten most frequent (true -> predicted)
    confusions and up to five example errors.

    Args:
        true_labels: Sequence of reference label ids.
        preds: Sequence of predicted label ids, aligned with ``true_labels``.
        test_texts: Sequence of texts, aligned with ``true_labels``.

    Returns:
        (misclassified, error_matrix): a list with one dict per wrong
        prediction ('text', 'true_label', 'pred_label', 'true_emotion',
        'pred_emotion') and a (num_labels x num_labels) float array of
        error counts indexed as [true_label, pred_label].
    """
    misclassified = [
        {
            'text': test_texts[i],
            'true_label': true,
            'pred_label': pred,
            'true_emotion': EMOTION_LABELS[true],
            'pred_emotion': EMOTION_LABELS[pred],
        }
        for i, (true, pred) in enumerate(zip(true_labels, preds))
        if true != pred
    ]

    # Count error types
    error_matrix = np.zeros((num_labels, num_labels))
    for error in misclassified:
        error_matrix[error['true_label'], error['pred_label']] += 1

    n_total = len(true_labels)
    n_errors = len(misclassified)
    # Guard against an empty evaluation set (would otherwise divide by zero).
    error_rate = n_errors / n_total if n_total else 0.0

    print("\n" + "=" * 80)
    print("ERROR ANALYSIS")
    print("=" * 80)
    print(f"Total misclassified: {n_errors} ({error_rate:.1%})")
    print("\nMost common misclassifications:")

    # Off-diagonal cells with at least one error; n_errors > 0 whenever any
    # such cell exists, so the percentage below is always well defined.
    error_counts = []
    for true_idx in range(num_labels):
        for pred_idx in range(num_labels):
            count = error_matrix[true_idx, pred_idx]
            if true_idx != pred_idx and count > 0:
                percentage = (count / n_errors) * 100
                error_counts.append((true_idx, pred_idx, count, percentage))

    # Sort by count descending
    error_counts.sort(key=lambda x: x[2], reverse=True)

    for true_idx, pred_idx, count, percentage in error_counts[:10]:
        print(f" {EMOTION_LABELS[true_idx]} ‚Üí {EMOTION_LABELS[pred_idx]}: {count:.0f} errors ({percentage:.1f}%)")

    # Show some example errors
    print("\nExample misclassifications:")
    for i, error in enumerate(misclassified[:5]):
        # str() keeps the preview working if a text is not a string (e.g. NaN).
        snippet = str(error['text'])[:100]
        print(f"\n {i+1}. '{snippet}...'")
        print(f"    True: {error['true_emotion']} ‚Üí Pred: {error['pred_emotion']}")

    return misclassified, error_matrix

# Run the error analysis on the test-set predictions; keeps the list of
# misclassified examples and the [true, predicted] error-count matrix.
misclassified, error_matrix = analyze_errors(true_labels, preds, test_texts)

# ============================================================================
# COMPARE WITH BASELINE (Zero-shot with original cardiffnlp model)
# ============================================================================
separator = "=" * 80
print("\n" + separator)
print("BASELINE COMPARISON: Zero-shot with original Twitter model")
print(separator)

# Reload the untouched checkpoint (original classification head) and its
# tokenizer, and switch the model to inference mode.
original_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
original_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
original_model = original_model.eval()

def predict_with_original_model(texts):
    """
    Run the un-fine-tuned checkpoint (`original_model`) on a batch of texts.

    Returns a pair of numpy arrays: the argmax class index per text and the
    softmax probabilities per text.
    """
    encoded = original_tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = original_model(**encoded).logits

    probabilities = F.softmax(logits, dim=-1)
    predicted_ids = torch.argmax(probabilities, dim=1)
    return predicted_ids.numpy(), probabilities.numpy()

# Test on actual test set samples (first 100 for speed)
sample_texts = test_texts[:100]
sample_true_labels = true_labels[:100]

original_preds, original_probs = predict_with_original_model(sample_texts)

# Map original predictions to our labels.
# The mapping goes through the checkpoint's own class names
# (config.id2label) so it works for both a 3-class sentiment head
# (negative / neutral / positive) and an emotion head (anger, joy, ...).
# "negative" is mapped to Sadness as a simplification: a sentiment model
# cannot distinguish Sadness from Anger.
baseline_name_to_label = {**JHARTMANN_TO_OUR_LABELS, "positive": 0, "negative": 1}

# Fallback for checkpoints with generic class names (e.g. LABEL_0):
# assume the 0 -> negative, 1 -> neutral, 2 -> positive index convention.
legacy_index_to_label = {2: 0, 1: 2, 0: 1}

baseline_id2label = getattr(original_model.config, "id2label", None) or {}

mapped_preds = []
for pred in original_preds:
    pred_id = int(pred)
    class_name = str(baseline_id2label.get(pred_id, "")).strip().lower()
    if class_name in baseline_name_to_label:
        mapped_preds.append(baseline_name_to_label[class_name])
    else:
        # -1 never equals a real label: an unmappable prediction counts as
        # wrong instead of being dropped, which keeps mapped_preds the same
        # length as sample_true_labels.
        mapped_preds.append(legacy_index_to_label.get(pred_id, -1))

# Calculate baseline accuracy
# NOTE(review): the baseline is measured on the first 100 test samples while
# the fine-tuned accuracy below is over the full test set, so "Improvement"
# compares two different sample sets.
baseline_accuracy = np.mean(np.array(mapped_preds) == np.array(sample_true_labels))
print(f"Baseline (zero-shot) accuracy on test samples: {baseline_accuracy:.2%}")
print(f"Our fine-tuned model accuracy: {test_results['eval_accuracy']:.2%}")
print(f"Improvement: {test_results['eval_accuracy'] - baseline_accuracy:+.2%}")

# ============================================================================
# INFERENCE WITH EXPLANATIONS
# ============================================================================
# Prefer the GPU when one is present, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the fine-tuned model to that device and switch it to inference mode.
model.to(device).eval()

def predict_with_confidence(text, show_all_probs=False):
    """
    Classify a single text with the fine-tuned model.

    Args:
        text: The text to classify (truncated to 128 tokens).
        show_all_probs: When True, also print the probability of every
            emotion and the predicted one.

    Returns:
        (pred_label, confidence): the predicted label id and the softmax
        probability assigned to it.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(device)

    with torch.no_grad():
        logits = model(**encoded).logits

    # Single input, so take the first (only) row of the batch.
    class_probs = F.softmax(logits, dim=-1)[0]
    pred_label = torch.argmax(class_probs).item()
    prob_values = class_probs.tolist()

    if show_all_probs:
        print(f"\nText: {text}")
        print("Probabilities:")
        for class_id, prob_value in enumerate(prob_values):
            print(f" {EMOTION_LABELS[class_id]:10s}: {prob_value:.2%}")
        print(f"‚Üí Predicted: {EMOTION_LABELS[pred_label]}")

    return pred_label, prob_values[pred_label]

# Test on diverse samples from test set
separator = "=" * 80
print("\n" + separator)
print("INFERENCE ON DIVERSE TEST SAMPLES")
print(separator)

# Take the first two test examples of every emotion; emotions with fewer
# than two test examples are skipped entirely.
test_samples = []
for emotion_id in range(num_labels):
    emotion_indices = np.where(true_labels == emotion_id)[0]
    if len(emotion_indices) < 2:
        continue
    test_samples.extend((test_texts[idx], emotion_id) for idx in emotion_indices[:2])

total_predictions = len(test_samples)
correct_predictions = 0

print(f"Testing on {total_predictions} diverse samples from test set:")
for text, true_label in test_samples:
    pred_label, confidence = predict_with_confidence(text, show_all_probs=False)

    if pred_label == true_label:
        status = "‚úÖ"
        correct_predictions += 1
    else:
        status = "‚ùå"

    true_emotion = EMOTION_LABELS[true_label]
    pred_emotion = EMOTION_LABELS[pred_label]

    print(f"\n{status} '{text[:70]}...'")
    print(f" True: {true_emotion:10s} | Pred: {pred_emotion:10s} | Conf: {confidence:.1%}")

print(f"\nDiverse sample accuracy: {correct_predictions}/{total_predictions} = {correct_predictions/total_predictions:.1%}")

# ============================================================================
# SAVE MODEL
# ============================================================================
separator = "=" * 80
print("\n" + separator)
print("SAVING FINE-TUNED MODEL")
print(separator)

# Model weights and tokenizer files go into the same directory.
final_model_dir = f"{OUTPUT_DIR}/final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# Save configuration: label names, base checkpoint and headline test metrics.
config_info = {
    "emotion_labels": EMOTION_LABELS,
    "base_model": MODEL_NAME,
    "fine_tuned_on": "4-emotion dataset",
    "performance": {
        metric_name: test_results[f"eval_{metric_name}"]
        for metric_name in ("accuracy", "f1_macro", "f1_weighted")
    },
}

with open(f"{final_model_dir}/model_config.json", 'w') as config_file:
    config_file.write(json.dumps(config_info, indent=2))

print(f"‚úì Model saved to: {OUTPUT_DIR}/final_model")
print("‚úì Model configuration saved")
print(separator)
print("\nüéâ FINE-TUNING COMPLETE!")
print(f"Final Test Accuracy: {test_results['eval_accuracy']:.2%}")
print(f"Final Test F1 (Macro): {test_results['eval_f1_macro']:.2%}")

# ============================================================================
# PERFORMANCE SUMMARY
# ============================================================================
separator = "=" * 80
print("\n" + separator)
print("PERFORMANCE SUMMARY")
print(separator)
print(f"‚úì Achieved performance: {test_results['eval_accuracy']:.2%} accuracy")
print(f"‚úì Using Twitter-optimized model: {MODEL_NAME}")

# Per-emotion test F1, keyed by emotion name, to pick the best and worst class.
f1_scores = {
    emotion: test_results[f"eval_f1_{emotion.lower()}"]
    for emotion in EMOTION_LABELS.values()
}

best_emotion = max(f1_scores, key=f1_scores.get)
worst_emotion = min(f1_scores, key=f1_scores.get)

print("\nModel Strengths:")
print(f" - Best at detecting {best_emotion} (F1: {f1_scores[best_emotion]:.2%})")
print("\nAreas for Improvement:")
print(f" - {worst_emotion} detection could be improved (F1: {f1_scores[worst_emotion]:.2%})")
print(separator)

EMOTION DISTRIBUTION
0: Joy        -  9211 samples (26.27%)
1: Sadness    -  9530 samples (27.18%)
2: Neutral    -  6412 samples (18.29%)
3: Anger      -  9913 samples (28.27%)

Total: 35066 samples

SAMPLE TEXTS PER EMOTION

JOY (Label 0):
 1. vene2ia great...
 2. mrcartersnurse congratulations for your mom for tomorrow buenas noches...
 3. ddlovato oh i see thanks for replying anyway how are you...

SADNESS (Label 1):
 1. just finished watching quotmarley and mequot...
 2. msignorile weather sucks up here...
 3. sasss09 hahaha sadly this ones supposed to be done individually so no chance of ...

NEUTRAL (Label 2):
 1. sadknob right now if be happy to win a packet of salt n vinegar crisps or a new ...
 2. ok im out of here for now just popped in to say hi and check on things ill proba...
 3. just finished my 1st new song soon on youtube keeping you updated...

ANGER (Label 3):
 1. tanisha found herself in an outrageous situation...
 2. ok free people skirt hide and seeks over...
 3. t

Casting the dataset:   0%|          | 0/35066 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Map:   0%|          | 0/35066 [00:00<?, ? examples/s]


‚úì Dataset sizes - Train: 28052 | Val: 3507 | Test: 3507

Validation set distribution:
 Joy       :  921 (26.26%)
 Sadness   :  953 (27.17%)
 Neutral   :  642 (18.31%)
 Anger     :  991 (28.26%)

Loading Twitter sentiment model: cardiffnlp/twitter-roberta-base-sentiment-latest
This model is pre-trained on Twitter data for sentiment analysis!


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

‚úì Twitter sentiment model loaded successfully!
Original model has 3 sentiment classes, adapted to our 4 emotion classes


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

  super().__init__(*args, **kwargs)



FINE-TUNING TWITTER SENTIMENT MODEL FOR EMOTION CLASSIFICATION
Model: cardiffnlp/twitter-roberta-base-sentiment-latest
Strategy: Transfer Learning from Twitter Sentiment + Class Weights


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted,F1 Joy,F1 Sadness,F1 Neutral,F1 Anger
1,0.5828,0.571646,0.774451,0.770122,0.772425,0.702187,0.756303,0.762115,0.859885
2,0.491,0.572678,0.786997,0.783199,0.786972,0.716177,0.778503,0.759232,0.878882
3,0.3956,0.598695,0.780439,0.775032,0.778272,0.692818,0.773143,0.759166,0.875
4,0.3229,0.66896,0.783005,0.780011,0.783563,0.713805,0.772497,0.758414,0.875326



FINAL EVALUATION ON TEST SET



--- Test Set Performance ---
Accuracy: 0.7904
F1 (Macro): 0.7873
F1 (Weight): 0.7903

Per-Emotion F1 Scores:
 Joy       : 0.7133
 Sadness   : 0.7742
 Neutral   : 0.7738
 Anger     : 0.8879

CONFUSION MATRIX
  Joy      Sadness  Neutral  Anger   
Joy       :       612      152      121       36
Sadness   :        88      785       45       35
Neutral   :        61       52      520        8
Anger     :        34       86       17      855

DETAILED CLASSIFICATION REPORT
              precision    recall  f1-score   support

         Joy     0.7698    0.6645    0.7133       921
     Sadness     0.7302    0.8237    0.7742       953
     Neutral     0.7397    0.8112    0.7738       641
       Anger     0.9154    0.8619    0.8879       992

    accuracy                         0.7904      3507
   macro avg     0.7888    0.7903    0.7873      3507
weighted avg     0.7947    0.7904    0.7903      3507


ERROR ANALYSIS
Total misclassified: 735 (21.0%)

Most common misclassifications:
 Joy ‚Üí 

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Baseline (zero-shot) accuracy on test samples: 23.00%
Our fine-tuned model accuracy: 79.04%
Improvement: +56.04%

INFERENCE ON DIVERSE TEST SAMPLES
Testing on 8 diverse samples from test set:

‚úÖ '3wordsaftersex my turn yet...'
 True: Joy        | Pred: Joy        | Conf: 81.1%

‚ùå 'i have turned into a fast food whore...'
 True: Joy        | Pred: Anger      | Conf: 43.2%

‚úÖ 'lulion07 im praying for you sorry to hear about your bro man...'
 True: Sadness    | Pred: Sadness    | Conf: 99.0%

‚úÖ 'i feel sicklike dont wanna get out of bed be bothered dont go 2 work t...'
 True: Sadness    | Pred: Sadness    | Conf: 46.5%

‚ùå 'hmmosaka last show todayvery sad i can decode ur msg haha cant wait ti...'
 True: Neutral    | Pred: Sadness    | Conf: 94.3%

‚úÖ 'divinediva1 norwood house party haaaaaa yaaaaaaay smiles...'
 True: Neutral    | Pred: Neutral    | Conf: 98.2%

‚úÖ 'gooooodnight i fully gave up on my english pride and prejudice love th...'
 True: Anger      | Pred: Anger      