In [2]:
import json
import os
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback

2025-04-23 04:21:37.446178: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745382097.469055     306 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745382097.475962     306 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [24]:
def preprocess_data(df, text_column):
    """Select and normalise the columns needed for sentiment fine-tuning.

    Args:
        df: DataFrame containing at least the columns 'Split', ``text_column``
            and 'Sentiment'. It is not modified.
        text_column: Name of the column holding the text to classify.

    Returns:
        A new DataFrame with columns 'Split', 'text' and 'label'. Rows with a
        missing value in any of the three columns, or whose label is not one of
        0, 1, 2, are removed. 'label' has an integer dtype.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    data = df[['Split', text_column, 'Sentiment']].rename(
        columns={text_column: 'text', 'Sentiment': 'label'}
    )
    # Coerce labels to numbers so that values read as strings (e.g. "1") are
    # kept and unparsable ones become NaN and are dropped below.
    data = data.assign(label=pd.to_numeric(data['label'], errors='coerce')).dropna()
    data = data[data['label'].isin([0, 1, 2])].copy()
    # A column that contained NaN is float64 even after dropna(); class ids
    # must be integers for a classification loss.
    data['label'] = data['label'].astype(int)
    return data

def tokenize_function(example, tokenizer, max_length=128):
    """Tokenize the 'text' field of an example (or batch of examples).

    Args:
        example: Mapping with a 'text' entry; passed straight to ``tokenizer``.
        tokenizer: Callable tokenizer accepting ``truncation``, ``padding`` and
            ``max_length`` keyword arguments.
        max_length: Length every sequence is truncated and padded to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        Whatever ``tokenizer`` returns for the given text.
    """
    return tokenizer(
        example['text'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores; ``labels`` holds the true class ids.

    Returns:
        Dict with keys 'accuracy', 'precision', 'recall' and 'f1'. The last
        three are the 'weighted avg' entries of scikit-learn's
        classification report.
    """
    predictions, labels = eval_pred
    # A model that emits several outputs yields a tuple here; the first
    # element is assumed to be the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    preds = np.asarray(predictions).argmax(axis=-1)

    # zero_division=0 yields the same numbers as the default but without an
    # UndefinedMetricWarning whenever a class is never predicted.
    report = classification_report(labels, preds, output_dict=True, zero_division=0)
    weighted = report['weighted avg']
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': weighted['precision'],
        'recall': weighted['recall'],
        'f1': weighted['f1-score'],
    }

def plot_confusion_matrix(cm, class_names):
    """Render a confusion matrix as an annotated heatmap on a new figure.

    Args:
        cm: Confusion matrix of integer counts (cells are formatted with 'd').
        class_names: Tick labels used for both axes.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure left as the
        current one so the caller can ``savefig``/``close`` it.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')
    fig.tight_layout()
    return plt

In [None]:
def _tokenized_split(data, split, tokenizer):
    """Return the rows of ``data`` whose 'Split' equals ``split`` as a tokenized Dataset."""
    subset = data.loc[data['Split'] == split, ['text', 'label']]
    if subset.empty:
        raise ValueError(f"No usable rows found for split '{split}'")
    # preserve_index=False: do not carry the (filtered, non-contiguous) pandas
    # index into the dataset as an extra column.
    dataset = Dataset.from_pandas(subset, preserve_index=False)
    return dataset.map(lambda batch: tokenize_function(batch, tokenizer), batched=True)


def _print_confusion_matrix(cm, class_names):
    """Print ``cm`` as a plain-text table with ``class_names`` on both axes."""
    print("\n   Confusion Matrix:")
    print("      True \\ Predicted | ", end="")
    for name in class_names:
        print(f"{name:<10}", end=" ")
    print("\n      " + "-" * 50)

    for i, name in enumerate(class_names):
        print(f"      {name:<15} | ", end="")
        for j in range(len(class_names)):
            print(f"{cm[i, j]:<10}", end=" ")
        print()


def finetune_sentiment_model(language):
    """Fine-tune and evaluate the sentiment model on one language's translations.

    Reads ``/kaggle/input/translated-data/{language}_cleaned.csv`` and, for
    each of the 'Cleaned_Literal_Translation' and 'Cleaned_Fluent_Translation'
    columns, fine-tunes 'cardiffnlp/twitter-roberta-base-sentiment' on the
    'train' split with early stopping on validation accuracy, then evaluates
    the best checkpoint on the 'test' split.

    For every column the following is written to the working directory:
      * ``{language}_{column}_confusion_matrix.png``
      * ``{language}_{column}_results.json`` (test metrics, per-class report,
        sample counts and the confusion matrix)
    where ``{column}`` is the lower-cased column name. The temporary
    checkpoint directory is removed afterwards.

    Args:
        language: Prefix of the CSV file name, e.g. "french".

    Raises:
        ValueError: If the 'train', 'val' or 'test' split has no usable rows.
    """
    print(f"\nProcessing language: {language}")
    base_dir = "/kaggle/input/translated-data"
    df = pd.read_csv(f"{base_dir}/{language}_cleaned.csv")

    # Use the original Twitter-specific model
    model_name = 'cardiffnlp/twitter-roberta-base-sentiment'
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Sentiment class names for the confusion matrix; position == label id.
    class_names = ["Negative", "Neutral", "Positive"]
    label_ids = list(range(len(class_names)))

    for mode in ['Cleaned_Literal_Translation', 'Cleaned_Fluent_Translation']:
        print(f"Training using: {mode}")
        run_tag = f"{language}_{mode.lower()}"
        output_dir = f"./results_{run_tag}_temp"

        data = preprocess_data(df, mode)
        train_ds = _tokenized_split(data, 'train', tokenizer)
        val_ds = _tokenized_split(data, 'val', tokenizer)
        test_ds = _tokenized_split(data, 'test', tokenizer)

        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

        training_args = TrainingArguments(
            output_dir=output_dir,
            eval_strategy="epoch",
            # Early stopping and "evaluate with the best model" both rely on
            # checkpoints being written and the best one being reloaded after
            # training; with save_strategy="no" neither could work.
            save_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=True,
            learning_rate=3e-5,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            num_train_epochs=15,
            weight_decay=0.1,
            logging_steps=10,
            logging_dir=f"./logs_{run_tag}",
            report_to="none",
            metric_for_best_model="eval_accuracy",
            greater_is_better=True,
            warmup_ratio=0.1,
        )

        early_stopping_callback = EarlyStoppingCallback(
            early_stopping_patience=2,
            early_stopping_threshold=0.001
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=[early_stopping_callback]
        )

        trainer.train()

        print("Evaluating on test set using the best model...")
        test_metrics = trainer.evaluate(test_ds)

        predictions = trainer.predict(test_ds)
        preds = predictions.predictions.argmax(-1)
        labels = predictions.label_ids

        correct_predictions = (preds == labels).sum()
        total_samples = len(labels)

        print(f"Test set statistics:")
        print(f"Total samples: {total_samples}")
        print(f"Correctly classified: {correct_predictions} ({correct_predictions/total_samples:.2%})")

        # Passing labels= pins the matrix/report to all three classes even if
        # one of them is absent from the test split, so the indexing below and
        # target_names always line up.
        cm = confusion_matrix(labels, preds, labels=label_ids)
        _print_confusion_matrix(cm, class_names)

        plot_path = f"{run_tag}_confusion_matrix.png"
        # The helper returns the pyplot module; bind it to a different name so
        # the module-level `plt` is not shadowed inside this function.
        cm_plot = plot_confusion_matrix(cm, class_names)
        cm_plot.savefig(plot_path)
        cm_plot.close()
        print(f"Saved confusion matrix plot to {plot_path}")

        class_report = classification_report(
            labels,
            preds,
            labels=label_ids,
            target_names=class_names,
            output_dict=True,
            zero_division=0,
        )
        test_metrics.update({
            "detailed_metrics": class_report,
            "total_samples": total_samples,
            "correct_predictions": int(correct_predictions),
            "confusion_matrix": cm.tolist()
        })

        out_file = f"{run_tag}_results.json"
        with open(out_file, "w") as f:
            json.dump(test_metrics, f, indent=2)
        print(f"Saved results to {out_file}")

        # Release the model/optimizer before the next run starts.
        del trainer, model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Remove exactly the directory the Trainer wrote to, wherever the
        # current working directory happens to be.
        if os.path.isdir(output_dir):
            shutil.rmtree(output_dir)
            print(f"Cleaned up temporary checkpoint files")

In [None]:
# Run the fine-tuning pipeline on the French data.
finetune_sentiment_model(language="french")

In [None]:
# Run the fine-tuning pipeline on the Italian data.
finetune_sentiment_model(language="italian")

In [None]:
# Run the fine-tuning pipeline on the German data.
finetune_sentiment_model(language="german")

In [None]:
# Run the fine-tuning pipeline on the Spanish data.
finetune_sentiment_model(language="spanish")

In [None]:
# Run the fine-tuning pipeline on the Arabic data.
finetune_sentiment_model(language="arabic")