In [3]:
import os
import json
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
import pytorch_lightning as pl
from pytorch_lightning.loggers import CSVLogger
from ray import tune
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

In [4]:
class SentimentDataModule(pl.LightningDataModule):
    """Lightning data module that tokenizes train/val/test DataFrames.

    Every DataFrame must contain a ``text`` and a ``label`` column; these are
    the only columns read by :meth:`tokenize`.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding='max_length', max_length=...)``.
        train_df: DataFrame backing the training dataloader.
        val_df: DataFrame backing the validation dataloader.
        test_df: DataFrame backing the test dataloader.
        batch_size: Batch size used by all three dataloaders.
        max_length: Length every example is truncated/padded to (default 128).
    """

    def __init__(self, tokenizer, train_df, val_df, test_df, batch_size, max_length=128):
        super().__init__()
        self.tokenizer = tokenizer
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.batch_size = batch_size
        self.max_length = max_length
        # Filled in lazily by setup(); None means "not tokenized yet".
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def tokenize(self, df):
        """Convert ``df`` to a ``datasets.Dataset`` and add the tokenizer outputs.

        Args:
            df: DataFrame with ``text`` and ``label`` columns.

        Returns:
            The mapped dataset: the ``text`` and ``label`` columns plus whatever
            fields the tokenizer returns.
        """
        # preserve_index=False: callers pass filtered slices whose non-range index
        # would otherwise be stored as an extra `__index_level_0__` column and be
        # collated into every batch.
        ds = Dataset.from_pandas(df[['text', 'label']], preserve_index=False)
        return ds.map(
            lambda x: self.tokenizer(
                x['text'],
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
            ),
            batched=True,
        )

    def setup(self, stage=None):
        """Tokenize all three splits once and expose them as torch-formatted datasets.

        Lightning calls ``setup`` once per Trainer stage (fit, test, ...); the
        guard below keeps later calls from re-tokenizing the same data.
        """
        if self.train_dataset is not None:
            return
        self.train_dataset = self.tokenize(self.train_df).with_format("torch")
        self.val_dataset = self.tokenize(self.val_df).with_format("torch")
        self.test_dataset = self.tokenize(self.test_df).with_format("torch")

    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return the validation dataloader (original order)."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return the test dataloader (original order)."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

In [5]:
class SentimentClassifier(pl.LightningModule):
    """Lightning wrapper around a Hugging Face sequence-classification model.

    Args:
        model_name: Checkpoint name/path passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        learning_rate: Peak learning rate for AdamW.
        num_labels: Number of output classes (default 3).

    Batches are dicts with ``input_ids`` and ``attention_mask`` tensors and,
    for training/validation/testing, a ``label`` tensor.
    """

    def __init__(self, model_name, learning_rate, num_labels=3):
        super().__init__()
        # Stores the constructor arguments under self.hparams
        # (configure_optimizers reads self.hparams.learning_rate).
        self.save_hyperparameters()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    def forward(self, batch):
        """Run the wrapped model; ``labels`` is passed as None when the batch has no ``label`` key."""
        return self.model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch.get('label')
        )

    @staticmethod
    def _accuracy(preds, labels):
        """Fraction of matching entries, computed as a tensor on the inputs' device."""
        return (preds == labels).float().mean()

    def training_step(self, batch, batch_idx):
        """Return the model's loss for one training batch."""
        outputs = self(batch)
        return outputs.loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and ``val_accuracy`` for one validation batch."""
        outputs = self(batch)
        preds = torch.argmax(outputs.logits, dim=1)
        acc = self._accuracy(preds, batch['label'])
        # Explicit batch_size so the epoch-level mean weights a smaller final
        # batch correctly instead of relying on batch-size inference.
        n = batch['input_ids'].size(0)
        self.log('val_loss', outputs.loss, batch_size=n)
        self.log('val_accuracy', acc, prog_bar=True, batch_size=n)
        return {"loss": outputs.loss, "acc": acc}

    def test_step(self, batch, batch_idx):
        """Log ``test_accuracy`` and return CPU predictions and labels for the batch."""
        outputs = self(batch)
        preds = torch.argmax(outputs.logits, dim=1)
        acc = self._accuracy(preds, batch['label'])
        self.log('test_accuracy', acc, batch_size=batch['input_ids'].size(0))
        return {"preds": preds.cpu(), "labels": batch['label'].cpu()}

    def predict_step(self, batch, batch_idx):
        """Return CPU predictions, plus ``labels`` when the batch carries a ``label`` key.

        The model is called without labels, so no loss is computed here.
        """
        outputs = self.model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask']
        )
        preds = torch.argmax(outputs.logits, dim=1)
        result = {"preds": preds.cpu()}
        # Unlabeled inference batches have no 'label' key; don't fail on them.
        labels = batch.get('label')
        if labels is not None:
            result["labels"] = labels.cpu()
        return result

    def configure_optimizers(self):
        """Build AdamW plus the ``"linear"`` schedule with 10% warm-up, stepped every batch."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.learning_rate, weight_decay=0.1)
        # Total number of optimizer steps Lightning expects for this run.
        total_steps = self.trainer.estimated_stepping_batches
        scheduler = get_scheduler(
            name="linear",
            optimizer=optimizer,
            num_warmup_steps=int(0.1 * total_steps),
            num_training_steps=total_steps,
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                # Advance the schedule after every optimizer step, not every epoch.
                "interval": "step",
                "frequency": 1,
            },
        }

In [6]:
def preprocess_data(df, text_column):
    """Select and normalise the columns needed for sentiment training.

    Keeps ``Split``, ``text_column`` (renamed to ``text``) and ``Sentiment``
    (renamed to ``label``), drops rows with a missing value in any of the
    three, keeps only labels 0, 1 or 2, and returns the labels as integers.

    Args:
        df: Raw DataFrame containing ``Split``, ``Sentiment`` and ``text_column``.
        text_column: Name of the column holding the text to classify.

    Returns:
        A new DataFrame with columns ``Split``, ``text``, ``label``; the input
        frame is not modified and the original row index is kept.

    Raises:
        KeyError: If any of the required columns is missing from ``df``.
    """
    selected = df[['Split', text_column, 'Sentiment']].rename(
        columns={text_column: 'text', 'Sentiment': 'label'}
    )
    selected = selected.dropna()
    selected = selected[selected['label'].isin([0, 1, 2])]
    # A raw column containing NaNs is parsed by pandas as float, so the labels
    # are still 0.0/1.0/2.0 after dropna; cast back so downstream tensors hold
    # integer class ids.
    return selected.assign(label=selected['label'].astype(int))

def plot_confusion_matrix(cm, class_names, filename):
    """Draw ``cm`` as an annotated heatmap and save it to ``filename``.

    Args:
        cm: Confusion matrix of integer counts (cells are formatted with ``'d'``).
        class_names: Tick labels used on both axes.
        filename: Output path handed to ``savefig``.

    The figure is closed after saving, so nothing is displayed inline.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)

In [None]:
def run_training(language, base_dir="/kaggle/input/translated-data",
                 modes=('Cleaned_Fluent_Translation',), seed=42):
    """Tune, retrain and evaluate the sentiment classifier for one language.

    For every text column in ``modes`` this function:

    1. reads ``{base_dir}/{language}_cleaned.csv`` and splits it on the
       ``Split`` column (``train`` / ``val`` / ``test``);
    2. grid-searches learning rate and batch size with Ray Tune, ranking
       trials by the reported validation accuracy;
    3. retrains with the best configuration and evaluates on the test split;
    4. writes ``{language}_{mode}_conf_matrix.png`` and
       ``{language}_{mode}_results.json`` to the current working directory.

    Args:
        language: Dataset prefix, e.g. ``'spanish'``.
        base_dir: Directory containing ``{language}_cleaned.csv``.
        modes: Text column name(s) to train on; a single string is accepted.
        seed: Seed passed to ``pl.seed_everything`` before each training run;
            ``None`` disables seeding.

    Raises:
        ValueError: If the train, val or test split is empty after preprocessing.
    """
    if isinstance(modes, str):
        modes = (modes,)

    df = pd.read_csv(os.path.join(base_dir, f"{language}_cleaned.csv"))
    model_name = 'cardiffnlp/twitter-roberta-base-sentiment'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    class_names = ["Negative", "Neutral", "Positive"]
    # Fixed label set so the metrics always have one row/column per class,
    # even when a class is absent from the test labels and predictions.
    label_ids = list(range(len(class_names)))
    # Requesting a GPU that does not exist would leave the trials unschedulable.
    gpus_per_trial = 1 if torch.cuda.is_available() else 0

    for mode in modes:
        print(f"\nLanguage: {language}, Mode: {mode}")
        data = preprocess_data(df, mode)

        train_df = data[data['Split'] == 'train']
        val_df = data[data['Split'] == 'val']
        test_df = data[data['Split'] == 'test']

        # Fail early with a clear message instead of deep inside training.
        for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
            if split_df.empty:
                raise ValueError(
                    f"No '{split_name}' rows for language={language!r}, mode={mode!r}"
                )

        def train_tune(config):
            """Ray Tune trainable: fit one model for ``config`` and report val accuracy."""
            if seed is not None:
                pl.seed_everything(seed, workers=True)
            dm = SentimentDataModule(tokenizer, train_df, val_df, test_df, config["batch_size"])
            model = SentimentClassifier(model_name, learning_rate=config["lr"])

            trainer = pl.Trainer(
                max_epochs=10,
                logger=CSVLogger("logs", name=f"{language}_{mode}"),
                enable_checkpointing=False,
                # Forward Lightning's `val_accuracy` to Tune as `accuracy`
                # after every validation run.
                callbacks=[TuneReportCallback({"accuracy": "val_accuracy"}, on="validation_end")],
                enable_progress_bar=False
            )
            trainer.fit(model, dm)

        config = {
            "lr": tune.grid_search([1e-5, 2e-5, 3e-5]),
            "batch_size": tune.grid_search([16, 32])
        }

        analysis = tune.run(
            train_tune,
            config=config,
            metric="accuracy",
            mode="max",
            num_samples=1,
            resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        )

        best_config = analysis.get_best_config("accuracy", mode="max")
        print(f"Best config: {best_config}")

        # Retrain from the pretrained checkpoint with the winning
        # hyperparameters; the tuning trials keep no checkpoints.
        if seed is not None:
            pl.seed_everything(seed, workers=True)
        dm = SentimentDataModule(tokenizer, train_df, val_df, test_df, best_config["batch_size"])
        model = SentimentClassifier(model_name, learning_rate=best_config["lr"])
        trainer = pl.Trainer(max_epochs=10, enable_checkpointing=False, logger=False)
        trainer.fit(model, dm)

        results = trainer.test(model, datamodule=dm)[0]
        predictions = trainer.predict(model, dataloaders=dm.test_dataloader())

        preds = torch.cat([x['preds'] for x in predictions]).numpy()
        labels = torch.cat([x['labels'] for x in predictions]).numpy()
        cm = confusion_matrix(labels, preds, labels=label_ids)
        plot_confusion_matrix(cm, class_names, f"{language}_{mode}_conf_matrix.png")

        metrics = {
            "config": best_config,
            "accuracy": results['test_accuracy'],
            "classification_report": classification_report(
                labels,
                preds,
                labels=label_ids,
                target_names=class_names,
                output_dict=True,
                # Same values as the default for empty classes, without the warnings.
                zero_division=0,
            ),
            "confusion_matrix": cm.tolist()
        }
        with open(f"{language}_{mode}_results.json", "w") as f:
            json.dump(metrics, f, indent=2)
        print(f"Saved: {language}_{mode}_results.json")

In [None]:
!pip install -U tensorboardx

In [None]:
# Tune, retrain and evaluate on the Spanish data; outputs are written as spanish_<mode>_* files.
run_training(language='spanish')

In [None]:
# Tune, retrain and evaluate on the French data; outputs are written as french_<mode>_* files.
run_training(language='french')

In [None]:
# Tune, retrain and evaluate on the Italian data; outputs are written as italian_<mode>_* files.
run_training(language='italian')

In [None]:
# Tune, retrain and evaluate on the German data; outputs are written as german_<mode>_* files.
run_training(language='german')

In [None]:
# Tune, retrain and evaluate on the Arabic data; outputs are written as arabic_<mode>_* files.
run_training(language='arabic')