In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr

from datasets import Dataset, DatasetDict
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    XLMRobertaTokenizer,
    XLMRobertaForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline
)
from sklearn.metrics import accuracy_score, f1_score

def _read_language_splits(language):
    """Read the three JSON-lines splits for one language.

    Files are read from ``data/<language>/<split>.jsonl`` in the order
    test, train, validation, and returned as a tuple in that same order.
    """
    return tuple(
        pd.read_json(f'data/{language}/{split}.jsonl', lines=True)
        for split in ('test', 'train', 'validation')
    )


# One (test, train, validation) triple per language, in the same order as before.
df_arabic_test, df_arabic_train, df_arabic_val = _read_language_splits('arabic')
df_english_test, df_english_train, df_english_val = _read_language_splits('english')
df_french_test, df_french_train, df_french_val = _read_language_splits('french')
df_german_test, df_german_train, df_german_val = _read_language_splits('german')
df_italian_test, df_italian_train, df_italian_val = _read_language_splits('italian')
df_spanish_test, df_spanish_train, df_spanish_val = _read_language_splits('spanish')
df_hindi_test, df_hindi_train, df_hindi_val = _read_language_splits('hindi')
df_portuguese_test, df_portuguese_train, df_portuguese_val = _read_language_splits('portuguese')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- 1. Collect the per-language DataFrames ---
train_dfs = [
    df_arabic_train, df_english_train, df_french_train, df_german_train,
    df_italian_train, df_spanish_train, df_hindi_train, df_portuguese_train
]

val_dfs = [
    df_arabic_val, df_english_val, df_french_val, df_german_val,
    df_italian_val, df_spanish_val, df_hindi_val, df_portuguese_val
]

# Test sets stay separate and keyed by language so that each language can be
# evaluated on its own later in the notebook.
test_dfs = {
    "arabic": df_arabic_test,
    "english": df_english_test,
    "french": df_french_test,
    "german": df_german_test,
    "italian": df_italian_test,
    "spanish": df_spanish_test,
    "hindi": df_hindi_test,
    "portuguese": df_portuguese_test
}

# --- 2. Concatenate them into single DataFrames ---
df_train = pd.concat(train_dfs, ignore_index=True)
df_val = pd.concat(val_dfs, ignore_index=True)

# Shuffle the training data to mix the languages. The seed is fixed so that
# re-running the notebook yields the same row order (and therefore comparable
# training runs); without it every execution produced a different shuffle.
SHUFFLE_SEED = 42
df_train = df_train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# --- 3. Convert pandas DataFrames to Hugging Face Dataset objects ---
ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)

# --- 4. Combine them into a single DatasetDict ---
dataset = DatasetDict({
    'train': ds_train,
    'validation': ds_val
})

print("Combined DatasetDict created:")
print(dataset)

Combined DatasetDict created:
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 14712
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2592
    })
})


In [3]:
# Both classifiers are built with the same 3-way head (label ids 0, 1, 2).
NUM_LABELS = 3

# --- Model 1: Monolingual (RoBERTa) ---
mono_model_name = "roberta-base"
mono_tokenizer = RobertaTokenizer.from_pretrained(mono_model_name)
mono_model = RobertaForSequenceClassification.from_pretrained(
    mono_model_name,
    num_labels=NUM_LABELS,
)

# --- Model 2: Multilingual (XLM-RoBERTa) ---
multi_model_name = "xlm-roberta-base"
multi_tokenizer = XLMRobertaTokenizer.from_pretrained(multi_model_name)
multi_model = XLMRobertaForSequenceClassification.from_pretrained(
    multi_model_name,
    num_labels=NUM_LABELS,
)

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [None]:
def clean_text(text):
    """Normalize a raw text string before it is tokenized.

    The steps run in this order:
      1. Every URL (starting with ``http://``, ``https://`` or ``www.``) is
         replaced by the literal token ``http``.
      2. Every ``@mention`` is replaced by ``@user``.
      3. ``#`` characters are removed; the hashtag word itself is kept.
      4. Runs of whitespace collapse to a single space and the result is
         stripped of leading/trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # `https?` makes the trailing "s" optional: the previous pattern only
    # matched https:// links, so plain http:// URLs were left untouched.
    text = re.sub(r"https?://\S+|www\.\S+", "http", text)
    text = re.sub(r"@\w+", "@user", text)
    text = text.replace("#", "")
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Preprocessing function for monolingual model
def clean_and_tokenize_mono(batch):
    """Clean each entry of ``batch["text"]`` and tokenize it with ``mono_tokenizer``.

    Truncation is enabled on the tokenizer call; the tokenizer's output is
    returned unchanged.
    """
    normalized = list(map(clean_text, batch["text"]))
    return mono_tokenizer(normalized, truncation=True)

# Preprocessing function for multilingual model
def clean_and_tokenize_multi(batch):
    """Clean each entry of ``batch["text"]`` and tokenize it with ``multi_tokenizer``.

    Truncation is enabled on the tokenizer call; the tokenizer's output is
    returned unchanged.
    """
    normalized = list(map(clean_text, batch["text"]))
    return multi_tokenizer(normalized, truncation=True)

# --- 3. Apply Tokenization and 4. Define Data Collators, one model at a time ---
print("Applying cleaning and tokenization...")

# Monolingual (roberta-base): tokenized splits plus a padding collator
# built from the same tokenizer.
tokenized_dataset_mono = dataset.map(clean_and_tokenize_mono, batched=True)
data_collator_mono = DataCollatorWithPadding(tokenizer=mono_tokenizer)

# Multilingual (xlm-roberta-base): same two steps with its own tokenizer.
tokenized_dataset_multi = dataset.map(clean_and_tokenize_multi, batched=True)
data_collator_multi = DataCollatorWithPadding(tokenizer=multi_tokenizer)

print("Preprocessing complete!")

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predicted class is
            taken as the argmax of ``predictions`` along axis 1.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="macro"),
    }

In [None]:
# Hyperparameters for the monolingual (roberta-base) run.
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Schedule and regularisation
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Log every 100 steps; evaluate and checkpoint once per epoch.
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` - confirm against the installed version.
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no `metric_for_best_model` is set, so the "best" checkpoint
    # is presumably chosen by the library default rather than by F1 - confirm.
    load_best_model_at_end=True,
    # Disables wandb/tensorboard logging if you don't have it set up
    report_to="none",
)

# Trainer wiring: model + tokenizer, the tokenized splits, and the shared metric function.
trainer_mono = Trainer(
    model=mono_model,
    tokenizer=mono_tokenizer,
    data_collator=data_collator_mono,
    train_dataset=tokenized_dataset_mono["train"],
    eval_dataset=tokenized_dataset_mono["validation"],
    compute_metrics=compute_metrics,
    args=training_args,
)

print("--- Training Monolingual Model (roberta-base) ---")
trainer_mono.train()
trainer_mono.save_model("./models/roberta-base-finetuned")

In [None]:
# Hyperparameters for the multilingual (xlm-roberta-base) run. The values match
# the monolingual run; only the output and logging directories differ.
training_args_multi = TrainingArguments(
    # Output locations
    output_dir="./results_multi",
    logging_dir="./logs_multi",
    # Schedule and regularisation
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Log every 100 steps; evaluate and checkpoint once per epoch.
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` - confirm against the installed version.
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no `metric_for_best_model` is set, so the "best" checkpoint
    # is presumably chosen by the library default rather than by F1 - confirm.
    load_best_model_at_end=True,
    report_to="none",
)

# Trainer wiring: model + tokenizer, the tokenized splits, and the shared metric function.
trainer_multi = Trainer(
    model=multi_model,
    tokenizer=multi_tokenizer,
    data_collator=data_collator_multi,
    train_dataset=tokenized_dataset_multi["train"],
    eval_dataset=tokenized_dataset_multi["validation"],
    compute_metrics=compute_metrics,
    args=training_args_multi,
)

print("--- Training Multilingual Model (xlm-roberta-base) ---")
trainer_multi.train()
trainer_multi.save_model("./models/xlm-roberta-base-finetuned")


In [None]:
results = []
id_to_label = {0: "Negative", 1: "Neutral", 2: "Positive"}


def _print_first_example(heading, texts, true_ids, pred_ids, want_correct):
    """Print the first row whose prediction is correct (or incorrect).

    Args:
        heading: Word printed in front of " Example:" (e.g. "SUCCESS").
        texts: Iterable of input texts.
        true_ids: Iterable of gold label ids, aligned with ``texts``.
        pred_ids: Iterable of predicted label ids, aligned with ``texts``.
        want_correct: True to look for a correct prediction, False to look
            for an incorrect one.

    Returns:
        True if a matching row was found and printed, False otherwise.
    """
    # zip() stops at the shortest input, so short test sets cannot raise an
    # IndexError, and the whole set is scanned rather than only the first rows.
    for text, true_id, pred_id in zip(texts, true_ids, pred_ids):
        if (int(true_id) == int(pred_id)) == want_correct:
            true_label = id_to_label[int(true_id)]
            pred_label = id_to_label[int(pred_id)]
            print(f"{heading} Example:")
            print(f"Text: {text}")
            print(f"True: {true_label} | Predicted: {pred_label}\n")
            return True
    # Say so explicitly instead of printing nothing.
    print(f"No {heading} example found.\n")
    return False


print("--- Evaluating models language-wise ---")

# Loop through the per-language test DataFrames in `test_dfs`
for lang, lang_df in test_dfs.items():

    # 1. Convert the language-specific pandas DataFrame to a Dataset
    lang_test_set = Dataset.from_pandas(lang_df)

    # 2. Tokenize it for BOTH models (each model has its own tokenizer)
    tokenized_lang_test_mono = lang_test_set.map(clean_and_tokenize_mono, batched=True)
    tokenized_lang_test_multi = lang_test_set.map(clean_and_tokenize_multi, batched=True)

    # 3. Evaluate Monolingual Model
    print(f"Evaluating roberta-base on {lang}...")
    mono_preds = trainer_mono.predict(tokenized_lang_test_mono)
    results.append({
        "model": "roberta-base",
        "language": lang,
        "f1": mono_preds.metrics["test_f1"],
        "accuracy": mono_preds.metrics["test_accuracy"]
    })

    # 4. Evaluate Multilingual Model
    print(f"Evaluating xlm-roberta-base on {lang}...")
    multi_preds = trainer_multi.predict(tokenized_lang_test_multi)
    results.append({
        "model": "xlm-roberta-base",
        "language": lang,
        "f1": multi_preds.metrics["test_f1"],
        "accuracy": multi_preds.metrics["test_accuracy"]
    })

    # --- 5. Qualitative Analysis (Example for French) ---
    # Show one correct and one incorrect XLM-R prediction.
    if lang == 'french':
        print(f"\n--- Qualitative Analysis for {lang} (XLM-R) ---")
        predicted_labels = np.argmax(multi_preds.predictions, axis=1)
        texts = lang_test_set["text"]
        true_ids = lang_test_set["label"]

        _print_first_example("SUCCESS", texts, true_ids, predicted_labels, want_correct=True)
        _print_first_example("FAILURE", texts, true_ids, predicted_labels, want_correct=False)


# --- 6. Convert results to a DataFrame for easy analysis ---
results_df = pd.DataFrame(results)
print("--- Final Results ---")
print(results_df.pivot(index="language", columns="model", values="f1"))

In [None]:
# Grouped bar chart: one pair of bars (monolingual vs. multilingual) per language.
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(
    data=results_df,
    x="language",
    y="f1",
    hue="model",
    palette=["#6495ED", "#FF6347"],
    ax=ax,
)
ax.set_title("F1-Score Comparison: Monolingual (roberta-base) vs. Multilingual (xlm-roberta-base)")
ax.set_ylabel("Macro F1-Score")
ax.set_xlabel("Language")
ax.legend(title="Model")
# Tilt the language names on the x axis.
ax.tick_params(axis="x", labelrotation=45)
plt.show()

In [None]:
# Accuracy comparison per language.
# This cell used to be a verbatim copy of the preceding F1 chart, while the
# "accuracy" column collected in `results_df` was never visualised; it now
# plots that column instead of drawing the same figure twice.
plt.figure(figsize=(14, 7))
sns.barplot(
    x="language",
    y="accuracy",
    hue="model",
    data=results_df,
    palette=["#6495ED", "#FF6347"]
)
plt.title("Accuracy Comparison: Monolingual (roberta-base) vs. Multilingual (xlm-roberta-base)")
plt.ylabel("Accuracy")
plt.xlabel("Language")
plt.legend(title="Model")
plt.xticks(rotation=45)
plt.show()