# Multilingual NLP Capstone: Monolingual vs. Multilingual Models

This notebook evaluates and compares two transformer models on sentiment analysis across 8 languages:
1. **Monolingual:** `roberta-base` (English only)
2. **Multilingual:** `xlm-roberta-base` (multilingual support)

The pipeline includes data preprocessing, fine-tuning, evaluation (Accuracy/F1), visualization, and a Gradio demo.

In [None]:
# Install necessary libraries
!pip install transformers datasets pandas scikit-learn matplotlib seaborn gradio accelerate

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from datasets import Dataset, DatasetDict
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    XLMRobertaTokenizer,
    XLMRobertaForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline
)

## 1. Data Loading
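Upload your `data.zip` file containing one folder per language (arabic, english, french, etc.). Each language folder must contain `train.jsonl`, `validation.jsonl`, and `test.jsonl`; the code below reads a `text` and a `label` field from every record.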

In [None]:
# If running in Colab, uncomment the lines below to upload data
# from google.colab import files
# uploaded = files.upload()
# !unzip -o data.zip -d /content/

In [None]:
# Load datasets
# Expected layout: <data_path>/<language>/{train,validation,test}.jsonl
# Ensure your data folder structure matches these paths

languages = ['arabic', 'english', 'french', 'german', 'italian', 'spanish', 'hindi', 'portuguese']
data_path = 'data/' # Adjust if your folder is named differently
SHUFFLE_SEED = 42  # fixed so the shuffled training order is the same on every run

# Key suffix used in `dfs` -> file name on disk.
split_files = {'train': 'train.jsonl', 'val': 'validation.jsonl', 'test': 'test.jsonl'}

# dfs maps "<language>_<split>" to a DataFrame, e.g. dfs['english_train'].
dfs = {}
for lang in languages:
    for split, file_name in split_files.items():
        dfs[f'{lang}_{split}'] = pd.read_json(f'{data_path}{lang}/{file_name}', lines=True)

# Combine for Multilingual Training
train_dfs = [dfs[f'{lang}_train'] for lang in languages]
val_dfs = [dfs[f'{lang}_val'] for lang in languages]

# Shuffle the concatenated training rows so languages are interleaved;
# random_state makes the shuffle reproducible across kernel restarts.
df_train = (
    pd.concat(train_dfs, ignore_index=True)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
df_val = pd.concat(val_dfs, ignore_index=True)

# Create Hugging Face Datasets
ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)

dataset = DatasetDict({
    'train': ds_train,
    'validation': ds_val
})

print("Combined Dataset Summary:", dataset)

## 2. Model & Tokenizer Initialization

In [None]:
# Both models get a fresh 3-class sequence-classification head.

# --- Model 1: Monolingual (RoBERTa) ---
mono_model_name = "roberta-base"
mono_tokenizer, mono_model = (
    RobertaTokenizer.from_pretrained(mono_model_name),
    RobertaForSequenceClassification.from_pretrained(mono_model_name, num_labels=3),
)

# --- Model 2: Multilingual (XLM-RoBERTa) ---
multi_model_name = "xlm-roberta-base"
multi_tokenizer, multi_model = (
    XLMRobertaTokenizer.from_pretrained(multi_model_name),
    XLMRobertaForSequenceClassification.from_pretrained(multi_model_name, num_labels=3),
)

In [None]:
def clean_text(text):
    """Normalize a raw text string before tokenization.

    Steps, applied in order:
      1. Replace URLs (starting with ``http://``, ``https://`` or ``www.``)
         with the placeholder ``http``.
      2. Replace ``@mentions`` with the placeholder ``@user``.
      3. Remove ``#`` characters (the hashtag word itself is kept).
      4. Collapse every whitespace run to a single space and strip the ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # `https?` so that plain http:// links are masked as well; matching only
    # https:// left them untouched (or half-replaced when followed by www.).
    text = re.sub(r"https?://\S+|www\.\S+", "http", text)
    text = re.sub(r"@\w+", "@user", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Tokenization Functions
def clean_and_tokenize_mono(batch):
    """Clean each entry of ``batch["text"]`` and tokenize it with ``mono_tokenizer``.

    Truncation is enabled; no padding is requested here.

    Args:
        batch: Mapping with a ``"text"`` entry holding an iterable of strings.

    Returns:
        Whatever ``mono_tokenizer`` returns for the cleaned texts.
    """
    texts = list(map(clean_text, batch["text"]))
    return mono_tokenizer(texts, truncation=True)

def clean_and_tokenize_multi(batch):
    """Clean each entry of ``batch["text"]`` and tokenize it with ``multi_tokenizer``.

    Truncation is enabled; no padding is requested here.

    Args:
        batch: Mapping with a ``"text"`` entry holding an iterable of strings.

    Returns:
        Whatever ``multi_tokenizer`` returns for the cleaned texts.
    """
    texts = list(map(clean_text, batch["text"]))
    return multi_tokenizer(texts, truncation=True)

# Tokenize both corpora.
# Monolingual model: English train/validation splits only.
english_splits = {
    'train': Dataset.from_pandas(dfs['english_train']),
    'validation': Dataset.from_pandas(dfs['english_val']),
}
dataset_mono = DatasetDict(english_splits)
tokenized_dataset_mono = dataset_mono.map(clean_and_tokenize_mono, batched=True)

# Multilingual model: the combined dataset.
tokenized_dataset_multi = dataset.map(clean_and_tokenize_multi, batched=True)

# One padding collator per tokenizer.
data_collator_mono = DataCollatorWithPadding(tokenizer=mono_tokenizer)
data_collator_multi = DataCollatorWithPadding(tokenizer=multi_tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and macro-averaged F1.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predicted class is
            the argmax of ``predictions`` along axis 1.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="macro"),
    }

## 3. Training

In [None]:
# --- Train Monolingual Model ---
# Hyperparameters are collected first, then expanded into TrainingArguments.
mono_hyperparams = dict(
    output_dir="./results_mono",
    # optimisation
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    # logging / evaluation / checkpointing: evaluate and save once per epoch
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)
training_args_mono = TrainingArguments(**mono_hyperparams)

trainer_mono = Trainer(
    model=mono_model,
    args=training_args_mono,
    train_dataset=tokenized_dataset_mono["train"],
    eval_dataset=tokenized_dataset_mono["validation"],
    tokenizer=mono_tokenizer,
    data_collator=data_collator_mono,
    compute_metrics=compute_metrics,
)

print("Starting Monolingual Training...")
trainer_mono.train()

# Persist the fine-tuned model.
trainer_mono.save_model("./models/roberta-base-finetuned")

In [None]:
# --- Train Multilingual Model ---
# Hyperparameters are collected first, then expanded into TrainingArguments.
multi_hyperparams = dict(
    output_dir="./results_multi",
    # optimisation
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    # logging / evaluation / checkpointing: evaluate and save once per epoch
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)
training_args_multi = TrainingArguments(**multi_hyperparams)

trainer_multi = Trainer(
    model=multi_model,
    args=training_args_multi,
    train_dataset=tokenized_dataset_multi["train"],
    eval_dataset=tokenized_dataset_multi["validation"],
    tokenizer=multi_tokenizer,
    data_collator=data_collator_multi,
    compute_metrics=compute_metrics,
)

print("Starting Multilingual Training...")
trainer_multi.train()

# Persist the fine-tuned model.
trainer_multi.save_model("./models/xlm-roberta-base-finetuned")

## 4. Evaluation & Visualization

In [None]:
# Evaluate both fine-tuned models on every language's test split.
# `results` collects one record per (language, model) with accuracy and macro F1.
results = []
# Display names for the class ids, in id order.
# NOTE(review): assumes the dataset encodes labels as 0/1/2 in this order — confirm.
labels_list = ["Negative", "Neutral", "Positive"]
label_ids = list(range(len(labels_list)))

print("--- Evaluating models language-wise ---")

for lang in languages:
    test_df = dfs[f'{lang}_test']
    ds_test = Dataset.from_pandas(test_df)

    # Preprocess: each model needs its own tokenizer's encoding of the same rows.
    tok_test_mono = ds_test.map(clean_and_tokenize_mono, batched=True)
    tok_test_multi = ds_test.map(clean_and_tokenize_multi, batched=True)
    # Taken from the DataFrame so the metrics always receive a plain array;
    # row order is the same as in ds_test, which was built from test_df.
    true_labels = np.asarray(test_df['label'])

    # 1. Monolingual Preds
    mono_out = trainer_mono.predict(tok_test_mono)
    mono_preds = np.argmax(mono_out.predictions, axis=1)
    mono_f1 = f1_score(true_labels, mono_preds, average="macro")
    results.append({
        "language": lang,
        "model": "roberta-base",
        "accuracy": accuracy_score(true_labels, mono_preds),
        "f1": mono_f1,
    })

    # 2. Multilingual Preds
    multi_out = trainer_multi.predict(tok_test_multi)
    multi_preds = np.argmax(multi_out.predictions, axis=1)
    multi_f1 = f1_score(true_labels, multi_preds, average="macro")
    results.append({
        "language": lang,
        "model": "xlm-roberta-base",
        "accuracy": accuracy_score(true_labels, multi_preds),
        "f1": multi_f1,
    })

    # Plot Confusion Matrix for English and Hindi as examples
    if lang in ['english', 'hindi']:
        fig, ax = plt.subplots(1, 2, figsize=(12, 5))
        panels = [
            (mono_preds, 'Blues', f'RoBERTa on {lang}'),
            (multi_preds, 'Oranges', f'XLM-RoBERTa on {lang}'),
        ]
        for axis, (preds, cmap, title) in zip(ax, panels):
            # labels=label_ids keeps the matrix 3x3 and in a fixed order even
            # when a class is absent from this language's labels or predictions.
            matrix = confusion_matrix(true_labels, preds, labels=label_ids)
            sns.heatmap(
                matrix, annot=True, fmt='d', cmap=cmap, ax=axis,
                xticklabels=labels_list, yticklabels=labels_list,
            )
            axis.set_title(title)
            axis.set_xlabel('Predicted label')
            axis.set_ylabel('True label')
        fig.tight_layout()
        plt.show()
        plt.close(fig)  # release the figure; one is created per plotted language

# Create Results DataFrame
results_df = pd.DataFrame(results)
print(results_df)

In [None]:
# Grouped bar chart: one bar per model for every language, y-axis fixed to [0, 1].
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(
    data=results_df,
    x="language",
    y="f1",
    hue="model",
    palette=["#6495ED", "#FF6347"],
    ax=ax,
)
ax.set_title("F1-Score: Monolingual vs Multilingual Models")
ax.set_ylim(0, 1)
plt.show()

## 5. Gradio Demo

In [None]:
# Build the inference pipeline from the saved multilingual checkpoint;
# the same directory provides both the model and the tokenizer.
model_path = "./models/xlm-roberta-base-finetuned"
classifier = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

# Attach readable class names (and the reverse lookup) to the loaded model's config.
id2label = dict(enumerate(["Negative", "Neutral", "Positive"]))
classifier.model.config.id2label = id2label
classifier.model.config.label2id = {name: idx for idx, name in id2label.items()}

def predict(text):
    """Classify ``text`` and return a score for every sentiment class.

    The text is passed through ``clean_text`` before being sent to ``classifier``.

    Args:
        text: Raw input string from the UI.

    Returns:
        dict mapping each label name to its score, so the ``gr.Label`` output
        can show all classes instead of only the top one.
    """
    text = clean_text(text)
    # top_k=None requests the scores of all classes (the default is top-1 only);
    # truncation=True matches the truncation used when tokenizing for training.
    scores = classifier(text, top_k=None, truncation=True)
    # Depending on the transformers version, a single input may come back
    # wrapped in an extra list; unwrap it so `scores` is a flat list of dicts.
    if scores and isinstance(scores[0], list):
        scores = scores[0]
    return {entry['label']: entry['score'] for entry in scores}

# UI components are created first, then wired to `predict`.
text_input = gr.Textbox(placeholder="Enter text in any supported language...")
label_output = gr.Label(num_top_classes=3)

iface = gr.Interface(
    fn=predict,
    inputs=text_input,
    outputs=label_output,
    title="Multilingual Sentiment Analysis",
    description="Detects Positive, Neutral, or Negative sentiment in 8 languages.",
)

# Start the demo app.
iface.launch()