In [None]:
pip install torch transformers scikit-learn pandas tqdm lime


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt
from tqdm import tqdm

# ========== Config ==========
# Module-level defaults read by the dataset, training and interpretability helpers below.
# NOTE(review): train_model takes its own USE_FOCAL_LOSS argument, so this global
# looks unused in the visible cells — confirm before relying on it.
USE_FOCAL_LOSS = False
MAX_LENGTH = 128   # max_length passed to the tokenizer (inputs are padded/truncated to it)
BATCH_SIZE = 16    # default batch_size of get_dataloaders
EPOCHS = 20        # number of passes train_model makes over train_loader
LR = 2e-5          # learning rate given to AdamW in train_model
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# ========== Dataset ==========
class EmotionDataset(Dataset):
    """Multi-label text dataset backed by a DataFrame with a 'text' column.

    The label columns are converted once, at construction, into a float32
    matrix; each item is tokenized on access.
    """

    def __init__(self, df, tokenizer, label_cols):
        frame = df.reset_index(drop=True)
        self.df = frame
        self.tokenizer = tokenizer
        self.label_cols = label_cols
        # One row of 0/1 targets per sample, stored as float32.
        self.labels = frame[label_cols].values.astype(np.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return input_ids, attention_mask, labels and the raw text for row idx."""
        text = self.df.iloc[idx]['text']
        encoded = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        item = {}
        # The tokenizer returns a leading batch axis of size 1; drop it.
        item['input_ids'] = encoded['input_ids'].squeeze(0)
        item['attention_mask'] = encoded['attention_mask'].squeeze(0)
        item['labels'] = torch.tensor(self.labels[idx])
        item['raw_text'] = text
        return item


In [None]:

# ========== Model ==========
class XLMREmotionClassifier(nn.Module):
    """XLM-RoBERTa encoder followed by a single linear multi-label head."""

    def __init__(self, num_labels):
        super().__init__()
        backbone = XLMRobertaModel.from_pretrained('xlm-roberta-base')
        self.encoder = backbone
        self.classifier = nn.Linear(backbone.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw logits, one per label, computed from the first token's state."""
        hidden_states = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        first_token = hidden_states[:, 0, :]  # CLS token
        return self.classifier(first_token)

# ========== Focal Loss ==========
def focal_loss(inputs, targets, alpha=0.25, gamma=2.0):
    """Mean focal-style loss on logits.

    Each element's BCE-with-logits term is scaled by alpha * (1 - pt) ** gamma,
    where pt = exp(-BCE); the scaled terms are averaged over all elements.
    """
    per_element = nn.functional.binary_cross_entropy_with_logits(
        inputs, targets, reduction='none'
    )
    p_true = torch.exp(-per_element)
    modulating = (1 - p_true) ** gamma
    weighted = alpha * modulating * per_element
    return weighted.mean()

# ========== Metrics ==========
def compute_metrics(preds, labels):
    """Macro F1 / precision / recall for multi-label logits.

    `preds` are raw logits: they go through a sigmoid and are thresholded at
    0.5 before being compared with `labels`.
    """
    y_pred = (torch.sigmoid(preds) > 0.5).cpu().numpy()
    y_true = labels.cpu().numpy()
    scorers = (
        ('f1_macro', f1_score),
        ('precision_macro', precision_score),
        ('recall_macro', recall_score),
    )
    return {
        name: scorer(y_true, y_pred, average='macro', zero_division=0)
        for name, scorer in scorers
    }


In [None]:
# ========== Training ==========
def train_model(model, train_loader, val_loader, label_cols, USE_FOCAL_LOSS=False):
    """Fine-tune `model` for EPOCHS epochs with AdamW, validating after each one.

    Args:
        model: module called as model(input_ids, attention_mask) -> logits.
        train_loader / val_loader: loaders yielding dicts with 'input_ids',
            'attention_mask' and 'labels'.
        label_cols: forwarded unchanged to evaluate_model.
        USE_FOCAL_LOSS: use focal_loss when true, BCEWithLogitsLoss otherwise.
    """
    optimizer = optim.AdamW(model.parameters(), lr=LR)
    if USE_FOCAL_LOSS:
        criterion = focal_loss
    else:
        criterion = nn.BCEWithLogitsLoss()
    model.to(DEVICE)

    for epoch_idx in range(EPOCHS):
        epoch_no = epoch_idx + 1
        model.train()
        running_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch_no}"):
            ids = batch['input_ids'].to(DEVICE)
            mask = batch['attention_mask'].to(DEVICE)
            targets = batch['labels'].to(DEVICE)

            batch_loss = criterion(model(ids, mask), targets)
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        # Mean of the per-batch losses for this epoch.
        print(f"Epoch {epoch_no} - Loss: {running_loss / len(train_loader):.4f}")
        evaluate_model(model, val_loader, label_cols)

# ========== Evaluation ==========
def evaluate_model(model, val_loader, label_cols):
    """Run `model` over `val_loader`, print and return macro metrics.

    Args:
        model: module called as model(input_ids, attention_mask) -> logits.
        val_loader: loader yielding dicts with 'input_ids', 'attention_mask'
            and 'labels'.
        label_cols: accepted for interface compatibility; not read here.

    Returns:
        The dict produced by compute_metrics, or an empty dict when the loader
        yields no batches. (Earlier versions returned None; the metrics are
        now returned so callers can log or compare them.)
    """
    model.eval()
    logit_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            logits = model(input_ids, attention_mask)
            # Keep accumulated results on the CPU so the whole validation set
            # never has to sit in GPU memory; the labels are only needed for
            # metric computation, so they are not moved to DEVICE at all.
            logit_chunks.append(logits.cpu())
            label_chunks.append(batch['labels'].cpu())

    if not logit_chunks:
        # torch.cat raises on an empty list; report and return instead.
        print("Validation Metrics: {} (validation loader is empty)")
        return {}

    preds = torch.cat(logit_chunks)
    labels = torch.cat(label_chunks)
    metrics = compute_metrics(preds, labels)
    print("Validation Metrics:", metrics)
    return metrics


In [None]:
# ========== LIME ==========
def explain_prediction(model, tokenizer, text, label_names, batch_size=32):
    """Show a LIME explanation of the model's per-label probabilities for `text`.

    Args:
        model: module called as model(input_ids=..., attention_mask=...) -> logits.
        tokenizer: tokenizer used to encode the perturbed texts.
        text: the string to explain.
        label_names: class names given to LIME; every label index is explained.
        batch_size: number of perturbed texts encoded and scored per forward
            pass. LIME scores many perturbations of `text`, so batching avoids
            one forward pass per perturbation.
    """
    model.eval()

    def predict_proba(texts):
        # LIME passes a list of strings and expects an (n_texts, n_labels) array.
        texts = list(texts)
        chunks = []
        for start in range(0, len(texts), batch_size):
            encoding = tokenizer(
                texts[start:start + batch_size],
                return_tensors="pt",
                truncation=True,
                padding='max_length',
                max_length=MAX_LENGTH,
            ).to(DEVICE)
            with torch.no_grad():
                logits = model(**encoding)
            chunks.append(torch.sigmoid(logits).cpu().numpy())
        if not chunks:
            return np.empty((0, len(label_names)), dtype=np.float32)
        return np.concatenate(chunks, axis=0)

    all_labels = list(range(len(label_names)))
    explainer = LimeTextExplainer(class_names=label_names)
    exp = explainer.explain_instance(text, predict_proba, num_features=10, labels=all_labels)
    exp.show_in_notebook()

# ========== Attention Visualization ==========
def plot_attention(model, tokenizer, text):
    """Bar-plot the attention each token receives in layer 0, head 0.

    A token's score is the attention it receives summed over the non-padding
    query positions only. Padded query rows are excluded because they still
    distribute attention over the real tokens and would otherwise inflate
    every score in proportion to the amount of padding.

    Args:
        model: classifier exposing the transformer as `model.encoder`.
        tokenizer: tokenizer matching the encoder.
        text: the string to visualise.
    """
    # Put the model in evaluation mode so the plot does not depend on the
    # train/eval state the caller left it in.
    model.eval()

    encoding = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=MAX_LENGTH)
    input_ids = encoding['input_ids'][0]
    attention_mask = encoding['attention_mask'][0]

    # Move to device
    encoding = {k: v.to(DEVICE) for k, v in encoding.items()}

    # Get attentions from the model
    with torch.no_grad():
        outputs = model.encoder(**encoding, output_attentions=True)

    # Layer 0, Head 0 -> (seq_len, seq_len), rows = queries, columns = keys
    attentions = outputs.attentions[0][0, 0].cpu()
    real_queries = attention_mask.bool()
    token_scores = attentions[real_queries].sum(dim=0).numpy()

    # Convert token IDs to tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Filter: remove special tokens and padding
    special_tokens = set(tokenizer.all_special_tokens)
    filtered_scores = []
    filtered_tokens = []
    for score, token, mask in zip(token_scores, tokens, attention_mask.tolist()):
        if mask == 1 and token not in special_tokens:
            filtered_scores.append(score)
            filtered_tokens.append(token)

    # Plot
    plt.figure(figsize=(12, 4))
    plt.bar(range(len(filtered_tokens)), filtered_scores, color='skyblue')
    plt.xticks(range(len(filtered_tokens)), filtered_tokens, rotation='vertical')
    plt.title("Attention Scores (Layer 0, Head 0)")
    plt.tight_layout()
    plt.show()




In [None]:
def get_dataloaders(csv_path, label_cols, batch_size=BATCH_SIZE):
    """Read a CSV, split it 90/10 and wrap both parts in DataLoaders.

    Returns:
        (train_loader, val_loader, tokenizer, train_df, val_df). Only the
        training loader shuffles; the split uses random_state=42.
    """
    frame = pd.read_csv(csv_path)
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

    train_df, val_df = train_test_split(frame, test_size=0.1, random_state=42)

    def _make_loader(split_df, shuffle):
        dataset = EmotionDataset(split_df, tokenizer, label_cols)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    train_loader = _make_loader(train_df, True)
    val_loader = _make_loader(val_df, False)

    return train_loader, val_loader, tokenizer, train_df, val_df


# English

## With BCE loss

In [None]:
# ========== Run English Model (BCE loss) ==========
# Emotion columns read from the English CSV.
english_labels = ['anger', 'fear', 'joy', 'sadness', 'surprise']
# get_dataloaders returns (train_loader, val_loader, tokenizer, train_df, val_df).
eng_train_loader,eng_val_loader,eng_tokenizer,eng_train_df,eng_val_df = get_dataloaders("/kaggle/input/languages/eng.csv",english_labels)
eng_model = XLMREmotionClassifier(num_labels=len(english_labels))
# No loss argument is passed, so train_model uses its default loss.
train_model(eng_model, eng_train_loader, eng_val_loader, english_labels)

In [None]:

# LIME + attention on one sample
# Uses the first row of the English validation split.
eng_sample_text = eng_val_df.iloc[0]['text']
eng_sample_labels = eng_val_df.iloc[0][english_labels].to_dict()

print("\nSample Text:", eng_sample_text)
print("Original Labels:", eng_sample_labels)
# Explain the model trained in the previous cell (eng_model).
explain_prediction(eng_model, eng_tokenizer, eng_sample_text, english_labels)
plot_attention(eng_model, eng_tokenizer, eng_sample_text)


## With focal loss

In [None]:
# ========== Run English Model (focal loss) ==========
# Rebuilds the loaders and a fresh model; eng_model and the other eng_* names
# from the BCE run are overwritten.
english_labels = ['anger', 'fear', 'joy', 'sadness', 'surprise']
eng_train_loader,eng_val_loader,eng_tokenizer,eng_train_df,eng_val_df = get_dataloaders("/kaggle/input/languages/eng.csv",english_labels)
eng_model = XLMREmotionClassifier(num_labels=len(english_labels))
# The trailing positional True is the fifth argument of train_model (the focal-loss switch).
train_model(eng_model, eng_train_loader, eng_val_loader, english_labels,True)

In [None]:

# LIME + attention on one sample
# Same sample selection as the BCE section (first validation row); eng_model is
# whichever model the most recently executed training cell produced.
eng_sample_text = eng_val_df.iloc[0]['text']
eng_sample_labels = eng_val_df.iloc[0][english_labels].to_dict()

print("\nSample Text:", eng_sample_text)
print("Original Labels:", eng_sample_labels)
explain_prediction(eng_model, eng_tokenizer, eng_sample_text, english_labels)
plot_attention(eng_model, eng_tokenizer, eng_sample_text)


# Spanish

## With BCE loss

In [None]:
# ========== Run Spanish Model (BCE loss) ==========
# The Spanish CSV is read with six label columns (the English list plus 'disgust').
non_english_labels = ['anger', 'fear', 'joy', 'sadness', 'surprise', 'disgust']
esp_train_loader,esp_val_loader,esp_tokenizer,esp_train_df,esp_val_df = get_dataloaders("/kaggle/input/languages/esp.csv",non_english_labels)
esp_model = XLMREmotionClassifier(num_labels=len(non_english_labels))
# No loss argument is passed, so train_model uses its default loss.
train_model(esp_model, esp_train_loader, esp_val_loader, non_english_labels)

In [None]:

# LIME + attention on one sample
# Uses the first row of the Spanish validation split.
esp_sample_text = esp_val_df.iloc[0]['text']
esp_sample_labels = esp_val_df.iloc[0][non_english_labels].to_dict()

print("\nSample Text:", esp_sample_text)
print("Original Labels:", esp_sample_labels)
# Explain the model trained in the previous cell (esp_model).
explain_prediction(esp_model, esp_tokenizer, esp_sample_text, non_english_labels)
plot_attention(esp_model, esp_tokenizer, esp_sample_text)


## With focal loss

In [None]:
# ========== Run Spanish Model (focal loss) ==========
# Rebuilds the loaders and a fresh model; esp_model and the other esp_* names
# from the BCE run are overwritten.
non_english_labels = ['anger', 'fear', 'joy', 'sadness', 'surprise', 'disgust']
esp_train_loader,esp_val_loader,esp_tokenizer,esp_train_df,esp_val_df = get_dataloaders("/kaggle/input/languages/esp.csv",non_english_labels)
esp_model = XLMREmotionClassifier(num_labels=len(non_english_labels))
# The trailing positional True is the fifth argument of train_model (the focal-loss switch).
train_model(esp_model, esp_train_loader, esp_val_loader, non_english_labels,True)

In [None]:

# LIME + attention on one sample
# Same sample selection as the BCE section (first validation row); esp_model is
# whichever model the most recently executed training cell produced.
esp_sample_text = esp_val_df.iloc[0]['text']
esp_sample_labels = esp_val_df.iloc[0][non_english_labels].to_dict()

print("\nSample Text:", esp_sample_text)
print("Original Labels:", esp_sample_labels)
explain_prediction(esp_model, esp_tokenizer, esp_sample_text, non_english_labels)
plot_attention(esp_model, esp_tokenizer, esp_sample_text)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import (
    XLMRobertaTokenizer,
    XLMRobertaModel,
    AutoTokenizer,
    AutoModelForSeq2SeqLM
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# ========== Config ==========
# Re-declares the tokenizer/training defaults for the cells that follow.
MAX_LENGTH  = 128   # default max_length of the dataset classes below
BATCH_SIZE  = 16    # default batch size for get_dataloaders / DataLoader
EPOCHS      = 20    # number of training epochs
LR          = 2e-5  # optimizer learning rate
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ========== COMET-ATOMIC Setup (public model) ==========
# Loads the tokenizer and seq2seq weights for COMET_REPO when this cell runs
# and moves the model to DEVICE; generate_comet_context reads both globals.
COMET_REPO      = "mismayil/comet-bart-ai2"
comet_tokenizer = AutoTokenizer.from_pretrained(COMET_REPO)
comet_model     = AutoModelForSeq2SeqLM.from_pretrained(COMET_REPO).to(DEVICE)

def generate_comet_context(text: str, relations=["xIntent","xNeed"], max_new_tokens=20) -> str:
    """Generate one COMET continuation per relation and join them into one string.

    For every relation the prompt "<text> <<relation>>" is fed to comet_model;
    the decoded output is rendered as "[relation]: output" and the pieces are
    joined with single spaces, in the order of `relations`.

    NOTE(review): COMET-ATOMIC BART checkpoints are commonly prompted as
    "<head> <relation> [GEN]" — confirm this prompt format matches the
    checkpoint in use.
    """
    def _expand(rel):
        prompt = f"{text} <{rel}>"
        model_inputs = comet_tokenizer(prompt, return_tensors="pt").to(DEVICE)
        generated = comet_model.generate(**model_inputs, max_new_tokens=max_new_tokens)
        decoded = comet_tokenizer.decode(generated[0], skip_special_tokens=True)
        return f"[{rel}]: {decoded}"

    return " ".join(_expand(rel) for rel in relations)
# ========== Dataset Classes ==========
class EmotionDataset(Dataset):
    """Multi-label text dataset backed by a DataFrame with a "text" column.

    Label columns are coerced to numeric 0/1-style floats at construction:
    non-numeric values become 0 and numeric values are truncated to integers.

    Args:
        df: source frame; it is copied and re-indexed, never modified.
        tokenizer: callable tokenizer returning "input_ids" and "attention_mask".
        label_cols: list of label column names.
        max_length: padding/truncation length passed to the tokenizer.
    """

    def __init__(self, df, tokenizer, label_cols, max_length=MAX_LENGTH):
        # copy to avoid modifying original
        self.df = df.reset_index(drop=True).copy()
        # robustly convert label columns to numeric 0/1
        self.df[label_cols] = (
            self.df[label_cols]
            .apply(pd.to_numeric, errors="coerce")  # non-numeric -> NaN
            .fillna(0)                               # NaN -> 0
            .astype(int)                             # to int
            .astype(float)
        )
        self.tokenizer  = tokenizer
        self.label_cols = label_cols
        self.max_length = max_length
        # Cache the targets as one float32 matrix. Reading a single row with
        # df.loc[idx, label_cols] on a frame that also holds the text column
        # yields an object-dtype array, which torch.tensor cannot convert.
        self.labels = self.df[label_cols].to_numpy(dtype=np.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return input_ids, attention_mask and float labels for row idx."""
        text = self.df.loc[idx, "text"]
        enc  = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length
        )
        return {
            # The tokenizer returns a leading batch axis of size 1; drop it.
            "input_ids":       enc["input_ids"].squeeze(0),
            "attention_mask":  enc["attention_mask"].squeeze(0),
            "labels":          torch.tensor(self.labels[idx], dtype=torch.float)
        }
class EmotionDatasetWithComet(EmotionDataset):
    """EmotionDataset that can append COMET commonsense context to each text.

    When `use_comet` is true, the text is tokenized together with the string
    returned by generate_comet_context(text, comet_relations) as the second
    segment of a sequence pair. Generated contexts are cached per row index,
    so the COMET model runs at most once per sample. When `use_comet` is
    false, only the text is tokenized.

    Args:
        df, tokenizer, label_cols, max_length: as for EmotionDataset.
        use_comet: enable the COMET augmentation described above.
        comet_relations: relation names handed to generate_comet_context.
    """

    def __init__(self, df, tokenizer, label_cols,
                 max_length=MAX_LENGTH, use_comet=False, comet_relations=["xIntent","xNeed"]):
        super().__init__(df, tokenizer, label_cols, max_length)
        self.use_comet       = use_comet
        # Copy so instances never share the default list object.
        self.comet_relations = list(comet_relations)
        self._comet_cache    = {}  # row index -> generated context string

    def _comet_context(self, idx, text):
        """Return the cached COMET context for row idx, generating it on first use."""
        if idx not in self._comet_cache:
            self._comet_cache[idx] = generate_comet_context(text, self.comet_relations)
        return self._comet_cache[idx]

    def __getitem__(self, idx):
        """Return input_ids, attention_mask and float32 labels for row idx."""
        text = self.df.loc[idx, "text"]
        # Second segment of the pair; None means single-sequence encoding.
        context = self._comet_context(idx, text) if self.use_comet else None
        enc  = self.tokenizer(
            text,
            context,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length
        )
        # force labels to be a float32 numpy array (the row slice is object-dtype)
        label_array = self.df.loc[idx, self.label_cols].to_numpy(dtype=np.float32)
        labels      = torch.from_numpy(label_array)

        return {
            "input_ids":      enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels":         labels
        }

In [None]:
# Build the English train/validation loaders (five emotion labels).
# NOTE(review): the path here is the relative "eng.csv", while other cells read
# "/kaggle/input/languages/eng.csv" — confirm which location is intended.
english_labels = ["anger","fear","joy","sadness","surprise"]
train_loader_eng, val_loader_eng, tokenizer, train_df_eng, val_df_eng = \
    get_dataloaders("eng.csv", english_labels)

In [None]:
# ========== DataLoader Helper ==========
def get_dataloaders(csv_path, label_cols, batch_size=BATCH_SIZE):
    """Read a CSV, split it 90/10 and wrap both parts in DataLoaders.

    Returns:
        (train_loader, val_loader, tokenizer, train_df, val_df). Only the
        training loader shuffles; the split uses random_state=42.
    """
    frame     = pd.read_csv(csv_path)
    tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
    train_part, val_part = train_test_split(frame, test_size=0.1, random_state=42)

    train_set = EmotionDataset(train_part, tokenizer, label_cols)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

    val_set = EmotionDataset(val_part, tokenizer, label_cols)
    val_loader = DataLoader(val_set, batch_size=batch_size)

    return train_loader, val_loader, tokenizer, train_part, val_part

In [None]:
# English split: the *_eng variables and the five English labels must be built
# from the English CSV (this cell previously read esp.csv by mistake).
english_labels = ["anger","fear","joy","sadness","surprise"]
train_loader_eng, val_loader_eng, tokenizer, train_df_eng, val_df_eng = \
    get_dataloaders("/kaggle/input/languages/eng.csv", english_labels)

# Replace the training loader with one built on EmotionDatasetWithComet.
train_loader_eng = DataLoader(
    EmotionDatasetWithComet(train_df_eng, tokenizer, english_labels, use_comet=True),
    batch_size=BATCH_SIZE, shuffle=True
)

In [None]:
# Bare expression: the notebook displays the DataLoader's repr.
# NOTE(review): looks like a leftover sanity check — presumably safe to delete.
train_loader_eng

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import (
    XLMRobertaTokenizer,
    XLMRobertaModel,
    AutoTokenizer,
    AutoModelForSeq2SeqLM
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# ========== Config ==========
# Tokenizer/training defaults used by the definitions in this cell.
MAX_LENGTH  = 128   # default max_length of the dataset classes; also used by the interpretability helpers
BATCH_SIZE  = 16    # default batch size for get_dataloaders / DataLoader
EPOCHS      = 20    # number of training epochs run by train_model
LR          = 2e-5  # default learning rate of train_model
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ========== COMET-ATOMIC Setup (public model) ==========
# Loads the tokenizer and seq2seq weights for COMET_REPO when this cell runs
# and moves the model to DEVICE; generate_comet_context reads both globals.
COMET_REPO      = "mismayil/comet-bart-ai2"
comet_tokenizer = AutoTokenizer.from_pretrained(COMET_REPO)
comet_model     = AutoModelForSeq2SeqLM.from_pretrained(COMET_REPO).to(DEVICE)

def generate_comet_context(text: str, relations=["xIntent","xNeed"], max_new_tokens=20) -> str:
    """Generate one COMET continuation per relation and join them into one string.

    For every relation the prompt "<text> <<relation>>" is fed to comet_model;
    the decoded output is rendered as "[relation]: output" and the pieces are
    joined with single spaces, in the order of `relations`.

    NOTE(review): COMET-ATOMIC BART checkpoints are commonly prompted as
    "<head> <relation> [GEN]" — confirm this prompt format matches the
    checkpoint in use.
    """
    pieces = []
    for relation in relations:
        encoded_prompt = comet_tokenizer(f"{text} <{relation}>", return_tensors="pt").to(DEVICE)
        generated_ids = comet_model.generate(**encoded_prompt, max_new_tokens=max_new_tokens)
        continuation = comet_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        pieces.append(f"[{relation}]: {continuation}")
    return " ".join(pieces)

# ========== Dataset Classes ==========
class EmotionDataset(Dataset):
    """Multi-label text dataset backed by a DataFrame with a "text" column.

    Label columns are coerced to numeric 0/1-style floats at construction:
    non-numeric values become 0 and numeric values are truncated to integers.

    Args:
        df: source frame; it is copied and re-indexed, never modified.
        tokenizer: callable tokenizer returning "input_ids" and "attention_mask".
        label_cols: list of label column names.
        max_length: padding/truncation length passed to the tokenizer.
    """

    def __init__(self, df, tokenizer, label_cols, max_length=MAX_LENGTH):
        # copy to avoid modifying original
        self.df = df.reset_index(drop=True).copy()
        # robustly convert label columns to numeric 0/1
        self.df[label_cols] = (
            self.df[label_cols]
            .apply(pd.to_numeric, errors="coerce")  # non-numeric -> NaN
            .fillna(0)                               # NaN -> 0
            .astype(int)                             # to int
            .astype(float)
        )
        self.tokenizer  = tokenizer
        self.label_cols = label_cols
        self.max_length = max_length
        # Cache the targets as one float32 matrix. Reading a single row with
        # df.loc[idx, label_cols] on a frame that also holds the text column
        # yields an object-dtype array, which torch.tensor cannot convert.
        self.labels = self.df[label_cols].to_numpy(dtype=np.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return input_ids, attention_mask and float labels for row idx."""
        text = self.df.loc[idx, "text"]
        enc  = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length
        )
        return {
            # The tokenizer returns a leading batch axis of size 1; drop it.
            "input_ids":       enc["input_ids"].squeeze(0),
            "attention_mask":  enc["attention_mask"].squeeze(0),
            "labels":          torch.tensor(self.labels[idx], dtype=torch.float)
        }

class EmotionDatasetWithComet(EmotionDataset):
    """EmotionDataset that can append COMET commonsense context to each text.

    When `use_comet` is true, the text is tokenized together with the string
    returned by generate_comet_context(text, comet_relations) as the second
    segment of a sequence pair. Generated contexts are cached per row index,
    so the COMET model runs at most once per sample. When `use_comet` is
    false, only the text is tokenized.

    Args:
        df, tokenizer, label_cols, max_length: as for EmotionDataset.
        use_comet: enable the COMET augmentation described above.
        comet_relations: relation names handed to generate_comet_context.
    """

    def __init__(self, df, tokenizer, label_cols,
                 max_length=MAX_LENGTH, use_comet=False, comet_relations=["xIntent","xNeed"]):
        super().__init__(df, tokenizer, label_cols, max_length)
        self.use_comet       = use_comet
        # Copy so instances never share the default list object.
        self.comet_relations = list(comet_relations)
        self._comet_cache    = {}  # row index -> generated context string

    def _comet_context(self, idx, text):
        """Return the cached COMET context for row idx, generating it on first use."""
        if idx not in self._comet_cache:
            self._comet_cache[idx] = generate_comet_context(text, self.comet_relations)
        return self._comet_cache[idx]

    def __getitem__(self, idx):
        """Return input_ids, attention_mask and float32 labels for row idx."""
        text = self.df.loc[idx, "text"]
        # Second segment of the pair; None means single-sequence encoding.
        context = self._comet_context(idx, text) if self.use_comet else None
        enc  = self.tokenizer(
            text,
            context,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length
        )
        # force labels to be a float32 numpy array (the row slice is object-dtype)
        label_array = self.df.loc[idx, self.label_cols].to_numpy(dtype=np.float32)
        labels      = torch.from_numpy(label_array)

        return {
            "input_ids":      enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels":         labels
        }

# ========== Loss Functions ==========
def focal_loss(inputs, targets, alpha=0.25, gamma=2.0):
    """Mean focal-style loss on logits.

    Each element's BCE-with-logits term is scaled by alpha * (1 - pt) ** gamma,
    where pt = exp(-BCE); the scaled terms are averaged over all elements.
    """
    elementwise = nn.functional.binary_cross_entropy_with_logits(
        inputs, targets, reduction="none"
    )
    confidence = torch.exp(-elementwise)
    focal_weight = alpha * (1 - confidence) ** gamma
    return (focal_weight * elementwise).mean()

class LabelSmoothingBCEWithLogitsLoss(nn.Module):
    """BCE-with-logits loss whose targets are pulled toward 0.5.

    A target t becomes t * (1 - smoothing) + 0.5 * smoothing before the
    standard mean-reduced BCE-with-logits is applied.
    """

    def __init__(self, smoothing=0.1):
        super().__init__()
        assert 0.0 <= smoothing < 1.0
        self.smoothing = smoothing

    def forward(self, logits, targets):
        eps = self.smoothing
        softened_targets = targets * (1 - eps) + 0.5 * eps
        return nn.functional.binary_cross_entropy_with_logits(logits, softened_targets)

def combined_bce_focal_loss(logits, targets, alpha=0.5, focal_alpha=0.25, focal_gamma=2.0):
    """Blend mean BCE-with-logits and a focal-style loss.

    Returns alpha * BCE + (1 - alpha) * focal, where the focal term scales
    each element's BCE by focal_alpha * (1 - exp(-BCE)) ** focal_gamma and
    then averages.
    """
    bce_with_logits = nn.functional.binary_cross_entropy_with_logits
    mean_bce = bce_with_logits(logits, targets)
    elementwise = bce_with_logits(logits, targets, reduction="none")
    focal_weight = focal_alpha * (1 - torch.exp(-elementwise)) ** focal_gamma
    focal_term = (focal_weight * elementwise).mean()
    return alpha * mean_bce + (1 - alpha) * focal_term

# ========== Metrics ==========
def compute_metrics(preds, labels):
    """Macro F1 / precision / recall for multi-label logits.

    `preds` are raw logits: they go through a sigmoid and are thresholded at
    0.5 before being compared with `labels`.
    """
    y_true = labels.cpu().numpy()
    y_pred = (torch.sigmoid(preds) > 0.5).cpu().numpy()
    shared = dict(average="macro", zero_division=0)
    metrics = {}
    metrics["f1_macro"] = f1_score(y_true, y_pred, **shared)
    metrics["precision_macro"] = precision_score(y_true, y_pred, **shared)
    metrics["recall_macro"] = recall_score(y_true, y_pred, **shared)
    return metrics

# ========== Model ==========
class XLMREmotionClassifier(nn.Module):
    """XLM-RoBERTa encoder followed by a single linear multi-label head."""

    def __init__(self, num_labels):
        super().__init__()
        backbone = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        self.encoder    = backbone
        self.classifier = nn.Linear(backbone.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw logits, one per label, computed from the first token's state."""
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary.
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.classifier(first_token)

# ========== Training Loop ==========
def train_model(model, train_loader, val_loader, label_cols,
                loss_type="bce", loss_kwargs=None, lr=LR, device=DEVICE, epochs=EPOCHS):
    """Fine-tune `model` with AdamW, then evaluate once on `val_loader`.

    Args:
        model: module called as model(input_ids, attention_mask) -> logits.
        train_loader / val_loader: loaders yielding dicts with "input_ids",
            "attention_mask" and "labels".
        label_cols: accepted for interface compatibility; not read here.
        loss_type: "bce", "focal", "lsbce" or "combined".
        loss_kwargs: keyword arguments for the chosen loss. They are forwarded
            for "focal", "lsbce" and "combined" ("focal" used to ignore them
            silently); "bce" takes none.
        lr: AdamW learning rate.
        device: device the model and batches are moved to.
        epochs: number of passes over train_loader (defaults to EPOCHS).

    Returns:
        The dict produced by compute_metrics on the validation set, or an
        empty dict when val_loader yields no batches.

    Raises:
        ValueError: if loss_type is not one of the supported names.
    """
    loss_kwargs = loss_kwargs or {}
    if loss_type == "bce":
        loss_fn = nn.BCEWithLogitsLoss()
    elif loss_type == "focal":
        loss_fn = lambda lg, y: focal_loss(lg, y, **loss_kwargs)
    elif loss_type == "lsbce":
        loss_fn = LabelSmoothingBCEWithLogitsLoss(**loss_kwargs)
    elif loss_type == "combined":
        loss_fn = lambda lg, y: combined_bce_focal_loss(lg, y, **loss_kwargs)
    else:
        raise ValueError(f"Unknown loss_type {loss_type}")

    optimizer = optim.AdamW(model.parameters(), lr=lr)
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            ids   = batch["input_ids"].to(device)
            mask  = batch["attention_mask"].to(device)
            labs  = batch["labels"].to(device)
            optimizer.zero_grad()
            logits = model(ids, mask)
            loss   = loss_fn(logits, labs)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} — Loss: {total_loss/len(train_loader):.4f}")

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            ids     = batch["input_ids"].to(device)
            mask    = batch["attention_mask"].to(device)
            logits  = model(ids, mask)
            # Accumulate on the CPU so the whole validation set never has to
            # sit in device memory; labels are only needed for the metrics.
            all_preds.append(logits.cpu())
            all_labels.append(batch["labels"].cpu())

    if not all_preds:
        # torch.cat raises on an empty list; do not lose a finished training
        # run because the validation loader was empty.
        print("Validation Metrics: {} (validation loader is empty)")
        return {}

    preds   = torch.cat(all_preds)
    labels  = torch.cat(all_labels)
    metrics = compute_metrics(preds, labels)
    print("Validation Metrics:", metrics)
    return metrics

# ========== Interpretability ==========
def explain_prediction(model, tokenizer, text, label_names, batch_size=32):
    """Show a LIME explanation of the model's per-label probabilities for `text`.

    Args:
        model: module called as model(input_ids=..., attention_mask=...) -> logits.
        tokenizer: tokenizer used to encode the perturbed texts.
        text: the string to explain.
        label_names: class names given to LIME; every label index is explained.
        batch_size: number of perturbed texts encoded and scored per forward
            pass. LIME scores many perturbations of `text`, so batching avoids
            one forward pass per perturbation.
    """
    model.eval()

    def predict_proba(texts):
        # LIME passes a list of strings and expects an (n_texts, n_labels) array.
        texts = list(texts)
        outs = []
        for start in range(0, len(texts), batch_size):
            enc = tokenizer(texts[start:start + batch_size], return_tensors="pt",
                            padding="max_length", truncation=True,
                            max_length=MAX_LENGTH).to(DEVICE)
            with torch.no_grad():
                lg = model(**enc)
            outs.append(torch.sigmoid(lg).cpu().numpy())
        if not outs:
            return np.empty((0, len(label_names)), dtype=np.float32)
        return np.concatenate(outs, axis=0)

    explainer = LimeTextExplainer(class_names=label_names)
    exp       = explainer.explain_instance(text, predict_proba,
                                           num_features=10,
                                           labels=list(range(len(label_names))))
    exp.show_in_notebook()

def plot_attention(model, tokenizer, text):
    """Bar-plot the attention paid by the first token in layer 0, head 0.

    Scores are row 0 of the layer-0/head-0 attention matrix, restricted to
    non-padding, non-special tokens.

    Args:
        model: classifier exposing the transformer as `model.encoder`.
        tokenizer: tokenizer matching the encoder.
        text: the string to visualise.
    """
    # Put the model in evaluation mode so the plot does not depend on the
    # train/eval state the caller left it in.
    model.eval()

    enc   = tokenizer(text, return_tensors="pt",
                      padding="max_length", truncation=True,
                      max_length=MAX_LENGTH)
    input_ids      = enc["input_ids"][0]
    attention_mask = enc["attention_mask"][0]
    enc = {k: v.to(DEVICE) for k, v in enc.items()}
    with torch.no_grad():
        out = model.encoder(**enc, output_attentions=True)
    attn    = out.attentions[0][0, 0]
    tokens  = tokenizer.convert_ids_to_tokens(input_ids)
    # Plain floats/ints instead of 0-d tensors for filtering and plotting.
    scores  = attn[0].cpu().tolist()
    special = set(tokenizer.all_special_tokens)
    filtered = [(s, t) for s, t, m in zip(scores, tokens, attention_mask.tolist())
                if m == 1 and t not in special]
    if not filtered:
        # zip(*[]) cannot be unpacked into two names; nothing to draw anyway.
        print("No non-special tokens to plot.")
        return
    scores, tokens = zip(*filtered)
    plt.figure(figsize=(12,4))
    plt.bar(range(len(tokens)), scores)
    plt.xticks(range(len(tokens)), tokens, rotation="vertical")
    plt.title("Attention Scores (Layer 0, Head 0)")
    plt.tight_layout()
    plt.show()

# ========== DataLoader Helper ==========
def get_dataloaders(csv_path, label_cols, batch_size=BATCH_SIZE):
    """Read a CSV, split it 90/10 and wrap both parts in DataLoaders.

    Returns:
        (train_loader, val_loader, tokenizer, train_df, val_df). Only the
        training loader shuffles; the split uses random_state=42.
    """
    frame     = pd.read_csv(csv_path)
    tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
    train_part, val_part = train_test_split(frame, test_size=0.1, random_state=42)

    train_set = EmotionDataset(train_part, tokenizer, label_cols)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

    val_set = EmotionDataset(val_part, tokenizer, label_cols)
    val_loader = DataLoader(val_set, batch_size=batch_size)

    return train_loader, val_loader, tokenizer, train_part, val_part

# === Part 1: English (ENG) ===
# Five emotion labels are read from the English CSV.
english_labels = ["anger","fear","joy","sadness","surprise"]
# get_dataloaders returns (train_loader, val_loader, tokenizer, train_df, val_df);
# only the tokenizer and the two frames are kept, the loaders are rebuilt below.
train_loader_eng, val_loader_eng, tokenizer, train_df_eng, val_df_eng = \
    get_dataloaders("/kaggle/input/languages/eng.csv", english_labels)

# Rebuild both loaders on EmotionDatasetWithComet, forwarding use_comet=True.
train_loader_eng = DataLoader(
    EmotionDatasetWithComet(train_df_eng, tokenizer, english_labels, use_comet=True),
    batch_size=BATCH_SIZE, shuffle=True
)
val_loader_eng = DataLoader(
    EmotionDatasetWithComet(val_df_eng, tokenizer, english_labels, use_comet=True),
    batch_size=BATCH_SIZE
)

model_eng = XLMREmotionClassifier(num_labels=len(english_labels))
# "combined" loss: alpha is the weight of the BCE term, (1 - alpha) of the focal term.
metrics_eng = train_model(
    model_eng, train_loader_eng, val_loader_eng, english_labels,
    loss_type="combined", loss_kwargs={"alpha":0.6,"focal_alpha":0.25,"focal_gamma":2.0},
    lr=LR, device=DEVICE
)
print("English validation metrics:", metrics_eng)

# Interpretability on the first validation row.
sample_eng = val_df_eng.iloc[0]["text"]
print("\nLIME (ENG):");    explain_prediction(model_eng, tokenizer, sample_eng, english_labels)
print("\nAttention (ENG):"); plot_attention(model_eng, tokenizer, sample_eng)

# === Part 2: Spanish (ESP) ===
# Six labels: the English list plus "disgust".
spanish_labels = ["anger","fear","joy","sadness","surprise","disgust"]
# Note: `tokenizer` is reassigned here, replacing the one from Part 1.
train_loader_esp, val_loader_esp, tokenizer, train_df_esp, val_df_esp = \
    get_dataloaders("/kaggle/input/languages/esp.csv", spanish_labels)

# Rebuild both loaders on EmotionDatasetWithComet, forwarding use_comet=True.
train_loader_esp = DataLoader(
    EmotionDatasetWithComet(train_df_esp, tokenizer, spanish_labels, use_comet=True),
    batch_size=BATCH_SIZE, shuffle=True
)
val_loader_esp = DataLoader(
    EmotionDatasetWithComet(val_df_esp, tokenizer, spanish_labels, use_comet=True),
    batch_size=BATCH_SIZE
)

model_esp = XLMREmotionClassifier(num_labels=len(spanish_labels))
# "lsbce" selects LabelSmoothingBCEWithLogitsLoss; smoothing=0.1 is passed to its constructor.
metrics_esp = train_model(
    model_esp, train_loader_esp, val_loader_esp, spanish_labels,
    loss_type="lsbce", loss_kwargs={"smoothing":0.1},
    lr=LR, device=DEVICE
)
print("Spanish validation metrics:", metrics_esp)

# Interpretability on the first validation row.
sample_esp = val_df_esp.iloc[0]["text"]
print("\nLIME (ESP):");    explain_prediction(model_esp, tokenizer, sample_esp, spanish_labels)
print("\nAttention (ESP):"); plot_attention(model_esp, tokenizer, sample_esp)