In [None]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from collections import defaultdict



In [None]:
# Pick the compute device once for the whole notebook: use the GPU when
# CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Mount the user's Google Drive into the runtime's filesystem at
# /content/drive so files stored there can be read like local files.
# NOTE(review): `google.colab` is only importable inside Google Colab, so
# this cell will fail in any other Jupyter environment.
# NOTE(review): the CSVs loaded later in this notebook are referenced by bare
# filename, not by a /content/drive/... path -- confirm the working directory
# (or the paths) actually points at the mounted data.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Pretrained checkpoint used purely as a frozen feature extractor.
model_name = "Qwen/Qwen2-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the backbone, move it to the selected device and switch it to
# inference mode. `.eval()` returns the module itself, so the calls chain.
embedding_model = AutoModel.from_pretrained(model_name).to(device).eval()

# Freeze every weight: the backbone is never trained in this notebook, only
# the small classifier that is fitted on top of its embeddings.
for p in embedding_model.parameters():
    p.requires_grad_(False)

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked tokens.

    Args:
        model_output: Object exposing a ``last_hidden_state`` tensor
            (assumed to be laid out as (batch, seq_len, hidden) -- the
            reduction runs over dim 1).
        attention_mask: Tensor with one fewer trailing dimension than
            ``last_hidden_state``; non-zero entries mark tokens that
            contribute to the average.

    Returns:
        Tensor with the sequence axis reduced away: the mask-weighted mean of
        the token embeddings for each item in the batch.
    """
    hidden_states = model_output.last_hidden_state

    # Broadcast the mask across the hidden dimension so it can weight
    # each embedding component.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    weighted_sum = (hidden_states * weights).sum(dim=1)

    # Clamp the denominator so a fully masked row yields zeros instead of
    # a division by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)

    return weighted_sum / token_counts

In [None]:
class Classifier(nn.Module):
    """Single linear layer mapping an embedding vector to class logits.

    Args:
        dim: Size of the input feature vector.
        num_classes: Number of output logits. Defaults to 2, which keeps
            the original binary-classification behaviour for existing
            callers that pass only ``dim``.
    """

    def __init__(self, dim, num_classes=2):
        super().__init__()
        # Kept under the attribute name `linear` so state-dict keys are
        # unchanged.
        self.linear = nn.Linear(dim, num_classes)

    def forward(self, x):
        """Return raw (unnormalised) logits for ``x``.

        Args:
            x: Float tensor whose last dimension equals ``dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``num_classes``.
        """
        return self.linear(x)

In [None]:
def encode_texts(text_list, batch_size=32):
    """Embed texts with the frozen backbone using masked mean pooling.

    The texts are tokenised and encoded in chunks of ``batch_size`` so that
    memory use is bounded by the chunk size rather than by the size of the
    whole dataset (the previous implementation pushed every text through
    the model in a single forward pass).

    Args:
        text_list: A list (or other iterable) of strings. A single string
            is treated as a one-element list.
        batch_size: Number of texts per forward pass. Must be positive.

    Returns:
        A CPU tensor with one row per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # The tokenizer accepts a bare string; keep that working instead of
    # letting list() split it into characters.
    if isinstance(text_list, str):
        text_list = [text_list]
    else:
        text_list = list(text_list)

    if not text_list:
        # Nothing to encode. Return an empty (0, hidden) tensor so callers
        # still get a 2-D result.
        # NOTE(review): assumes the model config exposes `hidden_size`.
        return torch.empty((0, embedding_model.config.hidden_size))

    pooled_chunks = []
    for start in range(0, len(text_list), batch_size):
        chunk = text_list[start:start + batch_size]
        # Pad to the longest text in this chunk and cut anything beyond
        # 128 tokens.
        batch = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            output = embedding_model(**batch)
            emb = mean_pooling(output, batch["attention_mask"])
        # Move each chunk off the accelerator right away so only one
        # chunk's activations live on the device at a time.
        pooled_chunks.append(emb.cpu())

    return torch.cat(pooled_chunks, dim=0)

In [None]:
def evaluate_language(lang, df, n_splits=5, epochs=5, lr=1e-3, batch_size=32, seed=42):
    """Cross-validate a linear probe on frozen embeddings for one language.

    Embeddings are computed once for the whole frame, then a fresh
    ``Classifier`` is trained and scored on each stratified fold.

    Args:
        lang: Language code, used for the printed banner and echoed back
            in the result.
        df: DataFrame with a ``text`` column and an integer-convertible
            ``label`` column.
        n_splits: Number of stratified folds.
        epochs: Training epochs per fold.
        lr: Adam learning rate.
        batch_size: Mini-batch size used when training the classifier.
        seed: Base seed. It seeds the fold split, and ``seed + fold`` seeds
            classifier initialisation and batch shuffling, so repeated runs
            give the same numbers. Previously only the split was seeded.
            Note this calls ``torch.manual_seed``, which resets torch's
            global RNG state.

    Returns:
        Tuple ``(lang, mean_accuracy, std_accuracy, baseline)`` where
        ``baseline`` is the relative frequency of the most common label.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError(f"No rows to evaluate for language {lang!r}")

    print(f"\n============= {lang.upper()} =============")

    texts = df["text"].astype(str).tolist()
    y = df["label"].astype(int).values

    # The backbone is frozen, so embeddings are identical for every fold:
    # compute them once up front.
    X = encode_texts(texts).numpy()

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    acc_list = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"Fold {fold+1}")

        # Distinct but deterministic seed per fold, so weight initialisation
        # and shuffling are reproducible.
        fold_seed = seed + fold
        torch.manual_seed(fold_seed)
        shuffle_rng = torch.Generator().manual_seed(fold_seed)

        X_train_t = torch.tensor(X[train_idx], dtype=torch.float32).to(device)
        X_test_t = torch.tensor(X[test_idx], dtype=torch.float32).to(device)
        y_train_t = torch.tensor(y[train_idx], dtype=torch.long).to(device)
        y_test = y[test_idx]

        # A fresh classifier per fold so no weights leak between folds.
        clf = Classifier(X_train_t.shape[1]).to(device)
        opt = torch.optim.Adam(clf.parameters(), lr=lr)
        loss_fn = nn.CrossEntropyLoss()

        # Train
        clf.train()
        loader = DataLoader(
            TensorDataset(X_train_t, y_train_t),
            batch_size=batch_size,
            shuffle=True,
            generator=shuffle_rng,
        )
        for _ in range(epochs):
            for bx, by in loader:
                opt.zero_grad()
                loss = loss_fn(clf(bx), by)
                loss.backward()
                opt.step()

        # Evaluate on the held-out fold.
        clf.eval()
        with torch.no_grad():
            pred = torch.argmax(clf(X_test_t), dim=1).cpu().numpy()

        acc = np.mean(pred == y_test)
        acc_list.append(acc)
        print(f"  Fold Accuracy: {acc:.4f}")

    # Aggregate across folds. The baseline is the accuracy of always
    # predicting the majority label.
    mean_acc = np.mean(acc_list)
    std_acc = np.std(acc_list)
    baseline = df["label"].value_counts().max() / len(df)

    return lang, mean_acc, std_acc, baseline


In [None]:
# Three-letter language codes to evaluate (these look like ISO 639-3 codes).
# Each one maps to a CSV named "<code>.csv", resolved relative to the current
# working directory.
language_codes = (
    "amh", "arb", "deu", "eng", "fas", "hau", "hin",
    "ita", "nep", "spa", "tur", "urd", "zho",
)

language_files = {code: f"{code}.csv" for code in language_codes}

# Evaluate every language in turn, collecting one
# (language, mean, std, baseline) tuple per dataset.
results = []

for lang, filename in language_files.items():
    df = pd.read_csv(filename)
    results.append(evaluate_language(lang, df))

# Final summary table: one row per language.
summary_columns = ["Language", "Mean", "Std", "Baseline"]
summary = pd.DataFrame(results, columns=summary_columns)

print("\n============== FINAL SUMMARY ==============\n")
print(summary)