## training

In [1]:
!pip install transformers==4.57.1

Collecting transformers==4.57.1
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m133.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.57.3
    Uninstalling transformers-4.57.3:
      Successfully uninstalled transformers-4.57.3
Successfully installed transformers-4.57.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# If needed (uncomment):
# %pip install -q torch torchvision torchaudio scikit-learn pandas numpy matplotlib transformers accelerate

from pathlib import Path
import json, random, re, math, os, time
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

# ===== CONFIG =====
BASE = Path(r"/content/drive/MyDrive/name2nat/cleaned")
TRAIN_PATH = BASE / "train_combined.jsonl"
VAL_PATH   = BASE / "val_original.jsonl"
TEST_PATH  = BASE / "test_original.jsonl"

OUT_DIR = BASE / "mbert_runs_combined"
OUT_DIR.mkdir(parents=True, exist_ok=True)

MIN_PER_SPLIT   = 0       # same as your LSTM notebook
MAX_LEN         = 40      # max WordPiece tokens (can bump to 64/80 later if desired)
BATCH_SIZE      = 64      # BERT is heavier than LSTM; start smaller
EPOCHS          = 10      # you can increase later
PATIENCE        = 5       # early stopping
LEARNING_RATE   = 2e-5    # standard for BERT fine-tuning
WARMUP_RATIO    = 0.1

MODEL_NAME      = "bert-base-multilingual-cased"

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE


device(type='cuda')

In [4]:
def load_jsonl(path: Path):
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            obj = json.loads(line)
            name = (obj.get("name") or "").strip()
            country = (obj.get("country") or "").strip().lower()
            if name and country:
                rows.append({"name": name, "country": country})
    return pd.DataFrame(rows)

df_train = load_jsonl(TRAIN_PATH)
df_val   = load_jsonl(VAL_PATH)
df_test  = load_jsonl(TEST_PATH)

print(f"Loaded → train: {len(df_train):,}, val: {len(df_val):,}, test: {len(df_test):,}")
print("Unique countries per split:",
      len(df_train['country'].unique()), len(df_val['country'].unique()), len(df_test['country'].unique()))

train_counts = df_train['country'].value_counts()
val_counts   = df_val['country'].value_counts()
test_counts  = df_test['country'].value_counts()

eligible = set(train_counts[train_counts >= MIN_PER_SPLIT].index) \
         & set(val_counts[val_counts >= MIN_PER_SPLIT].index) \
         & set(test_counts[test_counts >= MIN_PER_SPLIT].index)

print(f"Eligible countries (>= {MIN_PER_SPLIT} each split): {len(eligible)}")
if len(eligible) == 0:
    raise SystemExit("No countries meet the threshold. Relax MIN_PER_SPLIT or inspect data.")

df_train_f = df_train[df_train['country'].isin(eligible)].reset_index(drop=True)
df_val_f   = df_val[df_val['country'].isin(eligible)].reset_index(drop=True)
df_test_f  = df_test[df_test['country'].isin(eligible)].reset_index(drop=True)

print(f"After filter → train: {len(df_train_f):,} | val: {len(df_val_f):,} | test: {len(df_test_f):,}")
print("Classes:", len(eligible))


Loaded → train: 1,857,507, val: 133,551, test: 134,300
Unique countries per split: 103 96 97
Eligible countries (>= 0 each split): 96
After filter → train: 1,851,327 | val: 133,551 | test: 134,299
Classes: 96


In [5]:
classes = sorted(df_train_f['country'].unique())
label2idx = {c: i for i, c in enumerate(classes)}
idx2label = {i: c for c, i in label2idx.items()}

with (OUT_DIR / "label_map.json").open("w", encoding="utf-8") as f:
    json.dump({"label2idx": label2idx, "idx2label": idx2label}, f, ensure_ascii=False, indent=2)

len(classes), classes[:10]


(96,
 ['argentina',
  'australia',
  'austria',
  'bahrain',
  'bangladesh',
  'belarus',
  'belgium',
  'bhutan',
  'bolivia',
  'brazil'])

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class NameDataset(Dataset):
    def __init__(self, df):
        self.names = df['name'].tolist()
        self.labels = [label2idx[c] for c in df['country'].tolist()]

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        return self.names[idx], self.labels[idx]

def collate_fn(batch):
    names, labels = zip(*batch)
    enc = tokenizer(
        list(names),
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    labels = torch.tensor(labels, dtype=torch.long)
    enc["labels"] = labels
    return enc

train_ds = NameDataset(df_train_f)
val_ds   = NameDataset(df_val_f)
test_ds  = NameDataset(df_test_f)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=0, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=collate_fn)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=collate_fn)

len(train_ds), len(val_ds), len(test_ds)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

(1851327, 133551, 134299)

In [7]:
num_classes = len(classes)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_classes
).to(DEVICE)

total_params = sum(p.numel() for p in model.parameters())
total_params, model.config


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(177927264,
 BertConfig {
   "architectures": [
     "BertForMaskedLM"
   ],
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
   "directionality": "bidi",
   "dtype": "float32",
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
   "id2label": {
     "0": "LABEL_0",
     "1": "LABEL_1",
     "2": "LABEL_2",
     "3": "LABEL_3",
     "4": "LABEL_4",
     "5": "LABEL_5",
     "6": "LABEL_6",
     "7": "LABEL_7",
     "8": "LABEL_8",
     "9": "LABEL_9",
     "10": "LABEL_10",
     "11": "LABEL_11",
     "12": "LABEL_12",
     "13": "LABEL_13",
     "14": "LABEL_14",
     "15": "LABEL_15",
     "16": "LABEL_16",
     "17": "LABEL_17",
     "18": "LABEL_18",
     "19": "LABEL_19",
     "20": "LABEL_20",
     "21": "LABEL_21",
     "22": "LABEL_22",
     "23": "LABEL_23",
     "24": "LABEL_24",
     "25": "LABEL_25",
     "26": "LABEL_26",
     "27": "LABEL_27",
     "28": "LABEL_28",
     "29": "LABEL_29",
     "30": "LABEL_30",
     "31": 

In [None]:
def run_epoch(model, loader, optimizer=None, scheduler=None):
    is_train = optimizer is not None
    model.train(is_train)

    all_preds = []
    all_labels = []
    total_loss = 0.0
    total = 0

    for batch in loader:
        # Move to device
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        labels = batch["labels"]

        with torch.set_grad_enabled(is_train):
            outputs = model(**batch)
            loss = outputs.loss
            logits = outputs.logits

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()

        bs = labels.size(0)
        total_loss += loss.item() * bs
        total += bs

        preds = torch.argmax(logits, dim=1)
        all_preds.append(preds.detach().cpu().numpy())
        all_labels.append(labels.detach().cpu().numpy())

    all_preds = np.concatenate(all_preds) if all_preds else np.array([])
    all_labels = np.concatenate(all_labels) if all_labels else np.array([])

    if total > 0:
        acc   = accuracy_score(all_labels, all_preds)
        f1_mi = f1_score(all_labels, all_preds, average="micro")
        f1_ma = f1_score(all_labels, all_preds, average="macro")
        avg_loss = total_loss / total
    else:
        acc = f1_mi = f1_ma = 0.0
        avg_loss = 0.0

    return avg_loss, acc, f1_mi, f1_ma


In [None]:
# Total training steps for scheduler
num_training_steps = EPOCHS * len(train_loader)
num_warmup_steps   = int(WARMUP_RATIO * num_training_steps)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

best_val_f1 = -1.0
best_state = None
epochs_no_improve = 0
history = []

for epoch in range(1, EPOCHS + 1):
    t0 = time.time()

    train_loss, train_acc, train_f1mi, train_f1ma = run_epoch(model, train_loader, optimizer, scheduler)
    val_loss,   val_acc,   val_f1mi,   val_f1ma   = run_epoch(model, val_loader, optimizer=None)

    dt = time.time() - t0

    history.append({
        "epoch": epoch,
        "train_loss": train_loss, "train_acc": train_acc, "train_f1_micro": train_f1mi, "train_f1_macro": train_f1ma,
        "val_loss": val_loss,     "val_acc": val_acc,     "val_f1_micro": val_f1mi,     "val_f1_macro": val_f1ma,
        "sec": dt
    })

    print(f"[{epoch:02d}] "
          f"train_loss={train_loss:.4f} acc={train_acc:.4f} f1(mi)={train_f1mi:.4f} f1(ma)={train_f1ma:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f} f1(mi)={val_f1mi:.4f} f1(ma)={val_f1ma:.4f}  ({dt:.1f}s)")

    key = val_f1ma
    if key > best_val_f1:
        best_val_f1 = key
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            print(f"Early stopping at epoch {epoch}. Best val macro-F1 = {best_val_f1:.4f}")
            break

# Restore best weights
if best_state is not None:
    model.load_state_dict(best_state)

pd.DataFrame(history)


In [None]:
test_loss, test_acc, test_f1mi, test_f1ma = run_epoch(model, test_loader, optimizer=None)
print(f"TEST → loss={test_loss:.4f}  acc={test_acc:.4f}  f1(micro)={test_f1mi:.4f}  f1(macro)={test_f1ma:.4f}")


In [None]:
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        labels = batch["labels"]
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.append(preds)
        all_labels.append(labels.numpy())

all_preds  = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)

print("\nClassification report (per country):")
print(classification_report(
    all_labels,
    all_preds,
    target_names=[idx2label[i] for i in range(num_classes)],
    digits=4
))
cm = confusion_matrix(all_labels, all_preds, labels=list(range(num_classes)))

fig = plt.figure(figsize=(8, 6))
plt.imshow(cm, interpolation='nearest')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.colorbar()
plt.tight_layout()
plt.show()


In [None]:
report_dict = classification_report(
    all_labels,
    all_preds,
    target_names=[idx2label[i] for i in range(num_classes)],
    output_dict=True,
    digits=4
)

df_report = pd.DataFrame(report_dict).transpose().reset_index().rename(columns={'index': 'country_or_metric'})
df_report = df_report.round(4)

report_csv = OUT_DIR / "mbert_test_report.csv"
df_report.to_csv(report_csv, index=False, encoding="utf-8-sig")

print(f"Saved detailed test report → {report_csv}")
df_report.head(15)


Saved detailed test report → /content/drive/MyDrive/name2nat/mbert/mbert_runs/mbert_test_report.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,country_or_metric,precision,recall,f1-score,support
0,argentina,0.337,0.405,0.3679,916.0
1,australia,0.2212,0.0147,0.0277,1695.0
2,austria,0.4005,0.1891,0.2569,915.0
3,bahrain,0.0,0.0,0.0,22.0
4,bangladesh,0.4793,0.6268,0.5432,351.0
5,belarus,1.0,0.0196,0.0385,51.0
6,belgium,0.5314,0.3231,0.4019,1204.0
7,bhutan,0.0,0.0,0.0,1.0
8,bolivia,0.0,0.0,0.0,8.0
9,brazil,0.7228,0.7424,0.7325,2978.0


In [None]:
MODEL_SAVE_DIR = OUT_DIR / "mbert_model"
MODEL_SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Ensure best weights are loaded before saving
if best_state is not None:
    model.load_state_dict(best_state)

model.save_pretrained(MODEL_SAVE_DIR)
tokenizer.save_pretrained(MODEL_SAVE_DIR)

meta = {
    "MODEL_NAME": MODEL_NAME,
    "MAX_LEN": MAX_LEN,
    "num_classes": num_classes,
    "label2idx": label2idx,
    "idx2label": idx2label
}
with (MODEL_SAVE_DIR / "meta.json").open("w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Saved model and tokenizer to:", MODEL_SAVE_DIR)


Saved model and tokenizer to: /content/drive/MyDrive/name2nat/mbert/mbert_runs/mbert_model


In [None]:
def predict_country(name: str, topk: int = 5):
    model.eval()
    enc = tokenizer(
        name,
        padding=False,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    enc = {k: v.to(DEVICE) for k, v in enc.items()}

    with torch.no_grad():
        outputs = model(**enc)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1).squeeze(0).cpu().numpy()

    topk_idx = probs.argsort()[::-1][:topk]
    return [(idx2label[i], float(probs[i])) for i in topk_idx]

# Quick sanity check
tests = ["Fuchen Dong", "Yayoi Arai", "Claire Chauvin", "O. P. Van Bijsterveld", "García Márquez"]
for t in tests:
    p = predict_country(t, topk=3)
    print(t)
    print(p)
    print()


Fuchen Dong
[('china', 0.933861494064331), ('usa', 0.015237406827509403), ('singapore', 0.014926593750715256)]

Yayoi Arai
[('japan', 0.9891668558120728), ('usa', 0.004925518296658993), ('united kingdom', 0.0007774350815452635)]

Claire Chauvin
[('france', 0.9732143878936768), ('united kingdom', 0.0064899856224656105), ('switzerland', 0.004177503753453493)]

O. P. Van Bijsterveld
[('netherlands', 0.9524874091148376), ('south africa', 0.007707451935857534), ('united kingdom', 0.007694448344409466)]

García Márquez
[('spain', 0.426643043756485), ('mexico', 0.15931253135204315), ('usa', 0.07594753056764603)]



## evaluation

In [None]:
df_new_test = load_jsonl(Path("/content/drive/MyDrive/name2nat/mbert/all_test.jsonl"))
df_new_test

In [None]:
df_new_test = df_new_test[df_new_test["country"].isin(classes)]
new_test_ds = NameDataset(df_new_test)

In [None]:
new_test_loader = DataLoader(
    new_test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
def evaluate(model, dataloader):
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            y = batch["labels"].to(DEVICE)

            out = model(input_ids, attention_mask=attention_mask)
            logits = out.logits
            preds.extend(logits.argmax(dim=-1).cpu().tolist())
            labels.extend(y.cpu().tolist())

    return preds, labels

In [None]:
pred_new, true_new = evaluate(model, new_test_loader)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score

print("Accuracy:", accuracy_score(true_new, pred_new))
print("Macro F1:", f1_score(true_new, pred_new, average="macro"))
print("Weighted F1:", f1_score(true_new, pred_new, average="weighted"))


unique_labels = sorted(set(true_new) | set(pred_new))

print(classification_report(
    true_new,
    pred_new,
    labels=unique_labels,
    target_names=[classes[i] for i in unique_labels]
))

### load and evaluate

In [8]:
from pathlib import Path
import json
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pandas as pd

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_DIR = Path("/content/drive/MyDrive/name2nat/mbert/mbert_runs/mbert_model")

# # 1. Load meta info: label2idx, idx2label, num_classes
with open(MODEL_DIR /"meta.json", "r") as f:
    meta = json.load(f)

label2idx = meta["label2idx"]          # country -> int
idx2label = {int(k): v for k, v in meta["idx2label"].items()}
num_classes = meta["num_classes"]

# A canonical "classes" list in the EXACT order the model uses
classes = [idx2label[i] for i in range(num_classes)]

# 2. Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.to(DEVICE)
model.eval()

# 3. Dataset & collate_fn (same logic as in your notebook)
class NameDataset(Dataset):
    def __init__(self, df):
        self.names = df["name"].tolist()
        self.labels = [label2idx[c] for c in df["country"].tolist()]

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        return self.names[idx], self.labels[idx]

def collate_fn_mbert(batch):
    names, labels = zip(*batch)
    enc = tokenizer(
        list(names),
        padding=True,
        truncation=True,
        max_length=meta["MAX_LEN"],
        return_tensors="pt"
    )
    enc["labels"] = torch.tensor(labels, dtype=torch.long)
    return enc

# 4. Load your new test set
def load_jsonl(path):
    import pandas as pd
    rows = []
    with open(path, "r") as f:
        for line in f:
            rows.append(json.loads(line))
    return pd.DataFrame(rows)

df_new_test = load_jsonl("/content/drive/MyDrive/name2nat/cleaned/test_validated.jsonl")
# df_new_test = pd.read_csv("/content/drive/MyDrive/name2nat/cleaned/test_generated_100.csv")
# df_new_test = df_new_test.rename(columns={'Name': 'name', 'OAG_true': 'country'})

# filter out countries unseen in training (just in case)
df_new_test = df_new_test[df_new_test["country"].isin(label2idx.keys())]
# df_new_test = df_new_test[df_new_test['source'] != "csv_fullname"]

new_test_ds = NameDataset(df_new_test)
new_test_loader = DataLoader(
    new_test_ds,
    batch_size=128,
    shuffle=False,
    collate_fn=collate_fn_mbert,
)

# 5. Evaluation loop
def evaluate(model, dataloader):
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            y = batch.pop("labels")
            out = model(**batch)
            logits = out.logits
            preds.extend(logits.argmax(dim=-1).cpu().tolist())
            labels.extend(y.cpu().tolist())
    return preds, labels

pred_new, true_new = evaluate(model, new_test_loader)

print("Accuracy:", accuracy_score(true_new, pred_new))
print("Macro F1:", f1_score(true_new, pred_new, average="macro"))
print("Weighted F1:", f1_score(true_new, pred_new, average="weighted"))

unique_labels = sorted(set(true_new) | set(pred_new))
print(classification_report(
    true_new,
    pred_new,
    labels=unique_labels,
    target_names=[idx2label[i] for i in unique_labels]
))

Accuracy: 0.46741930088643585
Macro F1: 0.3499234631445458
Weighted F1: 0.41822359376127755
                      precision    recall  f1-score   support

           argentina       0.49      0.49      0.49       649
           australia       0.22      0.01      0.03       998
             austria       0.75      0.26      0.39       658
             bahrain       0.00      0.00      0.00       938
          bangladesh       0.51      0.37      0.43       916
             belarus       1.00      0.01      0.02      1023
             belgium       0.78      0.44      0.56       693
              bhutan       0.00      0.00      0.00       942
             bolivia       0.00      0.00      0.00         6
              brazil       0.59      0.82      0.69       858
              brunei       0.00      0.00      0.00       981
            bulgaria       0.62      0.78      0.69       196
            cambodia       1.00      0.00      0.00       991
              canada       0.36      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# ---------------------------------------------------------
# 6. Save Predictions to CSV
# ---------------------------------------------------------

# 1. Map the predicted integer indices back to actual country names
# We use the 'idx2label' dictionary loaded from meta.json earlier
predicted_countries = [idx2label[idx] for idx in pred_new]

# 2. Create a results DataFrame based on the original test data
df_results = df_new_test.copy()

# 3. Add the prediction columns
df_results["pred_idx"] = pred_new
df_results["pred_country"] = predicted_countries

# Optional: Add an 'is_correct' column for easier manual analysis
df_results["is_correct"] = df_results["country"] == df_results["pred_country"]

# 4. Save to CSV
output_path = "/content/drive/MyDrive/name2nat/test_predictions_result_original.csv"
df_results.to_csv(output_path, index=False)

print(f"✅ Predictions saved to: {output_path}")
print(df_results[["name", "country", "pred_country", "is_correct"]].head())

✅ Predictions saved to: /content/drive/MyDrive/name2nat/test_predictions_result_original.csv
                 name       country  pred_country  is_correct
0     Ahmed Al-Otaibi  saudi arabia  saudi arabia        True
1     Mishari Al-Ajmi  saudi arabia        kuwait       False
2  Adel Al-Abdulkarim  saudi arabia        kuwait       False
3     Hazim Al-Harthi  saudi arabia         qatar       False
4        Suha Al-Hadi  saudi arabia        kuwait       False
