## 1) Paths + load model

In [1]:
from google.colab import drive
drive.mount("/content/drive")

import os, glob, pandas as pd, torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# --- Locations on the mounted Drive -----------------------------------------
# DEV_DIR holds the input CSVs, MODEL_DIR the saved final student checkpoint,
# and OUT_DIR (inside OUT_ROOT) receives the prediction files.
DEV_DIR   = "/content/drive/MyDrive/dev"
MODEL_DIR = "/content/drive/MyDrive/student_xlmr_base_distill/FINAL_TRAIN/FINAL"
OUT_ROOT  = "/content/drive/MyDrive/submission"
OUT_DIR   = os.path.join(OUT_ROOT, "subtask_1")
os.makedirs(OUT_DIR, exist_ok=True)  # no error if the folder already exists

# --- Inference settings -----------------------------------------------------
MAX_LEN = 256   # passed to the tokenizer as max_length (with truncation) in predict_df
BATCH = 64      # number of texts scored per forward pass in predict_df

# --- Device -----------------------------------------------------------------
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
print("GPU available:", torch.cuda.is_available())
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# --- Tokenizer + classifier -------------------------------------------------
# Both are loaded from the same checkpoint folder; the model is moved to the
# selected device and switched to eval mode.
tok = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model = model.to(device)
model = model.eval()


Mounted at /content/drive
GPU available: True


The tokenizer you are loading from '/content/drive/MyDrive/student_xlmr_base_distill/FINAL_TRAIN/FINAL' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


## 2) Predict one file

In [2]:
import numpy as np

@torch.no_grad()
def predict_df(df, batch_size=None, max_length=None, tokenizer=None,
               classifier=None, torch_device=None):
    """Predict a class id for every row of ``df``.

    Texts are tokenized and scored in consecutive mini-batches; the predicted
    class is the argmax over the classifier's logits. Gradient tracking is
    disabled for the whole call by the ``torch.no_grad()`` decorator.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``"id"`` and a ``"text"`` column.
    batch_size : int, optional
        Texts per forward pass. Defaults to the notebook-level ``BATCH``.
    max_length : int, optional
        Truncation length handed to the tokenizer. Defaults to the
        notebook-level ``MAX_LEN``.
    tokenizer : callable, optional
        Called as ``tokenizer(texts, truncation=True, max_length=...,
        padding=True, return_tensors="pt")`` and expected to return a mapping
        of tensors. Defaults to the notebook-level ``tok``.
    classifier : callable, optional
        Called with the encoded batch as keyword arguments; its result must
        expose ``.logits``. Defaults to the notebook-level ``model``.
    torch_device : str or torch.device, optional
        Device the encoded tensors are moved to. Defaults to the
        notebook-level ``device``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"id"`` (ids converted to ``str``) and ``"polarization"``
        (int64 class id), one row per input row, in input order.

    Raises
    ------
    ValueError
        If the effective batch size is smaller than 1.
    KeyError
        If ``df`` lacks the ``"id"`` or ``"text"`` column.
    """
    # Fall back to the notebook-level objects so a plain `predict_df(df)`
    # behaves as before; the globals are only touched when an argument is omitted.
    tokenizer = tok if tokenizer is None else tokenizer
    classifier = model if classifier is None else classifier
    torch_device = device if torch_device is None else torch_device
    batch_size = BATCH if batch_size is None else batch_size
    max_length = MAX_LEN if max_length is None else max_length
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # astype(str) alone would turn missing values into the literal strings
    # "nan"/"None" and feed those to the model; score them as empty text instead.
    text_col = df["text"]
    texts = text_col.astype(str).where(text_col.notna(), "").tolist()
    ids = df["id"].astype(str).tolist()

    preds = []
    for start in range(0, len(texts), batch_size):
        enc = tokenizer(
            texts[start:start + batch_size],
            truncation=True,
            max_length=max_length,
            padding=True,
            return_tensors="pt",
        )
        enc = {k: v.to(torch_device) for k, v in enc.items()}

        logits = classifier(**enc).logits
        preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

    # An explicit dtype keeps the label column integer even for an empty frame.
    return pd.DataFrame({"id": ids, "polarization": pd.Series(preds, dtype="int64")})


## 3) Run all languages + save + zip

In [3]:
import shutil

# Every CSV directly inside DEV_DIR is treated as one language's dev set;
# sorting makes the processing (and log) order deterministic.
paths = sorted(glob.glob(os.path.join(DEV_DIR, "*.csv")))
print("Found dev files:", len(paths))
assert len(paths) > 0, "No dev csvs found. Check DEV_DIR path."

for p in paths:
    lang = os.path.splitext(os.path.basename(p))[0]  # e.g., "arb"

    # Read id/text verbatim as strings: default type inference can rewrite ids
    # (e.g. drop leading zeros), and default NA parsing would turn texts such
    # as "NA", "null" or "None" into NaN before they ever reach the model.
    df = pd.read_csv(
        p,
        low_memory=False,
        dtype={"id": str, "text": str},
        keep_default_na=False,
    )

    # must have id + text
    assert "id" in df.columns and "text" in df.columns, f"Missing id/text in {p}"

    pred = predict_df(df)
    # Guard against silently dropping or duplicating rows before writing the file.
    assert len(pred) == len(df), f"Row count mismatch for {p}: {len(pred)} != {len(df)}"

    out_path = os.path.join(OUT_DIR, f"pred_{lang}.csv")
    pred.to_csv(out_path, index=False)
    print("Wrote:", out_path, "rows:", len(pred))

# zip folder as required
zip_path = os.path.join(OUT_ROOT, "subtask_1.zip")
if os.path.exists(zip_path):
    os.remove(zip_path)  # start from a clean archive on re-runs

# Archive only the "subtask_1" folder (relative to OUT_ROOT); make_archive
# appends ".zip" to base_name, so the result lands at zip_path.
shutil.make_archive(base_name=os.path.join(OUT_ROOT, "subtask_1"), format="zip", root_dir=OUT_ROOT, base_dir="subtask_1")
print("Zipped:", zip_path)


Found dev files: 22
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_amh.csv rows: 166
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_arb.csv rows: 169
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_ben.csv rows: 166
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_deu.csv rows: 159
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_eng.csv rows: 160
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_fas.csv rows: 164
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_hau.csv rows: 182
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_hin.csv rows: 137
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_ita.csv rows: 166
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_khm.csv rows: 332
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_mya.csv rows: 144
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_nep.csv rows: 100
Wrote: /content/drive/MyDrive/submission/subtask_1/pred_ori.csv rows: 118
Wrote: /content/dr