In [7]:
# Flush and unmount Google Drive so the mount point can be reset by the
# following cell. When nothing is mounted the library just reports that.
from google.colab import drive

drive.flush_and_unmount()

print("unmounted")


Drive not mounted, so nothing to flush and unmount.
unmounted


In [8]:
import shutil, os

# Reset the local mount-point directory so a fresh drive.mount() starts clean.
MOUNT_POINT = "/content/drive"

# Safety guard: if Drive is still mounted here, rmtree would walk into the
# mounted filesystem and delete the user's actual Drive files rather than a
# stale local folder. Refuse to continue until it has been unmounted.
if os.path.ismount(MOUNT_POINT):
    raise RuntimeError(
        f"{MOUNT_POINT} is still mounted; run drive.flush_and_unmount() first "
        "so that clearing the directory cannot delete Drive contents."
    )

# ignore_errors=True: a missing directory is fine, we recreate it right after.
shutil.rmtree(MOUNT_POINT, ignore_errors=True)
os.makedirs(MOUNT_POINT, exist_ok=True)
print("cleared /content/drive")


cleared /content/drive


In [1]:
# (Re)mount Google Drive; force_remount=True replaces any existing mount.
from google.colab import drive

mount_point = "/content/drive"
drive.mount(mount_point, force_remount=True)


Mounted at /content/drive


In [8]:
# =========================
# INSPECT MASTER + SHARD FILES
# =========================
# Read-only diagnostics: prints the master schema and basic distributions, then
# lists the translation shard CSVs and reports which key / translation columns
# each one carries. Nothing is written to disk.
import glob
import os

import pandas as pd

MASTER_IN = "/content/drive/MyDrive/master_dataset.csv"
SHARDS_DIR = "/content/drive/MyDrive/translation_shards"


def first_present(candidates, available):
    """Return the first name in `candidates` found in `available`, else None."""
    return next((name for name in candidates if name in available), None)


# ---- master: shape, schema, split / lang distribution ----
print("=== MASTER INSPECTION ===")
master = pd.read_csv(MASTER_IN, low_memory=False)
print("Master shape:", master.shape)
print("Master columns:", list(master.columns))

# which of the columns used by later cells are present
master_cols = set(master.columns)
for col in ("id", "row_id", "lang", "split", "text", "text_en"):
    print(f"Master has {col}:", col in master_cols)

print("\nMaster split counts:")
if "split" in master_cols:
    print(master["split"].value_counts(dropna=False).to_string())
else:
    print("NO split col")

print("\nMaster lang counts (top 15):")
if "lang" in master_cols:
    print(master["lang"].value_counts().head(15).to_string())
else:
    print("NO lang col")

# small preview restricted to the columns that exist
preview_order = ["id", "row_id", "lang", "split", "text", "polarized"]
show_cols = [col for col in preview_order if col in master_cols]
print("\nMaster sample (3 rows):")
print(master[show_cols].head(3).to_string(index=False))

# ---- shard discovery ----
print("\n=== SHARD FILE DISCOVERY ===")
shard_pattern = os.path.join(SHARDS_DIR, "shard_person*_S*_*.csv")
shard_paths = sorted(glob.glob(shard_pattern))
print("Found shard CSVs:", len(shard_paths))
print("Shard filenames:")
for path in shard_paths:
    print(" -", os.path.basename(path))

assert len(shard_paths) > 0, "No shard CSVs found."

# ---- schema of the first few shards ----
print("\n=== SHARD COLUMN CHECK (first 3 files) ===")
columns_of_interest = ("id", "row_id", "lang", "text", "text_en",
                       "translation", "translated_text", "output")
for path in shard_paths[:3]:
    df = pd.read_csv(path, low_memory=False)
    print("\nFile:", os.path.basename(path))
    print("  shape:", df.shape)
    print("  columns:", list(df.columns))
    for col in columns_of_interest:
        if col in df.columns:
            print("  has:", col)

# ---- per-shard guess of the key column and the translation column ----
print("\n=== SHARD KEY SANITY (all shard files) ===")
summary = []
for path in shard_paths:
    df = pd.read_csv(path, low_memory=False)
    cols = set(df.columns)
    n_rows, n_cols = df.shape
    key = first_present(("row_id", "id"), cols)
    out = first_present(("text_en", "translation", "translated_text"), cols)
    summary.append([os.path.basename(path), n_rows, n_cols, key, out])
summary_df = pd.DataFrame(
    summary, columns=["file", "rows", "cols", "key_col", "text_en_col_guess"]
)
print(summary_df.to_string(index=False))

# ---- a few rows from the first shard ----
print("\n=== SHARD SAMPLE ROWS (pick one file) ===")
pick = shard_paths[0]
df0 = pd.read_csv(pick, low_memory=False)
print("Picked:", os.path.basename(pick))
# compact view limited to the relevant columns that exist in this shard
relevant = ["id", "row_id", "lang", "text", "text_en", "translation", "translated_text"]
cand = [col for col in relevant if col in df0.columns]
print(df0[cand].head(5).to_string(index=False))


=== MASTER INSPECTION ===
Master shape: (77368, 10)
Master columns: ['id', 'text', 'political', 'racial/ethnic', 'religious', 'gender/sexual', 'other', 'lang', 'split', 'polarized']
Master has id: True
Master has row_id: False
Master has lang: True
Master has split: True
Master has text: True
Master has text_en: False

Master split counts:
split
train    73681
dev       3687

Master lang counts (top 15):
lang
swa    7340
khm    6972
zho    4494
hau    3833
urd    3740
arb    3549
rus    3515
ita    3500
ben    3499
amh    3498
spa    3470
fas    3459
eng    3382
deu    3339
mya    3033

Master sample (3 rows):
                                  id lang split                                                                                                text  polarized
spa_bc7bf0a1b710cd724cf96b5eeb020bff  spa train "bueno, tirando"\ny si hay repregunta, entonces palante.\nprotestante no se. proteston soy un rato.        0.0
spa_688aec0f6abf4e0be44ecbfa886251cc  spa train                 

In [9]:
import glob
import os

import pandas as pd

MASTER_IN = "/content/drive/MyDrive/master_dataset.csv"
SHARDS_DIR = "/content/drive/MyDrive/translation_shards"

# Checks whether master "id" values and shard "row_id" values share a key
# space, i.e. whether the shards could be joined back to master by id.
master = pd.read_csv(MASTER_IN, low_memory=False)

# load one shard (the first in sorted filename order)
shard_files = sorted(glob.glob(os.path.join(SHARDS_DIR, "shard_person*_S*_*.csv")))
p = shard_files[0]
sh = pd.read_csv(p, low_memory=False)

print("Master id example:", master["id"].iloc[0])
print("Shard row_id example:", sh["row_id"].iloc[0])

# key overlap check, comparing both sides as strings
master_ids = set(master["id"].astype(str))
shard_row_ids = set(sh["row_id"].astype(str))

overlap = len(master_ids.intersection(shard_row_ids))
print("Overlap count (master.id ∩ shard.row_id):", overlap)
# NOTE: despite the label, this value is a fraction in [0, 1], not a percent.
print("Overlap % of shard:", overlap / len(shard_row_ids))


Master id example: spa_bc7bf0a1b710cd724cf96b5eeb020bff
Shard row_id example: a3c780d72560856c
Overlap count (master.id ∩ shard.row_id): 0
Overlap % of shard: 0.0


In [10]:
import pandas as pd

# Counts master rows that repeat an earlier (lang, split, text) triple. This is
# the same triple the shard merge joins on, so duplicates here mean several
# master rows will receive the same translation.
master = pd.read_csv("/content/drive/MyDrive/master_dataset.csv", low_memory=False)

dup_key = ["lang", "split", "text"]
dup = master.duplicated(subset=dup_key).sum()
print("Master duplicates by (lang,split,text):", dup)

if dup > 0:
    # show a few duplicates; keep=False marks every member of a duplicate group
    in_dup_group = master.duplicated(subset=dup_key, keep=False)
    d = master[in_dup_group].sort_values(dup_key).head(10)
    print(d[["id", "lang", "split", "text"]].to_string(index=False))


Master duplicates by (lang,split,text): 3
                                  id lang split                                                                                                                  text
ita_4aabf598b2b7663c2912407ef5a57d63  ita train                                     Bici rubate ai ciclisti della Milano-Saremo: trovate in un campo nomadi @URL @URL
ita_3f40e90b06195d56f6b83aece1c63ad8  ita train                                     Bici rubate ai ciclisti della Milano-Saremo: trovate in un campo nomadi @URL @URL
ita_32d6701aaedb97e4f6fa87596554b13e  ita train Qui Radio Londra: Roma, cinese morta, ore contate per chi l'ha scippata: «Vengono dal campo rom» - Il Messaggero @URL
ita_e21c9d831fd82aba4d0436174505dece  ita train Qui Radio Londra: Roma, cinese morta, ore contate per chi l'ha scippata: «Vengono dal campo rom» - Il Messaggero @URL
ita_4c67b8940fd4c4c49849110f58f70d97  ita train                                                                     Roma, quei p

## Filling Master text_en

In [11]:
import os, glob
import pandas as pd

# Fills master's `text_en` column from the translation shard CSVs by joining on
# (lang, split, text), then writes the result to a NEW file (MASTER_OUT) so the
# original master is left untouched.

MASTER_IN  = "/content/drive/MyDrive/master_dataset.csv"
SHARDS_DIR = "/content/drive/MyDrive/translation_shards"
MASTER_OUT = "/content/drive/MyDrive/master_dataset_with_text_en.csv"

OUT_COL  = "text_en"
LANG_COL = "lang"
SPLIT_COL= "split"
TEXT_COL = "text"
MERGE_KEYS = [LANG_COL, SPLIT_COL, TEXT_COL]

FILL_ONLY_MISSING = True   # keep True


def is_blank(series):
    """Boolean mask: True where a value is NaN/NA or an empty/whitespace-only string."""
    return series.isna() | (series.astype(str).str.strip() == "")


# 1) Load master
master = pd.read_csv(MASTER_IN, low_memory=False)

# ensure text_en exists
if OUT_COL not in master.columns:
    master[OUT_COL] = pd.NA

# "filled" means non-blank, the same definition the missing report below uses
before = int((~is_blank(master[OUT_COL])).sum())

# 2) Load all shard CSVs
shard_paths = sorted(glob.glob(os.path.join(SHARDS_DIR, "shard_person*_S*_*.csv")))
print("Found shard CSVs:", len(shard_paths))
assert len(shard_paths) > 0, "No shard CSVs found."

need = MERGE_KEYS + [OUT_COL]
shards = []
for p in shard_paths:
    df = pd.read_csv(p, low_memory=False)

    # keep only needed columns; skip files that cannot take part in the merge
    missing_cols = [c for c in need if c not in df.columns]
    if missing_cols:
        print("[SKIP]", os.path.basename(p), "missing", missing_cols)
        continue

    df = df[need].copy()

    # clean translations: drop NaN while it is still detectable (after
    # astype(str) it would become the literal string "nan"), then drop blanks
    df = df[df[OUT_COL].notna()].copy()
    df[OUT_COL] = df[OUT_COL].astype(str).str.strip()
    df = df[df[OUT_COL] != ""]

    shards.append(df)

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not shards:
    raise RuntimeError(f"No usable shard CSVs: every file is missing one of {need}")

all_shards = pd.concat(shards, ignore_index=True)
print("Shard rows (after dropping empty text_en):", len(all_shards))

# 3) Deduplicate shards on the merge key (last occurrence wins)
all_shards = all_shards.drop_duplicates(subset=MERGE_KEYS, keep="last")
print("Shard unique (lang,split,text):", len(all_shards))

# 4) Merge into master on (lang, split, text)
# validate="many_to_one": master may repeat a key, the shard side must not;
# otherwise the left join would silently multiply master rows.
merged = master.merge(
    all_shards,
    on=MERGE_KEYS,
    how="left",
    suffixes=("", "_shard"),
    validate="many_to_one",
)
assert len(merged) == len(master), "merge changed the number of master rows"

# 5) Fill master text_en
shard_col = OUT_COL + "_shard"
if FILL_ONLY_MISSING:
    # only rows whose current text_en is blank take the shard value
    miss = is_blank(merged[OUT_COL])
    merged.loc[miss, OUT_COL] = merged.loc[miss, shard_col]
else:
    # shard value wins wherever it exists; otherwise keep the existing text_en
    merged[OUT_COL] = merged[shard_col].combine_first(merged[OUT_COL])

merged = merged.drop(columns=[shard_col])

# 6) Fill counts + remaining missing report
missing = is_blank(merged[OUT_COL])
after = int((~missing).sum())
print("Master text_en filled before:", before)
print("Master text_en filled after :", after)
print("Newly filled:", after - before)

print("\nRemaining missing text_en:", int(missing.sum()))
print("Missing by lang (top 20):")
print(merged.loc[missing, LANG_COL].value_counts().head(20).to_string())

# 7) Save
merged.to_csv(MASTER_OUT, index=False)
print("\nSaved:", MASTER_OUT)


Found shard CSVs: 8
Shard rows (after dropping empty text_en): 41158
Shard unique (lang,split,text): 41155
Master text_en filled before: 0
Master text_en filled after : 41158
Newly filled: 41158

Remaining missing text_en: 36210
Missing by lang (top 20):
lang
hau    3833
urd    3740
arb    3549
rus    3515
spa    3470
eng    3382
deu    3339
hin    2881
pol    2510
tur    2479
ita     573
swa     573
fas     440
amh     362
khm     334
tel     273
zho     214
mya     212
ben     167
pan     132

Saved: /content/drive/MyDrive/master_dataset_with_text_en.csv


## Copy teacher languages into text_en

In [12]:
import pandas as pd

# For rows in TEACHER_LANGS whose text_en is still blank, copy the original
# `text` into `text_en`. The file at PATH is read and then overwritten in place.
PATH = "/content/drive/MyDrive/master_dataset_with_text_en.csv"
df = pd.read_csv(PATH, low_memory=False)

TEACHER_LANGS = ['eng','spa','deu','rus','tur','pol','arb']


def text_en_missing(frame):
    """Mask of rows whose text_en is NaN or an empty/whitespace-only string."""
    col = frame["text_en"]
    return col.isna() | (col.astype(str).str.strip() == "")


missing = text_en_missing(df)
mask_teacher = df["lang"].isin(TEACHER_LANGS)
before = (~missing).sum()

# copy the source text for teacher-language rows that have no text_en yet
to_fill = mask_teacher & missing
df.loc[to_fill, "text_en"] = df.loc[to_fill, "text"]

missing2 = text_en_missing(df)
after = (~missing2).sum()

print("filled before:", int(before))
print("filled after :", int(after))
print("newly filled :", int(after - before))

print("\nRemaining missing by lang (top 20):")
remaining_by_lang = df.loc[missing2, "lang"].value_counts()
print(remaining_by_lang.head(20).to_string())

df.to_csv(PATH, index=False)
print("\nSaved updated file:", PATH)


filled before: 41158
filled after : 63402
newly filled : 22244

Remaining missing by lang (top 20):
lang
hau    3833
urd    3740
hin    2881
swa     573
ita     573
fas     440
amh     362
khm     334
tel     273
zho     214
mya     212
ben     167
pan     132
ori     132
nep     100

Saved updated file: /content/drive/MyDrive/master_dataset_with_text_en.csv


In [13]:
import pandas as pd

# Compares rows that have text_en with rows that do not: the label balance
# (train split only) and the mean character length of the original text.
PATH = "/content/drive/MyDrive/master_dataset_with_text_en.csv"
df = pd.read_csv(PATH, low_memory=False)

text_en = df["text_en"]
miss = text_en.isna() | (text_en.astype(str).str.strip() == "")
have = ~miss
in_train = df["split"] == "train"

print("Missing count:", int(miss.sum()))

print("\nLabel dist (have_en):")
have_labels = df.loc[have & in_train, "polarized"]
print(have_labels.value_counts(normalize=True).to_string())

print("\nLabel dist (missing_en):")
miss_labels = df.loc[miss & in_train, "polarized"]
print(miss_labels.value_counts(normalize=True).to_string())

# character length of the source text, over all splits
text_len = df["text"].astype(str).str.len()
print("\nText length mean (have_en vs missing_en):")
print("have_en:", text_len[have].mean())
print("missing:", text_len[miss].mean())


Missing count: 13966

Label dist (have_en):
polarized
1.0    0.534131
0.0    0.465869

Label dist (missing_en):
polarized
1.0    0.515574
0.0    0.484426

Text length mean (have_en vs missing_en):
have_en: 105.64728242011293
missing: 112.10067306315337


## Logits Generation

In [3]:
import os, re
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ---------- paths ----------
# Input master (with text_en), merged output, and the logits CSV that later
# code appends to and reads back in order to skip ids that are already done.
MASTER_PATH = "/content/drive/MyDrive/master_dataset_with_text_en.csv"
LOGITS_PATH = "/content/drive/MyDrive/distill/teacherA_logits_real_train.csv"  # resumable
OUT_MASTER  = "/content/drive/MyDrive/master_with_teacherA_logits.csv"

# the logits CSV lives in its own folder; create it up front
logits_dir = os.path.dirname(LOGITS_PATH)
os.makedirs(logits_dir, exist_ok=True)

# Teacher location. Alternative (a folder containing checkpoints):
#   "/content/drive/MyDrive/teachers/teacherA_xlmr_large"
# If you have a FINAL folder, use it instead:
TEACHER_DIR_BASE = "/content/drive/MyDrive/teachers/teacherA_xlmr_large_FINAL"

# ---------- pick latest checkpoint automatically (if not FINAL) ----------
def pick_ckpt(base_dir):
    """Resolve the model directory to load from `base_dir`.

    If `base_dir` itself contains a ``config.json`` it is returned unchanged.
    Otherwise the ``checkpoint-*`` sub-directory with the highest number in its
    name is returned.

    Entries starting with ``checkpoint-`` that are not directories, or whose
    name contains no digits (e.g. ``checkpoint-best``), are ignored rather than
    crashing the numeric sort.

    Raises:
        FileNotFoundError: if no numbered checkpoint directory exists.
    """
    if os.path.isfile(os.path.join(base_dir, "config.json")):
        return base_dir  # already a model dir

    numbered = []
    for name in os.listdir(base_dir):
        if not name.startswith("checkpoint-"):
            continue
        if not os.path.isdir(os.path.join(base_dir, name)):
            continue  # e.g. a stray "checkpoint-500.zip" is not loadable as a model dir
        match = re.search(r"\d+", name)
        if match is None:
            continue  # no step number to order by
        numbered.append((int(match.group()), name))

    if not numbered:
        raise FileNotFoundError(f"No checkpoints found in {base_dir}")

    # highest step number wins
    return os.path.join(base_dir, max(numbered)[1])

TEACHER_CKPT = pick_ckpt(TEACHER_DIR_BASE)
print("Using teacher checkpoint:", TEACHER_CKPT)

# ---------- load data ----------
df = pd.read_csv(MASTER_PATH, low_memory=False)

# real + train + has text_en
has_en = df["text_en"].notna() & (df["text_en"].astype(str).str.strip() != "")
is_train = df["split"].astype(str).eq("train")

# if is_synthetic exists, exclude it; otherwise master is real anyway
if "is_synthetic" in df.columns:
    is_real = df["is_synthetic"].fillna(0).astype(int).eq(0)
else:
    is_real = pd.Series(True, index=df.index)

pool = df[is_train & has_en & is_real].copy()
pool["id"] = pool["id"].astype(str)
print("Real train rows with text_en:", pool.shape)

# ---------- resume (skip already computed ids) ----------
done_ids = set()
if os.path.exists(LOGITS_PATH):
    prev = pd.read_csv(LOGITS_PATH, usecols=["id"])
    done_ids = set(prev["id"].astype(str))
    print("Already have logits for:", len(done_ids))

pool = pool[~pool["id"].isin(done_ids)].reset_index(drop=True)
print("To compute now:", len(pool))

LOGIT_COLS = ["id", "teacherA_logit0", "teacherA_logit1", "teacherA_p1"]

if len(pool) == 0:
    print("Nothing to do. (All logits already computed)")
else:
    # ---------- load teacher ----------
    tok = AutoTokenizer.from_pretrained(TEACHER_CKPT)
    model = AutoModelForSequenceClassification.from_pretrained(TEACHER_CKPT)
    model.eval().to("cuda")

    MAX_LEN = 256
    BATCH = 64  # A100 ok; reduce to 32 if you hit OOM
    FLUSH_EVERY = 20  # batches between appends to LOGITS_PATH

    class TextEnDS(Dataset):
        """Yields (id, text_en) string pairs; tokenization happens per batch in the loop."""
        def __init__(self, df):
            self.ids = df["id"].tolist()
            self.texts = df["text_en"].astype(str).tolist()
        def __len__(self): return len(self.ids)
        def __getitem__(self, i): return self.ids[i], self.texts[i]

    def flush_rows(buffer):
        """Append buffered rows to LOGITS_PATH, empty the buffer, return rows written."""
        if not buffer:
            return 0
        chunk = pd.DataFrame(buffer, columns=LOGIT_COLS)
        # write the header only when creating the file
        header = not os.path.exists(LOGITS_PATH)
        chunk.to_csv(LOGITS_PATH, mode="a", header=header, index=False)
        buffer.clear()
        return len(chunk)

    ds = TextEnDS(pool)
    dl = DataLoader(ds, batch_size=BATCH, shuffle=False, num_workers=2)

    rows = []
    written = 0
    try:
        with torch.no_grad():
            for step, (ids, texts) in enumerate(dl, start=1):
                enc = tok(list(texts), truncation=True, max_length=MAX_LEN, padding=True, return_tensors="pt")
                enc = {k: v.to("cuda") for k, v in enc.items()}

                # torch.autocast replaces the deprecated torch.cuda.amp.autocast
                with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                    logits = model(**enc).logits  # [bs,2]

                # do softmax and export in float32, outside the bf16 region
                logits = logits.float()
                p1 = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
                log0 = logits[:, 0].cpu().numpy()
                log1 = logits[:, 1].cpu().numpy()

                rows.extend(
                    (str(i), float(a), float(b), float(p))
                    for i, a, b, p in zip(ids, log0, log1, p1)
                )

                # Persist periodically so an interrupted run keeps its progress
                # and the resume step above can skip what was already written.
                if step % FLUSH_EVERY == 0:
                    written += flush_rows(rows)
    finally:
        # also runs on interruption/error: save whatever is still buffered
        written += flush_rows(rows)

    print("Wrote logits:", written, "->", LOGITS_PATH)

# ---------- add logits columns into master and save ----------
logits_all = pd.read_csv(LOGITS_PATH, low_memory=False)
logits_all["id"] = logits_all["id"].astype(str)
# a repeated id in the append-only file would duplicate master rows in the join
logits_all = logits_all.drop_duplicates(subset="id", keep="last")

df["id"] = df["id"].astype(str)
df2 = df.merge(logits_all, on="id", how="left", validate="many_to_one")
assert len(df2) == len(df), "merge changed the number of master rows"

df2.to_csv(OUT_MASTER, index=False)
print("Saved merged master with teacher logits:", OUT_MASTER)
print("Non-null teacherA_p1:", int(df2["teacherA_p1"].notna().sum()))


Using teacher checkpoint: /content/drive/MyDrive/teachers/teacherA_xlmr_large_FINAL
Real train rows with text_en: (70546, 11)
Already have logits for: 62348
To compute now: 8198


The tokenizer you are loading from '/content/drive/MyDrive/teachers/teacherA_xlmr_large_FINAL' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
  with torch.cuda.amp.autocast(dtype=torch.bfloat16):


Wrote logits: 8198 -> /content/drive/MyDrive/distill/teacherA_logits_real_train.csv
Saved merged master with teacher logits: /content/drive/MyDrive/master_with_teacherA_logits.csv
Non-null teacherA_p1: 70546


## Adding the last 3 languages (took too much time during translation)

In [2]:
import pandas as pd

# Fills still-missing text_en values from the two late urd/hin/hau shards.
# MASTER_OUT is the same file as MASTER_IN, so the input is overwritten; the
# merge is therefore validated before anything is saved.
MASTER_IN  = "/content/drive/MyDrive/master_dataset_with_text_en.csv"
MASTER_OUT = "/content/drive/MyDrive/master_dataset_with_text_en.csv"  # overwrite same file

s0 = pd.read_csv("/content/drive/MyDrive/translation_shards/shard_person0_S0_urd_hin_hau.csv", low_memory=False)
s1 = pd.read_csv("/content/drive/MyDrive/translation_shards/shard_person0_S1_urd_hin_hau.csv", low_memory=False)

master = pd.read_csv(MASTER_IN, low_memory=False)

# keep only needed cols
keys = ["lang","split","text"]
need = keys + ["text_en"]
sh = pd.concat([s0[need], s1[need]], ignore_index=True)

# drop empty text_en + dedupe by (lang,split,text)
# NaN is dropped before astype(str), which would turn it into the string "nan"
sh = sh[sh["text_en"].notna()].copy()
sh["text_en"] = sh["text_en"].astype(str).str.strip()
sh = sh[sh["text_en"] != ""]
sh = sh.drop_duplicates(subset=keys, keep="last")

# merge + fill only missing
# validate="many_to_one": the shard side must be unique per key so the left
# join cannot multiply master rows.
merged = master.merge(sh, on=keys, how="left", suffixes=("", "_shard"), validate="many_to_one")
assert len(merged) == len(master), "merge changed the number of master rows; not saving"

miss = merged["text_en"].isna() | (merged["text_en"].astype(str).str.strip() == "")
merged.loc[miss, "text_en"] = merged.loc[miss, "text_en_shard"]
merged = merged.drop(columns=["text_en_shard"])

# save
merged.to_csv(MASTER_OUT, index=False)
print("Saved:", MASTER_OUT)

# sanity: missing by lang
missing = merged["text_en"].isna() | (merged["text_en"].astype(str).str.strip() == "")
print("Remaining missing total:", int(missing.sum()))
print(merged.loc[missing, "lang"].value_counts().head(20).to_string())


Saved: /content/drive/MyDrive/master_dataset_with_text_en.csv
Remaining missing total: 5768
lang
urd    1459
swa     573
ita     573
hin     564
fas     440
amh     362
khm     334
tel     273
hau     233
zho     214
mya     212
ben     167
pan     132
ori     132
nep     100
