In [19]:
!pip -q install --upgrade --force-reinstall \
  "transformers==4.57.3" \
  "datasets==4.4.2" \
  "accelerate==1.12.0" \
  "huggingface_hub==0.36.0" \
  "evaluate==0.4.6" \
  "pyarrow==22.0.0" \
  "tokenizers==0.22.1" \
  "safetensors==0.7.0"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m119.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m122.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.2/507.2 kB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.0/201.0 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
drive.mount("/content/drive")

## Inspecting how our data look like so far

In [2]:
import pandas as pd
import numpy as np

PATH = "/content/drive/MyDrive/teacher_training_data_aug_v2.csv"

df = pd.read_csv(PATH)

print("=== SHAPE ===")
print(df.shape)

print("\n=== COLUMNS ===")
print(list(df.columns))

print("\n=== DTYPES ===")
print(df.dtypes)

print("\n=== HEAD (3) ===")
print(df.head(3).to_string(index=False))

print("\n=== MISSING VALUES (top 30) ===")
na = df.isna().sum().sort_values(ascending=False)
na = na[na > 0]
print(na.head(30).to_string() if len(na) else "No missing values ✅")

# ---- Basic distributions
def vc(col, top=30):
    if col in df.columns:
        print(f"\n=== VALUE COUNTS: {col} ===")
        print(df[col].value_counts(dropna=False).head(top).to_string())
    else:
        print(f"\n(MISSING COLUMN) {col}")

vc("polarized")
vc("lang")
vc("split")
vc("is_synthetic")
vc("augmentation_type")

# ---- Topic/type columns (your Subtask-2 type vector columns)
TYPE_COLS = ["political", "racial/ethnic", "religious", "gender/sexual", "other"]
present_types = [c for c in TYPE_COLS if c in df.columns]

print("\n=== TYPE COLS PRESENT ===")
print(present_types)

if present_types:
    print("\n=== TYPE COLS SUMS (how many 1s) ===")
    print(df[present_types].fillna(0).astype(int).sum().to_string())

    print("\n=== TYPE VECTOR VALIDITY (row-wise) ===")
    row_sum = df[present_types].fillna(0).astype(int).sum(axis=1)
    print("rows with all 0:", int((row_sum == 0).sum()))
    print("rows with >=1 :", int((row_sum >= 1).sum()))
    print("rows with >1  :", int((row_sum > 1).sum()))

# ---- Text column guess + length stats
TEXT_COL_CANDIDATES = ["text", "sentence", "content", "tweet"]
text_col = next((c for c in TEXT_COL_CANDIDATES if c in df.columns), None)

if text_col:
    lens = df[text_col].astype(str).str.len()
    print(f"\n=== TEXT COLUMN DETECTED: {text_col} ===")
    print("len min/mean/median/max:", int(lens.min()), float(lens.mean()), float(lens.median()), int(lens.max()))

    # duplicates
    dup = df.duplicated(subset=["lang", text_col]).sum()
    print("\n=== DUPLICATES (by lang + text) ===")
    print("duplicate rows:", int(dup))

    # label x synthetic sanity
    if "is_synthetic" in df.columns and "polarized" in df.columns:
        print("\n=== LABEL x is_synthetic crosstab ===")
        print(pd.crosstab(df["is_synthetic"], df["polarized"], dropna=False))
else:
    print("\n⚠️ Could not auto-detect the text column. If it's named differently, tell me the column name.")

# ---- Quick stratified glimpse
if "lang" in df.columns and "polarized" in df.columns:
    print("\n=== LANG x LABEL (top 20 langs) ===")
    tab = pd.crosstab(df["lang"], df["polarized"])
    tab["total"] = tab.sum(axis=1)
    print(tab.sort_values("total", ascending=False).head(20).to_string())


  df = pd.read_csv(PATH)


=== SHAPE ===
(103489, 15)

=== COLUMNS ===
['id', 'text', 'political', 'racial/ethnic', 'religious', 'gender/sexual', 'other', 'lang', 'split', 'polarized', 'is_synthetic', 'augmentation_type', 'source_row_id', 'seed_text', 'seed_lang']

=== DTYPES ===
id                   object
text                 object
political             int64
racial/ethnic         int64
religious             int64
gender/sexual         int64
other                 int64
lang                 object
split                object
polarized             int64
is_synthetic          int64
augmentation_type    object
source_row_id        object
seed_text            object
seed_lang            object
dtype: object

=== HEAD (3) ===
                                  id                                                                                                text  political  racial/ethnic  religious  gender/sexual  other lang split  polarized  is_synthetic augmentation_type source_row_id seed_text seed_lang
spa_bc7bf0

## Fixing the data structure

In [3]:
PATH = "/content/drive/MyDrive/teacher_training_data_aug_v2.csv"

df = pd.read_csv(PATH, low_memory=False)

# ---- Normalize split
df["split"] = df["split"].astype(str)

# ---- Fill augmentation metadata for originals
df["augmentation_type"] = df["augmentation_type"].fillna("original")
df["source_row_id"]     = df["source_row_id"].fillna("")
df["seed_text"]         = df["seed_text"].fillna("")
df["seed_lang"]         = df["seed_lang"].fillna("")

# ---- Fix missing ids: create stable ids from (lang, index) or (lang, hash(text))
missing_id = df["id"].isna()
df.loc[missing_id, "id"] = (
    df.loc[missing_id, "lang"].astype(str)
    + "_missing_"
    + df.loc[missing_id].index.astype(str)
)

# ---- Ensure types are int {0,1}
TYPE_COLS = ["political", "racial/ethnic", "religious", "gender/sexual", "other"]
for c in ["polarized","is_synthetic"] + TYPE_COLS:
    df[c] = df[c].fillna(0).astype(int)

# ---- Optional: drop the 4 exact duplicates you found
df = df.drop_duplicates(subset=["lang","text"]).reset_index(drop=True)

# Splits
train_df = df[df["split"].isin(["train","train_synth_v2"])].reset_index(drop=True)
dev_df   = df[df["split"].eq("dev")].reset_index(drop=True)

print(train_df.shape, dev_df.shape)


(99799, 15) (3686, 15)


## Teacher data pool selection

In [4]:
SEED = 42
VAL_FRAC = 0.05

TEACHER_LANGS = ['eng','spa','deu','rus','tur','pol','arb']
TYPE_COLS = ["political", "racial/ethnic", "religious", "gender/sexual", "other"]

MASTER_PATH = "/content/drive/MyDrive/master_dataset.csv"
SYNTH_PATH  = "/content/drive/MyDrive/teacher_training_data_aug_v2.csv"

# -------------------
# Load
# -------------------
master = pd.read_csv(MASTER_PATH, low_memory=False)
synth  = pd.read_csv(SYNTH_PATH,  low_memory=False)

# -------------------
# Teacher pool = (master train only) + (synthetic), both restricted to teacher langs
# -------------------
master_train = master[master["split"].eq("train")].copy()
master_train = master_train[master_train["lang"].isin(TEACHER_LANGS)].copy()

synth_train  = synth[synth["lang"].isin(TEACHER_LANGS)].copy()

# Keep only needed columns (master is exactly these 10)
master_train = master_train[["id","text","lang","polarized"] + TYPE_COLS].copy()

# Synthetic may have extra cols; keep what we need + group helpers if present
keep_synth = ["id","text","lang","polarized"] + TYPE_COLS
for extra in ["source_row_id","is_synthetic"]:
    if extra in synth_train.columns:
        keep_synth.append(extra)
synth_train = synth_train[keep_synth].copy()

# Ensure dtypes
for c in ["polarized"] + TYPE_COLS:
    master_train[c] = master_train[c].fillna(0).astype(int)
    synth_train[c]  = synth_train[c].fillna(0).astype(int)

# Add missing helper cols for grouping
master_train["is_synthetic"] = 0
master_train["source_row_id"] = ""

if "is_synthetic" not in synth_train.columns:
    synth_train["is_synthetic"] = 1
else:
    synth_train["is_synthetic"] = synth_train["is_synthetic"].fillna(1).astype(int)

if "source_row_id" not in synth_train.columns:
    synth_train["source_row_id"] = ""
else:
    synth_train["source_row_id"] = synth_train["source_row_id"].fillna("").astype(str)

teacher_pool = pd.concat([master_train, synth_train], ignore_index=True)

# Optional: remove exact duplicates
teacher_pool = teacher_pool.drop_duplicates(subset=["lang","text"]).reset_index(drop=True)

print("Teacher pool shape:", teacher_pool.shape)
print("Teacher pool langs:\n", teacher_pool["lang"].value_counts())
print("Teacher pool labels:\n", teacher_pool["polarized"].value_counts())

# -------------------
# Internal split (group-safe + stratified per language)
# -------------------
tp = teacher_pool.copy()

tp["group_id"] = np.where(
    tp["source_row_id"].astype(str).str.len() > 0,
    tp["source_row_id"].astype(str),
    tp["id"].astype(str),
)

rng = np.random.default_rng(SEED)
train_idx, val_idx = [], []

for lang, sub in tp.groupby("lang", sort=False):
    g = sub.groupby("group_id")["polarized"].agg(lambda x: int(round(x.mean()))).reset_index()

    g0 = g[g["polarized"] == 0]["group_id"].to_list()
    g1 = g[g["polarized"] == 1]["group_id"].to_list()
    rng.shuffle(g0); rng.shuffle(g1)

    n0_val = max(1, int(len(g0) * VAL_FRAC)) if len(g0) else 0
    n1_val = max(1, int(len(g1) * VAL_FRAC)) if len(g1) else 0

    val_groups = set(g0[:n0_val] + g1[:n1_val])
    is_val = sub["group_id"].isin(val_groups)

    val_idx.extend(sub.index[is_val].to_list())
    train_idx.extend(sub.index[~is_val].to_list())

train_internal = tp.loc[train_idx].reset_index(drop=True)
val_internal   = tp.loc[val_idx].reset_index(drop=True)

print("\nInternal Train:", train_internal.shape, "Internal Val:", val_internal.shape)
print("\nTrain label dist:\n", train_internal["polarized"].value_counts(normalize=True))
print("\nVal label dist:\n", val_internal["polarized"].value_counts(normalize=True))

# Safety checks
assert set(train_internal["lang"]).issubset(set(TEACHER_LANGS))
assert set(val_internal["lang"]).issubset(set(TEACHER_LANGS))


Teacher pool shape: (48365, 11)
Teacher pool langs:
 lang
arb    8481
spa    8290
deu    7077
rus    6838
eng    6394
tur    6231
pol    5054
Name: count, dtype: int64
Teacher pool labels:
 polarized
0    26262
1    22103
Name: count, dtype: int64

Internal Train: (45971, 12) Internal Val: (2394, 12)

Train label dist:
 polarized
0    0.542886
1    0.457114
Name: proportion, dtype: float64

Val label dist:
 polarized
0    0.545113
1    0.454887
Name: proportion, dtype: float64


## Teacher A training

In [26]:
!pip -q uninstall -y accelerate transformers
!pip -q install --upgrade --force-reinstall \
  "accelerate==1.12.0" \
  "transformers==4.57.3"


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.4/201.4 kB[0m [31m54.9 kB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.
datasets 4.4.2 requires fsspec[http]<=2025.10.0,>=2023.1.0, but you have fsspec 2025.12.0 which is incompatible.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 2.4.0 which is incompatible.
torchvision 0.24.0+cu126 requires torch==2.9.0, but you have torch 2.9.1 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 

## Teacher Training

In [5]:
import numpy as np
import evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

SEED = 42
MAX_LEN = 128

tok = AutoTokenizer.from_pretrained("xlm-roberta-large")

# HuggingFace datasets
trainA = Dataset.from_pandas(train_internal[["id","text","lang","polarized"]])
valA   = Dataset.from_pandas(val_internal[["id","text","lang","polarized"]])

def tokenizeA(batch):
    return tok(batch["text"], truncation=True, max_length=MAX_LEN)

trainA = trainA.map(tokenizeA, batched=True)
valA   = valA.map(tokenizeA, batched=True)

trainA = trainA.rename_column("polarized", "labels")
valA   = valA.rename_column("polarized", "labels")

collator = DataCollatorWithPadding(tok)

f1 = evaluate.load("f1")
acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "acc": acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "f1_pos": f1.compute(predictions=preds, references=labels, pos_label=1)["f1"],
    }

modelA = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-large", num_labels=2)

# --------- batching knobs ----------
# Start with per_device_train_batch_size=4 + grad_accum=8 (effective 32).
# If you OOM: reduce per_device_train_batch_size to 2 and increase grad_accum to 16.
PER_DEVICE_TRAIN_BS = 4
GRAD_ACCUM = 8

argsA = TrainingArguments(
    output_dir="/content/drive/MyDrive/teachers/teacherA_xlmr_large",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,

    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,   # effective batch 32

    num_train_epochs=3,
    learning_rate=1e-5,
    warmup_ratio=0.06,
    weight_decay=0.01,
    max_grad_norm=1.0,

    bf16=True,
    fp16=False,

    logging_steps=100,
    save_total_limit=2,
    report_to="none",

    seed=SEED,
    data_seed=SEED,
)

trainerA = Trainer(
    model=modelA,
    args=argsA,
    train_dataset=trainA,
    eval_dataset=valA,
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainerA.train()
trainerA.evaluate()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/45971 [00:00<?, ? examples/s]

Map:   0%|          | 0/2394 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainerA = Trainer(


Epoch,Training Loss,Validation Loss,Acc,F1 Macro,F1 Pos
1,0.2671,0.263704,0.879282,0.878873,0.87184
2,0.2231,0.269659,0.880117,0.879688,0.872501
3,0.1569,0.303638,0.883459,0.882789,0.873927


{'eval_loss': 0.3036383390426636,
 'eval_acc': 0.8834586466165414,
 'eval_f1_macro': 0.8827886408205704,
 'eval_f1_pos': 0.8739267962042476,
 'eval_runtime': 2.4296,
 'eval_samples_per_second': 985.36,
 'eval_steps_per_second': 30.87,
 'epoch': 3.0}

In [6]:
import os

path = "/content/drive/MyDrive/teachers/teacherA_xlmr_large"
print("Exists:", os.path.exists(path))
print("Files:", sorted(os.listdir(path))[:30])


Exists: True
Files: ['checkpoint-2874', 'checkpoint-4311']


## Export a “FINAL” Teacher A folder

In [19]:
import os, re
from transformers import AutoTokenizer, AutoModelForSequenceClassification

BASE = "/content/drive/MyDrive/teachers/teacherA_xlmr_large"

# pick latest checkpoint (4311)
ckpts = [d for d in os.listdir(BASE) if d.startswith("checkpoint-")]
ckpts = sorted(ckpts, key=lambda x: int(re.findall(r"\d+", x)[0]))
last_ckpt = os.path.join(BASE, ckpts[-1])
print("Using checkpoint:", last_ckpt)

FINAL_DIR = "/content/drive/MyDrive/teachers/teacherA_xlmr_large_FINAL"

tok = AutoTokenizer.from_pretrained(last_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(last_ckpt)

tok.save_pretrained(FINAL_DIR)
model.save_pretrained(FINAL_DIR)

print("Saved FINAL teacher to:", FINAL_DIR)
print("FINAL files:", sorted(os.listdir(FINAL_DIR))[:20])


Using checkpoint: /content/drive/MyDrive/teachers/teacherA_xlmr_large/checkpoint-4311


The tokenizer you are loading from '/content/drive/MyDrive/teachers/teacherA_xlmr_large/checkpoint-4311' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


Saved FINAL teacher to: /content/drive/MyDrive/teachers/teacherA_xlmr_large_FINAL
FINAL files: ['config.json', 'model.safetensors', 'sentencepiece.bpe.model', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json']


## Step 4 — Saving internal split IDs

In [20]:
train_internal[["id"]].to_csv("/content/drive/MyDrive/teachers/internal_train_ids.csv", index=False)
val_internal[["id"]].to_csv("/content/drive/MyDrive/teachers/internal_val_ids.csv", index=False)
print("Saved split IDs.")


Saved split IDs.
