In [None]:
!pip install transformers datasets wandb scikit-learn imbalanced-learn nlpaug accelerate tqdm
# from google.colab import drive
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer,  TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset

import os
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import wandb
import zipfile
from datetime import datetime


from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import classification_report
from pathlib import Path
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.dummy import DummyClassifier
from pathlib import Path

# Seed the PyTorch and NumPy global random number generators.
# NOTE(review): the literal 42 duplicates RANDOM_STATE from the config cell below;
# the two are independent, so changing one does not change the other.
torch.manual_seed(42)
np.random.seed(42)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# ================== CONFIG ==================
# Everything a reader might want to tune lives in this cell.

# --- Data -----------------------------------
# Single labelled CSV; the train/validation split is carved out of it later.
DATA_PATH = "subtask3_train_eng.csv"

# --- Backbone -------------------------------
# Checkpoints that were tried as alternatives (swap MODEL_NAME to use one):
#   'Twitter/twhin-bert-base'
#   "FacebookAI/xlm-roberta-base"
#   "google/rembert"
#   'distilbert/distilbert-base-multilingual-cased'
#   'metabloit/swahBERT'
#   "castorini/afriberta_large"
#   "roberta-base"
MODEL_NAME = 'microsoft/mdeberta-v3-base'

# --- Tokenisation / batching ----------------
MAX_LENGTH = 128
BATCH_SIZE = 32
ACCUM_STEPS = 2  # gradient accumulation: effective batch size = BATCH_SIZE * ACCUM_STEPS = 64

# --- Optimisation ---------------------------
EPOCHS = 10
LR = 2e-5
WARMUP_RATIO = 0.1
PATIENCE = 3  # epochs without validation improvement before early stopping

# --- Train / validation split ---------------
VAL_SIZE = 0.20  # fraction of rows held out for validation
RANDOM_STATE = 42

# --- Output ---------------------------------
# Directory that receives the best checkpoint and the tuned thresholds.
OUTPUT_DIR = Path("best")
OUTPUT_DIR.mkdir(exist_ok=True)

# --- Labels ---------------------------------
# Target columns of the CSV, in the order used for model outputs and thresholds.
label_cols = [
    "political",
    "racial/ethnic",
    "religious",
    "gender/sexual",
    "other",
]
num_labels = len(label_cols)
# ===========================================

In [None]:

# Log in to Weights & Biases; the run itself is created later with wandb.init().
wandb.login()

In [None]:
# ===========================================

# Load the single labeled file
df = pd.read_csv(DATA_PATH)
print(f"Loaded full dataset: {df.shape}")

# Ensure labels are int
for col in label_cols:
    df[col] = df[col].astype(int)

# Number of positive labels per row. This is a DataFrame column used only as the
# stratification key; it is unrelated to the module-level `num_labels` variable.
df['num_labels'] = df[label_cols].sum(axis=1)

# Stratified train/val split based on number of labels (best we can do for multilabel)
train_df, val_df = train_test_split(
    df,
    test_size=VAL_SIZE,
    stratify=df['num_labels'],
    random_state=RANDOM_STATE
)

print(f"Train: {train_df.shape} | Val: {val_df.shape}")

# Print BOTH distributions so the reader can check that the split is balanced,
# instead of asserting it while showing only the training side.
train_label_count_dist = train_df['num_labels'].value_counts(normalize=True).round(3).sort_index()
val_label_count_dist = val_df['num_labels'].value_counts(normalize=True).round(3).sort_index()
print(f"Labels-per-row distribution (train): {train_label_count_dist.to_dict()}")
print(f"Labels-per-row distribution (val):   {val_label_count_dist.to_dict()}")

In [None]:
# Compute pos_weights ONLY from training set
# pos_weight[j] = (#negatives of label j) / (#positives of label j), so rarer
# labels get a proportionally larger weight on their positive examples.
pos_counts = train_df[label_cols].sum().values
neg_counts = len(train_df) - pos_counts

# A label with no positive training row would yield neg/0 = inf, and an infinite
# pos_weight turns the weighted BCE loss into NaN. Count such a label as having
# one positive instead; labels that do have positives are unaffected.
safe_pos_counts = np.maximum(pos_counts, 1)
pos_weights = torch.tensor(neg_counts / safe_pos_counts, dtype=torch.float)

print("\nPos weights for BCEWithLogitsLoss:")

for label, w in zip(label_cols, pos_weights.numpy().round(2)):
    print(f"  {label}: {w}")

# The tokenizer is loaded at the top of the next cell, right before the dataset
# class that uses it, so it is not loaded a second time here.

In [None]:
# Tokenizer
# Loaded from the same checkpoint name as the model; read as a module-level
# global by TextDataset.__getitem__ below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class TextDataset(Dataset):
    """Dataset over a labelled DataFrame that tokenizes each row on access.

    The frame must provide a ``text`` column and every column named in the
    module-level ``label_cols``. Tokenization relies on the module-level
    ``tokenizer`` and ``MAX_LENGTH``.
    """

    def __init__(self, df):
        # Raw texts as a plain list; labels as one float32 row per example.
        self.texts = df['text'].tolist()
        self.labels = df[label_cols].values.astype(np.float32)

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` plus a ``labels`` tensor."""
        raw_text = str(self.texts[idx])
        tokenized = tokenizer(
            raw_text,
            max_length=MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Drop axis 0 of every tensor returned for this single text.
        sample = {}
        for field, tensor in tokenized.items():
            sample[field] = tensor.squeeze(0)

        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the two splits; tokenization happens lazily inside TextDataset.__getitem__.
train_dataset = TextDataset(train_df)
val_dataset = TextDataset(val_df)

# Only the training loader shuffles. The validation loader keeps the row order of
# val_df, which the training cell relies on when it compares predictions against
# val_df[label_cols].values.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [None]:
# Model
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the pretrained checkpoint with a `num_labels`-output classification head
# and the multi-label problem type.
# NOTE(review): ignore_mismatched_sizes=True presumably lets loading proceed when
# a checkpoint's head has a different size — confirm it is needed for MODEL_NAME.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True

)
# Being the last expression of the cell, this also displays the model.
model.to(device)

In [None]:
# Weighted multi-label loss: each label's positive term is scaled by its pos_weight.
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weights.to(device))

# AdamW over every model parameter at the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

# The scheduler advances once per optimizer update, i.e. once every ACCUM_STEPS
# batches. The floor division mirrors the training loop, which only updates on
# complete accumulation groups.
total_steps = EPOCHS * (len(train_loader) // ACCUM_STEPS)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    int(WARMUP_RATIO * total_steps),
    total_steps,
)

In [None]:
# =============== Threshold Tuning ===============
def _collect_probabilities(model, loader):
    """Run ``model`` over ``loader`` in eval mode; return ``(probs, y_true)`` as NumPy arrays."""
    model.eval()
    prob_chunks = []
    label_chunks = []
    with torch.no_grad():
        for batch in loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            outputs = model(**inputs)
            prob_chunks.append(outputs.logits.sigmoid().cpu())
            # Labels are only compared on the CPU, so they are never moved to the device.
            label_chunks.append(batch['labels'].cpu())
    return torch.cat(prob_chunks).numpy(), torch.cat(label_chunks).numpy()


def find_best_thresholds(model, loader):
    """Tune one decision threshold per label to maximise macro-F1 on ``loader``.

    The search has two stages:

    1. Coarse: a single shared threshold is swept over 0.20 .. 0.75 in steps of
       0.05 and the best one is kept (0.5 for every label if nothing scores
       above zero).
    2. Fine: one label at a time, that label's threshold is swept in steps of
       0.01 within +/-0.15 of its current value (clipped to [0.1, 0.9]) while the
       other labels stay fixed. A candidate is accepted only if it strictly
       improves the current macro-F1, so the result is never worse than the
       coarse optimum.

    Args:
        model: callable returning an object with a ``logits`` attribute; it is
            switched to eval mode and left there.
        loader: iterable of batches (dicts of tensors) that include a ``labels``
            entry; all other entries are passed to the model as keyword arguments.

    Returns:
        1-D NumPy array with one threshold per model output. A label is
        predicted positive when its sigmoid probability is strictly greater
        than its threshold.

    Note:
        Uses the module-level ``device`` and prints the chosen thresholds and
        the resulting macro-F1.
    """
    probs, y_true = _collect_probabilities(model, loader)
    # Take the label count from the model output so the function does not depend
    # on a global that may be out of sync with the model.
    n_labels = probs.shape[1]

    def macro_f1_at(thresh):
        preds = (probs > thresh).astype(int)
        return f1_score(y_true, preds, average='macro', zero_division=0)

    # Coarse global search
    best_thresh = np.ones(n_labels) * 0.5
    best_macro = 0
    for base in np.arange(0.2, 0.8, 0.05):
        thresh = np.array([base] * n_labels)
        macro = macro_f1_at(thresh)
        if macro > best_macro:
            best_macro = macro
            best_thresh = thresh.copy()

    # Fine-tune per label
    thresholds = best_thresh.copy()
    # Start from the score of the current thresholds (not from 0): the fine grid
    # does not necessarily contain the current value, so starting from 0 could
    # replace it with a worse candidate.
    current_macro = macro_f1_at(thresholds)
    for i in range(n_labels):
        best_t = thresholds[i]
        for t in np.arange(max(0.1, thresholds[i]-0.15), min(0.9, thresholds[i]+0.15), 0.01):
            temp_thresh = thresholds.copy()
            temp_thresh[i] = t
            macro = macro_f1_at(temp_thresh)
            if macro > current_macro:
                current_macro = macro
                best_t = t
        thresholds[i] = best_t

    final_macro = f1_score(y_true, (probs > thresholds), average='macro', zero_division=0)
    print(f"\nBest per-label thresholds: {np.round(thresholds, 3)}")
    print(f"Validation Macro-F1 with tuned thresholds: {final_macro:.4f}")
    return thresholds

In [None]:
# Language tag; in this notebook it is only used as the prefix of the W&B run name.
# NOTE(review): DATA_PATH points at "subtask3_train_eng.csv" while this tag says
# 'swa' — confirm which language is actually being trained.
LANG = 'swa'

In [None]:
# Minute-resolution timestamp used as the suffix of the run name.
CURRENT_DATE = datetime.now().strftime("%Y-%m-%d_%H-%M")

# Start the W&B run and record the hyper-parameters of this experiment.
wandb.init(
    name=f"{LANG}_{MODEL_NAME}_{EPOCHS}_{CURRENT_DATE}",
    project="polarization-multilabel",
    config=dict(
        model=MODEL_NAME,
        batch_size=BATCH_SIZE,
        accum_steps=ACCUM_STEPS,
        epochs=EPOCHS,
        learning_rate=LR,
        warmup_ratio=WARMUP_RATIO,
        labels=label_cols,
    ),
)

# Log the per-label positive-class weights once, in a single call.
wandb.log(dict(
    (f"pos_weight/{lbl}", w.item())
    for lbl, w in zip(label_cols, pos_weights)
))

# =============== Training Loop ===============
# Per epoch: train with gradient accumulation, tune per-label thresholds on the
# validation set, score the validation set with those thresholds, then
# checkpoint on improvement or count towards early stopping.
#
# NOTE(review): thresholds are tuned and scored on the same validation split, so
# the reported validation F1 is optimistic; the dev cell gives a fairer estimate.
best_macro_f1 = 0
patience_counter = 0

for epoch in range(EPOCHS):
    model.train()
    # Also discards gradients left over from an incomplete accumulation group at
    # the end of the previous epoch (when len(train_loader) is not a multiple of
    # ACCUM_STEPS); this matches how total_steps is computed for the scheduler.
    optimizer.zero_grad()
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    for step, batch in enumerate(progress):
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        loss = criterion(outputs.logits, labels)
        # Scale so the accumulated gradient is the mean over ACCUM_STEPS batches.
        loss = loss / ACCUM_STEPS
        loss.backward()

        # Undo the scaling for reporting, and update the bar while it is running.
        batch_loss = loss.item() * ACCUM_STEPS
        progress.set_postfix({'loss': batch_loss})

        if (step + 1) % ACCUM_STEPS == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            # One W&B point per optimizer update.
            wandb.log({
                "train/loss": batch_loss,
                "lr": scheduler.get_last_lr()[0],
            })

    # Threshold tuning and evaluation
    thresholds = find_best_thresholds(model, val_loader)

    # Final eval with best thresholds
    # (find_best_thresholds only returns the thresholds, so the validation
    # probabilities are recomputed here.)
    model.eval()
    all_probs = []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            outputs = model(**inputs)
            all_probs.append(outputs.logits.sigmoid().cpu())
    probs = torch.cat(all_probs).numpy()
    preds = (probs > thresholds).astype(int)
    # val_loader is not shuffled, so the rows line up with val_df.
    y_true = val_df[label_cols].values

    macro_f1 = f1_score(y_true, preds, average='macro', zero_division=0)
    micro_f1 = f1_score(y_true, preds, average='micro', zero_division=0)

    # Log every per-epoch value in ONE call so thresholds and validation scores
    # share the same W&B step instead of being spread over one step per label.
    epoch_metrics = {f"threshold/{lbl}": float(t) for lbl, t in zip(label_cols, thresholds)}
    epoch_metrics["val/macro_f1"] = macro_f1
    epoch_metrics["val/micro_f1"] = micro_f1
    wandb.log(epoch_metrics)

    print(f"Epoch {epoch+1} → Macro-F1: {macro_f1:.4f} | Micro-F1: {micro_f1:.4f}")

    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        patience_counter = 0
        # Save model, tokenizer and the thresholds tuned for this very checkpoint.
        model.save_pretrained(OUTPUT_DIR / "best_model")
        tokenizer.save_pretrained(OUTPUT_DIR / "best_model")
        np.save(OUTPUT_DIR / "best_thresholds.npy", thresholds)
        pd.DataFrame({'label': label_cols, 'threshold': thresholds}).to_csv(OUTPUT_DIR / "thresholds.csv", index=False)
        print(f"✓ New best model saved (Macro-F1: {best_macro_f1:.4f})")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print("Early stopping.")
            break

# End-of-training banner: best validation score and where the artifacts were written.
print(
    "\n" + "=" * 50,
    "TRAINING FINISHED",
    f"Best Validation Macro-F1: {best_macro_f1:.4f}",
    f"Model + tokenizer saved to: {OUTPUT_DIR / 'best_model'}",
    f"Optimal thresholds saved to: {OUTPUT_DIR / 'best_thresholds.npy'} and thresholds.csv",
    "=" * 50,
    sep="\n",
)

In [None]:
# ===================== DEV SET INFERENCE (SINGLE CELL) =====================

import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score

# -------- PATHS --------
# NOTE(review): DEV_PATH lives on Google Drive, but the drive.mount() cell near
# the top of the notebook is commented out — mount the drive or adjust the path.
DEV_PATH = "/content/drive/MyDrive/polarization_dataset/subtask3/dev/swa.csv"
MODEL_DIR = "best/best_model"
THRESH_PATH = "best/best_thresholds.npy"

# -------- CONFIG --------
# These rebind the notebook-level MAX_LENGTH, BATCH_SIZE, label_cols and device.
MAX_LENGTH = 128
BATCH_SIZE = 32

label_cols = [
    "stereotype",
    "vilification",
    "dehumanization",
    "extreme_language",
    "lack_of_empathy",
    "invalidation",
]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------- LOAD MODEL / TOKENIZER / THRESHOLDS --------
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.to(device)
model.eval()

thresholds = np.load(THRESH_PATH)
print("Loaded thresholds:", np.round(thresholds, 3))

# -------- SANITY CHECK --------
# The model head, the saved thresholds and label_cols must all describe the same
# set of labels. Fail here with a clear message rather than later with a shape or
# index error during evaluation or while writing predictions.
if not (len(label_cols) == model.config.num_labels == len(thresholds)):
    raise ValueError(
        f"Label mismatch: label_cols lists {len(label_cols)} labels, the checkpoint in "
        f"{MODEL_DIR!r} predicts {model.config.num_labels} and {THRESH_PATH!r} holds "
        f"{len(thresholds)} thresholds. Use the same label columns for training and inference."
    )

# -------- LOAD DEV DATA --------
dev_df = pd.read_csv(DEV_PATH)

# FIX: fill missing labels safely
# Missing label cells become 0 before the integer cast.
# NOTE(review): if the dev file has no gold labels at all, the F1 scores computed
# further down are measured against all-zero targets — confirm the file is labelled.
dev_df[label_cols] = dev_df[label_cols].fillna(0).astype(int)

# -------- DATASET --------
class DevDataset(Dataset):
    """Label-free dataset over a sequence of texts, tokenized on access.

    Tokenization relies on the module-level ``tokenizer`` and ``MAX_LENGTH``.
    """

    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        """Number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for text ``idx`` with axis 0 squeezed out."""
        raw_text = str(self.texts[idx])
        encoded = tokenizer(
            raw_text,
            max_length=MAX_LENGTH,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)
        return sample

# Batches are produced in the row order of dev_df (no shuffling is requested),
# so predictions line up with dev_df rows.
dev_loader = DataLoader(
    DevDataset(dev_df["text"].tolist()),
    batch_size=BATCH_SIZE,
)

# -------- INFERENCE --------
# Collect sigmoid probabilities batch by batch without tracking gradients.
all_probs = []

with torch.no_grad():
    for batch in dev_loader:
        inputs = {k: v.to(device) for k, v in batch.items()}
        probs = model(**inputs).logits.sigmoid().cpu().numpy()
        all_probs.append(probs)

# Stack the per-batch arrays, then compare each label's probability with that
# label's tuned threshold (strictly greater means positive).
probs = np.vstack(all_probs)
preds = (probs > thresholds).astype(int)

# -------- EVALUATION --------
# Gold labels in the same column order as the predictions.
y_true = dev_df[label_cols].values

# zero_division=0: a label with no true and no predicted positives scores 0
# instead of raising a warning.
macro_f1 = f1_score(y_true, preds, average="macro", zero_division=0)
micro_f1 = f1_score(y_true, preds, average="micro", zero_division=0)

print(f"DEV Macro-F1: {macro_f1:.4f}")
print(f"DEV Micro-F1: {micro_f1:.4f}")

# -------- SAVE PREDICTIONS --------
# Single source of truth for the output file, so the log message cannot drift
# away from the path that is actually written.
PRED_PATH = "pred_swa.csv"

# One row per dev example: its id followed by a 0/1 column for each label.
pred_df = dev_df[["id"]].copy()
for i, lbl in enumerate(label_cols):
    pred_df[lbl] = preds[:, i]

pred_df.to_csv(PRED_PATH, index=False)
print(f"Saved predictions to {PRED_PATH}")

# ========================================================================

# Close the W&B run that was opened in the training cell.
wandb.finish()

In [None]:
# NOTE(review): duplicates the wandb.finish() at the end of the dev-inference cell;
# presumably kept so the run can be closed when that cell is skipped — confirm.
wandb.finish()