In [5]:
"""
SemEval Task 9 - Subtask 1: BALANCED APPROACH
Target: 0.85+ on dev with <0.03 train/dev gap

BALANCED STRATEGY:
âœ“ Moderate regularization (not too strong, not too weak)
âœ“ Light augmentation only for minority class
âœ“ Better data sampling strategy
âœ“ Optimal hyperparameters from literature
âœ“ Focus on robust threshold tuning
"""

import os, pandas as pd, numpy as np, torch, torch.nn as nn
import re, random, gc, warnings
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.cuda.amp import autocast, GradScaler
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                         get_linear_schedule_with_warmup, set_seed)
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive')
set_seed(42)

class Config:
    BASE_PATH = '/content/drive/MyDrive/NLP'
    TRAIN_ENG = f'{BASE_PATH}/subtask1/train/eng.csv'
    TRAIN_SWA = f'{BASE_PATH}/subtask1/train/swa.csv'
    DEV_ENG = f'{BASE_PATH}/subtask1/dev/eng.csv'
    DEV_SWA = f'{BASE_PATH}/subtask1/dev/swa.csv'
    OUTPUT_DIR = '/content/subtask1/models'
    PREDICTIONS_DIR = '/content/subtask_1'
    os.makedirs(PREDICTIONS_DIR, exist_ok=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Models
    MODEL_ENG = 'microsoft/deberta-v3-base'
    MODEL_SWA = 'xlm-roberta-base'
    MAX_LENGTH = 128

    # BALANCED TRAINING - sweet spot
    BATCH_SIZE = 16
    GRAD_ACCUM = 2
    EPOCHS = 12  # Moderate
    LR_ENG = 1.5e-5  # Balanced LR
    LR_SWA = 2e-5
    WEIGHT_DECAY = 0.02  # Moderate weight decay
    WARMUP_RATIO = 0.15
    DROPOUT = 0.2  # Moderate dropout
    MAX_GRAD_NORM = 1.0

    # Loss
    FOCAL_ALPHA_ENG = 0.72
    FOCAL_GAMMA_ENG = 2.2
    FOCAL_ALPHA_SWA = 0.65
    FOCAL_GAMMA_SWA = 2.2

    # Validation
    VAL_SIZE = 0.18

    # Light augmentation
    USE_AUGMENTATION = True
    AUG_PROBABILITY = 0.25

    USE_FP16 = True
    SEED = 42
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Device: {Config.DEVICE}")

class TextPreprocessor:
    @staticmethod
    def clean(text):
        text = str(text).strip().lower()
        text = re.sub(r'http\S+|www\.\S+', ' [url] ', text)
        text = re.sub(r'@\w+', ' [user] ', text)
        text = re.sub(r'#(\w+)', r' \1 ', text)
        text = re.sub(r'(.)\1{3,}', r'\1\1', text)
        text = re.sub(r'([!?.]){2,}', r'\1', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text if text else "[empty]"

class LightAugmenter:
    """Very light augmentation - only word swapping"""
    def __init__(self, p=0.25):
        self.p = p
        self.protected = {'not', 'no', 'never', 'but', 'however', 'although',
                         '[url]', '[user]', 'only', 'all', 'always', 'never'}

    def augment(self, text):
        if random.random() > self.p:
            return text

        words = text.split()
        if len(words) < 4:
            return text

        # Simple adjacent word swap
        swappable = [i for i in range(len(words)-1)
                    if words[i] not in self.protected and words[i+1] not in self.protected]

        if swappable:
            idx = random.choice(swappable)
            words[idx], words[idx+1] = words[idx+1], words[idx]

        return ' '.join(words)

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.72, gamma=2.2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, targets):
        bce = nn.functional.binary_cross_entropy_with_logits(
            logits, targets.float(), reduction='none'
        )
        pt = torch.exp(-bce)
        loss = self.alpha * (1 - pt) ** self.gamma * bce
        return loss.mean()

class RobustModel(nn.Module):
    def __init__(self, model_name, dropout=0.2):
        super().__init__()
        self.transformer = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=1,
            hidden_dropout_prob=dropout,
            attention_probs_dropout_prob=dropout
        )

    def forward(self, input_ids, attention_mask):
        return self.transformer(input_ids=input_ids, attention_mask=attention_mask)

class PolarizationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len, augmenter=None, train=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.augmenter = augmenter
        self.train = train
        self.prep = TextPreprocessor()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.prep.clean(self.texts[idx])
        label = self.labels[idx]

        # Light augmentation only for minority class during training
        if self.train and self.augmenter and label == 1:
            text = self.augmenter.augment(text)

        enc = self.tokenizer(text, max_length=self.max_len, padding='max_length',
                           truncation=True, return_tensors='pt')

        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

def load_data(path):
    df = pd.read_csv(path)
    df['text'] = df['text'].apply(TextPreprocessor.clean)
    df = df[df['text'].str.len() > 0].reset_index(drop=True)
    if 'polarization' in df.columns:
        dist = df['polarization'].value_counts()
        print(f"  Samples: {len(df)}")
        print(f"  Distribution: 0={dist[0]}, 1={dist[1]} (ratio {dist[0]/dist[1]:.1f}:1)")
    return df

def find_threshold(model, loader, device, use_fp16=False):
    print("  Finding optimal threshold...")
    model.eval()
    probs, labels = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="  Threshold", leave=False):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            lbl = batch['label']

            if use_fp16:
                with autocast():
                    out = model(ids, mask)
                    p = torch.sigmoid(out.logits.squeeze(-1))
            else:
                out = model(ids, mask)
                p = torch.sigmoid(out.logits.squeeze(-1))

            probs.extend(p.cpu().numpy())
            labels.extend(lbl.numpy())

    probs, labels = np.array(probs), np.array(labels)

    # Smart threshold search
    pos_probs = probs[labels == 1]
    neg_probs = probs[labels == 0]

    if len(pos_probs) > 0 and len(neg_probs) > 0:
        min_t = max(0.2, np.percentile(pos_probs, 10))
        max_t = min(0.8, np.percentile(neg_probs, 90))
    else:
        min_t, max_t = 0.25, 0.75

    thresholds = np.linspace(min_t, max_t, 121)
    best_t, best_f1 = 0.5, 0.0
    results = []

    for t in thresholds:
        f1 = f1_score(labels, (probs >= t).astype(int), average='macro')
        results.append((t, f1))
        if f1 > best_f1:
            best_f1, best_t = f1, t

    # Show top 3
    results.sort(key=lambda x: x[1], reverse=True)
    print(f"  Top 3: ", end="")
    for i, (t, f) in enumerate(results[:3]):
        print(f"{t:.3f}({f:.4f})", end=" ")
    print()
    print(f"  â†’ Best: Threshold={best_t:.3f}, F1={best_f1:.4f}")

    return best_t, best_f1

def train_epoch(model, loader, opt, sched, crit, device, grad_accum, scaler=None):
    model.train()
    total_loss = 0
    preds, labels = [], []

    opt.zero_grad()

    for step, batch in enumerate(tqdm(loader, desc="  Train", leave=False)):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        lbl = batch['label'].to(device)

        if scaler:
            with autocast():
                out = model(ids, mask)
                loss = crit(out.logits.squeeze(-1), lbl) / grad_accum
                p = torch.sigmoid(out.logits.squeeze(-1))

            batch_loss = loss.item() * grad_accum
            preds.extend((p >= 0.5).long().cpu().numpy())
            labels.extend(lbl.cpu().numpy())

            scaler.scale(loss).backward()

            if (step + 1) % grad_accum == 0:
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), Config.MAX_GRAD_NORM)
                scaler.step(opt)
                scaler.update()
                sched.step()
                opt.zero_grad()
        else:
            out = model(ids, mask)
            loss = crit(out.logits.squeeze(-1), lbl) / grad_accum

            with torch.no_grad():
                p = torch.sigmoid(out.logits.squeeze(-1))

            batch_loss = loss.item() * grad_accum
            preds.extend((p >= 0.5).long().cpu().numpy())
            labels.extend(lbl.cpu().numpy())

            loss.backward()

            if (step + 1) % grad_accum == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), Config.MAX_GRAD_NORM)
                opt.step()
                sched.step()
                opt.zero_grad()

        total_loss += batch_loss

    return total_loss / len(loader), f1_score(labels, preds, average='macro')

def evaluate(model, loader, crit, device, thresh=0.5, use_fp16=False, show_report=True):
    model.eval()
    total_loss = 0
    preds, labels = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="  Eval", leave=False):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            lbl = batch['label'].to(device)

            if use_fp16:
                with autocast():
                    out = model(ids, mask)
                    loss = crit(out.logits.squeeze(-1), lbl)
            else:
                out = model(ids, mask)
                loss = crit(out.logits.squeeze(-1), lbl)

            total_loss += loss.item()
            p = torch.sigmoid(out.logits.squeeze(-1))
            preds.extend((p >= thresh).long().cpu().numpy())
            labels.extend(lbl.cpu().numpy())

    f1 = f1_score(labels, preds, average='macro')

    if show_report:
        f1_per_class = f1_score(labels, preds, average=None)
        print(f"\n{classification_report(labels, preds, target_names=['Non-Pol','Pol'], digits=4)}")
        print(f"Per-class F1: Non-Pol={f1_per_class[0]:.4f}, Pol={f1_per_class[1]:.4f}")

    return total_loss / len(loader), f1

def train_model(train_df, lang, model_name, lr, alpha, gamma, config):
    print(f"\n{'='*70}")
    print(f"TRAINING: {lang.upper()}")
    print(f"{'='*70}")

    # Stratified split
    train_data, val_data = train_test_split(
        train_df, test_size=config.VAL_SIZE, random_state=config.SEED,
        stratify=train_df['polarization']
    )

    print(f"Train: {len(train_data)}, Val: {len(val_data)}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    aug = LightAugmenter(config.AUG_PROBABILITY) if config.USE_AUGMENTATION else None

    train_ds = PolarizationDataset(train_data['text'].values, train_data['polarization'].values,
                                   tokenizer, config.MAX_LENGTH, aug, True)
    val_ds = PolarizationDataset(val_data['text'].values, val_data['polarization'].values,
                                 tokenizer, config.MAX_LENGTH, None, False)

    # Balanced sampling with moderate weighting
    counts = np.bincount(train_data['polarization'].values)
    weights = (1.0 / counts) ** 0.75  # Between sqrt(0.5) and 1.0
    sample_weights = weights[train_data['polarization'].values]
    sampler = WeightedRandomSampler(sample_weights, len(sample_weights), True)

    train_loader = DataLoader(train_ds, config.BATCH_SIZE, sampler=sampler,
                              num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_ds, config.BATCH_SIZE*2, False,
                           num_workers=2, pin_memory=True)

    model = RobustModel(model_name, config.DROPOUT).to(config.DEVICE)

    opt = torch.optim.AdamW(model.parameters(), lr=lr,
                           weight_decay=config.WEIGHT_DECAY, eps=1e-8)

    steps = len(train_loader) * config.EPOCHS // config.GRAD_ACCUM
    warmup = int(steps * config.WARMUP_RATIO)
    sched = get_linear_schedule_with_warmup(opt, warmup, steps)

    crit = FocalLoss(alpha, gamma)
    scaler = GradScaler() if config.USE_FP16 else None

    best_f1, best_thresh = 0.0, 0.5
    patience, p_cnt = 4, 0

    for ep in range(config.EPOCHS):
        print(f"\n[Epoch {ep+1}/{config.EPOCHS}]")

        tr_loss, tr_f1 = train_epoch(model, train_loader, opt, sched, crit,
                                     config.DEVICE, config.GRAD_ACCUM, scaler)

        # Quick eval without report
        val_loss, val_f1 = evaluate(model, val_loader, crit, config.DEVICE, 0.5,
                                    config.USE_FP16, show_report=False)

        print(f"Train: Loss={tr_loss:.4f}, F1={tr_f1:.4f}")
        print(f"Val:   Loss={val_loss:.4f}, F1={val_f1:.4f}")

        # Find threshold from epoch 3
        if ep >= 3:
            thresh, _ = find_threshold(model, val_loader, config.DEVICE, config.USE_FP16)
            _, val_f1 = evaluate(model, val_loader, crit, config.DEVICE, thresh,
                               config.USE_FP16, show_report=(ep >= config.EPOCHS - 2))

            if val_f1 > best_f1:
                best_f1, best_thresh, p_cnt = val_f1, thresh, 0
                torch.save({
                    'model': model.state_dict(),
                    'threshold': float(thresh),
                    'f1': float(val_f1)
                }, f"{config.OUTPUT_DIR}/best_{lang}.pt", _use_new_zipfile_serialization=True)
                print(f"âœ“ Saved best (F1={best_f1:.4f}, T={best_thresh:.3f})")
            else:
                p_cnt += 1
                print(f"  No improvement ({p_cnt}/{patience})")

        if ep >= 5 and p_cnt >= patience:
            print(f"Early stopping at epoch {ep+1}")
            break

    # Load best model
    ckpt = torch.load(f"{config.OUTPUT_DIR}/best_{lang}.pt", map_location=config.DEVICE, weights_only=False)
    model.load_state_dict(ckpt['model'])
    best_f1 = ckpt['f1']
    best_thresh = ckpt['threshold']

    print(f"\n{'='*70}")
    print(f"FINAL: Val F1={best_f1:.4f}, Threshold={best_thresh:.3f}")
    print(f"{'='*70}\n")

    return model, tokenizer, best_f1, best_thresh

def predict(model, tokenizer, test_file, out_file, thresh, config):
    print(f"\nPredicting: {test_file}")

    df = pd.read_csv(test_file)
    df['text'] = df['text'].apply(TextPreprocessor.clean)

    ds = PolarizationDataset(df['text'].values, np.zeros(len(df)), tokenizer,
                            config.MAX_LENGTH, None, False)
    loader = DataLoader(ds, config.BATCH_SIZE*2, False, num_workers=2, pin_memory=True)

    model.eval()
    preds = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="  Predict", leave=False):
            ids = batch['input_ids'].to(config.DEVICE)
            mask = batch['attention_mask'].to(config.DEVICE)

            if config.USE_FP16:
                with autocast():
                    out = model(ids, mask)
                    p = torch.sigmoid(out.logits.squeeze(-1))
            else:
                out = model(ids, mask)
                p = torch.sigmoid(out.logits.squeeze(-1))

            preds.extend((p >= thresh).long().cpu().numpy())

    out_df = pd.DataFrame({'id': df['id'], 'polarization': preds})
    out_df.to_csv(out_file, index=False)

    dist = pd.Series(preds).value_counts()
    print(f"âœ“ Saved: {out_file}")
    print(f"  Distribution: {dict(dist)}, Pos: {dist.get(1,0)/len(preds)*100:.1f}%\n")

if __name__ == "__main__":
    print("\n" + "="*70)
    print("SemEval Task 9 - Subtask 1: BALANCED APPROACH")
    print("="*70)
    print("\nStrategy:")
    print("  â€¢ Moderate regularization (WD=0.02, Dropout=0.2)")
    print("  â€¢ Light augmentation (25% prob, word swap only)")
    print("  â€¢ Balanced sampling (weights^0.75)")
    print("  â€¢ 12 epochs with patience=4")
    print("  â€¢ Conservative threshold search")
    print("="*70)

    print("\nðŸ“Š English...")
    eng_train = load_data(Config.TRAIN_ENG)
    eng_model, eng_tok, eng_f1, eng_t = train_model(
        eng_train, 'english', Config.MODEL_ENG, Config.LR_ENG,
        Config.FOCAL_ALPHA_ENG, Config.FOCAL_GAMMA_ENG, Config
    )
    predict(eng_model, eng_tok, Config.DEV_ENG,
           f"{Config.PREDICTIONS_DIR}/pred_eng.csv", eng_t, Config)

    del eng_model
    gc.collect()
    torch.cuda.empty_cache()

    print("\nðŸ“Š Kiswahili...")
    swa_train = load_data(Config.TRAIN_SWA)
    swa_model, swa_tok, swa_f1, swa_t = train_model(
        swa_train, 'kiswahili', Config.MODEL_SWA, Config.LR_SWA,
        Config.FOCAL_ALPHA_SWA, Config.FOCAL_GAMMA_SWA, Config
    )
    predict(swa_model, swa_tok, Config.DEV_SWA,
           f"{Config.PREDICTIONS_DIR}/pred_swa.csv", swa_t, Config)

    print("\n" + "="*70)
    print("RESULTS")
    print("="*70)
    print(f"English:   Val F1={eng_f1:.4f}, Threshold={eng_t:.3f}")
    print(f"Kiswahili: Val F1={swa_f1:.4f}, Threshold={swa_t:.3f}")
    print(f"Average:   Val F1={(eng_f1+swa_f1)/2:.4f}")
    print("="*70)

    print("\nðŸ“¦ Creating submission...")
    !zip -r -q subtask_1.zip subtask_1
    print("âœ… DONE!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Device: cuda

SemEval Task 9 - Subtask 1: BALANCED APPROACH

Strategy:
  â€¢ Moderate regularization (WD=0.02, Dropout=0.2)
  â€¢ Light augmentation (25% prob, word swap only)
  â€¢ Balanced sampling (weights^0.75)
  â€¢ 12 epochs with patience=4
  â€¢ Conservative threshold search

ðŸ“Š English...
  Samples: 3222
  Distribution: 0=2047, 1=1175 (ratio 1.7:1)

TRAINING: ENGLISH
Train: 2642, Val: 580


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Epoch 1/12]


  Train:   0%|          | 0/166 [00:00<?, ?it/s]

  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

Train: Loss=0.1091, F1=0.4517
Val:   Loss=0.0986, F1=0.4076

[Epoch 2/12]


  Train:   0%|          | 0/166 [00:00<?, ?it/s]

  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

Train: Loss=0.0799, F1=0.7662
Val:   Loss=0.0773, F1=0.7972

[Epoch 3/12]


  Train:   0%|          | 0/166 [00:00<?, ?it/s]

  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

Train: Loss=0.0675, F1=0.8255
Val:   Loss=0.0770, F1=0.7921

[Epoch 4/12]


  Train:   0%|          | 0/166 [00:00<?, ?it/s]

  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

Train: Loss=0.0532, F1=0.8649
Val:   Loss=0.0937, F1=0.8002
  Finding optimal threshold...


  Threshold:   0%|          | 0/19 [00:00<?, ?it/s]

  Top 3: 0.617(0.8162) 0.620(0.8162) 0.678(0.8152) 
  â†’ Best: Threshold=0.617, F1=0.8162


  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

âœ“ Saved best (F1=0.8162, T=0.617)

[Epoch 5/12]


  Train:   0%|          | 0/166 [00:00<?, ?it/s]

  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

Train: Loss=0.0475, F1=0.8868
Val:   Loss=0.0963, F1=0.7997
  Finding optimal threshold...


  Threshold:   0%|          | 0/19 [00:00<?, ?it/s]

  Top 3: 0.621(0.8181) 0.589(0.8167) 0.617(0.8164) 
  â†’ Best: Threshold=0.621, F1=0.8181


  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

âœ“ Saved best (F1=0.8181, T=0.621)

[Epoch 6/12]


  Train:   0%|          | 0/166 [00:00<?, ?it/s]

  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

Train: Loss=0.0402, F1=0.9049
Val:   Loss=0.1286, F1=0.7664
  Finding optimal threshold...


  Threshold:   0%|          | 0/19 [00:00<?, ?it/s]

  Top 3: 0.699(0.8219) 0.703(0.8216) 0.706(0.8216) 
  â†’ Best: Threshold=0.699, F1=0.8219


  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

âœ“ Saved best (F1=0.8219, T=0.699)

[Epoch 7/12]


  Train:   0%|          | 0/166 [00:00<?, ?it/s]

  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

Train: Loss=0.0350, F1=0.9200
Val:   Loss=0.1072, F1=0.8266
  Finding optimal threshold...


  Threshold:   0%|          | 0/19 [00:00<?, ?it/s]

  Top 3: 0.514(0.8293) 0.496(0.8286) 0.518(0.8273) 
  â†’ Best: Threshold=0.514, F1=0.8293


  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

âœ“ Saved best (F1=0.8293, T=0.514)

[Epoch 8/12]


  Train:   0%|          | 0/166 [00:00<?, ?it/s]

  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

Train: Loss=0.0263, F1=0.9423
Val:   Loss=0.1275, F1=0.8093
  Finding optimal threshold...


  Threshold:   0%|          | 0/19 [00:00<?, ?it/s]

  Top 3: 0.388(0.8195) 0.391(0.8195) 0.381(0.8179) 
  â†’ Best: Threshold=0.388, F1=0.8195


  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

  No improvement (1/4)

[Epoch 9/12]


  Train:   0%|          | 0/166 [00:00<?, ?it/s]

  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e8cf0d02520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e8cf0d02520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

Train: Loss=0.0259, F1=0.9439
Val:   Loss=0.1273, F1=0.8184
  Finding optimal threshold...


  Threshold:   0%|          | 0/19 [00:00<?, ?it/s]

  Top 3: 0.444(0.8246) 0.447(0.8246) 0.451(0.8246) 
  â†’ Best: Threshold=0.444, F1=0.8246


  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

  No improvement (2/4)

[Epoch 10/12]


  Train:   0%|          | 0/166 [00:00<?, ?it/s]

  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

Train: Loss=0.0228, F1=0.9517
Val:   Loss=0.1484, F1=0.8013
  Finding optimal threshold...


  Threshold:   0%|          | 0/19 [00:00<?, ?it/s]

  Top 3: 0.416(0.8147) 0.375(0.8143) 0.279(0.8139) 
  â†’ Best: Threshold=0.416, F1=0.8147


  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

  No improvement (3/4)

[Epoch 11/12]


  Train:   0%|          | 0/166 [00:00<?, ?it/s]

  Eval:   0%|          | 0/19 [00:00<?, ?it/s]

Train: Loss=0.0178, F1=0.9612
Val:   Loss=0.1605, F1=0.8080
  Finding optimal threshold...


  Threshold:   0%|          | 0/19 [00:00<?, ?it/s]

  Top 3: 0.430(0.8172) 0.433(0.8172) 0.438(0.8168) 
  â†’ Best: Threshold=0.430, F1=0.8172


  Eval:   0%|          | 0/19 [00:00<?, ?it/s]


              precision    recall  f1-score   support

     Non-Pol     0.8556    0.8859    0.8705       368
         Pol     0.7889    0.7406    0.7640       212

    accuracy                         0.8328       580
   macro avg     0.8223    0.8132    0.8172       580
weighted avg     0.8313    0.8328    0.8316       580

Per-class F1: Non-Pol=0.8705, Pol=0.7640
  No improvement (4/4)
Early stopping at epoch 11

FINAL: Val F1=0.8293, Threshold=0.514


Predicting: /content/drive/MyDrive/NLP/subtask1/dev/eng.csv


  Predict:   0%|          | 0/5 [00:00<?, ?it/s]

âœ“ Saved: /content/subtask_1/pred_eng.csv
  Distribution: {0: np.int64(104), 1: np.int64(56)}, Pos: 35.0%


ðŸ“Š Kiswahili...
  Samples: 6991
  Distribution: 0=3487, 1=3504 (ratio 1.0:1)

TRAINING: KISWAHILI
Train: 5732, Val: 1259


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Epoch 1/12]


  Train:   0%|          | 0/359 [00:00<?, ?it/s]

  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Train: Loss=0.0988, F1=0.5148
Val:   Loss=0.0868, F1=0.6844

[Epoch 2/12]


  Train:   0%|          | 0/359 [00:00<?, ?it/s]

  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Train: Loss=0.0881, F1=0.6720
Val:   Loss=0.0803, F1=0.7432

[Epoch 3/12]


  Train:   0%|          | 0/359 [00:00<?, ?it/s]

  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Train: Loss=0.0812, F1=0.7249
Val:   Loss=0.0936, F1=0.7517

[Epoch 4/12]


  Train:   0%|          | 0/359 [00:00<?, ?it/s]

  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Train: Loss=0.0711, F1=0.7738
Val:   Loss=0.0878, F1=0.7443
  Finding optimal threshold...


  Threshold:   0%|          | 0/40 [00:00<?, ?it/s]

  Top 3: 0.624(0.7619) 0.622(0.7612) 0.626(0.7611) 
  â†’ Best: Threshold=0.624, F1=0.7619


  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

âœ“ Saved best (F1=0.7619, T=0.624)

[Epoch 5/12]


  Train:   0%|          | 0/359 [00:00<?, ?it/s]

  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Train: Loss=0.0679, F1=0.7817
Val:   Loss=0.0747, F1=0.7657
  Finding optimal threshold...


  Threshold:   0%|          | 0/40 [00:00<?, ?it/s]

  Top 3: 0.527(0.7788) 0.529(0.7787) 0.530(0.7786) 
  â†’ Best: Threshold=0.527, F1=0.7788


  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

âœ“ Saved best (F1=0.7788, T=0.527)

[Epoch 6/12]


  Train:   0%|          | 0/359 [00:00<?, ?it/s]

  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Train: Loss=0.0632, F1=0.7895
Val:   Loss=0.0776, F1=0.7648
  Finding optimal threshold...


  Threshold:   0%|          | 0/40 [00:00<?, ?it/s]

  Top 3: 0.571(0.7901) 0.564(0.7896) 0.568(0.7894) 
  â†’ Best: Threshold=0.571, F1=0.7901


  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

âœ“ Saved best (F1=0.7901, T=0.571)

[Epoch 7/12]


  Train:   0%|          | 0/359 [00:00<?, ?it/s]

  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Train: Loss=0.0637, F1=0.7929
Val:   Loss=0.0811, F1=0.7722
  Finding optimal threshold...


  Threshold:   0%|          | 0/40 [00:00<?, ?it/s]

  Top 3: 0.596(0.7904) 0.598(0.7896) 0.592(0.7889) 
  â†’ Best: Threshold=0.596, F1=0.7904


  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

âœ“ Saved best (F1=0.7904, T=0.596)

[Epoch 8/12]


  Train:   0%|          | 0/359 [00:00<?, ?it/s]

  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Train: Loss=0.0587, F1=0.8222
Val:   Loss=0.0804, F1=0.7751
  Finding optimal threshold...


  Threshold:   0%|          | 0/40 [00:00<?, ?it/s]

  Top 3: 0.548(0.7827) 0.550(0.7826) 0.552(0.7817) 
  â†’ Best: Threshold=0.548, F1=0.7827


  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

  No improvement (1/4)

[Epoch 9/12]


  Train:   0%|          | 0/359 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e8cf0d02520><function _MultiProcessingDataLoaderIter.__del__ at 0x7e8cf0d02520>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
        if w.is_alive():if w.is_alive():

            ^ ^ ^^^^^^^^^^^^^^^^^^^^^
^  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive

  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
        assert self.

  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Train: Loss=0.0584, F1=0.8126
Val:   Loss=0.0698, F1=0.7884
  Finding optimal threshold...


  Threshold:   0%|          | 0/40 [00:00<?, ?it/s]

  Top 3: 0.497(0.7908) 0.523(0.7893) 0.499(0.7892) 
  â†’ Best: Threshold=0.497, F1=0.7908


  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

âœ“ Saved best (F1=0.7908, T=0.497)

[Epoch 10/12]


  Train:   0%|          | 0/359 [00:00<?, ?it/s]

  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Train: Loss=0.0538, F1=0.8337
Val:   Loss=0.0787, F1=0.7887
  Finding optimal threshold...


  Threshold:   0%|          | 0/40 [00:00<?, ?it/s]

  Top 3: 0.583(0.7952) 0.581(0.7937) 0.585(0.7936) 
  â†’ Best: Threshold=0.583, F1=0.7952


  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e8cf0d02520>
<function _MultiProcessingDataLoaderIter.__del__ at 0x7e8cf0d02520>Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    
self._shutdown_workers()Traceback (most recent call last):

  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
        if w.is_alive():self._shutdown_workers()

    File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
      if w.is_alive(): 
     ^ ^ ^ ^ ^^^^^^^^^^^^^^
^  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
^    assert self._parent_pid == os.getpid(), 'can only test a child process'^
 ^  ^    ^ 
   File "/usr/l

âœ“ Saved best (F1=0.7952, T=0.583)

[Epoch 11/12]


  Train:   0%|          | 0/359 [00:00<?, ?it/s]

  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Train: Loss=0.0563, F1=0.8279
Val:   Loss=0.0762, F1=0.7749
  Finding optimal threshold...


  Threshold:   0%|          | 0/40 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e8cf0d02520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
Exception ignored in:     <function _MultiProcessingDataLoaderIter.__del__ at 0x7e8cf0d02520>self._shutdown_workers()
Traceback (most recent call last):

  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
        if w.is_alive():self._shutdown_workers()

  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
      if w.is_alive(): 
        ^ ^  ^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
^    ^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
assert self._par

  Top 3: 0.592(0.7939) 0.586(0.7933) 0.588(0.7932) 
  â†’ Best: Threshold=0.592, F1=0.7939


  Eval:   0%|          | 0/40 [00:00<?, ?it/s]


              precision    recall  f1-score   support

     Non-Pol     0.7548    0.8726    0.8095       628
         Pol     0.8499    0.7179    0.7784       631

    accuracy                         0.7951      1259
   macro avg     0.8024    0.7953    0.7939      1259
weighted avg     0.8025    0.7951    0.7939      1259

Per-class F1: Non-Pol=0.8095, Pol=0.7784
  No improvement (1/4)

[Epoch 12/12]


  Train:   0%|          | 0/359 [00:00<?, ?it/s]

  Eval:   0%|          | 0/40 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e8cf0d02520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e8cf0d02520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

Train: Loss=0.0529, F1=0.8348
Val:   Loss=0.0779, F1=0.7770
  Finding optimal threshold...


  Threshold:   0%|          | 0/40 [00:00<?, ?it/s]

  Top 3: 0.617(0.7951) 0.600(0.7948) 0.605(0.7947) 
  â†’ Best: Threshold=0.617, F1=0.7951


  Eval:   0%|          | 0/40 [00:00<?, ?it/s]


              precision    recall  f1-score   support

     Non-Pol     0.7514    0.8854    0.8129       628
         Pol     0.8613    0.7084    0.7774       631

    accuracy                         0.7967      1259
   macro avg     0.8063    0.7969    0.7951      1259
weighted avg     0.8064    0.7967    0.7951      1259

Per-class F1: Non-Pol=0.8129, Pol=0.7774
  No improvement (2/4)

FINAL: Val F1=0.7952, Threshold=0.583


Predicting: /content/drive/MyDrive/NLP/subtask1/dev/swa.csv


  Predict:   0%|          | 0/11 [00:00<?, ?it/s]

âœ“ Saved: /content/subtask_1/pred_swa.csv
  Distribution: {0: np.int64(196), 1: np.int64(153)}, Pos: 43.8%


RESULTS
English:   Val F1=0.8293, Threshold=0.514
Kiswahili: Val F1=0.7952, Threshold=0.583
Average:   Val F1=0.8123

ðŸ“¦ Creating submission...
âœ… DONE!
