In [None]:
# Mount Google Drive at /content/gdrive/ so later cells can read files stored under it.
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
# Print the current working directory; relative paths such as ./outputs resolve against it.
!pwd

/content


In [None]:
import os
import re
import pickle
from pathlib import Path
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NLP tools
import nltk
from nltk.corpus import stopwords

# Scikit-learn utilities
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared seed constant for any step that accepts a random state.
RANDOM_STATE = 42

# Artifact locations: one root folder plus three sub-folders derived from it.
OUTPUT_DIR = Path("./outputs")
MODELS_DIR, SPLITS_DIR, PLOTS_DIR = (
    OUTPUT_DIR / folder_name for folder_name in ("models", "splits", "plots")
)

# Create every artifact folder up front; folders that already exist are left as they are.
for d in (OUTPUT_DIR, MODELS_DIR, SPLITS_DIR, PLOTS_DIR):
    d.mkdir(parents=True, exist_ok=True)

# Probe the NLTK English stopword list and download the corpus only when the lookup fails.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Location of the training CSV on the mounted Drive.
# NOTE(review): the recorded output of this cell reports eng.csv, while the path below
# points at swa.csv — confirm which file is intended and re-run the cell.
DATA_PATH = "/content/gdrive/MyDrive/NLP course /dev_phase /subtask1/train/swa.csv"

df = pd.read_csv(DATA_PATH)

# Sanity-check the load: source path, dimensions and column names.
print("Loaded dataset:", DATA_PATH)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Preview the first rows.
display(df.head())

# Guess the text and label columns by a case-insensitive match against common names.
text_candidates = [
    column
    for column in df.columns
    if column.lower() in ('text', 'tweet', 'content', 'sentence', 'comment')
]
label_candidates = [
    column
    for column in df.columns
    if column.lower() in ('label', 'labels', 'polarization', 'target')
]

print("\nDetected text candidates:", text_candidates)
print("Detected label candidates:", label_candidates)


Loaded dataset: /content/gdrive/MyDrive/NLP course /dev_phase /subtask1/train/eng.csv
Shape: (3222, 3)
Columns: ['id', 'text', 'polarization']


Unnamed: 0,id,text,polarization
0,eng_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0
1,eng_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0
2,eng_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0
3,eng_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0
4,eng_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0



Detected text candidates: ['text']
Detected label candidates: ['polarization']


Step 1: Setup and Configuration

In [None]:
# 1. Install necessary libraries
!pip install transformers datasets accelerate evaluate scikit-learn sentencepiece

import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification, # CORRECT MODEL CLASS for classification
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
# Use StratifiedKFold for better handling of class imbalance (Crucial for Macro F1)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
import gc
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# -----------------------------------------------------------------------------
# 0. CLEANUP & CONFIGURATION
# -----------------------------------------------------------------------------
# Run the garbage collector first, then empty the CUDA cache.
gc.collect()
torch.cuda.empty_cache()

# Hyper-parameters read by run_k_fold_training_en. The training DataFrame is not
# created in this cell; it must already exist when the execution cell runs.
CONFIG_EN = dict(
    model_name='google/rembert',
    k_folds=5,
    max_length=128,
    batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    num_epochs=5,
    output_dir_base='./rembert-polarization-en-kfold',
)

print(f"Model: {CONFIG_EN['model_name']} | Strategy: {CONFIG_EN['k_folds']}-Fold CV")

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6
Model: google/rembert | Strategy: 5-Fold CV


Step 2: Data Utilities (Dataset, Metrics, Custom Trainer)

In [None]:
# 1. Dataset Class (Handles tokenization and tensor formatting)
class PolarizationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Iterable of raw texts (list, NumPy array, pandas Series, ...).
        labels: Iterable of integer class labels, aligned position-by-position with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length', truncation=True,
            return_tensors='pt')`` that returns a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialize both inputs as plain lists so __getitem__ always indexes by
        # POSITION. A pandas Series with a non-default index would otherwise be
        # looked up by label and raise KeyError for the positions a sampler asks for.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # str() guards against non-string cells being handed to the tokenizer.
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # The tokenizer is asked for 'pt' tensors; flatten() turns them into 1-D tensors.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Label must be torch.long for CrossEntropyLoss
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 2. Metrics Function (Focuses on Macro F1)
def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the predicted class
            is the argmax over axis 1) and ``label_ids`` (reference labels).

    Returns:
        dict with the keys ``macro_f1``, ``accuracy`` and ``f1_polarized_class_1``.
    """
    y_pred = np.asarray(p.predictions).argmax(axis=1)
    y_true = p.label_ids

    scores = {}
    # Macro-averaged F1 (the metric the task is ranked on, per the notebook's notes).
    scores['macro_f1'] = f1_score(y_true, y_pred, average='macro', zero_division=0)
    scores['accuracy'] = accuracy_score(y_true, y_pred)
    # F1 of class 1 only, to watch how well the positive (polarized) class is handled.
    scores['f1_polarized_class_1'] = f1_score(y_true, y_pred, pos_label=1, zero_division=0)
    return scores

# 3. Custom Trainer (Applies class weights to the loss function)
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights."""

    def __init__(self, *args, class_weights=None, **kwargs):
        # class_weights is consumed here; everything else goes to the base Trainer.
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute (optionally class-weighted) cross-entropy for one batch.

        Args:
            model: Model called as ``model(**inputs)`` after ``labels`` is removed.
            inputs: Batch mapping; its ``labels`` entry is popped (the mapping is mutated).
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: Accepted and ignored, so callers may pass extra keyword arguments.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Move the weights next to the logits; no weights means plain cross-entropy.
        if self.class_weights is None:
            per_class_weight = None
        else:
            per_class_weight = self.class_weights.to(logits.device)

        n_classes = self.model.config.num_labels
        criterion = nn.CrossEntropyLoss(weight=per_class_weight)
        loss = criterion(logits.view(-1, n_classes), targets.view(-1))

        if return_outputs:
            return loss, outputs
        return loss

Step 3: K-Fold Training Function

In [None]:
def run_k_fold_training_en(df_full, config):
    """Run stratified K-fold fine-tuning and report per-fold and averaged metrics.

    For every fold a fresh sequence-classification model is loaded, trained with
    class-weighted cross-entropy (``CustomTrainer``), evaluated on the held-out
    split and saved under ``<output_dir_base>/fold_<n>``.

    Args:
        df_full: DataFrame with a ``text`` column and a ``label`` column. Class
            weights are computed for the labels 0 and 1 and the model is built
            with two output labels.
        config: Mapping with the keys ``model_name``, ``k_folds``, ``max_length``,
            ``batch_size``, ``gradient_accumulation_steps``, ``learning_rate``,
            ``num_epochs`` and ``output_dir_base``. Optional keys:
            ``warmup_ratio`` (default 0.1) and ``seed`` (default 42).

    Returns:
        Tuple ``(final_metrics, all_fold_metrics)``: the across-fold mean of the
        macro-F1 / class-1 F1 / accuracy entries, and the list of raw
        ``trainer.evaluate()`` results, one per fold.

    Raises:
        KeyError: If ``df_full`` lacks the ``text`` or ``label`` column.
    """
    # Fail early with a readable message instead of a bare KeyError further down.
    missing = [col for col in ('text', 'label') if col not in df_full.columns]
    if missing:
        raise KeyError(f"df_full is missing required column(s): {missing}")

    texts_all = df_full['text'].tolist()
    labels_all = df_full['label'].values

    # One seed drives both the fold split and the Trainer (42 when not configured).
    seed = config.get('seed', 42)

    # Use StratifiedKFold for better class distribution across folds
    skf = StratifiedKFold(n_splits=config['k_folds'], shuffle=True, random_state=seed)
    all_fold_metrics = []

    print(f"\n{'='*70}")
    print(f"🚀 STARTING {config['k_folds']}-FOLD TRAINING for ENGLISH (RemBERT)")
    print(f"{'='*70}")

    # The tokenizer does not change between folds, so it is loaded once.
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

    # Loop through each fold split
    for fold, (train_index, val_index) in enumerate(skf.split(texts_all, labels_all)):
        print(f"\n--- FOLD {fold+1}/{config['k_folds']} ---")

        # Split data based on StratifiedKFold indices
        train_texts = [texts_all[i] for i in train_index]
        val_texts = [texts_all[i] for i in val_index]
        train_labels = labels_all[train_index]
        val_labels = labels_all[val_index]

        # 1. CALCULATE CLASS WEIGHTS FOR CURRENT FOLD
        # Inverse-frequency weighting: weight_c = N / (2 * count_c)
        total_samples = len(train_labels)
        count_0 = np.sum(train_labels == 0)
        count_1 = np.sum(train_labels == 1)
        weight_0 = total_samples / (2.0 * count_0)
        weight_1 = total_samples / (2.0 * count_1)
        class_weights = torch.tensor([weight_0, weight_1], dtype=torch.float32)
        print(f"Class Weights (Fold {fold+1}): {class_weights.tolist()}")

        # Create Datasets
        train_dataset = PolarizationDataset(train_texts, train_labels, tokenizer, config['max_length'])
        val_dataset = PolarizationDataset(val_texts, val_labels, tokenizer, config['max_length'])

        gc.collect()
        torch.cuda.empty_cache()

        # 2. LOAD A FRESH MODEL for the new fold
        model = AutoModelForSequenceClassification.from_pretrained(
            config['model_name'],
            num_labels=2
        )

        # Warmup steps: a fraction of the (approximate) number of optimizer steps.
        effective_batch = config['batch_size'] * config['gradient_accumulation_steps']
        total_steps = (len(train_dataset) // effective_batch) * config['num_epochs']
        warmup_ratio = config.get('warmup_ratio', 0.1)
        warmup_steps = int(total_steps * warmup_ratio)

        # Training Arguments
        output_dir_fold = f"{config['output_dir_base']}/fold_{fold+1}"
        training_args = TrainingArguments(
            output_dir=output_dir_fold,
            num_train_epochs=config['num_epochs'],
            per_device_train_batch_size=config['batch_size'],
            per_device_eval_batch_size=config['batch_size'],
            learning_rate=config['learning_rate'],
            warmup_steps=warmup_steps,
            weight_decay=0.01,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            # Keep the checkpoint with the best validation macro-F1.
            load_best_model_at_end=True,
            metric_for_best_model='macro_f1',
            greater_is_better=True,
            fp16=torch.cuda.is_available(),
            gradient_accumulation_steps=config['gradient_accumulation_steps'],
            report_to="none",
            seed=seed,
        )

        # 3. Initialize and Train
        # NOTE(review): newer Transformers releases name the `tokenizer` argument
        # `processing_class` — confirm against the installed version.
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            class_weights=class_weights,  # Pass the weights to the Custom Trainer
            data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
        )

        trainer.train()

        # 4. Evaluate and Collect Metrics
        print(f"\n📊 FINAL EVALUATION FOR FOLD {fold+1}")
        eval_results = trainer.evaluate()
        all_fold_metrics.append(eval_results)

        print(f"Macro F1 (Fold {fold+1}): {eval_results['eval_macro_f1']:.4f}")

        # Save best model of the fold
        trainer.save_model(output_dir_fold)

        # Drop every reference to this fold's model and trainer BEFORE the next
        # fold loads its own copy; otherwise both stay alive at the same time.
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()

    # =============================================================================
    # 5. FINAL RESULTS (AVERAGE)
    # =============================================================================
    print(f"\n{'='*70}")
    print("✅ K-FOLD CROSS-VALIDATION COMPLETED!")
    print(f"{'='*70}")

    final_metrics = {}

    # Average only the three task metrics; other entries of the results are skipped.
    reported_prefixes = ('eval_macro_f1', 'eval_f1_polarized_class_1', 'eval_accuracy')
    for key in all_fold_metrics[0].keys():
        if key.startswith(reported_prefixes):
            avg_value = np.mean([metrics[key] for metrics in all_fold_metrics])
            final_metrics[key] = avg_value
            print(f"  {key:30s} (Avg): {avg_value:.4f}")

    return final_metrics, all_fold_metrics

Step 4: Execution

In [None]:
# Expose the target column under the name `label`, which run_k_fold_training_en reads.
df = df.rename({"polarization": "label"}, axis="columns")


In [None]:
if __name__ == "__main__":
    # Run the full cross-validation on `df` with the CONFIG_EN hyper-parameters.
    final_avg_metrics, all_metrics = run_k_fold_training_en(df, CONFIG_EN)

    # Frame the averaged metrics between two 70-character rules.
    rule = "=" * 70
    print("\n" + rule)
    print("FINAL ENGLISH K-FOLD AVERAGE METRICS:")
    print(final_avg_metrics)
    print(rule)



🚀 STARTING 5-FOLD TRAINING for ENGLISH (RemBERT)


tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

sentencepiece.model:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]


--- FOLD 1/5 ---
Class Weights (Fold 1): [0.7871105670928955, 1.3707447052001953]


config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.30G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.30G [00:00<?, ?B/s]

Some weights of RemBertForSequenceClassification were not initialized from the model checkpoint at google/rembert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [None]:
# Load model directly
# NOTE(review): this loads the google/rembert checkpoint through AutoModel and binds it
# to the global name `model`; no cell shown before this one reads that variable —
# confirm whether this cell is still needed.
from transformers import AutoModel
model = AutoModel.from_pretrained("google/rembert", dtype="auto")