# Step 1: Configuration and Data Preparation

In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset as TorchDataset
import random
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    AutoModelForSeq2SeqLM
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')

# Input locations: English/Swahili training CSVs, the English test CSV, and the
# previously generated submission whose specialized columns get overwritten.
EN_TRAIN_FILE_PATH = '/content/eng_train_3.csv'
SW_TRAIN_FILE_PATH = '/content/swa_train_3.csv'
EN_TEST_FILE_PATH = '/content/eng_test_3.csv'
PREVIOUS_SUBMISSION_PATH = '/content/pred_eng.csv'

# Seed every random number generator used in this notebook (Python, NumPy, Torch CPU/CUDA).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Start from a clean slate: run the garbage collector and release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# Labels to isolate
SPECIALIZED_LABELS = ['dehumanization', 'lack_of_empathy']
NUM_SPECIALIZED_LABELS = len(SPECIALIZED_LABELS)

# Settings read by augment_polarized_data (Swahili -> English translation).
CONFIG_AUG = {
    'nllb_model': "facebook/nllb-200-distilled-600M",
    'src_lang_sw': "swh_Latn",  # assigned to the tokenizer's src_lang
    'tgt_lang_en': "eng_Latn",  # converted to the forced BOS token id for generation
    'batch_size': 32  # number of texts translated per generate() call
}

# Settings read by load_english_specialized and train_english_specialized_model.
CONFIG_SPEC = {
    'model_checkpoint': 'microsoft/deberta-v3-large',
    'max_length': 256,  # tokenizer padding/truncation length
    'batch_size': 4,  # per-device batch size for both training and evaluation
    'gradient_accumulation_steps': 4,
    'learning_rate': 1e-5,
    'num_epochs': 6,
    'warmup_ratio': 0.1,
    'weight_decay': 0.01,
    'early_stopping_patience': 4,
    'min_pos_weight': 1.0,  # lower clip bound for the per-label pos_weight
    'max_pos_weight': 20.0,  # upper clip bound for the per-label pos_weight
    'output_dir': './deberta-manifestations-en-specialized',
    'seed': SEED,
}

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Device: {device}")
print(f"Specialized Labels: {SPECIALIZED_LABELS}")

class SpecializedDataset(TorchDataset):
    """Map-style dataset pairing raw texts with multi-label float targets.

    Tokenization is done lazily, one example per ``__getitem__`` call, with
    padding and truncation to ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float32 ``labels`` for one example."""
        raw_text = str(self.texts[idx])
        encoded = self.tokenizer(
            raw_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading batch axis; flatten it away.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

# Trainer variant whose loss is BCEWithLogitsLoss weighted by per-label pos_weights
class ManifestationsTrainer(Trainer):
    """Trainer that replaces the default loss with a pos_weight-ed BCE-with-logits loss."""

    def __init__(self, pos_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_weights = pos_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted loss; ``labels`` is removed from ``inputs`` before the forward pass."""
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        scores = outputs.logits
        # The weights are stored wherever they were created; move them next to the logits.
        criterion = nn.BCEWithLogitsLoss(pos_weight=self.pos_weights.to(scores.device))
        loss = criterion(scores, targets)
        if return_outputs:
            return loss, outputs
        return loss

def compute_metrics_specialized(eval_pred):
    """Return macro F1 plus one F1 per specialized label, thresholding sigmoid scores at 0.5.

    ``eval_pred`` unpacks into ``(logits, labels)``.
    """
    logits, y_true = eval_pred
    scores = torch.sigmoid(torch.tensor(logits)).numpy()
    y_hat = (scores > 0.5).astype(int)

    metrics = {'f1_macro': f1_score(y_true, y_hat, average='macro', zero_division=0)}
    # One extra entry per label, keyed as f1_<label name>.
    metrics.update(
        (f'f1_{name}', f1_score(y_true[:, col], y_hat[:, col], zero_division=0))
        for col, name in enumerate(SPECIALIZED_LABELS)
    )
    return metrics

🖥️ Device: cuda
🎯 Labels Spécialisés: ['dehumanization', 'lack_of_empathy']


In [None]:
def augment_polarized_data(swa_file, config):
    """Translate polarized Swahili training rows into English for data augmentation.

    A row is kept when at least one of SPECIALIZED_LABELS is positive and its
    text is present.

    Args:
        swa_file: path to the Swahili training CSV.
        config: mapping providing 'nllb_model', 'src_lang_sw', 'tgt_lang_en'
            and 'batch_size'.

    Returns:
        A DataFrame holding the SPECIALIZED_LABELS columns of the kept rows plus
        the text column filled with the English translations, or None when no
        row qualifies.
    """
    print("\n--- 1. DATA AUGMENTATION (SW -> EN) ---")

    df_swa = pd.read_csv(swa_file)
    df_swa.columns = df_swa.columns.str.strip()
    text_col = next((c for c in df_swa.columns if c.lower() == 'text'), df_swa.columns[1])

    has_positive_label = df_swa[SPECIALIZED_LABELS].sum(axis=1) > 0
    # A missing text would otherwise be stringified to the literal "nan" below
    # and translated as if it were a real sentence, so drop those rows up front.
    has_text = df_swa[text_col].notna()
    df_polarized = df_swa[has_positive_label & has_text].copy()

    if df_polarized.empty:
        print("  WARNING: No polarized examples for augmentation. Skipping.")
        return None

    tokenizer_nllb = AutoTokenizer.from_pretrained(config['nllb_model'])
    model_nllb = AutoModelForSeq2SeqLM.from_pretrained(config['nllb_model']).to(device)

    texts = df_polarized[text_col].astype(str).tolist()
    results = []
    tokenizer_nllb.src_lang = config['src_lang_sw']
    # Loop-invariant: the target-language token id is the same for every batch.
    target_lang_id = tokenizer_nllb.convert_tokens_to_ids(config['tgt_lang_en'])

    for i in tqdm(range(0, len(texts), config['batch_size']), desc="Translating SW->EN"):
        batch_texts = texts[i : i + config['batch_size']]
        inputs = tokenizer_nllb(batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            translated_tokens = model_nllb.generate(**inputs, forced_bos_token_id=target_lang_id, max_length=200)
        decoded = tokenizer_nllb.batch_decode(translated_tokens, skip_special_tokens=True)
        results.extend(decoded)

    # Drop the translation model before the classifier is loaded.
    del model_nllb
    gc.collect()
    torch.cuda.empty_cache()

    df_aug = df_polarized[SPECIALIZED_LABELS].copy()
    # `results` is in the same order as the rows of df_polarized.
    df_aug[text_col] = results

    return df_aug

def load_english_specialized(df, config):
    """Build the train/validation split and the per-label positive weights.

    Args:
        df: DataFrame with a text column and one column per entry of
            SPECIALIZED_LABELS. It is left unmodified.
        config: mapping providing 'min_pos_weight', 'max_pos_weight' and 'seed'.

    Returns:
        ``(train_texts, train_labels, val_texts, val_labels, pos_weights)`` where
        the texts are lists of str, the labels are lists of per-label values and
        ``pos_weights`` is a float32 tensor with one entry per specialized label.
    """
    # Work on a copy so the header and text-column rewrites below do not leak
    # into the caller's DataFrame.
    df = df.copy()
    df.columns = df.columns.str.strip()
    text_col = next((c for c in df.columns if c.lower() == 'text'), df.columns[1])
    df[text_col] = df[text_col].fillna("").astype(str)

    labels_matrix = df[SPECIALIZED_LABELS].values.astype(float)

    # pos_weight = negatives / positives per label; the denominator is floored
    # at 1 so a label without positives does not divide by zero.
    num_positives = labels_matrix.sum(axis=0)
    num_negatives = labels_matrix.shape[0] - num_positives
    raw_weights = num_negatives / np.maximum(num_positives, 1)

    weights_clamped = np.clip(raw_weights, config['min_pos_weight'], config['max_pos_weight'])
    pos_weights = torch.tensor(weights_clamped, dtype=torch.float32)

    print(f"\nPos Weights: {pos_weights.tolist()}")

    # The 85/15 split is stratified on the first specialized label only.
    train_df, val_df = train_test_split(df, test_size=0.15, random_state=config['seed'], stratify=df[SPECIALIZED_LABELS[0]])

    train_texts = train_df[text_col].tolist()
    train_labels = train_df[SPECIALIZED_LABELS].values.tolist()
    val_texts = val_df[text_col].tolist()
    val_labels = val_df[SPECIALIZED_LABELS].values.tolist()

    return train_texts, train_labels, val_texts, val_labels, pos_weights

In [None]:
def find_best_thresholds(trainer, val_dataset):
    """Grid-search, per specialized label, the decision threshold with the best validation F1.

    Candidates run from 0.1 up to (but excluding) 0.95 in steps of 0.05. A label
    keeps the default 0.5 when no candidate scores above 0.

    Returns:
        A list with one threshold per entry of SPECIALIZED_LABELS.
    """
    prediction = trainer.predict(val_dataset)
    scores = torch.sigmoid(torch.tensor(prediction.predictions)).numpy()
    targets = np.array(prediction.label_ids)
    candidates = np.arange(0.1, 0.95, 0.05)

    best_thresholds = []
    for col, label_name in enumerate(SPECIALIZED_LABELS):
        chosen_t, chosen_f1 = 0.5, 0.0
        for candidate in candidates:
            decisions = (scores[:, col] > candidate).astype(int)
            candidate_f1 = f1_score(targets[:, col], decisions, zero_division=0)
            # Strict comparison: on ties the lowest threshold wins.
            if candidate_f1 > chosen_f1:
                chosen_t, chosen_f1 = candidate, candidate_f1
        best_thresholds.append(chosen_t)
        print(f"  {label_name:20s}: Threshold={chosen_t:.2f}, F1={chosen_f1:.4f}")
    return best_thresholds

def train_english_specialized_model(config):
    """Fine-tune the two-label classifier on English data plus translated Swahili rows.

    Steps: read the English training CSV, append the augmentation rows when any
    exist, split and weight the data, train with early stopping, tune one
    threshold per label on the validation set, and save the model, tokenizer
    and thresholds under ``<output_dir>/final_model``.

    Returns:
        ``(trainer, best_thresholds)``.
    """
    english_df = pd.read_csv(EN_TRAIN_FILE_PATH)
    augmented_df = augment_polarized_data(SW_TRAIN_FILE_PATH, CONFIG_AUG)
    if augmented_df is not None:
        english_df = pd.concat([english_df, augmented_df], ignore_index=True)

    train_texts, train_labels, val_texts, val_labels, pos_weights = load_english_specialized(english_df, config)

    checkpoint = config['model_checkpoint']
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    train_dataset = SpecializedDataset(train_texts, train_labels, tokenizer, config['max_length'])
    val_dataset = SpecializedDataset(val_texts, val_labels, tokenizer, config['max_length'])
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=NUM_SPECIALIZED_LABELS,
        problem_type="multi_label_classification",
    )

    # Evaluate and checkpoint once per epoch; the best checkpoint by macro F1 is restored at the end.
    args_kwargs = dict(
        output_dir=f"{config['output_dir']}/seed_{config['seed']}",
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=config['learning_rate'],
        per_device_train_batch_size=config['batch_size'],
        per_device_eval_batch_size=config['batch_size'],
        num_train_epochs=config['num_epochs'],
        weight_decay=config['weight_decay'],
        warmup_ratio=config['warmup_ratio'],
        gradient_accumulation_steps=config['gradient_accumulation_steps'],
        load_best_model_at_end=True,
        metric_for_best_model='f1_macro',
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
        logging_steps=50,
        seed=config['seed'],
        report_to='none',
    )
    training_args = TrainingArguments(**args_kwargs)

    early_stopping = EarlyStoppingCallback(early_stopping_patience=config['early_stopping_patience'])
    trainer = ManifestationsTrainer(
        pos_weights=pos_weights,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_specialized,
        callbacks=[early_stopping],
    )

    print(f"\nStarting specialized training for {NUM_SPECIALIZED_LABELS} labels...")
    trainer.train()

    best_thresholds = find_best_thresholds(trainer, val_dataset)

    # Persist everything inference needs next to each other.
    save_path = f"{config['output_dir']}/final_model"
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)
    with open(f"{save_path}/thresholds.txt", 'w') as handle:
        handle.write(','.join(map(str, best_thresholds)))

    return trainer, best_thresholds

In [None]:
def update_submission_with_specialized(submission_path, test_path, model_path, thresholds, specialized_labels):
    """Overwrite the specialized label columns of a previous submission with fresh predictions.

    The saved model scores every test text, each label's sigmoid score is
    compared against its threshold, and the resulting 0/1 columns replace (or
    create) the ``specialized_labels`` columns of the submission. The updated
    submission is written to 'polarization_english_final_specialized_submission.csv'.

    Predictions are written positionally: row k of the test file fills row k of
    the submission.

    Args:
        submission_path: CSV of the previous submission.
        test_path: CSV with the test texts.
        model_path: directory holding the saved model and tokenizer.
        thresholds: one decision threshold per label, in label order.
        specialized_labels: submission column names to overwrite, in model output order.

    Returns:
        The updated submission DataFrame.

    Raises:
        ValueError: if the test file and the submission have different row counts.
    """
    df_test = pd.read_csv(test_path)
    # Strip header whitespace like the training loaders do, so the text column
    # is detected the same way at inference time.
    df_test.columns = df_test.columns.str.strip()
    df_submission = pd.read_csv(submission_path)

    text_col = next((c for c in df_test.columns if c.lower() == 'text'), df_test.columns[1])
    # Same missing-text handling as load_english_specialized: "" rather than the literal "nan".
    test_texts = df_test[text_col].fillna("").astype(str).tolist()

    if len(test_texts) != len(df_submission):
        raise ValueError(
            f"Test file has {len(test_texts)} rows but submission has {len(df_submission)}; "
            "predictions cannot be written positionally."
        )

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    all_probs = []
    with torch.no_grad():
        for i in tqdm(range(0, len(test_texts), 64), desc="Predicting Specialized"):
            batch_texts = test_texts[i:i + 64]
            encodings = tokenizer(batch_texts, max_length=CONFIG_SPEC['max_length'], padding='max_length', truncation=True, return_tensors='pt').to(device)
            outputs = model(**encodings)
            probs = torch.sigmoid(outputs.logits).cpu().numpy()
            all_probs.extend(probs)
    specialized_probs = np.array(all_probs)

    final_specialized_predictions = np.zeros_like(specialized_probs, dtype=int)

    print("\n--- Applying Thresholds and Updating ---")

    for i, threshold in enumerate(thresholds):
        final_specialized_predictions[:, i] = (specialized_probs[:, i] > threshold).astype(int)

    for i, label in enumerate(specialized_labels):
        df_submission[label] = final_specialized_predictions[:, i]

    output_filename = 'polarization_english_final_specialized_submission.csv'
    df_submission.to_csv(output_filename, index=False)

    print(f"\nFINAL submission file updated: {output_filename}")
    # Report whichever labels were actually written instead of two hard-coded names.
    for label in specialized_labels:
        print(f"  Distribution for '{label}': {df_submission[label].sum()}")

    return df_submission

# Script entry point: train the two-label specialized model, then patch the
# previous submission with its predictions.
if __name__ == "__main__":

    # Yields the fitted trainer and one tuned decision threshold per specialized label.
    trainer_spec, thresholds_spec = train_english_specialized_model(CONFIG_SPEC)

    # The model is reloaded from the directory the training function saved it to.
    final_submission = update_submission_with_specialized(
        PREVIOUS_SUBMISSION_PATH,
        EN_TEST_FILE_PATH,
        f"{CONFIG_SPEC['output_dir']}/final_model",
        thresholds_spec,
        SPECIALIZED_LABELS
    )

    print("\nCOMPLETE PIPELINE FINISHED. The final file is ready to be submitted.")


--- 1. DATA AUGMENTATION (SW -> EN) ---


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Translating SW->EN:   0%|          | 0/74 [00:00<?, ?it/s]


⚖️  Pos Weights: [3.3512461185455322, 1.2925728559494019]


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Démarrage de l'entraînement spécialisé pour 2 labels...


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Dehumanization,F1 Lack Of Empathy
1,0.6847,0.683448,0.668358,0.491961,0.844755
2,0.6424,0.636988,0.686169,0.502311,0.870027
3,0.4817,0.796225,0.667362,0.476027,0.858696
4,0.4242,0.768079,0.657289,0.448,0.866579
5,0.2885,0.904067,0.644683,0.421739,0.867628
6,0.2352,1.030543,0.625592,0.392947,0.858238


  dehumanization      : Threshold=0.60, F1=0.5145
  lack_of_empathy     : Threshold=0.50, F1=0.8700


Predicting Specialized:   0%|          | 0/3 [00:00<?, ?it/s]


--- Application des Thresholds et Mise à Jour ---

✅ Fichier de soumission FINAL mis à jour : polarization_english_final_specialized_submission.csv
  Distribution pour 'dehumanization': 19
  Distribution pour 'lack_of_empathy': 3

FIN DU PIPELINE COMPLET. Le fichier final est prêt à être soumis.


In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset as TorchDataset
import random
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import gc
import warnings

# Seed every random number generator used in this cell (Python, NumPy, Torch CPU/CUDA).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Run the garbage collector and release cached CUDA memory before training.
gc.collect()
torch.cuda.empty_cache()
warnings.filterwarnings('ignore')

# English training/test CSVs and the previous submission to be patched.
EN_TRAIN_FILE_PATH = '/content/eng_train_3.csv'
EN_TEST_FILE_PATH = '/content/eng_test_3.csv'
PREVIOUS_SUBMISSION_PATH = '/content/pred_eng.csv'

TARGET_LABEL = 'Lack of Empathy'  # human-readable name, used only in printed messages
LABEL_COL_NAME = 'lack_of_empathy'  # column read from the training CSV and written to the submission
NUM_LABELS = 2  # number of output classes of the single-label classifier

# Settings read by load_empathy_data, train_empathy_model and the inference script below.
CONFIG_EMPATHY = {
    'model_checkpoint': 'microsoft/deberta-v3-base',
    'max_length': 256,  # tokenizer padding/truncation length
    'batch_size': 8,  # per-device batch size for both training and evaluation
    'gradient_accumulation_steps': 2,
    'learning_rate': 3e-5,
    'num_epochs': 10,
    'warmup_ratio': 0.1,
    'weight_decay': 0.01,
    'output_dir': './deberta-monolabel-lack-of-empathy',
    'seed': SEED,
}

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device} | Task: Fixing '{TARGET_LABEL}'")

class EmpathyDataset(TorchDataset):
    """Map-style dataset pairing raw texts with integer class labels.

    Tokenization is done lazily, one example per ``__getitem__`` call, with
    padding and truncation to ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and an int64 ``labels`` tensor for one example."""
        raw_text = str(self.texts[idx])
        encoded = self.tokenizer(
            raw_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading batch axis; flatten it away.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

class WeightedTrainer(Trainer):
    """Trainer that replaces the default loss with a class-weighted cross-entropy loss."""

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted loss; ``labels`` is removed from ``inputs`` before the forward pass."""
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        scores = outputs.logits

        # The weights are stored wherever they were created; move them next to the logits.
        criterion = nn.CrossEntropyLoss(weight=self.class_weights.to(scores.device))
        loss = criterion(scores.view(-1, NUM_LABELS), targets.view(-1))

        if return_outputs:
            return loss, outputs
        return loss

def compute_metrics_empathy(eval_pred):
    """Return macro F1 and positive-class F1 for argmax predictions."""
    predicted = np.argmax(eval_pred.predictions, axis=1)
    gold = eval_pred.label_ids
    return {
        'f1_macro': f1_score(gold, predicted, average='macro', zero_division=0),
        'f1_lack_of_empathy': f1_score(gold, predicted, pos_label=1, zero_division=0),
    }


def load_empathy_data(config):
    """Load the English training CSV for the single target label and split it 85/15.

    Class weights follow ``total / (2 * class_count)``, with the positive-class
    weight capped at 50.

    Returns:
        ``(train_texts, val_texts, train_labels, val_labels, class_weights)``.

    Raises:
        ValueError: if the label column is missing from the file.
    """
    print(f"\nLoading data for '{TARGET_LABEL}'...")
    frame = pd.read_csv(EN_TRAIN_FILE_PATH)
    frame.columns = frame.columns.str.strip()
    text_col = next((name for name in frame.columns if name.lower() == 'text'), frame.columns[1])

    if LABEL_COL_NAME not in frame.columns:
         raise ValueError(f"Column '{LABEL_COL_NAME}' not found in file.")

    labels = frame[LABEL_COL_NAME].values.astype(int)
    texts = frame[text_col].astype(str).tolist()

    total = len(frame)
    pos_count = np.sum(labels == 1)
    neg_count = np.sum(labels == 0)

    max_weight = 50.0
    negative_weight = total / (2.0 * neg_count)
    positive_weight = min(total / (2.0 * pos_count), max_weight)
    class_weights = torch.tensor([negative_weight, positive_weight], dtype=torch.float32)

    print(f"Distribution: Positives: {pos_count} ({pos_count/total:.2%})")
    print(f"Class Weights [0, 1]: {class_weights.tolist()} (Max weight applied: {max_weight})")

    # Stratify on the label so both splits keep the same class ratio.
    split = train_test_split(
        texts, labels, test_size=0.15, random_state=config['seed'], stratify=labels
    )
    train_texts, val_texts, train_labels, val_labels = split

    return train_texts, val_texts, train_labels, val_labels, class_weights


def train_empathy_model(config):
    """Fine-tune the single-label classifier and tune its decision threshold.

    After training, the positive-class softmax probability on the validation
    set is thresholded at every value from 0.05 up to (but excluding) 0.95 in
    steps of 0.01; the threshold with the highest F1 is kept (0.5 when none
    scores above 0). The model and tokenizer are saved under
    ``<output_dir>/final_model``.

    Returns:
        The selected threshold.
    """
    print(f"\nStarting training for '{TARGET_LABEL}'...")

    train_texts, val_texts, train_labels, val_labels, class_weights = load_empathy_data(config)

    checkpoint = config['model_checkpoint']
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    train_dataset = EmpathyDataset(train_texts, train_labels, tokenizer, config['max_length'])
    val_dataset = EmpathyDataset(val_texts, val_labels, tokenizer, config['max_length'])

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=NUM_LABELS)

    # Evaluate and checkpoint once per epoch; the best checkpoint by macro F1 is restored at the end.
    args_kwargs = dict(
        output_dir=config['output_dir'],
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=config['learning_rate'],
        per_device_train_batch_size=config['batch_size'],
        per_device_eval_batch_size=config['batch_size'],
        num_train_epochs=config['num_epochs'],
        weight_decay=config['weight_decay'],
        warmup_ratio=config['warmup_ratio'],
        gradient_accumulation_steps=config['gradient_accumulation_steps'],
        load_best_model_at_end=True,
        metric_for_best_model='f1_macro',
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        save_total_limit=1,
        logging_steps=50,
        seed=config['seed'],
        report_to='none',
    )
    training_args = TrainingArguments(**args_kwargs)

    trainer = WeightedTrainer(
        class_weights=class_weights,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_empathy,
    )

    trainer.train()

    prediction = trainer.predict(val_dataset)
    # Column 1 of the softmax is the positive class.
    positive_probs = torch.softmax(torch.tensor(prediction.predictions), dim=1).numpy()[:, 1]
    targets = np.array(prediction.label_ids)

    chosen_t, chosen_f1 = 0.5, 0.0
    for candidate in np.arange(0.05, 0.95, 0.01):
        decisions = (positive_probs > candidate).astype(int)
        candidate_f1 = f1_score(targets, decisions, zero_division=0)
        # Strict comparison: on ties the lowest threshold wins.
        if candidate_f1 > chosen_f1:
            chosen_t, chosen_f1 = candidate, candidate_f1

    print(f"Optimal Threshold: {chosen_t:.4f} (F1: {chosen_f1:.4f})")

    save_path = f"{config['output_dir']}/final_model"
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)

    return chosen_t


# Script entry point: train the single-label model, then overwrite the
# LABEL_COL_NAME column of the previous submission with its predictions.
if __name__ == "__main__":

    print("--- STARTING MONOLABEL SPECIALIZED PIPELINE ---")

    # Trains, saves the model under <output_dir>/final_model and returns the tuned threshold.
    optimal_threshold = train_empathy_model(CONFIG_EMPATHY)

    print("\n--- 4. UPDATING SUBMISSION FILE ---")

    df_test = pd.read_csv(EN_TEST_FILE_PATH)
    df_test.columns = df_test.columns.str.strip()
    df_submission = pd.read_csv(PREVIOUS_SUBMISSION_PATH)

    # Column lookup falls back to position (first column = id, second = text)
    # when no column is literally named 'id' / 'text'.
    # NOTE(review): id_col is not used anywhere else in this block.
    id_col = next((c for c in df_test.columns if c.lower() in ['id', 'id']), df_test.columns[0])
    text_col = next((c for c in df_test.columns if c.lower() in ['text', 'text']), df_test.columns[1])
    test_texts = df_test[text_col].astype(str).tolist()

    # Reload the model and tokenizer from the directory train_empathy_model saved them to.
    model_path = f"{CONFIG_EMPATHY['output_dir']}/final_model"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    model.eval()

    # Score the test texts in batches of 64, keeping only the positive-class probability.
    all_probs_pos_class = []
    with torch.no_grad():
        for i in tqdm(range(0, len(test_texts), 64), desc="Predicting Lack of Empathy"):
            batch_texts = test_texts[i:i + 64]
            encodings = tokenizer(batch_texts, max_length=CONFIG_EMPATHY['max_length'], padding='max_length', truncation=True, return_tensors='pt').to(device)
            outputs = model(**encodings)
            probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()[:, 1]
            all_probs_pos_class.extend(probs)

    final_predictions = (np.array(all_probs_pos_class) > optimal_threshold).astype(int)

    # NOTE(review): predictions are written positionally, which assumes the
    # submission rows are in the same order as the test file — confirm.
    df_submission[LABEL_COL_NAME] = final_predictions

    output_filename = 'polarization_english_lack_of_empathy_fix_submission.csv'
    df_submission.to_csv(output_filename, index=False)

    print(f"\nSUBMISSION FILE UPDATED: {output_filename}")
    print(f"  Distribution for '{TARGET_LABEL}': {df_submission[LABEL_COL_NAME].sum()} positive examples.")