# Step 1: Configuration et Préparation des Données

In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset as TorchDataset
import random
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoModelForSeq2SeqLM, # Pour l'augmentation NLLB
    DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import gc
import warnings
import os

# =============================================================================
# 0. CONFIGURATION ET SETUP
# =============================================================================
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
gc.collect()
torch.cuda.empty_cache()
warnings.filterwarnings('ignore')

# Chemins des fichiers (Tâche 2)
EN_TRAIN_FILE_PATH_TASK2 = '/content/eng_train_2.csv'
EN_TEST_FILE_PATH_TASK2 = '/content/eng_test_2.csv'
PREVIOUS_SUBMISSION_PATH_TASK2 = '/content/pred_eng.csv'
DE_TRAIN_FILE_PATH_TASK2 = 'chemin/vers/votre/deu_train_2.csv' # Fichier Allemand T2
IT_TRAIN_FILE_PATH_TASK2 = 'chemin/vers/votre/ita_train_2.csv' # Fichier Italien T2

# Cibles Tâche 2
TARGET_CLASSES = ['Gender/Sexual', 'Other']
LABEL_COL_MAP = {'Gender/Sexual': 'gender/sexual', 'Other': 'other'}
NUM_LABELS = 2

# Config du Modèle Spécialisé (RemBERT Poussé)
CONFIG_SPEC_TASK2 = {
    'model_checkpoint': 'google/rembert',
    'max_length': 128,
    'batch_size': 8,
    'gradient_accumulation_steps': 2,
    'learning_rate': 2e-5,
    'num_epochs': 20, # Poussé
    'warmup_ratio': 0.1,
    'output_dir': './rembert-monolabel-task2',
    'seed': 42,
}

# Config Augmentation Pivot
CONFIG_AUG = {
    'nllb_model': "facebook/nllb-200-distilled-600M",
    'src_lang_de': "deu_Latn", # Allemand (Pivot)
    'src_lang_it': "ita_Latn", # Italien (Augmentation)
    'tgt_lang_en': "eng_Latn",
    'batch_size': 32
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🖥️ Device: {device} | 🎯 Tâche 2: Pivot Language Augmentation")

# =============================================================================
# 1. CLASSES ET FONCTIONS DE BASE
# =============================================================================
# (PolarizationDataset, WeightedTrainer, compute_metrics_empathy - non affichés par concision, mais requis)
class PolarizationDataset(TorchDataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts; self.labels = labels; self.tokenizer = tokenizer; self.max_length = max_length
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        encoding = self.tokenizer(str(self.texts[idx]), padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'labels': torch.tensor(self.labels[idx], dtype=torch.long)}
class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs); self.class_weights = class_weights
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels"); outputs = model(**inputs); logits = outputs.logits
        weights = self.class_weights.to(logits.device); loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, NUM_LABELS), labels.view(-1)); return (loss, outputs) if return_outputs else loss
def compute_metrics_empathy(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1); labels = eval_pred.label_ids
    macro_f1 = f1_score(labels, predictions, average='macro', zero_division=0)
    f1_pos = f1_score(labels, predictions, pos_label=1, zero_division=0)
    return {'f1_macro': macro_f1, 'f1_positive_class': f1_pos}


# =============================================================================
# 2. FONCTIONS D'AUGMENTATION PIVOT
# =============================================================================
def run_translation(texts, tokenizer_nllb, model_nllb, src_lang, tgt_lang, max_length=128, batch_size=32, desc="Translating"):
    """Fonction générique de traduction par batch."""
    results = []
    tokenizer_nllb.src_lang = src_lang
    for i in tqdm(range(0, len(texts), batch_size), desc=desc):
        batch_texts = texts[i : i + batch_size]
        inputs = tokenizer_nllb(batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            translated_tokens = model_nllb.generate(**inputs, forced_bos_token_id=tokenizer_nllb.convert_tokens_to_ids(tgt_lang), max_length=max_length)
        results.extend(tokenizer_nllb.batch_decode(translated_tokens, skip_special_tokens=True))
    return results

def augment_pivot_data(df_en_original, target_col_name, config):
    """Effectue les deux types d'augmentation (Polarized + Round-Trip)"""

    print(f"\n--- Augmentation pour la classe '{target_col_name}' ---")

    # 1. Préparation NLLB
    tokenizer_nllb = AutoTokenizer.from_pretrained(config['nllb_model'])
    model_nllb = AutoModelForSeq2SeqLM.from_pretrained(config['nllb_model']).to(device)

    # Préparer les DataFrames vides pour la fusion
    df_aug_total = pd.DataFrame(columns=['text', 'label'])
    df_en_polarized = df_en_original[df_en_original[target_col_name] == 1].copy()

    # ----------------------------------------------------------------------
    # A. AUGMENTATION POLARISÉE (DEU + ITA -> EN)
    # ----------------------------------------------------------------------
    source_files_map = {config['src_lang_de']: DE_TRAIN_FILE_PATH_TASK2,
                        config['src_lang_it']: IT_TRAIN_FILE_PATH_TASK2}

    for lang_code, file_path in source_files_map.items():
        df_source = pd.read_csv(file_path); df_source.columns = df_source.columns.str.strip()
        df_polarized = df_source[df_source['polarization'] == 1].copy() # Filtre Tâche 1

        texts = df_polarized['text'].astype(str).tolist()
        if not texts: continue

        translated_texts = run_translation(texts, tokenizer_nllb, model_nllb, lang_code, config['tgt_lang_en'], config['max_length'], config['batch_size'], desc=f"Translating {lang_code}->EN")

        df_aug = pd.DataFrame({'text': translated_texts, 'label': 1})
        df_aug_total = pd.concat([df_aug_total, df_aug], ignore_index=True)
        print(f"  + Ajout de {len(df_aug)} exemples polarisés de {lang_code}.")

    # ----------------------------------------------------------------------
    # B. ROUND-TRIP TRANSLATION (EN -> DEU -> EN)
    # ----------------------------------------------------------------------
    if not df_en_polarized.empty:
        texts_en = df_en_polarized['text'].astype(str).tolist()

        # 1. EN -> DEU (Pivot)
        texts_de = run_translation(texts_en, tokenizer_nllb, model_nllb, config['tgt_lang_en'], config['src_lang_de'], config['max_length'], config['batch_size'], desc="Round-Trip EN->DEU")

        # 2. DEU -> EN (Retour)
        texts_rt = run_translation(texts_de, tokenizer_nllb, model_nllb, config['src_lang_de'], config['tgt_lang_en'], config['max_length'], config['batch_size'], desc="Round-Trip DEU->EN")

        df_aug_rt = pd.DataFrame({'text': texts_rt, 'label': 1})
        df_aug_total = pd.concat([df_aug_total, df_aug_rt], ignore_index=True)
        print(f"  + Ajout de {len(df_aug_rt)} exemples par Round-Trip.")

    # Nettoyage et Fusion
    del model_nllb
    gc.collect()
    torch.cuda.empty_cache()

    df_final = df_en_original.rename(columns={target_col_name: 'label'}).copy()

    # Concaténer l'original et l'augmentation
    df_final = pd.concat([df_final, df_aug_total], ignore_index=True)

    print(f"  Total final pour {target_col_name}: {len(df_final):,} échantillons. Positifs: {df_final['label'].sum()}")
    return df_final

🖥️ Device: cuda | 🎯 Tâche 2: Pivot Language Augmentation


In [8]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset as TorchDataset
import random
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoModelForSeq2SeqLM, # Pour l'augmentation NLLB
    DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import gc
import warnings
import os

# =============================================================================
# 0. CONFIGURATION ET SETUP
# =============================================================================
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
gc.collect()
torch.cuda.empty_cache()
warnings.filterwarnings('ignore')

# Chemins des fichiers (Tâche 2)
EN_TRAIN_FILE_PATH_TASK2 = '/content/eng_train_2.csv'
EN_TEST_FILE_PATH_TASK2 = '/content/eng_test_2.csv'
PREVIOUS_SUBMISSION_PATH_TASK2 = '/content/pred_eng.csv'
DE_TRAIN_FILE_PATH_TASK2 = '/content/deu_train_2.csv' # Fichier Allemand T2
IT_TRAIN_FILE_PATH_TASK2 = '/content/ita_train_2.csv' # Fichier Italien T2

# Cibles Tâche 2
TARGET_CLASSES = ['Gender/Sexual', 'Other']
LABEL_COL_MAP = {'Gender/Sexual': 'gender/sexual', 'Other': 'other'}
NUM_LABELS = 2
ALL_TASK2_COLS = ['political', 'racial/ethnic', 'religious', 'gender/sexual', 'other'] # Toutes les colonnes T2

# Config du Modèle Spécialisé (RemBERT Poussé)
CONFIG_SPEC_TASK2 = {
    'model_checkpoint': 'google/rembert',
    'max_length': 128,
    'batch_size': 8,
    'gradient_accumulation_steps': 2,
    'learning_rate': 2e-5,
    'num_epochs': 20,
    'warmup_ratio': 0.1,
    'output_dir': './rembert-monolabel-task2',
    'seed': 42,
}

# Config Augmentation Pivot
CONFIG_AUG = {
    'nllb_model': "facebook/nllb-200-distilled-600M",
    'src_lang_de': "deu_Latn", # Allemand (Pivot)
    'src_lang_it': "ita_Latn", # Italien (Augmentation)
    'tgt_lang_en': "eng_Latn",
    'batch_size': 32
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🖥️ Device: {device} | 🎯 Tâche 2: Pivot Language Augmentation")

# =============================================================================
# 1. CLASSES ET FONCTIONS DE BASE
# =============================================================================
class PolarizationDataset(TorchDataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts; self.labels = labels; self.tokenizer = tokenizer; self.max_length = max_length
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        encoding = self.tokenizer(str(self.texts[idx]), padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs); self.class_weights = class_weights
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels"); outputs = model(**inputs); logits = outputs.logits
        weights = self.class_weights.to(logits.device); loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, NUM_LABELS), labels.view(-1)); return (loss, outputs) if return_outputs else loss
def compute_metrics_empathy(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1); labels = eval_pred.label_ids
    macro_f1 = f1_score(labels, predictions, average='macro', zero_division=0)
    f1_pos = f1_score(labels, predictions, pos_label=1, zero_division=0)
    return {'f1_macro': macro_f1, 'f1_positive_class': f1_pos}
def run_translation(texts, tokenizer_nllb, model_nllb, src_lang, tgt_lang, max_length=128, batch_size=32, desc="Translating"):
    """Fonction générique de traduction par batch."""
    results = []; tokenizer_nllb.src_lang = src_lang
    for i in tqdm(range(0, len(texts), batch_size), desc=desc):
        batch_texts = texts[i : i + batch_size]; inputs = tokenizer_nllb(batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            translated_tokens = model_nllb.generate(**inputs, forced_bos_token_id=tokenizer_nllb.convert_tokens_to_ids(tgt_lang), max_length=max_length)
        results.extend(tokenizer_nllb.batch_decode(translated_tokens, skip_special_tokens=True))
    return results

# =============================================================================
# 2. FONCTIONS D'AUGMENTATION PIVOT (CORRIGÉES POUR MAX_LENGTH)
# =============================================================================
def augment_pivot_data_part_a(original_csv_col_name, source_files_map, config_aug, config_rembert):
    """Augmentation POLARISÉE (DEU/ITA -> EN) utilisant la Tâche 2 (sum > 0) comme filtre."""
    all_aug_dfs = []

    tokenizer_nllb = AutoTokenizer.from_pretrained(config_aug['nllb_model'])
    model_nllb = AutoModelForSeq2SeqLM.from_pretrained(config_aug['nllb_model']).to(device)

    for lang_code, file_path in source_files_map.items():
        df_source = pd.read_csv(file_path); df_source.columns = df_source.columns.str.strip()

        # Filtre CRUCIAL: Utiliser la somme des colonnes Tâche 2 pour déterminer si l'exemple est 'Polarized'
        df_source['is_polarized'] = df_source[ALL_TASK2_COLS].sum(axis=1) > 0
        df_polarized = df_source[df_source['is_polarized'] == True].copy()

        if df_polarized.empty: continue

        texts = df_polarized['text'].astype(str).tolist()

        # Utilisation de config_rembert['max_length']
        translated_texts = run_translation(texts, tokenizer_nllb, model_nllb, lang_code, config_aug['tgt_lang_en'],
                                           config_rembert['max_length'], config_aug['batch_size'], desc=f"Polarized {lang_code}->EN")

        df_aug = pd.DataFrame({'text': translated_texts, 'label': 1}) # Label est 1
        all_aug_dfs.append(df_aug)

    del model_nllb
    gc.collect()
    torch.cuda.empty_cache()

    return pd.concat(all_aug_dfs, ignore_index=True) if all_aug_dfs else pd.DataFrame(columns=['text', 'label'])

def augment_pivot_data_part_b(df_en_original, original_csv_col_name, config_aug, config_rembert):
    """ROUND-TRIP TRANSLATION (EN -> DEU -> EN) sur les exemples Anglais POSITIFS."""

    # *** Le filtrage utilise le nom de colonne CSV ! ***
    df_polarized_en = df_en_original[df_en_original[original_csv_col_name] == 1].copy()
    if df_polarized_en.empty:
        return pd.DataFrame(columns=['text', 'label'])

    texts_en = df_polarized_en['text'].astype(str).tolist()

    tokenizer_nllb = AutoTokenizer.from_pretrained(config_aug['nllb_model'])
    model_nllb = AutoModelForSeq2SeqLM.from_pretrained(config_aug['nllb_model']).to(device)

    # 1. EN -> DEU (Pivot)
    texts_de = run_translation(texts_en, tokenizer_nllb, model_nllb, config_aug['tgt_lang_en'], config_aug['src_lang_de'],
                               config_rembert['max_length'], config_aug['batch_size'], desc="Round-Trip EN->DEU")

    # 2. DEU -> EN (Retour)
    texts_rt = run_translation(texts_de, tokenizer_nllb, model_nllb, config_aug['src_lang_de'], config_aug['tgt_lang_en'],
                               config_rembert['max_length'], config_aug['batch_size'], desc="Round-Trip DEU->EN")

    del model_nllb
    gc.collect()
    torch.cuda.empty_cache()

    # Créer le DataFrame augmenté
    df_aug_rt = pd.DataFrame({'text': texts_rt, 'label': 1}) # Label est 1

    return df_aug_rt

def train_and_get_threshold_pivot(target_class, config):
    """Pipeline complet d'entraînement Monolabel avec Augmentation Pivot."""
    print(f"\n--- Démarrage de l'entraînement Poussé Pivot pour : {target_class} ---")

    # 1. Préparation des données AUGMENTÉES
    df_en_original = pd.read_csv(EN_TRAIN_FILE_PATH_TASK2)
    df_en_original.columns = df_en_original.columns.str.strip()

    original_csv_col_name = LABEL_COL_MAP[target_class]

    # Définir les fichiers sources pour l'augmentation T1 (polarization)
    source_files_map = {CONFIG_AUG['src_lang_de']: DE_TRAIN_FILE_PATH_TASK2,
                        CONFIG_AUG['src_lang_it']: IT_TRAIN_FILE_PATH_TASK2}

    # Augmentation A et B (PASSAGE DE CONFIG_SPEC_TASK2)
    df_aug_de_en = augment_pivot_data_part_a(original_csv_col_name, source_files_map, CONFIG_AUG, CONFIG_SPEC_TASK2)
    df_aug_rt = augment_pivot_data_part_b(df_en_original, original_csv_col_name, CONFIG_AUG, CONFIG_SPEC_TASK2)

    # 2. Concaténation (Original + Augmentation Pol. + Round Trip)
    df_en_final = df_en_original.rename(columns={original_csv_col_name: 'label'}).copy()

    df_en_final = pd.concat([df_en_final, df_aug_de_en, df_aug_rt], ignore_index=True)

    # 3. Calcul des Poids et Entraînement
    labels = df_en_final['label'].values.astype(int); texts = df_en_final['text'].astype(str).tolist()
    pos_count = np.sum(labels == 1); neg_count = np.sum(labels == 0); total = len(df_en_final)
    max_weight = 50.0; weight_0 = total / (2.0 * neg_count); weight_1 = min(total / (2.0 * pos_count), max_weight)
    class_weights = torch.tensor([weight_0, weight_1], dtype=torch.float32)

    # Split, Tokenizer, Modèle
    train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.15, random_state=config['seed'], stratify=labels)
    tokenizer = AutoTokenizer.from_pretrained(config['model_checkpoint'])
    train_dataset = PolarizationDataset(train_texts, train_labels, tokenizer, config['max_length'])
    val_dataset = PolarizationDataset(val_texts, val_labels, tokenizer, config['max_length'])
    model = AutoModelForSequenceClassification.from_pretrained(config['model_checkpoint'], num_labels=2)

    # Entraînement
    output_dir = f"{config['output_dir']}/{target_class.replace('/', '_')}_PIVOT"
    training_args = TrainingArguments(output_dir=output_dir, eval_strategy='epoch', save_strategy='epoch', num_train_epochs=config['num_epochs'], per_device_train_batch_size=config['batch_size'], gradient_accumulation_steps=config['gradient_accumulation_steps'], load_best_model_at_end=True, metric_for_best_model='f1_macro', save_total_limit=1, fp16=torch.cuda.is_available(), report_to='none',
    )
    trainer = WeightedTrainer(class_weights=class_weights, model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset, tokenizer=tokenizer, compute_metrics=compute_metrics_empathy)

    print(f"  Total samples (Augmented): {len(df_en_final):,}. Positives: {df_en_final['label'].sum()}")
    trainer.train()

    # Recherche du Threshold Optimal
    preds_output = trainer.predict(val_dataset)
    probs = torch.softmax(torch.tensor(preds_output.predictions), dim=1).numpy()[:, 1]
    y_true = np.array(preds_output.label_ids)

    best_t = 0.0
    best_f1 = 0.0
    for t in np.arange(0.05, 0.95, 0.01):
        y_pred_col = (probs > t).astype(int)
        score = f1_score(y_true, y_pred_col, pos_label=1, zero_division=0)
        if score > best_f1:
            best_f1 = score
            best_t = t

    trainer.save_model(output_dir); tokenizer.save_pretrained(output_dir)
    print(f"  ✅ Entraînement {target_class} terminé. Optimal Threshold: {best_t:.4f} (F1: {best_f1:.4f})")

    return output_dir, best_t


def update_submission_task2(submission_path, test_path, model_threshold_map, config):
    print("\n--- Démarrage de la Mise à Jour de la Soumission Tâche 2 ---")

    df_test = pd.read_csv(test_path); df_submission = pd.read_csv(submission_path); df_test.columns = df_test.columns.str.strip(); df_submission.columns = df_submission.columns.str.strip()

    id_col = next((c for c in df_test.columns if c.lower() in ['id', 'id']), df_test.columns[0]);
    text_col = next((c for c in df_test.columns if c.lower() in ['text', 'text']), df_test.columns[1])
    test_texts = df_test[text_col].astype(str).tolist()

    for target_class, data in model_threshold_map.items():
        print(f"\n-> Mise à jour de la colonne : {target_class}")
        model_path = data['path']; threshold = data['threshold']; col_name_csv = LABEL_COL_MAP[target_class]

        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
        model.eval()

        all_probs_pos_class = []
        with torch.no_grad():
            for i in tqdm(range(0, len(test_texts), 64), desc=f"Predicting {target_class}"):
                batch_texts = test_texts[i:i + 64]
                encodings = tokenizer(batch_texts, max_length=config['max_length'], padding='max_length', truncation=True, return_tensors='pt').to(device)
                outputs = model(**encodings)
                probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()[:, 1]
                all_probs_pos_class.extend(probs)

        final_predictions = (np.array(all_probs_pos_class) > threshold).astype(int)

        if col_name_csv in df_submission.columns:
            df_submission[col_name_csv] = final_predictions
            print(f"  ✅ Colonne '{col_name_csv}' mise à jour. Pos Count: {df_submission[col_name_csv].sum()}")
        else:
            print(f"  ❌ ATTENTION: Colonne '{col_name_csv}' non trouvée dans le fichier de soumission.")

        del model; gc.collect(); torch.cuda.empty_cache()

    output_filename = 'polarization_english_task2_pivot_submission.csv'
    df_submission.to_csv(output_filename, index=False)
    print(f"\n✅ FICHIER DE SOUMISSION FINAL CRÉÉ: {output_filename}")
    return df_submission


# =============================================================================
# 4. EXÉCUTION DU PIPELINE COMPLET
# =============================================================================
if __name__ == "__main__":

    # 1. Entraînement des modèles spécialisés (Gender/Sexual et Other)
    MODEL_THRESHOLD_MAP = {}
    for target in TARGET_CLASSES:
        model_path, threshold = train_and_get_threshold_pivot(target, CONFIG_SPEC_TASK2)
        MODEL_THRESHOLD_MAP[target] = {'path': model_path, 'threshold': threshold}

    # 2. Mise à jour de la soumission
    update_submission_task2(
        PREVIOUS_SUBMISSION_PATH_TASK2,
        EN_TEST_FILE_PATH_TASK2,
        MODEL_THRESHOLD_MAP,
        CONFIG_SPEC_TASK2
    )

    print("\n--- PIPELINE TÂCHE 2 (PIVOT AUGMENTATION) TERMINÉ ---")

🖥️ Device: cuda | 🎯 Tâche 2: Pivot Language Augmentation

--- Démarrage de l'entraînement Poussé Pivot pour : Gender/Sexual ---


Polarized deu_Latn->EN:   0%|          | 0/48 [00:00<?, ?it/s]

Polarized ita_Latn->EN:   0%|          | 0/43 [00:00<?, ?it/s]

Round-Trip EN->DEU:   0%|          | 0/3 [00:00<?, ?it/s]

Round-Trip DEU->EN:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of RemBertForSequenceClassification were not initialized from the model checkpoint at google/rembert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Total samples (Augmented): 6,174. Positives: 3024


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Positive Class
1,No log,0.697266,0.337857,0.0
2,0.707100,0.704349,0.337857,0.0
3,0.707100,0.694385,0.337857,0.0
4,0.697500,0.693507,0.337857,0.0
5,0.696400,0.705273,0.328747,0.657495
6,0.696400,0.693238,0.337857,0.0
7,0.695600,0.694135,0.337857,0.0
8,0.695400,0.693144,0.337857,0.0
9,0.695400,0.693627,0.328747,0.657495
10,0.695400,0.694388,0.328747,0.657495


  ✅ Entraînement Gender/Sexual terminé. Optimal Threshold: 0.0500 (F1: 0.6575)

--- Démarrage de l'entraînement Poussé Pivot pour : Other ---


Polarized deu_Latn->EN:   0%|          | 0/48 [00:00<?, ?it/s]

Polarized ita_Latn->EN:   0%|          | 0/43 [00:00<?, ?it/s]

Round-Trip EN->DEU:   0%|          | 0/4 [00:00<?, ?it/s]

Round-Trip DEU->EN:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of RemBertForSequenceClassification were not initialized from the model checkpoint at google/rembert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Total samples (Augmented): 6,228. Positives: 3132


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Positive Class
1,No log,0.332497,0.887428,0.892966
2,0.395000,0.718409,0.831448,0.853081
3,0.395000,0.728854,0.332143,0.0
4,0.575200,0.693712,0.33452,0.669039
5,0.697400,0.695172,0.33452,0.669039
6,0.697400,0.693726,0.332143,0.0
7,0.696500,0.694938,0.332143,0.0
8,0.696400,0.699602,0.332143,0.0
9,0.696400,0.69419,0.33452,0.669039
10,0.695100,0.693199,0.332143,0.0


  ✅ Entraînement Other terminé. Optimal Threshold: 0.7500 (F1: 0.9107)

--- Démarrage de la Mise à Jour de la Soumission Tâche 2 ---

-> Mise à jour de la colonne : Gender/Sexual


Predicting Gender/Sexual:   0%|          | 0/3 [00:00<?, ?it/s]

  ✅ Colonne 'gender/sexual' mise à jour. Pos Count: 160

-> Mise à jour de la colonne : Other


Predicting Other:   0%|          | 0/3 [00:00<?, ?it/s]

  ✅ Colonne 'other' mise à jour. Pos Count: 9

✅ FICHIER DE SOUMISSION FINAL CRÉÉ: polarization_english_task2_pivot_submission.csv

--- PIPELINE TÂCHE 2 (PIVOT AUGMENTATION) TERMINÉ ---
