In [1]:
# Mount Google Drive so the CSV files referenced below are reachable
# under /content/drive (Colab only).
from google.colab import drive

drive.mount(mountpoint='/content/drive')


Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
import torch
from transformers import (
    CamembertTokenizer,
    CamembertForSequenceClassification,
    Trainer,
    TrainingArguments
)

In [5]:
# Location (on the mounted Drive) of the Facebook posts CSV used to train the
# topic classifier.
path_vers_mon_fichier = '/content/drive/My Drive/HACKATON_MEDIA_SCAN/data/facebook/facebook_posts_final.csv'

In [7]:
# Load the raw posts into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path_vers_mon_fichier)

In [8]:
# Use the generic text/label column names expected by the rest of the pipeline.
df = df.rename({'contenu': 'text', 'categorie': 'label'}, axis='columns')

In [9]:
# Keep only the two columns of interest and discard rows missing either one.
df = df.loc[:, ['text', 'label']].dropna(axis=0, how='any')

In [10]:
# Topic categories; the position in this list is the integer class id.
categories = [
    "Politique",
    "Gouvernance",
    "Économie",
    "Sécurité",
    "Santé",
    "Culture",
    "Sport",
    "Autres",
    "Social",
    "Environnement",
    "Diplomatie",
    "Justice",
    "Humanitaire",
]
label_map = dict(zip(categories, range(len(categories))))
# Category names absent from `label_map` become NaN here.
df = df.assign(label=df['label'].map(label_map))

In [11]:
# Drop rows whose category was not found in `categories` (mapped to NaN),
# then store the class ids as integers.
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

In [12]:
# Report the dataset size and the per-class sample counts.
print(
    f"Données prêtes : {len(df)} échantillons.",
    "Distribution des labels :",
    df['label'].value_counts(),
    sep="\n",
)

Données prêtes : 1999 échantillons.
Distribution des labels :
label
7     561
1     228
8     226
12    157
5     152
2     145
6     121
3     113
4      87
9      71
0      58
11     52
10     28
Name: count, dtype: int64


In [13]:
# --- 2. Train/test split ---
# 80/20 split, stratified on the label so each class keeps its proportion.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [14]:
# Wrap both pandas splits as Hugging Face `Dataset` objects.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

In [25]:
# --- 3. Tokenization ---
MODEL_NAME = "camembert-base"
tokenizer = CamembertTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch.

    Sequences are padded/truncated to exactly 128 tokens; the original author
    chose 128 as faster and sufficient for Facebook posts.
    """
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(examples['text'], **encoding_options)


print("Tokenization...")
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Tokenization...


Map:   0%|          | 0/1599 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [19]:
# --- 4. Model configuration ---
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Utilisation de : {device}")

# Store the id <-> category-name mapping in the model config so the exported
# model reports real category names instead of generic LABEL_<i> placeholders.
model = CamembertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(categories),
    id2label=dict(enumerate(categories)),
    label2id={name: idx for idx, name in enumerate(categories)},
).to(device)

Utilisation de : cuda


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# --- 5. Metrics ---
def compute_metrics(eval_pred):
    """Compute accuracy and macro-F1 from a (logits, labels) pair.

    The predicted class of each sample is the argmax of its logits over the
    last axis.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_macro": f1_score(labels, predictions, average='macro'),
    }

In [36]:
# --- 6. Training (tuned for speed) ---
print("Configuration de l'entraînement...")

# NOTE(review): the recorded run logs a training loss of ~0.06 after the first
# epoch, which suggests `model` had already been fine-tuned by an earlier
# execution of this cell. Re-run the model-creation cell before training to get
# reproducible numbers.
# NOTE(review): the test split is also used to pick the best checkpoint
# (load_best_model_at_end), so the final metrics are optimistic; consider a
# separate validation split.
training_args = TrainingArguments(
    output_dir="./camembert_classifier",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",             # must match eval_strategy for best-model loading
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",  # key returned by compute_metrics
    fp16=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated in Trainer; `processing_class` is its replacement.
    processing_class=tokenizer,
)

print("--- DÉBUT DE L'ENTRAÎNEMENT ---")
trainer.train()
print("--- FIN DE L'ENTRAÎNEMENT ---")

Configuration de l'entraînement...
--- DÉBUT DE L'ENTRAÎNEMENT ---


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.0623,2.395884,0.625,0.596692
2,0.1357,2.309204,0.65,0.634639
3,0.1297,2.305777,0.66,0.654222
4,0.0583,2.29718,0.6675,0.655734
5,0.0726,2.512315,0.6575,0.644197
6,0.0487,2.467558,0.66,0.65185
7,0.0235,2.409191,0.6775,0.662314
8,0.024,2.474552,0.665,0.65333
9,0.0286,2.500126,0.66,0.652313
10,0.0159,2.515598,0.6625,0.654759


--- FIN DE L'ENTRAÎNEMENT ---


In [37]:
# --- 7. Sauvegarde finale ---
print("Sauvegarde du meilleur modèle...")
trainer.save_model("./best_model")
tokenizer.save_pretrained("./best_model")

print("Évaluation finale sur le set de test :")
print(trainer.evaluate())

# --- 8. Compression pour téléchargement ---
!zip -r camembert_classifier_model.zip ./best_model

Sauvegarde du meilleur modèle...
Évaluation finale sur le set de test :


{'eval_loss': 2.4091906547546387, 'eval_accuracy': 0.6775, 'eval_f1_macro': 0.6623135610427359, 'eval_runtime': 0.89, 'eval_samples_per_second': 449.432, 'eval_steps_per_second': 28.089, 'epoch': 10.0}
  adding: best_model/ (stored 0%)
  adding: best_model/added_tokens.json (stored 0%)
  adding: best_model/tokenizer_config.json (deflated 81%)
  adding: best_model/sentencepiece.bpe.model (deflated 49%)
  adding: best_model/special_tokens_map.json (deflated 52%)
  adding: best_model/config.json (deflated 60%)
  adding: best_model/training_args.bin (deflated 53%)
  adding: best_model/model.safetensors (deflated 11%)


ENTRAÎNEMENT DU MODÈLE DE DÉTECTION DE CONTENU SENSIBLE



In [6]:
# Location (on the mounted Drive) of the labelled comments CSV used to train
# the sensitive-content classifier.
path_vers_mon_fichier ='/content/drive/MyDrive/HACKATON_MEDIA_SCAN/data/facebook/comments_FOR_TRAINING.csv'

In [7]:
# Load the labelled comments into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path_vers_mon_fichier)

In [9]:

# Use the generic text/label column names, keep only those two columns and
# discard rows missing either one.
df = (
    df.rename({'comment_text': 'text', 'true_category': 'label'}, axis='columns')
    .loc[:, ['text', 'label']]
    .dropna(axis=0, how='any')
)

In [10]:
# Sensitive-content classes; the position in this list is the integer class id.
categories = [
    'normal',
    'toxic',
    'hateful',
    'misinfo',
    'adult',
]

In [11]:
# Map each category name to its integer class id (its index in `categories`).
label_map = {label: i for i, label in enumerate(categories)}
df['label'] = df['label'].map(label_map)

# Categories missing from `categories` map to NaN, which also turns the column
# into floats. Drop those rows and cast back to int, as the topic pipeline
# does, so the stratified split and the classifier only see valid class ids.
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

In [12]:
# Report the dataset size and the per-class sample counts.
print(
    f"Données prêtes : {len(df)} échantillons.",
    "Distribution des labels :",
    df['label'].value_counts(),
    sep="\n",
)

Données prêtes : 1206 échantillons.
Distribution des labels :
label
0    963
2     64
4     61
1     61
3     57
Name: count, dtype: int64


In [13]:
# --- 2. Train/test split ---
# A larger test share (30%) is used because the dataset is small; the split is
# stratified on the label so each class keeps its proportion.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['label'],
)

# Wrap both pandas splits as Hugging Face `Dataset` objects.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

In [14]:
# --- 3. Tokenization ---
MODEL_NAME = "camembert-base"
tokenizer = CamembertTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, padded/truncated to 128 tokens."""
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(examples['text'], **encoding_options)


print("Tokenization...")
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Tokenization...


Map:   0%|          | 0/844 [00:00<?, ? examples/s]

Map:   0%|          | 0/362 [00:00<?, ? examples/s]

In [15]:
# --- 4. Model configuration ---
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Utilisation de : {device}")

# Store the id <-> category-name mapping in the model config so the exported
# model reports real category names instead of generic LABEL_<i> placeholders.
model = CamembertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(categories),  # 5 labels for this task
    id2label=dict(enumerate(categories)),
    label2id={name: idx for idx, name in enumerate(categories)},
).to(device)

Utilisation de : cuda


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# --- 5. Metrics (the focus is on precision) ---
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and macro precision/recall/F1 from a (logits, labels) pair.

    The predicted class of each sample is the argmax of its logits over the
    last axis. Macro averaging weights every class equally.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Early in training some classes may never be predicted, which makes their
    # precision undefined. zero_division=0 scores those classes as 0 explicitly
    # instead of relying on the default, which does the same but emits an
    # UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, predictions)

    return {
        "accuracy": acc,
        "f1_macro": f1,
        "precision_macro": precision,  # the metric targeted to exceed 0.75
        "recall_macro": recall
    }

In [21]:
# --- 6. Training (anti-overfitting settings) ---
print("Configuration de l'entraînement (Détection Sensible)...")

# NOTE(review): the test split is also used to pick the best checkpoint
# (load_best_model_at_end), so the final metrics are optimistic; consider a
# separate validation split.
training_args = TrainingArguments(
    output_dir="./sensitive_classifier",
    num_train_epochs=5,
    per_device_train_batch_size=16,  # suits a small dataset
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs_sensitive',
    logging_steps=20,
    eval_strategy="epoch",
    save_strategy="epoch",           # must match eval_strategy for best-model loading
    load_best_model_at_end=True,
    metric_for_best_model="precision_macro",  # checkpoint selection optimises precision
    fp16=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated in Trainer; `processing_class` is its replacement.
    processing_class=tokenizer,
)

print("--- DÉBUT DE L'ENTRAÎNEMENT (MODULE 5) ---")
trainer.train()
print("--- FIN DE L'ENTRAÎNEMENT (MODULE 5) ---")

Configuration de l'entraînement (Détection Sensible)...
--- DÉBUT DE L'ENTRAÎNEMENT (MODULE 5) ---


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.7669,0.539485,0.798343,0.177573,0.159669,0.2
2,0.4089,0.331691,0.969613,0.864158,0.921034,0.871242
3,0.316,0.278836,0.991713,0.970939,0.97681,0.965944
4,0.3118,0.247061,0.994475,0.982484,0.987546,0.977709
5,0.2431,0.238429,0.994475,0.982484,0.987546,0.977709


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


--- FIN DE L'ENTRAÎNEMENT (MODULE 5) ---


In [22]:
# --- 7. Sauvegarde finale ---
print("Sauvegarde du meilleur modèle (Détection Sensible)...")
trainer.save_model("./best_model_sensitive")
tokenizer.save_pretrained("./best_model_sensitive")

print("Évaluation finale V2 (meilleur modèle) :")
print(trainer.evaluate()) # Regarde 'eval_precision_macro' ici !

# --- 8. Compression pour téléchargement ---
!zip -r sensitive_classifier_model.zip ./best_model_sensitive

Sauvegarde du meilleur modèle (Détection Sensible)...
Évaluation finale V2 (meilleur modèle) :


{'eval_loss': 0.24706120789051056, 'eval_accuracy': 0.994475138121547, 'eval_f1_macro': 0.982484465568891, 'eval_precision_macro': 0.9875456389452333, 'eval_recall_macro': 0.9777089783281733, 'eval_runtime': 0.8296, 'eval_samples_per_second': 436.374, 'eval_steps_per_second': 27.725, 'epoch': 5.0}
  adding: best_model_sensitive/ (stored 0%)
  adding: best_model_sensitive/added_tokens.json (stored 0%)
  adding: best_model_sensitive/tokenizer_config.json (deflated 81%)
  adding: best_model_sensitive/sentencepiece.bpe.model (deflated 49%)
  adding: best_model_sensitive/special_tokens_map.json (deflated 52%)
  adding: best_model_sensitive/config.json (deflated 55%)
  adding: best_model_sensitive/training_args.bin (deflated 53%)
  adding: best_model_sensitive/model.safetensors (deflated 13%)
