In [1]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ============================================
# 1. CHARGEMENT DES DONN√âES
# ============================================
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Garder seulement les colonnes n√©cessaires
train_df = train_df[['text', 'label_numeric']].rename(columns={'label_numeric': 'label'})
test_df = test_df[['text', 'label_numeric']].rename(columns={'label_numeric': 'label'})

print(f"Train: {len(train_df)} articles | Test: {len(test_df)} articles")

Train: 30916 articles | Test: 7730 articles


In [3]:
# ============================================
# 2. CONVERSION EN DATASET HUGGINGFACE
# ============================================
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [4]:
# ============================================
# 3. TOKENIZATION
# ============================================
print("\nTokenization...")
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(
        examples['text'], 
        padding="max_length", 
        truncation=True, 
        max_length=512
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)


Tokenization...


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30916/30916 [00:11<00:00, 2797.63 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7730/7730 [00:02<00:00, 2646.10 examples/s]


In [5]:
# ============================================
# 4. MOD√àLE
# ============================================
print("\nChargement du mod√®le BERT...")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=2
)

# D√©tecter si GPU disponible
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device utilis√©: {device}")



Chargement du mod√®le BERT...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Device utilis√©: cuda


In [6]:
# ============================================
# 5. M√âTRIQUES D'√âVALUATION
# ============================================
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary'
    )
    acc = accuracy_score(labels, preds)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [9]:
# ============================================
# 6. CONFIGURATION DE L'ENTRA√éNEMENT
# ============================================
print("\nConfiguration de l'entra√Ænement...")
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # R√©duit pour GPU 8GB
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    logging_steps=100,
    fp16=True,  # Pr√©cision mixte pour acc√©l√©rer
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


Configuration de l'entra√Ænement...


In [10]:
# ============================================
# 7. ENTRA√éNEMENT
# ============================================
print("\nEntra√Ænement en cours...")

trainer.train()


Entra√Ænement en cours...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0,0.003989,0.999483,0.999528,0.999057,1.0
2,0.0,0.001453,0.999871,0.999882,1.0,0.999764


TrainOutput(global_step=15458, training_loss=0.005121192545072739, metrics={'train_runtime': 3044.7436, 'train_samples_per_second': 20.308, 'train_steps_per_second': 5.077, 'total_flos': 1.626868277501952e+16, 'train_loss': 0.005121192545072739, 'epoch': 2.0})

In [None]:
# ============================================
# 8. √âVALUATION FINALE
# ============================================
print("\n√âvaluation finale...")
results = trainer.evaluate()

print("\n" + "="*50)
print("R√âSULTATS FINAUX")
print("="*50)
for key, value in results.items():
    print(f"{key}: {value:.4f}")

# ============================================
# 9. MATRICE DE CONFUSION
# ============================================
print("\nG√©n√©ration de la matrice de confusion...")
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

cm = confusion_matrix(labels, preds)
print("\nMatrice de confusion:")
print("                 Predicted")
print("               Fake  Real")
print(f"Actual Fake    {cm[0][0]:4d}  {cm[0][1]:4d}")
print(f"       Real    {cm[1][0]:4d}  {cm[1][1]:4d}")

# ============================================
# 10. SAUVEGARDE
# ============================================
print("\n Sauvegarde du mod√®le...")
model.save_pretrained("./baseline_model")
tokenizer.save_pretrained("./baseline_model")

print("\n Mod√®le sauvegard√© dans ./baseline_model/")
print("Entra√Ænement termin√© avec succ√®s!")


üìä √âvaluation finale...



üéØ R√âSULTATS FINAUX
eval_loss: 0.0015
eval_accuracy: 0.9999
eval_f1: 0.9999
eval_precision: 1.0000
eval_recall: 0.9998
eval_runtime: 62.0637
eval_samples_per_second: 124.5500
eval_steps_per_second: 31.1450
epoch: 2.0000

üìà G√©n√©ration de la matrice de confusion...

Matrice de confusion:
                 Predicted
               Fake  Real
Actual Fake    3491     0
       Real       1  4238

üíæ Sauvegarde du mod√®le...

‚úÖ Mod√®le sauvegard√© dans ./baseline_model/
üéâ Entra√Ænement termin√© avec succ√®s!


In [17]:

# ============================================
# 11. TEST DE PR√âDICTION - VERSION CORRIG√âE
# ============================================
print("\n Test de pr√©diction sur VRAIS articles...")

def predict_news(text):
    """Pr√©dire si une news est fake ou real"""
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        max_length=512,
        padding=True
    )
    
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        pred = torch.argmax(probs).item()
        confidence = probs[0][pred].item()
    
    label = "üî¥ FAKE" if pred == 0 else "üü¢ REAL"
    return label, confidence

# ============================================
# EXEMPLES DE VRAIS ARTICLES (pas de phrases courtes)
# ============================================
test_examples = [
    # FAKE NEWS (style sensationnaliste)
    """Breaking: Scientists at a secret lab have discovered that 
    eating chocolate every day can cure all types of cancer and 
    make you live forever! The pharmaceutical industry is hiding 
    this information because they want to keep selling expensive 
    treatments. Share this before it gets deleted!""",
    
    # REAL NEWS (style journalistique)
    """PARIS (Reuters) - The Eiffel Tower, one of the world's most 
    iconic landmarks, celebrated its 134th anniversary today. Built 
    in 1889 by engineer Gustave Eiffel, the 324-meter iron structure 
    attracts millions of visitors annually and remains a symbol of 
    French culture and engineering excellence.""",
    
    # FAKE NEWS (complot)
    """Leaked documents reveal that Bill Gates and the WHO are planning 
    to implant microchips in COVID vaccines to control the population. 
    The mainstream media refuses to report this shocking truth. Wake up 
    people! They're tracking your every move and reading your thoughts!""",
    
    # FAKE NEWS (sant√©)
    """Doctors don't want you to know this simple trick! Drinking lemon 
    water with baking soda cures diabetes in just 3 days. Big Pharma 
    hates this because it costs only $2. Thousands of people have already 
    been cured. Try it now before the government bans this information!""",
]

print("\n" + "="*70)
print(" TESTS DE PR√âDICTION SUR ARTICLES COMPLETS")
print("="*70)

for i, example in enumerate(test_examples, 1):
    label, confidence = predict_news(example)
    
    # Afficher un extrait
    preview = example[:100].replace("\n", " ") + "..."
    print(f"\n{i}. {preview}")
    print(f"   ‚Üí Pr√©diction: {label}")
    print(f"   ‚Üí Confiance: {confidence:.2%}")

# ============================================
# TEST AVEC DES VRAIS ARTICLES DU DATASET
# ============================================
print("\n" + "="*70)
print(" TESTS SUR ARTICLES R√âELS DU DATASET")
print("="*70)

# Prendre quelques exemples du test set
sample_articles = test_df.sample(5, random_state=42)

for idx, row in sample_articles.iterrows():
    label, confidence = predict_news(row['text'])
    true_label = "FAKE" if row['label'] == 0 else "REAL"
    
    # Extraire le titre ou d√©but
    preview = row['text'][:100].replace("\n", " ") + "..."
    
    match = "‚úÖ" if label.replace("üî¥ ", "").replace("üü¢ ", "") == true_label else "‚ùå"
    print(f"\n{match} Vrai: {true_label}")
    print(f"   Article: {preview}")
    print(f"   Pr√©dit: {label} ({confidence:.1%})")






 Test de pr√©diction sur VRAIS articles...

 TESTS DE PR√âDICTION SUR ARTICLES COMPLETS

1. Breaking: Scientists at a secret lab have discovered that      eating chocolate every day can cure a...
   ‚Üí Pr√©diction: üî¥ FAKE
   ‚Üí Confiance: 100.00%

2. PARIS (Reuters) - The Eiffel Tower, one of the world's most      iconic landmarks, celebrated its 13...
   ‚Üí Pr√©diction: üü¢ REAL
   ‚Üí Confiance: 100.00%

3. Leaked documents reveal that Bill Gates and the WHO are planning      to implant microchips in COVID...
   ‚Üí Pr√©diction: üî¥ FAKE
   ‚Üí Confiance: 100.00%

4. Doctors don't want you to know this simple trick! Drinking lemon      water with baking soda cures d...
   ‚Üí Pr√©diction: üî¥ FAKE
   ‚Üí Confiance: 100.00%

 TESTS SUR ARTICLES R√âELS DU DATASET

‚úÖ Vrai: REAL
   Article: TOKYO (Reuters) - Japanese Prime Minister Shinzo Abe is considering calling a snap election for as e...
   Pr√©dit: üü¢ REAL (100.0%)

‚úÖ Vrai: REAL
   Article: WASHINGTON (Reuters) - Pr

In [14]:
# ============================================
# DIAGNOSTIC DU MOD√àLE
# ============================================
print("\n DIAGNOSTIC APPROFONDI...")

# 1. V√©rifier les labels du dataset
print("\n V√©rification des labels:")
print(f"Label 0 = {test_df[test_df['label'] == 0]['label'].count()} articles")
print(f"Label 1 = {test_df[test_df['label'] == 1]['label'].count()} articles")
print("\nRappel: Label 0 = FAKE, Label 1 = REAL")

# 2. Tester sur des exemples du test set r√©el
print("\n Test sur exemples r√©els du dataset:")

# Prendre 5 fake et 5 real du test set
fake_samples = test_df[test_df['label'] == 0].sample(5)
real_samples = test_df[test_df['label'] == 1].sample(5)

print("\n FAKE NEWS du dataset:")
for idx, row in fake_samples.iterrows():
    label, conf = predict_news(row['text'])
    true_label = "FAKE" if row['label'] == 0 else "REAL"
    match = "‚úÖ" if label.replace("üî¥ ", "").replace("üü¢ ", "") == true_label else "‚ùå"
    print(f"{match} Vrai: {true_label} | Pr√©dit: {label} ({conf:.1%})")

print("\nREAL NEWS du dataset:")
for idx, row in real_samples.iterrows():
    label, conf = predict_news(row['text'])
    true_label = "FAKE" if row['label'] == 0 else "REAL"
    match = "‚úÖ" if label.replace("üî¥ ", "").replace("üü¢ ", "") == true_label else "‚ùå"
    print(f"{match} Vrai: {true_label} | Pr√©dit: {label} ({conf:.1%})")

# 3. V√©rifier les logits bruts
print("\n V√©rification des logits bruts:")
test_text = "The Eiffel Tower is in Paris, France."
inputs = tokenizer(test_text, return_tensors="pt", truncation=True, max_length=512)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    
    print(f"Logits bruts: {logits}")
    print(f"Probabilit√©s: Fake={probs[0][0]:.4f}, Real={probs[0][1]:.4f}")
    print(f"Pr√©diction: {torch.argmax(probs).item()} (0=Fake, 1=Real)")

# 4. Statistiques globales sur tout le test set
print("\n Statistiques sur TOUT le test set (√©chantillon de 100):")
sample_test = test_df.sample(100, random_state=42)
predictions = []
true_labels = []

for idx, row in sample_test.iterrows():
    label, conf = predict_news(row['text'])
    pred = 0 if "FAKE" in label else 1
    predictions.append(pred)
    true_labels.append(row['label'])

from sklearn.metrics import confusion_matrix, accuracy_score

acc = accuracy_score(true_labels, predictions)
cm = confusion_matrix(true_labels, predictions)

print(f"\nAccuracy sur 100 exemples: {acc:.2%}")
print("\nMatrice de confusion:")
print(f"              Predicted")
print(f"            Fake  Real")
print(f"Actual Fake {cm[0][0]:4d}  {cm[0][1]:4d}")
print(f"       Real {cm[1][0]:4d}  {cm[1][1]:4d}")

print(f"\n Distribution des pr√©dictions:")
fake_pred = sum(1 for p in predictions if p == 0)
real_pred = sum(1 for p in predictions if p == 1)
print(f"Pr√©dit FAKE: {fake_pred}/100 ({fake_pred}%)")
print(f"Pr√©dit REAL: {real_pred}/100 ({real_pred}%)")


 DIAGNOSTIC APPROFONDI...

 V√©rification des labels:
Label 0 = 3491 articles
Label 1 = 4239 articles

Rappel: Label 0 = FAKE, Label 1 = REAL

 Test sur exemples r√©els du dataset:

 FAKE NEWS du dataset:
‚úÖ Vrai: FAKE | Pr√©dit: üî¥ FAKE (100.0%)
‚úÖ Vrai: FAKE | Pr√©dit: üî¥ FAKE (100.0%)
‚úÖ Vrai: FAKE | Pr√©dit: üî¥ FAKE (100.0%)
‚úÖ Vrai: FAKE | Pr√©dit: üî¥ FAKE (100.0%)
‚úÖ Vrai: FAKE | Pr√©dit: üî¥ FAKE (100.0%)

REAL NEWS du dataset:
‚úÖ Vrai: REAL | Pr√©dit: üü¢ REAL (100.0%)
‚úÖ Vrai: REAL | Pr√©dit: üü¢ REAL (100.0%)
‚úÖ Vrai: REAL | Pr√©dit: üü¢ REAL (100.0%)
‚úÖ Vrai: REAL | Pr√©dit: üü¢ REAL (100.0%)
‚úÖ Vrai: REAL | Pr√©dit: üü¢ REAL (100.0%)

 V√©rification des logits bruts:
Logits bruts: tensor([[ 5.6953, -5.2773]], device='cuda:0')
Probabilit√©s: Fake=1.0000, Real=0.0000
Pr√©diction: 0 (0=Fake, 1=Real)

 Statistiques sur TOUT le test set (√©chantillon de 100):

Accuracy sur 100 exemples: 100.00%

Matrice de confusion:
              Predicted
            F