# Étape 3 — Modélisation (Deep Learning)

Ce notebook entraîne des modèles de **classification de texte** sur vos tables MySQL (`news` et `labeled`).
Nous comparons :
- **Baseline TF‑IDF + Logistic Regression** (référence rapide)
- **DistilBERT** (fine‑tuning efficace, réalisable sur CPU dans un simple venv)

Pourquoi DistilBERT ?
- Très bon compromis **performance/ressources** vs BERT
- Fine‑tuning rapide en 1–3 epochs, même sans GPU

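Si vous ajoutez des textes en arabe ou en français sans les traduire, utilisez **XLM‑RoBERTa** (multilingue) : le modèle `distilbert-base-uncased` utilisé ici est pré‑entraîné uniquement sur de l'anglais.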


In [1]:
import warnings
warnings.filterwarnings('ignore')

import os, random
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Seed the Python, NumPy and PyTorch random generators with one shared value;
# SEED is also passed as random_state to the train/test splits further down.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x18ff220b830>

In [2]:
# MySQL connection (with CSV fallback).
# Each setting can be overridden by an environment variable of the same name;
# the defaults are the values this notebook originally hard-coded.
from urllib.parse import quote_plus

DB_USERNAME = os.environ.get('DB_USERNAME', 'root')
DB_PASSWORD = os.environ.get('DB_PASSWORD', '')
DB_HOST = os.environ.get('DB_HOST', 'localhost')
DB_PORT = os.environ.get('DB_PORT', '3306')
DB_NAME = os.environ.get('DB_NAME', 'dataControl')

# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# user name or password would otherwise be parsed as URL delimiters.
conn_str = (
    f'mysql+pymysql://{quote_plus(DB_USERNAME)}:{quote_plus(DB_PASSWORD)}'
    f'@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)
print('='*70)
print('CONNEXION & CHARGEMENT')
print('='*70)
try:
    engine = create_engine(conn_str)
    # Open (and immediately close) a connection so that an unreachable server
    # is reported before any table is read.
    with engine.connect() as _:
        print('✓ Connexion MySQL réussie!')
    df_news = pd.read_sql('SELECT * FROM news', engine)
    df_labeled = pd.read_sql('SELECT * FROM labeled', engine)
except Exception as e:
    # Deliberate best-effort: any database failure (driver, connection,
    # missing table) is reported and the CSV exports are loaded instead.
    print(f'✗ Erreur MySQL: {e}')
    print('⚠️ Fallback CSV: data/*.csv')
    df_news = pd.read_csv('data/fake_news_dataset.csv')
    df_labeled = pd.read_csv('data/labeled_data.csv')

# Choix des colonnes texte
def pick_text_column(df):
    """Return the name of the first known text column present in ``df``.

    Candidates are tried in priority order: ``text_processed``,
    ``text_cleaned``, ``text``, ``tweet``, ``title``.

    Raises:
        ValueError: if ``df`` contains none of the candidate columns.
    """
    candidates = ('text_processed', 'text_cleaned', 'text', 'tweet', 'title')
    chosen = next((name for name in candidates if name in df.columns), None)
    if chosen is None:
        raise ValueError('Aucune colonne texte trouvée')
    return chosen

# Resolve the text column of each table (news first, then labeled) and
# report which one was picked.
text_col_news, text_col_lab = (
    pick_text_column(frame) for frame in (df_news, df_labeled)
)
print(f'NEWS text column: {text_col_news}')
print(f'LABELED text column: {text_col_lab}')


CONNEXION & CHARGEMENT
✓ Connexion MySQL réussie!
NEWS text column: text_processed
LABELED text column: text_processed


In [3]:
# Préparation des labels
def map_news_label(v):
    """Map a raw ``news`` label to a binary target: 1 for fake, 0 otherwise.

    The comparison ignores case and surrounding whitespace, so ``'FAKE'`` and
    ``' Fake '`` are both recognised. Missing values (``None``/NaN) and every
    other label map to 0.
    """
    if pd.isna(v):
        return 0
    # Strip before comparing: padded labels would otherwise silently become 0.
    return 1 if str(v).strip().lower() == 'fake' else 0

def map_lab_class(v):
    """Collapse the 3-way ``class`` code into a binary target.

    Codes 0 (hate_speech) and 1 (offensive_language) map to 1 (hate);
    code 2 (neither) and every other value map to 0 (normal).

    The value is parsed through ``float`` so integer-valued inputs such as
    ``'1.0'`` or ``1.0`` are accepted. Fractional numbers, NaN/inf and values
    that cannot be parsed map to 0 rather than being truncated onto a hate
    code.
    """
    try:
        code = float(v)
    except (TypeError, ValueError, OverflowError):
        return 0
    # NaN and +/-inf are not integers either, so they fall out here as well.
    if not code.is_integer():
        return 0
    return 1 if int(code) in (0, 1) else 0

# Build the binary targets and string features for both tables. When a table
# lacks its label column, every row receives the default target 0.
if 'label' in df_news.columns:
    y_news = df_news['label'].apply(map_news_label)
else:
    y_news = pd.Series([0] * len(df_news))
X_news = df_news[text_col_news].astype(str)

if 'class' in df_labeled.columns:
    y_lab = df_labeled['class'].apply(map_lab_class)
else:
    y_lab = pd.Series([0] * len(df_labeled))
X_lab = df_labeled[text_col_lab].astype(str)

print('NEWS labels: fake=1, normal=0')
print('LABELED labels: hate=1, normal=0')


NEWS labels: fake=1, normal=0
LABELED labels: hate=1, normal=0


## Baseline — TF‑IDF + Logistic Regression
Simple, rapide, utile pour comparer avec des modèles profonds.


In [4]:
def run_tfidf_lr(X, y, title):
    """Train and score a TF-IDF + logistic-regression baseline.

    Splits ``X``/``y`` 80/20 (stratified on ``y``, seeded with ``SEED``), fits
    a unigram+bigram TF-IDF vocabulary capped at 50,000 features on the
    training texts only, trains a logistic regression, then prints accuracy,
    F1 and the classification report for the held-out split.

    Args:
        X: texts to classify.
        y: binary targets aligned with ``X``.
        title: tag shown in the printed summary line.

    Returns:
        ``(accuracy, f1)`` measured on the held-out split.
    """
    texts_train, texts_test, labels_train, labels_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED, stratify=y
    )

    # The vocabulary is learned from the training texts only; the test texts
    # are merely projected onto it.
    vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1, 2))
    features_train = vectorizer.fit_transform(texts_train)
    features_test = vectorizer.transform(texts_test)

    classifier = LogisticRegression(max_iter=200)
    classifier.fit(features_train, labels_train)
    predictions = classifier.predict(features_test)

    acc = accuracy_score(labels_test, predictions)
    f1 = f1_score(labels_test, predictions)
    print(f'[{title}] Accuracy: {acc:.4f} | F1: {f1:.4f}')
    print(classification_report(labels_test, predictions, digits=4))
    return acc, f1

# Run the baseline on both datasets. The last call stays a bare expression so
# the notebook displays its (accuracy, f1) result.
print('=' * 70, 'BASELINE TF-IDF + LR', '=' * 70, sep='\n')
run_tfidf_lr(X_news, y_news, 'NEWS')
run_tfidf_lr(X_lab, y_lab, 'LABELED')


BASELINE TF-IDF + LR
[NEWS] Accuracy: 0.5022 | F1: 0.5071
              precision    recall  f1-score   support

           0     0.4995    0.4952    0.4973      1989
           1     0.5049    0.5092    0.5071      2011

    accuracy                         0.5022      4000
   macro avg     0.5022    0.5022    0.5022      4000
weighted avg     0.5022    0.5022    0.5022      4000

[LABELED] Accuracy: 0.9307 | F1: 0.9593
              precision    recall  f1-score   support

           0     0.8754    0.6847    0.7684       831
           1     0.9391    0.9803    0.9593      4121

    accuracy                         0.9307      4952
   macro avg     0.9072    0.8325    0.8638      4952
weighted avg     0.9284    0.9307    0.9272      4952



(0.930735056542811, 0.9592781669238989)

## DistilBERT — Fine‑tuning
Plus puissant que la baseline, tout en restant accessible sans GPU.


In [None]:
class TextDataset(Dataset):
    """Torch dataset holding pre-tokenized texts and their integer labels.

    All texts are tokenized once at construction time, with truncation and
    ``max_length`` padding requested at ``max_len`` tokens; items are turned
    into tensors lazily on access.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.enc = tokenizer(
            list(texts),
            truncation=True,
            padding='max_length',
            max_length=max_len,
        )
        self.labels = list(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per tokenizer output field, plus the target under 'labels'.
        sample = {}
        for field, rows in self.enc.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample

def train_distilbert(X, y, title, out_dir, model_name='distilbert-base-uncased',
                     num_epochs=2, batch_size=8, max_len=256):
    """Fine-tune a sequence-classification checkpoint on a binary text task.

    The data is split 80/20 (stratified on ``y``, seeded with ``SEED``), the
    model is trained with the Hugging Face ``Trainer``, evaluated on the
    held-out split, and saved to ``out_dir`` together with its tokenizer so
    the directory can be reloaded on its own.

    Args:
        X: texts to classify.
        y: binary targets aligned with ``X``.
        title: tag shown in the printed summary line.
        out_dir: directory receiving checkpoints, the final model and the
            tokenizer files.
        model_name: pretrained checkpoint to fine-tune; pass a multilingual
            checkpoint for non-English text.
        num_epochs: number of training epochs.
        batch_size: per-device batch size for both training and evaluation.
        max_len: token length every text is truncated/padded to.

    Returns:
        The metrics dict returned by ``trainer.evaluate()`` (includes
        ``eval_accuracy`` and ``eval_f1``).
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=SEED, stratify=y
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    train_ds = TextDataset(X_train, y_train, tokenizer, max_len=max_len)
    val_ds = TextDataset(X_val, y_val, tokenizer, max_len=max_len)

    def compute_metrics(p):
        # Score against the labels the Trainer gathered alongside these
        # predictions instead of a closed-over split, so the metrics stay
        # correct whichever dataset is being evaluated.
        preds = np.argmax(p.predictions, axis=1)
        labels = p.label_ids
        return {
            'accuracy': accuracy_score(labels, preds),
            'f1': f1_score(labels, preds),
        }

    args = TrainingArguments(
        output_dir=out_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        logging_steps=50,
        learning_rate=2e-5,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    metrics = trainer.evaluate()
    print(f"[{title}] DistilBERT → Accuracy: {metrics['eval_accuracy']:.4f} | F1: {metrics['eval_f1']:.4f}")
    trainer.save_model(out_dir)
    # The Trainer was not given the tokenizer, so save it explicitly next to
    # the model weights.
    tokenizer.save_pretrained(out_dir)
    return metrics

# Fine-tune one model per dataset, each saved to its own directory.
print('=' * 70, 'DISTILBERT FINE-TUNING', '=' * 70, sep='\n')
for features, targets, tag, save_dir in (
    (X_news, y_news, 'NEWS', './models/news_distilbert'),
    (X_lab, y_lab, 'LABELED', './models/labeled_distilbert'),
):
    train_distilbert(features, targets, tag, save_dir)

print("\n✅ Étape 3 complétée — modèles entraînés et sauvegardés.")

DISTILBERT FINE-TUNING


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.6981
100,0.7047
150,0.6954
