# Réentraînement sur données vectorisées (ref + prod)
Ce notebook réentraîne un modèle à partir de features déjà vectorisées :
- `ref_data.csv` : données de référence vectorisées (SVD)
- `prod_data_vectorized*.csv` : feedback prod vectorisé (même schéma que ref, + colonne `prediction`)

`DataModeling.ipynb` reste le notebook d'entraînement initial (texte → TF-IDF → SVD).

In [1]:
# Imports et configuration
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, precision_recall_fscore_support
)

RANDOM_SEED = 42
DATA_DIR = Path('../data')
ARTIFACT_DIR = Path('../artifacts')
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

REF_PATH = DATA_DIR / 'ref_data.csv'
PROD_PATH = DATA_DIR / 'prod_data.csv'

# Écraser le modèle et les métriques utilisés en prod
MODEL_OUT = ARTIFACT_DIR / 'phishing_tfidf_logreg.joblib'
METRICS_OUT = ARTIFACT_DIR / 'metrics.json'

# Charger les artefacts figés du modèle initial
original_pipeline = joblib.load(ARTIFACT_DIR / 'phishing_tfidf_logreg.joblib')
svd_frozen = joblib.load(ARTIFACT_DIR / 'svd_ref.joblib')

print(f"Original pipeline steps: {[name for name, _ in original_pipeline.steps]}")
print(f"SVD n_components: {svd_frozen.n_components}")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Original pipeline steps: ['tfidf', 'svd', 'clf']
SVD n_components: 200


In [2]:
# Chargement ref et prod vectorisés
ref_df = pd.read_csv(REF_PATH)
prod_df = pd.read_csv(PROD_PATH)

# Retirer la colonne prediction du fichier prod (elle ne sert pas à l'entraînement)
if 'prediction' in prod_df.columns:
    prod_df = prod_df.drop(columns=['prediction'])
if 'proba_phishing' in prod_df.columns:
    prod_df = prod_df.drop(columns=['proba_phishing'])

# Vérification du schéma (les features doivent matcher)
ref_features = [c for c in ref_df.columns if c != 'target']
prod_features = [c for c in prod_df.columns if c != 'target']

missing_in_prod = [c for c in ref_features if c not in prod_features]
extra_in_prod = [c for c in prod_features if c not in ref_features]
if missing_in_prod or extra_in_prod:
    raise ValueError(f"Schéma incohérent. Manquantes: {missing_in_prod[:5]} | Extra: {extra_in_prod[:5]}")

# Réordonner prod pour coller exactement à ref (par prudence)
prod_df = prod_df[ref_features + ['target']]

print('ref shape', ref_df.shape, 'prod shape', prod_df.shape)

# Concat ref + prod
train_df = pd.concat([ref_df, prod_df], axis=0, ignore_index=True)
print('concat shape', train_df.shape)


ref shape (5000, 201) prod shape (15, 201)
concat shape (5015, 201)


In [3]:
# Features / cible
feature_cols = [c for c in train_df.columns if c != 'target']
X = train_df[feature_cols].values
y = train_df['target'].astype(int).values

# Split 70 / 15 / 15 (stratifié),
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_SEED, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.50, random_state=RANDOM_SEED, stratify=y_tmp
)

print('train', X_train.shape, 'val', X_val.shape, 'test', X_test.shape)

# Entraînement final sur train + val
X_train_final = np.concatenate([X_train, X_val])
y_train_final = np.concatenate([y_train, y_val])

train (3510, 200) val (752, 200) test (753, 200)


In [4]:
# Entraînement Logistic Regression sur features déjà vectorisées (train + val)
clf = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=RANDOM_SEED
)
clf.fit(X_train_final, y_train_final)
print('modèle entraîné (train+val)')

modèle entraîné (train+val)


In [5]:
# Évaluation
def evaluate(model, X_eval, y_eval, label='eval'):
    y_pred = model.predict(X_eval)
    y_score = model.predict_proba(X_eval)[:, 1] if hasattr(model, 'predict_proba') else None

    acc = accuracy_score(y_eval, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_eval, y_pred, average='binary', zero_division=0)
    auc = roc_auc_score(y_eval, y_score) if y_score is not None else None
    cm = confusion_matrix(y_eval, y_pred)

    print(f"== {label} ==")
    print(f"Accuracy: {acc:.4f} | Precision: {p:.4f} | Recall: {r:.4f} | F1: {f1:.4f}")
    if auc is not None:
        print(f"AUC: {auc:.4f}")
    print('Confusion matrix:\n', cm)

    out = {'accuracy': acc, 'precision': p, 'recall': r, 'f1': f1}
    if auc is not None:
        out['auc'] = auc
    return out

metrics = {
    'val': evaluate(clf, X_val, y_val, 'val'),
    'test': evaluate(clf, X_test, y_test, 'test')
}

== val ==
Accuracy: 0.9734 | Precision: 0.9479 | Recall: 0.9820 | F1: 0.9647
AUC: 0.9976
Confusion matrix:
 [[459  15]
 [  5 273]]
== test ==
Accuracy: 0.9562 | Precision: 0.9239 | Recall: 0.9604 | F1: 0.9418
AUC: 0.9937
Confusion matrix:
 [[453  22]
 [ 11 267]]


In [6]:
# Sauvegarde modèle + métriques
# SOLUTION: Reconstruire le pipeline avec TF-IDF + SVD + nouvelle LR
# Extraire TF-IDF du pipeline original et ajouter SVD + nouvelle LR
tfidf_component = original_pipeline.named_steps['tfidf']

retrained_pipeline = Pipeline([
    ('tfidf', tfidf_component),  # TF-IDF figé du modèle initial
    ('svd', svd_frozen),          # SVD figé (200 dim)
    ('clf', clf)                  # Nouvelle LogisticRegression réentraînée
])

joblib.dump(retrained_pipeline, MODEL_OUT)
with open(METRICS_OUT, 'w', encoding='utf-8') as f:
    json.dump(metrics, f, ensure_ascii=False, indent=2)

print('Pipeline réentraîné (TF-IDF + SVD + nouvelle LR) enregistré:', MODEL_OUT)
print('Métriques enregistrées:', METRICS_OUT)


Pipeline réentraîné (TF-IDF + SVD + nouvelle LR) enregistré: ../artifacts/phishing_tfidf_logreg.joblib
Métriques enregistrées: ../artifacts/metrics.json
