In [5]:
# Imports + paramètres

import os

import joblib
import pandas as pd
from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/drive')

# Single project folder on Drive. Every path below is derived from it, so
# relocating the project only requires editing this one constant.
PROJECT_DIR = "/content/drive/MyDrive/Formation DATA ANALYST/P12"

# Serialized artifact that is loaded with joblib further down.
ARTIFACT_PATH = f"{PROJECT_DIR}/billets_artifact.joblib"
# CSV to score -- point this at the file that must be predicted.
INPUT_PATH = f"{PROJECT_DIR}/billets_test.csv"
# Destination CSV for the input rows enriched with the predictions.
OUTPUT_PATH = f"{PROJECT_DIR}/billets_production_predictions.csv"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Load the serialized artifact and validate its structure.
# (`os` is imported once, in the import block at the top of the notebook.)

# Fail early, with a readable message, if the artifact file is missing.
assert os.path.exists(ARTIFACT_PATH), f"Artefact introuvable : {ARTIFACT_PATH}"

# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
# code. Only load artifacts that come from a trusted source.
artifact = joblib.load(ARTIFACT_PATH)

# Structure checks: the artifact must be a dict exposing 'model' and 'features'.
assert isinstance(artifact, dict), "L'artefact chargé n'est pas un dictionnaire."
assert "model" in artifact and "features" in artifact, "L'artefact doit contenir 'model' et 'features'."

# The fitted estimator and the list of columns it expects as input.
model = artifact["model"]
FEATURES = artifact["features"]

print("Artefact chargé avec succès.")
print("Features attendues :", FEATURES)

# Show the pipeline step names when the model exposes `.steps`.
# Only the errors that a missing or malformed `steps` attribute can raise are
# caught, so unrelated failures are no longer silently swallowed.
try:
    print("Étapes de la pipeline :", [name for name, _ in model.steps])
except (AttributeError, TypeError, ValueError):
    print("Info pipeline non disponible (objet non pipeline).")


Artefact chargé avec succès.
Features attendues : ['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up', 'length']
Étapes de la pipeline : ['simpleimputer', 'standardscaler', 'logisticregression']


In [7]:
# Load the production data and run sanity checks.
# (`os` is imported once, in the import block at the top of the notebook.)

# Fail early, with a readable message, if the input file is missing.
assert os.path.exists(INPUT_PATH), f"Fichier d'entrée introuvable : {INPUT_PATH}"

df_prod = pd.read_csv(INPUT_PATH)

print("Fichier production chargé.")
print("Dimensions :", df_prod.shape)
display(df_prod.head())

# Every feature listed in the artifact must be present in the input file.
# The message reports the file actually read (INPUT_PATH) rather than a
# hard-coded name that can drift from the configured path.
missing_cols = [c for c in FEATURES if c not in df_prod.columns]
assert not missing_cols, f"Colonnes manquantes dans {INPUT_PATH} : {missing_cols}"

# Report missing values per feature. This is informational only: nothing here
# stops execution when NaN are found.
na_counts = df_prod[FEATURES].isna().sum()
print("\nNombre de NaN par feature (production) :")
na_present = na_counts[na_counts > 0]
if na_present.empty:
    # print (not display) so the message is shown without repr quotes.
    print("Aucun NaN sur les features.")
else:
    display(na_present)


Fichier production chargé.
Dimensions : (5, 7)


Unnamed: 0,diagonal,height_left,height_right,margin_low,margin_up,length,id
0,172.09,103.95,103.73,4.39,3.09,113.19,B_1
1,171.52,104.17,104.03,5.27,3.16,111.82,B_2
2,171.78,103.8,103.75,3.81,3.24,113.39,B_3
3,172.02,104.08,103.99,5.57,3.3,111.1,B_4
4,171.79,104.34,104.37,5.0,3.07,111.87,B_5



Nombre de NaN par feature (production) :


'Aucun NaN sur les features.'

In [8]:
# Predict on the production data and export the results.

# Feature matrix restricted to (and ordered like) the columns listed in the artifact.
X_prod = df_prod[FEATURES]

# Predicted class for each row.
pred_class = model.predict(X_prod)

# Probability of the True class, when the model exposes predict_proba.
proba_true = None
if hasattr(model, "predict_proba"):
    # predict_proba columns follow the order of model.classes_. Locate the
    # True label instead of assuming it sits in column 1; fall back to
    # column 1 (the previous behaviour) when no such label is exposed.
    class_labels = list(getattr(model, "classes_", []))
    true_col = class_labels.index(True) if True in class_labels else 1
    proba_true = model.predict_proba(X_prod)[:, true_col]

# Output frame: a copy of the input rows plus the prediction columns, so the
# loaded frame itself is left unmodified.
df_out = df_prod.copy()
df_out["is_genuine_pred"] = pred_class

if proba_true is not None:
    df_out["proba_true"] = proba_true

# Quick summary: number of rows per predicted class.
print("Résumé des prédictions :")
display(df_out["is_genuine_pred"].value_counts(dropna=False))

# Export to CSV without writing the positional index.
df_out.to_csv(OUTPUT_PATH, index=False)
print(f"Fichier exporté : {OUTPUT_PATH}")

display(df_out.head())


Résumé des prédictions :


Unnamed: 0_level_0,count
is_genuine_pred,Unnamed: 1_level_1
False,3
True,2


Fichier exporté : /content/drive/MyDrive/Formation DATA ANALYST/P12/billets_production_predictions.csv


Unnamed: 0,diagonal,height_left,height_right,margin_low,margin_up,length,id,is_genuine_pred,proba_true
0,172.09,103.95,103.73,4.39,3.09,113.19,B_1,True,0.995288
1,171.52,104.17,104.03,5.27,3.16,111.82,B_2,False,0.003422
2,171.78,103.8,103.75,3.81,3.24,113.39,B_3,True,0.999399
3,172.02,104.08,103.99,5.57,3.3,111.1,B_4,False,3e-05
4,171.79,104.34,104.37,5.0,3.07,111.87,B_5,False,0.011461
