In [4]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from collections import Counter
import matplotlib.pyplot as plt
import joblib
import mlflow
import mlflow.pyfunc
from mlflow.models.signature import infer_signature
from typing import Any

BERT

In [5]:
# MLflow pyfunc wrapper: text column in, classifier predictions out
class BertWrapper(mlflow.pyfunc.PythonModel):
    """Bundle a sentence embedder and a fitted classifier behind the MLflow pyfunc API.

    Parameters
    ----------
    model
        Fitted estimator whose ``predict`` is called on the sentence embeddings.
    mlb
        Label binarizer kept alongside the model. It is stored but not used by
        ``predict``; presumably callers use it to map the binary columns back to
        tag names - confirm against the consumer.
    embedder_name
        Name of the SentenceTransformer model to load. Defaults to the model
        this notebook also uses to embed the training titles.
    """

    def __init__(self, model, mlb, embedder_name='all-MiniLM-L6-v2'):
        self.model = model
        self.mlb = mlb
        self.embedder_name = embedder_name
        # The embedder is instantiated eagerly, so it is part of the object
        # that gets handed to mlflow.pyfunc.log_model.
        self.embedder = SentenceTransformer(embedder_name)

    def predict(self, context: Any, model_input: pd.DataFrame) -> np.ndarray:
        """Embed the ``text`` column of ``model_input`` and return the model's predictions.

        ``context`` is accepted for the pyfunc interface and is not read here.

        NOTE(review): no text normalisation happens in this method, whereas the
        training titles went through ``clean_text``; callers presumably have to
        pass text cleaned the same way - confirm.
        """
        texts = model_input["text"].tolist()
        embeddings = self.embedder.encode(texts, show_progress_bar=False)
        # Returned as-is from the wrapped estimator (a binary indicator matrix
        # when the estimator is the multi-label classifier trained in this notebook).
        return self.model.predict(embeddings)

# 1. Load the dataset (the rest of the cell reads its "title" and "tags" columns)
df = pd.read_csv(filepath_or_buffer="stack_questions_api.csv")

# 2. Cleaning
def clean_text(text):
    """Lower-case ``text`` and strip every character that is not an ASCII letter, digit or whitespace.

    Punctuation and symbols are removed outright (not replaced by a space),
    so for example "C++" becomes "c". Whitespace is left untouched.
    """
    lowered = text.lower()
    return re.sub(r"[^a-zA-Z0-9\s]", "", lowered)

# Titles are cast to str before cleaning, then normalised with clean_text.
df["clean_title"] = df["title"].astype(str).apply(clean_text)


def _parse_tags(raw):
    """Split a stringified list such as "['python', 'pandas']" into ['python', 'pandas'].

    Empty fragments are discarded, so "[]" gives [] instead of [''] and an
    empty-string tag can never end up among the labels.
    """
    return [tag for tag in raw.strip("[]").replace("'", "").split(", ") if tag]


df["tags"] = df["tags"].apply(_parse_tags)

# Keep only the 50 most frequent tags
all_tags = [tag for tags in df["tags"] for tag in tags]
top_tags = {tag for tag, _ in Counter(all_tags).most_common(50)}
df["tags"] = df["tags"].apply(lambda tags: [tag for tag in tags if tag in top_tags])
# Rows left without any tag after the filter cannot be used as training examples.
df = df[df["tags"].map(len) > 0]

# 3. Features and labels
X = df["clean_title"].tolist()
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df["tags"])

# 4. Train/test split (20% held out, fixed seed)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5. Sentence embeddings: the training texts are encoded first, then the test texts
model_embedder = SentenceTransformer('all-MiniLM-L6-v2')
X_train, X_test = (
    model_embedder.encode(texts, show_progress_bar=True)
    for texts in (X_train_raw, X_test_raw)
)

# 6. Model: one logistic regression per tag, with balanced class weights
base_estimator = LogisticRegression(max_iter=1000, class_weight='balanced')
clf = OneVsRestClassifier(base_estimator)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# 7. Evaluation
f1_micro, f1_samples = (
    f1_score(y_test, y_pred, average=averaging) for averaging in ('micro', 'samples')
)
print("BERT - F1-score (micro):", f1_micro)
print("BERT - F1-score (samples):", f1_samples)

# 8. MLflow tracking
# NOTE(review): the tracking location is an absolute, machine-specific path;
# it is kept unchanged here so existing runs stay where they are.
mlflow.set_tracking_uri("file:///C:/Mes documents/OpenClassRooms/PROJET5/mlruns")
mlflow.set_experiment("Tag_Prediction_Improved")

# Input example as a one-column DataFrame, the layout BertWrapper.predict reads
input_example = pd.DataFrame({"text": X_test_raw[:5]})

# Infer the signature from both sides: the five example texts and the
# classifier's predictions for those same five rows, so the logged model
# documents its output as well as its input.
signature = infer_signature(input_example, y_pred[:5])

with mlflow.start_run(run_name="BERT_LogReg") as run:
    mlflow.log_metric("f1_micro", f1_micro)
    mlflow.log_metric("f1_samples", f1_samples)

    # Keep the returned model info: it carries the URI MLflow assigned to the model.
    model_info = mlflow.pyfunc.log_model(
        name="model",
        python_model=BertWrapper(clf, mlb),
        signature=signature,
        input_example=input_example,
    )
    artifact_uri = mlflow.get_artifact_uri("model")
    print("Chemin du modèle loggé :", artifact_uri)
    # URI reported by log_model itself; this is the value to pass to mlflow.pyfunc.load_model.
    print("URI du modèle :", model_info.model_uri)

    # The binarizer is also saved as a standalone artifact (mlb.pkl is written
    # to the current working directory first, then attached to the run).
    joblib.dump(mlb, "mlb.pkl")
    mlflow.log_artifact("mlb.pkl")

    print(f"Run ID : {run.info.run_id}")

# 9. Per-tag F1 visualisation
report = classification_report(
    y_test, y_pred, target_names=mlb.classes_, output_dict=True, zero_division=0
)
# The report also holds aggregate rows (averages); keep only the entries that are real tags.
tag_names = set(mlb.classes_)
f1_scores = {label: score['f1-score'] for label, score in report.items() if label in tag_names}

# Horizontal bar chart, tags ordered by score; the height grows with the number of tags
# so the labels stay readable.
f1_sorted = sorted(f1_scores.items(), key=lambda item: item[1])
fig, ax = plt.subplots(figsize=(8, max(4, 0.25 * len(f1_sorted))))
ax.barh([label for label, _ in f1_sorted], [score for _, score in f1_sorted])
ax.set(xlabel="F1-score", ylabel="Tag", title="BERT - F1-score par tag", xlim=(0, 1))
fig.tight_layout()
plt.show()


Batches: 100%|██████████| 6/6 [00:00<00:00,  9.95it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 13.67it/s]


BERT - F1-score (micro): 0.6095238095238096
BERT - F1-score (samples): 0.5967736369910284


2025/07/16 14:00:46 INFO mlflow.pyfunc: Validating input example against model signature
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 222.59it/s]  


Chemin du modèle loggé : file:///C:/Mes documents/OpenClassRooms/PROJET5/mlruns/101836269437250841/9502cd1c143649cd9aea99cb61711e21/artifacts/model
Run ID : 9502cd1c143649cd9aea99cb61711e21


Dans le terminal : 
pytest tests/test_main.py

= Cette commande sert à lancer les tests automatiques définis dans le fichier test_main.py situé dans le dossier tests.

Résultat obtenu :
platform win32 -- Python 3.13.5 = Indique que les tests tournent sous Windows avec Python 3.13.5.

pytest-8.4.1, pluggy-1.6.0, plugins: anyio-4.9.0 = Version de pytest et des plugins utilisés.

collected 5 items = pytest a trouvé 5 tests dans le fichier test_main.py.

tests\test_main.py ..... = Chaque point (.) correspond à un test réussi. Ici, les 5 tests sont passés avec succès.

[100%] = Tous les tests (100%) ont été exécutés.

5 passed in 2.52s = Les 5 tests sont passés sans erreur en 2.52 secondes.

USE

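TensorFlow est officiellement compatible jusqu’à Python 3.10 ; or Python 3.10.x ne propose plus d'installateurs officiels pour Windows. La section USE ci-dessous n'a donc pas pu être exécutée dans cet environnement (Python 3.13.5) : l'import échoue.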

In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
import tensorflow_hub as hub
import tensorflow as tf

ModuleNotFoundError: No module named 'tensorflow_hub'

In [None]:
# Load the Universal Sentence Encoder (v4) from TF Hub
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Text to encode. The encoder takes raw strings, so it must be fed the text
# splits (X_train_raw / X_test_raw); X_train / X_test hold the embeddings
# computed earlier in the notebook, not text.
sentences = list(X_train_raw)

# Encode both splits with USE
X_train_emb = use_model(sentences).numpy()
X_test_emb = use_model(list(X_test_raw)).numpy()

# Multi-label training: one balanced logistic regression per tag.
# NOTE(review): this rebinds `clf` and `y_pred` from the BERT section.
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
clf.fit(X_train_emb, y_train)

# Prediction
y_pred = clf.predict(X_test_emb)

# Evaluation
print("USE - F1-score (micro):", f1_score(y_test, y_pred, average='micro'))
print("USE - F1-score (samples):", f1_score(y_test, y_pred, average='samples'))