### Définition de la fonction build_model

In [15]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score, recall_score
from mlflow.models.signature import infer_signature

# Name of the MLflow experiment selected below.
experiment_name = "pipeline_countVectorizer_lr"

# Target the tracking server on localhost:5000, then make the experiment active.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment(experiment_name)

def build_model(
    training_set,
    pipeline,
    model_name,
    mlflow_run_tags=None,
    mlflow_run_parameters=None,
    mlflow_run_description=None,
    validation_set=None
):
    """
    Train a sentiment analysis pipeline, optionally evaluate it, and log everything to MLFlow.

    @param training_set: pandas dataframe with a "review" column (inputs) and a "polarity" column (labels).
    @param pipeline: scikit-learn pipeline that is fitted on the "review" column.
    @param model_name: name passed as registered_model_name when the fitted pipeline is logged.
    @param mlflow_run_tags: dict of tags that will be stored in the MLFlow run.
    @param mlflow_run_parameters: dict of parameters that will be stored in the MLFlow run.
    @param mlflow_run_description: textual description of the run, stored as the "Description" tag.
    @param validation_set: optional dataframe with the same "review" / "polarity" columns; when
        provided, accuracy and weighted recall computed on it are logged as run metrics.
    @return: the trained pipeline.
    """
    # Everything below is recorded in a single MLFlow run.
    with mlflow.start_run():
        # Extra tags supplied by the caller.
        if mlflow_run_tags:
            for tag, value in mlflow_run_tags.items():
                mlflow.set_tag(tag, value)

        # Parameters supplied by the caller.
        if mlflow_run_parameters:
            mlflow.log_params(mlflow_run_parameters)

        # Run description (set after the tags, so it wins over a "Description" tag).
        if mlflow_run_description:
            mlflow.set_tag("Description", mlflow_run_description)

        # Fit the pipeline on the training data.
        X_train, y_train = training_set["review"], training_set["polarity"]

        pipeline.fit(X_train, y_train)

        # Model signature inferred from the training inputs and the predictions on them.
        signature = infer_signature(X_train, pipeline.predict(X_train))

        # Optional evaluation on the validation set.
        if validation_set is not None:
            # Select the same named columns as for training. A positional slice such as
            # iloc[:, :-1] yields a DataFrame, and iterating a DataFrame produces its
            # column names instead of the review texts.
            X_val, y_val = validation_set["review"], validation_set["polarity"]
            y_pred = pipeline.predict(X_val)

            # Validation metrics.
            accuracy = accuracy_score(y_val, y_pred)
            recall = recall_score(y_val, y_pred, average="weighted")

            # Store the metrics in the run.
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("recall", recall)

        # Log the fitted pipeline and register it under model_name.
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="train_review",
            signature=signature,
            registered_model_name=model_name,
        )

    return pipeline

### Préparation des paramètres de build_model

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import spacy
import subprocess

# Train / test splits read from CSV.
data_train = pd.read_csv('../../data/archive/train.csv')
data_test = pd.read_csv('../../data/archive/test.csv')

# French stop words taken from the spaCy language defaults.
nlp = spacy.load("fr_core_news_sm")
french_stop_words = nlp.Defaults.stop_words

# Standalone CountVectorizer configured with those stop words.
# It is not fitted or applied anywhere in this cell.
vectorizer = CountVectorizer(stop_words=list(french_stop_words))

# Raw review texts and their polarity labels for each split (no vectorization here).
X_train, y_train = data_train['review'], data_train["polarity"]
X_test, y_test = data_test['review'], data_test["polarity"]

# Two-column frame in the layout build_model reads ("review", "polarity").
training_set = pd.DataFrame({"review": X_train, "polarity": y_train})


def _git_rev_parse(*args):
    """Run `git rev-parse <args>` and return its output as a stripped UTF-8 string."""
    return subprocess.check_output(["git", "rev-parse", *args]).strip().decode("utf-8")


# Source metadata that is later attached to the MLflow runs as tags.
source_name = "model_design_2.ipynb"
commit = _git_rev_parse("HEAD")
branch = _git_rev_parse("--abbrev-ref", "HEAD")



In [17]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

def _lr_estimators(lr_params):
    """Return the (vectorizer, logistic regression) pipeline steps for the given hyperparameters."""
    return [
        ('vectorizer', CountVectorizer(stop_words=list(french_stop_words))),
        ('model', LogisticRegression(**lr_params)),
    ]


# Hyperparameters of each run. These dicts are the single source of truth: they are
# passed to LogisticRegression below and logged to MLflow as run parameters, so the
# logged values cannot drift from the ones actually used by the model.
mlflow_run_parameters1 = {
    "C": 0.1,
    "penalty": 'l2',
    "solver": 'liblinear',
    "max_iter": 10000
}
mlflow_run_parameters2 = {
    "C": 0.1,
    "penalty": 'l1',
    "solver": 'liblinear',
    "max_iter": 10000
}
mlflow_run_parameters3 = {
    "C": 0.5,
    "penalty": 'l2',
    "solver": 'liblinear',
    "max_iter": 10000
}

# Pipeline steps, one list per configuration (each gets its own vectorizer instance).
estimators1 = _lr_estimators(mlflow_run_parameters1)
estimators2 = _lr_estimators(mlflow_run_parameters2)
estimators3 = _lr_estimators(mlflow_run_parameters3)

pipe1 = Pipeline(estimators1)
pipe2 = Pipeline(estimators2)
pipe3 = Pipeline(estimators3)

# Tags shared by every run, including the source / git metadata computed earlier.
mlflow_run_tags = {
    "User": "Nathan Gerussi",
    "version": "1.0",
    "Description": "Test d'un modèle de régression logistique avec CountVectorizer",
    "Preprocessing": "CountVectorizer",
    "Algorithm": "Logistic Regression",
    "mlflow.source.name": source_name,
    "mlflow.source.git.commit": commit,
    "mlflow.source.git.branch": branch
}

# Name under which every trained pipeline is registered.
model_name = "logisticRegression"

In [18]:
# Train and log one MLflow run per (pipeline, hyperparameters) configuration.
# validation_set is None, so build_model logs no accuracy / recall metrics for these runs.
for pipe, run_parameters in (
    (pipe1, mlflow_run_parameters1),
    (pipe2, mlflow_run_parameters2),
    (pipe3, mlflow_run_parameters3),
):
    build_model(
        training_set,
        pipe,
        model_name,
        mlflow_run_tags=mlflow_run_tags,
        mlflow_run_parameters=run_parameters,
        mlflow_run_description=None,
        validation_set=None,
    )

<class 'pandas.core.series.Series'> 0    Si vous cherchez du cinéma abrutissant à tous ...
1    Trash, re-trash et re-re-trash...! Une horreur...
2    Et si, dans les 5 premières minutes du film, l...
3    Mon dieu ! Quelle métaphore filée ! Je suis ab...
4    Premier film de la saga Kozure Okami, "Le Sabr...
Name: review, dtype: object


Registered model 'logisticRegression' already exists. Creating a new version of this model...
2024/11/19 14:39:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logisticRegression, version 10
Created version '10' of model 'logisticRegression'.


🏃 View run bustling-kite-923 at: http://localhost:5000/#/experiments/493843891007736314/runs/d2897cd95a1e4277b3ff016ae0e6c26c
🧪 View experiment at: http://localhost:5000/#/experiments/493843891007736314
<class 'pandas.core.series.Series'> 0    Si vous cherchez du cinéma abrutissant à tous ...
1    Trash, re-trash et re-re-trash...! Une horreur...
2    Et si, dans les 5 premières minutes du film, l...
3    Mon dieu ! Quelle métaphore filée ! Je suis ab...
4    Premier film de la saga Kozure Okami, "Le Sabr...
Name: review, dtype: object


Registered model 'logisticRegression' already exists. Creating a new version of this model...
2024/11/19 14:40:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logisticRegression, version 11
Created version '11' of model 'logisticRegression'.


🏃 View run inquisitive-loon-431 at: http://localhost:5000/#/experiments/493843891007736314/runs/cb92e06b08d0439399401f224210bb5b
🧪 View experiment at: http://localhost:5000/#/experiments/493843891007736314
<class 'pandas.core.series.Series'> 0    Si vous cherchez du cinéma abrutissant à tous ...
1    Trash, re-trash et re-re-trash...! Une horreur...
2    Et si, dans les 5 premières minutes du film, l...
3    Mon dieu ! Quelle métaphore filée ! Je suis ab...
4    Premier film de la saga Kozure Okami, "Le Sabr...
Name: review, dtype: object


Registered model 'logisticRegression' already exists. Creating a new version of this model...
2024/11/19 14:41:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logisticRegression, version 12


🏃 View run bald-stork-476 at: http://localhost:5000/#/experiments/493843891007736314/runs/fb94674e040543d2974df84114432b95
🧪 View experiment at: http://localhost:5000/#/experiments/493843891007736314


Created version '12' of model 'logisticRegression'.
