In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from hyperopt import hp
from sklearn.metrics import accuracy_score
from hyperopt import fmin, tpe
import mlflow
from mlflow.models import infer_signature
import os
from mlflow import MlflowClient
import git

## MLFLOW

In [2]:
mlflow.set_tracking_uri(uri="http://localhost:5000")

## load data

In [3]:
#On recupere les donn√©es depuis les differents csv puis on les stocks dans differentes variables correspondantes (jeu de train dans X_train)

df_train = pd.read_csv("../data/archive/train.csv")
df_train
X_train, y_train = df_train["review"],  df_train["polarity"]

df_test = pd.read_csv("../data/archive/test.csv")
X_test, y_test = df_train["review"],  df_train["polarity"]
df_val = pd.read_csv("../data/archive/valid.csv")
X_val, y_val= df_train["review"],  df_train["polarity"]

Nous verifions la pr√©sence de donn√©es manquantes

In [4]:
df_train.isna().sum(), df_train.isnull().sum()

(Unnamed: 0    0
 film-url      0
 review        0
 polarity      0
 dtype: int64,
 Unnamed: 0    0
 film-url      0
 review        0
 polarity      0
 dtype: int64)

verification de la balance des labels

In [5]:
np.unique(df_train["polarity"], return_counts=True)

(array([0, 1]), array([79413, 80587]))

it's balance

## preprocessing

In [6]:
STOP_WORDS = list(
    """
a √† √¢ abord afin ah ai aie ainsi ait allaient allons
alors anterieur anterieure anterieures ant√©rieur ant√©rieure ant√©rieures
apres apr√®s as assez attendu au
aupres auquel aura auraient aurait auront
aussi autre autrement autres autrui aux auxquelles auxquels avaient
avais avait avant avec avoir avons ayant

bas basee bat

c' c‚Äô √ßa car ce ceci cela celle celle-ci celle-la celle-l√† celles celles-ci celles-la celles-l√†
celui celui-ci celui-la celui-l√† cent cependant certain certaine certaines certains certes ces
cet cette ceux ceux-ci ceux-l√† chacun chacune chaque chez ci cinq cinquantaine cinquante
cinquanti√®me cinqui√®me combien comme comment compris concernant

d' d‚Äô da dans de debout dedans dehors deja dej√† del√† depuis derriere
derri√®re des desormais desquelles desquels dessous dessus deux deuxi√®me
deuxi√®mement devant devers devra different differente differentes differents diff√©rent
diff√©rente diff√©rentes diff√©rents dire directe directement dit dite dits divers
diverse diverses dix dix-huit dix-neuf dix-sept dixi√®me doit doivent donc dont
douze douzi√®me du duquel durant d√®s d√©ja d√©j√† d√©sormais

effet egalement eh elle elle-meme elle-m√™me elles elles-memes elles-m√™mes en encore
enfin entre envers environ es √®s est et etaient √©taient etais √©tais etait √©tait
etant √©tant etc etre √™tre eu eux eux-m√™mes exactement except√© √©galement

fais faisaient faisant fait facon fa√ßon feront font

gens

ha hem hep hi ho hormis hors hou houp hue hui huit huiti√®me
h√© i il ils importe

j' j‚Äô je jusqu jusque juste

l' l‚Äô la laisser laquelle le lequel les lesquelles lesquels leur leurs longtemps
lors lorsque lui lui-meme lui-m√™me l√† l√®s

m' m‚Äô ma maint maintenant mais malgre malgr√© me meme memes merci mes mien
mienne miennes miens mille moi moi-meme moi-m√™me moindres moins
mon m√™me m√™mes

n' n‚Äô na ne neanmoins neuvi√®me ni nombreuses nombreux nos notamment
notre nous nous-m√™mes nouveau nul n√©anmoins n√¥tre n√¥tres

o √¥ on ont onze onzi√®me or ou ouias ouste outre
ouvert ouverte ouverts o√π

par parce parfois parle parlent parler parmi partant
pas pendant pense permet personne peu peut peuvent peux plus
plusieurs plutot plut√¥t possible possibles pour pourquoi
pourrais pourrait pouvait prealable precisement
premier premi√®re premi√®rement
pres procedant proche pr√®s pr√©alable pr√©cisement pu puis puisque

qu' qu‚Äô quand quant quant-√†-soi quarante quatorze quatre quatre-vingt
quatri√®me quatri√®mement que quel quelconque quelle quelles quelqu'un quelque
quelques quels qui quiconque quinze quoi quoique

relative relativement rend rendre restant reste
restent retour revoici revoila revoil√†

s' s‚Äô sa sait sans sauf se seize selon semblable semblaient
semble semblent sent sept septi√®me sera seraient serait seront ses seul seule
seulement seuls seules si sien sienne siennes siens sinon six sixi√®me soi soi-meme soi-m√™me soit
soixante son sont sous souvent specifique specifiques sp√©cifique sp√©cifiques stop
suffisant suffisante suffit suis suit suivant suivante
suivantes suivants suivre sur surtout

t' t‚Äô ta tant te tel telle tellement telles tels tenant tend tenir tente
tes tien tienne tiennes tiens toi toi-meme toi-m√™me ton touchant toujours tous
tout toute toutes treize trente tres trois troisi√®me troisi√®mement tr√®s
tu t√©

un une unes uns

va vais vas vers via vingt voici voila voil√† vont vos
votre votres vous vous-m√™mes vu v√© v√¥tre v√¥tres

y

""".split()
)

source https://github.com/explosion/spaCy/blob/master/spacy/lang/fr/stop_words.py


we use  TfidfVectorizer, it use count vectorizer followed bytransform tfid
- count vectorizer create a matrice document x word  the cell i,j correspond of ith document avec jth word and it count the  number of occurence of the word j in i
- tfid use this matrix to calculate the tfidf matrix

### regression lineaire

In [7]:
params = {
    "solver": "lbfgs",
    "max_iter": 1000,
    "multi_class": "auto",
    "random_state": 42,
}

In [8]:
pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, stop_words = STOP_WORDS)),
        ('classifier', LogisticRegression(**params))
    ])# creation de la pipeline avec le preprocessing et le mod√®le
    

## MLFLOW EXPERIMENT

In [9]:
## definition des commit du notebook et de la brance
session_name = os.environ.get('JPY_SESSION_NAME', '')
notebook_name = os.path.basename(session_name)
notebook_name

path = os.getcwd()
repo = git.Repo(os.path.abspath(os.path.join(path, os.pardir)))
sha_commit = repo.head.object.hexsha 
branch = repo.active_branch.name

In [12]:
# Create a new MLflow Experiment
mlflow.set_experiment("MLflow LRregression")
# Start an MLflow run
with mlflow.start_run() as run:
    mlflow.sklearn.autolog(log_datasets=False)
    pipeline.fit(X_train, y_train)
    accuracy = pipeline.score(X_test, y_test)
    mlflow.log_metric("accuracy_test", accuracy)
    mlflow.set_tag("mlflow.source.name",notebook_name)
    mlflow.set_tag("mlflow.source.git.commit", sha_commit) 
    signature = infer_signature(X_train, pipeline.predict(X_train))
    mlflow.set_tag("mlflow.source.git.branch", branch)
    mlflow.sklearn.log_model(pipeline, artifact_path="sklearn-model", signature=signature)


# Create source model version
client = MlflowClient()
src_name = "LR-staging"
client.create_registered_model(src_name)
src_uri = f"runs:/{run.info.run_id}/sklearn-model"
mv_src = client.create_model_version(src_name, src_uri, run.info.run_id)


2024/11/12 19:09:19 INFO mlflow.tracking._tracking_service.client: üèÉ View run bright-skunk-374 at: http://localhost:5000/#/experiments/257713885670805413/runs/2c8e3295da134b2a97888cee6d740e8d.
2024/11/12 19:09:19 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/257713885670805413.


KeyboardInterrupt: 

Consultez les m√©triques loggu√©es par autolog lors de l'entrainement. Qu'en pensez-vous ? Ces m√©triques sont-elles satisfaisantes pour √©valuer la qualit√© de votre mod√®le?

Ajoutez au moins une m√©trique pertinente d'√©valuation de votre mod√®le.
c'est l'accuracy du training on preferait l'avoir sur le test

## build model 

In [13]:
def build_model(
    training_set,
    pipeline,
    mlflow_run_tags = None,
    mlflow_run_parameters = None,
    mlflow_run_description = None,
    validation_set = None
):
    """
    Build a sentiment analysis model, print the evaluation result and store everything to MLFlow
    @param: training_set: pandas dataframe containing the input training set
    @param: pipeline: scikit-learn pipeline that will be applied to the input data
    @param: mlflow_run_tags: dict of tags that will be stored in the MLFlow run
    @param: mlflow_run_parameters: dict of parameters that will be stored in the MLFlow run
    @param: mlflow_run_description: textual description of the run
    @param: validation_set: if provided, used to evaluate the model and log result in MLFlow
    @return: the trained pipeline
    """
    with mlflow.start_run(tags=mlflow_run_tags, description=mlflow_run_description) as run:
        if mlflow_run_parameters:
            for key, value in mlflow_run_parameters.items():
                mlflow.log_param(key, value)
        mlflow.sklearn.autolog(log_datasets=False)
        X_train, y_train = training_set["review"],  training_set["polarity"]
        pipeline.fit(X_train, y_train)
        if not validation_set.empty:
            X_val, y_val= validation_set["review"],  validation_set["polarity"]
        pipeline.score(X_val, y_val)
    return pipeline
        


In [14]:
mlflow_run_tags = {"mlflow.source.name": notebook_name,  
        "mlflow.source.git.commit": sha_commit, 
        "mlflow.source.git.branch": branch  
    }

In [16]:
mlflow.set_experiment("MLflow LRregression C =0.1")
pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=15000, stop_words = STOP_WORDS)),
        ('classifier', LogisticRegression(random_state=42, C = 0.1))
    ])
    
build_model(df_train, pipeline,  mlflow_run_description = "test hyper param lr c=0.1", mlflow_run_tags=mlflow_run_tags,validation_set=df_val)

2024/11/12 19:10:11 INFO mlflow.tracking._tracking_service.client: üèÉ View run monumental-croc-711 at: http://localhost:5000/#/experiments/723041387040427611/runs/ea5ee1074b794e7eb0c4211548c2feb9.
2024/11/12 19:10:11 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/723041387040427611.


In [17]:
mlflow.set_experiment("MLflow LRregression C =2.")
pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=15000, stop_words = STOP_WORDS)),
        ('classifier', LogisticRegression(random_state=42, C = 2))
    ])
    
build_model(df_train, pipeline,  mlflow_run_description = "test hyper param lr c=2.0", validation_set=df_val)

2024/11/12 19:10:51 INFO mlflow.tracking._tracking_service.client: üèÉ View run rebellious-skunk-859 at: http://localhost:5000/#/experiments/377664617740827083/runs/6da0625a11e742008ea664f7bfbe09ba.
2024/11/12 19:10:51 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/377664617740827083.


## hyperopt

In [18]:
def objective(params,pipeline, training_set, validation_set,mlflow_run_description, mlflow_run_parameters, mlflow_run_tags):
    """Fonction objectif pour l'optimisation des hyperparam√®tres."""
    mlflow.set_experiment("MLflow LRregression opti")
    with mlflow.start_run(tags=mlflow_run_tags, description=mlflow_run_description): 
        pipeline = create_pipeline(params)
        X_train, y_train = training_set["review"],  training_set["polarity"]
        X_val, y_val= validation_set["review"],  validation_set["polarity"]
        pipeline.fit(X_train, y_train)
        
        if mlflow_run_parameters:
            for key, value in mlflow_run_parameters.items():
                mlflow.log_param(key, value)
        mlflow.sklearn.autolog(log_datasets=False)
        X_train, y_train = training_set["review"],  training_set["polarity"]
        pipeline.fit(X_train, y_train)
        mlflow.set_tag("hyperopt_candidate", True)
        y_pred = pipeline.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        if not validation_set.empty:
            X_val, y_val= validation_set["review"],  validation_set["polarity"]
        pipeline.score(X_val, y_val)
        
        return {'loss': 1 - accuracy, 'status': "ok"}


def create_pipeline(params):
    print(params)
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=params['tfidf__max_features'])),
        ('logreg', LogisticRegression(C=params['logreg__C']))])


def build_optimized_model(training_set,
    pipeline,
    space,
    objective, 
    mlflow_run_tags = None,
    mlflow_run_parameters = None,
    mlflow_run_description = None,
    validation_set = None,
    test_set = None,
    src_name = "LR best"
    
):
    """Optimise les hyperparam√®tres et construit le mod√®le final."""
    X_train, y_train = training_set["review"],  training_set["polarity"]

    best = fmin(fn=lambda params: objective(params, pipeline, training_set, validation_set,mlflow_run_description, mlflow_run_parameters, mlflow_run_tags),
                 space=space,
                 algo=tpe.suggest,
                 max_evals=10,
                 return_argmin=False)
    
    print(f'Best parameters: {best}')


    final_model = create_pipeline(best)

    with mlflow.start_run(tags=mlflow_run_tags, description=mlflow_run_description) as run:  
        mlflow.log_params(best)
        final_model.fit( X_train, y_train)
        mlflow.set_tag("hyperopt_selected", True)
        if not validation_set.empty:
            X_test, y_test= test_set["review"],  test_set["polarity"]
            final_accuracy = accuracy_score(y_test, final_model.predict(X_test))
            mlflow.log_metric("final_accuracy", final_accuracy)
    signature = infer_signature(X_train, final_model.predict(X_train))
    mlflow.sklearn.log_model(final_model, artifact_path="sklearn-model", signature=signature)
    # Create source model version
    client = MlflowClient()
    client.create_registered_model(src_name)
    src_uri = f"runs:/{run.info.run_id}/sklearn-model"
    mv_src = client.create_model_version(src_name, src_uri, run.info.run_id)
    return final_model


In [19]:
space = {
        'tfidf__max_features': hp.choice('tfidf__max_features', [1000, 5000, 10000]),
        'logreg__C': hp.loguniform('logreg__C', np.log(0.001), np.log(100))
    }

In [20]:
build_optimized_model(df_train, create_pipeline, space,objective,  mlflow_run_description = "test hyper param lr",  mlflow_run_tags=mlflow_run_tags, validation_set=df_val, test_set=df_test)

{'logreg__C': 0.01713015470159118, 'tfidf__max_features': 1000}                                     
  0%|                                                        | 0/10 [00:00<?, ?trial/s, best loss=?]

2024/11/12 19:12:17 INFO mlflow.tracking._tracking_service.client: üèÉ View run honorable-snail-239 at: http://localhost:5000/#/experiments/158909635666735086/runs/13c60f962e8a46089ef5ef5e604bf596.

2024/11/12 19:12:17 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/158909635666735086.



{'logreg__C': 3.29511271149477, 'tfidf__max_features': 5000}                                        
 10%|‚ñà‚ñà‚ñà‚ñà                                     | 1/10 [01:25<12:50, 85.57s/trial, best loss: 0.13305]

2024/11/12 19:13:45 INFO mlflow.tracking._tracking_service.client: üèÉ View run nervous-bat-612 at: http://localhost:5000/#/experiments/158909635666735086/runs/91792c6b1f9c4050ac89f5a635161f6f.

2024/11/12 19:13:45 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/158909635666735086.



{'logreg__C': 0.5940617222166208, 'tfidf__max_features': 5000}                                      
 20%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä                       | 2/10 [02:53<11:33, 86.69s/trial, best loss: 0.08055000000000001]

2024/11/12 19:15:13 INFO mlflow.tracking._tracking_service.client: üèÉ View run enthused-elk-492 at: http://localhost:5000/#/experiments/158909635666735086/runs/d2a7f16981b24cd49c6ddb0e413afc19.

2024/11/12 19:15:13 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/158909635666735086.



{'logreg__C': 7.439604855913225, 'tfidf__max_features': 5000}                                       
 30%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã                    | 3/10 [04:21<10:11, 87.35s/trial, best loss: 0.08055000000000001]

2024/11/12 19:16:40 INFO mlflow.tracking._tracking_service.client: üèÉ View run stately-shoat-935 at: http://localhost:5000/#/experiments/158909635666735086/runs/2b37e9b5231b4943989fd11be58ee2c7.

2024/11/12 19:16:40 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/158909635666735086.



{'logreg__C': 59.92071662424678, 'tfidf__max_features': 10000}                                      
 40%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                 | 4/10 [05:48<08:44, 87.48s/trial, best loss: 0.08004999999999995]

2024/11/12 19:18:09 INFO mlflow.tracking._tracking_service.client: üèÉ View run overjoyed-dove-457 at: http://localhost:5000/#/experiments/158909635666735086/runs/ffa5e218336b4eefbc45da9a3125822d.

2024/11/12 19:18:09 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/158909635666735086.



{'logreg__C': 0.0023009904379652116, 'tfidf__max_features': 10000}                                  
 50%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                    | 5/10 [07:17<07:19, 87.88s/trial, best loss: 0.07865]

2024/11/12 19:19:36 INFO mlflow.tracking._tracking_service.client: üèÉ View run selective-grub-297 at: http://localhost:5000/#/experiments/158909635666735086/runs/5a7c03bf06a449bc81bb8ba7174e18ed.

2024/11/12 19:19:36 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/158909635666735086.



{'logreg__C': 0.002578204747151482, 'tfidf__max_features': 10000}                                   
 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                | 6/10 [08:44<05:50, 87.64s/trial, best loss: 0.07865]

2024/11/12 19:21:03 INFO mlflow.tracking._tracking_service.client: üèÉ View run fun-squid-975 at: http://localhost:5000/#/experiments/158909635666735086/runs/75fa1b111ebc424c98d434b3da50d519.

2024/11/12 19:21:03 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/158909635666735086.



{'logreg__C': 6.559904621294854, 'tfidf__max_features': 1000}                                       
 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã            | 7/10 [10:11<04:22, 87.48s/trial, best loss: 0.07865]

2024/11/12 19:22:29 INFO mlflow.tracking._tracking_service.client: üèÉ View run youthful-slug-6 at: http://localhost:5000/#/experiments/158909635666735086/runs/e712dc00ce444b3993e1c320a522add2.

2024/11/12 19:22:29 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/158909635666735086.



{'logreg__C': 0.001176034505616751, 'tfidf__max_features': 5000}                                    
 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä        | 8/10 [11:37<02:53, 86.79s/trial, best loss: 0.07865]

2024/11/12 19:23:55 INFO mlflow.tracking._tracking_service.client: üèÉ View run nosy-finch-184 at: http://localhost:5000/#/experiments/158909635666735086/runs/4b2d8d0713fc47e78670456042e7df63.

2024/11/12 19:23:55 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/158909635666735086.



{'logreg__C': 0.006293202473270156, 'tfidf__max_features': 10000}                                   
 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 9/10 [13:03<01:26, 86.61s/trial, best loss: 0.07865]

2024/11/12 19:25:22 INFO mlflow.tracking._tracking_service.client: üèÉ View run welcoming-mink-980 at: http://localhost:5000/#/experiments/158909635666735086/runs/c728e99bce9d439a8179ddc29701cf0b.

2024/11/12 19:25:22 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/158909635666735086.



100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [14:30<00:00, 87.09s/trial, best loss: 0.07865]
Best parameters: {'logreg__C': 59.92071662424678, 'tfidf__max_features': 10000}
{'logreg__C': 59.92071662424678, 'tfidf__max_features': 10000}


2024/11/12 19:26:07 INFO mlflow.tracking._tracking_service.client: üèÉ View run powerful-cow-62 at: http://localhost:5000/#/experiments/158909635666735086/runs/36205669e8384550b8548b88a8b16855.
2024/11/12 19:26:07 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/158909635666735086.
2024/11/12 19:26:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LR best, version 1
