In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import spacy

### Pré-processing

In [2]:
# Load the train and test splits from their CSV files (paths are relative to
# the notebook's working directory).
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../../data/archive/train.csv', '../../data/archive/test.csv')
)

In [3]:
# Bare last expression: the notebook renders a preview of the training frame
# (columns: film-url, review, polarity, plus two leftover index columns).
data_train

Unnamed: 0.1,Unnamed: 0,film-url,review,polarity
0,0,http://www.allocine.fr/film/fichefilm-135259/c...,Si vous cherchez du cinéma abrutissant à tous ...,0
1,1,http://www.allocine.fr/film/fichefilm-172430/c...,"Trash, re-trash et re-re-trash...! Une horreur...",0
2,2,http://www.allocine.fr/film/fichefilm-15105/cr...,"Et si, dans les 5 premières minutes du film, l...",0
3,3,http://www.allocine.fr/film/fichefilm-188629/c...,Mon dieu ! Quelle métaphore filée ! Je suis ab...,0
4,4,http://www.allocine.fr/film/fichefilm-23514/cr...,"Premier film de la saga Kozure Okami, ""Le Sabr...",1
...,...,...,...,...
159995,159995,http://www.allocine.fr/film/fichefilm-132387/c...,Un rythme bien trop lent et un Ashton Kutcher ...,0
159996,159996,http://www.allocine.fr/film/fichefilm-53313/cr...,Monsieur Duchovny vous êtes aussi piètre acteu...,0
159997,159997,http://www.allocine.fr/film/fichefilm-248258/c...,Complètement différent des films de la série C...,1
159998,159998,http://www.allocine.fr/film/fichefilm-268731/c...,Alors franchement pour le moment c'est le meil...,1


In [4]:
# The French stop-word set is taken from the defaults of spaCy's small French pipeline.
nlp = spacy.load("fr_core_news_sm")
french_stop_words = nlp.Defaults.stop_words

# Targets: the polarity column of each split.
y_train, y_test = data_train["polarity"], data_test["polarity"]

# Bag-of-words features on the 'review' column: the vocabulary is learned on
# the training split only, then reused unchanged to encode the test split.
vectorizer = CountVectorizer(stop_words=[*french_stop_words])
X_train = vectorizer.fit_transform(data_train['review'])
X_test = vectorizer.transform(data_test['review'])

# Sanity check: both matrices must share the same number of columns.
print("Taille de la matrice de features pour train :", X_train.shape)
print("Taille de la matrice de features pour test :", X_test.shape)
print(
    "Les premiers mots uniques extraits :",
    vectorizer.get_feature_names_out()[:30],
)



Taille de la matrice de features pour train : (160000, 152189)
Taille de la matrice de features pour test : (20000, 152189)
Les premiers mots uniques extraits : ['00' '000' '000001ct' '0001' '000m' '000mots' '000volts' '001'
 '003023_21708_2' '005' '006' '007' '01' '015' '01h15' '01h30' '01h37'
 '01h40' '01h45' '01h49' '01h50' '01h57' '01min' '02' '02h10' '02h20'
 '02h45' '03' '031119' '04']


### Conception du modèle

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, accuracy_score, recall_score

# Baseline classifier on the bag-of-words features; `fit` returns the estimator
# itself, so construction and training can be chained.
model_lr = LogisticRegression(max_iter = 10000).fit(X_train, y_train)

# Predicted labels for the held-out split.
prediction = model_lr.predict(X_test)

# Evaluate the three metrics against the true test labels in one pass.
precision, accuracy, recall = (
    metric(y_test, prediction)
    for metric in (precision_score, accuracy_score, recall_score)
)

print("La précision est de :", precision)
print("L'accuracy est de :", accuracy)
print("Le recall est de :", recall)

La précision est de : 0.9103319888302823
L'accuracy est de : 0.91715
Le recall est de : 0.9176396997497915


The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.

The accuracy is the ratio (number of correct predictions) / (total number of predictions), i.e. the fraction of samples whose predicted label matches the true label.

The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.

Here, for our case, the metric to prioritize seems to be the accuracy.

### Création du pipeline scikit-learn

In [6]:
from sklearn.pipeline import Pipeline

# Same vectorizer + classifier as above, wrapped in one Pipeline so that raw
# review text goes in and a polarity prediction comes out.
#
# The stop words are passed as a *sorted* list: `french_stop_words` is a set,
# and list(set) has an arbitrary order that changes between kernel restarts.
# The fitted vocabulary does not depend on that order, but the estimator's
# `stop_words` hyper-parameter (as reported by get_params / experiment
# tracking) does, so sorting keeps it identical from run to run.
estimators = [('vectorizer', CountVectorizer(stop_words=sorted(french_stop_words))),
              ('model', LogisticRegression(max_iter = 10000))]
pipe = Pipeline(estimators)

# Train on the raw training reviews, then predict the raw test reviews.
pipe.fit(data_train["review"], y_train)
pipe_prediction = pipe.predict(data_test["review"])

# Accuracy of the pipeline on the test split.
pipe_accuracy = accuracy_score(y_test, pipe_prediction)

print("L'accuracy de la pipeline est de :", pipe_accuracy)



L'accuracy de la pipeline est de : 0.91715


### Logging des paramètres MLflow

In [14]:
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

In [10]:
# Name of the MLflow experiment used by this notebook.
experiment_name = "pipeline_countVectorizer_lr"

# Point the client at the tracking server on localhost and enable
# scikit-learn autologging.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.sklearn.autolog()

# Last expression of the cell: selects the experiment and displays it.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/493843891007736314', creation_time=1731418818145, experiment_id='493843891007736314', last_update_time=1731418818145, lifecycle_stage='active', name='pipeline_countVectorizer_lr', tags={}>

In [31]:
import subprocess

# Raw review texts: the pipeline vectorises internally, so it is fed the text
# column itself. NOTE: this rebinds `X_train`, which the pre-processing cell
# used for the sparse count matrix.
X_train = data_train["review"]

# Hyper-parameters logged explicitly with the run.
params = {
    "max_iter" : 10000
}

# Input example stored with the logged model. Only a handful of rows are kept:
# an example is meant to illustrate the expected input, and building it from
# the whole column would store the entire training set with the model.
input_example = X_train.head(5).to_frame(name="review")

# Provenance recorded as run tags: notebook name plus the current git commit
# and branch (both read from the repository containing the working directory).
source_name = "model_design_2.ipynb"
commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).strip().decode("utf-8")

In [32]:
with mlflow.start_run() as run:
    # Explicit parameters and the test accuracy computed earlier for `pipe`.
    mlflow.log_params(params)
    mlflow.log_metric("test_accuracy", pipe_accuracy)

    # Free-form tags describing the experiment.
    mlflow.set_tag("Training Info", "Basic LR model with a CountVectorizer on pipeline")
    mlflow.set_tag("Preprocessing", "CountVectorizer")
    mlflow.set_tag("Algorithm", "Logistic Regression")

    # Provenance tags: source notebook and git state.
    mlflow.set_tag("mlflow.source.name", source_name)
    mlflow.set_tag("mlflow.source.git.commit", commit)
    mlflow.set_tag("mlflow.source.git.branch", branch)

    # Fit inside the run, and BEFORE any call to predict: the signature below
    # must not depend on `pipe` having been left fitted by an earlier cell.
    pipe.fit(X_train, y_train)

    # The signature only describes input/output types, so a few rows are
    # enough; predicting on the full training set just for this is wasted work.
    signature_sample = X_train.head(5)
    signature = infer_signature(signature_sample, pipe.predict(signature_sample))

    # Log the fitted pipeline and register it as a new model version.
    model_info = mlflow.sklearn.log_model(
        sk_model = pipe,
        artifact_path = "train_review",
        signature = signature,
        input_example = input_example,
        registered_model_name = "logisticRegression",
    )

Registered model 'logisticRegression' already exists. Creating a new version of this model...
2024/11/12 15:38:37 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logisticRegression, version 4
Created version '4' of model 'logisticRegression'.
Downloading artifacts: 100%|██████████████████████| 7/7 [00:02<00:00,  2.74it/s]
2024/11/12 15:39:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run powerful-lynx-188 at: http://localhost:5000/#/experiments/493843891007736314/runs/8ec4c3cc7a8e4086962934411cc0f071.
2024/11/12 15:39:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/493843891007736314.
