In [43]:
# manipulation des données
import numpy as np
import pandas as pd

# matplotlib et seaborn pour les représentations graphiques
import matplotlib.pyplot as plt
import seaborn as sns

# Gestion du système de fichiers
import os

In [44]:
# Read the training CSV into a DataFrame and display it as the cell output.
train_csv_path = "../Data/2.train_df.csv"
df = pd.read_csv(train_csv_path)
df

Unnamed: 0,target,id,date,flag,user,text,words
0,1,1969889460,Sat May 30 01:36:50 PDT 2009,NO_QUERY,mcclorypatrick,got to the hotel safe going to sleeplt myspace...,get hotel safe go sleeplt myspacecomwearemcclo...
1,0,2207682917,Wed Jun 17 08:14:56 PDT 2009,NO_QUERY,NikkiMarieStarr,hates that she has to work when all she wants ...,hate work want sleep family need friend
2,1,1978407277,Sat May 30 23:15:51 PDT 2009,NO_QUERY,ArtisticQueen,watching west wing series with girlfriend,watch west wing series girlfriend
3,1,1882357841,Fri May 22 06:43:36 PDT 2009,NO_QUERY,E_mma_S,is at work but not working,work work
4,0,2006961885,Tue Jun 02 11:56:33 PDT 2009,NO_QUERY,kazzylady,enjoy the rush we were to work with you on you...,enjoy rush work last series unfortunately ill ...
...,...,...,...,...,...,...,...
799995,0,2011695720,Tue Jun 02 19:25:29 PDT 2009,NO_QUERY,It_Mi,the service here is too slow,service slow
799996,1,1559841221,Sun Apr 19 12:11:03 PDT 2009,NO_QUERY,jessys1239,hiye am jessicanew here on twitterwelcome me,hiye jessicanew twitterwelcome
799997,1,1998631763,Mon Jun 01 18:52:15 PDT 2009,NO_QUERY,PinUpMom,well you should get one im firm believer you s...,well get one im firm believer get want
799998,0,2044203318,Fri Jun 05 09:01:12 PDT 2009,NO_QUERY,anomit,this goddamned conn is absolutely unusable for...,goddamned conn absolutely unusable even single...


In [45]:
from sklearn.feature_extraction.text import CountVectorizer
import joblib

# Bag-of-words features built from the `words` column.
vectorizer = CountVectorizer()
# Rows with a missing `words` value are removed before vectorising; the
# filtered frame replaces `df`, so later cells see the same rows.
df = df.dropna(subset=['words'])

# Fit the vocabulary and build the document-term matrix in one pass.
X_CountVecorizer = vectorizer.fit_transform(df['words'])
y_CountVecorizer = df['target']

# Persist the fitted vectorizer to disk. open() does not create missing
# folders, so make sure the output directory exists first.
os.makedirs("embedding", exist_ok=True)
with open("embedding/CountVectorizer.pkl", 'wb') as vec_file:
    joblib.dump(vectorizer, vec_file)

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# TF-IDF features built from the same `words` column.
# NOTE: this cell relies on `df` having already had its rows with missing
# `words` removed by the CountVectorizer cell, and it rebinds `vectorizer`
# (the CountVectorizer instance is only kept through its pickle file).
vectorizer = TfidfVectorizer()

X_TfidfVectorizer = vectorizer.fit_transform(df['words'])
y_TfidfVectorizer = df['target']

# Persist the fitted vectorizer to disk. open() does not create missing
# folders, so make sure the output directory exists first.
os.makedirs("embedding", exist_ok=True)
with open("embedding/TfidfVectorizer.pkl", 'wb') as vec_file:
    joblib.dump(vectorizer, vec_file)

In [47]:
from sklearn.model_selection import train_test_split

# Both feature matrices are split with the same hold-out fraction and seed:
# 20 % of the rows go to the test set, shuffled with random_state 42.
split_options = {"test_size": 0.2, "random_state": 42}

# CountVectorizer features.
X_train_Cv, X_test_Cv, y_train_Cv, y_test_Cv = train_test_split(
    X_CountVecorizer,
    y_CountVecorizer,
    **split_options,
)

# TF-IDF features.
X_train_Tv, X_test_Tv, y_train_Tv, y_test_Tv = train_test_split(
    X_TfidfVectorizer,
    y_TfidfVectorizer,
    **split_options,
)

In [48]:
from sklearn.linear_model import LogisticRegression
import mlflow

# Hyper-parameter settings to try, one LogisticRegression keyword per run.
# Each entry is a single-key dict that is unpacked into the constructor, so
# every run changes exactly one setting and leaves the others at their defaults.
param_sets = (
    # optimisation algorithm
    [{'solver': solver_name} for solver_name in ('liblinear', 'saga', 'lbfgs')]
    # iteration budget
    + [{'max_iter': n_iterations} for n_iterations in (100, 200, 300)]
    # inverse regularisation strength
    + [{'C': strength} for strength in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100)]
)

In [49]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix
import joblib

artifact_path = './artifacts/'
# The confusion matrix and ROC plot are written to local files before being
# uploaded to MLflow; to_csv() and savefig() do not create missing folders.
os.makedirs(artifact_path, exist_ok=True)

best_metric_value = -1
best_model_path = ""

mlflow.set_experiment("DeepLearningTweet")

# One MLflow run per entry of param_sets, trained on the CountVectorizer features.
for params in param_sets:
    # Run label "<param>_<value>", built from the first key/value of the dict.
    param_name, param_value = next(iter(params.items()))
    name_experience = f'{param_name}_{param_value}'
    with mlflow.start_run(run_name=f"reg_logistic_Cv_{name_experience}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Cv, y_train_Cv)

        y_pred = clf.predict(X_test_Cv)
        # F1 is both logged and used for model selection: compute it once.
        f1 = f1_score(y_test_Cv, y_pred)

        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Cv, y_test_Cv))
        mlflow.log_metric("Precision", precision_score(y_test_Cv, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test_Cv, y_pred))
        mlflow.log_metric("F1_Score", f1)

        # ROC curve and AUC from the second predict_proba column
        # (the positive class when targets are 0/1).
        fpr, tpr, thresholds = roc_curve(y_test_Cv, clf.predict_proba(X_test_Cv)[:, 1])
        roc_auc = auc(fpr, tpr)
        mlflow.log_metric("AUC", roc_auc)

        # Only a run that beats the best F1 seen so far in this loop gets its
        # model registered and its confusion matrix / ROC plot uploaded.
        if f1 > best_metric_value:
            best_metric_value = f1
            mlflow.set_tag("tag1", "Logistic regression with CountVectorizer()")
            mlflow.set_tags({"tag2":f'{name_experience}'})
            mlflow.sklearn.log_model(clf, "model", registered_model_name="Logistic_regression_Cv")

            # Confusion matrix saved as a bare CSV (no index, no header).
            conf_matrix = confusion_matrix(y_test_Cv, y_pred)
            conf_matrix_path = f"{artifact_path}confMat_RLCountVectorizer_{name_experience}.csv"
            pd.DataFrame(conf_matrix).to_csv(conf_matrix_path, index=False, header=False)
            mlflow.log_artifact(conf_matrix_path, "metrics")

            # ROC curve with the chance diagonal as reference.
            plt.figure(figsize=(8, 8))
            plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic (ROC) Curve')
            plt.legend(loc="lower right")
            plt.grid(True)
            roc_curve_path = f"{artifact_path}roc_RLCountVectorizer_{name_experience}.png"
            plt.savefig(roc_curve_path)
            # Close the figure so it is neither displayed inline nor kept in memory.
            plt.close()
            mlflow.log_artifact(roc_curve_path, "plots")

2024/02/15 11:25:39 INFO mlflow.tracking.fluent: Experiment with name 'DeepLearningTweet' does not exist. Creating a new experiment.
Successfully registered model 'Logistic_regression_Cv'.
Created version '1' of model 'Logistic_regression_Cv'.
Registered model 'Logistic_regression_Cv' already exists. Creating a new version of this model...
Created version '2' of model 'Logistic_regression_Cv'.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for 

In [50]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix

artifact_path = './artifacts/'
# The confusion matrix and ROC plot are written to local files before being
# uploaded to MLflow; to_csv() and savefig() do not create missing folders.
os.makedirs(artifact_path, exist_ok=True)

best_metric_value = -1
best_model_path = ""

mlflow.set_experiment("DeepLearningTweet")

# One MLflow run per entry of param_sets, trained on the TF-IDF features.
for params in param_sets:
    # Run label "<param>_<value>", built from the first key/value of the dict.
    param_name, param_value = next(iter(params.items()))
    name_experience = f'{param_name}_{param_value}'
    with mlflow.start_run(run_name=f"reg_logistic_Tv_{name_experience}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Tv, y_train_Tv)

        y_pred = clf.predict(X_test_Tv)
        # F1 is both logged and used for model selection: compute it once,
        # against the TF-IDF test labels (the original scored the selection
        # F1 against y_test_Cv, a copy-paste slip from the CountVectorizer cell).
        f1 = f1_score(y_test_Tv, y_pred)

        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Tv, y_test_Tv))
        mlflow.log_metric("Precision", precision_score(y_test_Tv, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test_Tv, y_pred))
        mlflow.log_metric("F1_Score", f1)

        # ROC curve and AUC from the second predict_proba column
        # (the positive class when targets are 0/1).
        fpr, tpr, thresholds = roc_curve(y_test_Tv, clf.predict_proba(X_test_Tv)[:, 1])
        roc_auc = auc(fpr, tpr)
        mlflow.log_metric("AUC", roc_auc)

        # Only a run that beats the best F1 seen so far in this loop gets its
        # model registered and its confusion matrix / ROC plot uploaded.
        if f1 > best_metric_value:
            best_metric_value = f1
            mlflow.set_tag("tag1", "Logistic regression with TfidfVectorizer()")
            mlflow.set_tags({"tag2":f'{name_experience}'})
            mlflow.sklearn.log_model(clf, "model", registered_model_name="Logistic_regression_Tv")

            # Confusion matrix saved as a bare CSV (no index, no header).
            conf_matrix = confusion_matrix(y_test_Tv, y_pred)
            conf_matrix_path = f"{artifact_path}confMat_RLTfidfVectorizer_{name_experience}.csv"
            pd.DataFrame(conf_matrix).to_csv(conf_matrix_path, index=False, header=False)
            mlflow.log_artifact(conf_matrix_path, "metrics")

            # ROC curve with the chance diagonal as reference.
            plt.figure(figsize=(8, 8))
            plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic (ROC) Curve')
            plt.legend(loc="lower right")
            plt.grid(True)
            roc_curve_path = f"{artifact_path}roc_RLTfidfVectorizer_{name_experience}.png"
            plt.savefig(roc_curve_path)
            # Close the figure so it is neither displayed inline nor kept in memory.
            plt.close()
            mlflow.log_artifact(roc_curve_path, "plots")

Successfully registered model 'Logistic_regression_Tv'.
Created version '1' of model 'Logistic_regression_Tv'.
Registered model 'Logistic_regression_Tv' already exists. Creating a new version of this model...
Created version '2' of model 'Logistic_regression_Tv'.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_opt