In [7]:
# manipulation des données
import numpy as np
import pandas as pd

# matplotlib et seaborn pour les représentations graphiques
import matplotlib.pyplot as plt
import seaborn as sns

# Gestion du système de fichiers
import os

In [8]:
# Load the tweet sample; the preview below shows a 'target' label column,
# the raw 'text' and the already pre-processed 'words' column.
df = pd.read_csv(filepath_or_buffer="../Data/2.sample_dataset.csv")

# Leave the frame as the last expression so the notebook renders its preview.
df

Unnamed: 0,target,id,date,flag,user,text,words
0,0,2054665466,Sat Jun 06 07:50:24 PDT 2009,NO_QUERY,boring_alice,ive had awesome day but the sun is missing wan...,ive awesome day sun miss want well weather
1,1,1823190427,Sat May 16 21:20:00 PDT 2009,NO_QUERY,yassychan,you will do great saw kevin teaching you,great saw kevin teach
2,0,1826975071,Sun May 17 09:43:20 PDT 2009,NO_QUERY,PRNCSmuriel3,its cold in md too,cold md
3,0,2202406793,Tue Jun 16 21:44:37 PDT 2009,NO_QUERY,mariazimmerman,does anyone know the girl that died of swine f...,anyone know girl die swine flu maybe go ucsd f...
4,0,2242106714,Fri Jun 19 11:46:38 PDT 2009,NO_QUERY,katrinachelsea,watching amelie and wishing was french,watch amelie wish french
...,...,...,...,...,...,...,...
999995,1,1679383266,Sat May 02 09:13:22 PDT 2009,NO_QUERY,Shawna1976,goodmorning jordan needs to talk to you,goodmorning jordan need talk
999996,1,2053651702,Sat Jun 06 05:16:26 PDT 2009,NO_QUERY,harlequinxgirl,good mprninh to you too,good mprninh
999997,0,1563449159,Sun Apr 19 22:52:58 PDT 2009,NO_QUERY,Cynthi_ocho,cant believe spring break is coming to an end,cant believe spring break come end
999998,0,1751244042,Sat May 09 19:00:02 PDT 2009,NO_QUERY,Ericanderson09,everyone have texted in the last hour complete...,everyone texted last hour completely ignore im...


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep only the rows whose pre-processed text is present: the vectorizer
# cannot tokenize missing values.
df = df[df['words'].notna()]

# Bag-of-words representation: labels first, then the sparse count matrix
# learned from (and applied to) the 'words' column.
vectorizer = CountVectorizer()
y_CountVecorizer = df['target']
X_CountVecorizer = vectorizer.fit_transform(df['words'])

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# TF-IDF representation of the same 'words' column.
# NOTE: this rebinds `vectorizer`, replacing the CountVectorizer created earlier.
vectorizer = TfidfVectorizer()
y_TfidfVectorizer = df['target']
X_TfidfVectorizer = vectorizer.fit_transform(df['words'])

# Persist the fitted TF-IDF vectorizer so that new sentences can later be
# transformed with the exact same vocabulary.
with open("vectorizer.pkl", mode='wb') as pickle_out:
    joblib.dump(vectorizer, pickle_out)

In [11]:
def predict_sentence(preprocess_sentence, vectorizer_path="vectorizer.pkl", model_path="model.pkl"):
    """Predict the class of a single sentence with the persisted pipeline.

    Parameters
    ----------
    preprocess_sentence : str
        The sentence to classify. It is wrapped in a one-element list before
        being handed to the vectorizer, so exactly one prediction is produced.
    vectorizer_path : str, optional
        Path of the joblib file holding the fitted vectorizer. Defaults to
        "vectorizer.pkl", the file written by the TF-IDF cell of this notebook.
    model_path : str, optional
        Path of the joblib file holding the fitted classifier. Defaults to
        "model.pkl".
        NOTE(review): no visible cell of this notebook writes "model.pkl";
        it must already exist on disk - confirm where it comes from.

    Returns
    -------
    Whatever ``classifier.predict`` returns for the one-row feature matrix
    (the notebook example prints it as ``[1]``).

    Raises
    ------
    FileNotFoundError
        If either file does not exist.
    """
    # SECURITY: joblib.load unpickles arbitrary objects, which can execute
    # code. Only point these paths at files you produced yourself.
    with open(vectorizer_path, 'rb') as vec_file:
        vectorizer = joblib.load(vec_file)

    # The vectorizer works on an iterable of documents, hence the list.
    # A separate name is used so the caller's sentence is not shadowed.
    features = vectorizer.transform([preprocess_sentence])

    with open(model_path, 'rb') as model_file:
        classifier = joblib.load(model_file)

    return classifier.predict(features)

In [12]:
# Quick smoke test of the persisted vectorizer + classifier on one sentence.
preprocess_sentence = 'Hello how are you?'
result = predict_sentence(preprocess_sentence=preprocess_sentence)
print(result)

[1]


In [59]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for evaluation, once per feature representation.
# Both calls share the same seed so the experiments are reproducible.

# Bag-of-words (CountVectorizer) features.
X_train_Cv, X_test_Cv, y_train_Cv, y_test_Cv = train_test_split(
    X_CountVecorizer,
    y_CountVecorizer,
    test_size=0.4,
    random_state=42,
)

# TF-IDF features.
X_train_Tv, X_test_Tv, y_train_Tv, y_test_Tv = train_test_split(
    X_TfidfVectorizer,
    y_TfidfVectorizer,
    test_size=0.4,
    random_state=42,
)

In [60]:
from sklearn.linear_model import LogisticRegression
import mlflow

# One-at-a-time hyperparameter variations for LogisticRegression: every entry
# overrides exactly one argument and leaves the others at their defaults.
# The order (solvers, then iteration caps, then C values) is preserved
# because the experiment loops iterate over this list in order.
param_sets = (
    [{'solver': solver_name} for solver_name in ('liblinear', 'saga', 'lbfgs')]
    + [{'max_iter': iteration_cap} for iteration_cap in (100, 200, 300)]
    + [{'C': strength} for strength in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100)]
)

In [61]:
# End whatever MLflow run is currently active before the experiment loops
# start their own runs (presumably a leftover from an interrupted cell).
mlflow.end_run()

In [62]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix
import joblib

artifact_path = './artifacts/'
# Create the local artifact directory up front: joblib.dump, DataFrame.to_csv
# and savefig below all write into it and fail if it does not exist.
os.makedirs(artifact_path, exist_ok=True)

# F1 lies in [0, 1], so -1 guarantees the first run becomes the initial best.
best_metric_value = -1
best_model_path = ""

mlflow.set_experiment("Reg_Logistic_CountVectorizer")

# One MLflow run per single-hyperparameter configuration in `param_sets`.
for params in param_sets:
    # Each configuration holds one key/value pair; use it as the run label.
    name_experience = f'{list(params.keys())[0]}_{list(params.values())[0]}'
    with mlflow.start_run(run_name=f"reg_logistic_{name_experience}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Cv, y_train_Cv)

        # Predict on the CountVectorizer test split. The model was trained on
        # count features, so scoring it on the TF-IDF matrix (as this cell
        # used to do) yields metrics for the wrong feature representation.
        y_pred = clf.predict(X_test_Cv)
        f1 = f1_score(y_test_Cv, y_pred)

        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Cv, y_test_Cv))
        mlflow.log_metric("Precision", precision_score(y_test_Cv, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test_Cv, y_pred))
        mlflow.log_metric("F1_Score", f1)

        # ROC/AUC use the probability of the positive class (column 1).
        fpr, tpr = roc_curve(y_test_Cv, clf.predict_proba(X_test_Cv)[:, 1])[:2]
        roc_auc = auc(fpr, tpr)
        mlflow.log_metric("AUC", roc_auc)

        # Keep a pickled copy of every model that improves on the best F1 so far.
        if f1 > best_metric_value:
            best_metric_value = f1
            best_model_path = f"{artifact_path}best_model_RLCountVectorizer_{name_experience}.pkl"
            joblib.dump(clf, best_model_path)
            mlflow.log_artifact(best_model_path, "best_model_RLCountVectorizer_")

        # Confusion matrix, stored as a bare CSV (no index, no header).
        conf_matrix = confusion_matrix(y_test_Cv, y_pred)
        conf_matrix_path = f"{artifact_path}confMat_RLCountVectorizer_{name_experience}.csv"
        pd.DataFrame(conf_matrix).to_csv(conf_matrix_path, index=False, header=False)
        mlflow.log_artifact(conf_matrix_path, "metrics")

        # ROC curve; the figure is closed right after saving so that the loop
        # does not accumulate open figures.
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver Operating Characteristic (ROC) Curve')
        ax.legend(loc="lower right")
        ax.grid(True)
        roc_curve_path = f"{artifact_path}roc_RLCountVectorizer_{name_experience}.png"
        fig.savefig(roc_curve_path)
        plt.close(fig)
        mlflow.log_artifact(roc_curve_path, "plots")

2024/02/05 16:36:46 INFO mlflow.tracking.fluent: Experiment with name 'Reg_Logistic_CountVectorizer' does not exist. Creating a new experiment.


In [63]:
# Register a model from a specific, hard-coded MLflow run.
# NOTE(review): the run id is tied to one past execution and no visible cell
# logs anything under the "RLCountVectorizer-model" artifact path - confirm
# the URI still points at the intended model before re-running.
result = mlflow.register_model(
    model_uri="runs:/0bac74c362034f96a7b1e1d41ad9580f/RLCountVectorizer-model",
    name="RLCountVectorizer-model",
)

Successfully registered model 'RLCountVectorizer-model'.
Created version '1' of model 'RLCountVectorizer-model'.


In [64]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

artifact_path = './artifacts/'
# Create the local artifact directory up front: joblib.dump, DataFrame.to_csv
# and savefig below all write into it and fail if it does not exist.
os.makedirs(artifact_path, exist_ok=True)

# F1 lies in [0, 1], so -1 guarantees the first run becomes the initial best.
best_metric_value = -1
best_model_path = ""

mlflow.set_experiment("Reg_Logistic_TfidfVectorizer")

# One MLflow run per single-hyperparameter configuration in `param_sets`.
for params in param_sets:
    # Each configuration holds one key/value pair; use it as the run label.
    name_experience = f'{list(params.keys())[0]}_{list(params.values())[0]}'
    with mlflow.start_run(run_name=f"reg_logistic_{name_experience}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Tv, y_train_Tv)

        # All metrics are computed on the TF-IDF test split, matching the
        # features the model was trained on. F1 is computed once and reused.
        y_pred = clf.predict(X_test_Tv)
        f1 = f1_score(y_test_Tv, y_pred)

        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Tv, y_test_Tv))
        mlflow.log_metric("Precision", precision_score(y_test_Tv, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test_Tv, y_pred))
        mlflow.log_metric("F1_Score", f1)

        # ROC/AUC use the probability of the positive class (column 1).
        fpr, tpr = roc_curve(y_test_Tv, clf.predict_proba(X_test_Tv)[:, 1])[:2]
        roc_auc = auc(fpr, tpr)
        mlflow.log_metric("AUC", roc_auc)

        # Keep a pickled copy of every model that improves on the best F1 so far.
        if f1 > best_metric_value:
            best_metric_value = f1
            best_model_path = f"{artifact_path}best_model_RLTfidfVectorizer_{name_experience}.pkl"
            joblib.dump(clf, best_model_path)
            mlflow.log_artifact(best_model_path, "best_model_RLTfidfVectorizer")

        # Confusion matrix, stored as a bare CSV (no index, no header).
        conf_matrix = confusion_matrix(y_test_Tv, y_pred)
        conf_matrix_path = f"{artifact_path}confMat_RLTfidfVectorizer_{name_experience}.csv"
        pd.DataFrame(conf_matrix).to_csv(conf_matrix_path, index=False, header=False)
        mlflow.log_artifact(conf_matrix_path, "metrics")

        # ROC curve; the figure is closed right after saving so that the loop
        # does not accumulate open figures.
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver Operating Characteristic (ROC) Curve')
        ax.legend(loc="lower right")
        ax.grid(True)
        roc_curve_path = f"{artifact_path}roc_RLTfidfVectorizer_{name_experience}.png"
        fig.savefig(roc_curve_path)
        plt.close(fig)
        mlflow.log_artifact(roc_curve_path, "plots")

2024/02/05 16:43:02 INFO mlflow.tracking.fluent: Experiment with name 'Reg_Logistic_TfidfVectorizer' does not exist. Creating a new experiment.


In [65]:
# Register a model from a specific, hard-coded MLflow run.
# NOTE(review): the run id is tied to one past execution and no visible cell
# logs anything under the "RLTfidfVectorizer-model" artifact path - confirm
# the URI still points at the intended model before re-running.
result = mlflow.register_model(
    model_uri="runs:/8192780356124971a6f9d4a8bbdb9e27/RLTfidfVectorizer-model",
    name="RLTfidfVectorizer-model",
)

Successfully registered model 'RLTfidfVectorizer-model'.
Created version '1' of model 'RLTfidfVectorizer-model'.


In [66]:
from sklearn.linear_model import LogisticRegression
import mlflow
from sklearn.model_selection import ParameterSampler

artifact_path = './artifacts/'
# Create the local artifact directory up front: joblib.dump, DataFrame.to_csv
# and savefig below all write into it and fail if it does not exist.
os.makedirs(artifact_path, exist_ok=True)

# F1 lies in [0, 1], so -1 guarantees the first run becomes the initial best.
best_metric_value = -1
best_model_path = ""

# Search space for the random search: 20 combinations are drawn from it.
param_space = {
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter': [100, 200, 300],
    'C': [0.001, 0.01, 0.1, 1, 10]
}

mlflow.set_experiment("LR_RandomParam_CountVectorizer")
# A fixed random_state makes the sampled combinations reproducible.
random_params = ParameterSampler(param_space, n_iter=20, random_state=42)

# One MLflow run per sampled combination; `i` identifies the run and its files.
for i, params in enumerate(random_params):
    with mlflow.start_run(run_name=f"reg_logistic_{i}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Cv, y_train_Cv)

        # F1 is computed once and reused for logging and model selection.
        y_pred = clf.predict(X_test_Cv)
        f1 = f1_score(y_test_Cv, y_pred)

        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Cv, y_test_Cv))
        mlflow.log_metric("Precision", precision_score(y_test_Cv, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test_Cv, y_pred))
        mlflow.log_metric("F1_Score", f1)

        # ROC/AUC use the probability of the positive class (column 1).
        fpr, tpr = roc_curve(y_test_Cv, clf.predict_proba(X_test_Cv)[:, 1])[:2]
        roc_auc = auc(fpr, tpr)
        mlflow.log_metric("AUC", roc_auc)

        # Keep a pickled copy of every model that improves on the best F1 so far.
        if f1 > best_metric_value:
            best_metric_value = f1
            # Name the file after the run index, like the other artifacts of
            # this cell. The previous `name_experience` suffix is never set by
            # this loop: it is a leftover global from the grid-search cells
            # (NameError on a fresh kernel, a misleading constant otherwise).
            best_model_path = f"{artifact_path}best_model_RLCountVectorizer_random_{i}.pkl"
            joblib.dump(clf, best_model_path)
            mlflow.log_artifact(best_model_path, "best_model_RLCountVectorizer_random")

        # Confusion matrix, stored as a bare CSV (no index, no header).
        conf_matrix = confusion_matrix(y_test_Cv, y_pred)
        conf_matrix_path = f"{artifact_path}confMat_RLCountVectorizer_random_{i}.csv"
        pd.DataFrame(conf_matrix).to_csv(conf_matrix_path, index=False, header=False)
        mlflow.log_artifact(conf_matrix_path, "metrics")

        # ROC curve; the figure is closed right after saving so that the loop
        # does not accumulate open figures.
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver Operating Characteristic (ROC) Curve')
        ax.legend(loc="lower right")
        ax.grid(True)
        roc_curve_path = f"{artifact_path}roc_RLCountVectorizer_random_{i}.png"
        fig.savefig(roc_curve_path)
        plt.close(fig)
        mlflow.log_artifact(roc_curve_path, "plots")

2024/02/05 16:55:02 INFO mlflow.tracking.fluent: Experiment with name 'LR_RandomParam_CountVectorizer' does not exist. Creating a new experiment.


In [67]:
# Register a model from a specific, hard-coded MLflow run.
# NOTE(review): the run id is tied to one past execution and no visible cell
# logs anything under the "RLCountVectorizer_random-model" artifact path -
# confirm the URI still points at the intended model before re-running.
result = mlflow.register_model(
    model_uri="runs:/c95101aa04dd4fb18d4dd16b2fed7dd0/RLCountVectorizer_random-model",
    name="RLCountVectorizer_random-model",
)

Successfully registered model 'RLCountVectorizer_random-model'.
Created version '1' of model 'RLCountVectorizer_random-model'.


In [68]:
from sklearn.linear_model import LogisticRegression
import mlflow
from sklearn.model_selection import ParameterSampler

artifact_path = './artifacts/'
# Create the local artifact directory up front: joblib.dump, DataFrame.to_csv
# and savefig below all write into it and fail if it does not exist.
os.makedirs(artifact_path, exist_ok=True)

# F1 lies in [0, 1], so -1 guarantees the first run becomes the initial best.
best_metric_value = -1
best_model_path = ""

# Search space for the random search: 20 combinations are drawn from it.
param_space = {
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter': [100, 200, 300],
    'C': [0.001, 0.01, 0.1, 1, 10]
}

mlflow.set_experiment("LR_RandomParam_TfidfVectorizer")
# A fixed random_state makes the sampled combinations reproducible.
random_params = ParameterSampler(param_space, n_iter=20, random_state=42)

# One MLflow run per sampled combination; `i` identifies the run and its files.
for i, params in enumerate(random_params):
    with mlflow.start_run(run_name=f"reg_logistic_{i}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Tv, y_train_Tv)

        # F1 is computed once and reused for logging and model selection.
        y_pred = clf.predict(X_test_Tv)
        f1 = f1_score(y_test_Tv, y_pred)

        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Tv, y_test_Tv))
        mlflow.log_metric("Precision", precision_score(y_test_Tv, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test_Tv, y_pred))
        mlflow.log_metric("F1_Score", f1)

        # ROC/AUC use the probability of the positive class (column 1).
        fpr, tpr = roc_curve(y_test_Tv, clf.predict_proba(X_test_Tv)[:, 1])[:2]
        roc_auc = auc(fpr, tpr)
        mlflow.log_metric("AUC", roc_auc)

        # Keep a pickled copy of every model that improves on the best F1 so far.
        if f1 > best_metric_value:
            best_metric_value = f1
            # Name the file after the run index, like the other artifacts of
            # this cell. The previous `name_experience` suffix is never set by
            # this loop: it is a leftover global from the grid-search cells
            # (NameError on a fresh kernel, a misleading constant otherwise).
            best_model_path = f"{artifact_path}best_model_RLTfidfVectorizer_random_{i}.pkl"
            joblib.dump(clf, best_model_path)
            mlflow.log_artifact(best_model_path, "best_model_RLTfidfVectorizer_random")

        # Confusion matrix, stored as a bare CSV (no index, no header).
        conf_matrix = confusion_matrix(y_test_Tv, y_pred)
        conf_matrix_path = f"{artifact_path}confMat_RLTfidfVectorizer_random_{i}.csv"
        pd.DataFrame(conf_matrix).to_csv(conf_matrix_path, index=False, header=False)
        mlflow.log_artifact(conf_matrix_path, "metrics")

        # ROC curve; the figure is closed right after saving so that the loop
        # does not accumulate open figures.
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver Operating Characteristic (ROC) Curve')
        ax.legend(loc="lower right")
        ax.grid(True)
        roc_curve_path = f"{artifact_path}roc_RLTfidfVectorizer_random_{i}.png"
        fig.savefig(roc_curve_path)
        plt.close(fig)
        mlflow.log_artifact(roc_curve_path, "plots")

2024/02/05 17:07:46 INFO mlflow.tracking.fluent: Experiment with name 'LR_RandomParam_TfidfVectorizer' does not exist. Creating a new experiment.


In [69]:
# Register a model from a specific, hard-coded MLflow run.
# NOTE(review): the run id is tied to one past execution and no visible cell
# logs anything under the "RLTfidfVectorizer_random-model" artifact path -
# confirm the URI still points at the intended model before re-running.
result = mlflow.register_model(
    model_uri="runs:/b0b640e75d9d4eeaa86ab1107994e63f/RLTfidfVectorizer_random-model",
    name="RLTfidfVectorizer_random-model",
)

Successfully registered model 'RLTfidfVectorizer_random-model'.
Created version '1' of model 'RLTfidfVectorizer_random-model'.
