In [1]:
# Data manipulation
import numpy as np
import pandas as pd

# matplotlib and seaborn for plotting (missingno is imported alongside them)
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# sklearn preprocessing, for handling categorical variables
from sklearn.preprocessing import LabelEncoder

# File-system handling
import os

# Silence warnings.
# NOTE(review): this ignores every warning category for the whole session, so
# any warnings emitted by the model fits in later cells will not be shown.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the sample dataset CSV (path is relative to the notebook's working
# directory) into `df`, then display it as the cell output.
sample_csv_path = "../Data/2.sample_dataset.csv"
df = pd.read_csv(sample_csv_path)
df

Unnamed: 0,target,id,date,flag,user,text,words
0,0,2054665466,Sat Jun 06 07:50:24 PDT 2009,NO_QUERY,boring_alice,ive had awesome day but the sun is missing wan...,ive awesome day sun miss want well weather
1,1,1823190427,Sat May 16 21:20:00 PDT 2009,NO_QUERY,yassychan,you will do great saw kevin teaching you,great saw kevin teach
2,0,1826975071,Sun May 17 09:43:20 PDT 2009,NO_QUERY,PRNCSmuriel3,its cold in md too,cold md
3,0,2202406793,Tue Jun 16 21:44:37 PDT 2009,NO_QUERY,mariazimmerman,does anyone know the girl that died of swine f...,anyone know girl die swine flu maybe go ucsd f...
4,0,2242106714,Fri Jun 19 11:46:38 PDT 2009,NO_QUERY,katrinachelsea,watching amelie and wishing was french,watch amelie wish french
...,...,...,...,...,...,...,...
999995,1,1679383266,Sat May 02 09:13:22 PDT 2009,NO_QUERY,Shawna1976,goodmorning jordan needs to talk to you,goodmorning jordan need talk
999996,1,2053651702,Sat Jun 06 05:16:26 PDT 2009,NO_QUERY,harlequinxgirl,good mprninh to you too,good mprninh
999997,0,1563449159,Sun Apr 19 22:52:58 PDT 2009,NO_QUERY,Cynthi_ocho,cant believe spring break is coming to an end,cant believe spring break come end
999998,0,1751244042,Sat May 09 19:00:02 PDT 2009,NO_QUERY,Ericanderson09,everyone have texted in the last hour complete...,everyone texted last hour completely ignore im...


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Drop the rows whose 'words' value is missing before vectorizing.
df = df.dropna(subset=['words'])

# Token-count features: learn the vocabulary and build the document-term
# matrix from the 'words' column in a single call.
# NOTE(review): the vocabulary is fitted on the whole frame, i.e. before the
# train/test split performed in a later cell.
vectorizer = CountVectorizer()
X_CountVecorizer = vectorizer.fit_transform(df['words'])

# Labels taken from the same filtered frame, so they stay row-aligned with X.
y_CountVecorizer = df['target']

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features built from the same 'words' column.
# NOTE: this rebinds `vectorizer`, so the CountVectorizer instance created in
# the previous cell is no longer reachable under that name.
vectorizer = TfidfVectorizer()
words_corpus = df['words']
X_TfidfVectorizer = vectorizer.fit_transform(words_corpus)

# Labels for the TF-IDF matrix (same column as for the count features).
y_TfidfVectorizer = df['target']

In [5]:
from sklearn.model_selection import train_test_split

# Both feature matrices are split with the same settings: 40% of the rows go
# to the test set and the shuffle is seeded for reproducibility.
split_options = {"test_size": 0.4, "random_state": 42}

# Split of the CountVectorizer features / labels.
X_train_Cv, X_test_Cv, y_train_Cv, y_test_Cv = train_test_split(
    X_CountVecorizer,
    y_CountVecorizer,
    **split_options,
)

# Split of the TF-IDF features / labels.
X_train_Tv, X_test_Tv, y_train_Tv, y_test_Tv = train_test_split(
    X_TfidfVectorizer,
    y_TfidfVectorizer,
    **split_options,
)

In [6]:
from sklearn.linear_model import LogisticRegression
import mlflow

# Hyperparameter configurations to try, one keyword per configuration.
# Each dict is later expanded with `LogisticRegression(**params)`, and its
# single key/value pair is also used to name the corresponding MLflow run.
solver_choices = ('liblinear', 'saga', 'lbfgs')
max_iter_choices = (100, 200, 300)
c_choices = (0.0001, 0.001, 0.01, 0.1, 1, 10, 100)

param_sets = (
    [{'solver': solver} for solver in solver_choices]
    + [{'max_iter': max_iter} for max_iter in max_iter_choices]
    + [{'C': c} for c in c_choices]
)

In [7]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# Sweep over `param_sets` on the CountVectorizer features, one MLflow run per
# configuration, all grouped under a dedicated experiment.
mlflow.set_experiment("Reg_Logistic_CountVectorizer")

for i, params in enumerate(param_sets):
    # The first (key, value) pair of the configuration names the run.
    param_name, param_value = list(params.items())[0]
    name_experience = f'{param_name}_{param_value}'

    with mlflow.start_run(run_name=f"reg_logistic_{name_experience}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Cv, y_train_Cv)
        y_pred = clf.predict(X_test_Cv)

        # The whole configuration dict is stored under a single "params" key.
        mlflow.log_param("params", params)

        # Accuracy comes from the estimator; the other scores reuse y_pred.
        mlflow.log_metric("accuracy", clf.score(X_test_Cv, y_test_Cv))
        for metric_name, metric_fn in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1_Score", f1_score),
        ):
            mlflow.log_metric(metric_name, metric_fn(y_test_Cv, y_pred))
        
        

2024/01/24 20:32:51 INFO mlflow.tracking.fluent: Experiment with name 'Reg_Logistic_CountVectorizer' does not exist. Creating a new experiment.


In [8]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Sweep over `param_sets` on the TF-IDF features, one MLflow run per
# configuration, all grouped under a dedicated experiment.
mlflow.set_experiment("Reg_Logistic_TfidfVectorizer")

for i, params in enumerate(param_sets):
    # The first (key, value) pair of the configuration names the run.
    param_name, param_value = list(params.items())[0]
    name_experience = f'{param_name}_{param_value}'

    with mlflow.start_run(run_name=f"reg_logistic_{name_experience}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Tv, y_train_Tv)
        y_pred = clf.predict(X_test_Tv)

        # The whole configuration dict is stored under a single "params" key.
        mlflow.log_param("params", params)

        # Accuracy comes from the estimator; the other scores reuse y_pred.
        mlflow.log_metric("accuracy", clf.score(X_test_Tv, y_test_Tv))
        for metric_name, metric_fn in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1_Score", f1_score),
        ):
            mlflow.log_metric(metric_name, metric_fn(y_test_Tv, y_pred))

2024/01/24 20:38:26 INFO mlflow.tracking.fluent: Experiment with name 'Reg_Logistic_TfidfVectorizer' does not exist. Creating a new experiment.


In [9]:
from sklearn.linear_model import LogisticRegression
import mlflow
from sklearn.model_selection import ParameterSampler

# Discrete search space for the random hyperparameter search.
param_space = {
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter': [100, 200, 300],
    'C': [0.001, 0.01, 0.1, 1, 10]
}

mlflow.set_experiment("LR_RandomParam_CountVectorizer")
# Draw 20 configurations from the space; the seed keeps the draw reproducible.
random_params = ParameterSampler(param_space, n_iter=20, random_state=42)

for i, params in enumerate(random_params):
    with mlflow.start_run(run_name=f"reg_logistic_{i}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Cv, y_train_Cv)

        # The model is fitted on CountVectorizer features, so it must be
        # evaluated on the CountVectorizer test split as well. Predicting on
        # the TF-IDF matrix (as this cell previously did) feeds the model a
        # different feature representation and makes precision/recall/F1
        # inconsistent with the accuracy logged below.
        y_pred = clf.predict(X_test_Cv)
        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Cv, y_test_Cv))
        mlflow.log_metric("Precision", precision_score(y_test_Cv, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test_Cv, y_pred))
        mlflow.log_metric("F1_Score", f1_score(y_test_Cv, y_pred))

2024/01/24 20:41:00 INFO mlflow.tracking.fluent: Experiment with name 'LR_RandomParam_CountVectorizer' does not exist. Creating a new experiment.


In [10]:
from sklearn.linear_model import LogisticRegression
import mlflow
from sklearn.model_selection import ParameterSampler

# Discrete search space for the random hyperparameter search (TF-IDF run).
param_space = dict(
    solver=['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    max_iter=[100, 200, 300],
    C=[0.001, 0.01, 0.1, 1, 10],
)

mlflow.set_experiment("LR_RandomParam_TfidfVectorizer")
# Draw 20 configurations from the space; the seed keeps the draw reproducible.
random_params = ParameterSampler(param_space, n_iter=20, random_state=42)

for i, params in enumerate(random_params):
    # Runs are named after their position in the sampled sequence.
    with mlflow.start_run(run_name=f"reg_logistic_{i}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Tv, y_train_Tv)
        y_pred = clf.predict(X_test_Tv)

        # The whole configuration dict is stored under a single "params" key.
        mlflow.log_param("params", params)

        # Accuracy comes from the estimator; the other scores reuse y_pred.
        mlflow.log_metric("accuracy", clf.score(X_test_Tv, y_test_Tv))
        for metric_name, metric_fn in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1_Score", f1_score),
        ):
            mlflow.log_metric(metric_name, metric_fn(y_test_Tv, y_pred))

2024/01/24 20:49:40 INFO mlflow.tracking.fluent: Experiment with name 'LR_RandomParam_TfidfVectorizer' does not exist. Creating a new experiment.
