In [14]:
# Data manipulation
import numpy as np
import pandas as pd

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# sklearn preprocessing, for handling categorical variables
from sklearn.preprocessing import LabelEncoder

# File-system handling
import os

# Suppress warnings
import warnings
# NOTE(review): 'ignore' with no category/module filter silences every warning
# issued through the `warnings` module for the rest of the session. This
# presumably also hides solver convergence warnings from the LogisticRegression
# fits further down - confirm that is intended.
warnings.filterwarnings('ignore')

In [15]:
# Read the sample dataset CSV (path is relative to the current working directory)
# into `df`, then show the frame as the cell's output.
df = pd.read_csv(filepath_or_buffer="../Data/2.sample_dataset2.csv")
df

Unnamed: 0,target,id,date,flag,user,text,words
0,0,2199811894,Tue Jun 16 18:02:13 PDT 2009,NO_QUERY,Bella94,have anything to tell miss my back midfielder ...,anything tell miss back midfielder guysbut thats
1,0,2070608955,Sun Jun 07 18:00:01 PDT 2009,NO_QUERY,AdamRamsay,its looking like it will,look like
2,1,2072052858,Sun Jun 07 20:19:12 PDT 2009,NO_QUERY,Rachepwns,ps when gonna pick up crest endorcement perfec...,p gon na pick crest endorcement perfect smile
3,0,2301050364,Tue Jun 23 14:56:07 PDT 2009,NO_QUERY,satori,no pell grant for me need to talk to finaid at...,pell grant need talk finaid school see option ...
4,1,1964060922,Fri May 29 13:39:52 PDT 2009,NO_QUERY,BianaBabinsky,thank you pat,thank pat
...,...,...,...,...,...,...,...
99995,1,1979381791,Sun May 31 02:48:15 PDT 2009,NO_QUERY,jadekirk,thats gorgeous pic,thats gorgeous pic
99996,1,2002103466,Tue Jun 02 03:09:14 PDT 2009,NO_QUERY,whatsvickydoing,okay well im gonna turn up to your doorstep wi...,okay well im gon na turn doorstep cash afternoon
99997,0,2008511597,Tue Jun 02 14:14:22 PDT 2009,NO_QUERY,Ryanduff,so wana get bk into djing properly but cant atm,wana get bk dj properly cant atm
99998,0,1970679476,Sat May 30 04:49:09 PDT 2009,NO_QUERY,malifal,gotta go get my mail from the post office in t...,get ta go get mail post office day age still d...


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep only the rows that have a value in 'words'; the vectorizer below is fed
# that column directly.
has_words = df['words'].notna()
df = df[has_words]

# Token-count features built from the 'words' column, with 'target' as labels.
# NOTE(review): the vectorizer is fitted on the whole frame here, i.e. before the
# train/test split performed in a later cell.
vectorizer = CountVectorizer()
X_CountVecorizer = vectorizer.fit_transform(df['words'])
y_CountVecorizer = df['target']

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features built from the same 'words' column, with 'target' as labels.
# Note: this rebinds `vectorizer`, so the CountVectorizer instance assigned in the
# previous cell is no longer reachable through that name.
vectorizer = TfidfVectorizer()
X_TfidfVectorizer, y_TfidfVectorizer = (
    vectorizer.fit_transform(df['words']),
    df['target'],
)

In [18]:
from sklearn.model_selection import train_test_split

# Both feature sets are split with the same hold-out fraction (40 %) and seed.
# Presumably this yields the same row partition for both, since both matrices are
# built from the same rows of `df` - verify if the two are compared row by row.
split_options = {'test_size': 0.4, 'random_state': 42}

# CountVectorizer features
X_train_Cv, X_test_Cv, y_train_Cv, y_test_Cv = train_test_split(
    X_CountVecorizer, y_CountVecorizer, **split_options
)

# TfidfVectorizer features
X_train_Tv, X_test_Tv, y_train_Tv, y_test_Tv = train_test_split(
    X_TfidfVectorizer, y_TfidfVectorizer, **split_options
)

In [19]:
from sklearn.linear_model import LogisticRegression
import mlflow

# One-at-a-time LogisticRegression settings: each entry overrides a single
# constructor argument and leaves every other argument at its default.
# Order: solvers first, then iteration caps, then regularisation strengths.
param_sets = (
    [{'solver': solver_name} for solver_name in ('liblinear', 'saga', 'lbfgs')]
    + [{'max_iter': n_iterations} for n_iterations in (100, 200, 300)]
    + [{'C': strength} for strength in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100)]
)

In [20]:
from sklearn.metrics import adjusted_rand_score, f1_score, precision_score, recall_score

# One MLflow run per entry of `param_sets`, trained and evaluated on the
# CountVectorizer split.
mlflow.set_experiment("Reg_Logistic_CountVectorizer2")

# Label-based metrics logged for every run, in logging order.
prediction_metrics = (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1_Score", f1_score),
)

for i, params in enumerate(param_sets):
    # Run name is derived from the first (key, value) pair of the parameter dict.
    param_name, param_value = list(params.items())[0]
    name_experience = f'{param_name}_{param_value}'
    with mlflow.start_run(run_name=f"reg_logistic_{name_experience}"):
        clf = LogisticRegression(**params).fit(X_train_Cv, y_train_Cv)
        y_pred = clf.predict(X_test_Cv)

        # The whole parameter dict is stored under the single MLflow param "params".
        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Cv, y_test_Cv))
        for metric_name, metric_fn in prediction_metrics:
            mlflow.log_metric(metric_name, metric_fn(y_test_Cv, y_pred))
        
        

In [9]:
from sklearn.metrics import (
    adjusted_rand_score,
    f1_score,
    precision_score,
    recall_score,
)

# One MLflow run per entry of `param_sets`, trained and evaluated on the
# TfidfVectorizer split.
mlflow.set_experiment("Reg_Logistic_TfidfVectorizer2")

for i, params in enumerate(param_sets):
    # Run name is derived from the first (key, value) pair of the parameter dict.
    first_key, first_value = list(params.items())[0]
    name_experience = f'{first_key}_{first_value}'
    run_label = f"reg_logistic_{name_experience}"

    with mlflow.start_run(run_name=run_label):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Tv, y_train_Tv)
        y_pred = clf.predict(X_test_Tv)

        # The whole parameter dict is stored under the single MLflow param "params".
        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Tv, y_test_Tv))
        # Remaining metrics share the (y_true, y_pred) call signature.
        for metric_name, metric_fn in {
            "Precision": precision_score,
            "Recall": recall_score,
            "F1_Score": f1_score,
        }.items():
            mlflow.log_metric(metric_name, metric_fn(y_test_Tv, y_pred))

2024/01/24 15:29:47 INFO mlflow.tracking.fluent: Experiment with name 'Reg_Logistic_TfidfVectorizer' does not exist. Creating a new experiment.


In [14]:
from sklearn.linear_model import LogisticRegression
import mlflow
from sklearn.model_selection import ParameterSampler

# Candidate values for the random search over LogisticRegression settings.
param_space = {
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter': [100, 200, 300],
    'C': [0.001, 0.01, 0.1, 1, 10]
}

mlflow.set_experiment("LR_RandomParam_CountVectorizer2")
# 20 parameter combinations drawn from `param_space` with a fixed seed.
random_params = ParameterSampler(param_space, n_iter=20, random_state=42)

# One MLflow run per sampled combination, trained AND evaluated on the
# CountVectorizer split.
for i, params in enumerate(random_params):
    with mlflow.start_run(run_name=f"reg_logistic_{i}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Cv, y_train_Cv)

        # Fix: predictions must come from the CountVectorizer test features, the
        # same representation the model was fitted on. The previous code predicted
        # on X_test_Tv (TF-IDF features), so Precision/Recall/F1 were computed from
        # a different feature matrix than the logged accuracy.
        y_pred = clf.predict(X_test_Cv)
        # The whole parameter dict is stored under the single MLflow param "params".
        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Cv, y_test_Cv))
        mlflow.log_metric("Precision", precision_score(y_test_Cv, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test_Cv, y_pred))
        mlflow.log_metric("F1_Score", f1_score(y_test_Cv, y_pred))

2024/01/24 15:43:27 INFO mlflow.tracking.fluent: Experiment with name 'LR_RandomParam_CountVectorizer' does not exist. Creating a new experiment.


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterSampler
import mlflow

# Candidate values for the random search over LogisticRegression settings.
param_space = dict(
    solver=['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    max_iter=[100, 200, 300],
    C=[0.001, 0.01, 0.1, 1, 10],
)

mlflow.set_experiment("LR_RandomParam_TfidfVectorizer2")
# 20 parameter combinations drawn from `param_space` with a fixed seed.
random_params = ParameterSampler(param_space, n_iter=20, random_state=42)

# One MLflow run per sampled combination, trained and evaluated on the
# TfidfVectorizer split.
for i, params in enumerate(random_params):
    run_label = f"reg_logistic_{i}"
    with mlflow.start_run(run_name=run_label):
        clf = LogisticRegression(**params).fit(X_train_Tv, y_train_Tv)
        y_pred = clf.predict(X_test_Tv)

        # The whole parameter dict is stored under the single MLflow param "params".
        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Tv, y_test_Tv))
        # Remaining metrics share the (y_true, y_pred) call signature.
        for metric_name, metric_fn in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1_Score", f1_score),
        ):
            mlflow.log_metric(metric_name, metric_fn(y_test_Tv, y_pred))

2024/01/24 15:55:16 INFO mlflow.tracking.fluent: Experiment with name 'LR_RandomParam_TfidfVectorizer' does not exist. Creating a new experiment.
