<a href="https://colab.research.google.com/github/Manal-L/implementez_modele_scoring/blob/main/projet7_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Implémentez un modèle de scoring

#Chargement des données

In [1]:
import pandas as pd
import zipfile
from google.colab import drive

drive.mount('/content/drive')

#le chemin vers le fichier
file_path = '/content/drive/My Drive/df_clean.zip'

with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall('/content/')

Mounted at /content/drive


In [14]:
import os
os.chdir('/content/drive/My Drive/MLflow')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
!ls

In [None]:
df = pd.read_csv('df_clean.csv')
print(df.shape)
df.head()

In [3]:
list(df)

['SK_ID_CURR',
 'TARGET',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356251 entries, 0 to 356250
Columns: 770 entries, SK_ID_CURR to CC_COUNT
dtypes: bool(133), float64(584), int64(42), object(11)
memory usage: 1.7+ GB


#Les valeurs manquantes

In [5]:
#le pourcentage de valeurs manquantes pour chaque colonne
valeur_m_p = df.isnull().mean() * 100

#les colonnes avec plus de 70% de valeurs manquantes
valeur_m_cols = valeur_m_p[valeur_m_p > 70].index

#le nombre de colonnes avec plus de 70% de valeurs manquantes
nb_valeur_m_cols = valeur_m_cols.shape[0]
print(f"Le nombre de colonnes avec plus de 70% de valeurs manquantes: {nb_valeur_m_cols}")

Le nombre de colonnes avec plus de 70% de valeurs manquantes: 144


In [6]:
#le nombre de valeurs manquantes par colonne
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
print(missing_values)

TARGET                                  48744
AMT_ANNUITY                                36
AMT_GOODS_PRICE                           278
DAYS_EMPLOYED                           64648
OWN_CAR_AGE                            235239
                                        ...  
CC_NAME_CONTRACT_STATUS_Signed_MAX     252693
CC_NAME_CONTRACT_STATUS_Signed_MEAN    252693
CC_NAME_CONTRACT_STATUS_Signed_SUM     252693
CC_NAME_CONTRACT_STATUS_Signed_VAR     253385
CC_COUNT                               252693
Length: 590, dtype: int64


In [7]:
#la suppression des colonnes avec plus de 70% de valeurs manquantes
df = df.drop(columns=valeur_m_cols)

print(f"Le nombre de colonnes restantes après suppression: {df.shape[1]}")
print(f"Le nombre de colonnes supprimées: {len(valeur_m_cols)}")

Le nombre de colonnes restantes après suppression: 626
Le nombre de colonnes supprimées: 144


In [8]:
#je ne garde que les donneées où la colonne 'TARGET' n'est pas vide.
# 'TARGET' est la variable à prédire (classification), donc ces données sont nécessaires pour l'entraînement.
df_classification = df[df['TARGET'].notnull()]
print(df_classification.shape)

(307507, 626)


In [9]:
#les colonnes cat
cat_cols = df_classification.select_dtypes(include=['object']).columns
print(cat_cols)

Index([], dtype='object')


Après la suppression des cols à >70% de valeurs manquantes, il ne reste plus de cols cat.

#Imputation

In [10]:
#je remplace les valeurs manquantes par la moyenne des colonnes numériques
df_classification_imputed = df_classification.copy()
df_classification_imputed.fillna(df_classification.mean(), inplace=True)

In [11]:
#vérification qu'il n'y a plus de valeurs manquantes
print(df_classification_imputed.isnull().sum().sum())

0


In [12]:
#le df avec imputation mean en csv
df_classification_imputed.to_csv('df_classification_imputed.csv', index=False)

In [None]:
df = pd.read_csv('df_classification_imputed.csv')
df.shape

#Echantillon

In [13]:
import pandas as pd

#la taille de l'échantillon total
sample_size = 50000

#je calcule la proportion des classes
class_distribution = df_classification_imputed['TARGET'].value_counts(normalize=True)

#échantillon stratifié en fonction de la distribution de 'TARGET'
df_sample = df_classification_imputed.groupby('TARGET', group_keys=False).apply(lambda x: x.sample(int(sample_size * class_distribution.loc[x.name]), random_state=42))

#la distribution dans l'échantillon
print(df_sample['TARGET'].value_counts(normalize=True))


TARGET
0.0    0.919278
1.0    0.080722
Name: proportion, dtype: float64


In [14]:
df_sample.to_csv('df_sample.csv', index=False)

In [1]:
import pandas as pd

df_sample = pd.read_csv('df_sample.csv')
df_sample.shape

(49999, 626)

#Entrainement des modeles avec tracking MLFlow

In [2]:
from sklearn.model_selection import train_test_split

#la séparation des caractéristiques (X) et de la cible (y) en supprimant les colonnes 'TARGET' et 'SK_ID_CURR'
X = df_sample.drop(columns=["TARGET", "SK_ID_CURR"])

#la cible est la colonne 'TARGET'
y = df_sample["TARGET"]

#division stratifiée des données (80% train, 20% val)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

#les formes des ensembles
print(f"TRAIN: entraînement: {X_train.shape}, TRAIN: val: {X_val.shape}")
print(f"CIBLE: entraînement: {y_train.shape}, CIBLE: val: {y_val.shape}")

TRAIN: entraînement: (39999, 624), TRAIN: val: (10000, 624)
CIBLE: entraînement: (39999,), CIBLE: val: (10000,)


In [3]:
#j'install MLFlow
! pip install mlflow

Collecting mlflow
  Downloading mlflow-2.16.2-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.16.2 (from mlflow)
  Downloading mlflow_skinny-2.16.2-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.16.2->mlflow)
  Downloading databricks_sdk-0.32.3-py3-none-any.whl.metadata (37 kB)
Collecting gitpython<4,>=3.1.9 (from mlflow-skinny==2.16.2->mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==2.16.2->mlflow)
  Downloading opentelemetry_api-1.2

##Random Forest Classifier

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import mlflow
import mlflow.sklearn

In [18]:
#l'expérience MLFlow : RandomForestClassifier
mlflow.set_experiment("RandomForestClassifier")

# RUN 1 : random_forest_default

#initialisation du RandomForestClassifier avec hyperparamètres par défaut
rf_clf = RandomForestClassifier()

with mlflow.start_run(run_name='random_forest_default'):
    #l'enregistrement des hyperparamètres
    mlflow.log_param("model_type", "RandomForestClassifier")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", None)
    mlflow.log_param("random_state", None)

    #l'entraînement du modèle
    rf_clf.fit(X_train, y_train)

    #la prédictions
    y_pred_rf = rf_clf.predict(X_val)
    y_prob_rf = rf_clf.predict_proba(X_val)[:, 1]

    #les métriques
    try:
        auc_score_rf = roc_auc_score(y_val, y_prob_rf)
    except ValueError:
        auc_score_rf = float('nan')

    accuracy_rf = accuracy_score(y_val, y_pred_rf)

    #l'enregistrement des métriques
    mlflow.log_metric("auc", auc_score_rf)
    mlflow.log_metric("accuracy", accuracy_rf)

    #l'enregistrement du modèle Random Forest
    mlflow.sklearn.log_model(rf_clf, "random_forest_model")

#pour vérification
print(f"Random Forest AUC: {auc_score_rf}, Accuracy: {accuracy_rf}")


2024/09/21 06:18:10 INFO mlflow.tracking.fluent: Experiment with name 'RandomForestClassifier' does not exist. Creating a new experiment.


Random Forest AUC: 0.7005143453392627, Accuracy: 0.9194


In [None]:
#pour vérifier que mon modèle est bien enregistré sur le dossier MLflow (Drive)
!ls

In [20]:
#l'expérience MLFlow : RandomForestClassifier
mlflow.set_experiment("RandomForestClassifier")

# RUN 2 : random_forest_hyper1

#initialisation du RandomForestClassifier avec hyperparamètres
rf_clf_h1 = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42)

with mlflow.start_run(run_name='random_forest_hyper1'):
    #l'enregistrement des hyperparamètres
    mlflow.log_param("model_type", "RandomForestClassifier")
    mlflow.log_param("n_estimators", 200)
    mlflow.log_param("max_depth", 5)
    mlflow.log_param("random_state", 42)

    #l'entraînement du modèle
    rf_clf_h1.fit(X_train, y_train)

    #la prédictions
    y_pred_rf_h1 = rf_clf_h1.predict(X_val)
    y_prob_rf_h1 = rf_clf_h1.predict_proba(X_val)[:, 1]

    #les métriques
    try:
        auc_score_rf_h1 = roc_auc_score(y_val, y_prob_rf_h1)
    except ValueError:
        auc_score_rf_h1 = float('nan')

    accuracy_rf_h1 = accuracy_score(y_val, y_pred_rf_h1)

    #l'enregistrement des métriques
    mlflow.log_metric("auc_h1", auc_score_rf_h1)
    mlflow.log_metric("accuracy_h1", accuracy_rf_h1)

    #l'enregistrement du modèle Random Forest
    mlflow.sklearn.log_model(rf_clf_h1, "random_forest_model_h1")

#pour vérification
print(f"Random Forest AUC_h1: {auc_score_rf_h1}, Accuracy_h1: {accuracy_rf_h1}")



Random Forest AUC_h1: 0.7231446371498382, Accuracy_h1: 0.9193


In [26]:
#pour vérifier que mon modèle est bien enregistré dans mlruns sur le dossier MLflow (Drive)
!ls

mlruns	projet7_1.ipynb


In [25]:
mlflow.get_experiment_by_name("RandomForestClassifier")


<Experiment: artifact_location='file:///content/drive/MyDrive/MLflow/mlruns/565008962710455641', creation_time=1726899490519, experiment_id='565008962710455641', last_update_time=1726899490519, lifecycle_stage='active', name='RandomForestClassifier', tags={}>

Je vais utilier GridSearchCV : le modèle est entraîné avec toutes les combinaisons d'hyperparamètres spécifiées, et la meilleure combinaison est sélectionnée en fonction de la métrique de score :accuracy pour l'instant.

Ensuite, après avoir trouvé les meilleurs hyperparamètres, les prédictions sont effectuées, les métriques sont calculées

In [None]:
from sklearn.model_selection import GridSearchCV

#l'expérience MLFlow : RandomForestClassifier
mlflow.set_experiment("RandomForestClassifier")

#les hyperparamètres à explorer
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'random_state': [42]
}

# RUN 3 :
#initialisation du RandomForestClassifier
rf_clf_cv = RandomForestClassifier()

#la configuration de GridSearchCV
grid_search_rf = GridSearchCV(estimator=rf_clf_cv, param_grid=param_grid_rf, cv=5, scoring='accuracy')

#exécution de GridSearchCV
with mlflow.start_run(run_name='random_forest_grid_search_cv'):
    grid_search_rf.fit(X_train, y_train)

    #meilleurs paramètres et score
    best_params_rf = grid_search_rf.best_params_
    best_score_rf = grid_search_rf.best_score_

    #enregistrement des meilleurs paramètres
    mlflow.log_params(best_params_rf)
    mlflow.log_param("best_score", best_score_rf)

    #la prédictions avec les meilleurs paramètres
    y_pred_rf_cv = grid_search_rf.best_estimator_.predict(X_val)
    y_prob_rf_cv = grid_search_rf.best_estimator_.predict_proba(X_val)[:, 1]

    #calcul des métriques
    auc_score_rf_cv = roc_auc_score(y_val, y_prob_rf_cv)
    accuracy_rf_cv = accuracy_score(y_val, y_pred_rf_cv)

    #enregistrement des métriques
    mlflow.log_metric("auc_cv", auc_score_rf_cv)
    mlflow.log_metric("accuracy_cv", accuracy_rf_cv)

    #enregistrement du modèle RandomForest avec les meilleurs paramètres
    mlflow.sklearn.log_model(grid_search_rf.best_estimator_, "random_forest_best_model")

print(f"Best Random Forest Parameters: {best_params_rf}")
print(f"Random Forest AUC_cv: {auc_score_rf_cv}, Accuracy_cv: {accuracy_rf_cv}")


penalized_score = fp + 10 * fn

Je calcule le score en pénalisant les faux négatifs plus fortement

In [None]:
from sklearn.metrics import confusion_matrix

#l'expérience MLFlow : RandomForestClassifier
mlflow.set_experiment("RandomForestClassifier")

#les hyperparamètres à explorer
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'random_state': [42]
}

# RUN 4
#initialisation du RandomForestClassifier
rf_clf = RandomForestClassifier()

#la configuration de GridSearchCV
grid_search_rf = GridSearchCV(estimator=rf_clf, param_grid=param_grid_rf, cv=5, scoring='accuracy')

#exécution de GridSearchCV
with mlflow.start_run(run_name='random_forest_grid_search_score'):
    grid_search_rf.fit(X_train, y_train)

    # Meilleurs paramètres et score
    best_params_rf = grid_search_rf.best_params_
    best_score_rf = grid_search_rf.best_score_

    # Enregistrement des meilleurs paramètres
    mlflow.log_params(best_params_rf)
    mlflow.log_param("best_score", best_score_rf)

    # Prédictions avec les meilleurs paramètres
    y_pred_rf = grid_search_rf.best_estimator_.predict(X_val)

    # Calcul des métriques
    confusion = confusion_matrix(y_val, y_pred_rf)
    fp = confusion[0, 1]  # Faux positifs
    fn = confusion[1, 0]  # Faux négatifs

    penalized_score = fp + 10 * fn

    try:
        auc_score_rf = roc_auc_score(y_val, y_pred_rf)
    except ValueError:
        auc_score_rf = float('nan')

    accuracy_rf = accuracy_score(y_val, y_pred_rf)

    # Enregistrement des métriques
    mlflow.log_metric("auc", auc_score_rf)
    mlflow.log_metric("accuracy", accuracy_rf)
    mlflow.log_metric("penalized_score", penalized_score)

    # Enregistrement du modèle RandomForest avec les meilleurs paramètres
    mlflow.sklearn.log_model(grid_search_rf.best_estimator_, "random_forest_best_model_score")

print(f"Best Random Forest Parameters: {best_params_rf}")
print(f"Random Forest AUC: {auc_score_rf}, Accuracy: {accuracy_rf}, Penalized Score: {penalized_score}")

##Light GBM

In [28]:
#lightGBM ne prend pas en charge les caractères spéciaux dans les noms de colonnes donc je les supprime
import re

#la fonction pour nettoyer les noms de colonnes
def clean_column_names(df):
    df.columns = [re.sub(r'\W+', '_', col) for col in df.columns]
    return df

#j'applique la fonction à X_train et X_val
X_train = clean_column_names(X_train)
X_val = clean_column_names(X_val)

#maintenant les colonnes ne contiennent plus de caractères spéciaux

In [29]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score
import mlflow
import mlflow.sklearn

#l'expérience MLFlow : LightGBM
mlflow.set_experiment("LightGBM")

# RUN 1: lightgbm_default

#initialisation du LightGBM avec hyperparamètres par défaut
lgb_clf = lgb.LGBMClassifier()

with mlflow.start_run(run_name='lightgbm_default'):
    #enregistrement des hyperparamètres par défaut
    mlflow.log_param("model_type", "LGBMClassifier")
    mlflow.log_param("boosting_type", lgb_clf.boosting_type)  # 'gbdt' par défaut
    mlflow.log_param("num_leaves", lgb_clf.num_leaves)  # 31 par défaut
    mlflow.log_param("learning_rate", lgb_clf.learning_rate)  # 0.1 par défaut
    mlflow.log_param("n_estimators", lgb_clf.n_estimators)  # 100 par défaut
    mlflow.log_param("max_depth", lgb_clf.max_depth)  # -1 par défaut (pas de limite)

    #entraînement du modèle
    lgb_clf.fit(X_train, y_train)

    #la prédictions
    y_pred_lgb = lgb_clf.predict(X_val)

    #la probabilité pour la classe positive
    y_prob_lgb = lgb_clf.predict_proba(X_val)[:, 1]

    #je calcul les métriques
    try:
        auc_score_lgb = roc_auc_score(y_val, y_prob_lgb)
    except ValueError:
        auc_score_lgb = float('nan')

    accuracy_lgb = accuracy_score(y_val, y_pred_lgb)

    #enregistrement des métriques
    mlflow.log_metric("auc", auc_score_lgb)
    mlflow.log_metric("accuracy", accuracy_lgb)

    #enregistrement du modèle LightGBM
    mlflow.sklearn.log_model(lgb_clf, "lightgbm_model")

#les résultats pour vérification
print(f"LightGBM AUC: {auc_score_lgb}, Accuracy: {accuracy_lgb}")


[LightGBM] [Info] Number of positive: 3229, number of negative: 36770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.767886 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68500
[LightGBM] [Info] Number of data points in the train set: 39999, number of used features: 601
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080727 -> initscore=-2.432510
[LightGBM] [Info] Start training from score -2.432510




LightGBM AUC: 0.7595228630803218, Accuracy: 0.9187


In [30]:
#l'expérience MLFlow : LightGBM
mlflow.set_experiment("LightGBM")

# RUN 2: lightgbm_hyper

#initialisation du LightGBM avec des hyperparamètres personnalisés
lgb_clf_h1 = lgb.LGBMClassifier(
    num_leaves=50,          #j'augmenter le nombre de feuilles
    max_depth=10,           #la profondeur des arbres
    learning_rate=0.05,     #je diminue le taux d'apprentissage
    n_estimators=200        #j'augmenter le nombre d'estimations (arbres)
)

with mlflow.start_run(run_name='lightgbm_custom_h1'):
    #enregistrement des hyperparamètres personnalisés
    mlflow.log_param("model_type", "LGBMClassifier")
    mlflow.log_param("boosting_type", lgb_clf_h1.boosting_type)  #c'était 'gbdt' par défaut
    mlflow.log_param("num_leaves", lgb_clf_h1.num_leaves)
    mlflow.log_param("learning_rate", lgb_clf_h1.learning_rate)
    mlflow.log_param("n_estimators", lgb_clf_h1.n_estimators)
    mlflow.log_param("max_depth", lgb_clf_h1.max_depth)

    #entraînement du modèle
    lgb_clf_h1.fit(X_train, y_train)

    #la prédictions
    y_pred_lgb_h1 = lgb_clf_h1.predict(X_val)
    y_prob_lgb_h1 = lgb_clf_h1.predict_proba(X_val)[:, 1]

    #je calcul les métriques
    try:
        auc_score_lgb_h1 = roc_auc_score(y_val, y_prob_lgb_h1)
    except ValueError:
        auc_score_lgb_h1 = float('nan')

    accuracy_lgb_h1 = accuracy_score(y_val, y_pred_lgb_h1)

    #l'enregistrement des métriques
    mlflow.log_metric("auc_h1", auc_score_lgb_h1)
    mlflow.log_metric("accuracy_h1", accuracy_lgb_h1)

    #l'enregistrement du modèle LightGBM
    mlflow.sklearn.log_model(lgb_clf_h1, "lightgbm_custom_model_h1")

#les résultats pour vérification
print(f"LightGBM AUC_h1: {auc_score_lgb_h1}, Accuracy_h1: {accuracy_lgb_h1}")


[LightGBM] [Info] Number of positive: 3229, number of negative: 36770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.401872 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68500
[LightGBM] [Info] Number of data points in the train set: 39999, number of used features: 601
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080727 -> initscore=-2.432510
[LightGBM] [Info] Start training from score -2.432510




LightGBM AUC_h1: 0.7687009578836114, Accuracy_h1: 0.9203


In [None]:
#l'expérience MLFlow : LightGBM
mlflow.set_experiment("LightGBM")

#les hyperparamètres à explorer
param_grid_lgb = {
    'num_leaves': [31, 50, 70],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300]
}

#initialisation du LGBMClassifier
lgb_clf_cv = lgb.LGBMClassifier()

# RUN 3

#configuration de GridSearchCV
grid_search_lgb = GridSearchCV(estimator=lgb_clf_cv, param_grid=param_grid_lgb, cv=5, scoring='accuracy')

#exécution de GridSearchCV
with mlflow.start_run(run_name='lightgbm_grid_search_cv'):
    grid_search_lgb.fit(X_train, y_train)

    #meilleurs paramètres et score
    best_params_lgb = grid_search_lgb.best_params_
    best_score_lgb = grid_search_lgb.best_score_

    #enregistrement des meilleurs paramètres
    mlflow.log_params(best_params_lgb)
    mlflow.log_param("best_score", best_score_lgb)

    #prédictions avec les meilleurs paramètres
    y_pred_lgb_cv = grid_search_lgb.best_estimator_.predict(X_val)
    y_prob_lgb_cv = grid_search_lgb.best_estimator_.predict_proba(X_val)[:, 1]

    #calcul des métriques
    auc_score_lgb_cv = roc_auc_score(y_val, y_prob_lgb_cv)
    accuracy_lgb_cv = accuracy_score(y_val, y_pred_lgb_cv)

    #enregistrement des métriques
    mlflow.log_metric("auc_cv", auc_score_lgb_cv)
    mlflow.log_metric("accuracy_cv", accuracy_lgb_cv)

    #enregistrement du modèle LGBM avec les meilleurs paramètres
    mlflow.sklearn.log_model(grid_search_lgb.best_estimator_, "lightgbm_best_model")

print(f"Best LightGBM Parameters: {best_params_lgb}")
print(f"LightGBM AUC_cv: {auc_score_lgb_cv}, Accuracy_cv: {accuracy_lgb_cv}")


#MLFlow UI

In [22]:
#maintenant pour visualiser sur mlflw j'ai besoin de ngrok
#j'installe le package
! pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [23]:
from pyngrok import ngrok
#terminate open tunnels if exist
ngrok.kill()

#setting the authtoken (optional)
NGROK_AUTH_TOKEN = "2m0LsyXXeuXcuZ6p34hmutJGy1u_4yK2gMFBwyn1ka2HPfR1i"
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

#open an HTTPs tunnel on port 5000 for http://localhost:5000
ngrok_tunnel = ngrok.connect(addr = "5000", proto= "http", bind_tls =True)
print('MLFlow Tracking UI:', ngrok_tunnel.public_url)

MLFlow Tracking UI: https://fbfe-34-132-205-88.ngrok-free.app


In [24]:
#afin de visualiser les expérimentations avec MLFlow, j'exécute l'interface utilisateur
!mlflow ui

[2024-09-21 06:27:54 +0000] [19162] [INFO] Starting gunicorn 23.0.0
[2024-09-21 06:27:54 +0000] [19162] [INFO] Listening at: http://127.0.0.1:5000 (19162)
[2024-09-21 06:27:54 +0000] [19162] [INFO] Using worker: sync
[2024-09-21 06:27:54 +0000] [19167] [INFO] Booting worker with pid: 19167
[2024-09-21 06:27:54 +0000] [19168] [INFO] Booting worker with pid: 19168
[2024-09-21 06:27:55 +0000] [19169] [INFO] Booting worker with pid: 19169
[2024-09-21 06:27:55 +0000] [19170] [INFO] Booting worker with pid: 19170
[2024-09-21 06:36:36 +0000] [19162] [INFO] Handling signal: int

Aborted!
[2024-09-21 06:36:36 +0000] [19168] [INFO] Worker exiting (pid: 19168)
[2024-09-21 06:36:36 +0000] [19170] [INFO] Worker exiting (pid: 19170)
[2024-09-21 06:36:36 +0000] [19169] [INFO] Worker exiting (pid: 19169)
[2024-09-21 06:36:36 +0000] [19167] [INFO] Worker exiting (pid: 19167)
[2024-09-21 06:36:38 +0000] [19162] [INFO] Shutting down: Master
