In [1]:
# Standard Libraries
import os
import sys
import warnings
from datetime import datetime
from time import time
import unicodedata

# Data Manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from os import listdir
from joblib import dump
from joblib import load
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import auc

# Détermine si vous êtes sur Google Colab
is_colab = 'google.colab' in sys.modules

# Détermine si vous êtes sur Kaggle
is_kaggle = '/kaggle' in os.getcwd()

# Chemin par défaut
path = None

if is_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    path = '/content/drive/MyDrive/Python/OCRP/Projet07/data/credit-default-risk'
    output_dir = '/content/drive/MyDrive/Python/OCRP/Projet07/working/graphs'
    os.makedirs(output_dir, exist_ok=True)
    print("\nTu es sur Google Colab")
elif is_kaggle:
    path = "/kaggle/input/credit-risk"
    output_dir = '/kaggle/working/graphs'
    os.makedirs(output_dir, exist_ok=True)
    print("\nTu es sur Kaggle")
else:
    path = "~/Documents/Python/OCR/Projet07/data/credit-risk"
    output_dir = "~/Documents/Python/OCR/Projet07/working/graphs"
    os.makedirs(output_dir, exist_ok=True)
    print("\nTu es en local")

# Utilisez le chemin d'accès sélectionné pour accéder à vos données
if path is not None:
    print("Chemin d'accès aux données:", path)
else:
    print("Impossible de déterminer l'environnement.")
# Chemin d'accès aux données
save_path = "/kaggle/working/"


Mounted at /content/drive

Tu es sur Google Colab
Chemin d'accès aux données: /content/drive/MyDrive/Python/OCRP/Projet07/data/credit-default-risk


## Chargement des données

In [2]:
# Charger le DataFrame avec pandas
path = '/content/drive/MyDrive/Python/OCRP/Projet07/working/data/'
df_cleaned_filtered = pd.read_csv(os.path.join(path, 'df_64_features.csv'))
print("Dimenssion du dataset groupé:", df_cleaned_filtered.shape)

Dimenssion du dataset groupé: (356251, 66)


In [44]:
import pandas as pd
import re
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from contextlib import contextmanager
import time

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

def one_hot_encoder(df, nan_as_category=True):
    """
    Encodage one-hot pour les colonnes catégorielles et booléennes avec get_dummies.

    Paramètres :
    - df : pandas.DataFrame
        Le DataFrame contenant les données.
    - nan_as_category : bool, facultatif, default=True
        Indique si les NaN doivent être traités comme une catégorie à part.

    Retour :
    - df : pandas.DataFrame
        Le DataFrame avec les colonnes encodées en one-hot.
    - new_columns : list
        Liste des nouveaux noms de colonnes ajoutées lors de l'encodage.
    """

    original_columns = list(df.columns)

    # Identifier les colonnes catégorielles (type object) et booléennes
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    bool_columns = [col for col in df.columns if df[col].dtype == 'bool']

    # Combiner les colonnes catégorielles et booléennes pour l'encodage one-hot
    columns_to_encode = categorical_columns + bool_columns

    # Appliquer l'encodage one-hot
    df = pd.get_dummies(df, columns=columns_to_encode, dummy_na=nan_as_category)

    # Identifier les nouvelles colonnes créées par l'encodage one-hot
    new_columns = [c for c in df.columns if c not in original_columns]

    return df, new_columns


# Identifier et convertir les colonnes de type entier en float64
int_columns = df_cleaned_filtered.select_dtypes(include=['int64']).columns
df_cleaned_filtered[int_columns] = df_cleaned_filtered[int_columns].astype('float64')

# Prendre un échantillon aléatoire de 10 000 lignes du DataFrame
df_sample = df_cleaned_filtered.sample(n=10000, random_state=42).copy()

# Charger le DataFrame avec pandas
with timer("One-hot encoding"):
    df_encoded, new_cols = one_hot_encoder(df_sample)

with timer("Nettoyage des noms de colonnes"):
    df_encoded = df_encoded.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '_', x.upper()))

with timer("Séparation des colonnes cibles et d'identification"):
    features = df_encoded.drop(['TARGET', 'SK_ID_CURR'], axis=1)  # Exclure 'TARGET' et 'SK_ID_CURR'
    target = df_encoded['TARGET']  # Colonne cible
    ids = df_encoded['SK_ID_CURR']  # Colonne d'identification

# Vérifier et supprimer les valeurs manquantes dans la colonne cible
with timer("Traitement des valeurs manquantes dans la colonne cible"):
    target = target.dropna()
    features = features.loc[target.index]
    ids = ids.loc[target.index]

with timer("Imputation des valeurs manquantes avec la médiane"):
    imputer = SimpleImputer(strategy='median')
    features_imputed = pd.DataFrame(imputer.fit_transform(features), columns=features.columns, index=features.index)

with timer("Réintégration des colonnes cibles et d'identification"):
    df_imputed = pd.concat([features_imputed, target, ids], axis=1)

with timer("Split stratifié des données"):
    X = df_imputed.drop(['TARGET'], axis=1)  # Features sans la colonne cible
    y = df_imputed['TARGET']  # Colonne cible
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Afficher les dimensions des ensembles d'entraînement et de test
print("Dimensions de l'ensemble d'entraînement (X_train) :", X_train.shape)
print("Dimensions de l'ensemble de test (X_test) :", X_test.shape)
print("Dimensions de l'ensemble d'entraînement (y_train) :", y_train.shape)
print("Dimensions de l'ensemble de test (y_test) :", y_test.shape)


One-hot encoding - done in 0s
Nettoyage des noms de colonnes - done in 0s
Séparation des colonnes cibles et d'identification - done in 0s
Traitement des valeurs manquantes dans la colonne cible - done in 0s
Imputation des valeurs manquantes avec la médiane - done in 0s
Réintégration des colonnes cibles et d'identification - done in 0s
Split stratifié des données - done in 0s
Dimensions de l'ensemble d'entraînement (X_train) : (6890, 85)
Dimensions de l'ensemble de test (X_test) : (1723, 85)
Dimensions de l'ensemble d'entraînement (y_train) : (6890,)
Dimensions de l'ensemble de test (y_test) : (1723,)



## Logistique Regression

In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score, log_loss
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
import mlflow
import mlflow.sklearn
from pyngrok import ngrok


# Configurer MLFlow
mlflow.set_experiment("scoring_2_model_experiment")
mlflow.sklearn.autolog()


# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Définir la pipeline avec SMOTE et une régression logistique
model_logistic = ImbPipeline([
    ('sampling', SMOTE()),
    ('scaler', StandardScaler()),  # Ajout de la standardisation
    ('classification', LogisticRegression(solver='liblinear'))
])


# Définir les hyperparamètres à optimiser
param_grid_logistic = {
    'classification__C': [0.1, 1, 10, 100],
    'classification__penalty': ['l1', 'l2']
}


# Configurer GridSearchCV
grid_search_logistic = GridSearchCV(model_logistic, param_grid_logistic, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)

# Ouvrir un tunnel HTTPs sur le port 5000 pour http://localhost:5000
#ngrok.kill()
#ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
#print("MLflow Tracking UI:", ngrok_tunnel.public_url)

# Démarrer le serveur de suivi MLflow
get_ipython().system_raw("mlflow ui --port 5000 &")

# Créer un exemple d'entrée
example_input = X_train.head(1)

with mlflow.start_run(run_name='parametres_opt_logistic'):
    grid_search_logistic.fit(X_train, y_train)

    best_params_logistic = grid_search_logistic.best_params_
    print("Best parameters found for Logistic Regression: ", best_params_logistic)

    y_pred_logistic = grid_search_logistic.predict(X_test)
    y_pred_proba_logistic = grid_search_logistic.predict_proba(X_test)[:, 1]

    auc_logistic = roc_auc_score(y_test, y_pred_proba_logistic)
    accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
    recall_logistic = recall_score(y_test, y_pred_logistic)
    precision_logistic = precision_score(y_test, y_pred_logistic)
    f1_logistic = f1_score(y_test, y_pred_logistic)
    logloss_logistic = log_loss(y_test, y_pred_proba_logistic)

    print(f"AUC (Logistic Regression): {auc_logistic}")
    print(f"Accuracy (Logistic Regression): {accuracy_logistic}")
    print(f"Recall (Logistic Regression): {recall_logistic}")
    print(f"Precision (Logistic Regression): {precision_logistic}")
    print(f"F1 Score (Logistic Regression): {f1_logistic}")
    print(f"Log Loss (Logistic Regression): {logloss_logistic}")

    mlflow.log_params(best_params_logistic)
    mlflow.log_metric("AUC (Logistic Regression)", auc_logistic)
    mlflow.log_metric("Accuracy (Logistic Regression)", accuracy_logistic)
    mlflow.log_metric("Recall (Logistic Regression)", recall_logistic)
    mlflow.log_metric("Precision (Logistic Regression)", precision_logistic)
    mlflow.log_metric("F1 Score (Logistic Regression)", f1_logistic)
    mlflow.log_metric("Log Loss (Logistic Regression)", logloss_logistic)

    example_input_logistic = X_train.head(1)
    mlflow.sklearn.log_model(grid_search_logistic.best_estimator_, "model_logistic", input_example=example_input_logistic)

# Définir la pipeline avec SMOTE et XGBoost
model_xgboost = ImbPipeline([
    ('sampling', SMOTE()),
    ('classification', XGBClassifier())
])

# Définir les hyperparamètres à optimiser pour XGBoost
param_grid_xgboost = {
    'classification__learning_rate': [0.1, 0.3, 0.5],
    'classification__max_depth': [3, 5, 7],
    'classification__n_estimators': [50, 100, 200]
}

# Configurer GridSearchCV pour XGBoost
grid_search_xgboost = GridSearchCV(model_xgboost, param_grid_xgboost, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)

# Entraîner et évaluer le modèle XGBoost
with mlflow.start_run(run_name='parametres_opt_xgboost'):
    grid_search_xgboost.fit(X_train, y_train)

    best_params_xgboost = grid_search_xgboost.best_params_
    print("Best parameters found for XGBoost: ", best_params_xgboost)

    y_pred_xgboost = grid_search_xgboost.predict(X_test)
    y_pred_proba_xgboost = grid_search_xgboost.predict_proba(X_test)[:, 1]

    auc_xgboost = roc_auc_score(y_test, y_pred_proba_xgboost)
    accuracy_xgboost = accuracy_score(y_test, y_pred_xgboost)
    recall_xgboost = recall_score(y_test, y_pred_xgboost)
    precision_xgboost = precision_score(y_test, y_pred_xgboost)
    f1_xgboost = f1_score(y_test, y_pred_xgboost)
    logloss_xgboost = log_loss(y_test, y_pred_proba_xgboost)

    print(f"AUC (XGBoost): {auc_xgboost}")
    print(f"Accuracy (XGBoost): {accuracy_xgboost}")
    print(f"Recall (XGBoost): {recall_xgboost}")
    print(f"Precision (XGBoost): {precision_xgboost}")
    print(f"F1 Score (XGBoost): {f1_xgboost}")
    print(f"Log Loss (XGBoost): {logloss_xgboost}")

    mlflow.log_params(best_params_xgboost)
    mlflow.log_metric("AUC (XGBoost)", auc_xgboost)
    mlflow.log_metric("Accuracy (XGBoost)", accuracy_xgboost)
    mlflow.log_metric("Recall (XGBoost)", recall_xgboost)
    mlflow.log_metric("Precision (XGBoost)", precision_xgboost)
    mlflow.log_metric("F1 Score (XGBoost)", f1_xgboost)
    mlflow.log_metric("Log Loss (XGBoost)", logloss_xgboost)

    example_input_xgboost = X_train.head(1)
    mlflow.sklearn.log_model(grid_search_xgboost.best_estimator_, "model_xgboost", input_example=example_input_xgboost)


Fitting 5 folds for each of 8 candidates, totalling 40 fits


2024/08/07 23:58:38 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


Best parameters found:  {'classification__C': 1, 'classification__penalty': 'l1'}
AUC: 0.7401887150799407
Accuracy: 0.6993615786419036
Recall: 0.6802721088435374
Precision: 0.17513134851138354
F1 Score: 0.2785515320334262
Log Loss: 0.5964140629771456


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [53]:
import mlflow
from pyngrok import ngrok

# Terminer les tunnels ouverts s'ils existent
ngrok.kill()

# Si tu as déjà configuré et sauvegardé ton authtoken, tu n'as plus besoin de cette partie
NGROK_AUTH_TOKEN = "2kKv5A8GPf3tcCNOB8w9JMHIzAn_NEuxLmtU3bwg1LMd4EqL"
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Ouvrir un tunnel HTTPs sur le port 5000 pour http://localhost:5000
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)


# Démarrer le serveur de suivi MLflow
!mlflow ui --port 5000

MLflow Tracking UI: https://9d97-34-170-27-224.ngrok-free.app
[2024-08-07 23:59:11 +0000] [68278] [INFO] Starting gunicorn 22.0.0
[2024-08-07 23:59:11 +0000] [68278] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-08-07 23:59:11 +0000] [68278] [ERROR] Retrying in 1 second.
[2024-08-07 23:59:12 +0000] [68278] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-08-07 23:59:12 +0000] [68278] [ERROR] Retrying in 1 second.
[2024-08-07 23:59:13 +0000] [68278] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-08-07 23:59:13 +0000] [68278] [ERROR] Retrying in 1 second.
[2024-08-07 23:59:14 +0000] [68278] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-08-07 23:59:14 +0000] [68278] [ERROR] Retrying in 1 second.
[2024-08-07 23:59:15 +0000] [68278] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-08-07 23:59:15 +0000] [68278] [ERROR] Retrying in 1 second.
[2024-08-07 23:59:16 +0000] [68278] [ERROR] Can't connect to ('127.0.0.1', 5000)
Running the mlflow server failed. Please see

In [54]:
!mlflow ui

[2024-08-07 23:59:25 +0000] [68377] [INFO] Starting gunicorn 22.0.0
[2024-08-07 23:59:25 +0000] [68377] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-08-07 23:59:25 +0000] [68377] [ERROR] Retrying in 1 second.
[2024-08-07 23:59:26 +0000] [68377] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-08-07 23:59:26 +0000] [68377] [ERROR] Retrying in 1 second.
[2024-08-07 23:59:27 +0000] [68377] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-08-07 23:59:27 +0000] [68377] [ERROR] Retrying in 1 second.
[2024-08-07 23:59:28 +0000] [68377] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-08-07 23:59:28 +0000] [68377] [ERROR] Retrying in 1 second.
[2024-08-07 23:59:29 +0000] [68377] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-08-07 23:59:29 +0000] [68377] [ERROR] Retrying in 1 second.
[2024-08-07 23:59:30 +0000] [68377] [ERROR] Can't connect to ('127.0.0.1', 5000)
Running the mlflow server failed. Please see the logs above for details.


In [13]:
!ngrok authtoken YOUR_NGROK_AUTH_TOKEN

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
!cat /root/.config/ngrok/ngrok.yml


In [21]:
!ngrok version

ngrok version 3.14.0
pyngrok version 7.2.0


In [23]:
# Terminer les tunnels ouverts s'ils existent
ngrok.kill()

In [22]:
!lsof -i :5000

In [2]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.15.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.15.1 (from mlflow)
  Downloading mlflow_skinny-2.15.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl.metadata (559 bytes)
Collecting gunicorn<23 (from mlflow)
  Downloading gunicorn-22.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.15.1->mlflow)
  Downloading databricks_sdk-0.29.0-py3-none-any.whl.metadata (35 kB)
Collecting gitpython<4,>=3.1.9 (from mlflow-skinny==2.15.1->mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl.m

In [8]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [5]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.15.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.15.1 (from mlflow)
  Downloading mlflow_skinny-2.15.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl.metadata (559 bytes)
Collecting gunicorn<23 (from mlflow)
  Downloading gunicorn-22.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.15.1->mlflow)
  Downloading databricks_sdk-0.29.0-py3-none-any.whl.metadata (35 kB)
Collecting gitpython<4,>=3.1.9 (from mlflow-skinny==2.15.1->mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl.m

In [6]:
!pip install nbdev

Collecting nbdev
  Downloading nbdev-2.3.27-py3-none-any.whl.metadata (10 kB)
Collecting execnb>=0.1.4 (from nbdev)
  Downloading execnb-0.1.6-py3-none-any.whl.metadata (3.2 kB)
Collecting ghapi>=1.0.3 (from nbdev)
  Downloading ghapi-1.0.5-py3-none-any.whl.metadata (13 kB)
Collecting watchdog (from nbdev)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl.metadata (37 kB)
Collecting asttokens (from nbdev)
  Downloading asttokens-2.4.1-py2.py3-none-any.whl.metadata (5.2 kB)
Collecting jedi>=0.16 (from ipython->execnb>=0.1.4->nbdev)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading nbdev-2.3.27-py3-none-any.whl (67 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading execnb-0.1.6-py3-none-any.whl (14 kB)
Downloading ghapi-1.0.5-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:0