In [6]:
# Standard Libraries
import os
import sys
import warnings
from datetime import datetime
from time import time
import unicodedata

# Data Manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from os import listdir
from joblib import dump
from joblib import load
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import auc

# Détermine si vous êtes sur Google Colab
is_colab = 'google.colab' in sys.modules

# Détermine si vous êtes sur Kaggle
is_kaggle = '/kaggle' in os.getcwd()

# Chemin par défaut
path = None

if is_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    path = '/content/drive/MyDrive/Python/OCRP/Projet07/data/credit-default-risk'
    output_dir = '/content/drive/MyDrive/Python/OCRP/Projet07/working/graphs'
    os.makedirs(output_dir, exist_ok=True)
    print("\nTu es sur Google Colab")
elif is_kaggle:
    path = "/kaggle/input/credit-risk"
    output_dir = '/kaggle/working/graphs'
    os.makedirs(output_dir, exist_ok=True)
    print("\nTu es sur Kaggle")
else:
    path = "~/Documents/Python/OCR/Projet07/data/credit-risk"
    output_dir = "~/Documents/Python/OCR/Projet07/working/graphs"
    os.makedirs(output_dir, exist_ok=True)
    print("\nTu es en local")

# Utilisez le chemin d'accès sélectionné pour accéder à vos données
if path is not None:
    print("Chemin d'accès aux données:", path)
else:
    print("Impossible de déterminer l'environnement.")
# Chemin d'accès aux données
save_path = "/kaggle/working/"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Tu es sur Google Colab
Chemin d'accès aux données: /content/drive/MyDrive/Python/OCRP/Projet07/data/credit-default-risk


## Chargement des données

In [2]:
# Charger le DataFrame avec pandas
path = '/content/drive/MyDrive/Python/OCRP/Projet07/working/data/'
df_cleaned_filtered = pd.read_csv(os.path.join(path, 'df_64_features.csv'))
print("Dimenssion du dataset groupé:", df_cleaned_filtered.shape)

Dimenssion du dataset groupé: (356251, 66)


In [16]:
import pandas as pd
import re
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from contextlib import contextmanager
import time

# Définir le contexte de gestion du temps
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

# Fonction pour effectuer l'encodage one-hot
def one_hot_encoder(df, nan_as_category=True):
    """
    Encodage one-hot pour les colonnes catégorielles et booléennes avec get_dummies.

    Paramètres :
    - df : pandas.DataFrame
        Le DataFrame contenant les données.
    - nan_as_category : bool, facultatif, default=True
        Indique si les NaN doivent être traités comme une catégorie à part.

    Retour :
    - df : pandas.DataFrame
        Le DataFrame avec les colonnes encodées en one-hot.
    - new_columns : list
        Liste des nouveaux noms de colonnes ajoutées lors de l'encodage.
    """

    original_columns = list(df.columns)

    # Identifier les colonnes catégorielles (type object) et booléennes
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    bool_columns = [col for col in df.columns if df[col].dtype == 'bool']

    # Combiner les colonnes catégorielles et booléennes pour l'encodage one-hot
    columns_to_encode = categorical_columns + bool_columns

    # Appliquer l'encodage one-hot
    df = pd.get_dummies(df, columns=columns_to_encode, dummy_na=nan_as_category)

    # Identifier les nouvelles colonnes créées par l'encodage one-hot
    new_columns = [c for c in df.columns if c not in original_columns]

    return df, new_columns

# Fonction pour initialiser les données
def initialize_data(df, sample_fraction=0.3):
    with timer("Conversion des colonnes entières en float64"):
        int_columns = df.select_dtypes(include=['int64']).columns
        df[int_columns] = df[int_columns].astype('float64')

    with timer("Prendre un échantillon aléatoire"):
        sample_size = int(len(df) * sample_fraction)
        df_sample = df.sample(n=sample_size, random_state=42).copy()

    with timer("One-hot encoding"):
        df_encoded, new_cols = one_hot_encoder(df_sample)
        # Vérifier les NaN et les inf
        print("# Nombre total d'inf:",np.isinf(df_encoded).sum().sum())
    with timer("Nettoyage des noms de colonnes"):
        df_encoded = df_encoded.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '_', x.upper()))

    with timer("Séparation des colonnes cibles et d'identification"):
        features = df_encoded.drop(['TARGET', 'SK_ID_CURR'], axis=1)  # Exclure 'TARGET' et 'SK_ID_CURR'
        target = df_encoded['TARGET']  # Colonne cible
        ids = df_encoded['SK_ID_CURR']  # Colonne d'identification

    with timer("Traitement des valeurs manquantes dans la colonne cible"):
        target = target.dropna()
        features = features.loc[target.index]
        ids = ids.loc[target.index]

    with timer("Imputation des valeurs manquantes avec la médiane"):
        imputer = SimpleImputer(strategy='median')
        features_imputed = pd.DataFrame(imputer.fit_transform(features), columns=features.columns, index=features.index)

    with timer("Réintégration des colonnes cibles et d'identification"):
        df_imputed = pd.concat([features_imputed, target, ids], axis=1)
        print("# Nombre total de NaN:",df_imputed.isna().sum().sum())

    with timer("Split stratifié des données"):
        X = df_imputed.drop(['TARGET'], axis=1)  # Features sans la colonne cible
        y = df_imputed['TARGET']  # Colonne cible
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    print("Dimensions de l'ensemble d'entraînement (X_train) :", X_train.shape)
    #print("Dimensions de l'ensemble de test (X_test) :", X_test.shape)
    print("Dimensions de l'ensemble d'entraînement (y_train) :", y_train.shape)
    #print("Dimensions de l'ensemble de test (y_test) :", y_test.shape)

    return X_train, X_test, y_train, y_test

#Appliquer les transformations
#X_train, X_test, y_train, y_test = initialize_data(df_cleaned_filtered, sample_fraction=0.2)


In [17]:
import pandas as pd
X_train, X_test, y_train, y_test = initialize_data(df_cleaned_filtered, sample_fraction=0.2)
df = pd.DataFrame(X_train.iloc[:5], columns=X_train.columns)
df.dtypes.value_counts()

Conversion des colonnes entières en float64 - done in 0s
Prendre un échantillon aléatoire - done in 0s
# Nombre total d'inf: 0
One-hot encoding - done in 0s
Nettoyage des noms de colonnes - done in 0s
Séparation des colonnes cibles et d'identification - done in 0s
Traitement des valeurs manquantes dans la colonne cible - done in 0s
Imputation des valeurs manquantes avec la médiane - done in 1s
# Nombre total de NaN: 0
Réintégration des colonnes cibles et d'identification - done in 0s
Split stratifié des données - done in 0s
Dimensions de l'ensemble d'entraînement (X_train) : (49192, 85)
Dimensions de l'ensemble d'entraînement (y_train) : (49192,)


Unnamed: 0,count
float64,85



## Entrainement des modèles

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score, log_loss
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import mlflow
import mlflow.sklearn
from pyngrok import ngrok
import joblib

# Fonction pour sauvegarder un modèle
def save_model(model, model_name):
    path = "/content/drive/MyDrive/Python/OCRP/Projet07/working/models/"
    os.makedirs(path, exist_ok=True)
    filename = f"{model_name}_model.pkl"
    file_path = os.path.join(path, filename)
    joblib.dump(model, file_path)
    print(f"Model saved as {file_path}")

# Terminer les tunnels ouverts s'ils existent
ngrok.kill()

# Configurer MLFlow
mlflow.set_experiment("scoring_model_experiment_increment_1")  # Changer l'incrément pour chaque expérimentation
mlflow.sklearn.autolog()


# Fonction pour créer la pipeline avec SMOTE et StandardScaler
def create_pipeline(model):
    return ImbPipeline([
        ('sampling', SMOTE()),
        ('scaler', StandardScaler()),
        ('classification', model)
    ])

# Définir les modèles
models = {
    'LogisticRegression': LogisticRegression(solver='liblinear'),
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'LightGBM': LGBMClassifier()
}

# Définir les grilles de paramètres pour GridSearchCV
param_grids = {
    'LogisticRegression': {
        'classification__C': [0.1, 1, 10, 100],
        'classification__penalty': ['l1', 'l2']
    },
    'RandomForest': {
        'classification__n_estimators': [50, 100, 200],
        'classification__max_depth': [None, 5, 10, 20]
    },
    'XGBoost': {
        'classification__n_estimators': [50, 100, 200],
        'classification__max_depth': [3, 5, 7],
        'classification__learning_rate': [0.01, 0.1, 0.3]
    },
    'LightGBM': {
        'classification__n_estimators': [50, 100, 200],
        'classification__max_depth': [3, 5, 7],
        'classification__learning_rate': [0.01, 0.1, 0.3]
    }
}

# Fonction pour exécuter GridSearchCV ou RandomizedSearchCV
def run_model(X_train, y_train, X_test, y_test, model_name, model, param_grid, use_random_search=False, results=None):
    if use_random_search:
        grid_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_iter=10, random_state=42, n_jobs=-1, verbose=1)
    else:
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)

    with mlflow.start_run(run_name=model_name, nested=True):
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_

        # Enregistrer les meilleurs paramètres
        mlflow.log_params(best_params)

        # Prédire sur l'ensemble de test
        y_pred_proba = grid_search.predict_proba(X_test)[:, 1]

        # Calculer les métriques
        auc = roc_auc_score(y_test, y_pred_proba)
        accuracy = accuracy_score(y_test, grid_search.predict(X_test))
        recall = recall_score(y_test, grid_search.predict(X_test))
        precision = precision_score(y_test, grid_search.predict(X_test))
        f1 = f1_score(y_test, grid_search.predict(X_test))
        logloss = log_loss(y_test, y_pred_proba)

        # Logger les métriques dans MLFlow
        mlflow.log_metric("AUC", auc)
        mlflow.log_metric("Accuracy", accuracy)
        mlflow.log_metric("Recall", recall)
        mlflow.log_metric("Precision", precision)
        mlflow.log_metric("F1 Score", f1)
        mlflow.log_metric("Log Loss", logloss)

        # Ajouter les métriques au dictionnaire results
        results['Model'].append(model_name)
        results['AUC'].append(auc)
        results['Accuracy'].append(accuracy)
        results['Recall'].append(recall)
        results['Precision'].append(precision)
        results['F1 Score'].append(f1)
        results['Log Loss'].append(logloss)

        # Exemple d'entrée pour le modèle
        example_input = X_train.head(1)
        # Logger le modèle
        mlflow.sklearn.log_model(grid_search.best_estimator_, f"{model_name}_model", input_example=example_input)

        # Sauvegarder le modèle
        save_model(grid_search.best_estimator_, model_name)

        # End the MLflow run
        #mlflow.end_run()
        # Retourner les métriques sous forme de dictionnaire
    return {
        'Model': model_name,
        'AUC': auc,
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision,
        'F1 Score': f1,
        'Log Loss': logloss,
        'Best Params': best_params
    }

# Initialiser le dictionnaire pour stocker les résultats
results = {
    'Model': [],
    'AUC': [],
    'Accuracy': [],
    'Recall': [],
    'Precision': [],
    'F1 Score': [],
    'Log Loss': []
}

# Initialiser les données
X_train, X_test, y_train, y_test = initialize_data(df_cleaned_filtered, sample_fraction=0.10)

# Liste pour stocker les résultats
results_list = []

# Boucler à travers chaque modèle
for model_name, model in models.items():
    param_grid = param_grids[model_name]

    # Utiliser GridSearchCV pour la majorité des modèles, RandomizedSearchCV pour un modèle
    if model_name == 'LightGBM':
        result = run_model(X_train, y_train, X_test, y_test, model_name, create_pipeline(model), param_grid, use_random_search=True, results=results)
    else:
        result = run_model(X_train, y_train, X_test, y_test, model_name, create_pipeline(model), param_grid, results=results)

    # Ajouter les résultats à la liste
    results_list.append(result)

# Convertir la liste des résultats en DataFrame
results_df = pd.DataFrame(results_list)

# Afficher le DataFrame
results_df

Conversion des colonnes entières en float64 - done in 0s
Prendre un échantillon aléatoire - done in 0s
One-hot encoding - done in 0s
Nettoyage des noms de colonnes - done in 0s
Séparation des colonnes cibles et d'identification - done in 0s
Traitement des valeurs manquantes dans la colonne cible - done in 0s
Imputation des valeurs manquantes avec la médiane - done in 0s
Réintégration des colonnes cibles et d'identification - done in 0s
Split stratifié des données - done in 0s
Dimensions de l'ensemble d'entraînement (X_train) : (24538, 85)
Dimensions de l'ensemble de test (X_test) : (6135, 85)
Dimensions de l'ensemble d'entraînement (y_train) : (24538,)
Dimensions de l'ensemble de test (y_test) : (6135,)
Fitting 5 folds for each of 8 candidates, totalling 40 fits


2024/08/08 12:39:32 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model saved as /content/drive/MyDrive/Python/OCRP/Projet07/working/models/LogisticRegression_model.pkl
Fitting 5 folds for each of 12 candidates, totalling 60 fits


2024/08/08 12:43:07 INFO mlflow.sklearn.utils: Logging the 5 best runs, 7 runs will be omitted.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model saved as /content/drive/MyDrive/Python/OCRP/Projet07/working/models/RandomForest_model.pkl
Fitting 5 folds for each of 27 candidates, totalling 135 fits


2024/08/08 12:44:27 INFO mlflow.sklearn.utils: Logging the 5 best runs, 22 runs will be omitted.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model saved as /content/drive/MyDrive/Python/OCRP/Projet07/working/models/XGBoost_model.pkl
Fitting 5 folds for each of 10 candidates, totalling 50 fits




[LightGBM] [Info] Number of positive: 22497, number of negative: 22497
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004362 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17787
[LightGBM] [Info] Number of data points in the train set: 44994, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


2024/08/08 13:16:14 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model saved as /content/drive/MyDrive/Python/OCRP/Projet07/working/models/LightGBM_model.pkl


Unnamed: 0,Model,AUC,Accuracy,Recall,Precision,F1 Score,Log Loss,Best Params
0,LogisticRegression,0.737282,0.692421,0.656863,0.163654,0.262026,0.601606,"{'classification__C': 10, 'classification__pen..."
1,RandomForest,0.71128,0.915729,0.001961,0.111111,0.003854,0.280837,"{'classification__max_depth': 20, 'classificat..."
2,XGBoost,0.719271,0.915892,0.021569,0.392857,0.040892,0.267368,"{'classification__learning_rate': 0.1, 'classi..."
3,LightGBM,0.714887,0.915892,0.027451,0.411765,0.051471,0.266795,"{'classification__n_estimators': 200, 'classif..."


In [34]:
import mlflow
from pyngrok import ngrok

# Terminer les tunnels ouverts s'ils existent
ngrok.kill()

# Si tu as déjà configuré et sauvegardé ton authtoken, tu n'as plus besoin de cette partie
NGROK_AUTH_TOKEN = ""
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Check for existing tunnels before starting a new one
active_tunnels = ngrok.get_tunnels()
if not active_tunnels:  # Start a new tunnel only if no active tunnels exist
    # Ouvrir un tunnel HTTPs sur le port 5000 pour http://localhost:5000
    ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
    print("MLflow Tracking UI:", ngrok_tunnel.public_url)
else:
    print("An active ngrok tunnel already exists. Using existing tunnel.")
    print("MLflow Tracking UI:", active_tunnels[0].public_url)  # Assuming the first tunnel is the relevant one


# Démarrer le serveur de suivi MLflow
#!mlflow ui --port 5000

MLflow Tracking UI: https://22d4-34-143-157-231.ngrok-free.app


In [35]:
!mlflow ui

[2024-08-08 13:19:04 +0000] [61337] [INFO] Starting gunicorn 22.0.0
[2024-08-08 13:19:04 +0000] [61337] [INFO] Listening at: http://127.0.0.1:5000 (61337)
[2024-08-08 13:19:04 +0000] [61337] [INFO] Using worker: sync
[2024-08-08 13:19:04 +0000] [61338] [INFO] Booting worker with pid: 61338
[2024-08-08 13:19:04 +0000] [61339] [INFO] Booting worker with pid: 61339
[2024-08-08 13:19:04 +0000] [61340] [INFO] Booting worker with pid: 61340
[2024-08-08 13:19:04 +0000] [61341] [INFO] Booting worker with pid: 61341
2024/08/08 13:23:58 ERROR mlflow.server: Exception on /get-artifact [GET]
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/flask/app.py", line 2529, in wsgi_app
    response = self.full_dispatch_request()
  File "/usr/local/lib/python3.10/dist-packages/flask/app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/usr/local/lib/python3.10/dist-packages/flask/app.py", line 1823, in full_dispatch_request
    rv = se



[2024-08-08 13:27:27 +0000] [61337] [INFO] Handling signal: int
[2024-08-08 13:27:27 +0000] [61338] [INFO] Worker exiting (pid: 61338)
[2024-08-08 13:27:27 +0000] [61339] [INFO] Worker exiting (pid: 61339)
[2024-08-08 13:27:27 +0000] [61340] [INFO] Worker exiting (pid: 61340)
[2024-08-08 13:27:27 +0000] [61341] [INFO] Worker exiting (pid: 61341)

Aborted!
[2024-08-08 13:27:28 +0000] [61337] [INFO] Shutting down: Master


In [None]:
# Spécifier le nom du fichier CSV où vous souhaitez sauvegarder les résultats
csv_file_path = 'results.csv'

# Sauvegarder le DataFrame dans le fichier CSV
results_df.to_csv(csv_file_path, index=False)

print(f"Results have been saved to {csv_file_path}")

In [None]:
get_ipython().system_raw("mlflow ui --port 5000 &")  # Ou le port que tu as choisi


In [27]:
!lsof -i :5000


In [None]:
#kill -9 59403 59410 59411 59412 59416


In [20]:
!kill -9 PID


/bin/bash: line 1: kill: PID: arguments must be process or job IDs


In [None]:
!ngrok authtoken YOUR_NGROK_AUTH_TOKEN

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
!cat /root/.config/ngrok/ngrok.yml


In [None]:
#!echo 'authtoken: 2kKv5A8GPf3tcCNOB8w9JMHIzAn_NEuxLmtU3bwg1LMd4EqL' > /root/.config/ngrok/ngrok.yml


In [14]:
!ngrok version

ngrok version 3.14.0
pyngrok version 7.2.0


In [None]:
# Terminer les tunnels ouverts s'ils existent
ngrok.kill()

In [None]:
!lsof -i :5000

In [2]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.15.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.15.1 (from mlflow)
  Downloading mlflow_skinny-2.15.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl.metadata (559 bytes)
Collecting gunicorn<23 (from mlflow)
  Downloading gunicorn-22.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.15.1->mlflow)
  Downloading databricks_sdk-0.29.0-py3-none-any.whl.metadata (35 kB)
Collecting gitpython<4,>=3.1.9 (from mlflow-skinny==2.15.1->mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl.m

In [3]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [4]:
!pip install nbdev

Collecting nbdev
  Downloading nbdev-2.3.27-py3-none-any.whl.metadata (10 kB)
Collecting execnb>=0.1.4 (from nbdev)
  Downloading execnb-0.1.6-py3-none-any.whl.metadata (3.2 kB)
Collecting ghapi>=1.0.3 (from nbdev)
  Downloading ghapi-1.0.5-py3-none-any.whl.metadata (13 kB)
Collecting watchdog (from nbdev)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl.metadata (37 kB)
Collecting asttokens (from nbdev)
  Downloading asttokens-2.4.1-py2.py3-none-any.whl.metadata (5.2 kB)
Collecting jedi>=0.16 (from ipython->execnb>=0.1.4->nbdev)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading nbdev-2.3.27-py3-none-any.whl (67 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading execnb-0.1.6-py3-none-any.whl (14 kB)
Downloading ghapi-1.0.5-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:0

In [None]:
#pip install --force-reinstall blinker


In [5]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [10]:
!pip install xgboost
!pip install torch

# Installer les dépendances manquantes
!pip install nvidia-cublas-cu12==12.1.3.1
!pip install nvidia-cuda-cupti-cu12==12.1.105
!pip install nvidia-cuda-nvrtc-cu12==12.1.105
!pip install nvidia-cuda-runtime-cu12==12.1.105
!pip install nvidia-cudnn-cu12==8.9.2.26
!pip install nvidia-cufft-cu12==11.0.2.54
!pip install nvidia-curand-cu12==10.3.2.106
!pip install nvidia-cusolver-cu12==11.4.5.107
!pip install nvidia-cusparse-cu12==12.1.0.106
!pip install nvidia-nvtx-cu12==12.1.105

# Downgrader si nécessaire
# !pip install nvidia-nccl-cu12==2.20.5


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [6]:
!pip install dask[dataframe]

Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.10-py3-none-any.whl.metadata (2.5 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.9-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.9-py3-none-any.whl (241 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m241.9/241.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.9


In [16]:
!pip install --upgrade setuptools

Collecting setuptools
  Using cached setuptools-72.1.0-py3-none-any.whl.metadata (6.6 kB)
Using cached setuptools-72.1.0-py3-none-any.whl (2.3 MB)
Installing collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 71.0.4
    Uninstalling setuptools-71.0.4:
      Successfully uninstalled setuptools-71.0.4
Successfully installed setuptools-72.1.0


In [None]:
import mlflow.sklearn

# Charger un modèle depuis MLFlow
model_uri = f"runs:/{run_id}/{model_name}_model"
loaded_model = mlflow.sklearn.load_model(model_uri)


In [None]:
# Fonction pour charger un modèle
def load_model(model_name):
    filename = f"{model_name}_model.pkl"
    return joblib.load(filename)

# Exemple de chargement d'un modèle
loaded_model = load_model('LogisticRegression')


In [None]:
import mlflow
import shutil
import os

# Remplacez par l'ID du run dont vous voulez télécharger les artefacts
run_id = "9dc6875d9f794f78a0d48bc502d7f63a"

# Répertoire local où les artefacts seront sauvegardés
local_dir = "/content/drive/MyDrive/Python/OCRP/Projet07/working/artifacts/"
os.makedirs(local_dir, exist_ok=True)

# Télécharger les artefacts
artifact_uri = mlflow.get_artifact_uri(run_id)
print(f"Artifact URI: {artifact_uri}")

# Utiliser l'API MLFlow pour télécharger les artefacts
mlflow.artifacts.download_artifacts(run_id=run_id, dst_path=local_dir)

print(f"Artifacts downloaded to {local_dir}")
