In [1]:
import pandas as pd
import numpy as np

def reduce_mem_usage_safe(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Usage mémoire initial du DataFrame: {start_mem:.2f} MB")

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == 'bool':
            df[col] = df[col].astype('bool')
        elif str(col_type)[:3] == 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)
        elif str(col_type)[:5] == 'float':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Usage mémoire final du DataFrame: {end_mem:.2f} MB")
    print(f"Mémoire réduite de {(start_mem - end_mem) / start_mem * 100:.1f} %")

    return df


In [2]:
def reduce_mem_usage(df):
    """
    Itère sur toutes les colonnes d'un DataFrame et réduit la précision
    des types numériques (int et float) pour diminuer la consommation de mémoire.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Usage mémoire initial du DataFrame: {start_mem:.2f} MB")

    for col in df.columns:
        col_type = df[col].dtype

        # Traiter uniquement les colonnes numériques
        if col_type != object and col_type != str and col_type != bool:
            c_min = df[col].min()
            c_max = df[col].max()

            # --- Conversion des entiers (Integers) ---
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)

            # --- Conversion des décimaux (Floats) ---
            else:
                # La majorité de vos colonnes d'agrégats (mean, var, proportions) sont ici
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    # Conversion principale : float64 -> float32
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64) # Garder float64 si la précision est nécessaire

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Usage mémoire final du DataFrame: {end_mem:.2f} MB")
    print(f"Mémoire réduite de {(start_mem - end_mem) / start_mem * 100:.1f} %")

    return df

ETAPE 1
netoyage et imputation 

In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm  # Pour les barres de progression


# --- 1. Optimisation Mémoire ---
def reduce_mem_usage_safe(df, verbose=True):
    """
    Itère sur toutes les colonnes d'un DataFrame et modifie les types de données
    pour minimiser l'utilisation de la mémoire.
    """
    df_out = df.copy()  # Evite les SettingWithCopyWarning
    start_mem = df_out.memory_usage().sum() / 1024**2
    if verbose:
        print(f'Memory usage of dataframe before: {start_mem:.2f} MB')

    for col in tqdm(df_out.columns, desc="Reducing memory"):
        col_type = df_out[col].dtype

        if col_type != object:
            c_min = df_out[col].min()
            c_max = df_out[col].max()
            if str(col_type)[:3] == 'int':
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df_out[col] = df_out[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df_out[col] = df_out[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df_out[col] = df_out[col].astype(np.int32)
                else:
                    df_out[col] = df_out[col].astype(np.int64)
            else:
                if c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df_out[col] = df_out[col].astype(np.float32)
                else:
                    df_out[col] = df_out[col].astype(np.float64)

    end_mem = df_out.memory_usage().sum() / 1024**2
    if verbose:
        print(f'Memory usage after optimization: {end_mem:.2f} MB ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df_out


# --- 2. Analyse des NaN ---
def get_missing_values_table(df, df_name, top_n=20):
    """Calcule et affiche le tableau des valeurs manquantes pour un DataFrame."""
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'}
    )
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0
    ].sort_values('% of Total Values', ascending=False).round(1)

    print(f"\n--- Missing values for {df_name} ---")
    print(f"DataFrame has {df.shape[1]} columns.")
    print(f"There are {mis_val_table_ren_columns.shape[0]} columns with missing values.")

    return mis_val_table_ren_columns.head(top_n)


# --- 3. Définition des chemins et chargement des fichiers ---
DATA_DIR = r"C:\Users\maill\OneDrive\Bureau\majeur_ia\dataenginnering\projet\home-credit-default-risk"

csv_files = {
    'train': 'application_train.csv',
    'test': 'application_test.csv',
    'bureau': 'bureau.csv',
    'bureau_balance': 'bureau_balance.csv',
    'credit_card_balance': 'credit_card_balance.csv',
    'installments_payments': 'installments_payments.csv',
    'POS_CASH_balance': 'POS_CASH_balance.csv',
    'previous_application': 'previous_application.csv'
}

dataframes = {}

print("Starting to load and optimize dataframes...")
for name, filename in csv_files.items():
    file_path = os.path.join(DATA_DIR, filename)
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        dataframes[name] = reduce_mem_usage_safe(df, verbose=True)
    else:
        print(f"File not found: {file_path}. Skipping.")
print("All available dataframes loaded.")


# --- 4. Analyse des NaN pour tous les fichiers ---
for name, df in dataframes.items():
    missing_table = get_missing_values_table(df, name)
    print(missing_table)


# --- 5. Nettoyage et imputation des fichiers principaux ---
df_train = dataframes.get('train')
df_test = dataframes.get('test')

if df_train is not None and df_test is not None:
    print("\n--- Imputation Strategy on Main DataFrames ---")

    house_cols = [
        col for col in df_train.columns
        if ('APARTMENTS' in col or 'YEARS_BUILD' in col or 'COMMONAREA' in col or 'ELEVATORS' in col or 'FONDKAPREMONT' in col)
        and 'MODE' not in col
    ]

    for df in [df_train, df_test]:
        # Imputation variables immobilières
        for col in house_cols:
            if col in df.columns:
                df[f'{col}_IS_NAN'] = df[col].isnull().astype(np.int8)
                df[col].fillna(0, inplace=True)

        # Imputation EXT_SOURCE
        for col in ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']:
            if col in df.columns:
                df[f'{col}_IS_NAN'] = df[col].isnull().astype(np.int8)
                df[col].fillna(df[col].median(), inplace=True)

        # Imputation colonnes catégorielles
        for col in ['NAME_TYPE_SUITE', 'OCCUPATION_TYPE', 'FONDKAPREMONT_MODE', 'WALLSMATERIAL_MODE']:
            if col in df.columns and df[col].dtype == object:
                df[col].fillna('Missing', inplace=True)

        # Correction DAYS_EMPLOYED
        if 'DAYS_EMPLOYED' in df.columns:
            df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
            df['DAYS_EMPLOYED'].fillna(df['DAYS_EMPLOYED'].median(), inplace=True)

    print("Main DataFrames (train/test) cleaned and imputed.")
    dataframes['train'] = df_train
    dataframes['test'] = df_test


Starting to load and optimize dataframes...
Memory usage of dataframe before: 286.23 MB


Reducing memory: 100%|██████████| 122/122 [00:00<00:00, 289.83it/s]


Memory usage after optimization: 128.16 MB (55.2% reduction)
Memory usage of dataframe before: 45.00 MB


Reducing memory: 100%|██████████| 121/121 [00:00<00:00, 1628.20it/s]

Memory usage after optimization: 20.27 MB (55.0% reduction)





Memory usage of dataframe before: 222.62 MB


Reducing memory: 100%|██████████| 17/17 [00:00<00:00, 73.20it/s]


Memory usage after optimization: 119.49 MB (46.3% reduction)
Memory usage of dataframe before: 624.85 MB


Reducing memory: 100%|██████████| 3/3 [00:00<00:00, 16.55it/s]


Memory usage after optimization: 338.46 MB (45.8% reduction)
Memory usage of dataframe before: 673.88 MB


Reducing memory: 100%|██████████| 23/23 [00:00<00:00, 29.54it/s]


Memory usage after optimization: 318.63 MB (52.7% reduction)
Memory usage of dataframe before: 830.41 MB


Reducing memory: 100%|██████████| 8/8 [00:00<00:00, 12.72it/s]


Memory usage after optimization: 389.25 MB (53.1% reduction)
Memory usage of dataframe before: 610.43 MB


Reducing memory: 100%|██████████| 8/8 [00:00<00:00, 23.51it/s]


Memory usage after optimization: 276.60 MB (54.7% reduction)


1.2 enrichissement de donner 

In [11]:
import pandas as pd
import numpy as np

# Récupération des DataFrames chargés
dataframes = globals().get('dataframes')

if dataframes is None:
    print("Error: DataFrames not loaded. Please ensure the previous block of code was executed.")
    exit() 

df_train = dataframes['train']
df_test = dataframes['test']
df_bureau = dataframes['bureau']
df_bureau_balance = dataframes['bureau_balance']

print("\n--- Starting Feature Engineering: Bureau & Bureau Balance ---")

# --- 1. TRAITEMENT DE BUREAU_BALANCE ---
print("Processing bureau_balance...")

# One-Hot Encoding pour STATUS
df_bureau_balance = pd.get_dummies(df_bureau_balance, dummy_na=False)

if 'DAYS_CREDIT_ENDDATE_FLAG' in df_bureau.columns:
    df_bureau = df_bureau.drop(columns=['DAYS_CREDIT_ENDDATE_FLAG'])

# Fonction d'agrégation corrigée
def group_and_aggregate(df, group_key, df_name, agg_funcs):
    """Effectue l'agrégation et renomme les colonnes."""
    # Agrégation standard
    agg_df = df.groupby(group_key).agg(agg_funcs)
    # Aplatir les noms de colonnes
    agg_df.columns = pd.Index([f'{df_name}_{col[0]}_{col[1].upper()}' for col in agg_df.columns.tolist()])
    return agg_df

# Agrégation bureau_balance par SK_ID_BUREAU
bb_agg_funcs = ['min', 'max', 'count', 'mean']  # <--- corrigé
bb_grouped = group_and_aggregate(
    df_bureau_balance.drop(columns=['MONTHS_BALANCE']),
    group_key='SK_ID_BUREAU',
    df_name='BUREAU_BALANCE',
    agg_funcs=bb_agg_funcs
)

# Fusion dans bureau
df_bureau = df_bureau.merge(bb_grouped, how='left', on='SK_ID_BUREAU')
print(f"Merged bureau_balance into bureau. New bureau shape: {df_bureau.shape}")

# --- 2. TRAITEMENT DE BUREAU ---
print("Processing bureau...")

df_bureau = pd.get_dummies(df_bureau, dummy_na=False)

BUREAU_IGNORE_COLS = ['SK_ID_CURR', 'SK_ID_BUREAU', 'CREDIT_ACTIVE_Active'] 
bureau_cols = [col for col in df_bureau.columns if col not in BUREAU_IGNORE_COLS]

num_agg = ['min', 'max', 'mean', 'sum', 'var']  # <--- corrigé
cat_agg = ['mean', 'sum']

# Définition des agrégations spécifiques
bureau_agg_funcs = {}
for col in bureau_cols:
    if df_bureau[col].dtype != object:
        if 'COUNT' in col or 'SUM' in col or 'STATUS' in col:
            bureau_agg_funcs[col] = cat_agg
        else:
            bureau_agg_funcs[col] = num_agg

# Agrégation bureau par SK_ID_CURR
bureau_grouped = df_bureau.groupby('SK_ID_CURR').agg(bureau_agg_funcs)
bureau_grouped.columns = pd.Index([f'BUREAU_{col[0]}_{col[1].upper()}' for col in bureau_grouped.columns.tolist()])
print(f"Bureau aggregation done. Aggregated shape: {bureau_grouped.shape}")

# --- 3. FUSION DANS LES DATAFRAMES PRINCIPAUX ---
df_train = df_train.merge(bureau_grouped, how='left', on='SK_ID_CURR')
print(f"Merged bureau features into df_train. New shape: {df_train.shape}")

df_test = df_test.merge(bureau_grouped, how='left', on='SK_ID_CURR')
print(f"Merged bureau features into df_test. New shape: {df_test.shape}")

# Mise à jour des DataFrames globaux
dataframes['train'] = df_train
dataframes['test'] = df_test

print("\nFeature Engineering for Bureau & Bureau Balance completed.")
print("The main DataFrames now contain over 300 new features based on credit history.")



--- Starting Feature Engineering: Bureau & Bureau Balance ---
Processing bureau_balance...
Merged bureau_balance into bureau. New bureau shape: (1716428, 49)
Processing bureau...
Bureau aggregation done. Aggregated shape: (305811, 248)
Merged bureau features into df_train. New shape: (307511, 385)
Merged bureau features into df_test. New shape: (48744, 384)

Feature Engineering for Bureau & Bureau Balance completed.
The main DataFrames now contain over 300 new features based on credit history.


enrichissement des donner2 

In [12]:
# Nécessaire pour les opérations de fusion et d'agrégation
import pandas as pd
import numpy as np

# Récupération des DataFrames mis à jour
dataframes = globals().get('dataframes') 

if dataframes is None or 'previous_application' not in dataframes:
    print("Error: DataFrames not loaded or previous_application is missing.")
    exit() 

df_train = dataframes['train']
df_test = dataframes['test']
df_prev = dataframes['previous_application']

print("\n--- Starting Feature Engineering: Previous Application ---")

# --- 1. PRÉ-TRAITEMENT ET FEATURE CREATION ---

print("Processing previous_application...")

# One-Hot Encoding pour les colonnes catégorielles
df_prev = pd.get_dummies(df_prev, dummy_na=False)

# Création de quelques ratios et features clés (bonnes pratiques du métier)
# Ratio Montant du Crédit / Montant Demandé
df_prev['APP_CREDIT_PER_APPLICATION'] = df_prev['AMT_CREDIT'] / df_prev['AMT_APPLICATION']
# Ratio Montant Annuité / Montant Demandé
df_prev['ANNUITY_PER_APPLICATION'] = df_prev['AMT_ANNUITY'] / df_prev['AMT_APPLICATION']
# Ratio Montant Crédit / Montant Annuité
df_prev['CREDIT_PER_ANNUITY'] = df_prev['AMT_CREDIT'] / df_prev['AMT_ANNUITY']
# Ratio Montant Crédit / Durée du prêt (approximative)
df_prev['CREDIT_PER_TERM'] = df_prev['AMT_CREDIT'] / df_prev['CNT_PAYMENT']


# Correction de l'outlier DAYS_FIRST_DRAWING
# La valeur 365243 signifie ici "Inconnu" ou "Pas d'événement"
df_prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
df_prev['DAYS_FIRST_DRAWING'].fillna(0, inplace=True) # Imputation par 0 pour le moment (à affiner)


# --- 2. AGRÉGATION PAR CLIENT (SK_ID_CURR) ---

PREV_IGNORE_COLS = ['SK_ID_CURR', 'SK_ID_PREV']
prev_cols = [col for col in df_prev.columns if col not in PREV_IGNORE_COLS]

# Définition des fonctions d'agrégation
prev_num_agg = ['mean', 'min', 'max', 'sum', 'var']
prev_cat_agg = ['mean', 'sum'] # Pour les colonnes issues du OHE

# Structure des agrégations
prev_agg_funcs = {}
for col in prev_cols:
    if df_prev[col].dtype != object:
        # Les colonnes de OHE/compte sont agrégées avec mean (pour le ratio) et sum (pour le compte total)
        if 'FLAG' in col or 'MODE' in col or 'TYPE' in col or 'NAME' in col or 'CODE' in col or 'WEEKDAY' in col or 'HOUR' in col:
            prev_agg_funcs[col] = prev_cat_agg
        # Autres colonnes numériques
        else:
            prev_agg_funcs[col] = prev_num_agg

# Agrégation
prev_grouped = df_prev.groupby('SK_ID_CURR').agg(prev_agg_funcs)

# Aplatir les noms de colonnes
prev_grouped.columns = pd.Index([f'PREV_{col[0]}_{col[1].upper()}' for col in prev_grouped.columns.tolist()])

print(f"Previous Application aggregation done. Aggregated shape: {prev_grouped.shape}")

# --- 3. FUSION DANS LES DATAFRAMES PRINCIPAUX ---

# Fusion avec df_train
df_train = df_train.merge(prev_grouped, how='left', on='SK_ID_CURR')
print(f"Merged previous_application features into df_train. New shape: {df_train.shape}")

# Fusion avec df_test
df_test = df_test.merge(prev_grouped, how='left', on='SK_ID_CURR')
print(f"Merged previous_application features into df_test. New shape: {df_test.shape}")

# Mise à jour des DataFrames
dataframes['train'] = df_train
dataframes['test'] = df_test

print("\nFeature Engineering for Previous Application completed.")
print(f"Total columns in df_train: {df_train.shape[1]}")


--- Starting Feature Engineering: Previous Application ---
Processing previous_application...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_prev['DAYS_FIRST_DRAWING'].fillna(0, inplace=True) # Imputation par 0 pour le moment (à affiner)


Previous Application aggregation done. Aggregated shape: (338857, 443)
Merged previous_application features into df_train. New shape: (307511, 828)
Merged previous_application features into df_test. New shape: (48744, 827)

Feature Engineering for Previous Application completed.
Total columns in df_train: 828


enrichissement des donner 3 

In [13]:
# Nécessaire pour les opérations de fusion et d'agrégation
import pandas as pd
import numpy as np

# Récupération des DataFrames mis à jour
dataframes = globals().get('dataframes') 

if dataframes is None or 'installments_payments' not in dataframes:
    print("Error: DataFrames not loaded or installments_payments is missing.")
    exit() 

df_train = dataframes['train']
df_test = dataframes['test']
df_installments = dataframes['installments_payments']

print("\n--- Starting Feature Engineering: Installments Payments ---")

# --- 1. FEATURE CREATION SPÉCIFIQUE ---

# Calcul de l'écart entre la date de paiement prévue et la date de paiement réelle
# (Valeur négative = paiement en avance, positive = paiement en retard)
df_installments['DAYS_DELAY'] = df_installments['DAYS_ENTRY_PAYMENT'] - df_installments['DAYS_INSTALMENT']

# Ratio du montant payé par rapport au montant attendu (si > 1, le client a trop payé)
df_installments['PAYMENT_RATIO'] = df_installments['AMT_PAYMENT'] / df_installments['AMT_INSTALMENT']

# Difference entre le montant payé et le montant attendu
df_installments['PAYMENT_DIFF'] = df_installments['AMT_PAYMENT'] - df_installments['AMT_INSTALMENT']

# Flag pour le paiement en retard (> 0 jour de retard)
df_installments['LATE_PAYMENT_FLAG'] = (df_installments['DAYS_DELAY'] > 0).astype(np.int8)

# --- 2. AGRÉGATION PAR CLIENT (SK_ID_CURR) ---

INSTALL_IGNORE_COLS = ['SK_ID_CURR', 'SK_ID_PREV', 'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER']
install_cols = [col for col in df_installments.columns if col not in INSTALL_IGNORE_COLS]

# Fonctions d'agrégation pour les colonnes numériques
install_agg_funcs = {
    'DAYS_DELAY': ['mean', 'min', 'max', 'sum', 'std'],
    'PAYMENT_RATIO': ['mean', 'min', 'max', 'var'],
    'PAYMENT_DIFF': ['mean', 'min', 'max', 'sum', 'var'],
    'LATE_PAYMENT_FLAG': ['sum', 'mean'], # Nombre total et fréquence des retards
    'AMT_INSTALMENT': ['mean', 'sum', 'std'],
    'AMT_PAYMENT': ['mean', 'sum', 'std'],
    'SK_ID_PREV': ['nunique'] # Compter le nombre de prêts précédents associés aux versements
}

# Agrégation
install_grouped = df_installments.groupby('SK_ID_CURR').agg(install_agg_funcs)

# Aplatir les noms de colonnes
install_grouped.columns = pd.Index([f'INSTAL_{col[0]}_{col[1].upper()}' for col in install_grouped.columns.tolist()])

print(f"Installments Payments aggregation done. Aggregated shape: {install_grouped.shape}")

# --- 3. FUSION DANS LES DATAFRAMES PRINCIPAUX ---

# Fusion avec df_train
df_train = df_train.merge(install_grouped, how='left', on='SK_ID_CURR')
print(f"Merged installments_payments features into df_train. New shape: {df_train.shape}")

# Fusion avec df_test
df_test = df_test.merge(install_grouped, how='left', on='SK_ID_CURR')
print(f"Merged installments_payments features into df_test. New shape: {df_test.shape}")

# Mise à jour des DataFrames
dataframes['train'] = df_train
dataframes['test'] = df_test

print("\nFeature Engineering for Installments Payments completed.")
print(f"Total columns in df_train: {df_train.shape[1]}")


--- Starting Feature Engineering: Installments Payments ---
Installments Payments aggregation done. Aggregated shape: (339587, 23)
Merged installments_payments features into df_train. New shape: (307511, 851)
Merged installments_payments features into df_test. New shape: (48744, 850)

Feature Engineering for Installments Payments completed.
Total columns in df_train: 851


enrichissement des donner 4

In [14]:
import pandas as pd
import numpy as np

# Récupération des DataFrames mis à jour
dataframes = globals().get('dataframes') 

if dataframes is None or 'POS_CASH_balance' not in dataframes:
    print("Error: DataFrames not loaded or POS_CASH_balance is missing.")
    exit() 

df_train = dataframes['train']
df_test = dataframes['test']
df_pos_cash = dataframes['POS_CASH_balance']

print("\n--- Starting Feature Engineering: POS CASH Balance ---")

# --- 1. PRÉ-TRAITEMENT ---
print("Processing POS_CASH_balance...")

# One-Hot Encoding pour le statut du prêt (NAME_CONTRACT_STATUS)
df_pos_cash = pd.get_dummies(df_pos_cash, columns=['NAME_CONTRACT_STATUS'], dummy_na=False)

# Création d'une colonne indiquant les retards (Status 1 à 5)
status_cols = [col for col in df_pos_cash.columns if 'NAME_CONTRACT_STATUS_' in col]
if status_cols:
    df_pos_cash['LATE_PAYMENT_POS_CASH'] = df_pos_cash[status_cols].sum(axis=1)
    df_pos_cash['LATE_PAYMENT_POS_CASH'] = df_pos_cash['LATE_PAYMENT_POS_CASH'].clip(upper=1)
else:
    df_pos_cash['LATE_PAYMENT_POS_CASH'] = 0  # Aucun retard si la colonne n'existe pas

# --- 2. AGRÉGATION PAR CLIENT (SK_ID_CURR) ---
POS_CASH_IGNORE_COLS = ['SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE']
pos_cash_cols = [col for col in df_pos_cash.columns if col not in POS_CASH_IGNORE_COLS]

# Fonctions d'agrégation
pos_cash_agg_funcs = {
    'SK_ID_PREV': ['nunique'],
    'MONTHS_BALANCE': ['min', 'max', 'size'],
    'CNT_INSTALMENT': ['mean', 'sum', 'min', 'max', 'std'],
    'CNT_INSTALMENT_FUTURE': ['mean', 'sum', 'min', 'max', 'std'],
    'SK_DPD': ['mean', 'min', 'max', 'sum'],
    'SK_DPD_DEF': ['mean', 'min', 'max', 'sum'],
    'LATE_PAYMENT_POS_CASH': ['mean', 'sum'],
}

# Ajout des colonnes OHE existantes
for col in status_cols:
    pos_cash_agg_funcs[col] = ['mean', 'sum']

# Agrégation
pos_cash_grouped = df_pos_cash.groupby('SK_ID_CURR').agg(pos_cash_agg_funcs)

# Aplatir les noms de colonnes
pos_cash_grouped.columns = pd.Index([f'POS_{col[0]}_{col[1].upper()}' for col in pos_cash_grouped.columns.tolist()])

print(f"POS CASH Balance aggregation done. Aggregated shape: {pos_cash_grouped.shape}")

# --- 3. FUSION DANS LES DATAFRAMES PRINCIPAUX ---
df_train = df_train.merge(pos_cash_grouped, how='left', on='SK_ID_CURR')
print(f"Merged POS_CASH_balance features into df_train. New shape: {df_train.shape}")

df_test = df_test.merge(pos_cash_grouped, how='left', on='SK_ID_CURR')
print(f"Merged POS_CASH_balance features into df_test. New shape: {df_test.shape}")

# Mise à jour des DataFrames
dataframes['train'] = df_train
dataframes['test'] = df_test

print("\nFeature Engineering for POS CASH Balance completed.")
print(f"Total columns in df_train: {df_train.shape[1]}")



--- Starting Feature Engineering: POS CASH Balance ---
Processing POS_CASH_balance...
POS CASH Balance aggregation done. Aggregated shape: (337252, 42)
Merged POS_CASH_balance features into df_train. New shape: (307511, 893)
Merged POS_CASH_balance features into df_test. New shape: (48744, 892)

Feature Engineering for POS CASH Balance completed.
Total columns in df_train: 893


enrichissemnt donner 5

In [15]:
# Nécessaire pour les opérations de fusion et d'agrégation
import pandas as pd
import numpy as np

# Récupération des DataFrames mis à jour
dataframes = globals().get('dataframes') 

if dataframes is None or 'credit_card_balance' not in dataframes:
    print("Error: DataFrames not loaded or credit_card_balance is missing.")
    exit() 

df_train = dataframes['train']
df_test = dataframes['test']
df_cc_balance = dataframes['credit_card_balance']

print("\n--- Starting Feature Engineering: Credit Card Balance (FINAL FILE) ---")

# --- 1. PRÉ-TRAITEMENT ET FEATURE CREATION ---

print("Processing credit_card_balance...")

# One-Hot Encoding
df_cc_balance = pd.get_dummies(df_cc_balance, columns=['NAME_CONTRACT_STATUS'], dummy_na=False)

# Remplacer les valeurs infinies (souvent le résultat de divisions par zéro dans les ratios)
df_cc_balance = df_cc_balance.replace([np.inf, -np.inf], np.nan)
# Imputer les NaN par 0 est souvent une bonne première approche pour ce fichier
df_cc_balance.fillna(0, inplace=True) 

# Création de quelques ratios clés
# Ratio d'utilisation du crédit (très prédictif)
df_cc_balance['LIMIT_USE'] = df_cc_balance['AMT_BALANCE'] / df_cc_balance['AMT_CREDIT_LIMIT_ACTUAL']
df_cc_balance['LIMIT_USE'].replace([np.inf, -np.inf], np.nan, inplace=True)
df_cc_balance['LIMIT_USE'].fillna(0, inplace=True)

# Ratio Paiement / Paiement minimum (important : si < 1, le client n'a pas respecté le minimum)
df_cc_balance['PAYMENT_DIV_MIN'] = df_cc_balance['AMT_PAYMENT_CURRENT'] / df_cc_balance['AMT_INST_MIN_REGULARITY']
df_cc_balance['PAYMENT_DIV_MIN'].replace([np.inf, -np.inf], np.nan, inplace=True)
df_cc_balance['PAYMENT_DIV_MIN'].fillna(0, inplace=True)

# Jours de retard flag
df_cc_balance['CC_LATE_PAYMENT'] = (df_cc_balance['SK_DPD'] > 0).astype(np.int8)


# --- 2. AGRÉGATION PAR CLIENT (SK_ID_CURR) ---

CC_IGNORE_COLS = ['SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE']
cc_cols = [col for col in df_cc_balance.columns if col not in CC_IGNORE_COLS]

# Fonctions d'agrégation
cc_agg_funcs = {
    'SK_ID_PREV': ['nunique'],
    'MONTHS_BALANCE': ['min', 'max', 'size'],
    'AMT_BALANCE': ['mean', 'min', 'max', 'sum', 'std'],
    'AMT_CREDIT_LIMIT_ACTUAL': ['mean', 'min', 'max'],
    'AMT_DRAWINGS_CURRENT': ['mean', 'sum', 'max'],
    'AMT_INST_MIN_REGULARITY': ['mean', 'min', 'max', 'sum'],
    'AMT_PAYMENT_TOTAL_CURRENT': ['mean', 'sum', 'max', 'std'],
    'LIMIT_USE': ['mean', 'min', 'max', 'std'],
    'PAYMENT_DIV_MIN': ['mean', 'min', 'max', 'std'],
    'CC_LATE_PAYMENT': ['mean', 'sum'],
    'SK_DPD': ['mean', 'max', 'sum'],
}

# Ajout des colonnes OHE
for col in df_cc_balance.columns:
    if 'NAME_CONTRACT_STATUS_' in col:
        cc_agg_funcs[col] = ['mean', 'sum']

# Agrégation
cc_grouped = df_cc_balance.groupby('SK_ID_CURR').agg(cc_agg_funcs)

# Aplatir les noms de colonnes
cc_grouped.columns = pd.Index([f'CC_{col[0]}_{col[1].upper()}' for col in cc_grouped.columns.tolist()])

print(f"Credit Card Balance aggregation done. Aggregated shape: {cc_grouped.shape}")

# --- 3. FUSION DANS LES DATAFRAMES PRINCIPAUX ---

# Fusion avec df_train
df_train = df_train.merge(cc_grouped, how='left', on='SK_ID_CURR')
print(f"Merged credit_card_balance features into df_train. New shape: {df_train.shape}")

# Fusion avec df_test
df_test = df_test.merge(cc_grouped, how='left', on='SK_ID_CURR')
print(f"Merged credit_card_balance features into df_test. New shape: {df_test.shape}")

# Mise à jour des DataFrames
dataframes['train'] = df_train
dataframes['test'] = df_test

print("\nFeature Engineering for Credit Card Balance completed.")
print(f"Total columns in df_train: {df_train.shape[1]}")


--- Starting Feature Engineering: Credit Card Balance (FINAL FILE) ---
Processing credit_card_balance...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cc_balance['LIMIT_USE'].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cc_balance['LIMIT_USE'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obje

Credit Card Balance aggregation done. Aggregated shape: (103558, 50)
Merged credit_card_balance features into df_train. New shape: (307511, 943)
Merged credit_card_balance features into df_test. New shape: (48744, 942)

Feature Engineering for Credit Card Balance completed.
Total columns in df_train: 943


ETAPE 2 TRAITEMNT ML FLOW 

In [16]:
import mlflow
import mlflow.lightgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

# Features et target
X = df_train.drop(columns=['TARGET', 'SK_ID_CURR'])
y = df_train['TARGET']

# Split train/validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [28]:
import pandas as pd
import mlflow
import mlflow.lightgbm
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np

# --- 1. Chargement des données ---
data_path = "C:/Users/maill/OneDrive/Bureau/majeur_ia/dataenginnering/projet/home-credit-default-risk/application_train.csv"
df_train = pd.read_csv(data_path)

# Features / cible
X = df_train.drop(columns=['SK_ID_CURR', 'TARGET'])
y = df_train['TARGET']

# Identifier les colonnes numériques et catégorielles
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Split train / validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Conversion types pour LightGBM
X_train[num_cols] = X_train[num_cols].astype(np.float32)
X_val[num_cols] = X_val[num_cols].astype(np.float32)

# Conversion colonnes catégorielles en "category"
X_train[cat_cols] = X_train[cat_cols].astype("category")
X_val[cat_cols] = X_val[cat_cols].astype("category")

y_train = y_train.astype(np.int8)
y_val = y_val.astype(np.int8)

# --- 2. Configurer MLflow ---
mlflow.set_tracking_uri("file:///C:/Users/maill/OneDrive/Bureau/mlruns")
mlflow.set_experiment("home_credit_default_risk")

# --- 3. Entraînement LightGBM avec MLflow ---
with mlflow.start_run(run_name="lightgbm_baseline"):
    # Définition du modèle
    model = LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        n_estimators=1000,
        verbosity=-1,
        random_state=42
    )

    # Fit avec early stopping et logs toutes les 100 itérations
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=100)]
    )

    # Prédictions et AUC
    y_pred = model.predict_proba(X_val)[:, 1]
    auc_score = roc_auc_score(y_val, y_pred)
    print(f"Validation AUC: {auc_score:.4f}")

    # Logging manuel hyperparamètres et métrique
    mlflow.log_params(model.get_params())
    mlflow.log_metric("val_auc", auc_score)

    # Log du modèle LightGBM (booster_)
    mlflow.lightgbm.log_model(model.booster_, artifact_path="model")

    print("MLflow run completed and model logged successfully!")


Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.75657	valid_0's binary_logloss: 0.246512
[200]	valid_0's auc: 0.758908	valid_0's binary_logloss: 0.245595
Early stopping, best iteration is:
[190]	valid_0's auc: 0.758976	valid_0's binary_logloss: 0.245578




Validation AUC: 0.7590
MLflow run completed and model logged successfully!


PARTIE 3

In [29]:
import optuna
from optuna.integration import LightGBMPruningCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import mlflow
import mlflow.lightgbm
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# --- 1. RE-CHARGEMENT ET PRÉPARATION DES DONNÉES (Pour être autonome) ---
# Vous avez déjà fait ce chargement, mais on le garde ici si le notebook est relancé
data_path = "C:/Users/maill/OneDrive/Bureau/majeur_ia/dataenginnering/projet/home-credit-default-risk/application_train.csv"
df_train = pd.read_csv(data_path)
X = df_train.drop(columns=['SK_ID_CURR', 'TARGET'])
y = df_train['TARGET']

num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Split train / validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Conversion types pour LightGBM
X_train[num_cols] = X_train[num_cols].astype(np.float32)
X_val[num_cols] = X_val[num_cols].astype(np.float32)
X_train[cat_cols] = X_train[cat_cols].astype("category")
X_val[cat_cols] = X_val[cat_cols].astype("category")
y_train = y_train.astype(np.int8)
y_val = y_val.astype(np.int8)


# --- 2. CONFIGURER MLFLOW ---
# On réutilise votre configuration MLflow
mlflow.set_tracking_uri("file:///C:/Users/maill/OneDrive/Bureau/mlruns")
mlflow.set_experiment("home_credit_default_risk")


# --- 3. FONCTION OBJECTIF OPTUNA (avec Nested MLflow Run) ---

def objective(trial):
    """
    Fonction objectif pour Optuna, trace les résultats dans une run MLflow imbriquée.
    """
    # 1. Définition de l'espace de recherche
    param = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'n_estimators': 1000, 
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16, 128),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': -1,
    }

    # 2. Démarrer la run MLflow imbriquée
    # Le paramètre 'nested=True' permet d'enregistrer cet essai sous la run principale de l'étude
    with mlflow.start_run(run_name=f"trial_{trial.number}", nested=True):
        
        # Loguer les paramètres testés
        mlflow.log_params(param)
        
        # 3. Entraînement du modèle
        model = LGBMClassifier(**param)

        # Callback Optuna pour l'élagage (pruning) intelligent
        pruning_callback = LightGBMPruningCallback(trial, 'auc', valid_name='valid_0')

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='auc',
            callbacks=[early_stopping(stopping_rounds=50, verbose=-1), pruning_callback]
        )

        # 4. Évaluation et Log
        y_pred = model.predict_proba(X_val)[:, 1]
        auc_score = roc_auc_score(y_val, y_pred)
        
        mlflow.log_metric("val_auc", auc_score)
        
        # Log du modèle si c'est le meilleur essai de l'étude (facultatif mais utile)
        # On ne logue que si l'AUC est significativement meilleure que la baseline (0.7590)
        if auc_score > 0.7600:
             mlflow.lightgbm.log_model(
                 model.booster_, 
                 artifact_path=f"model_trial_{trial.number}",
                 registered_model_name="HomeCredit_LGBM_Optimized"
             )
        
        # Retourner la métrique à Optuna
        return auc_score

# --- 4. EXÉCUTION DE L'ÉTUDE OPTUNA (Main MLflow Run) ---

N_TRIALS = 50 # Nombre d'essais (augmentez si vous avez plus de temps)
STUDY_NAME = "LGBM_Optimization_Optuna"

# Démarrer la run principale pour l'étude entière
with mlflow.start_run(run_name=STUDY_NAME) as study_run:
    
    # Tags pour l'étude complète
    mlflow.set_tag("Optimization_Tool", "Optuna")
    mlflow.set_tag("Optimization_Phase", "4.2_Hyperparameter_Tuning")
    mlflow.log_param("N_TRIALS", N_TRIALS)
    mlflow.log_param("Validation_Strategy", "Holdout_Split_20pct")
    
    # Créer le study Optuna
    study = optuna.create_study(direction='maximize', study_name=STUDY_NAME)
    
    print(f"\nStarting {N_TRIALS} optimization trials...")
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

    # Récupération et log des meilleurs résultats
    best_params = study.best_params
    best_value = study.best_value
    
    # Loguer les résultats finaux dans la main run
    mlflow.log_metric("best_auc_found", best_value)
    mlflow.log_params({f"final_{k}": v for k, v in best_params.items()})

    print("\n--- Optimization Complete ---")
    print(f"Best AUC found: {best_value:.4f}")
    print(f"Best parameters: {best_params}")

[I 2025-12-13 16:29:35,297] A new study created in memory with name: LGBM_Optimization_Optuna



Starting 50 optimization trials...


  0%|          | 0/50 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[273]	valid_0's auc: 0.761202	valid_0's binary_logloss: 0.244932


  return FileStore(store_uri)
Successfully registered model 'HomeCredit_LGBM_Optimized'.
Created version '1' of model 'HomeCredit_LGBM_Optimized'.
Best trial: 0. Best value: 0.761202:   2%|▏         | 1/50 [00:37<30:33, 37.42s/it]

[I 2025-12-13 16:30:12,717] Trial 0 finished with value: 0.7612021352766262 and parameters: {'learning_rate': 0.0686366684731388, 'num_leaves': 74, 'max_depth': 5, 'min_child_samples': 45, 'subsample': 0.9451176882644856, 'colsample_bytree': 0.6950097958247732, 'reg_alpha': 0.010544887165465613, 'reg_lambda': 0.003193184470047511}. Best is trial 0 with value: 0.7612021352766262.
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[969]	valid_0's auc: 0.761335	valid_0's binary_logloss: 0.244817


Registered model 'HomeCredit_LGBM_Optimized' already exists. Creating a new version of this model...
Created version '2' of model 'HomeCredit_LGBM_Optimized'.
Best trial: 1. Best value: 0.761335:   4%|▍         | 2/50 [01:23<33:51, 42.33s/it]

[I 2025-12-13 16:30:58,475] Trial 1 finished with value: 0.7613351759390266 and parameters: {'learning_rate': 0.043521511072668516, 'num_leaves': 89, 'max_depth': 3, 'min_child_samples': 55, 'subsample': 0.9428355459739732, 'colsample_bytree': 0.8348599127686024, 'reg_alpha': 2.1264075957530384, 'reg_lambda': 2.717094731430946}. Best is trial 1 with value: 0.7613351759390266.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[541]	valid_0's auc: 0.761422	valid_0's binary_logloss: 0.244769


Registered model 'HomeCredit_LGBM_Optimized' already exists. Creating a new version of this model...
Created version '3' of model 'HomeCredit_LGBM_Optimized'.
Best trial: 2. Best value: 0.761422:   6%|▌         | 3/50 [02:03<32:34, 41.59s/it]

[I 2025-12-13 16:31:39,195] Trial 2 finished with value: 0.7614221728333789 and parameters: {'learning_rate': 0.09568614915227072, 'num_leaves': 33, 'max_depth': 3, 'min_child_samples': 82, 'subsample': 0.996852935670556, 'colsample_bytree': 0.9999538384549915, 'reg_alpha': 4.529034056419769e-05, 'reg_lambda': 1.3491337131447425e-07}. Best is trial 2 with value: 0.7614221728333789.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[438]	valid_0's auc: 0.760317	valid_0's binary_logloss: 0.245095


Registered model 'HomeCredit_LGBM_Optimized' already exists. Creating a new version of this model...
Created version '4' of model 'HomeCredit_LGBM_Optimized'.
Best trial: 2. Best value: 0.761422:   8%|▊         | 4/50 [02:52<33:56, 44.26s/it]

[I 2025-12-13 16:32:27,548] Trial 3 finished with value: 0.7603174465768496 and parameters: {'learning_rate': 0.02059809995365799, 'num_leaves': 127, 'max_depth': 9, 'min_child_samples': 47, 'subsample': 0.6237727549362793, 'colsample_bytree': 0.641129360204188, 'reg_alpha': 0.28733274655242, 'reg_lambda': 0.0030085559688135598}. Best is trial 2 with value: 0.7614221728333789.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[518]	valid_0's auc: 0.761413	valid_0's binary_logloss: 0.244777


Registered model 'HomeCredit_LGBM_Optimized' already exists. Creating a new version of this model...
Created version '5' of model 'HomeCredit_LGBM_Optimized'.
Best trial: 2. Best value: 0.761422:  10%|█         | 5/50 [03:32<32:07, 42.83s/it]

[I 2025-12-13 16:33:07,847] Trial 4 finished with value: 0.7614131991968828 and parameters: {'learning_rate': 0.07498813548376373, 'num_leaves': 50, 'max_depth': 3, 'min_child_samples': 37, 'subsample': 0.9732981505855366, 'colsample_bytree': 0.942999532462288, 'reg_alpha': 7.300707743788206e-05, 'reg_lambda': 0.021511945818516495}. Best is trial 2 with value: 0.7614221728333789.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  12%|█▏        | 6/50 [03:59<27:23, 37.35s/it]

[I 2025-12-13 16:33:34,558] Trial 5 pruned. Trial was pruned at iteration 239.


Best trial: 2. Best value: 0.761422:  14%|█▍        | 7/50 [04:20<22:55, 31.99s/it]

[I 2025-12-13 16:33:55,508] Trial 6 pruned. Trial was pruned at iteration 0.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  16%|█▌        | 8/50 [04:41<20:00, 28.59s/it]

[I 2025-12-13 16:34:16,813] Trial 7 pruned. Trial was pruned at iteration 26.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  18%|█▊        | 9/50 [05:02<17:58, 26.30s/it]

[I 2025-12-13 16:34:38,079] Trial 8 pruned. Trial was pruned at iteration 28.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  20%|██        | 10/50 [05:23<16:25, 24.64s/it]

[I 2025-12-13 16:34:58,998] Trial 9 pruned. Trial was pruned at iteration 19.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[175]	valid_0's auc: 0.759992	valid_0's binary_logloss: 0.24528


Best trial: 2. Best value: 0.761422:  22%|██▏       | 11/50 [05:56<17:37, 27.11s/it]

[I 2025-12-13 16:35:31,722] Trial 10 finished with value: 0.7599915030100155 and parameters: {'learning_rate': 0.0997946467919018, 'num_leaves': 16, 'max_depth': 5, 'min_child_samples': 88, 'subsample': 0.6736297378098522, 'colsample_bytree': 0.9951266654957489, 'reg_alpha': 0.0005113576626955219, 'reg_lambda': 1.3909840694277171e-06}. Best is trial 2 with value: 0.7614221728333789.


Best trial: 2. Best value: 0.761422:  24%|██▍       | 12/50 [06:17<15:54, 25.13s/it]

[I 2025-12-13 16:35:52,315] Trial 11 pruned. Trial was pruned at iteration 0.


Best trial: 2. Best value: 0.761422:  26%|██▌       | 13/50 [06:37<14:40, 23.80s/it]

Training until validation scores don't improve for 50 rounds
[I 2025-12-13 16:36:13,052] Trial 12 pruned. Trial was pruned at iteration 7.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  28%|██▊       | 14/50 [06:58<13:44, 22.91s/it]

[I 2025-12-13 16:36:33,897] Trial 13 pruned. Trial was pruned at iteration 16.


Best trial: 2. Best value: 0.761422:  30%|███       | 15/50 [07:19<12:56, 22.17s/it]

[I 2025-12-13 16:36:54,369] Trial 14 pruned. Trial was pruned at iteration 0.


Best trial: 2. Best value: 0.761422:  32%|███▏      | 16/50 [07:39<12:15, 21.65s/it]

Training until validation scores don't improve for 50 rounds
[I 2025-12-13 16:37:14,789] Trial 15 pruned. Trial was pruned at iteration 3.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[216]	valid_0's auc: 0.759686	valid_0's binary_logloss: 0.245377


Best trial: 2. Best value: 0.761422:  34%|███▍      | 17/50 [08:16<14:30, 26.38s/it]

[I 2025-12-13 16:37:52,177] Trial 16 finished with value: 0.7596859433844403 and parameters: {'learning_rate': 0.07290090191214572, 'num_leaves': 58, 'max_depth': 6, 'min_child_samples': 21, 'subsample': 0.8238474475612387, 'colsample_bytree': 0.9331951636673116, 'reg_alpha': 1.7783352777032117e-05, 'reg_lambda': 0.006523072719765516}. Best is trial 2 with value: 0.7614221728333789.


Best trial: 2. Best value: 0.761422:  36%|███▌      | 18/50 [08:39<13:29, 25.30s/it]

Training until validation scores don't improve for 50 rounds
[I 2025-12-13 16:38:14,964] Trial 17 pruned. Trial was pruned at iteration 7.


Best trial: 2. Best value: 0.761422:  38%|███▊      | 19/50 [09:00<12:21, 23.91s/it]

Training until validation scores don't improve for 50 rounds
[I 2025-12-13 16:38:35,654] Trial 18 pruned. Trial was pruned at iteration 3.


Best trial: 2. Best value: 0.761422:  40%|████      | 20/50 [09:22<11:44, 23.48s/it]

[I 2025-12-13 16:38:58,111] Trial 19 pruned. Trial was pruned at iteration 0.


Best trial: 2. Best value: 0.761422:  42%|████▏     | 21/50 [09:44<11:09, 23.08s/it]

Training until validation scores don't improve for 50 rounds
[I 2025-12-13 16:39:20,254] Trial 20 pruned. Trial was pruned at iteration 1.


Best trial: 2. Best value: 0.761422:  44%|████▍     | 22/50 [10:05<10:26, 22.38s/it]

[I 2025-12-13 16:39:41,011] Trial 21 pruned. Trial was pruned at iteration 0.


Best trial: 2. Best value: 0.761422:  46%|████▌     | 23/50 [10:26<09:51, 21.92s/it]

[I 2025-12-13 16:40:01,842] Trial 22 pruned. Trial was pruned at iteration 0.


Best trial: 2. Best value: 0.761422:  48%|████▊     | 24/50 [10:46<09:18, 21.48s/it]

[I 2025-12-13 16:40:22,291] Trial 23 pruned. Trial was pruned at iteration 0.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  50%|█████     | 25/50 [11:07<08:52, 21.31s/it]

[I 2025-12-13 16:40:43,221] Trial 24 pruned. Trial was pruned at iteration 12.


Best trial: 2. Best value: 0.761422:  52%|█████▏    | 26/50 [11:28<08:25, 21.06s/it]

[I 2025-12-13 16:41:03,702] Trial 25 pruned. Trial was pruned at iteration 0.


Best trial: 2. Best value: 0.761422:  54%|█████▍    | 27/50 [11:49<08:01, 20.94s/it]

Training until validation scores don't improve for 50 rounds
[I 2025-12-13 16:41:24,360] Trial 26 pruned. Trial was pruned at iteration 12.


Best trial: 2. Best value: 0.761422:  56%|█████▌    | 28/50 [12:09<07:36, 20.76s/it]

[I 2025-12-13 16:41:44,712] Trial 27 pruned. Trial was pruned at iteration 0.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  58%|█████▊    | 29/50 [12:30<07:15, 20.73s/it]

[I 2025-12-13 16:42:05,375] Trial 28 pruned. Trial was pruned at iteration 11.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  60%|██████    | 30/50 [12:50<06:53, 20.69s/it]

[I 2025-12-13 16:42:25,969] Trial 29 pruned. Trial was pruned at iteration 15.


Best trial: 2. Best value: 0.761422:  62%|██████▏   | 31/50 [13:10<06:30, 20.54s/it]

[I 2025-12-13 16:42:46,138] Trial 30 pruned. Trial was pruned at iteration 0.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  64%|██████▍   | 32/50 [13:31<06:09, 20.54s/it]

[I 2025-12-13 16:43:06,677] Trial 31 pruned. Trial was pruned at iteration 15.


Best trial: 2. Best value: 0.761422:  66%|██████▌   | 33/50 [13:51<05:47, 20.44s/it]

[I 2025-12-13 16:43:26,903] Trial 32 pruned. Trial was pruned at iteration 0.


Best trial: 2. Best value: 0.761422:  68%|██████▊   | 34/50 [14:11<05:26, 20.40s/it]

[I 2025-12-13 16:43:47,215] Trial 33 pruned. Trial was pruned at iteration 0.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[218]	valid_0's auc: 0.760918	valid_0's binary_logloss: 0.245231


Registered model 'HomeCredit_LGBM_Optimized' already exists. Creating a new version of this model...
Created version '6' of model 'HomeCredit_LGBM_Optimized'.
Best trial: 2. Best value: 0.761422:  70%|███████   | 35/50 [15:07<07:44, 30.99s/it]

[I 2025-12-13 16:44:42,908] Trial 34 finished with value: 0.7609181814888235 and parameters: {'learning_rate': 0.059675697488993644, 'num_leaves': 93, 'max_depth': 8, 'min_child_samples': 58, 'subsample': 0.9506751811770928, 'colsample_bytree': 0.7292348899363165, 'reg_alpha': 0.030221647727149862, 'reg_lambda': 0.08815481107636274}. Best is trial 2 with value: 0.7614221728333789.


Best trial: 2. Best value: 0.761422:  72%|███████▏  | 36/50 [15:27<06:28, 27.78s/it]

[I 2025-12-13 16:45:03,181] Trial 35 pruned. Trial was pruned at iteration 0.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  74%|███████▍  | 37/50 [15:48<05:32, 25.58s/it]

[I 2025-12-13 16:45:23,623] Trial 36 pruned. Trial was pruned at iteration 12.


Best trial: 2. Best value: 0.761422:  76%|███████▌  | 38/50 [16:08<04:47, 23.95s/it]

[I 2025-12-13 16:45:43,787] Trial 37 pruned. Trial was pruned at iteration 0.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[196]	valid_0's auc: 0.760031	valid_0's binary_logloss: 0.245274


Registered model 'HomeCredit_LGBM_Optimized' already exists. Creating a new version of this model...
Created version '7' of model 'HomeCredit_LGBM_Optimized'.
Best trial: 2. Best value: 0.761422:  78%|███████▊  | 39/50 [16:46<05:09, 28.15s/it]

[I 2025-12-13 16:46:21,732] Trial 38 finished with value: 0.7600314551073974 and parameters: {'learning_rate': 0.07108442853103897, 'num_leaves': 45, 'max_depth': 9, 'min_child_samples': 53, 'subsample': 0.9996223254719888, 'colsample_bytree': 0.9769518139401069, 'reg_alpha': 0.0014320927186858296, 'reg_lambda': 0.0007394330177547064}. Best is trial 2 with value: 0.7614221728333789.


Best trial: 2. Best value: 0.761422:  80%|████████  | 40/50 [17:06<04:17, 25.76s/it]

[I 2025-12-13 16:46:41,924] Trial 39 pruned. Trial was pruned at iteration 0.


Best trial: 2. Best value: 0.761422:  82%|████████▏ | 41/50 [17:26<03:36, 24.08s/it]

[I 2025-12-13 16:47:02,066] Trial 40 pruned. Trial was pruned at iteration 0.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  84%|████████▍ | 42/50 [17:49<03:08, 23.62s/it]

[I 2025-12-13 16:47:24,633] Trial 41 pruned. Trial was pruned at iteration 97.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[126]	valid_0's auc: 0.759776	valid_0's binary_logloss: 0.245287


Best trial: 2. Best value: 0.761422:  86%|████████▌ | 43/50 [18:22<03:05, 26.52s/it]

[I 2025-12-13 16:47:57,915] Trial 42 finished with value: 0.7597764064750256 and parameters: {'learning_rate': 0.08480913778555102, 'num_leaves': 88, 'max_depth': 8, 'min_child_samples': 50, 'subsample': 0.9765139433979347, 'colsample_bytree': 0.6254646170540221, 'reg_alpha': 0.020564663022969186, 'reg_lambda': 0.15987670128162923}. Best is trial 2 with value: 0.7614221728333789.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  88%|████████▊ | 44/50 [18:46<02:34, 25.71s/it]

[I 2025-12-13 16:48:21,729] Trial 43 pruned. Trial was pruned at iteration 123.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  90%|█████████ | 45/50 [19:08<02:03, 24.68s/it]

[I 2025-12-13 16:48:44,010] Trial 44 pruned. Trial was pruned at iteration 46.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  92%|█████████▏| 46/50 [19:29<01:34, 23.64s/it]

[I 2025-12-13 16:49:05,205] Trial 45 pruned. Trial was pruned at iteration 11.
Training until validation scores don't improve for 50 rounds


Best trial: 2. Best value: 0.761422:  94%|█████████▍| 47/50 [19:53<01:10, 23.48s/it]

[I 2025-12-13 16:49:28,333] Trial 46 pruned. Trial was pruned at iteration 105.


Best trial: 2. Best value: 0.761422:  96%|█████████▌| 48/50 [20:13<00:45, 22.69s/it]

[I 2025-12-13 16:49:49,186] Trial 47 pruned. Trial was pruned at iteration 0.


Best trial: 2. Best value: 0.761422:  98%|█████████▊| 49/50 [20:35<00:22, 22.26s/it]

Training until validation scores don't improve for 50 rounds
[I 2025-12-13 16:50:10,430] Trial 48 pruned. Trial was pruned at iteration 6.


Best trial: 2. Best value: 0.761422: 100%|██████████| 50/50 [20:57<00:00, 25.14s/it]

[I 2025-12-13 16:50:32,406] Trial 49 pruned. Trial was pruned at iteration 0.

--- Optimization Complete ---
Best AUC found: 0.7614
Best parameters: {'learning_rate': 0.09568614915227072, 'num_leaves': 33, 'max_depth': 3, 'min_child_samples': 82, 'subsample': 0.996852935670556, 'colsample_bytree': 0.9999538384549915, 'reg_alpha': 4.529034056419769e-05, 'reg_lambda': 1.3491337131447425e-07}





PARTIE 4

In [30]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.lightgbm
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

# --- 1. CONFIGURATION ET RÉCUPÉRATION DES DONNÉES ---
mlflow.set_tracking_uri("file:///C:/Users/maill/OneDrive/Bureau/mlruns")
mlflow.set_experiment("home_credit_default_risk")

# Chemin des données (à ajuster si besoin)
train_path = "C:/Users/maill/OneDrive/Bureau/majeur_ia/dataenginnering/projet/home-credit-default-risk/application_train.csv"
test_path = "C:/Users/maill/OneDrive/Bureau/majeur_ia/dataenginnering/projet/home-credit-default-risk/application_test.csv"
sample_submission_path = "C:/Users/maill/OneDrive/Bureau/majeur_ia/dataenginnering/projet/home-credit-default-risk/sample_submission.csv"

# Chargement des données
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
submission = pd.read_csv(sample_submission_path)

# Préparation (doit être la même que pour l'optimisation)
X_train_full = df_train.drop(columns=['SK_ID_CURR', 'TARGET'])
y_train_full = df_train['TARGET']
X_test = df_test.drop(columns=['SK_ID_CURR'])

# Identifier les colonnes et types
num_cols = X_train_full.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train_full.select_dtypes(include=['object']).columns

# Appliquer la même transformation de types pour LightGBM
for df in [X_train_full, X_test]:
    df[num_cols] = df[num_cols].astype(np.float32)
    for col in cat_cols:
        # Gère les catégories manquantes dans le test set
        if col in df.columns:
            df[col] = df[col].astype("category")

y_train_full = y_train_full.astype(np.int8)

# --- 2. RÉCUPÉRATION DES MEILLEURS PARAMÈTRES OPTUNA ---

# Nous utilisons ici les meilleurs paramètres que vous avez trouvés
BEST_PARAMS = {
    'learning_rate': 0.09568614915227072, 
    'num_leaves': 33, 
    'max_depth': 3, 
    'min_child_samples': 82, 
    'subsample': 0.996852935670556, 
    'colsample_bytree': 0.9999538384549915, 
    'reg_alpha': 4.529034056419769e-05, 
    'reg_lambda': 1.3491337131447425e-07,
    # Paramètres additionnels LightGBM
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'n_estimators': 1000, # Laisser élevé pour que l'early stopping fonctionne
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': -1,
}

# --- 3. ENTRAÎNEMENT FINAL AVEC MLflow ---
print("--- Starting Final Model Training (Partie 4 / Étape 4.3) ---")

with mlflow.start_run(run_name="final_optimized_model"):
    
    # Ajout de tags pour identifier cette run
    mlflow.set_tag("Model_Phase", "4.3_Final_Training")
    mlflow.set_tag("Optimization_Source", "Optuna_Trial_2")
    mlflow.log_param("Train_Data_Size", len(X_train_full))
    
    # Initialisation du modèle avec les meilleurs paramètres
    final_model = LGBMClassifier(**BEST_PARAMS)
    
    # Entraînement sur la totalité des données d'entraînement (sans validation split ici)
    # Dans un vrai cas, on ferait du K-Fold ici, mais pour la simplicité, on fait un fit direct.
    final_model.fit(
        X_train_full, y_train_full,
    )
    
    # Prédictions sur le jeu de test
    test_preds = final_model.predict_proba(X_test)[:, 1]

    # --- 4. CRÉATION DU FICHIER DE SOUMISSION ---
    submission['TARGET'] = test_preds
    submission_filename = f'submission_optimized_auc_0.7614_{pd.Timestamp.now().strftime("%Y%m%d_%H%M")}.csv'
    submission.to_csv(submission_filename, index=False)
    
    print(f"\nSubmission file created: {submission_filename}")
    
    # --- 5. LOGGING FINAL DANS MLflow ---
    # Loguer les paramètres complets du modèle final
    mlflow.log_params(final_model.get_params())
    
    # Loguer le modèle final lui-même
    mlflow.lightgbm.log_model(
        final_model.booster_, 
        artifact_path="final_model_artifact",
        registered_model_name="HomeCredit_LGBM_FINAL"
    )

    # Loguer le fichier de soumission comme artefact
    mlflow.log_artifact(submission_filename)

print("\n--- Final Model Training and Submission Completed ---")
print("Votre fichier de soumission est prêt à être envoyé à Kaggle !")

--- Starting Final Model Training (Partie 4 / Étape 4.3) ---





Submission file created: submission_optimized_auc_0.7614_20251213_1711.csv

--- Final Model Training and Submission Completed ---
Votre fichier de soumission est prêt à être envoyé à Kaggle !


Successfully registered model 'HomeCredit_LGBM_FINAL'.
Created version '1' of model 'HomeCredit_LGBM_FINAL'.
