In [None]:
# pip install seaborn

In [None]:
import mlflow
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



# Préparation des données

In [None]:
app_test = pd.read_csv(r'application_test.csv')
print('Testing data shape: ', app_test.shape)
app_test.head()

In [None]:
app_train = pd.read_csv(r'application_train.csv')
print('Training data shape: ', app_train.shape)
app_train.head()

In [None]:
app_train['TARGET'].value_counts()


In [None]:
app_train['TARGET'].astype(int).plot.hist();


On observe un fort déséquilibre entre les classes dans le jeu de données : les valeurs 1 sont beaucoup plus rares que les 0.

In [None]:
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarize the missing values of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column whose percentage of missing values is not 0,
        indexed by column name, with the columns ``'Missing Values'`` (count)
        and ``'% of Total Values'`` (share of rows), rounded to one decimal
        and sorted by decreasing percentage.

    A two-line summary (number of columns of ``df`` and number of columns
    with missing values) is printed as a side effect.
    """
    # Scan the frame for nulls once and derive the percentage from that count
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # Side-by-side table, with the final column names set directly
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only the columns that have missing values, worst first
    has_missing = mis_val_table['% of Total Values'] != 0
    mis_val_table_ren_columns = (
        mis_val_table[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    return mis_val_table_ren_columns

missing_values = missing_values_table(app_train)
missing_values.head(20)

In [None]:
# Number of each type of column
app_train.dtypes.value_counts()

In [None]:
# Number of unique classes in each object column
app_train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

## Encodage des variables catégorielles

On applique un label encoding, qui associe chaque catégorie à un entier arbitraire, aux variables qui n'ont que deux catégories, et un one-hot encoding aux variables qui ont plus de deux catégories.

In [None]:
# A single encoder instance is re-fitted for each column it is applied to
le = LabelEncoder()

# Text columns of the training set that hold at most two distinct values
binary_object_cols = [
    col for col in app_train
    if app_train[col].dtype == 'object' and len(app_train[col].unique()) <= 2
]

for col in binary_object_cols:
    # Learn the mapping on the training data only ...
    le.fit(app_train[col])
    # ... and apply that same mapping to both the training and testing data
    app_train[col] = le.transform(app_train[col])
    app_test[col] = le.transform(app_test[col])

# Number of columns that were label encoded
le_count = len(binary_object_cols)

print('%d columns were label encoded.' % le_count)

In [None]:
# one-hot encoding of categorical variables
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

In [None]:
train_labels = app_train['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

# Add the target back in
app_train['TARGET'] = train_labels

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

Les jeux de données de train et de test ont maintenant les mêmes variables (hormis TARGET, présente uniquement dans le jeu de train), ce qui est requis pour faire du machine learning.

# Analyses exploratoires des données

In [None]:
(app_train['DAYS_BIRTH'] / -365).describe()

Il ne semble pas y avoir d'âge aberrant.

In [None]:
app_train['DAYS_EMPLOYED'].describe()

Là, il y a un problème, car les valeurs maximales sont bien trop grandes pour être réalistes.

In [None]:
app_train['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment Histogram');
plt.xlabel('Days Employment')

In [None]:
#Analyse approfondie des valeurs anormales de la variable DAYS_EMPLOYED
anom = app_train[app_train['DAYS_EMPLOYED'] == 365243]
non_anom = app_train[app_train['DAYS_EMPLOYED'] != 365243]
print('The non-anomalies default on %0.2f%% of loans' % (100 * non_anom['TARGET'].mean()))
print('The anomalies default on %0.2f%% of loans' % (100 * anom['TARGET'].mean()))
print('There are %d anomalous days of employment' % len(anom))

In [None]:
# Create an anomalous flag column. It has to be computed before the 365243
# sentinel is replaced below, otherwise no row would be flagged.
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243

# Replace the anomalous values with nan. The result is assigned back to the
# column: `app_train[col].replace(..., inplace=True)` is a chained in-place
# update on an intermediate Series, which pandas flags with a FutureWarning and
# which no longer modifies the DataFrame once Copy-on-Write is enabled.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

app_train['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment Histogram');
plt.xlabel('Days Employment');

In [None]:
# Same treatment as for the training set: flag the 365243 sentinel first, then
# replace it with NaN. The replacement is assigned back to the column instead of
# using a chained `replace(..., inplace=True)`, which does not modify the
# DataFrame when pandas Copy-on-Write is enabled.
app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
app_test["DAYS_EMPLOYED"] = app_test["DAYS_EMPLOYED"].replace({365243: np.nan})

print('Il y a %d anomalies dans le jeu de donn√©es de test parmi %d donn√©es' % (app_test["DAYS_EMPLOYED_ANOM"].sum(), len(app_test)))

In [None]:
# Find correlations with the target and sort
correlations = app_train.corr()['TARGET'].sort_values()

# Display correlations
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))

L'âge (DAYS_BIRTH) présente la plus forte corrélation positive avec la cible ; cependant, cette valeur étant négative (nombre de jours écoulés depuis la naissance du client par rapport au jour du prêt), cela signifie que plus un client est âgé, plus il y a de chances que le prêt soit remboursé à temps.

In [None]:
# Find the correlation of the positive days since birth and target
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_train['DAYS_BIRTH'].corr(app_train['TARGET'])

In [None]:
# Set the style of plots
plt.style.use('fivethirtyeight')

# Plot the distribution of ages in years
plt.hist(app_train['DAYS_BIRTH'] / 365, edgecolor = 'k', bins = 25)
plt.title('Age of Client'); plt.xlabel('Age (years)'); plt.ylabel('Count');

In [None]:
plt.figure(figsize = (10, 8))

# KDE plot of loans that were repaid on time
sns.kdeplot(app_train.loc[app_train['TARGET'] == 0, 'DAYS_BIRTH'] / 365, label = 'target == 0')

# KDE plot of loans which were not repaid on time
sns.kdeplot(app_train.loc[app_train['TARGET'] == 1, 'DAYS_BIRTH'] / 365, label = 'target == 1')

# Labeling of plot
plt.xlabel('Age (years)'); plt.ylabel('Density'); plt.title('Distribution of Ages');

# The `label=` values above are only displayed once a legend is requested;
# without it the two curves cannot be told apart.
plt.legend();


In [None]:
# Age information into a separate dataframe. `.copy()` makes it independent of
# app_train, so the column assignments below act on a real DataFrame rather
# than on a slice (which triggers SettingWithCopyWarning).
age_data = app_train[['TARGET', 'DAYS_BIRTH']].copy()
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH'] / 365

# Bin the age data into ten 5-year intervals between 20 and 70
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))
age_data.head(10)

In [None]:
age_groups  = age_data.groupby('YEARS_BINNED').mean()
age_groups

In [None]:
plt.figure(figsize = (8, 8))

# Graph the age bins and the average of the target as a bar plot
plt.bar(age_groups.index.astype(str), 100 * age_groups['TARGET'])

# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('Age Group (years)'); plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group');

In [None]:
# Extract the EXT_SOURCE variables and show correlations
ext_data = app_train[['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()
ext_data_corrs

In [None]:
plt.figure(figsize = (8, 6))

# Heatmap of correlations
sns.heatmap(ext_data_corrs, cmap = plt.cm.RdYlBu_r, vmin = -0.25, annot = True, vmax = 0.6)
plt.title('Correlation Heatmap');

L'âge est fortement corrélé à la variable "EXT_SOURCE_1".

In [None]:
plt.figure(figsize = (10, 12))

# iterate through the sources
for i, source in enumerate(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']):
    
    # create a new subplot for each source
    plt.subplot(3, 1, i + 1)
    # plot repaid loans
    sns.kdeplot(app_train.loc[app_train['TARGET'] == 0, source], label = 'target == 0')
    # plot loans that were not repaid
    sns.kdeplot(app_train.loc[app_train['TARGET'] == 1, source], label = 'target == 1')
    
    # Label the plots
    plt.title('Distribution of %s by Target Value' % source)
    plt.xlabel('%s' % source); plt.ylabel('Density')
    plt.legend()
    
plt.tight_layout(h_pad = 2.5)

In [None]:
# Copy the EXT_SOURCE data (without DAYS_BIRTH) for plotting
plot_data = ext_data.drop(columns = ['DAYS_BIRTH']).copy()

# Add in the age of the client in years (values are matched on the row index)
plot_data['YEARS_BIRTH'] = age_data['YEARS_BIRTH']

# Drop rows containing NaN, then keep the rows whose index label is <= 100000.
# NOTE(review): `.loc[:100000]` is a label-based, end-inclusive slice, so after
# dropna() this keeps fewer than 100000 rows rather than "the first 100000" --
# presumably the index is the default integer index from read_csv; confirm.
plot_data = plot_data.dropna().loc[:100000, :]

# Function to annotate the current axes with the correlation coefficient of two columns
def corr_func(x, y, **kwargs):
    """Write ``r = <Pearson correlation of x and y>`` on the current axes.

    Extra keyword arguments are accepted and ignored.
    NOTE(review): this helper is defined here but not mapped onto the grid
    in this cell.
    """
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8), xycoords=ax.transAxes,
                size = 20)

# Create the pairgrid object: one row/column per feature, coloured by TARGET
grid = sns.PairGrid(data = plot_data, height = 3, diag_sharey=False,
                    hue = 'TARGET', 
                    vars = [x for x in list(plot_data.columns) if x != 'TARGET'])

# Upper triangle: scatter plots
grid.map_upper(plt.scatter, alpha = 0.2)

# Diagonal: kernel density estimate of each feature
grid.map_diag(sns.kdeplot)

# Lower triangle: 2D density plots
grid.map_lower(sns.kdeplot, cmap = plt.cm.OrRd_r);

plt.suptitle('Ext Source and Age Features Pairs Plot', size = 32, y = 1.05);

# Feature engineering

In [None]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from lightgbm import LGBMClassifier

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# One-hot encode every object-dtype column of a frame
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode the ``object`` columns of ``df`` with ``pandas.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool, default True
        Passed as ``dummy_na``: when True an extra ``<column>_nan`` indicator
        is created for each encoded column.

    Returns
    -------
    tuple (pandas.DataFrame, list)
        The encoded frame, and the names of the columns that were not present
        in ``df`` (i.e. the indicator columns that were created).
    """
    columns_before = df.columns.tolist()

    # Columns to encode: those whose dtype is 'object'
    object_columns = []
    for name, dtype in df.dtypes.items():
        if dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    created = [name for name in encoded.columns if name not in columns_before]
    return encoded, created

In [None]:
# Preprocess one application file (application_train.csv or application_test.csv)
def application_train_test(df_path, num_rows = None, nan_as_category = False):
    """Load one application CSV and return it encoded, cleaned and enriched.

    Parameters
    ----------
    df_path : str or path-like
        CSV file to read (passed to ``pandas.read_csv``).
    num_rows : int, optional
        Number of rows to read; ``None`` reads the whole file.
    nan_as_category : bool, default False
        Forwarded to ``one_hot_encoder``.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``CODE_GENDER`` is not ``'XNA'``, with the three binary
        text columns integer-coded, the remaining object columns one-hot
        encoded, the 365243 value of ``DAYS_EMPLOYED`` replaced by NaN and
        five ratio features added.

    Notes
    -----
    The indicator columns come from the categories present in this file only,
    so two different files can yield different column sets; align them before
    fitting or predicting.
    """
    df = pd.read_csv(df_path, nrows= num_rows)
    print("Dataframe lengh: {}".format(len(df)))

    # Remove the applications with XNA CODE_GENDER. `.copy()` gives an
    # independent frame, so the column assignments below do not act on a slice.
    df = df[df['CODE_GENDER'] != 'XNA'].copy()

    # Categorical features with binary encoding (0 or 1; two categories).
    # sort=True numbers the categories in sorted order instead of order of first
    # appearance: each file is encoded by its own call, and without it the same
    # category could get code 0 in one file and code 1 in another.
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], _ = pd.factorize(df[bin_feature], sort=True)

    # Remaining categorical features: one-hot encoding
    df, cat_cols = one_hot_encoder(df, nan_as_category)

    # DAYS_EMPLOYED: 365243 -> NaN. Assigned back to the column because a chained
    # `df[col].replace(..., inplace=True)` does not modify `df` under Copy-on-Write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    # Some simple new features (ratios)
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

    gc.collect()
    return df

In [None]:
# Preprocess bureau.csv and bureau_balance.csv
def bureau_and_balance(num_rows = None, nan_as_category = True):
    """Aggregate ``bureau.csv`` and ``bureau_balance.csv`` per ``SK_ID_CURR``.

    Parameters
    ----------
    num_rows : int, optional
        ``nrows`` passed to both ``read_csv`` calls (each file is truncated
        independently); ``None`` reads the whole files.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder`` for both tables.

    Returns
    -------
    pandas.DataFrame
        One row per ``SK_ID_CURR`` (the groupby key is the index) with three
        families of columns: ``BURO_*`` (all credits, numeric and categorical
        aggregates), ``ACTIVE_*`` and ``CLOSED_*`` (numeric aggregates
        restricted to the rows where ``CREDIT_ACTIVE_Active`` /
        ``CREDIT_ACTIVE_Closed`` equals 1). Column names follow the pattern
        ``<PREFIX>_<source column>_<AGGREGATION>``.
    """
    bureau = pd.read_csv('bureau.csv', nrows = num_rows)
    bb = pd.read_csv('bureau_balance.csv', nrows = num_rows)
    # One-hot encode both tables and keep the names of the indicator columns created
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Bureau balance: aggregate per SK_ID_BUREAU and merge into bureau
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    # Flatten the (column, aggregation) MultiIndex into "<column>_<AGG>" names
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    # The join key is dropped once the balance aggregates are attached
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
    del bb, bb_agg
    gc.collect()

    # Bureau and bureau_balance numeric features
    # (the MONTHS_BALANCE_* keys are the columns produced by the join above)
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Bureau and bureau_balance categorical features: mean of each indicator.
    # The bureau_balance indicators were already averaged per SK_ID_BUREAU, hence
    # the "_MEAN" suffix on their column names.
    cat_aggregations = {}
    for cat in bureau_cat: cat_aggregations[cat] = ['mean']
    for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']

    # All credits: numeric + categorical aggregates per client
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    # Left join: clients without active credits get NaN in the ACTIVE_* columns
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
    del active, active_agg
    gc.collect()
    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg


In [None]:
# Preprocess previous_application.csv
def previous_applications(num_rows = None, nan_as_category = True):
    """Aggregate ``previous_application.csv`` into one row of features per client.

    Parameters
    ----------
    num_rows : int, optional
        ``nrows`` passed to ``read_csv``; ``None`` reads the whole file.
    nan_as_category : bool, default True
        Accepted for signature parity with the other loaders; the encoder is
        always called with ``nan_as_category=True``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR`` with ``PREV_*`` columns (all applications,
        numeric and categorical aggregates) plus ``APPROVED_*`` and
        ``REFUSED_*`` columns (numeric aggregates restricted to the rows where
        ``NAME_CONTRACT_STATUS_Approved`` / ``NAME_CONTRACT_STATUS_Refused``
        equals 1).
    """
    prev = pd.read_csv('previous_application.csv', nrows = num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category= True)

    # Treat the value 365243 in the DAYS_* columns as missing. The result is
    # assigned back to each column: a chained `prev[col].replace(..., inplace=True)`
    # acts on an intermediate Series and leaves `prev` untouched under Copy-on-Write.
    for days_col in ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                     'DAYS_LAST_DUE', 'DAYS_TERMINATION']:
        prev[days_col] = prev[days_col].replace(365243, np.nan)

    # Add feature: amount applied for / amount granted
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']

    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features: mean of each indicator column
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']

    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    # Flatten the (column, aggregation) MultiIndex into "PREV_<column>_<AGG>"
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg

# Build per-client features from POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    """Aggregate ``POS_CASH_balance.csv`` per ``SK_ID_CURR``.

    ``num_rows`` is passed to ``read_csv`` as ``nrows``. ``nan_as_category``
    is accepted but not used: the encoder is always called with
    ``nan_as_category=True``.

    Returns a DataFrame indexed by ``SK_ID_CURR`` whose columns are named
    ``POS_<column>_<AGGREGATION>``, plus ``POS_COUNT`` (number of rows per
    client).
    """
    balance, dummy_cols = one_hot_encoder(
        pd.read_csv('POS_CASH_balance.csv', nrows=num_rows),
        nan_as_category=True,
    )

    # Fixed numeric aggregations, then the mean of every indicator column
    spec = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean'],
        **{name: ['mean'] for name in dummy_cols},
    }

    per_client = balance.groupby('SK_ID_CURR')
    summary = per_client.agg(spec)
    summary.columns = pd.Index(
        ['POS_' + source + "_" + stat.upper() for source, stat in summary.columns.tolist()]
    )
    # Number of POS-CASH rows recorded for each client
    summary['POS_COUNT'] = per_client.size()

    del per_client, balance
    gc.collect()
    return summary
    
# Preprocess installments_payments.csv
def installments_payments(num_rows = None, nan_as_category = True):
    """Aggregate ``installments_payments.csv`` into one row of features per client.

    Parameters
    ----------
    num_rows : int, optional
        ``nrows`` passed to ``read_csv``; ``None`` reads the whole file.
    nan_as_category : bool, default True
        Accepted for signature parity with the other loaders; the encoder is
        always called with ``nan_as_category=True``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR`` with columns named
        ``INSTAL_<column>_<AGGREGATION>`` plus ``INSTAL_COUNT`` (number of
        rows per client).
    """
    ins = pd.read_csv('installments_payments.csv', nrows = num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)

    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']

    # Days past due (DPD) and days before due (DBD), floored at 0.
    # `where(x > 0, 0)` is the vectorised form of the row-wise
    # `apply(lambda x: x if x > 0 else 0)`: positive values are kept and
    # everything else -- including NaN, for which `x > 0` is False -- becomes 0.
    days_late = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    days_early = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = days_late.where(days_late > 0, 0)
    ins['DBD'] = days_early.where(days_early > 0, 0)

    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    # Flatten the (column, aggregation) MultiIndex into "INSTAL_<column>_<AGG>"
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments rows per client
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg

# Build per-client features from credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True):
    """Aggregate ``credit_card_balance.csv`` per ``SK_ID_CURR``.

    ``num_rows`` is passed to ``read_csv`` as ``nrows``. ``nan_as_category``
    is accepted but not used: the encoder is always called with
    ``nan_as_category=True``.

    Every column except ``SK_ID_PREV`` (dropped) and the grouping key is
    summarised with min, max, mean, sum and var. The result is indexed by
    ``SK_ID_CURR``, with columns named ``CC_<column>_<AGGREGATION>`` plus
    ``CC_COUNT`` (number of rows per client).
    """
    cards, _ = one_hot_encoder(
        pd.read_csv('credit_card_balance.csv', nrows=num_rows),
        nan_as_category=True,
    )

    # General aggregations over everything but the previous-application id
    cards = cards.drop(columns=['SK_ID_PREV'])
    per_client = cards.groupby('SK_ID_CURR')
    summary = per_client.agg(['min', 'max', 'mean', 'sum', 'var'])
    summary.columns = pd.Index(
        ['CC_' + source + "_" + stat.upper() for source, stat in summary.columns.tolist()]
    )
    # Number of credit card rows recorded for each client
    summary['CC_COUNT'] = per_client.size()

    del per_client, cards
    gc.collect()
    return summary


In [None]:
def run_full_feature_engineering(df_path, debug = False):
    """Build the full modelling table for one application file.

    The application file at ``df_path`` is preprocessed first, then the
    aggregated features of each auxiliary table (bureau, previous
    applications, POS-CASH, installments, credit card) are left-joined to it
    on ``SK_ID_CURR``, in that order.

    Parameters
    ----------
    df_path : str or path-like
        Application CSV to process.
    debug : bool, default False
        When True every CSV is read with ``nrows=10000`` for a quick run.

    Returns
    -------
    pandas.DataFrame
        The application rows enriched with all aggregated feature columns.
    """
    # In debug mode only 10000 rows of each file are loaded
    num_rows = 10000 if debug else None

    # 1. Main table
    print("Processing application train/test...")
    df = application_train_test(df_path, num_rows)
    print("Main table shape:", df.shape)

    # 2-6. Auxiliary tables: (progress message, feature builder), in join order
    stages = (
        ("Processing bureau...", bureau_and_balance),
        ("Processing previous applications...", previous_applications),
        ("Processing POS-CASH balance...", pos_cash),
        ("Processing installments payments...", installments_payments),
        ("Processing credit card balance...", credit_card_balance),
    )
    for message, build_features in stages:
        print(message)
        features = build_features(num_rows)
        df = df.join(features, how='left', on='SK_ID_CURR')
        # Drop the intermediate table before loading the next one to free memory
        del features
        gc.collect()

    print("Final DataFrame shape:", df.shape)
    return df

In [None]:
#Application de tout ce processus
train_df = run_full_feature_engineering(df_path = "application_train.csv", debug=True)

In [None]:
train_df.head()

In [None]:
train_df.isna().mean()

In [None]:
#Application de tout ce processus
test_df = run_full_feature_engineering(df_path = "application_test.csv", debug=True)

# Test d'après la page MLflow

In [None]:
# mlflow.set_experiment("MLflow Quickstart")

# # Load the Iris dataset
# X, y = datasets.load_iris(return_X_y=True)

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )

# # Define the model hyperparameters
# params = {
#     "solver": "lbfgs",
#     "max_iter": 1000,
#     "multi_class": "auto",
#     "random_state": 8888,
# }

In [None]:
# # Enable autologging for scikit-learn
# mlflow.sklearn.autolog()

# # Just train the model normally
# lr = LogisticRegression(**params)
# lr.fit(X_train, y_train)

### Pour logger le modèle manuellement

In [None]:
# # Start an MLflow run
# with mlflow.start_run():
#     # Log the hyperparameters
#     mlflow.log_params(params)

#     # Train the model
#     lr = LogisticRegression(**params)
#     lr.fit(X_train, y_train)

#     # Log the model
#     model_info = mlflow.sklearn.log_model(sk_model=lr, name="iris_model")

#     # Predict on the test set, compute and log the loss metric
#     y_pred = lr.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     mlflow.log_metric("accuracy", accuracy)

#     # Optional: Set a tag that we can use to remind ourselves what this run was for
#     mlflow.set_tag("Training Info", "Basic LR model for iris data")

### Pour récupérer le modèle depuis MLflow

In [None]:
# # Load the model back for predictions as a generic Python Function model
# loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)

# predictions = loaded_model.predict(X_test)

# iris_feature_names = datasets.load_iris().feature_names

# result = pd.DataFrame(X_test, columns=iris_feature_names)
# result["actual_class"] = y_test
# result["predicted_class"] = predictions

# result[:4]

# Initialisation de l'environnement MLflow

In [None]:
mlflow.set_experiment("Elaboration du mod√®le de scoring - P7 DS")

# Test de différents modèles

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer



In [None]:
y_complet = train_df['TARGET']
X_complet = train_df.drop(columns=['TARGET', 'SK_ID_CURR'])

X_train, X_val, y_train, y_val = train_test_split(
    X_complet, y_complet, test_size=0.2, random_state=42, stratify=y_complet
)

print(f"Taille du jeu d'entra√Ænement complet : {X_complet.shape}")
print(f"Taille du nouveau jeu d'entra√Ænement : {X_train.shape}")
print(f"Taille du jeu de validation : {X_val.shape}")

In [None]:
# Test feature matrix: the engineered test table without the client identifier
X_test = test_df.drop(columns=['SK_ID_CURR'])

# The label names the frame whose shape is actually printed
print("Shape de X_test :", X_test.shape)

In [None]:
def get_param_grid(model):
    """Return the ``GridSearchCV`` parameter grid matching the class of ``model``.

    Parameters
    ----------
    model : object
        Estimator instance; only its class name is inspected.

    Returns
    -------
    dict
        Grid whose keys carry the ``model__`` prefix (the estimator is expected
        to be the pipeline step named ``'model'``). An empty dict is returned
        for a class name that has no grid defined here.
    """
    # Dispatch on the class name so that one function serves every model type
    name = model.__class__.__name__
    grids = {
        "LogisticRegression": {
            'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'model__penalty': ['l1', 'l2'],
            'model__solver': ['liblinear'],
            # Fixed seed, like every other stochastic step of the notebook
            'model__random_state': [42]
        },
        "RandomForestClassifier": {
            'model__n_estimators': [100, 200],
            'model__max_depth': [5, 10, None],
            'model__class_weight': ['balanced', None],  # handles the class imbalance
            # Fixed seed so that two runs of the search give the same forests
            'model__random_state': [42]
        },
        "XGBClassifier": {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [2, 3, 6],
            'model__scale_pos_weight': [1, 10],  # handles the class imbalance
            'model__learning_rate': [0.01, 0.05, 0.1],
            'model__random_state': [42],
            # Set to silence a warning.
            # NOTE(review): presumably obsolete with recent XGBoost releases -- confirm.
            'model__use_label_encoder': [False],
            'model__eval_metric': ['logloss']
        }
    }

    return grids.get(name, {})

In [None]:
# Preprocessing + model pipeline, tuned by grid search and scored on the validation set
def pipeline_model(model_type, X_train, y_train, X_val, y_val):
    """Tune ``model_type`` inside a preprocessing pipeline and evaluate it.

    The pipeline is: median imputation -> standard scaling -> ``model_type``.
    Its hyper-parameters (grid from ``get_param_grid``) are selected by a
    5-fold stratified grid search scored on ROC AUC. The search result is then
    scored on the validation set and its ROC curve is drawn. Everything runs
    inside an MLflow run named ``GS_<model class name>``, to which the
    validation AUC (metric ``auc_score``) and the ROC figure are logged.

    Parameters
    ----------
    model_type : estimator instance
        Unfitted classifier used as the last pipeline step.
    X_train, y_train
        Data used by the grid search.
    X_val, y_val
        Hold-out data used for the reported AUC and the ROC curve.

    Returns
    -------
    tuple
        ``(fitted GridSearchCV, validation AUC, matplotlib Figure of the ROC curve)``.
        The figure is left open so that the caller can display it.
    """
    model_name = model_type.__class__.__name__
    with mlflow.start_run(run_name=f"GS_{model_name}", nested=True):
        pipeline = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),  # fills the NaN values
            ('scaler', StandardScaler()),                   # standardises the features
            ('model', model_type),
        ])

        param_grid = get_param_grid(model_type)

        print(f"Hyperparam√®tres pour {model_name} : {param_grid}")

        mlflow.set_tag("model_type", model_name)
        mlflow.set_tag("mlflow.runName", f"GS_{model_name}")

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        grid_search_model = GridSearchCV(pipeline, param_grid, cv=cv, scoring='roc_auc', verbose=1, n_jobs=-1)
        grid_search_model.fit(X_train, y_train)

        # Probability of the positive class on the validation set
        y_val_score = grid_search_model.predict_proba(X_val)[:, 1]
        val_auc = roc_auc_score(y_val, y_val_score)
        print(f"Score AUC sur validation : {val_auc:.4f}")

        # ROC curve
        fpr, tpr, _ = roc_curve(y_val, y_val_score)
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'Mod√®le (AUC = {val_auc:.3f})')
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Hasard')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('Taux de Faux Positifs')
        ax.set_ylabel('Taux de Vrais Positifs')
        ax.set_title(f'Courbe ROC - Mod√®le {model_name}')
        ax.legend(loc="lower right")
        ax.grid(True)

        # Log the validation score and the curve in the run, under the same
        # metric name as the Dummy baseline ("auc_score"), so that all models
        # can be compared in the MLflow UI
        mlflow.log_metric("auc_score", val_auc)
        mlflow.log_figure(fig, f"roc_curve_{model_name}.png")

    return grid_search_model, val_auc, fig

## Modèle de base - Dummy

C'est le modèle de base qui va nous servir de référence pour comparer les modèles que nous allons tester. En effet, si leurs performances sont équivalentes, voire inférieures, aux siennes, alors ces modèles ne seront pas à retenir pour répondre au besoin du client.

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
dumb_model = DummyClassifier(strategy="most_frequent")

In [None]:
# Fit and evaluate the baseline classifier inside an MLflow run named "Dummy Model"
with mlflow.start_run(run_name="Dummy Model"):
    dumb_model.fit(X_train, y_train)
    # Column 1 of predict_proba is used as the score for the AUC and the ROC curve
    y_val_pred__dumb = dumb_model.predict_proba(X_val)[:, 1]
    val_auc = roc_auc_score(y_val, y_val_pred__dumb)
    print(f"Score AUC sur validation : {val_auc:.4f}")
    # Draw the ROC curve, with the diagonal as the chance-level reference
    fpr, tpr, thresholds = roc_curve(y_val, y_val_pred__dumb)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'Mod√®le (AUC = {val_auc:.3f})')
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Hasard')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('Taux de Faux Positifs')
    ax.set_ylabel('Taux de Vrais Positifs')
    ax.set_title('Courbe ROC - Mod√®le Dummy')
    ax.legend(loc="lower right")
    ax.grid(True)
    # Attach the figure to the run, then close it
    mlflow.log_figure(fig, "roc_curve_dumb.png")
    plt.close(fig)
    # Store the model, its strategy and its validation AUC in MLflow
    mlflow.sklearn.log_model(dumb_model, "dummy_model")
    mlflow.log_param("strategy", "most_frequent")
    mlflow.log_metric("auc_score", val_auc)
    print("Run MLFlow termin√©.")


## Régression logistique

In [None]:
mlflow.sklearn.autolog()

In [None]:
lr, auc_lr, fig_lr = pipeline_model(LogisticRegression(), X_train, y_train, X_val, y_val)



## Random Forest Classification

In [None]:
rf, auc_rf, fig_rf = pipeline_model(RandomForestClassifier(), X_train, y_train, X_val, y_val)


## Modèle XGBoost

In [None]:
xgb, auc_xgb, fig_xgb = pipeline_model(XGBClassifier(), X_train, y_train, X_val, y_val)

# Modèle LightGBM

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import re


In [None]:
def clean_dataset_for_lgbm(df):
    """Return ``df`` with every ``object`` column converted to numeric.

    Values that cannot be parsed as numbers become NaN
    (``pandas.to_numeric(..., errors='coerce')``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is never modified: when a conversion is needed the
        work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when it has no ``object`` column, otherwise a converted
        copy.
    """
    # Columns stored with the 'object' (text) dtype
    obj_cols = df.select_dtypes(include=['object']).columns.tolist()

    if len(obj_cols) > 0:
        print(f"‚ö†Ô∏è Correction de {len(obj_cols)} colonnes de type 'object'...")
        # Work on a copy: overwriting columns of the caller's frame would alter
        # it behind the caller's back, and raises SettingWithCopyWarning when
        # that frame is itself a slice of another one.
        df = df.copy()
        for col in obj_cols:
            # Force the conversion; non-numeric text becomes NaN
            df[col] = pd.to_numeric(df[col], errors='coerce')

    print("‚úÖ Types de donn√©es nettoy√©s. Pr√™t pour LightGBM.")
    return df

# Applique le nettoyage sur tes jeux de donn√©es
X_train = clean_dataset_for_lgbm(X_train)
X_val = clean_dataset_for_lgbm(X_val)

In [None]:
def clean_feature_names(df):
    """Rename the columns of ``df`` in place so they only hold alphanumerics and "_".

    Each column label is converted to ``str`` and every character for which
    ``str.isalnum()`` is False is replaced by an underscore. The same
    DataFrame object is returned after its ``columns`` have been reassigned.
    """
    def _sanitize(label):
        # Keep alphanumeric characters, turn anything else into "_"
        return "".join(ch if ch.isalnum() else "_" for ch in str(label))

    df.columns = [_sanitize(label) for label in df.columns]
    print("‚úÖ Noms de colonnes nettoy√©s pour LightGBM.")
    return df

# --- APPLIQUE √áA AVANT TON GRIDSEARCH ---
X_train = clean_feature_names(X_train)
X_val = clean_feature_names(X_val)

In [None]:
def grid_search_lightgbm_with_early_stopping(X_train, y_train, X_val, y_val):
    """Grid-search an LGBMClassifier on ROC AUC, with early stopping, and return the best model.

    A 3-fold stratified ``GridSearchCV`` is run over the grid defined below. The
    extra keyword arguments given to ``grid.fit`` (``eval_set``, ``eval_metric``,
    ``callbacks``) are forwarded to each LightGBM fit, so every candidate is
    stopped once the AUC on ``(X_val, y_val)`` has not improved for 100 rounds.

    NOTE(review): the same validation set drives early stopping for every
    candidate, so metrics measured afterwards on ``(X_val, y_val)`` are
    optimistic.

    Args:
        X_train: training features.
        y_train: training target.
        X_val: validation features, used as the early-stopping evaluation set.
        y_val: validation target.

    Returns:
        ``grid.best_estimator_``, the best candidate refitted by the search.
    """
    # Base model. n_jobs=1 leaves parallelism to GridSearchCV (n_jobs=-1 below).
    # subsample_freq=1 enables bagging: LightGBM ignores `subsample` while
    # subsample_freq is 0 (its default), which would make the `subsample` axis
    # of the grid a no-op that only doubles the number of fits.
    lgbm = lgb.LGBMClassifier(random_state=42, n_jobs=1, subsample_freq=1)

    # Grid of hyper-parameters. No "model__" prefix: the estimator is not wrapped
    # in a Pipeline. random_state / n_jobs are fixed on the base model above, so
    # they are not repeated here as single-valued grid entries.
    param_grid = {
        'n_estimators': [100, 500, 1000],
        'learning_rate': [0.02, 0.05],
        'num_leaves': [34, 50],
        'colsample_bytree': [0.7, 0.9],
        'subsample': [0.8, 1],
        'max_depth': [8, -1],
        'scale_pos_weight': [1, 10],  # counteracts the class imbalance
    }
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # 3 folds to keep the search fast

    grid = GridSearchCV(
        estimator=lgbm,
        param_grid=param_grid,
        cv=cv,
        scoring='roc_auc',
        verbose=1,
        n_jobs=-1
    )

    print("üöÄ Lancement du GridSearch avec Early Stopping...")

    grid.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],        # validation set monitored for early stopping
        eval_metric='auc',                # metric monitored on that set
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=0)  # period=0 silences per-iteration logging
        ]
    )

    print(f"‚úÖ Meilleur score AUC (interne CV) : {grid.best_score_:.4f}")
    print(f"‚úÖ Meilleurs param√®tres : {grid.best_params_}")

    return grid.best_estimator_

In [None]:
# Launch the AUC-scored LightGBM grid search and keep the estimator it returns (grid.best_estimator_).
best_lgbmmodel = grid_search_lightgbm_with_early_stopping(X_train, y_train, X_val, y_val)

# Calcul du score métier et du seuil

In [None]:
from sklearn.metrics import confusion_matrix, make_scorer


In [None]:
def calcul_score_metier(y_true, y_pred):
    """Business cost of a set of binary predictions: ``10 * FN + FP``.

    A false negative is weighted ten times more than a false positive.
    Lower is better.

    Args:
        y_true: true binary labels (0 / 1).
        y_pred: predicted binary labels (0 / 1).

    Returns:
        The total cost ``10 * fn + fp`` computed from the confusion matrix.
    """
    # labels=[0, 1] forces a 2x2 matrix even when y_true and y_pred contain a
    # single class; without it ravel() yields one value and the unpacking raises.
    _tn, fp, fn, _tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    score = (10 * fn) + fp

    return score

In [None]:
# Expose the business cost as a scikit-learn scorer; greater_is_better=False
# declares it as a loss, i.e. a quantity the model search must minimise.
score_metier = make_scorer(
    calcul_score_metier,
    greater_is_better=False,
)
score_metier

In [None]:
def pipeline_model_score(model_type, X_train, y_train, X_val, y_val):
    """Grid-search a model on the business cost and evaluate it on the validation set.

    The estimator is wrapped in a pipeline (median imputation, standard scaling,
    model) and tuned with a 5-fold stratified ``GridSearchCV`` scored by
    ``score_metier``. Everything runs inside a nested MLflow run named
    ``GS_<ModelClass>``, which receives the best CV cost, the validation AUC,
    the validation business cost and the ROC curve figure.

    Args:
        model_type: estimator instance to tune; its class name labels the run.
        X_train: training features.
        y_train: training target.
        X_val: validation features.
        y_val: validation target.

    Returns:
        Tuple ``(fitted GridSearchCV, validation AUC, ROC figure, validation cost)``.
    """
    model_name = model_type.__class__.__name__

    with mlflow.start_run(run_name=f"GS_{model_name}", nested=True):
        pipeline = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('model', model_type),
        ])
        param_grid = get_param_grid(model_type)

        print(f"Hyperparam√®tres pour {model_name} : {param_grid}")

        mlflow.set_tag("model_type", model_name)
        mlflow.set_tag("mlflow.runName", f"GS_{model_name}")

        # Model selection is driven by the business scorer
        search = GridSearchCV(
            pipeline,
            param_grid,
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring=score_metier,
            verbose=1,
            n_jobs=-1,
        )
        search.fit(X_train, y_train)

        # The scorer was built with greater_is_better=False, so best_score_ is
        # negative; its absolute value is the cost itself.
        best_cost = abs(search.best_score_)
        print(f"Meilleur co√ªt m√©tier moyen sur le CV : {best_cost:.2f}")
        mlflow.log_metric("best_cv_business_cost", best_cost)

        # Validation AUC from the positive-class probabilities
        val_proba = search.predict_proba(X_val)[:, 1]
        val_auc = roc_auc_score(y_val, val_proba)

        # Validation business cost from the hard 0/1 predictions
        val_cost = calcul_score_metier(y_val, search.predict(X_val))

        print(f"Score AUC sur validation : {val_auc:.4f}")
        print(f"Co√ªt M√©tier sur validation : {val_cost} ‚Ç¨")

        mlflow.log_metric("val_auc", val_auc)
        mlflow.log_metric("val_business_cost", val_cost)

        # ROC curve against the chance diagonal
        fpr, tpr, _ = roc_curve(y_val, val_proba)
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'Mod√®le (AUC = {val_auc:.3f})')
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Hasard')
        ax.set(
            xlim=(0.0, 1.0),
            ylim=(0.0, 1.05),
            xlabel='Taux de Faux Positifs',
            ylabel='Taux de Vrais Positifs',
            title=f'Courbe ROC - Mod√®le {model_name}',
        )
        ax.legend(loc="lower right")
        ax.grid(True)

        # Attach the figure to the MLflow run
        mlflow.log_figure(fig, f"roc_curve_{model_name}.png")

    return search, val_auc, fig, val_cost

In [None]:
def grid_search_lightgbm_with_early_stopping_score(X_train, y_train, X_val, y_val):
    """Grid-search LightGBM on the business cost, with early stopping, inside an MLflow run.

    The features are median-imputed and standard-scaled (preprocessor fitted on
    ``X_train`` only), then a 3-fold stratified ``GridSearchCV`` scored by
    ``score_metier`` is run. The keyword arguments given to ``grid.fit`` are
    forwarded to each LightGBM fit, so every candidate is stopped once the AUC on
    the preprocessed validation set has not improved for 100 rounds.

    The nested MLflow run ``GS_LGBMClassifier`` receives the best CV cost, the
    best parameters, the validation AUC, the validation business cost and the
    ROC curve figure.

    NOTE(review): the returned estimator was trained on the imputed/scaled
    arrays and the fitted preprocessor is not returned; callers must apply the
    same preprocessing before predicting. The validation set also drives early
    stopping, so the validation metrics are optimistic.

    Args:
        X_train: training features.
        y_train: training target.
        X_val: validation features.
        y_val: validation target.

    Returns:
        Tuple ``(best estimator, validation AUC, ROC figure, validation cost)``.
    """
    model_name = "LGBMClassifier"
    with mlflow.start_run(run_name=f"GS_{model_name}", nested=True):
        preprocessor = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        X_train_proc = preprocessor.fit_transform(X_train)
        X_val_proc = preprocessor.transform(X_val)

        # Base model. n_jobs=1 leaves parallelism to GridSearchCV (n_jobs=-1 below).
        # subsample_freq=1 enables bagging: LightGBM ignores `subsample` while
        # subsample_freq is 0 (its default), which would make the `subsample`
        # axis of the grid a no-op that only doubles the number of fits.
        lgbm = lgb.LGBMClassifier(random_state=42, n_jobs=1, subsample_freq=1)

        # Grid of hyper-parameters. No "model__" prefix: the estimator is not
        # wrapped in a Pipeline. random_state / n_jobs are already fixed on the
        # base model, so they are not repeated as single-valued grid entries;
        # this also keeps `n_jobs` out of best_params_, so logging those params
        # below cannot clash with an `n_jobs` param recorded for the search
        # object itself when sklearn autologging is enabled.
        param_grid = {
            'n_estimators': [100, 500, 1000],
            'learning_rate': [0.02, 0.05],
            'num_leaves': [34, 50],
            'colsample_bytree': [0.7, 0.9],
            'subsample': [0.8, 1],
            'max_depth': [8, -1],
            'class_weight': [None, 'balanced'],  # counteracts the class imbalance
        }

        mlflow.set_tag("model_type", model_name)
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # 3 folds to keep the search fast

        grid = GridSearchCV(
            estimator=lgbm,
            param_grid=param_grid,
            cv=cv,
            scoring=score_metier,
            verbose=1,
            n_jobs=-1
        )

        print("üöÄ Lancement du GridSearch avec Early Stopping...")

        grid.fit(
            X_train_proc, y_train,
            eval_set=[(X_val_proc, y_val)],   # validation set monitored for early stopping
            eval_metric='auc',                # AUC rather than the business cost, which is not continuous
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=0)  # period=0 silences per-iteration logging
            ]
        )

        # The scorer was built with greater_is_better=False, so best_score_ is negative
        best_cost = abs(grid.best_score_)
        print(f"Meilleur co√ªt m√©tier moyen sur le CV : {best_cost:.2f}")
        mlflow.log_metric("best_cv_business_cost", best_cost)

        mlflow.log_params(grid.best_params_)

        # Validation metrics: hard predictions for the cost, probabilities for the AUC
        y_val_pred = grid.predict(X_val_proc)
        y_val_probs = grid.predict_proba(X_val_proc)[:, 1]
        val_auc = roc_auc_score(y_val, y_val_probs)
        real_cost_val = calcul_score_metier(y_val, y_val_pred)

        # Log the validation metrics under the same keys as pipeline_model_score,
        # so LightGBM runs can be compared with the other models in MLflow
        mlflow.log_metric("val_auc", val_auc)
        mlflow.log_metric("val_business_cost", real_cost_val)

        # ROC curve against the chance diagonal
        fpr, tpr, thresholds = roc_curve(y_val, y_val_probs)
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'Mod√®le (AUC = {val_auc:.3f})')
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Hasard')
        ax.set_title(f'Courbe ROC - Mod√®le {model_name}')
        ax.legend(loc="lower right")

        mlflow.log_figure(fig, f"roc_curve_{model_name}.png")

        print(f"‚úÖ Meilleur score score m√©tier (CV) : {grid.best_score_:.4f}")
        print(f"‚úÖ Meilleurs param√®tres : {grid.best_params_}")

        return grid.best_estimator_, val_auc, fig, real_cost_val

In [None]:
# The helper returns (best model, validation AUC, ROC figure, validation business cost),
# in that order: the figure comes third and the cost last.
best_lgbmmodel, lgbm_auc, fig_lgbm, lgbm_score = grid_search_lightgbm_with_early_stopping_score(X_train, y_train, X_val, y_val)

In [None]:
models_list = [LogisticRegression(), RandomForestClassifier(), XGBClassifier()]

# Collect one record per model and build the comparison table once, instead of
# growing a DataFrame with pd.concat inside the loop (quadratic, and concatenating
# onto an empty frame is deprecated in recent pandas versions).
score_records = []
for model in models_list:
    grid_search_model, val_auc, fig, val_cost = pipeline_model_score(model, X_train, y_train, X_val, y_val)
    score_records.append({
        "Mod√®le": model.__class__.__name__,
        "AUC": val_auc,
        "Score m√©tier": val_cost
    })

score_model_df = pd.DataFrame(score_records, columns=["Mod√®le", "AUC", "Score m√©tier"])

In [None]:
# Add (or refresh) the LightGBM row of the comparison table. Any existing
# "LightGBM" row is dropped first so that re-running this cell does not
# append a duplicate.
lgbm_row = pd.DataFrame([{
        "Mod√®le": "LightGBM",
        "AUC": lgbm_auc,
        "Score m√©tier": lgbm_score
    }])
score_model_df = pd.concat(
    [score_model_df[score_model_df["Mod√®le"] != "LightGBM"], lgbm_row],
    ignore_index=True
)
print(score_model_df)