# Import

In [6]:
!pip show shap

[0m

In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
import time
import gc
import os
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import warnings

In [2]:
# Creating the customer csv for the dashboard

In [3]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

# Preprocess application_train.csv and application_test.csv
def application_test(num_rows = None, nan_as_category = False):
    # Read data and merge
    test_df = pd.read_csv('credit_files/application_test.csv')
    df = pd.read_csv('credit_files/application_train.csv')
    print("Test samples: {}".format(len(test_df)))
    
    # Merging
    df = pd.concat([df,test_df])
    df = df.reset_index()

    # Remove 4 applications with XNA CODE_GENDER (train set)
    df = df[df['CODE_GENDER'] != 'XNA']
    
    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)
    
    # NaN values for DAYS_EMPLOYED: 365.243 -> nan
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
    
    #Only keeping relevant columns
    df = df[['EXT_SOURCE_3','EXT_SOURCE_2', 'NAME_EDUCATION_TYPE_Higher education','NAME_INCOME_TYPE_Working',
             'NAME_EDUCATION_TYPE_Secondary / secondary special','CODE_GENDER','NAME_CONTRACT_TYPE_Cash loans',
             'REGION_RATING_CLIENT', 'FLAG_DOCUMENT_3']]

    df = df.dropna()

    del test_df
    gc.collect()
    return df

#Checking the first part
trial_1 = application_test()
trial_1

Test samples: 7999


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)


Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,NAME_EDUCATION_TYPE_Higher education,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Secondary / secondary special,CODE_GENDER,NAME_CONTRACT_TYPE_Cash loans,REGION_RATING_CLIENT,FLAG_DOCUMENT_3
2,0.729567,0.555912,False,True,True,1,False,2.0,0.0
6,0.492060,0.724000,True,False,False,0,True,2.0,0.0
7,0.540654,0.714279,True,False,False,1,True,3.0,1.0
8,0.751724,0.205747,False,False,True,0,True,2.0,1.0
10,0.363945,0.651862,True,True,False,0,True,2.0,1.0
...,...,...,...,...,...,...,...,...,...
15989,0.520898,0.610072,False,False,True,1,True,2.0,1.0
15990,0.520898,0.765863,True,True,False,0,True,1.0,1.0
15991,0.775155,0.786587,True,False,False,1,True,2.0,1.0
15992,0.169429,0.538766,False,True,True,0,True,2.0,1.0


In [4]:
trial_1.to_csv('credit_files/cust_dash.csv', index=False)

# Fin