### Import Requirements

In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

#memory management
import gc

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib for plotting
import matplotlib.pyplot as plt
import seaborn as sns

#to impute missing values
from sklearn.preprocessing import Imputer, MinMaxScaler

from sklearn.feature_selection import VarianceThreshold, RFE, SelectFromModel, chi2, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## Utility Functions
### Set folder paths for getting input and saving outputs

In [2]:
def setupFolderPaths():    
    #Set input data folder 
    dataFolder = os.getcwd() + os.sep + os.pardir + os.sep + 'ProjectDataFiles'
    if(not os.path.exists(dataFolder)):
        print("Input Data folder not found. Please specify data folder path as dataFolder variable to proceed")
        raise NotADirectoryError

    #Create output folder is it does not exist
    outputFolder = os.getcwd() + os.sep + os.pardir + os.sep + 'CodeOutputs'

    if not os.path.exists(outputFolder):
        os.makedirs(outputFolder)
        print('Output Folder created')
        
    return dataFolder, outputFolder

In [3]:
def readFile(fileName):
    if(not os.path.exists(fileName)):
        raise FileNotFoundError
        
    return pd.read_csv(fileName)

## Replace all the outliers in the input data set with nan

In [4]:
#Function to replace outliers in input data
def replace_outliers(df):
    #Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    if 'CODE_GENDER' in df:
        df = df[df['CODE_GENDER'] != 'XNA']

    #Replace outlier values with nan
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].map(lambda x: x if x <= 0 else np.nan)
    df['REGION_RATING_CLIENT_W_CITY'] = df['REGION_RATING_CLIENT_W_CITY'].map(lambda x: x if x >= 0 else np.nan)
    df['AMT_INCOME_TOTAL'] = df['AMT_INCOME_TOTAL'].map(lambda x: x if x <= 5e6 else np.nan)
    df['AMT_REQ_CREDIT_BUREAU_QRT'] = df['AMT_REQ_CREDIT_BUREAU_QRT'].map(lambda x: x if x <= 10 else np.nan)
    df['OBS_30_CNT_SOCIAL_CIRCLE'] = df['OBS_30_CNT_SOCIAL_CIRCLE'].map(lambda x: x if x <= 40 else np.nan)
    df['OBS_60_CNT_SOCIAL_CIRCLE'] = df['OBS_60_CNT_SOCIAL_CIRCLE'].map(lambda x: x if x <= 50 else np.nan)
    df['DEF_30_CNT_SOCIAL_CIRCLE'] = df['DEF_30_CNT_SOCIAL_CIRCLE'].map(lambda x: x if x <= 100 else np.nan)
    
    return df

### Identify Feature types : Catagorical, Numerical, Integer and Boolean (Flags)

In [5]:
#Function to identify feature types in a given data frame
def identify_feature_types(df, features_to_ignore, verbose = False):
    categorical_features = list(f for f in df.select_dtypes(include='object') if f not in features_to_ignore)
    floatingPoint_features = list(f for f in df.select_dtypes(include='float64') if f not in features_to_ignore)
    temp = list(f for f in df.select_dtypes(include='int64') if f not in features_to_ignore)
    bool_features = [x for x in temp if 'FLAG' in x]
    integer_features = [x for x in temp if x not in bool_features]
    totalCount = len(categorical_features) + len(floatingPoint_features) + len(bool_features) + len(integer_features)
    if (verbose == True):
        print ('Catagorical Features : {}, Floating Point Features : {}, Boolean Features : {}, Integer Features : {}, Total Count : {}'
           .format(len(categorical_features), len(floatingPoint_features), len(bool_features), len(integer_features), totalCount))
    
    return categorical_features, floatingPoint_features, bool_features, integer_features

### Drop list of features from data frame

In [6]:
def drop_features(df, features):
    df = df.drop(columns=[f for f in df.columns if f in features])
    return df

In [7]:
def scale_features(df, feature_list = None, scale_range = (0,1)):
    if(feature_list == None):
        feature_list = [f for f in df.columns if f not in ['TARGET', 'SK_ID_CURR', 'Unnamed :0']]
        
    #Scale each feature to 0-1
    scaler = MinMaxScaler(feature_range = scale_range)
    
    for feature in feature_list:
        if (df[feature].dtype == 'object'):
            continue

        scaler.fit(df[feature].values.reshape(-1,1))
        df[feature] = scaler.transform(df[feature].values.reshape(-1,1))
    
    return df

In [8]:
def impute_features(df, features = 'All'):
    categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(df,
                                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'])
    if(features == 'All'):
        feature_list = [f for f in df.columns if f not in ['TARGET', 'SK_ID_CURR', 'Unnamed :0']]
    elif(features == 'Numerical'):
        feature_list = floatingPoint_feats + integer_feats
    elif(features == 'Categorical'):
        feature_list = categorical_feats + bool_feats
    else:
        raise ValueError('features can either be All, Numerical, Categorical')
        
    #Imputer for numerical features
    imputer = Imputer(strategy = 'median')
    
    for feature in feature_list:
        if (feature in categorical_feats + bool_feats):
            df[feature] = df[feature].fillna(df[feature].value_counts().index[0])
        else:
            imputer.fit(df[feature].values.reshape(-1,1))
            df[feature] = imputer.transform(df[feature].values.reshape(-1,1))

    return df

In [9]:
#Function to caluculate WOE
def calculate_WOE(df, target,feature):
    lst = []
    for i in range(df[feature].nunique(dropna=False)):
        val = list(df[feature].unique())[i]
        lst.append([feature,                                                        # Variable
                    val,                                                            # Value
                    df[df[feature] == val].count()[feature],                        # All
                    df[(df[feature] == val) & (df[target] == 0)].count()[feature],  # Good (target == 0)
                    df[(df[feature] == val) & (df[target] == 1)].count()[feature]]) # Bad (target == 1)

    data = pd.DataFrame(lst, columns=['Variable', 'Value', 'All', 'Good', 'Bad'])
    data['Distribution Good'] = (data['All'] - data['Bad']) / (data['All'].sum() - data['Bad'].sum())
    data['Distribution Bad'] = data['Bad'] / data['Bad'].sum()
    data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])

    data = data.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    return data

In [21]:
def remove_missing_value_columns(df, threshold):
    missing_values = compute_missing_values(df)
    missing_greater = missing_values[missing_values.iloc[:,1] >= threshold]
    df = df.drop(columns=missing_greater.index)
    print('{} columns have been dropped from input data set'.format(len(missing_greater)))
    del missing_values
    gc.collect()
    return df

# Function to calculate missing values by columns 
def compute_missing_values(df, sortAscending = False):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=sortAscending).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing value information
        return mis_val_table_ren_columns

### Feature Selection Utils

#### 1. Variance Selector

Check variance of variable according to a threshold.

**Note**
- Scaling: no
- Impute missing values: yes

In [11]:
def var_selector(input_df, threshold = 0.01):
    df = input_df.copy()
    df = impute_features(df)
    selector = VarianceThreshold(threshold)
    selector.fit_transform(df)
    var_support = selector.get_support()
    var_feature = df.loc[:,var_support].columns.tolist()
    print('Variance : ', str(len(var_feature)), 'selected features out of', str(len(var_support)))
    del df, selector, var_feature
    return var_support

#### 2. Pearson Correlation Selector
Pearson Correlation Coefficient to show linear dependence of variables.

**Note**
- Scaling: no
- Impute missing values: yes

In [12]:
def cor_selector(input_df, labels, drop_ratio = 0.5):
    drop_ratio = max(0,min(drop_ratio, 1))
    df = input_df.copy()
    cor_list = []
    df = impute_features(df, 'All')
    # calculate the correlation with y for each feature
    for i in df.columns.tolist():
        cor = np.corrcoef(df[i], labels)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = df.iloc[:,np.argsort(np.abs(cor_list))[-int(len(cor_list) * drop_ratio):]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in df.columns]
    print('Pearson : ', str(len(cor_feature)), 'selected features out of', str(len(cor_support)))
    del df, cor_feature
    return cor_support

#### 3. Chi2 Selector
Chi-square statistic test to check independance of variable and the class

**Note**
- Scaling: yes
- Impute missing values: yes

In [13]:
def chi_selector(input_df, labels, drop_ratio = 0.5):
    drop_ratio = max(0,min(drop_ratio, 1))
    df = input_df.copy()
    num_feats = int(len(df.columns) * drop_ratio)
    df = impute_features(df, features ='All')    
    df = scale_features(df)
    selector = SelectKBest(chi2, k=num_feats)
    selector.fit(df, labels)
    chi_support = selector.get_support()
    chi_feature = df.loc[:,chi_support].columns.tolist()
    print('CHI2 : ', str(len(chi_feature)), 'selected features out of', str(len(chi_support)))
    del df, selector, chi_feature
    return chi_support

#### 4. RFE Selector (Recursive Feature Selection)
**Note**
- Scaling: yes
- Impute missing values: yes 

In [14]:
def rfe_selector(input_df, labels, drop_ratio = 0.5):
    drop_ratio = max(0,min(drop_ratio, 1))
    df = input_df.copy()
    num_feats = int(len(df.columns) * drop_ratio)
    df = impute_features(df, features ='All')    
    df = scale_features(df)
    selector = RFE(estimator=LogisticRegression(solver='saga', n_jobs=-1), n_features_to_select=num_feats, step=10, verbose=20)
    selector.fit(df, labels)
    rfe_support = selector.get_support()
    rfe_feature = df.loc[:,rfe_support].columns.tolist()
    print('RFE : ', str(len(rfe_feature)), 'selected features out of', str(len(rfe_feature)))
    del df, selector, rfe_feature
    return rfe_support

#### 5. Random Forest Selector
**Note**
- Scaling: no
- Impute missing values: yes

In [15]:
def rf_selector(input_df, labels, drop_ratio = 0.5):
    drop_ratio = max(0,min(drop_ratio, 1))
    df = input_df.copy()
    num_feats = int(len(df.columns) * drop_ratio)
    df = impute_features(df, features ='All')
    selector = SelectFromModel(RandomForestClassifier(n_estimators=150), threshold='1.25*median')
    selector.fit(df, labels)
    rf_support = selector.get_support()
    rf_feature = df.loc[:,rf_support].columns.tolist()
    print('RF : ', str(len(rf_support)), 'selected features out of', str(len(rf_feature)))
    del df, selector, rf_feature
    return rf_support

In [16]:
def select_features(input_df, labels, min_votes = 3, drop_ratio = 0.5):
    
    df = input_df.copy()
    
    var_support = var_selector(df)
    cor_support = cor_selector(df, labels, drop_ratio)
    chi_support = chi_selector(df, labels, drop_ratio)
    rfe_support = rfe_selector(df, labels, drop_ratio)
    rf_support = rf_selector(df, labels, drop_ratio)
    
    pd.set_option('display.max_rows', None)
    
    # put all selection together
    feature_selection_df = pd.DataFrame({'Feature':df.columns, 'Variance': var_support, 'Pearson':cor_support,
                                         'Chi-2':chi_support, 'RFE':rfe_support, 'Random Forest':rf_support})
    ## count the selected times for each feature
    feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
    # display the top 100
    feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
    
    feature_selection_df.index = range(1, len(feature_selection_df)+1)
    
    del df, var_support, cor_support, chi_support, rfe_support, rf_support
    
    return feature_selection_df

### Sampling 

In [17]:
def take_sample(df, labels, stratified = True):
    df['TARGET'] = labels
    num_samples = df.loc[df.TARGET==1].shape[0]
    if (stratified):
        sampling_ratio = num_samples / df.shape[0]
        sample1 = df.loc[df.TARGET==1].sample(frac=sampling_ratio, replace=False)
        print('label 1 sample size:', str(sample1.shape[0]))
        sample0 = df.loc[df.TARGET==0].sample(frac=sampling_ratio, replace=False)
        print('label 0 sample size:', str(sample0.shape[0]))
    else:
        sample1 = df.loc[df.TARGET==1].sample(n=num_samples, replace=False)
        print('label 1 sample size:', str(sample1.shape[0]))
        sample0 = df.loc[df.TARGET==0].sample(n=num_samples, replace=False)
        print('label 0 sample size:', str(sample0.shape[0]))
    
    sampled_df = pd.concat([sample1, sample0], axis=0)
    sampled_labels = sampled_df.pop('TARGET')
    return sampled_df, sampled_labels

In [18]:
dataFolder, outputFolder = setupFolderPaths()

In [19]:
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)

## Data set version 1: with missing values as NaN
### Data set version 1_a: Null values without categorical features

In [22]:
#Version one with null values included and no categorical features.
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
input_df = remove_missing_value_columns(input_df, 40)
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'])
input_df = drop_features(input_df, categorical_feats + bool_feats)

input_df.info()
del input_df, categorical_feats, floatingPoint_feats, bool_feats, integer_feats
gc.collect()

Your selected dataframe has 122 columns.
There are 69 columns that have missing values.
49 columns have been dropped from input data set
<class 'pandas.core.frame.DataFrame'>
Int64Index: 307507 entries, 0 to 307510
Data columns (total 35 columns):
SK_ID_CURR                     307507 non-null int64
TARGET                         307507 non-null int64
CNT_CHILDREN                   307507 non-null int64
AMT_INCOME_TOTAL               307502 non-null float64
AMT_CREDIT                     307507 non-null float64
AMT_ANNUITY                    307495 non-null float64
AMT_GOODS_PRICE                307229 non-null float64
REGION_POPULATION_RELATIVE     307507 non-null float64
DAYS_BIRTH                     307507 non-null int64
DAYS_EMPLOYED                  252133 non-null float64
DAYS_REGISTRATION              307507 non-null float64
DAYS_ID_PUBLISH                307507 non-null int64
CNT_FAM_MEMBERS                307505 non-null float64
REGION_RATING_CLIENT           307507 non-null 

42

### Data set version 1_b: Null values with categorical features OHE

In [23]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
input_df = remove_missing_value_columns(input_df, 40)
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'])

input_df = pd.get_dummies(input_df, columns=categorical_feats, dtype=np.int64, dummy_na= True)

input_df.info()
del input_df, categorical_feats, floatingPoint_feats, bool_feats, integer_feats
gc.collect()

Your selected dataframe has 122 columns.
There are 69 columns that have missing values.
49 columns have been dropped from input data set
<class 'pandas.core.frame.DataFrame'>
Int64Index: 307507 entries, 0 to 307510
Columns: 196 entries, SK_ID_CURR to ORGANIZATION_TYPE_nan
dtypes: float64(21), int64(175)
memory usage: 462.2 MB


49

### Data set version 1_c: Null values with categorical feature WOE encoded

In [24]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
input_df = remove_missing_value_columns(input_df, 40)
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'])
for cat_feature in categorical_feats:
    WoE_df = calculate_WOE(input_df, 'TARGET', cat_feature)
    input_df[cat_feature] = input_df[cat_feature].replace(WoE_df.set_index('Value')['WoE'])

input_df.info()
del input_df, categorical_feats, floatingPoint_feats, bool_feats, integer_feats, WoE_df
gc.collect()

Your selected dataframe has 122 columns.
There are 69 columns that have missing values.
49 columns have been dropped from input data set
<class 'pandas.core.frame.DataFrame'>
Int64Index: 307507 entries, 0 to 307510
Data columns (total 73 columns):
SK_ID_CURR                     307507 non-null int64
TARGET                         307507 non-null int64
NAME_CONTRACT_TYPE             307507 non-null float64
CODE_GENDER                    307507 non-null float64
FLAG_OWN_CAR                   307507 non-null float64
FLAG_OWN_REALTY                307507 non-null float64
CNT_CHILDREN                   307507 non-null int64
AMT_INCOME_TOTAL               307502 non-null float64
AMT_CREDIT                     307507 non-null float64
AMT_ANNUITY                    307495 non-null float64
AMT_GOODS_PRICE                307229 non-null float64
NAME_TYPE_SUITE                306215 non-null float64
NAME_INCOME_TYPE               307507 non-null float64
NAME_EDUCATION_TYPE            307507 non-n

70

## Data set version 2: With missing value entries omitted
### Data set version 2_a: Without categorical features 

In [25]:
#Version with no null values and categorical features.
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
input_df = remove_missing_value_columns(input_df, 40)
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'])
input_df = drop_features(input_df, categorical_feats + bool_feats)
input_df = input_df.dropna()

input_df.info()
del input_df, categorical_feats, floatingPoint_feats, bool_feats, integer_feats
gc.collect()

Your selected dataframe has 122 columns.
There are 69 columns that have missing values.
49 columns have been dropped from input data set
<class 'pandas.core.frame.DataFrame'>
Int64Index: 201083 entries, 0 to 307510
Data columns (total 35 columns):
SK_ID_CURR                     201083 non-null int64
TARGET                         201083 non-null int64
CNT_CHILDREN                   201083 non-null int64
AMT_INCOME_TOTAL               201083 non-null float64
AMT_CREDIT                     201083 non-null float64
AMT_ANNUITY                    201083 non-null float64
AMT_GOODS_PRICE                201083 non-null float64
REGION_POPULATION_RELATIVE     201083 non-null float64
DAYS_BIRTH                     201083 non-null int64
DAYS_EMPLOYED                  201083 non-null float64
DAYS_REGISTRATION              201083 non-null float64
DAYS_ID_PUBLISH                201083 non-null int64
CNT_FAM_MEMBERS                201083 non-null float64
REGION_RATING_CLIENT           201083 non-null 

42

### Data set version 2_b: With categorical features OHE

In [26]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
input_df = remove_missing_value_columns(input_df, 40)
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'])
input_df = input_df.dropna()

input_df = pd.get_dummies(input_df, columns=categorical_feats, dtype=np.int64, dummy_na= True)

input_df.info()
del input_df, categorical_feats, floatingPoint_feats, bool_feats, integer_feats
gc.collect()

Your selected dataframe has 122 columns.
There are 69 columns that have missing values.
49 columns have been dropped from input data set
<class 'pandas.core.frame.DataFrame'>
Int64Index: 167725 entries, 0 to 307510
Columns: 192 entries, SK_ID_CURR to ORGANIZATION_TYPE_nan
dtypes: float64(21), int64(171)
memory usage: 247.0 MB


49

### Data set version 2_c: With categorical features WOE Encoded

In [27]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
input_df = remove_missing_value_columns(input_df, 40)
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'])
input_df = input_df.dropna()

for cat_feature in categorical_feats:
    WoE_df = calculate_WOE(input_df, 'TARGET', cat_feature)
    input_df[cat_feature] = input_df[cat_feature].replace(WoE_df.set_index('Value')['WoE'])

input_df.info()
del input_df, categorical_feats, floatingPoint_feats, bool_feats, integer_feats, WoE_df
gc.collect()

Your selected dataframe has 122 columns.
There are 69 columns that have missing values.
49 columns have been dropped from input data set
<class 'pandas.core.frame.DataFrame'>
Int64Index: 167725 entries, 0 to 307510
Data columns (total 73 columns):
SK_ID_CURR                     167725 non-null int64
TARGET                         167725 non-null int64
NAME_CONTRACT_TYPE             167725 non-null float64
CODE_GENDER                    167725 non-null float64
FLAG_OWN_CAR                   167725 non-null float64
FLAG_OWN_REALTY                167725 non-null float64
CNT_CHILDREN                   167725 non-null int64
AMT_INCOME_TOTAL               167725 non-null float64
AMT_CREDIT                     167725 non-null float64
AMT_ANNUITY                    167725 non-null float64
AMT_GOODS_PRICE                167725 non-null float64
NAME_TYPE_SUITE                167725 non-null float64
NAME_INCOME_TYPE               167725 non-null float64
NAME_EDUCATION_TYPE            167725 non-n

168

## Data set version 3: With missing values imputed
### Data set version 3_a: Without categorical features

In [25]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'])
input_df = drop_features(input_df, categorical_feats + bool_feats)
input_df = impute_features(input_df, 'All')
input_df.info()
del input_df, categorical_feats, floatingPoint_feats, bool_feats, integer_feats
gc.collect()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307507 entries, 0 to 307510
Data columns (total 80 columns):
SK_ID_CURR                      307507 non-null int64
TARGET                          307507 non-null int64
CNT_CHILDREN                    307507 non-null float64
AMT_INCOME_TOTAL                307507 non-null float64
AMT_CREDIT                      307507 non-null float64
AMT_ANNUITY                     307507 non-null float64
AMT_GOODS_PRICE                 307507 non-null float64
REGION_POPULATION_RELATIVE      307507 non-null float64
DAYS_BIRTH                      307507 non-null float64
DAYS_EMPLOYED                   307507 non-null float64
DAYS_REGISTRATION               307507 non-null float64
DAYS_ID_PUBLISH                 307507 non-null float64
OWN_CAR_AGE                     307507 non-null float64
CNT_FAM_MEMBERS                 307507 non-null float64
REGION_RATING_CLIENT            307507 non-null float64
REGION_RATING_CLIENT_W_CITY     307507 non-null float

28

### Data set version 3_b: With categorical features OHE

In [26]:
#Nominal variables imputed by mode values and Numerical variables imputed by mean
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'], verbose = True)
    
input_df = impute_features(input_df, 'All')
    
input_df = pd.get_dummies(input_df, columns=categorical_feats, dtype=np.int64)
    
input_df.info()
del input_df
gc.collect()

Catagorical Features : 16, Floating Point Features : 66, Boolean Features : 26, Integer Features : 12, Total Count : 120
<class 'pandas.core.frame.DataFrame'>
Int64Index: 307507 entries, 0 to 307510
Columns: 245 entries, SK_ID_CURR to EMERGENCYSTATE_MODE_Yes
dtypes: float64(78), int64(167)
memory usage: 577.1 MB


28

### Data set version 3_c: With categorical features WOE encoding

In [27]:
#Nominal variables imputed by mode values and Numerical variables imputed by mean
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'])
input_df = impute_features(input_df, 'All')
    
#Replace categorical columns with WOE columns
for cat_feature in categorical_feats:
    WoE_df = calculate_WOE(input_df, 'TARGET', cat_feature)
    input_df[cat_feature] = input_df[cat_feature].replace(WoE_df.set_index('Value')['WoE'])
    del WoE_df
    
input_df.info()
del input_df, categorical_feats, floatingPoint_feats, bool_feats, integer_feats
gc.collect()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307507 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(94), int64(28)
memory usage: 288.6 MB


35

## Version 4: Imputed and Scaled Numerical Features
### Data set version 4_a: Without categorical features

In [28]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'])
input_df = drop_features(input_df, categorical_feats + bool_feats)
input_df = impute_features(input_df, 'All')
input_df = scale_features(input_df, integer_feats + floatingPoint_feats)
input_df.info()

del input_df, categorical_feats, floatingPoint_feats, bool_feats, integer_feats
gc.collect()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307507 entries, 0 to 307510
Data columns (total 80 columns):
SK_ID_CURR                      307507 non-null int64
TARGET                          307507 non-null int64
CNT_CHILDREN                    307507 non-null float64
AMT_INCOME_TOTAL                307507 non-null float64
AMT_CREDIT                      307507 non-null float64
AMT_ANNUITY                     307507 non-null float64
AMT_GOODS_PRICE                 307507 non-null float64
REGION_POPULATION_RELATIVE      307507 non-null float64
DAYS_BIRTH                      307507 non-null float64
DAYS_EMPLOYED                   307507 non-null float64
DAYS_REGISTRATION               307507 non-null float64
DAYS_ID_PUBLISH                 307507 non-null float64
OWN_CAR_AGE                     307507 non-null float64
CNT_FAM_MEMBERS                 307507 non-null float64
REGION_RATING_CLIENT            307507 non-null float64
REGION_RATING_CLIENT_W_CITY     307507 non-null float

28

### Data set version 4_b: With categorical features OHE

In [29]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'], verbose = True)
    
input_df = impute_features(input_df, 'All')
input_df = scale_features(input_df, integer_feats + floatingPoint_feats)
    
input_df = pd.get_dummies(input_df, columns=categorical_feats, dtype=np.int64)
    
input_df.info()
del input_df
gc.collect()

Catagorical Features : 16, Floating Point Features : 66, Boolean Features : 26, Integer Features : 12, Total Count : 120
<class 'pandas.core.frame.DataFrame'>
Int64Index: 307507 entries, 0 to 307510
Columns: 245 entries, SK_ID_CURR to EMERGENCYSTATE_MODE_Yes
dtypes: float64(78), int64(167)
memory usage: 577.1 MB


28

### Data set version 4_c: With categorical features WoE Encoded

In [30]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'])
input_df = impute_features(input_df, 'All')
input_df = scale_features(input_df, integer_feats + floatingPoint_feats)
    
#Replace categorical columns with WOE columns
for cat_feature in categorical_feats:
    WoE_df = calculate_WOE(input_df, 'TARGET', cat_feature)
    input_df[cat_feature] = input_df[cat_feature].replace(WoE_df.set_index('Value')['WoE'])
    del WoE_df
    
input_df.info()
del input_df
gc.collect()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307507 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(94), int64(28)
memory usage: 288.6 MB


28

## Version 5: Impute and Scaled Numerical Features with Feature Selection
### Data set version 5_a: Without categorical features

In [31]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
input_df = input_df.drop(columns = 'SK_ID_CURR')
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'], verbose=True)
labels = input_df.pop('TARGET')

input_df = drop_features(input_df, categorical_feats)

feature_df = select_features(input_df, labels, min_votes = 4, drop_ratio = 0.5)

selected_features = feature_df[feature_df.Total >= 3].Feature.values

input_df = input_df[selected_features]
input_df = impute_features(input_df, features ='All')    
input_df = scale_features(input_df)

input_df['TARGET'] = labels 
input_df.info()
del input_df
gc.collect()

Catagorical Features : 16, Floating Point Features : 66, Boolean Features : 26, Integer Features : 12, Total Count : 120
Variance :  45 selected features out of 104
Pearson :  52 selected features out of 104




CHI2 :  52 selected features out of 104




Fitting estimator with 104 features.




Fitting estimator with 94 features.




Fitting estimator with 84 features.
Fitting estimator with 74 features.
Fitting estimator with 64 features.
Fitting estimator with 54 features.
RFE :  52 selected features out of 52
RF :  104 selected features out of 37




<class 'pandas.core.frame.DataFrame'>
Int64Index: 307507 entries, 0 to 307510
Data columns (total 48 columns):
EXT_SOURCE_3                   307507 non-null float64
EXT_SOURCE_2                   307507 non-null float64
EXT_SOURCE_1                   307507 non-null float64
DAYS_LAST_PHONE_CHANGE         307507 non-null float64
DAYS_EMPLOYED                  307507 non-null float64
CNT_CHILDREN                   307507 non-null float64
AMT_GOODS_PRICE                307507 non-null float64
AMT_CREDIT                     307507 non-null float64
LIVINGAREA_AVG                 307507 non-null float64
HOUR_APPR_PROCESS_START        307507 non-null float64
FLOORSMAX_MODE                 307507 non-null float64
FLOORSMAX_AVG                  307507 non-null float64
FLAG_EMP_PHONE                 307507 non-null float64
FLAG_DOCUMENT_3                307507 non-null float64
ELEVATORS_AVG                  307507 non-null float64
DEF_60_CNT_SOCIAL_CIRCLE       307507 non-null float64
DEF_30_CN

21

### Data set version 5_b: With categorical features OHE

In [32]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
input_df = input_df.drop(columns = 'SK_ID_CURR')
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'], verbose=True)
labels = input_df.pop('TARGET')

input_df = pd.get_dummies(input_df, columns=categorical_feats, dtype=np.int64)

feature_df = select_features(input_df, labels, min_votes = 4, drop_ratio = 0.5)

selected_features = feature_df[feature_df.Total >= 3].Feature.values
input_df = input_df[selected_features]

input_df = impute_features(input_df, features ='All')    
input_df = scale_features(input_df)

input_df['TARGET'] = labels 
input_df.info()
del input_df
gc.collect()

Catagorical Features : 16, Floating Point Features : 66, Boolean Features : 26, Integer Features : 12, Total Count : 120
Variance :  119 selected features out of 243
Pearson :  121 selected features out of 243




CHI2 :  121 selected features out of 243




Fitting estimator with 243 features.




Fitting estimator with 233 features.




Fitting estimator with 223 features.




Fitting estimator with 213 features.




Fitting estimator with 203 features.




Fitting estimator with 193 features.




Fitting estimator with 183 features.
Fitting estimator with 173 features.
Fitting estimator with 163 features.
Fitting estimator with 153 features.
Fitting estimator with 143 features.
Fitting estimator with 133 features.
Fitting estimator with 123 features.
RFE :  121 selected features out of 121
RF :  243 selected features out of 104




<class 'pandas.core.frame.DataFrame'>
Int64Index: 307507 entries, 0 to 307510
Columns: 120 entries, REG_CITY_NOT_LIVE_CITY to TARGET
dtypes: float64(119), int64(1)
memory usage: 283.9 MB


21

### Data set version 5_c: With categorical features WoE Encoded

In [33]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
input_df = input_df.drop(columns = 'SK_ID_CURR')
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'], verbose=True)

#Replace categorical columns with WOE columns
for cat_feature in categorical_feats:
    WoE_df = calculate_WOE(input_df, 'TARGET', cat_feature)
    input_df[cat_feature] = input_df[cat_feature].replace(WoE_df.set_index('Value')['WoE'])
    del WoE_df
    
labels = input_df.pop('TARGET')
feature_df = select_features(input_df, labels, min_votes = 4, drop_ratio = 0.5)

selected_features = feature_df[feature_df.Total >= 3].Feature.values
input_df = input_df[selected_features]

input_df = impute_features(input_df, features ='All')    
input_df = scale_features(input_df)

input_df['TARGET'] = labels 
input_df.info()
del input_df
gc.collect()

Catagorical Features : 16, Floating Point Features : 66, Boolean Features : 26, Integer Features : 12, Total Count : 120
Variance :  53 selected features out of 120
Pearson :  60 selected features out of 120




CHI2 :  60 selected features out of 120




Fitting estimator with 120 features.




Fitting estimator with 110 features.




Fitting estimator with 100 features.




Fitting estimator with 90 features.
Fitting estimator with 80 features.
Fitting estimator with 70 features.
RFE :  60 selected features out of 60
RF :  120 selected features out of 44




<class 'pandas.core.frame.DataFrame'>
Int64Index: 307507 entries, 0 to 307510
Data columns (total 59 columns):
ORGANIZATION_TYPE              307507 non-null float64
OCCUPATION_TYPE                307507 non-null float64
NAME_INCOME_TYPE               307507 non-null float64
NAME_EDUCATION_TYPE            307507 non-null float64
EXT_SOURCE_3                   307507 non-null float64
EXT_SOURCE_2                   307507 non-null float64
EXT_SOURCE_1                   307507 non-null float64
DAYS_LAST_PHONE_CHANGE         307507 non-null float64
DAYS_ID_PUBLISH                307507 non-null float64
DAYS_EMPLOYED                  307507 non-null float64
CNT_CHILDREN                   307507 non-null float64
AMT_GOODS_PRICE                307507 non-null float64
AMT_CREDIT                     307507 non-null float64
REGION_RATING_CLIENT_W_CITY    307507 non-null float64
NAME_HOUSING_TYPE              307507 non-null float64
NAME_FAMILY_STATUS             307507 non-null float64
LIVINGARE

21

## Version 6: Balanced and Imbalanced Sampling
### Data set version 6_a: Balanced Samples with categorical features OHE

In [34]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
input_df = input_df.drop(columns = 'SK_ID_CURR')
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'], verbose=True)
labels = input_df.pop('TARGET')

input_df = pd.get_dummies(input_df, columns=categorical_feats, dtype=np.int64)

input_df = impute_features(input_df, features ='All')    
input_df = scale_features(input_df)

sampled_df , sampled_labels = take_sample(input_df, labels, stratified = False)
sampled_df['TARGET'] = sampled_labels
sampled_df.info()
del input_df, sampled_df, labels
gc.collect()

Catagorical Features : 16, Floating Point Features : 66, Boolean Features : 26, Integer Features : 12, Total Count : 120




label 1 sample size: 24825
label 0 sample size: 24825
<class 'pandas.core.frame.DataFrame'>
Int64Index: 49650 entries, 264056 to 77514
Columns: 244 entries, CNT_CHILDREN to TARGET
dtypes: float64(243), int64(1)
memory usage: 92.8 MB


56

### Data set version 6_b: Unbalanced sample with categorical features OHE

In [35]:
# Read Training data
input_df = readFile(dataFolder + '\\application_train.csv')
input_df = replace_outliers(input_df)
input_df = input_df.drop(columns = 'SK_ID_CURR')
categorical_feats, floatingPoint_feats, bool_feats, integer_feats = identify_feature_types(input_df,
                                                                        ['TARGET', 'SK_ID_CURR', 'Unnamed :0'], verbose=True)
labels = input_df.pop('TARGET')

input_df = pd.get_dummies(input_df, columns=categorical_feats, dtype=np.int64)

input_df = impute_features(input_df, features ='All')    
input_df = scale_features(input_df)

sampled_df , sampled_labels = take_sample(input_df, labels)
sampled_df['TARGET'] = sampled_labels
sampled_df.info()
del input_df, sampled_df, labels
gc.collect()

Catagorical Features : 16, Floating Point Features : 66, Boolean Features : 26, Integer Features : 12, Total Count : 120




label 1 sample size: 2004
label 0 sample size: 22821
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24825 entries, 253126 to 46676
Columns: 244 entries, CNT_CHILDREN to TARGET
dtypes: float64(243), int64(1)
memory usage: 46.4 MB


56