In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas_profiling
import pandas_summary


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns


import gc
import time
import psutil
import random as rn


from tqdm import tqdm
from contextlib import contextmanager
from tensorflow import set_random_seed

# Seed Python's `random`, NumPy and TensorFlow so stochastic steps are repeatable.
rn.seed(5)
np.random.seed(7)
set_random_seed(2)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so assigning it
# here does not change string hashing in the already-running kernel; it is only
# inherited by child processes started afterwards.
os.environ['PYTHONHASHSEED'] = '3'

# Functions

In [None]:
def data_desc(df):
    """Print a quick textual overview of a DataFrame.

    Prints, in order: ``df.info()``, ``df.describe()``, the column index,
    the row count, the first and last five rows, and the number of columns
    per dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    None
    """
    # info() writes its report itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    df.info()
    print(df.describe())
    print(df.columns)
    # shape[0] counts every row; df.count() would skip NaN values.
    x_rowcount = df.shape[0]
    print('There are ', x_rowcount, "rows in the file\n")
    print("\n", df.head(), "\n", df.tail())
    # Previously computed and thrown away; print it so it is actually visible.
    print(df.dtypes.value_counts())
    return

def data_target(df,target):
    """Print how many rows carry each distinct value of column ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    target : str
        Name of the column whose value frequencies are printed.

    Returns
    -------
    None
    """
    class_counts = df[target].value_counts()
    print(class_counts)


def data_cat_val (df):
    """Fill missing values in object columns and report their cardinality.

    Every column of dtype ``object`` gets its missing entries replaced by the
    string ``'missing'``. The number of distinct values per object column
    (after filling) is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with missing object values filled.
    """
    filled = df.copy()
    for col in filled.select_dtypes(include = ['object']).columns:
        # fillna returns a new Series; the result has to be assigned back,
        # otherwise the call is a no-op.
        filled[col] = filled[col].fillna('missing')
    print (filled.select_dtypes(include = ['object']).apply(pd.Series.nunique, axis =0))
    return filled


def data_corr (df,target):
    """Correlate every numeric column with ``target`` and print the extremes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target`` and the candidate features.
    target : str
        Name of a numeric column in ``df``.

    Returns
    -------
    pandas.Series
        Correlation of each numeric/bool column with ``target``, sorted
        ascending (the target itself is included).

    Raises
    ------
    KeyError
        If ``target`` is not among the numeric/bool columns of ``df``.
    """
    # Restrict to numeric/bool columns explicitly: recent pandas versions no
    # longer drop non-numeric columns silently inside corr() and raise instead.
    numeric_df = df.select_dtypes(include=['number', 'bool'])

    # Find correlations with the target and sort
    correlations = numeric_df.corr()[target].sort_values()

    # Display correlations
    print('Most Positive Correlations:\n', correlations.tail(15))
    print('\nMost Negative Correlations:\n', correlations.head(15))
    return correlations

# Function to calculate missing values by column 
def data_mis_val(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has a non-zero share of missing values, with
        columns ``'Missing Values'`` (count) and ``'% of Total Values'``
        (percentage rounded to one decimal), sorted by percentage descending.
        A short summary line is also printed.
    """
    pct_label = '% of Total Values'

    # Count once and derive the percentage from the same counts.
    missing_counts = df.isnull().sum()
    missing_share = 100 * missing_counts / len(df)

    summary = pd.DataFrame({'Missing Values': missing_counts, pct_label: missing_share})

    # Keep only columns with something missing, worst offenders first.
    has_missing = summary[pct_label] != 0
    summary = summary[has_missing].sort_values(pct_label, ascending=False).round(1)

    n_total = str(df.shape[1])
    n_affected = str(summary.shape[0])
    print("Your selected dataframe has " + n_total + " columns.\n"
          "There are " + n_affected + " columns that have missing values.")

    return summary

def data_encoder(df):
    """Label-encode, in place, every object column with at most two distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose qualifying columns are overwritten with integer codes.

    Returns
    -------
    None
        ``df`` is modified in place; the number of encoded columns is printed.
    """
    encoder = LabelEncoder()
    encoded_cols = []

    for name in df:
        is_object = df[name].dtype == 'object'
        if not is_object:
            continue
        # NOTE(review): unique() also counts NaN as a value, so a two-category
        # column with missing entries is skipped - confirm this is intended.
        if len(df[name].unique()) > 2:
            continue
        # Fit on the column, then replace it with its integer codes.
        encoder.fit(df[name])
        df[name] = encoder.transform(df[name])
        encoded_cols.append(name)

    print('%d columns were label encoded.' % len(encoded_cols))
    return

def data_dummy(df):
    """One-hot encode the categorical columns of ``df``.

    Prints the last rows and the shape of the encoded frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.get_dummies(df)``. Earlier versions returned
        ``None``, which discarded the encoded frame.
    """
    df = pd.get_dummies(df)
    print (df.tail())
    print('Training Features shape: ', df.shape)
    # Return the encoded frame: rebinding the local name alone never reaches the caller.
    return df

def data_align (df1,target, df2):
    """Keep only the columns shared by two frames, preserving the target.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Training frame; must contain column ``target``.
    target : str
        Name of the label column in ``df1``.
    df2 : pandas.DataFrame
        Testing frame.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df1`` and ``df2`` restricted to their common columns, with
        ``target`` re-attached to the first. Earlier versions returned
        ``None``, which discarded the aligned frames. The inputs are not
        modified.
    """
    train_labels = df1[target]
    # Align the training and testing data, keep only columns present in both dataframes
    df1, df2 = df1.align(df2, join = 'inner', axis = 1)

    # Add the target back in (the inner join drops it when df2 lacks the column)
    df1[target] = train_labels

    print('Training Features shape: ', df1.shape)
    print('Testing Features shape: ', df2.shape)
    return df1, df2

def plot_kde(df,target,feature):
    """Overlay kernel density estimates of ``feature`` for target 0 and target 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    target : str
        Name of the column compared against 0 and 1 to split the rows.
    feature : str
        Name of the column whose distribution is drawn.

    Returns
    -------
    None
        Draws on a new 6x4 matplotlib figure.
    """
    plt.figure(figsize = (6, 4))

    # One curve per target class, drawn in the order 0 then 1.
    curves = ((0, 'target == 0'), (1, 'target == 1'))
    for class_value, curve_label in curves:
        class_rows = df[target] == class_value
        sns.kdeplot(df.loc[class_rows, feature], label = curve_label)

    # Axis labels and title
    plt.xlabel(feature)
    plt.ylabel('Density')
    plt.title('Distribution of '+ feature)


def plot_hist (df,target):
    """Draw a 20-bin histogram of column ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column.
    target : str
        Name of the column to plot; also used as the plot title.

    Returns
    -------
    None
        Draws on the current matplotlib figure. The 'fivethirtyeight' style
        is applied through ``plt.style.use``.
    """
    plt.style.use('fivethirtyeight')

    column_values = df[target]
    plt.hist(column_values, edgecolor = 'k', bins = 20)

    plt.title(target)
    plt.xlabel('Values')
    plt.ylabel('Count')

def cal_woe(df, target):
    """Compute Weight of Evidence (WOE) columns and Information Value (IV) per feature.

    For every non-constant column, rows are grouped by the column's values and,
    per group::

        WOE = ln( ((non_events + 0.5) / total_non_events)
                  / ((events + 0.5) / total_events) )
        IV  = sum over groups of WOE * (non_events / total_non_events
                                        - events / total_events)

    The 0.5 added to each count keeps the logarithm finite for groups that
    contain only events or only non-events.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame. It is not modified.
    target : pandas.Series
        Event indicator whose sum is taken as the number of events. It is
        matched to ``df`` rows by index label.
        NOTE(review): ``merge`` resets the working frame's index after the
        first feature, so both inputs presumably need to share a default
        RangeIndex - confirm against callers.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        * ``df`` without its constant columns and with one ``WOE_<col>``
          column appended per processed feature;
        * a frame with columns ``feature`` and ``IV``, sorted by IV descending.
    """
    num_events = target.sum()
    num_non_events = target.shape[0] - num_events

    feature_list = []
    feature_iv_list = []
    for col in df.columns:
        if df[col].unique().shape[0] == 1:
            # Rebind instead of `del df[col]` so the caller's frame stays intact,
            # and skip the column: it no longer exists, so processing it would
            # raise KeyError.
            df = df.drop(columns=[col])
            print('remove constant col', col)
            continue

        with timer('cope with %s' % col):
            feature_list.append(col)

            woe_df = pd.DataFrame()
            woe_df[col] = df[col]
            woe_df['target'] = target

            # Per-category event / non-event counts (rows with a missing
            # category are left out by groupby).
            grouped_target = woe_df.groupby(col)['target']
            events = grouped_target.sum()
            non_events = grouped_target.count() - events

            # Vectorised over categories instead of a row-wise apply.
            woe = np.log(((non_events + 0.5) / num_non_events) / ((events + 0.5) / num_events))
            iv = woe * (non_events / num_non_events - events / num_events)
            feature_iv_list.append(iv.sum())

            # Attach each row's WOE by looking up its category.
            woe_table = woe.rename('WOE_' + col).reset_index()
            df = df.merge(woe_table, how='left', on=col)

    iv_df = pd.DataFrame({'feature': feature_list, 'IV': feature_iv_list})
    iv_df = iv_df.sort_values(by='IV', ascending=False)
    return df, iv_df

# Nesting level of the currently active timer() blocks; -1 means none is active.
timer_depth = -1
@contextmanager
def timer(name):
    """Context manager that reports wall time and process memory for a block.

    On normal exit it prints a line with ``name``, the elapsed seconds and the
    first field of ``psutil.Process(...).memory_info()`` converted to GB,
    prefixed with ``'----'`` once per nesting level. The outermost timer also
    prints a blank separator.

    If the body raises, nothing is printed and the exception propagates, but
    the nesting depth is still restored so later timers are indented
    correctly.

    Parameters
    ----------
    name : str
        Label shown in the report line.
    """
    global timer_depth
    t0 = time.time()
    timer_depth += 1
    try:
        yield
        pid = os.getpid()
        py = psutil.Process(pid)
        memoryUse = py.memory_info()[0] / 2. ** 30
        print('----'*timer_depth + f'>>[{name}] done in {time.time() - t0:.0f} s ---> memory used: {memoryUse:.4f} GB', '')
        if(timer_depth == 0):
            print('\n')
    finally:
        # Always undo the increment; without this a failing cell would leave
        # the depth counter permanently off by one for the rest of the session.
        timer_depth -= 1

In [None]:
# Read Data

In [None]:
# Training data: read application_train.csv from the input directory into df1
df1 = pd.read_csv('../input/application_train.csv')

In [None]:
# Print info, summary statistics, columns, row count and head/tail of the training frame
data_desc(df1)

In [None]:
# `pandas_summary` is imported as a module, and a module is not callable;
# the summary is produced by its DataFrameSummary class.
pandas_summary.DataFrameSummary(df1).summary()

In [None]:
# Show the last five rows of the training frame
df1.tail()

In [None]:
# Class balance of the label column: value counts first, then a histogram
data_target(df1,'TARGET')

plot_hist(df1,'TARGET')

In [None]:
# Table of columns with missing values, sorted by percentage missing (descending)
data_mis_val(df1)

In [None]:
# Print the number of distinct values in each object-typed column
df1 = data_cat_val(df1)


In [None]:
# Separate the label from the features, then compute WOE columns and per-feature IV.
# NOTE: this cell cannot be re-run on its own, because pop() removes TARGET from
# df1 in place and df1 is then rebound to the WOE-augmented frame; reload df1 first.
df1_target = df1.pop('TARGET')
with timer('calculate WOE and IV'):
    df1, iv_df = cal_woe(df1, df1_target)

In [None]:
# Inspect the last 100 rows of the frame returned by cal_woe
df1.tail(100)