# ATTENTION!

## Ask yourself the following questions before moving forward. 
 -  ### Have I thought about the client's problem and their objectives?
 -  ### Have I researched the domain space and/or similar projects?
 -  ### Have I gained a bird's-eye view of the data (scope, time frames, etc.)?
 

## If you answered yes to all three (ideally), then it's time for EDA!
 


In [None]:
#%% get libraries
import os  # NOTE(review): not referenced in the visible cells
import numpy as np
import pandas as pd
# Show up to 300 rows and 100 columns whenever a frame is displayed.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100

import matplotlib.pyplot as plt  # used by draw_feature_distribution

import holoviews as hv
# Select bokeh as the HoloViews plotting backend.
hv.extension('bokeh')

from pandas_profiling import ProfileReport
import missingno as msno
from eda.discover import discover  # NOTE(review): not referenced in the visible cells


In [None]:
#%% load the raw application tables and take a first look
train = pd.read_csv(
    './input/raw/application_train.csv',
    index_col='SK_ID_CURR',
)
test = pd.read_csv(
    './input/raw/application_test.csv',
    index_col='SK_ID_CURR',
)
# Test rows get the placeholder TARGET 2 so they stay distinguishable
# from the 0/1 train rows once both tables are stacked.
test['TARGET'] = 2

# Train shape, sum of the train TARGET column, test shape.
summary = (train.shape, '\n', train['TARGET'].sum(), '\n', test.shape)
print(*summary)

# Stack both tables and order the result by SK_ID_CURR (the index).
traintest = (
    pd.concat([train, test], sort=False)
    .sort_index()
)
traintest.head()


In [None]:
# Same first rows, transposed so every column name is listed on its own line.
traintest.head().transpose()

In [None]:
#%% for demo only: reduce features...
# NOTE: the order matters - a later cell selects columns of `ttsamp` by position.
keepers = [
    'TARGET', 'AMT_INCOME_TOTAL',
    'AMT_CREDIT', 'AMT_ANNUITY',
    'AMT_GOODS_PRICE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'REGION_POPULATION_RELATIVE',
    'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
    'OWN_CAR_AGE', 'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
    'ORGANIZATION_TYPE', 'EXT_SOURCE_1',
    'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'BASEMENTAREA_AVG', 'YEARS_BUILD_AVG',
    'LANDAREA_AVG', 'BASEMENTAREA_MODE', 'CODE_GENDER',
    'TOTALAREA_MODE', 'DAYS_LAST_PHONE_CHANGE',
    'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
]

trainlite = train[keepers]
traintestlite = traintest[keepers]

# reset_index() turns SK_ID_CURR into a regular column of the sample.
# A fixed random_state keeps the 10% sample - and therefore every check and
# plot derived from it - identical from one run to the next.
ttsamp = (
    traintestlite.reset_index()
    .sample(frac=0.1, random_state=0)
    .sort_index()
)
ttsamp.shape


In [None]:
#%% automated description of the combined (train + test) reduced dataset
profile_report = ProfileReport(traintestlite)
profile_report


In [None]:
%%opts Curve [height=200 width=500]

# Look for patterns along the id order: the id itself, the credit amount and
# the label, each drawn against SK_ID_CURR.
id_col = 'SK_ID_CURR'
labelled = ttsamp.loc[ttsamp['TARGET'] < 2]  # leaves out rows whose TARGET is 2

curve1 = hv.Curve(ttsamp, id_col, id_col)
curve2 = hv.Curve(ttsamp, id_col, 'AMT_CREDIT')
curve3 = hv.Curve(labelled, id_col, 'TARGET')

# Stack the three curves in a single column.
patterns = (curve1 + curve2 + curve3).cols(1)
patterns

In [None]:
#%% check duplicate rows and constant columns
# SK_ID_CURR is a regular column of ttsamp (it was moved out of the index by
# reset_index()). Left in, the identifier makes rows look distinct, so it is
# dropped together with TARGET before looking for duplicated feature rows.
duperows = ttsamp.drop(['SK_ID_CURR', 'TARGET'], axis=1).duplicated(keep=False).sum()
# Number of distinct non-null values per column; exactly one means constant.
consts = ttsamp.nunique(axis=0)
print(duperows, '\n', consts[consts == 1])

In [None]:
#%% check scatters with pandas
# pd.plotting.scatter_matrix(ttsamp)


In [None]:
%%opts Scatter [tools=['box_select'] border=0] Histogram {+axiswise}
%%output size=150

# Scatter matrix of the sample columns at positions 2-5
# (with the current `keepers` order these are the four AMT_* columns).
pair_cols = ttsamp.columns[2:6]
table = hv.Table(ttsamp[pair_cols])
matrix = hv.operation.gridmatrix(table)
matrix

In [None]:
%%opts Distribution  [height=240 width=240]

import warnings

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action="ignore", category=FutureWarning)

#%% check distros for numericals
# The float columns are picked from the sample, but each overlay is drawn from
# the full train and test frames.
features_only = ttsamp.drop(columns='TARGET')
allnums = features_only.select_dtypes(include='float')
plot_list = [
    hv.Distribution(train[name]) * hv.Distribution(test[name])
    for name in allnums.columns
]
pltmat = hv.Layout(plot_list)
pltmat

In [None]:
# find outliers in train and test
# code borrowed from Kaggle kernels



def rand_jitter(arr):
    """Return `arr` plus one independent standard-normal draw per element."""
    noise = np.random.standard_normal(size=len(arr))
    return arr + noise

def draw_feature_distribution(df, column):
    """Draw a jittered strip chart of `column`, one band per TARGET group.

    The non-null values of `column` are placed on the x axis. The y axis
    carries only random jitter around 0 (TARGET == 0), 10 (TARGET == 1) and
    20 (TARGET == 2, labelled 'Test'), so the three groups appear as separate
    horizontal bands. The column name is always printed; the chart is drawn
    only when the column has at least 10 distinct non-null values.

    Args:
        df: DataFrame holding `column` and a 'TARGET' column.
        column: name of the column to plot.
    """
    has_value = df[column].notna()
    column_values = df.loc[has_value, column]
    # group by target
    class_0_values = df.loc[has_value & (df['TARGET'] == 0), column]
    class_1_values = df.loc[has_value & (df['TARGET'] == 1), column]
    class_t_values = df.loc[has_value & (df['TARGET'] == 2), column]
    print('\n\n', column)
    # for features with unique values >= 10
    if column_values.nunique() >= 10:
        _, ax = plt.subplots(1, figsize=(15, 4))
        if df[column].dtype == 'object':
            # Map every label to its position in the sorted list of labels so
            # it can be placed on a numeric axis. pandas does the encoding;
            # no extra dependency is needed.
            categories = pd.Categorical(column_values).categories

            def _encode(values):
                return pd.Categorical(values, categories=categories).codes

            class_0_values = _encode(class_0_values)
            class_1_values = _encode(class_1_values)
            class_t_values = _encode(class_t_values)
            plt.xticks(range(len(categories)), categories, fontsize=12, rotation='vertical')

        ax.scatter(class_0_values, rand_jitter([0]*class_0_values.shape[0]), label='Class0', s=10, marker='o', color='#7ac143', alpha=1)
        ax.scatter(class_1_values, rand_jitter([10]*class_1_values.shape[0]), label='Class1', s=10, marker='o', color='#fd5c63', alpha=1)
        ax.scatter(class_t_values, rand_jitter([20]*class_t_values.shape[0]), label='Test', s=10, marker='o', color='#037ef3', alpha=0.4)
        ax.legend(bbox_to_anchor=(1.01, 1), loc="upper left")
        ax.set_title(column +' distribution', fontsize=16)

    plt.show()
    
    
# Plot only the first `show_feature_count` columns of the sample.
show_feature_count = 10
for column in ttsamp.columns[:show_feature_count]:
    draw_feature_distribution(ttsamp, column)


In [None]:
#%% check cat columns differences
# For every object column, list the labels that occur in only one of
# train / test together with their row counts.
allobjs = traintest.select_dtypes(include='object')
for c in allobjs.columns:
    train_col = train[c]
    test_col = test[c]
    # Compare real labels only: NaN never matches `==`, so a NaN in the set
    # difference would be reported as "0 in train, 0 in test".
    s1 = set(train_col.dropna().unique())
    s2 = set(test_col.dropna().unique())
    diff = s1.symmetric_difference(s2)
    for d in diff:
        tnrows = (train_col == d).sum()
        tsrows = (test_col == d).sum()
        print('{}, "{}": {} in train, {} in test'.format(c,d,tnrows, tsrows))
    # Missing values are counted with isna() and reported when exactly one of
    # the two sets has them.
    tn_missing = train_col.isna().sum()
    ts_missing = test_col.isna().sum()
    if (tn_missing == 0) != (ts_missing == 0):
        print('{}, "{}": {} in train, {} in test'.format(c, np.nan, tn_missing, ts_missing))

In [None]:
#%% Look at category shares per feature, train vs. test:
def catcompare(feature):
    """Return grouped bars comparing the value shares of `feature` in train and test.

    For each of the module-level `train` and `test` frames the normalized
    value counts of `feature` are computed. The shares are collected in one
    HoloViews table keyed by ('cat', 'set') and converted to a Bars element
    labelled with the feature name.
    """
    shares_by_set = (
        ('train', train[feature].value_counts(normalize=True)),
        ('test', test[feature].value_counts(normalize=True)),
    )

    cats, sets, pcts = [], [], []
    for set_name, shares in shares_by_set:
        cats.extend(shares.index.tolist())
        sets.extend([set_name] * shares.shape[0])
        pcts.extend(shares.tolist())

    share_table = hv.Table((cats, sets, pcts), ['cat', 'set'], ['pct'])
    return share_table.to.bars(kdims=['cat', 'set'], vdims='pct', groupby=[], label = feature)


In [None]:
%%opts Bars.Grouped [group_index='set']
%%opts Bars [group_index=1 xrotation=45 width=480 show_legend=False tools=['hover']] 
%%opts Bars (color=Cycle('Paired')) 

# One train-vs-test bar chart per object-dtype column, laid out two per row.
allcats = traintest.drop(columns='TARGET').select_dtypes(include='object')
plot_list = [catcompare(name) for name in allcats.columns]
hv.Layout(plot_list).cols(2)

In [None]:
#%% check missing variable structure
# Nullity matrix of the combined frame; all options are passed by keyword.
matrix_options = dict(
    filter=None, n=0, p=0, sort=None,
    figsize=(25, 15), width_ratios=(15, 1), color=(0.4, 0.7, 0.95),
    fontsize=8, labels=None, sparkline=False, inline=True, freq=None,
)
msno.matrix(traintest, **matrix_options)