In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [2]:
# Read the application training data from CSV into `df`
# (the path is relative to the notebook's working directory).
df = pd.read_csv('data/application_train.csv')

In [None]:
# Fraction (0-1) of missing entries in each column.
missing_percentage = df.isna().mean()
# Columns that are more than 90% empty are discarded.
dropped = missing_percentage[missing_percentage > 0.9]
df = df.drop(columns=dropped.index)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Summary statistics for df.
df.describe()

NAME_CONTRACT_TYPE	
CODE_GENDER	
FLAG_OWN_CAR	
FLAG_OWN_REALTY	
NAME_TYPE_SUITE	
NAME_INCOME_TYPE	
NAME_EDUCATION_TYPE	
NAME_FAMILY_STATUS

In [None]:
def replace_binary_categorical_var(df, column_name):
    """Encode a two-category column of ``df`` as 0/1, modifying ``df`` in place.

    The first distinct non-missing value (in order of appearance) is encoded
    as 0 and the second as 1. Missing values stay missing, and the resulting
    column holds numeric codes rather than the original labels.

    Args:
        df: DataFrame whose column is overwritten.
        column_name: Name of the column to encode.

    Raises:
        AssertionError: If the column does not contain exactly two distinct
            non-missing values.
    """
    # dropna() replaces the former `np.nan in categories` membership test,
    # which relies on object identity and misses NaNs that are not the
    # np.nan object itself (e.g. NaNs coming out of a float column).
    categories = list(df[column_name].dropna().unique())
    assert len(categories) == 2, (
        "{!r} must have exactly 2 non-missing categories, found {}: {}".format(
            column_name, len(categories), categories
        )
    )
    # Translate both categories in one pass. Two sequential assignments would
    # corrupt columns whose raw values are already 0/1, because the code
    # written for the first category can match the second category's filter.
    codes = {categories[0]: 0, categories[1]: 1}
    df[column_name] = df[column_name].map(codes)

In [None]:
# NAME_CONTRACT_TYPE takes two values: Cash loans or Revolving loans.
replace_binary_categorical_var(df, 'NAME_CONTRACT_TYPE')

# CODE_GENDER also contains an 'XNA' placeholder. Turning it into NaN leaves
# two real categories, so the column can be encoded as binary as well.
# FLAG_OWN_CAR and FLAG_OWN_REALTY are Y/N flags.
df['CODE_GENDER'] = df['CODE_GENDER'].replace('XNA', np.nan)
for binary_column in ('CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY'):
    replace_binary_categorical_var(df, binary_column)

# An 'Unknown' family status is treated as a missing value.
df['NAME_FAMILY_STATUS'] = df['NAME_FAMILY_STATUS'].replace('Unknown', np.nan)

# One-hot encode the multi-valued categorical columns.
multi_category_columns = [
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
]
df = pd.get_dummies(df, columns=multi_category_columns)

In [None]:
# Missing value imputation: start by listing the columns that contain NaNs.
has_missing = df.isna().any()
nulls = has_missing.loc[has_missing.eq(True)]
print(nulls)

In [None]:
# Impute missing values with each numeric column's median.
# fillna() returns a new frame rather than modifying df, so the result must be
# assigned back; numeric_only=True restricts median() to numeric columns so
# text columns do not make it fail.
df = df.fillna(df.median(numeric_only=True))

In [None]:
# (rows, columns) of df at this point.
df.shape

In [4]:
# Read the bureau data from CSV into `bureau`
# (the path is relative to the notebook's working directory).
bureau = pd.read_csv('data/bureau.csv')

In [6]:
# Preview the first rows of bureau.
bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [8]:
# Number of bureau records per SK_ID_CURR. The left join keeps applicants that
# have no bureau rows; their SK_ID_BUREAU is NaN, which count() ignores, so
# they show up with 0. Selecting SK_ID_BUREAU before aggregating avoids
# counting every other merged column just to discard the result.
(
    df.merge(bureau, how='left', on='SK_ID_CURR')
    .groupby('SK_ID_CURR')['SK_ID_BUREAU']
    .count()
)

SK_ID_CURR
100002     8
100003     4
100004     2
100006     0
100007     1
100008     3
100009    18
100010     2
100011     4
100012     0
100014     8
100015     4
100016     7
100017     6
100018     0
100019     2
100020     4
100021     0
100022     2
100023    13
100024     0
100025     1
100026     3
100027     3
100029     4
100030     6
100031     7
100032     4
100033     1
100034     0
          ..
456225     9
456226     2
456227     4
456228     0
456229     1
456230    12
456231     7
456232     2
456233     1
456234    10
456235     3
456236    13
456237     1
456238     6
456239    11
456240     2
456241     5
456242     1
456243     7
456244    23
456245     0
456246     3
456247    11
456248     0
456249    13
456251     0
456252     0
456253     4
456254     1
456255    11
Name: SK_ID_BUREAU, Length: 307511, dtype: int64

In [10]:
# (rows, columns) of df.
df.shape

(307511, 122)