In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:


# Load the application table; the path is relative to the notebook's working directory.
data = pd.read_csv('data/application_train.csv')

# Categorical columns whose missing-value share is audited below.
null_audit_columns = [
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START',
    'ORGANIZATION_TYPE',
    'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE',
    'EMERGENCYSTATE_MODE',
]

# Null_dict maps each audited column to the fraction of rows whose value is
# missing, where "missing" means either a real NaN or the 'XNA' placeholder.
# Both markers are combined in one mask, so a column containing NaN *and* 'XNA'
# reports their total instead of having the NaN count overwritten by the 'XNA'
# count. Columns with no missing values still get an entry (0.0).
Null_dict = dict()
n_rows = data.shape[0]

for column in null_audit_columns:
    missing_mask = data[column].isnull() | (data[column] == 'XNA')
    Null_dict[column] = float(missing_mask.sum()) / n_rows
    
def replace_binary_categorical_var(df, column_name):
    """Encode a two-category column of ``df`` as 0/1, modifying ``df`` in place.

    The category that appears first in the column becomes 0 and the other
    becomes 1. Missing values (NaN / None) are not counted as a category and
    remain missing after encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; its column ``column_name`` is overwritten.
    column_name : str
        Name of the column to encode.

    Raises
    ------
    AssertionError
        If the column does not contain exactly two distinct non-missing values.
    """
    column = df[column_name]
    # dropna() instead of an `np.nan in list` test: the latter relies on object
    # identity and misses None or NaN objects that are not np.nan itself.
    categories = list(column.dropna().unique())
    assert len(categories) == 2, (
        "column %r must have exactly 2 non-missing categories, found %d"
        % (column_name, len(categories))
    )
    # A single map() decides every row from its ORIGINAL value. Two sequential
    # .loc writes would re-match rows that were just written when a category
    # already equals 0 or 1. map() also yields a numeric column rather than
    # leaving 0/1 stored in an object-dtype column.
    df[column_name] = column.map({categories[0]: 0, categories[1]: 1})
    
# Treat the 'XNA' placeholder in FONDKAPREMONT_MODE as a missing value.
# Assigning to a loop variable while iterating over the column never writes
# back into the DataFrame, so the replacement is applied to the column itself.
data['FONDKAPREMONT_MODE'] = data['FONDKAPREMONT_MODE'].replace('XNA', np.nan)
# Gender is male, female or the 'XNA' placeholder; the placeholder is turned
# into NaN so the column can be handled as a two-valued one.
data['CODE_GENDER'] = data['CODE_GENDER'].replace('XNA', np.nan)

# Two-valued columns encoded as 0/1:
#   NAME_CONTRACT_TYPE               - Cash loans / Revolving loans
#   CODE_GENDER                      - see above
#   FLAG_OWN_CAR, FLAG_OWN_REALTY    - Y / N flags
for binary_column in ('NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY'):
    replace_binary_categorical_var(data, binary_column)

# An 'Unknown' family status is treated as missing.
data['NAME_FAMILY_STATUS'] = data['NAME_FAMILY_STATUS'].replace('Unknown', np.nan)

# Multi-valued categorical columns are one-hot encoded.
one_hot_columns = [
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START',
    'ORGANIZATION_TYPE',
    'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE',
    'EMERGENCYSTATE_MODE',
]
data = pd.get_dummies(data, columns=one_hot_columns)

# Remaining gaps are filled with each column's median, then the SK_ID_CURR
# column is dropped.
# NOTE(review): the medians are computed on the full table, i.e. before the
# train/test split performed later in the notebook.
column_medians = data.median()
data = data.fillna(column_medians).drop('SK_ID_CURR', axis=1)

In [7]:
# Fixed seed so the split, and therefore the reported metric, is reproducible
# across kernel restarts.
SPLIT_SEED = 42

# Feature matrix (every column except TARGET) and label vector as NumPy arrays.
# `.values` is used because DataFrame.as_matrix() was deprecated and later
# removed from pandas; `.values` is available in old and new releases alike.
data_x = data.drop('TARGET', axis=1).values
data_y = data['TARGET'].values

# Hold out 30% of the rows for evaluation.
train_x, test_x, train_y, test_y = train_test_split(
    data_x, data_y, test_size=0.3, random_state=SPLIT_SEED)

# The full arrays are no longer needed once the split exists.
del data_x, data_y

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
# Baseline model: logistic regression with the library's default settings,
# trained on the training part of the split. The bare fit call is the cell's
# last expression, so the fitted estimator is displayed.
lr = LogisticRegression()
lr.fit(X=train_x, y=train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Score the held-out rows: take column 1 of predict_proba as the score and
# report the area under the ROC curve.
test_scores = lr.predict_proba(test_x)[:, 1]
roc_auc_score(test_y, test_scores)

0.62079571467645356