In [27]:
import os
import numpy as np
import pandas as pd
import xgboost

from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score


In [19]:
#This is my data cleaning code I set up in Quick_and_Dirty.ipynb

data = pd.read_csv('data/application_train.csv')
test_data = pd.read_csv('data/application_test.csv')
data.rename(columns={x:x.lower() for x in data.columns}, inplace=True)
test_data.rename(columns={x:x.lower() for x in test_data.columns}, inplace=True)

bureau_features = pd.read_csv('data/bureau_features.csv')
bureau_features.drop('credit_types',axis = 1,inplace = True)
bureau_ids = bureau_features['sk_id_curr'].unique()

In [20]:
data['has_bureau_features'] = data['sk_id_curr'].apply(lambda x: 1 if x in bureau_ids else 0)
data = data.join(bureau_features,how='left',on='sk_id_curr',rsuffix='X')
data.drop('sk_id_currX',axis=1, inplace = True)

In [21]:
test_data['has_bureau_features'] = test_data['sk_id_curr'].apply(lambda x: 1 if x in bureau_ids else 0)

test_data = test_data.join(bureau_features,how='left',on='sk_id_curr',rsuffix='X')
test_data.drop('sk_id_currX',axis=1, inplace = True)

In [22]:
float_columns = [x for x in data.columns if data[x].dtype == np.float64]
float_columns_with_nulls =  [c for c in float_columns if pd.isnull(data[c].values).any()]
categorical_columns = [x for x in data.columns if data[x].dtype == object]

###Cleaning training data 
#adding a null category for each categorical column 
for col in categorical_columns:
    data.loc[:,col] = data[col].apply(lambda x: x if not pd.isnull(x) else "null")
    
uniques = {col:data[col].unique() for col in categorical_columns}

#imputing nans in float columns and adding an is_imputed column
for col in float_columns_with_nulls:
    null_col = col+'_is_imputed'
    data[null_col] = data[col].apply(lambda x: 1 if pd.isnull(x) else 0)
    data.loc[:,col] = data[col].apply(lambda x: 0 if pd.isnull(x) else x )
    
#creating dummy columns for all the categorical columns
for col in categorical_columns:
    for cat in uniques[col]:
        data[col+'_'+cat] = data[col].apply(lambda x: 1 if x == cat else 0)
    data= data.drop(col, axis = 1)
    
### Cleaning test data    
#adding a null category for each categorical column 
for col in categorical_columns:
    test_data.loc[:,col] = test_data[col].apply(lambda x: x if not pd.isnull(x) else "null")
    

#imputing nans in float columns and adding an is_imputed column
for col in float_columns_with_nulls:
    null_col = col+'_is_imputed'
    test_data[null_col] = test_data[col].apply(lambda x: 1 if pd.isnull(x) else 0)
    test_data.loc[:,col] = test_data[col].apply(lambda x: 0 if pd.isnull(x) else x )
    
#creating dummy columns for all the categorical columns
for col in categorical_columns:
    for cat in uniques[col]:
        test_data[col+'_'+cat] = test_data[col].apply(lambda x: 1 if x == cat else 0)
    test_data= test_data.drop(col, axis = 1)

In [10]:
X,y = data.drop(['target','sk_id_curr'],axis = 1), data['target']

In [None]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }


booster = xgboost.XGBClassifier()

grid = GridSearchCV(estimator = booster,
                    param_grid = params, 
                    scoring = 'roc_auc',
                    n_jobs = -1,
                    cv = 5,
                    verbose = 2)

grid.fit(X,y)

In [40]:
grid.best_params_

{'colsample_bytree': 1.0, 'gamma': 5, 'max_depth': 3, 'min_child_weight': 10}

In [43]:
import json
with open('bestparams.json','w') as f:
        json.dump(grid.best_params_,f)

In [46]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=17)

In [11]:
booster = xgboost.XGBClassifier(subsample = .5, 
                                n_estimators = 20000, 
                                learning_rate= .01, 
                                colsample_bytree = 1, 
                                gamma = 5, 
                                mid_child_weight = 10,
                                max_depth = 3,
                                n_jobs = -1,
                                silent = False)

In [12]:
booster.fit(X,y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=5, learning_rate=0.01, max_delta_step=0,
       max_depth=3, mid_child_weight=10, min_child_weight=1, missing=None,
       n_estimators=20000, n_jobs=-1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=False,
       subsample=0.5)

In [57]:
y_hat = booster.predict_proba(X_val)[:,1]

roc_auc_score(y_val, y_hat,)

0.75372191799207178

In [23]:
X_test = test_data.drop('sk_id_curr',axis = 1)

test_predictions = booster.predict_proba(X_test)[:,1]

In [29]:
with open('submissions/result{}.csv'.format(len(os.listdir('submissions'))), 'w') as f:
    f.write('SK_ID_CURR,TARGET\n')
    for i,p in zip(test_data['sk_id_curr'].values, test_predictions):
        f.write("{},{}\n".format(i,p))

result0.csv
