In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [32]:
# The train and test extracts share one layout: ';'-separated, CP1251-encoded.
df_train, df_test = (
    pd.read_csv(csv_path, sep=';', encoding='CP1251')
    for csv_path in ('../data/raw/credit_train.csv', '../data/raw/credit_test.csv')
)

In [33]:
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    """Column-wise imputer for DataFrames with mixed dtypes.

    Object-dtype columns are filled with their most frequent value; every
    other column is filled with its median.
    """

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns provide the fill statistics.
        y : ignored
            Accepted only for pipeline compatibility.

        Returns
        -------
        self
            With ``self.fill`` set to a Series indexed by ``X.columns``.
        """
        fill_values = []
        for column in X:
            series = X[column]
            if series.dtype == np.dtype('O'):
                counts = series.value_counts()
                # value_counts() ignores NaN, so a column with no observed
                # values yields an empty result; leave such a column unfilled
                # instead of failing on counts.index[0].
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(series.median())
        self.fill = pd.Series(fill_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced by the fitted fill values."""
        return X.fillna(self.fill)


In [34]:
# Preview the first rows of the raw training frame.
df_train.head()

Unnamed: 0,client_id,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
0,1,M,48,MAR,UMN,5999800,10,1.6,770249,GRD,КРАСНОДАРСКИЙ КРАЙ,30000.0,1.0,1.0,0
1,2,F,28,MAR,UMN,1088900,6,1.1,248514,GRD,МОСКВА,43000.0,2.0,0.0,0
2,3,M,32,MAR,SPC,1072800,12,1.1,459589,SCH,ОБЛ САРАТОВСКАЯ,23000.0,5.0,0.0,0
3,4,F,27,DIV,SPC,1200909,12,1.1,362536,GRD,ОБЛ ВОЛГОГРАДСКАЯ,17000.0,2.0,0.0,0
4,5,M,45,MAR,SPC,1690889,10,1.1,421385,SCH,ЧЕЛЯБИНСКАЯ ОБЛАСТЬ,25000.0,1.0,0.0,0


In [35]:
def replace_comma(df, parsable_columns=('credit_sum', 'score_shk')):
    """Convert comma-decimal text columns of ``df`` to floats, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to convert. It is modified in place.
    parsable_columns : iterable of str, optional
        Columns to convert. Defaults to ``('credit_sum', 'score_shk')``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns as floats.

    Notes
    -----
    Missing values stay NaN and values that are already numeric pass through
    unchanged, so running the function twice on the same frame is safe.
    """
    def _parse_decimal(val):
        # NaN/None has no str.replace(); keep it missing rather than failing.
        if pd.isnull(val):
            return np.nan
        # str() lets already-converted numbers through on a re-run.
        return float(str(val).replace(',', '.'))

    for key in parsable_columns:
        df[key] = df[key].map(_parse_decimal).astype(float)
    return df

In [36]:
# Parse the comma-decimal columns of both frames into floats.
df_train, df_test = replace_comma(df_train), replace_comma(df_test)

Join the features from train and test together before imputing missing values, in case their distributions are slightly different.

df[df.isnull().any(axis=1)]

In [37]:
# Stack train rows first, then test rows, so the imputer learns its fill
# values from both sets. pd.concat is used because DataFrame.append was
# removed in pandas 2.0; like append, it keeps each frame's own row labels.
big_X = pd.concat([df_train, df_test])
big_X_imputed = DataFrameImputer().fit_transform(big_X)

XGBoost doesn't (yet) handle categorical features automatically, so we need to change
them to columns of integer values.
See http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing for more
details and options

In [39]:
# Columns whose values are replaced by integer codes.
categorizable_columns = [
    'gender',
    'marital_status',
    'job_position',
    'education',
    'living_region',
    'tariff_id',
]

# One encoder object is re-fitted for each column in turn, so after the loop
# it only holds the classes of the last column.
le = LabelEncoder()
for feature in categorizable_columns:
    encoded_values = le.fit_transform(big_X_imputed[feature])
    big_X_imputed[feature] = encoded_values

In [42]:
# Inspect the last rows of the imputed and label-encoded frame.
big_X_imputed.tail()

Unnamed: 0,age,client_id,credit_count,credit_month,credit_sum,education,gender,job_position,living_region,marital_status,monthly_income,open_account_flg,overdue_credit_count,score_shk,tariff_id
91935,41,262682,1.0,12,10114.0,3,1,13,260,2,30000.0,0.0,0.0,0.347262,28
91936,33,262683,0.0,6,14807.0,3,1,13,133,3,30000.0,0.0,0.0,0.40443,26
91937,25,262684,1.0,10,19718.0,1,0,13,208,2,20000.0,0.0,0.0,0.341982,18
91938,26,262685,2.0,10,22258.0,3,1,13,135,3,28000.0,0.0,0.0,0.698764,20
91939,44,262686,0.0,6,31176.0,1,1,13,260,2,40000.0,0.0,0.0,0.238366,23


In [64]:
# Prepare the inputs for the model
train = big_X_imputed[0:df_train.shape[0]]
test_X = big_X_imputed[df_train.shape[0]::]

In [65]:
# Target vector, cast to int.
train_y = train['open_account_flg'].astype(int)

# Training features: everything except the target and the client identifier.
train_X = train.drop(['open_account_flg', 'client_id'], axis=1)

# Test features: only the target column is removed, so client_id stays in test_X.
test_X = test_X.drop(['open_account_flg'], axis=1)

In [66]:
# Show the target as a NumPy array. Series.as_matrix() was removed in
# pandas 1.0; .values returns the same array and exists in every version.
train_y.values

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# sklearn.grid_search was removed in scikit-learn 0.20, and its GridSearchCV
# cannot take a model_selection splitter. Import the model_selection version
# here so that it can be combined with StratifiedKFold.
from sklearn.model_selection import GridSearchCV

gbm = xgb.XGBClassifier()

# 2 * 3 * 4 = 24 candidate settings, each evaluated once per CV fold.
gbm_params = {
    'learning_rate': [0.05, 0.1],
    'n_estimators': [300, 1000, 2000],
    'max_depth': [2, 3, 5, 10],
}

# Pass the splitter explicitly: it was previously created but never used, so
# the search silently fell back to the library's default fold count.
cv = StratifiedKFold(n_splits=3)
grid = GridSearchCV(gbm, gbm_params, scoring='roc_auc', cv=cv, verbose=10, n_jobs=-1)
grid.fit(train_X, train_y)

print(grid.best_params_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] n_estimators=300, learning_rate=0.05, max_depth=2 ...............
[CV] n_estimators=300, learning_rate=0.05, max_depth=2 ...............
[CV] n_estimators=300, learning_rate=0.05, max_depth=2 ...............
[CV] n_estimators=1000, learning_rate=0.05, max_depth=2 ..............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=2, score=0.749684 - 1.1min
[CV] n_estimators=1000, learning_rate=0.05, max_depth=2 ..............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=2, score=0.750658 - 1.2min
[CV] n_estimators=1000, learning_rate=0.05, max_depth=2 ..............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=2, score=0.751043 - 1.2min
[CV] n_estimators=2000, learning_rate=0.05, max_depth=2 ..............
[CV]  n_estimators=1000, learning_rate=0.05, max_depth=2, score=0.759785 -168.2min
[CV] n_estimators=2000, learning_rate=0.05, max_depth=2 ..............
[CV]  n_estimators=1000, learning_rate=0.05, 

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 169.5min


[CV]  n_estimators=1000, learning_rate=0.05, max_depth=2, score=0.758723 -168.3min
[CV] n_estimators=300, learning_rate=0.05, max_depth=3 ...............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=3, score=0.758410 - 1.7min
[CV] n_estimators=300, learning_rate=0.05, max_depth=3 ...............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=3, score=0.757833 - 1.7min
[CV] n_estimators=300, learning_rate=0.05, max_depth=3 ...............
[CV]  n_estimators=2000, learning_rate=0.05, max_depth=2, score=0.762185 -172.2min
[CV] n_estimators=1000, learning_rate=0.05, max_depth=3 ..............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=3, score=0.757417 - 1.7min
[CV] n_estimators=1000, learning_rate=0.05, max_depth=3 ..............


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 174.5min


[CV]  n_estimators=2000, learning_rate=0.05, max_depth=2, score=0.761804 - 7.8min
[CV] n_estimators=1000, learning_rate=0.05, max_depth=3 ..............
[CV]  n_estimators=2000, learning_rate=0.05, max_depth=2, score=0.761720 - 7.8min
[CV] n_estimators=2000, learning_rate=0.05, max_depth=3 ..............
[CV]  n_estimators=1000, learning_rate=0.05, max_depth=3, score=0.763714 - 5.5min
[CV] n_estimators=2000, learning_rate=0.05, max_depth=3 ..............
[CV]  n_estimators=1000, learning_rate=0.05, max_depth=3, score=0.763825 - 5.5min
[CV] n_estimators=2000, learning_rate=0.05, max_depth=3 ..............
[CV]  n_estimators=1000, learning_rate=0.05, max_depth=3, score=0.762974 - 5.5min
[CV] n_estimators=300, learning_rate=0.05, max_depth=5 ...............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=5, score=0.762955 - 3.0min
[CV] n_estimators=300, learning_rate=0.05, max_depth=5 ...............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=5, score=0.763417 - 3.1min
[C

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 187.6min


[CV]  n_estimators=2000, learning_rate=0.05, max_depth=3, score=0.765098 -11.0min
[CV] n_estimators=1000, learning_rate=0.05, max_depth=5 ..............
[CV]  n_estimators=2000, learning_rate=0.05, max_depth=3, score=0.764001 -11.0min
[CV] n_estimators=1000, learning_rate=0.05, max_depth=5 ..............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=5, score=0.763257 - 3.0min
[CV] n_estimators=1000, learning_rate=0.05, max_depth=5 ..............
[CV]  n_estimators=2000, learning_rate=0.05, max_depth=3, score=0.763706 -11.0min
[CV] n_estimators=2000, learning_rate=0.05, max_depth=5 ..............
[CV]  n_estimators=1000, learning_rate=0.05, max_depth=5, score=0.763553 - 9.6min
[CV] n_estimators=2000, learning_rate=0.05, max_depth=5 ..............
[CV]  n_estimators=1000, learning_rate=0.05, max_depth=5, score=0.763829 - 9.6min
[CV] n_estimators=2000, learning_rate=0.05, max_depth=5 ..............
[CV]  n_estimators=1000, learning_rate=0.05, max_depth=5, score=0.763786 - 9.5min
[

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 200.2min


[CV]  n_estimators=300, learning_rate=0.05, max_depth=10, score=0.756587 - 7.3min
[CV] n_estimators=300, learning_rate=0.05, max_depth=10 ..............
[CV]  n_estimators=2000, learning_rate=0.05, max_depth=5, score=0.760367 -18.6min
[CV] n_estimators=300, learning_rate=0.05, max_depth=10 ..............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=10, score=0.757708 - 7.3min
[CV] n_estimators=1000, learning_rate=0.05, max_depth=10 .............
[CV]  n_estimators=2000, learning_rate=0.05, max_depth=5, score=0.760284 -18.6min
[CV] n_estimators=1000, learning_rate=0.05, max_depth=10 .............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=10, score=0.757441 - 7.3min
[CV] n_estimators=1000, learning_rate=0.05, max_depth=10 .............
[CV]  n_estimators=2000, learning_rate=0.05, max_depth=5, score=0.760962 -18.5min
[CV] n_estimators=2000, learning_rate=0.05, max_depth=10 .............
[CV]  n_estimators=1000, learning_rate=0.05, max_depth=10, score=0.745947 -22.5min

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 239.9min


[CV]  n_estimators=300, learning_rate=0.1, max_depth=2, score=0.756964 - 1.2min
[CV] n_estimators=300, learning_rate=0.1, max_depth=2 ................
[CV]  n_estimators=300, learning_rate=0.1, max_depth=2, score=0.756435 - 1.1min
[CV] n_estimators=300, learning_rate=0.1, max_depth=2 ................
[CV]  n_estimators=300, learning_rate=0.1, max_depth=2, score=0.755825 - 1.2min
[CV] n_estimators=1000, learning_rate=0.1, max_depth=2 ...............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=2, score=0.762303 - 3.8min
[CV] n_estimators=1000, learning_rate=0.1, max_depth=2 ...............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=2, score=0.761926 - 3.9min
[CV] n_estimators=1000, learning_rate=0.1, max_depth=2 ...............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=2, score=0.761751 - 3.9min
[CV] n_estimators=2000, learning_rate=0.1, max_depth=2 ...............
[CV]  n_estimators=2000, learning_rate=0.1, max_depth=2, score=0.763438 - 7.8min
[CV] n_est

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 270.4min


[CV]  n_estimators=2000, learning_rate=0.1, max_depth=2, score=0.762490 - 7.6min
[CV] n_estimators=300, learning_rate=0.1, max_depth=3 ................
[CV]  n_estimators=300, learning_rate=0.1, max_depth=3, score=0.762286 - 1.6min
[CV] n_estimators=300, learning_rate=0.1, max_depth=3 ................
[CV]  n_estimators=300, learning_rate=0.1, max_depth=3, score=0.762077 - 1.6min
[CV] n_estimators=1000, learning_rate=0.1, max_depth=3 ...............
[CV]  n_estimators=300, learning_rate=0.1, max_depth=3, score=0.761552 - 1.6min
[CV] n_estimators=1000, learning_rate=0.1, max_depth=3 ...............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=3, score=0.765227 - 5.3min
[CV] n_estimators=1000, learning_rate=0.1, max_depth=3 ...............


In [None]:
# test_X still carries client_id, which the model never saw, because it was
# dropped from train_X. Select exactly the training columns, in training
# order, so the feature sets match.
predictions = grid.best_estimator_.predict(test_X[train_X.columns])