In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.cross_validation import train_test_split
from sklearn import metrics, preprocessing
import xgboost
%matplotlib inline



In [2]:
# Both CSV files are read with identical options: ';' as the field separator
# and CP1251 as the text encoding.
csv_options = dict(sep=';', encoding='CP1251')
df = pd.read_csv('../data/raw/credit_train.csv', **csv_options)
df_test = pd.read_csv('../data/raw/credit_test.csv', **csv_options)

In [3]:
parsable_columns = ['credit_sum', 'score_shk']


def parse_decimal_comma(series):
    """Convert a column of decimal-comma strings (e.g. '0,5') to floats.

    Missing values are preserved as NaN instead of raising, and a column that
    is already numeric is passed through unchanged, so this cell can be re-run
    safely after the conversion has already happened.

    Parameters
    ----------
    series : pandas.Series
        Column holding numbers written with ',' as the decimal mark, or an
        already-numeric column.

    Returns
    -------
    pandas.Series
        The same values as floats.
    """
    if pd.api.types.is_numeric_dtype(series):
        # Already converted (e.g. the cell is being re-run): nothing to parse.
        return series.astype(float)
    # The .str accessor skips NaN entries, unlike calling val.replace directly.
    return series.str.replace(',', '.').astype(float)


for key in parsable_columns:
    df[key] = parse_decimal_comma(df[key])
    df_test[key] = parse_decimal_comma(df_test[key])

In [4]:
categorizable_columns = ['gender', 'marital_status', 'job_position', 'education', 'living_region']
for key in categorizable_columns:
    # Build the label vocabulary from train and test together so that a value
    # appearing in only one of the two frames still receives an integer code.
    known_labels = np.unique(
        np.concatenate((df[key].unique(), df_test[key].unique()))
    )
    le = preprocessing.LabelEncoder()
    le.fit(known_labels)
    # Replace the raw labels with their integer codes, train frame first.
    for frame in (df, df_test):
        frame[key] = le.transform(frame[key])

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


In [None]:
# Pairwise correlation matrix of the training frame's columns
# (pandas default settings); plotted as a heatmap in the next cell.
corr = df.corr()

In [None]:
# Draw the correlation matrix as a heatmap on a 10x7-inch figure.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr, cmap="YlGnBu", ax=ax)
plt.show()

In [5]:
# Alias, not a copy: df_train and df refer to the same DataFrame object,
# so any in-place change to one is visible through the other.
df_train = df

In [6]:
# Hyper-parameters for the boosted-tree model. n_estimators is only an upper
# bound here: a later cell passes it to xgboost.cv as num_boost_round with
# early stopping and then overwrites it with the number of rounds kept.
# NOTE(review): the objective is a regression loss, while the target used
# later is `open_account_flg` and the CV metric is AUC -- confirm that a
# regression objective (rather than a classification one) is intended.
xgb_settings = {
    'learning_rate': 0.1,
    'n_estimators': 2000,
    'max_depth': 5,
    'min_child_weight': 5,
    'gamma': 0,
    'subsample': 0.75,
    'colsample_bytree': 0.8,
    'objective': "reg:linear",
    'reg_alpha': 0.01,
    'seed': 0,
}
xgb = xgboost.XGBRegressor(**xgb_settings)

In [9]:
# Split the training frame into the label column and the feature columns.
# `client_id` is dropped together with the label, so it is not used as a feature.
target_column = "open_account_flg"
df_target = df_train[target_column]
df_predictors = df_train.drop([target_column, 'client_id'], axis=1)

# Wrap features and label in xgboost's DMatrix container for xgboost.cv.
xgtrain = xgboost.DMatrix(data=df_predictors, label=df_target)

In [10]:
# Display the names of the feature columns that will be fed to the model.
df_predictors.columns

Index([u'gender', u'age', u'marital_status', u'job_position', u'credit_sum',
       u'credit_month', u'tariff_id', u'score_shk', u'education',
       u'living_region', u'monthly_income', u'credit_count',
       u'overdue_credit_count'],
      dtype='object')

In [11]:
# 5-fold cross-validation with early stopping, used to choose how many
# boosting rounds to train (AUC is the evaluation metric).
xgb_param = xgb.get_xgb_params()
# Read the round budget from the estimator's own parameters: some xgboost
# releases strip wrapper-only settings such as `n_estimators` from
# get_xgb_params(), in which case xgb_param['n_estimators'] raises KeyError.
max_boost_rounds = xgb.get_params()['n_estimators']
cvresult = xgboost.cv(
    xgb_param,
    xgtrain,
    num_boost_round=max_boost_rounds,
    nfold=5,
    verbose_eval=20,           # print the metric every 20 rounds
    early_stopping_rounds=50,  # stop once test AUC has not improved for 50 rounds
    metrics='auc'
)

[0]	train-auc:0.686609+0.000817934	test-auc:0.684081+0.0034076
[20]	train-auc:0.755142+0.000828913	test-auc:0.749144+0.00231937
[40]	train-auc:0.766064+0.000379386	test-auc:0.757101+0.00227024
[60]	train-auc:0.772069+0.000429001	test-auc:0.760571+0.00197008
[80]	train-auc:0.776398+0.0006707	test-auc:0.762456+0.00160641
[100]	train-auc:0.779984+0.00060926	test-auc:0.763913+0.00155672
[120]	train-auc:0.782848+0.000477418	test-auc:0.764544+0.00160696
[140]	train-auc:0.785459+0.00047507	test-auc:0.765081+0.00172415
[160]	train-auc:0.787677+0.000458523	test-auc:0.765459+0.00173629
[180]	train-auc:0.789694+0.000475545	test-auc:0.765638+0.00169719
[200]	train-auc:0.791704+0.000541268	test-auc:0.76579+0.00179948
[220]	train-auc:0.793906+0.000562259	test-auc:0.766054+0.00193349
[240]	train-auc:0.795937+0.000594976	test-auc:0.76619+0.00199888
[260]	train-auc:0.798012+0.000591229	test-auc:0.766261+0.00197411
[280]	train-auc:0.799829+0.000554284	test-auc:0.766121+0.00210919
[300]	train-auc:0.80158

In [13]:
# Number of rows in the cross-validation history; the next cell uses this
# value as the model's n_estimators.
cvresult.shape[0]

261

In [None]:
# Replace the initial n_estimators setting with the number of rows in the
# cross-validation history computed above.
xgb.set_params(n_estimators=cvresult.shape[0])

In [None]:
xgboost.plot_importance(xgb)

In [None]:
from sklearn.metrics import auc

booster = xgb.fit(df_predictors, df_target, eval_metric=auc)

In [None]:
# Score the test set using exactly the columns (and column order) the model
# was trained on. Selecting by df_predictors.columns instead of dropping
# `client_id` keeps this cell re-runnable: the prediction columns added below
# are never fed back into the model as features.
test_features = df_test[df_predictors.columns]
df_test['predicted_target'] = xgb.predict(test_features)

# Convert raw scores to their rank within the test set. rank() averages the
# ranks of tied scores, and astype(int) truncates any fractional result.
df_test['predicted_target_cropped'] = df_test['predicted_target'].rank().astype(int)

# Build the two-column submission (_ID_, _VAL_) and write it without the index.
s = df_test[['client_id', 'predicted_target_cropped']].rename(
    columns={'predicted_target_cropped': '_VAL_', 'client_id': '_ID_'}
)
s.to_csv('./ans_xgboost.csv', index=False)