In [1]:
import pandas as pd
import numpy as np


import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier


In [8]:
# Load the raw credit-scoring table. The path is relative, so the CSV must be
# reachable from the notebook's current working directory.
df = pd.read_csv('CreditScoring.csv')

In [9]:
# Preprocessing: lower-case the column names, decode the integer-coded
# categorical columns into string labels, turn the 99999999 marker into NaN
# and drop the rows whose status is unknown.

df.columns = df.columns.str.lower()

# Code -> label lookup tables (code 0 is decoded as 'unk' in every table).
status_values = {1: 'good', 2: 'bad', 0: 'unk'}
home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk',
}
marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk',
}
records_values = {1: 'no', 2: 'yes', 0: 'unk'}
job_values = {
    1: 'fixed',
    2: 'parttime',
    3: 'freelance',
    4: 'others',
    0: 'unk',
}

# Decode every categorical column with its own table. Series.map yields NaN
# for any code that is not a key of the table.
category_labels = {
    'status': status_values,
    'home': home_values,
    'marital': marital_values,
    'records': records_values,
    'job': job_values,
}
for column, labels in category_labels.items():
    df[column] = df[column].map(labels)

# Replace the 99999999 placeholder with NaN in these three numeric columns.
for column in ('income', 'assets', 'debt'):
    df[column] = df[column].replace(to_replace=99999999, value=np.nan)

# Keep only the rows whose status did not decode to 'unk'.
df = df[df['status'] != 'unk']

In [10]:
# Hold out 20% of the rows as the test set, then split the remaining rows
# 75/25 into train and validation parts. Both splits use random_state=11.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=11,
)
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.25,
    random_state=11,
)

In [11]:
# Binary target: True where the decoded status is 'bad'.
y_train = (df_train_full.status == 'bad').values
y_test = (df_test.status == 'bad').values

# Remove the target column so it cannot end up in the feature matrix.
# NOTE: `del` mutates both frames in place, so this cell cannot be re-run on
# its own -- the split cell has to be executed again first, otherwise the
# `.status` lookups above fail.
del df_train_full['status']
del df_test['status']

# Convert the rows to dictionaries for one-hot encoding with DictVectorizer.
# Missing values are filled with 0 before the conversion.
dict_train = df_train_full.fillna(0).to_dict(orient='records')
dict_test = df_test.fillna(0).to_dict(orient='records')

# Fit the vectorizer on the training rows only and reuse it for the test rows
# so both matrices share the same feature columns.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_test = dv.transform(dict_test)

dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=dv.feature_names_)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=dv.feature_names_)

xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1,
    # 'verbosity': 0 replaces the former 'silent': 1 entry, which the
    # installed XGBoost does not recognise and reports as
    # 'Parameters: { "silent" } are not used.'
    'verbosity': 0,
}

num_trees = 160

# The model is fitted on the full training split (train + validation rows).
model = xgb.train(xgb_params, dtrain, num_boost_round=num_trees)




Parameters: { "silent" } are not used.



In [12]:
# Evaluation: predict on the test DMatrix and compute the ROC AUC against the
# test labels. The bare last expression is what the notebook displays.

y_pred_xgb = model.predict(dtest)
roc_auc_score(y_true=y_test, y_score=y_pred_xgb)

0.833634719710669