In [1]:
import warnings

import bentoml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import wget
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset; the CSV's 'Unnamed: 0' column becomes the row index.
df = pd.read_csv(
    'transformed_data.csv',
    index_col='Unnamed: 0',
)

# Categorical feature columns.
cat_col = [
    'home',
    'marital',
    'records',
    'job',
]

# Numerical feature columns.
num_col = [
    'seniority',
    'time',
    'age',
    'expenses',
    'income',
    'assets',
    'debt',
    'amount',
    'price',
]

# All feature columns (same names as cat_col + num_col, in a different order).
cols = [
    'seniority', 'home', 'time', 'age',
    'marital', 'records', 'job',
    'expenses', 'income', 'assets',
    'debt', 'amount', 'price',
]

In [3]:
# Split 60/20/20 into train/validation/test: hold out 20% for testing first,
# then 25% of the remaining 80% (i.e. 20% of the whole) for validation.
full_train_df, full_test_df = train_test_split(df, test_size=0.2, random_state=11)
full_train_df, full_val_df = train_test_split(full_train_df, test_size=0.25, random_state=11)

# Work on index-reset copies; 'status' is removed from these copies at the end
# of the cell while the full_* frames keep it.
train_df = full_train_df.reset_index(drop=True)
test_df = full_test_df.reset_index(drop=True)
val_df = full_val_df.reset_index(drop=True)

# Binary target: 1 when status == 'default', 0 otherwise.
y_train = (train_df['status'] == 'default').astype('int')
y_test = (test_df['status'] == 'default').astype('int')
y_val = (val_df['status'] == 'default').astype('int')

# Fit the vectorizer on the training rows only, then reuse it for every split.
dv = DictVectorizer(sparse=False)
dv.fit(train_df[cols].to_dict(orient='records'))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(dv, 'get_feature_names_out'):
    feature_names = dv.get_feature_names_out().tolist()
else:
    feature_names = dv.get_feature_names()

X_train = dv.transform(train_df[cols].to_dict(orient='records'))
X_test = dv.transform(test_df[cols].to_dict(orient='records'))
X_val = dv.transform(val_df[cols].to_dict(orient='records'))

# Drop the label column from the working copies now that targets are extracted.
del train_df['status']
del test_df['status']
del val_df['status']

In [4]:
# Wrap each split's feature matrix and labels in an xgboost DMatrix.
dtrain, dtest, dval = (
    xgb.DMatrix(features, label=labels)
    for features, labels in (
        (X_train, y_train),
        (X_test, y_test),
        (X_val, y_val),
    )
)

In [5]:
# Training configuration shared by every xgb.train call in this notebook.
x_params = {
    # Booster hyper-parameters.
    'eta': 0.1,
    'max_depth': 3,
    'min_child_weight': 30,

    # Learning task.
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    # Runtime settings. The thread-count key xgboost recognises is 'nthread';
    # an unrecognised key such as 'n_threads' is not applied, and with
    # verbosity 0 nothing is reported about it.
    'nthread': 8,
    'seed': 1,
    'verbosity': 0,
}

# Fit the booster on the training split for 125 boosting rounds.
model = xgb.train(x_params, dtrain=dtrain, num_boost_round=125)

In [6]:
# Score the validation split and report ROC AUC.
pred = model.predict(dval)
auc = roc_auc_score(y_true=y_val, y_score=pred)
print(auc)

0.8482830442897934


In [7]:
# Refit on train + validation rows, then evaluate once on the held-out test split.
full_data = pd.concat([full_train_df, full_val_df]).reset_index(drop=True)

# Targets and features for the combined frame, encoded with the already-fitted dv.
full_y_train = full_data['status'].eq('default').astype('int')
full_x_train = dv.transform(
    full_data[cat_col + num_col].to_dict(orient='records')
)

# NOTE: this rebinds `dtrain` and `model`, replacing the objects created in
# the earlier training cells.
dtrain = xgb.DMatrix(full_x_train, label=full_y_train)
test = xgb.DMatrix(X_test)

model = xgb.train(x_params, dtrain=dtrain, num_boost_round=125)
prediction = model.predict(test)
auc = roc_auc_score(y_true=y_test, y_score=prediction)
auc

0.8426050992720449

In [11]:
# Save the final booster through BentoML's xgboost integration, bundling the
# fitted DictVectorizer as a custom object so both are stored together.
# The 'predict' signature is declared batchable along dimension 0.
# NOTE(review): the tag 'credict_risk' looks like a typo of 'credit_risk'; it is
# kept unchanged because other code may look the model up by this exact tag.
bentoml.xgboost.save_model(
    'credict_risk',
    model,
    custom_objects={'dictvectorizer': dv},
    signatures={
        'predict': {
            'batchable': True,
            'batch_dim': 0,
        },
    },
)

Model(tag="credict_risk:meupd6spxggv4aav", path="/home/godwin/bentoml/models/credict_risk/meupd6spxggv4aav/")