## Credit risk scoring project

* We will use machine learning to calculate the risk of default
* It is a binary classification problem: The target is positive ("1") if the customer defaults and negative ("0") otherwise

In [1]:
import bentoml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Load the raw credit-scoring records; the path is relative to the notebook's working directory.
data_path = "input/CreditScoring.csv"
df = pd.read_csv(data_path)

In [3]:
# Lower-case the column headers so the rest of the notebook can refer to them uniformly.
df.columns = df.columns.str.lower()

# Lookup tables translating the integer codes of each categorical column into labels.
status_values = {0: "unk", 1: "ok", 2: "default"}

home_values = {
    0: "unk",
    1: "rent",
    2: "owner",
    3: "private",
    4: "ignore",
    5: "parents",
    6: "others",
}

marital_values = {
    0: "unk",
    1: "single",
    2: "married",
    3: "widow",
    4: "separated",
    5: "divorced",
}

records_values = {0: "unk", 1: "no", 2: "yes"}

job_values = {
    0: "unk",
    1: "fixed",
    2: "parttime",
    3: "freelance",
    4: "others",
}

# NOTE: Series.map yields NaN for any value missing from its table, so this step is
# meant to run once on freshly loaded data; re-running it on already decoded
# columns would blank them out.
for column, labels in (
    ("status", status_values),
    ("home", home_values),
    ("marital", marital_values),
    ("records", records_values),
    ("job", job_values),
):
    df[column] = df[column].map(labels)

# The value 99999999 in these numeric columns is replaced with NaN.
for column in ("income", "assets", "debt"):
    df[column] = df[column].replace(99999999, np.nan)

# Drop the rows whose status decoded to "unk" and renumber the index from 0.
df = df[df["status"].ne("unk")].reset_index(drop=True)

In [4]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so the
# split is reproducible.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)

# Renumber both parts from 0 (each frame only needs its index reset once).
df_train_full = df_train_full.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [5]:
# Sanity check: (rows, columns) of the training part of the split.
df_train_full.shape

(3563, 14)

In [6]:
# Re-create the split from `df` so this cell can be re-run on its own, even though
# it deletes the "status" column from both parts further down.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train_full = df_train_full.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Binary target: 1 when the customer defaulted, 0 otherwise.
y_train_full = (df_train_full.status == "default").astype("int").values
y_test = (df_test.status == "default").astype("int").values

# Remove the label so it is not passed to the model as a feature.
del df_train_full["status"]
del df_test["status"]

# Missing values are filled with 0 before the rows are vectorized.
df_train_full = df_train_full.fillna(0)
df_test = df_test.fillna(0)

# One {column: value} dict per row.
dicts_full_train = df_train_full.to_dict(orient="records")
dicts_test = df_test.to_dict(orient="records")

In [7]:
# Fit the vectorizer on the training rows only, then apply the same learned
# feature layout to both parts. sparse=False makes it return dense arrays.
dv = DictVectorizer(sparse=False)
dv.fit(dicts_full_train)

X_train_full = dv.transform(dicts_full_train)
X_test = dv.transform(dicts_test)


In [8]:
# Wrap the feature matrices in XGBoost's DMatrix container.
dfull_train = xgb.DMatrix(data=X_train_full, label=y_train_full)

# No label is attached to the test matrix: it is only used to produce
# predictions, which are then scored against y_test with scikit-learn.
dtest = xgb.DMatrix(data=X_test)

In [9]:
# Booster configuration passed to xgb.train.
xgb_params = dict(
    eta=0.1,
    max_depth=3,
    min_child_weight=1,
    objective="binary:logistic",
    eval_metric="auc",
    nthread=8,
    seed=1,
    verbosity=1,
)

# Train on the full training set for 180 boosting rounds.
model = xgb.train(params=xgb_params, dtrain=dfull_train, num_boost_round=180)

# Score the held-out rows and report ROC AUC against the true labels.
y_pred = model.predict(dtest)
roc_auc_score(y_true=y_test, y_score=y_pred)

0.8327366697619862

## bentoml

In [10]:
# Show the installed BentoML version by calling its command-line tool.
!bentoml --version

bentoml, version 1.0.7


`bentoml.<framework>.save_model("model name", model)` saves a trained model under the given name. <br>
`bentoml serve service.py:svc --reload` runs the service.

In [19]:
# Save the trained booster with BentoML under the name "credit_risk_model".
# The fitted DictVectorizer is stored alongside it as a custom object, so the
# same feature encoding can be loaded together with the model.
# `bentoml` is imported in the notebook's import cell at the top.
bentoml.xgboost.save_model(
    "credit_risk_model",
    model,
    custom_objects={
        "dictVectorizer": dv,
    },
    signatures={
        # Mark `predict` as batchable, with inputs batched along dimension 0.
        "predict": {
            "batchable": True,
            "batch_dim": 0,
        },
    },
)

Model(tag="credit_risk_model:ujcthwstb2inkaav", path="/home/kevin/bentoml/models/credit_risk_model/ujcthwstb2inkaav/")

In [12]:
# Inspect the first training row as a plain dict, i.e. one record of the kind
# the DictVectorizer was fitted on.
df_train_full.iloc[0].to_dict()

{'seniority': 22,
 'home': 'owner',
 'time': 48,
 'age': 48,
 'marital': 'married',
 'records': 'no',
 'job': 'fixed',
 'expenses': 60,
 'income': 110.0,
 'assets': 3000.0,
 'debt': 0.0,
 'amount': 1000,
 'price': 1460}

In [13]:
 # The first training row again, written with double quotes (JSON-style).
 # NOTE(review): presumably kept as a copy-paste request body for testing the
 # BentoML service; confirm against service.py.
 {"seniority": 22,
 "home": "owner",
 "time": 48,
 "age": 48,
 "marital": "married",
 "records": "no",
 "job": "fixed",
 "expenses": 60,
 "income": 110.0,
 "assets": 3000.0,
 "debt": 0.0,
 "amount": 1000,
 "price": 1460}

{'seniority': 22,
 'home': 'owner',
 'time': 48,
 'age': 48,
 'marital': 'married',
 'records': 'no',
 'job': 'fixed',
 'expenses': 60,
 'income': 110.0,
 'assets': 3000.0,
 'debt': 0.0,
 'amount': 1000,
 'price': 1460}