In [1]:
import warnings

import bentoml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import wget
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [13]:
# Load the prepared dataset; the CSV column named 'Unnamed: 0' becomes the
# DataFrame index rather than staying in the frame as a regular column.
df = pd.read_csv(
    'transformed_data.csv',
    index_col='Unnamed: 0',
)

# Columns fed to the DictVectorizer, grouped by how they are meant to be treated.
# NOTE(review): the sample training row shown further down has 'job' as an
# integer; DictVectorizer only one-hot encodes string values, so 'job' would be
# passed through as a number -- confirm that this is intended.
cat_col = [
    'home',
    'marital',
    'records',
    'job',
]
num_col = [
    'seniority',
    'time',
    'age',
    'expenses',
    'income',
    'assets',
    'debt',
    'amount',
    'price',
]

In [14]:
# Three-way split of `df`: the first call holds out 20% of the rows for testing,
# the second takes 25% of the remaining 80% (i.e. 20% of the total) for
# validation, leaving 60% for training. Both calls share a fixed seed.
full_train_df, full_test_df = train_test_split(df, test_size=0.2, random_state=11)
full_train_df, full_val_df = train_test_split(full_train_df, test_size=0.25, random_state=11)

# Renumber the rows of each split from 0.
train_df = full_train_df.reset_index(drop=True)
test_df = full_test_df.reset_index(drop=True)
val_df = full_val_df.reset_index(drop=True)

# Binary target: 1 where status == 'default', 0 otherwise.
y_train = (train_df['status'] == 'default').astype('int')
y_test = (test_df['status'] == 'default').astype('int')
y_val = (val_df['status'] == 'default').astype('int')

# Only these columns become model inputs; 'status' is never part of them.
feature_cols = cat_col + num_col

# Learn the feature layout from the training rows only, then reuse the fitted
# vectorizer unchanged for the validation and test rows.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_df[feature_cols].to_dict(orient='records'))
X_test = dv.transform(test_df[feature_cols].to_dict(orient='records'))
X_val = dv.transform(val_df[feature_cols].to_dict(orient='records'))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an ndarray, so
# .tolist() keeps `feature_names` a plain list of str as before.
feature_names = dv.get_feature_names_out().tolist()

# Remove the label column from the frames now that the targets are extracted.
del train_df['status']
del test_df['status']
del val_df['status']

In [15]:
# Show the first training row as a column -> value dict.
first_train_row = train_df.iloc[0]
first_train_row.to_dict()

{'seniority': 3,
 'home': 'rent',
 'time': 36,
 'age': 61,
 'marital': 'married',
 'records': 'no_rec',
 'job': 1,
 'expenses': 41,
 'income': 57.0,
 'assets': 12000.0,
 'debt': 0.0,
 'amount': 2500,
 'price': 3559}

In [5]:
# Baseline: a 10-tree random forest scored by ROC AUC on the validation split.
# random_state is pinned (same seed as the other forests in this notebook) so
# that re-running the cell rebuilds the same forest and the same score.
model = RandomForestClassifier(n_estimators=10, random_state=1)
model.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class (label 1).
pred = model.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, pred)
print(auc)

In [6]:
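# Disabled experiment, kept for reference: fit one forest per n_estimators value
# in range(10, 200, 10) and record its validation ROC AUC in `scores`.
# Nothing in this cell runs while it stays commented out.

# scores = []

# for n in range (10,200,10):
#         model = RandomForestClassifier(n_estimators=n, random_state=1)
#         model.fit(X_train, y_train)
#         pred = model.predict_proba(X_val)[:,1]
#         auc = roc_auc_score(y_val, pred)
#         scores.append([n,auc])

#         print('n_trees == %4s    Validation data == %.3f' % (n,auc))
#         print()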

In [7]:
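# Disabled: tabulates and plots the AUC per n_estimators. It needs the `scores`
# list built by the commented-out n_estimators sweep, so enable both together.
# NOTE(review): the frame is sorted by auc_score before plotting, so the line
# would connect points in score order; sort by no_estimators for the plot.

# output = pd.DataFrame(scores, columns = ['no_estimators', 'auc_score'])
# output = output.sort_values('auc_score', ascending= False).reset_index(drop= True)

# plt.plot(output.no_estimators, output.auc_score)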

In [8]:
# Tuned forest: 60 trees, depth capped at 11, at least 3 samples per leaf,
# with a fixed seed so the fit is repeatable.
rf_params = dict(
    n_estimators=60,
    max_depth=11,
    min_samples_leaf=3,
    random_state=1,
)
model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Score on the validation split using the predicted probability in column 1.
pred = model.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, pred)
print(auc)

0.8299669958837562


In [11]:
# Save the current `model` to the BentoML model store and attach the fitted
# DictVectorizer as a custom object, so the matching encoder travels with it.
# (`bentoml` is imported in the notebook's import cell at the top.)
# NOTE(review): 'credict_risk_rf' looks like a typo for 'credit_risk_rf'; it is
# left unchanged because anything that loads the model by this name would break.
bentoml.sklearn.save_model(
    'credict_risk_rf',
    model,
    custom_objects={'dictvectorizer': dv},
)

Model(tag="credict_risk_rf:arbrefcrfkdkuaav", path="/home/godwin/bentoml/models/credict_risk_rf/arbrefcrfkdkuaav/")

In [None]:
# Leftover scratch cell: this is a bare attribute reference, not a call, so it
# only evaluates to the function object and saves nothing.
bentoml.sklearn.save_model