In [1]:
# Package Imports
import pickle
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
# Load the pre-trained model from disk.
# NOTE(security): unpickling can execute arbitrary code, so 'light_model.sav'
# must come from a trusted source.
with open('light_model.sav', 'rb') as model_file:  # the context manager closes the handle
    model = pickle.load(model_file)

In [3]:
# Load the applications table, order it by client ID, renumber the rows from 0,
# then keep the rows labelled 0..1,000,000 (the .loc slice is label-based and inclusive).
app_train = (
    pd.read_csv('application_train.csv')
    .sort_values(by='SK_ID_CURR')
    .reset_index(drop=True)
    .loc[:1000000, :]
)

In [4]:
# Make the client ID (SK_ID_CURR) the row label; the column moves into the index
app_train = app_train.set_index(keys='SK_ID_CURR')
app_train.shape

(307511, 121)

In [5]:
# Drop sparse COLUMNS: a column is kept only if it has at least 45,000 non-missing values.
# NOTE(review): the earlier wording of this comment described a "more than 40% missing"
# rule, but thresh is an absolute count of non-missing values, not a percentage —
# confirm that 45,000 is the intended cutoff for this data set.
app_data = app_train.dropna(axis='columns', thresh=45000)

In [6]:
# Drop sparse ROWS: a row is kept only if it has at least 45 non-missing values.
# NOTE(review): thresh is an absolute count, so whether this matches a
# "more than 40% missing" rule depends on the number of columns — confirm.
app_data = app_data.dropna(axis='index', thresh=45)

In [7]:
# Same row filter as the previous cell: keep rows with at least 45 non-missing values.
# When run straight after that cell, every remaining row already meets the
# threshold, so this pass removes nothing.
app_data = app_data.dropna(axis='index', thresh=45)

In [8]:
# Pull out the columns whose dtype is one of the listed integer/float types
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = app_data.select_dtypes(include=numerics).columns.tolist()
numerical_data = app_data.loc[:, numerical_vars]
numerical_data.shape

(307511, 105)

In [9]:
# Impute gaps in every numeric column with that column's own mean
numerical_data = numerical_data.fillna(
    value=numerical_data.mean()
)

In [10]:
# Pull out the columns stored with the generic 'object' dtype
cates = ['object']
cate_vars = app_data.select_dtypes(include=cates).columns.tolist()
categorical_data = app_data.loc[:, cate_vars]
categorical_data.shape

(307511, 16)

In [11]:
# Fill missing categorical values by carrying the previous row's value forward.
# DataFrame.ffill() is the supported spelling of fillna(method='ffill'), whose
# `method` argument is deprecated in recent pandas releases.
# Note: a gap in the very first row has no earlier value to copy and stays missing.
categorical_data = categorical_data.ffill()

In [12]:
# Replace each categorical column's values with label codes, stored as strings.
# A single encoder object is reused and re-fitted on every column in turn.
le = preprocessing.LabelEncoder()
categorical_data = categorical_data.apply(
    lambda column: le.fit_transform(column).astype(str)
)

In [13]:
# Place the categorical and numeric frames side by side (aligned on the row index)
clean_data = pd.concat(objs=[categorical_data, numerical_data], axis='columns')
clean_data.shape

(307511, 121)

In [14]:
# Preview 15 randomly chosen TARGET values.
# The fixed seed makes the same rows appear on every Restart & Run All.
clean_data['TARGET'].sample(15, random_state=42)

SK_ID_CURR
408327    0
289473    0
257445    0
150682    0
379560    0
295829    0
127803    0
443604    0
205106    0
429383    0
345417    0
426723    0
159356    0
161058    1
413434    0
Name: TARGET, dtype: int64

In [15]:
# Build the frame used for per-client predictions: every column except TARGET
test_data = clean_data.drop(columns=['TARGET'])

In [16]:
# Define a risk assessment function
def risk_assessor(a, high_threshold=0.67, moderate_threshold=0.33):
    """Print the default-risk band (high / moderate / low) for one client.

    The client's row is read from the notebook-level ``test_data`` frame and
    scored with the notebook-level ``model``. The second value returned by
    ``predict_proba`` is used as the probability of default (presumably the
    positive class -- confirm against ``model.classes_``).

    Parameters
    ----------
    a
        The client's SK_ID_CURR; must be a label in ``test_data``'s index.
    high_threshold : float, default 0.67
        Probabilities strictly above this value are reported as high risk.
    moderate_threshold : float, default 0.33
        Probabilities strictly above this value (and not above
        ``high_threshold``) are reported as moderate risk; anything else is
        reported as low risk.

    Returns
    -------
    None
        The assessment is printed, not returned.

    Raises
    ------
    KeyError
        If ``a`` is not present in ``test_data``'s index.
    """
    # Fail with a readable message instead of pandas' generic lookup error.
    if a not in test_data.index:
        raise KeyError('Client with ID # {} was not found in test_data'.format(a))

    # The double brackets keep the selection 2-D (one row), as predict_proba expects.
    client_infor = test_data.loc[[a]].values
    p = model.predict_proba(client_infor).tolist()[0][1]

    if p > high_threshold:
        level = 'high'
    elif p > moderate_threshold:
        level = 'moderate'
    else:
        level = 'low'
    print('Client with ID # {} has a {} risk of defaulting the loan'.format(a, level))

In [17]:
# Example: assess the client whose SK_ID_CURR is 261013
risk_assessor(261013)

Client with ID # 261013 has a high risk of defaulting the loan
