API Requests
====

In [2]:
import pandas as pd
import numpy as np
import json
import requests
import pickle
from flask import Flask, jsonify, request

from sklearn.preprocessing import scale, MinMaxScaler, StandardScaler, RobustScaler

In [180]:
test = pd.read_csv("/media/juanan/DATA/loan_data_analysis/data/loans_processed.csv", sep = "^").sample(1)

In [181]:
test = test.reset_index().\
       rename(columns = {'index':'id'}).\
       drop("loan_status", axis = 1)

This is the way the API is going to receive the data:

In [182]:
test = test[sorted(test.columns)]

In [183]:
api_data = test.to_json(orient='records')
api_data

'[{"addr_state":"NM","annual_inc":50000.0,"application_type":"Individual","avg_cur_bal":27505.0,"bc_open_to_buy":1312.0,"bc_util":62.5,"delinq_2yrs":0.0,"delinq_amnt":0.0,"disbursement_method":"Cash","dti":12.34,"emp_length":"10+ years","emp_title":"him specialist ii","fico_range_high":689.0,"fico_range_low":685.0,"funded_amnt":15000.0,"funded_amnt_inv":15000.0,"grade":"C","home_ownership":"MORTGAGE","id":166105,"initial_list_status":"w","installment":517.34,"int_rate":14.64,"loan_amnt":15000.0,"mort_acc":3.0,"num_accts_ever_120_pd":0.0,"num_bc_sats":1.0,"num_bc_tl":1.0,"num_il_tl":13.0,"num_op_rev_tl":4.0,"num_rev_accts":4.0,"num_rev_tl_bal_gt_0":2.0,"num_sats":9.0,"open_acc":9.0,"pct_tl_nvr_dlq":90.0,"percent_bc_gt_75":0.0,"pub_rec":0.0,"pub_rec_bankruptcies":0.0,"purpose":"debt_consolidation","revol_bal":2273.0,"revol_util":34.4,"sub_grade":"C3","tax_liens":0.0,"term":" 36 months","tot_coll_amt":101.0,"tot_hi_cred_lim":260757.0,"total_bal_ex_mort":35822.0,"total_bc_limit":3500.0,"to

We need to preprocess the data before feeding the models, but for that, we need two things from training phase:
* Mean and Standard deviation of the numeric variables in order to normalize new unkown data from API.
* Categorical dictionary to transform each category into numeric data.

In [72]:
data = pd.read_csv("/media/juanan/DATA/loan_data_analysis/data/loans_processed.csv", sep = "^")

In [184]:
data = data[sorted(data.columns)]
data.head()

Unnamed: 0,addr_state,annual_inc,application_type,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,delinq_amnt,disbursement_method,dti,...,sub_grade,tax_liens,term,tot_coll_amt,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,verification_status,zip_code
0,VA,78000.0,Individual,29828.0,9525.0,4.7,0.0,0.0,Cash,12.03,...,C1,0.0,60 months,0.0,196500.0,149140.0,10000.0,12000.0,Source Verified,235xx
1,CA,58000.0,Individual,9536.0,7599.0,41.5,0.0,0.0,Cash,14.92,...,A3,0.0,36 months,0.0,179407.0,15030.0,13000.0,11325.0,Not Verified,937xx
2,MO,63800.0,Individual,4232.0,324.0,97.8,0.0,0.0,Cash,18.49,...,D1,0.0,60 months,0.0,57073.0,42315.0,15000.0,35573.0,Source Verified,658xx
3,AZ,50000.0,Individual,5857.0,332.0,93.2,0.0,0.0,Cash,34.81,...,C3,0.0,36 months,0.0,82331.0,64426.0,4900.0,64031.0,Source Verified,850xx
4,NJ,69000.0,Individual,3214.0,6494.0,69.2,0.0,0.0,Cash,25.81,...,C3,0.0,36 months,0.0,52490.0,38566.0,21100.0,24890.0,Source Verified,077xx


In [185]:
data.dtypes

addr_state                     object
annual_inc                    float64
application_type               object
avg_cur_bal                   float64
bc_open_to_buy                float64
bc_util                       float64
delinq_2yrs                   float64
delinq_amnt                   float64
disbursement_method            object
dti                           float64
emp_length                     object
emp_title                      object
fico_range_high               float64
fico_range_low                float64
funded_amnt                   float64
funded_amnt_inv               float64
grade                          object
home_ownership                 object
initial_list_status            object
installment                   float64
int_rate                      float64
loan_amnt                     float64
loan_status                   float64
mort_acc                      float64
num_accts_ever_120_pd         float64
num_bc_sats                   float64
num_bc_tl   

In [186]:
numeric_variables = data._get_numeric_data().columns
categorical_variables = data.select_dtypes(include="object").columns

Let's get numeric stats first:

In [187]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(data[numeric_variables])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [188]:
numeric_stats = pd.DataFrame({"numeric_variable": numeric_variables,
                              "mean": scaler.mean_,
                              "std": np.sqrt(scaler.var_)})

numeric_stats.head()

Unnamed: 0,numeric_variable,mean,std
0,annual_inc,75230.389095,65243.689416
1,avg_cur_bal,12402.720344,15973.456502
2,bc_open_to_buy,8826.174978,14272.032276
3,bc_util,59.402908,28.39502
4,delinq_2yrs,0.306673,0.856058


Let's check this with `num_bc_sats`:

In [189]:
np.mean(data["num_bc_sats"])

6.970189265655382

In [190]:
np.std(data["num_bc_sats"])

10.265984443730279

That's it!!! Let's save this data set:

In [191]:
numeric_stats.to_csv("../output/numeric_stats_in_training_for_new_data.csv", sep = "^", index = False)

Let's go now with categorical dictionaries:

In [192]:
categorical_dict = {}
for variable in categorical_variables:
    categorical_dict[variable] = data.groupby(variable)["loan_status"].mean()

In [193]:
categorical_dict

{'addr_state': addr_state
 AK    0.204141
 AL    0.247650
 AR    0.243154
 AZ    0.199321
 CA    0.198303
 CO    0.158083
 CT    0.183810
 DC    0.130378
 DE    0.206575
 FL    0.222007
 GA    0.191224
 HI    0.204535
 IA    0.142857
 ID    0.193487
 IL    0.193302
 IN    0.228179
 KS    0.171004
 KY    0.216353
 LA    0.237900
 MA    0.192974
 MD    0.214845
 ME    0.136564
 MI    0.208880
 MN    0.210363
 MO    0.219400
 MS    0.275619
 MT    0.177759
 NC    0.215282
 ND    0.237232
 NE    0.267946
 NH    0.142554
 NJ    0.216208
 NM    0.225694
 NV    0.234202
 NY    0.227045
 OH    0.225400
 OK    0.248252
 OR    0.152827
 PA    0.214776
 RI    0.194009
 SC    0.172705
 SD    0.225787
 TN    0.230479
 TX    0.202286
 UT    0.179049
 VA    0.207942
 VT    0.149225
 WA    0.166538
 WI    0.175315
 WV    0.182457
 WY    0.178142
 Name: loan_status, dtype: float64, 'application_type': application_type
 Individual    0.206316
 Joint App     0.190262
 Name: loan_status, dtype: float64, '

In [194]:
categorical_dict["grade"]

grade
A    0.065211
B    0.134901
C    0.224259
D    0.305041
E    0.389276
F    0.449290
G    0.483176
Name: loan_status, dtype: float64

In [195]:
categorical_dict["grade"]["A"]

0.06521071361678217

Done! Let's save the dictionary for preprocessing new data when API is been called:

In [196]:
import pickle
pickle.dump(categorical_dict, open("../output/categorical_dict.pkl", "wb"))

Let's remeber how our API data look like:

In [197]:
api_data

'[{"addr_state":"NM","annual_inc":50000.0,"application_type":"Individual","avg_cur_bal":27505.0,"bc_open_to_buy":1312.0,"bc_util":62.5,"delinq_2yrs":0.0,"delinq_amnt":0.0,"disbursement_method":"Cash","dti":12.34,"emp_length":"10+ years","emp_title":"him specialist ii","fico_range_high":689.0,"fico_range_low":685.0,"funded_amnt":15000.0,"funded_amnt_inv":15000.0,"grade":"C","home_ownership":"MORTGAGE","id":166105,"initial_list_status":"w","installment":517.34,"int_rate":14.64,"loan_amnt":15000.0,"mort_acc":3.0,"num_accts_ever_120_pd":0.0,"num_bc_sats":1.0,"num_bc_tl":1.0,"num_il_tl":13.0,"num_op_rev_tl":4.0,"num_rev_accts":4.0,"num_rev_tl_bal_gt_0":2.0,"num_sats":9.0,"open_acc":9.0,"pct_tl_nvr_dlq":90.0,"percent_bc_gt_75":0.0,"pub_rec":0.0,"pub_rec_bankruptcies":0.0,"purpose":"debt_consolidation","revol_bal":2273.0,"revol_util":34.4,"sub_grade":"C3","tax_liens":0.0,"term":" 36 months","tot_coll_amt":101.0,"tot_hi_cred_lim":260757.0,"total_bal_ex_mort":35822.0,"total_bc_limit":3500.0,"to

First, we have to transform it to dataframe:

In [198]:
api_df = pd.read_json(api_data, orient='records')

api_df

Unnamed: 0,addr_state,annual_inc,application_type,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,delinq_amnt,disbursement_method,dti,...,sub_grade,tax_liens,term,tot_coll_amt,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,verification_status,zip_code
0,NM,50000,Individual,27505,1312,62.5,0,0,Cash,12.34,...,C3,0,36 months,101,260757,35822,3500,32556,Source Verified,870xx


Get numeric variables (except `id`) and categorical variables from API data

In [199]:
numeric_variables = api_df._get_numeric_data().columns
numeric_variables = [variable for variable in numeric_variables if variable != "id"]
categorical_variables = api_df.select_dtypes(include="object").columns

### Preprocess numeric variables

Let's try on `annual_inc`:

In [200]:
mean = float(numeric_stats[numeric_stats["numeric_variable"] == "annual_inc"]["mean"])
std = float(numeric_stats[numeric_stats["numeric_variable"] == "annual_inc"]["std"])

In [201]:
mean

75230.38909480652

In [202]:
std

65243.68941579512

In [203]:
api_df["annual_inc"]

0    50000
Name: annual_inc, dtype: int64

In [204]:
api_df["annual_inc"].map(lambda i: (i - mean)/std)

0   -0.38671
Name: annual_inc, dtype: float64

In [205]:
api_df

Unnamed: 0,addr_state,annual_inc,application_type,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,delinq_amnt,disbursement_method,dti,...,sub_grade,tax_liens,term,tot_coll_amt,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,verification_status,zip_code
0,NM,50000,Individual,27505,1312,62.5,0,0,Cash,12.34,...,C3,0,36 months,101,260757,35822,3500,32556,Source Verified,870xx


Let's do it in every numeric variable:

In [206]:
for variable in numeric_variables:
    mean = float(numeric_stats[numeric_stats["numeric_variable"] == variable]["mean"])
    std = float(numeric_stats[numeric_stats["numeric_variable"] == variable]["std"])
    api_df[variable] = api_df[variable].map(lambda i: (i - mean)/std)

In [207]:
api_df

Unnamed: 0,addr_state,annual_inc,application_type,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,delinq_amnt,disbursement_method,dti,...,sub_grade,tax_liens,term,tot_coll_amt,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,verification_status,zip_code
0,NM,-0.38671,Individual,0.945461,-0.526496,0.109072,-0.358238,-0.018875,Cash,-0.62413,...,C3,-0.052897,36 months,-0.011239,0.590805,-0.216217,-0.77826,-0.118976,Source Verified,870xx


__Preprocess categorical data__

Let's try first on `grade` variable:

In [208]:
api_df["grade"]

0    C
Name: grade, dtype: object

In [209]:
api_df["grade"].map(lambda i: categorical_dict["grade"][i])

0    0.224259
Name: grade, dtype: float64

In [210]:
categorical_dict["grade"]["C"]

0.2242593518858696

Let's do it in every categorical variable:

In [211]:
api_df

Unnamed: 0,addr_state,annual_inc,application_type,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,delinq_amnt,disbursement_method,dti,...,sub_grade,tax_liens,term,tot_coll_amt,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,verification_status,zip_code
0,NM,-0.38671,Individual,0.945461,-0.526496,0.109072,-0.358238,-0.018875,Cash,-0.62413,...,C3,-0.052897,36 months,-0.011239,0.590805,-0.216217,-0.77826,-0.118976,Source Verified,870xx


In [212]:
for variable in categorical_variables:
    api_df[variable] = api_df[variable].map(lambda i: categorical_dict[variable][i])

In [213]:
api_df

Unnamed: 0,addr_state,annual_inc,application_type,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,delinq_amnt,disbursement_method,dti,...,sub_grade,tax_liens,term,tot_coll_amt,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,verification_status,zip_code
0,0.225694,-0.38671,0.206316,0.945461,-0.526496,0.109072,-0.358238,-0.018875,0.205857,-0.62413,...,0.227377,-0.052897,0.16571,-0.011239,0.590805,-0.216217,-0.77826,-0.118976,0.220237,0.250765


Last, we will use the model for predicting...

In [214]:
api_id = api_df["id"]

api_id

0    166105
Name: id, dtype: int64

In [217]:
api_data = api_df.drop("id", axis = 1)

api_data

Unnamed: 0,addr_state,annual_inc,application_type,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,delinq_amnt,disbursement_method,dti,...,sub_grade,tax_liens,term,tot_coll_amt,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,verification_status,zip_code
0,0.225694,-0.38671,0.206316,0.945461,-0.526496,0.109072,-0.358238,-0.018875,0.205857,-0.62413,...,0.227377,-0.052897,0.16571,-0.011239,0.590805,-0.216217,-0.77826,-0.118976,0.220237,0.250765


In [218]:
with open("../output/models/logistic_regression_model.sav","rb") as f:
    loaded_model = pickle.load(f)

pd.DataFrame(loaded_model.predict_proba(api_data)).loc[:,0]

0    0.738918
Name: 0, dtype: float64

In [227]:
print("Loading models...")
logistic_regression = pickle.load(open("../output/models/logistic_regression_model.sav","rb"))
random_forest = pickle.load(open("../output/models/random_forest_model.sav","rb"))
xg_boost = pickle.load(open("../output/models/xg_boost_model.sav","rb"))

print("Models have been loaded...doing predictions now...")
logit_predictions = list(pd.DataFrame(logistic_regression.predict_proba(api_data)).loc[:,1])
rf_predictions = list(pd.DataFrame(random_forest.predict_proba(api_data)).loc[:,1])
xg_predictions = list(pd.DataFrame(xg_boost.predict_proba(np.matrix(api_data))).loc[:,1])

final_predictions = pd.DataFrame({"id": api_id,
                                  "logit": logit_predictions,
                                  "rf": rf_predictions,
                                  "xg": xg_predictions})
print("Done!")
final_predictions.to_json(orient = "records")

Loading models...
Models have been loaded...doing predictions now...
Done!


'[{"id":166105,"logit":0.2610819251,"rf":0.1888843702,"xg":0.3054217696}]'

Querying the API
=====
__________

Let's try with just one observation:

In [234]:
test = pd.read_csv("/media/juanan/DATA/loan_data_analysis/data/loans_processed.csv", sep = "^").sample(1)

In [235]:
test = test.reset_index().\
       rename(columns = {'index':'id'}).\
       drop("loan_status", axis = 1)

In [236]:
test = test.to_json()

test

'{"id":{"0":196878},"num_bc_sats":{"0":4.0},"num_rev_tl_bal_gt_0":{"0":6.0},"grade":{"0":"C"},"avg_cur_bal":{"0":9981.0},"pub_rec_bankruptcies":{"0":1.0},"num_rev_accts":{"0":26.0},"tax_liens":{"0":0.0},"funded_amnt_inv":{"0":6000.0},"delinq_2yrs":{"0":0.0},"total_bal_ex_mort":{"0":21763.0},"pct_tl_nvr_dlq":{"0":96.9},"disbursement_method":{"0":"Cash"},"fico_range_low":{"0":670.0},"verification_status":{"0":"Verified"},"delinq_amnt":{"0":0.0},"purpose":{"0":"debt_consolidation"},"emp_title":{"0":"shipping\\/recieving"},"zip_code":{"0":"480xx"},"loan_amnt":{"0":6000.0},"installment":{"0":206.44},"fico_range_high":{"0":674.0},"annual_inc":{"0":40748.0},"term":{"0":" 36 months"},"int_rate":{"0":14.47},"emp_length":{"0":"2 years"},"revol_bal":{"0":3644.0},"application_type":{"0":"Individual"},"num_bc_tl":{"0":15.0},"num_sats":{"0":13.0},"tot_hi_cred_lim":{"0":101560.0},"tot_coll_amt":{"0":0.0},"initial_list_status":{"0":"w"},"bc_open_to_buy":{"0":988.0},"total_bc_limit":{"0":3100.0},"open_

In [250]:
pd.read_json(test, orient="records")

Unnamed: 0,id,num_bc_sats,num_rev_tl_bal_gt_0,grade,avg_cur_bal,pub_rec_bankruptcies,num_rev_accts,tax_liens,funded_amnt_inv,delinq_2yrs,...,addr_state,num_accts_ever_120_pd,total_il_high_credit_limit,bc_util,percent_bc_gt_75,sub_grade,mort_acc,num_op_rev_tl,dti,home_ownership
0,196878,4,6,C,9981,1,26,0,6000,0,...,MI,0,24386,68.1,0,C2,4,10,18.41,MORTGAGE


Let's launch the API in the shell with the following command 

`gunicorn --bind 0.0.0.0:8000 src.server:app`

Let's querying the API:

In [286]:
resp = requests.post("http://0.0.0.0:8000/predict",
              data = json.dumps(test),
              headers = {'Content-Type': 'application/json',
                         'Accept': 'application/json'})

In [287]:
resp

<Response [200]>

In [288]:
resp.json()

{'code': 200,
 'response': {'score': '[{"id":196878,"logit":0.2120499739,"rf":0.1915461482,"xg":0.3079130054}]'},
 'status': 'OK'}

Let's try with several observations:

In [289]:
test = pd.read_csv("/media/juanan/DATA/loan_data_analysis/data/loans_processed.csv", sep = "^").sample(10)

In [290]:
test = test.reset_index().\
       rename(columns = {'index':'id'}).\
       drop("loan_status", axis = 1)

In [291]:
test = test.to_json()

test

'{"id":{"0":63632,"1":457070,"2":582064,"3":38533,"4":752056,"5":609883,"6":645407,"7":50678,"8":110994,"9":182758},"num_bc_sats":{"0":11.0,"1":1.0,"2":7.0,"3":5.0,"4":1.0,"5":6.0,"6":91.7,"7":6.0,"8":7.0,"9":7.0},"num_rev_tl_bal_gt_0":{"0":21.0,"1":2.0,"2":7.0,"3":3.0,"4":2.0,"5":3.0,"6":91.7,"7":6.0,"8":10.0,"9":5.0},"grade":{"0":"B","1":"D","2":"C","3":"B","4":"C","5":"C","6":"B","7":"D","8":"C","9":"D"},"avg_cur_bal":{"0":5657.0,"1":1298.0,"2":13690.0,"3":41932.0,"4":7788.0,"5":20197.0,"6":91.7,"7":31356.0,"8":18524.0,"9":7567.0},"pub_rec_bankruptcies":{"0":0.0,"1":0.0,"2":0.0,"3":0.0,"4":0.0,"5":0.0,"6":0.0,"7":0.0,"8":0.0,"9":0.0},"num_rev_accts":{"0":44.0,"1":4.0,"2":17.0,"3":19.0,"4":5.0,"5":17.0,"6":91.7,"7":11.0,"8":22.0,"9":12.0},"tax_liens":{"0":0.0,"1":0.0,"2":0.0,"3":0.0,"4":0.0,"5":0.0,"6":0.0,"7":0.0,"8":0.0,"9":0.0},"funded_amnt_inv":{"0":9600.0,"1":6700.0,"2":12600.0,"3":29975.0,"4":20000.0,"5":29700.0,"6":4000.0,"7":35000.0,"8":26000.0,"9":30000.0},"delinq_2yrs":{"0"

In [292]:
resp = requests.post("http://0.0.0.0:8000/predict",
              data = json.dumps(test),
              headers = {'Content-Type': 'application/json',
                         'Accept': 'application/json'})

In [293]:
resp

<Response [200]>

In [294]:
resp.json()

{'code': 200,
 'response': {'score': '[{"id":63632,"logit":0.1593462283,"rf":0.2198941973,"xg":0.2086590976},{"id":457070,"logit":0.1526589482,"rf":0.2080917206,"xg":0.2551024258},{"id":582064,"logit":0.0952889671,"rf":0.1877994904,"xg":0.1068849415},{"id":38533,"logit":0.4320059544,"rf":0.306772172,"xg":0.2915415764},{"id":752056,"logit":0.1345828365,"rf":0.2044461973,"xg":0.1789424121},{"id":609883,"logit":0.164548906,"rf":0.2003882482,"xg":0.2376462966},{"id":645407,"logit":0.2222423053,"rf":0.3587959564,"xg":0.2657648623},{"id":50678,"logit":0.106234942,"rf":0.2420237246,"xg":0.3909677267},{"id":110994,"logit":0.1300851536,"rf":0.2335529262,"xg":0.4421696365},{"id":182758,"logit":0.1049188741,"rf":0.3017494169,"xg":0.3189023435}]'},
 'status': 'OK'}

__Great Job!__ :)