This notebook creates a baseline LightGBM model and performs the related feature engineering.

In [47]:
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
from pathlib import Path
%matplotlib inline
import pickle
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings
# raised by later cells; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [48]:
# Load the adjusted train and test frames, skipping the first two columns (id info).
train_df, test_df = (
    pd.read_pickle(pkl_path).iloc[:, 2:]
    for pkl_path in ('data/train_adj.pkl', 'data/test_adj.pkl')
)

In [49]:
import pickle

# Per-column mappings, later passed to Series.map for the categorical features.
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open('cat_encs.pickle', 'rb') as enc_file:
    cat_encs = pickle.load(enc_file)

# Names of the feature columns that are fed to the models.
with open('x_cols.pickle', 'rb') as cols_file:
    x_cols = pickle.load(cols_file)

In [None]:
import os

data_path = 'data/' # replace with name of your data file if different

# Credentials are read from the environment so they never have to be typed into
# (and saved with) the notebook. Both fall back to an empty string when unset.
key = os.environ.get('LENDINGCLUB_API_KEY', '') # API key sent as the Authorization header
acc_id = os.environ.get('LENDINGCLUB_ACCOUNT_ID', '') # investor account id

In [50]:
import requests, json, re

# `key` and `acc_id` come from the configuration cell.
api_key = {'Authorization': key}  # request header carrying the API key
investor_id = acc_id

# get loan listings data
loans = 'https://api.lendingclub.com/api/investor/v1/loans/listing'
# A timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(loans, headers=api_key, params={'showAll': True}, timeout=30)
# Fail here on an HTTP error (e.g. bad credentials) instead of on a confusing
# KeyError further down.
res.raise_for_status()
data = res.json()

if not data.get('loans'):
    raise ValueError('LendingClub listing response contained no loans')

# grabs the available features
avail_cols = list(data['loans'][0].keys())

In [51]:
# Build a DataFrame from the 'loans' entries of the API response.
df = pd.DataFrame(data['loans'])

In [52]:
# API field names that are renamed explicitly instead of by the automatic
# camelCase -> snake_case conversion below.
to_map = {'secAppCollections12MthsExMed': 'sec_app_collections_12_mths_ex_med',
          'secAppInqLast6Mths': 'sec_app_inq_last_6mths',
          'numAcctsEver120Ppd': 'num_accts_ever_120_pd',
          'inqLast6Mths': 'inq_last_6mths',
          'numTl120dpd2m': 'num_tl_120dpd_2m',
          'numTl30dpd': 'num_tl_30dpd',
          'numTl90gDpd24m': 'num_tl_90g_dpd_24m',
          'numTlOpPast12m': 'num_tl_op_past_12m',
          'collections12MthsExMed': 'collections_12_mths_ex_med',
          'isIncV': 'verification_status',
          'isIncVJoint': 'verification_status_joint',
          'openIl12m': 'open_il_12m',
          'openIl24m': 'open_il_24m',
          'openRv12m': 'open_rv_12m',
          'openRv24m': 'open_rv_24m',
          'secAppChargeoffWithin12Mths': 'sec_app_chargeoff_within_12_mths',
          'addrZip': 'zip_code',
          'accOpenPast24Mths': 'acc_open_past_24mths',
          'chargeoffWithin12Mths': 'chargeoff_within_12_mths',
          'inqLast12m': 'inq_last_12m',
          'delinq2Yrs': 'delinq_2yrs',
          'percentBcGt75': 'percent_bc_gt_75',
          'loanAmount': 'loan_amnt',
          'iLUtil': 'il_util',
         }


# cols dropped from the listed loans features (these features are not in historical data)
to_drop = ['reviewStatus', 'housingPayment', 'creditPullD', 'ilsExpD', 'mtgPayment', 'expD', 'acceptD',
          'investorCount','serviceFeeRate', 'disbursementMethod', 'listD', 'expDefaultRate',
          'reviewStatusD','fundedAmount']


# api_cols: converted names in column order; api_dict: original name -> converted name.
api_cols = []
api_dict = {}
for col in df.columns:
    # Explicitly mapped and dropped fields are not auto-converted.
    if col in to_map or col in to_drop:
        continue

    # Insert '_' before every capital letter, then lowercase.
    new_col = re.sub(r'([A-Z])', r'_\1', col).lower()
    # Insert '_' before every run of digits. The group has to wrap the whole run:
    # with `([0-9])+` the backreference holds only the last digit, so "12" became "_2".
    new_col = re.sub(r'([0-9]+)', r'_\1', new_col)
    api_cols.append(new_col)
    api_dict[col] = new_col

# Apply the automatic names first, then the explicit overrides.
df = df.rename(columns=api_dict).rename(columns=to_map)

# Print every model feature that is still missing after renaming.
for col in x_cols:
    if col not in df.columns:
        print(col)

In [53]:
# Keep only rows with a term of 36 and an application type of 'INDIVIDUAL'.
is_36_term = df['term'] == 36
is_individual = df['application_type'] == 'INDIVIDUAL'
df = df[is_36_term & is_individual]

In [54]:
# Restrict the frame to the model features plus the loan id.
keep_cols = x_cols + ['id']
df = df[keep_cols]

In [56]:
# Floor-divide emp_length by 12 and cap the result at 10.
df['emp_length'] = (df['emp_length'] // 12).clip(upper=10)

In [57]:
# Constant columns applied to every listing; MPRIME and prev_mo_cpi are used by
# later cells, issue_m is overwritten from issue_d further down.
df = df.assign(MPRIME=4.75, issue_m=1, prev_mo_cpi=258)

In [58]:
from datetime import date

# Drop listings that have no loan amount.
df = df.dropna(subset=['loan_amnt'])

# convert dates to useable formats
df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'], utc=True)
# Listings have no issue date yet, so today's date stands in for it.
df['issue_d'] = pd.to_datetime(date.today(), utc=True)

# convert earliest cr_line from date to months since loan request
# np.timedelta64(1, 'M') is an ambiguous unit that current pandas refuses to
# convert; use the explicit average month length instead (365.2425 days / 12 =
# 2,629,746 s, the value numpy used for 'M').
avg_month = pd.Timedelta(seconds=2629746)
df['earliest_cr_line'] = ((df['issue_d'] - df['earliest_cr_line']) / avg_month).astype(int)

In [59]:
# issue_m: calendar month taken from issue_d.
# int_rate: the rate minus MPRIME. Re-running this cell subtracts MPRIME again.
df = df.assign(
    issue_m=df['issue_d'].dt.month,
    int_rate=df['int_rate'] - df['MPRIME'],
)

In [60]:
# Columns that are rescaled by inflation_adj_2016 / prev_mo_cpi.
to_adj = ['annual_inc','loan_amnt','installment',
          'revol_bal', 'avg_cur_bal','bc_open_to_buy',
          'total_il_high_credit_limit','total_bc_limit',
          'total_rev_hi_lim','tot_hi_cred_lim','total_bal_ex_mort',
          'tot_cur_bal','tot_coll_amt','delinq_amnt']
inflation_adj_2016 = 237.833

# Only columns that are actual model features are adjusted.
for money_col in (c for c in to_adj if c in x_cols):
    rebased = df[money_col] / df['prev_mo_cpi'] * inflation_adj_2016
    df[money_col] = rebased.round(decimals=2)

In [61]:
# Replace each categorical feature with its value from the stored mapping.
for col, encoding in cat_encs.items():
    # emp_length is handled numerically in an earlier cell; columns that are not
    # model features are skipped.
    if col == 'emp_length' or col not in x_cols:
        continue
    print(col)
    # Series.map only accepts na_action=None or 'ignore'; the former 'UNK' value was
    # ignored by old pandas and raises ValueError in current releases, so it is dropped.
    # NOTE(review): if unseen categories were meant to fall back to an 'UNK' code,
    # that needs an explicit fill step; confirm against how cat_encs was built.
    df[col] = df[col].map(encoding)

sub_grade
home_ownership
zip_code
emp_title


In [62]:
import os

# Every saved booster in models/ (*.txt); sorted so the summation order is stable.
model_files = sorted(m for m in os.listdir('models/') if m.endswith('.txt'))
if not model_files:
    raise FileNotFoundError("no LightGBM model files (*.txt) found in 'models/'")

# Average over the number of models actually found; the previous fixed divisor
# of 5 was only correct when exactly five model files were present.
n_models = len(model_files)
df['pred_returns'] = 0.0
for m in model_files:
    model = lgb.Booster(model_file=f'models/{m}')
    preds = model.predict(df[x_cols])
    df['pred_returns'] += preds / n_models

In [63]:
# Order rows by pred_returns, highest first.
df = df.sort_values(by='pred_returns', ascending=False)

In [64]:
# Display the scored listings, highest pred_returns first.
df

Unnamed: 0,loan_amnt,int_rate,installment,sub_grade,emp_length,home_ownership,annual_inc,zip_code,emp_title,acc_open_past_24mths,bc_open_to_buy,bc_util,dti,earliest_cr_line,mths_since_last_record,mths_since_recent_inq,mths_since_recent_bc,revol_bal,revol_util,total_bc_limit,total_il_high_credit_limit,tot_hi_cred_lim,avg_cur_bal,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,id,MPRIME,issue_m,prev_mo_cpi,issue_d,pred_returns
19,7098.12,9.55,243.64,10,10.0,1,92183.33,867,139300.0,7,8709.48,25.6,8.35,178,68.0,1.0,1,5128.16,20.5,11707.28,21799.51,46873.38,1631.64,177.0,178,165784819,4.75,1,258,2020-01-28 00:00:00+00:00,0.062834
33,18436.67,15.8,690.35,16,5.0,5,41482.5,264,149388.0,1,11344.08,37.8,5.57,175,,10.0,31,7170.94,23.2,18252.3,74930.3,105811.72,8457.82,165.0,175,165767908,4.75,1,258,2020-01-28 00:00:00+00:00,0.058433
20,9218.33,11.37,324.64,12,10.0,4,46091.67,806,134444.0,3,3915.03,43.4,27.94,225,73.0,8.0,37,9096.65,51.1,6913.75,30345.83,48137.21,4028.41,87.0,225,165504196,4.75,1,258,2020-01-28 00:00:00+00:00,0.041566
25,9218.33,12.2,328.43,13,1.0,1,39638.83,305,148366.0,4,585.36,90.9,14.21,44,,2.0,3,7482.52,47.7,6452.83,20056.33,189599.91,21881.56,25.0,44,165785103,4.75,1,258,2020-01-28 00:00:00+00:00,0.041306
46,29498.67,6.96,975.71,7,0.0,1,110620.0,838,83310.0,14,36047.37,38.8,20.49,77,,9.0,11,24134.52,35.1,58905.15,98274.81,473937.56,14148.3,66.0,77,165707095,4.75,1,258,2020-01-28 00:00:00+00:00,0.038566
23,11062.0,12.99,398.48,14,10.0,5,67293.83,314,137822.0,3,428.65,94.8,7.07,183,,22.0,3,11577.3,78.5,8204.32,0.0,14749.33,1052.73,100.0,183,165767116,4.75,1,258,2020-01-28 00:00:00+00:00,0.03792
0,9218.33,18.3,357.08,17,1.0,1,73746.67,313,136450.0,12,21190.18,23.1,7.91,79,60.0,0.0,4,9398.09,22.7,27562.82,8344.44,373332.36,23977.81,36.0,79,165780005,4.75,1,258,2020-01-28 00:00:00+00:00,0.036529
35,18436.67,8.33,621.91,9,8.0,1,51838.38,159,38445.0,1,986.36,75.1,17.09,136,,15.0,74,3003.33,63.9,3963.88,26834.57,201700.82,23065.19,136.0,74,165708192,4.75,1,258,2020-01-28 00:00:00+00:00,0.0364
45,36873.33,3.44,1158.72,3,4.0,1,115229.17,707,132070.0,3,11169.85,71.9,21.16,211,,12.0,53,29276.5,58.4,39731.02,65809.68,351485.83,18525.16,160.0,211,164955744,4.75,1,258,2020-01-28 00:00:00+00:00,0.035339
21,13827.5,7.65,461.92,8,1.0,4,276550.0,866,21207.0,3,19494.93,52.6,7.32,148,,1.0,4,21618.84,52.6,41113.77,28818.35,88258.17,6167.99,61.0,148,165702455,4.75,1,258,2020-01-28 00:00:00+00:00,0.035255
