In [1]:
# for prelim cleaning, find rows that need to be removed, convert object cols to floats where necessary
# goal just to make data usable
# fill nulls and engineer features later b/c it may be different for different models

In [2]:
# create a library that will clean data and show missing/ cat value counts faster
# for each col check how many null, if cat how many cat and num per cat, check min, max, mean, std

In [3]:
import os, warnings, gc
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)

Data folder also contains rejected loans. For now only focus on approved loans.

In [4]:
data_path = Path('data/')
files = os.listdir(data_path)
approved_files = [f for f in files if f[0]=='L']

Limit features to ones that are present at time of loan application

In [5]:
cols_of_interest = ['issue_d','loan_status','total_pymnt', 'total_rec_int','total_rec_late_fee','total_rec_prncp']

In [6]:
cols = ['id','loan_amnt', 'purpose', 'grade', 'sub_grade', 'int_rate', 'term',
       'installment', 'home_ownership', 'emp_length', 'emp_title',
       'annual_inc_joint', 'dti_joint', 'annual_inc', 'dti', 'zip_code',
       'addr_state', 'fico_range_low','fico_range_high', 'earliest_cr_line',
       'open_acc', 'total_acc', 'revol_bal','revol_util', 'inq_last_6mths',
       'acc_now_delinq', 'delinq_amnt' , 'delinq_2yrs',
       'mths_since_last_delinq', 'pub_rec','mths_since_last_record',
       'mths_since_last_major_derog', 'collections_12_mths_ex_med',
       'verification_status', 'issue_d', 'loan_status',
        'total_pymnt', 'total_rec_int','total_rec_late_fee','total_rec_prncp']
       

fl_to_int = ['open_acc', 'loan_amnt', 'fico_range_low', 'fico_range_high', 'total_acc', 'revol_bal', 
            'inq_last_6mths', 'acc_now_delinq', 'delinq_amnt', 'delinq_2yrs', 'mths_since_last_delinq',
             'pub_rec', 'mths_since_last_record', 'mths_since_last_major_derog', 'collections_12_mths_ex_med']

In [7]:
to_map = {'secAppCollections12MthsExMed': 'sec_app_collections_12_mths_ex_med',
          'secAppInqLast6Mths': 'sec_app_inq_last_6mths',
          'numAcctsEver120Ppd': 'num_accts_ever_120_pd',
          'inqLast6Mths': 'inq_last_6mths',
          'numTl120dpd2m': 'num_tl_120dpd_2m',
          'numTl30dpd': 'num_tl_30dpd',
          'numTl90gDpd24m': 'num_tl_90g_dpd_24m',
          'numTlOpPast12m': 'num_tl_op_past_12m',
          'collections12MthsExMed': 'collections_12_mths_ex_med',
          'isIncV': 'verification_status',
          'isIncVJoint': 'verification_status_joint',
          'openIl12m': 'open_il_12m',
          'openIl24m': 'open_il_24m',
          'openRv12m': 'open_rv_12m',
          'openRv24m': 'open_rv_24m',
          'secAppChargeoffWithin12Mths': 'sec_app_chargeoff_within_12_mths',
          'addrZip': 'zip_code',
          'accOpenPast24Mths': 'acc_open_past_24mths',
          'chargeoffWithin12Mths': 'chargeoff_within_12_mths',
          'inqLast12m': 'inq_last_12m',
          'delinq2Yrs': 'delinq_2yrs',
          'percentBcGt75': 'percent_bc_gt_75',
          'loanAmount': 'loan_amnt',
          'iLUtil': 'il_util',
          
         }

In [8]:
to_drop = ['reviewStatus', 'housingPayment', 'creditPullD', 'ilsExpD', 'mtgPayment', 'expD', 'acceptD',
          'investorCount','serviceFeeRate', 'disbursementMethod', 'listD', 'expDefaultRate',
          'reviewStatusD','fundedAmount']

In [9]:
cat_cols = ['purpose', 'grade', 'sub_grade', 'term', 'home_ownerhsip', 'emp_length', 'zip_code',
            'addr_state', ]

In [11]:
def cleaning(df):
            
    # drop non_loan columns
    index = df[df['loan_amnt'].isnull()].index
    df.drop(index=index, inplace=True)
    
    # convert % rate from str to float
    df['int_rate'] = df['int_rate'].str[:-1].astype(float)
    df['revol_util'] = df['revol_util'].str[:-1].astype(float)
    
    # convert term to float
    df['term'] = df['term'].str.strip().str[:2]

    # col too messy for now, maybe try later
    #df.drop(columns='emp_title', inplace=True)
    
    # drop weird old columns
    index = (df[(df['loan_status'] == 'Does not meet the credit policy. Status:Charged Off')
            |(df['loan_status'] == 'Does not meet the credit policy. Status:Fully Paid')]).index
    df.drop(index=index, inplace=True)
    
    # convert dates to useable formats
    df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'], format='%b-%Y')
    df['issue_d'] = pd.to_datetime(df['issue_d'], format='%b-%Y')
    
    # time since earliest cr_line
    df['earliest_cr_line'] = ((df['issue_d'] - df['earliest_cr_line']) / np.timedelta64(1, 'M')).astype(int)
    
    # convert floats to ints
    for col in fl_to_int:
        try: df[col] = df[col].astype(int)
        except: pass    
    
    return df[api_cols+cols_of_interest]

In [12]:
def get_data(approved_files, data_path):
    df = pd.DataFrame()
    for file in approved_files:
        print('reading in {}'.format(file))
        temp_df = pd.read_csv(data_path/file,header=1)
        temp_df = cleaning(temp_df)
        df = pd.concat([df,temp_df],ignore_index=True)
    del temp_df
    return df

In [130]:
import requests
from requests.auth import HTTPDigestAuth
import json

api_key =  {'Authorization': 'JX4Wu5IqtrfKwUqUIHAikOoEoNQ='}

investor_id = '124665850'
SubResource = 'summary'

loans = 'https://api.lendingclub.com/api/investor/v1/loans/listing'
res = requests.get(loans, headers=api_key)
data = json.loads(res.text)

avail_cols = list(data['loans'][0].keys())

In [131]:
import re
api_cols = []
for col in avail_cols:
    if col in list(to_map.keys()):
        new_col = to_map[col]
        api_cols.append(new_col)
        continue
    if col in to_drop:
        continue
    new_col = re.sub(r'([A-Z])', r'_\1', col).lower()
    new_col = re.sub(r'([0-9])+', r'_\1', new_col).lower()
    api_cols.append(new_col)

In [13]:
df = get_data(approved_files, data_path)

reading in LoanStats_securev1_2018Q4.csv
reading in LoanStats3b_securev1.csv
reading in LoanStats3c_securev1.csv
reading in LoanStats3d_securev1.csv
reading in LoanStats_securev1_2018Q2.csv
reading in LoanStats_securev1_2018Q3.csv
reading in LoanStats_securev1_2018Q1.csv
reading in LoanStats_securev1_2019Q1.csv
reading in LoanStats_securev1_2017Q1.csv
reading in LoanStats_securev1_2017Q2.csv
reading in LoanStats_securev1_2017Q3.csv
reading in LoanStats_securev1_2017Q4.csv
reading in LoanStats_securev1_2016Q2.csv
reading in LoanStats3a_securev1.csv
reading in LoanStats_securev1_2016Q3.csv
reading in LoanStats_securev1_2016Q1.csv
reading in LoanStats_securev1_2016Q4.csv


In [18]:
explore = df[(~df['settlement_amount'].isnull())&(df['issue_d'].dt.year==2014)].copy()

In [40]:
val = df[df['issue_d'].dt.year==2014].copy()

In [44]:
loans = val.loan_amnt.sum()

In [46]:
paid = val.total_pymnt.sum()

In [52]:
(paid/loans)**(1/3)

1.036684711757115

In [50]:
math.exp(1/3)

1.3956124250860895

In [51]:
from math import exp
(paid/loans).exp(1/3)

AttributeError: 'numpy.float64' object has no attribute 'exp'

In [43]:
val.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,...,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
316593,36805548,,10400,10400.0,10400.0,36,6.99,321.08,A,A3,Truck Driver Delivery Personel,8 years,MORTGAGE,58000.0,Not Verified,2014-12-01,Charged Off,n,https://lendingclub.com/browse/loanDetail.acti...,,credit_card,Credit card refinancing,937xx,CA,14.92,...,,,,N,,,,,,,,,,,,,,,N,,,,,,
316594,38098114,,15000,15000.0,15000.0,60,12.39,336.64,C,C1,MANAGEMENT,10+ years,RENT,78000.0,Source Verified,2014-12-01,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,235xx,VA,12.03,...,,,,N,,,,,,,,,,,,,,,N,,,,,,
316595,37822187,,9600,9600.0,9600.0,36,13.66,326.53,C,C3,Admin Specialist,10+ years,RENT,69000.0,Source Verified,2014-12-01,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,077xx,NJ,25.81,...,,,,N,,,,,,,,,,,,,,,N,,,,,,
316596,37662224,,7650,7650.0,7650.0,36,13.66,260.2,C,C3,Technical Specialist,< 1 year,RENT,50000.0,Source Verified,2014-12-01,Charged Off,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,850xx,AZ,34.81,...,,,,N,,,,,,,,,,,,,,,N,,,,,,
316597,37612354,,12800,12800.0,12800.0,60,17.14,319.08,D,D4,Senior Sales Professional,10+ years,MORTGAGE,125000.0,Verified,2014-12-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,car,Car financing,953xx,CA,8.31,...,,,,N,,,,,,,,,,,,,,,N,,,,,,


In [32]:
0.4498/753.00

0.00059734395750332

In [25]:
3156.58+744.25

3900.83

In [24]:
4903.83-744.25-3156.58

1003.0

In [39]:
753/0.45

1673.3333333333333

In [38]:
1003.*0.45

451.35

In [34]:
3156.58+744.25

3900.83

In [37]:
explore.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,...,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
316619,37661949,,5000,5000.0,5000.0,36,10.49,162.49,B,B3,Teacher,4 years,MORTGAGE,52000.0,Source Verified,2014-12-01,Charged Off,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,853xx,AZ,14.35,...,,,,N,,,,,,,,,,,,,,,Y,Aug-2018,COMPLETE,Jan-2018,753.0,44.98,6.0
316705,37752007,,22200,22200.0,22200.0,60,17.14,553.4,D,D4,Clinical Data Lead,10+ years,RENT,74500.0,Source Verified,2014-12-01,Charged Off,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,015xx,MA,8.05,...,,,,N,,,,,,,,,,,,,,,Y,Jul-2017,BROKEN,May-2017,9488.02,45.0,24.0
316765,37781998,,14400,14400.0,14400.0,60,19.24,375.45,E,E2,Administrative Assistant,10+ years,MORTGAGE,70000.0,Verified,2014-12-01,Charged Off,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,105xx,NY,26.81,...,,,,N,,,,,,,,,,,,,,,Y,Dec-2018,COMPLETE,Apr-2017,6857.0,44.99,24.0
316766,12389132,,11200,11200.0,11200.0,60,14.31,262.41,C,C4,Teacher,10+ years,RENT,40000.0,Source Verified,2014-12-01,Charged Off,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,880xx,NM,29.07,...,,,,N,,,,,,,,,,,,,,,Y,Apr-2019,COMPLETE,May-2018,2668.0,45.01,16.0
316783,37831802,,12000,12000.0,12000.0,60,17.86,303.81,D,D5,Teacher,5 years,RENT,45000.0,Verified,2014-12-01,Charged Off,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,891xx,NV,30.67,...,,,,N,,,,,,,,,,,,,,,Y,Nov-2018,ACTIVE,Nov-2018,2389.0,45.0,12.0


In [23]:
explore[cols_of_interest + ['loan_amnt','settlement_amount', 'settlement_status']]

Unnamed: 0,issue_d,loan_status,total_pymnt,total_rec_int,total_rec_late_fee,total_rec_prncp,loan_amnt,settlement_amount,settlement_status
316619,2014-12-01,Charged Off,4903.830000,744.25,0.00,3156.58,5000,753.00,COMPLETE
316705,2014-12-01,Charged Off,6793.720000,2981.37,0.00,2520.92,22200,9488.02,BROKEN
316765,2014-12-01,Charged Off,7584.810000,436.35,0.00,291.46,14400,6857.00,COMPLETE
316766,2014-12-01,Charged Off,11838.990000,3614.74,0.00,5556.25,11200,2668.00,COMPLETE
316783,2014-12-01,Charged Off,12694.300000,5440.07,0.00,7004.23,12000,2389.00,ACTIVE
316788,2014-12-01,Charged Off,1703.070000,518.57,0.00,1130.23,3375,1089.37,BROKEN
316814,2014-12-01,Charged Off,8179.430000,1874.07,0.00,5485.36,7175,820.00,COMPLETE
316840,2014-12-01,Charged Off,18188.130000,3485.16,32.95,14670.02,20000,7305.37,COMPLETE
317160,2014-12-01,Charged Off,11394.150000,883.62,0.00,5083.98,15000,5126.55,COMPLETE
317187,2014-12-01,Charged Off,12251.730000,1737.72,0.00,10007.47,18000,4134.96,BROKEN


In [14]:
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,...,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,145647242,,9600,9600.0,9600.0,36,12.98,323.37,B,B5,,,MORTGAGE,35704.0,Not Verified,2018-12-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,home_improvement,Home improvement,401xx,KY,0.84,...,,,,N,,,,,,,,,,,,,,,N,,,,,,
1,145248657,,4000,4000.0,4000.0,36,23.4,155.68,E,E1,Security,3 years,RENT,90000.0,Source Verified,2018-12-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,070xx,NJ,26.33,...,,,,N,,,,,,,,,,,,,,,N,,,,,,
2,145640422,,2500,2500.0,2500.0,36,13.56,84.92,C,C1,Chef,10+ years,RENT,55000.0,Not Verified,2018-12-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,109xx,NY,18.24,...,,,,N,,,,,,,,,,,,,,,N,,,,,,
3,145631930,,30000,30000.0,30000.0,60,18.94,777.23,D,D2,Postmaster,10+ years,MORTGAGE,90000.0,Source Verified,2018-12-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,713xx,LA,26.52,...,,,,N,,,,,,,,,,,,,,,N,,,,,,
4,145638579,,5000,5000.0,5000.0,36,17.97,180.69,D,D1,Administrative,6 years,MORTGAGE,59280.0,Source Verified,2018-12-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,490xx,MI,10.51,...,,,,N,,,,,,,,,,,,,,,N,,,,,,


In [15]:
list(df.columns)

['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'fico_range_low',
 'fico_range_high',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'last_fico_range_high',
 'last_fico_range_low',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'application_type',
 'annual_inc_joint',
 '

In [66]:
len(avail_cols)

119

In [67]:
df_cols = list(df.columns)

In [68]:
df_cols.sort()

In [99]:
len(api_cols)

105

In [100]:
api_cols

['id',
 'member_id',
 'loan_amnt',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'desc',
 'purpose',
 'zip_code',
 'addr_state',
 'initial_list_status',
 'emp_title',
 'acc_now_delinq',
 'acc_open_past_24mths',
 'bc_open_to_buy',
 'percent_bc_gt_75',
 'bc_util',
 'dti',
 'delinq_2yrs',
 'delinq_amnt',
 'earliest_cr_line',
 'fico_range_low',
 'fico_range_high',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'mths_since_recent_inq',
 'mths_since_recent_revol_delinq',
 'mths_since_recent_bc',
 'mort_acc',
 'open_acc',
 'pub_rec',
 'total_bal_ex_mort',
 'revol_bal',
 'revol_util',
 'total_bc_limit',
 'total_acc',
 'total_il_high_credit_limit',
 'num_rev_accts',
 'mths_since_recent_bc_dlq',
 'pub_rec_bankruptcies',
 'num_accts_ever_120_pd',
 'chargeoff_within_12_mths',
 'collections_12_mths_ex_med',
 'tax_liens',
 'mths_since_last_major_derog',
 'num_sats',
 'num_tl_op_past_1

In [90]:
df_cols

['acc_now_delinq',
 'acc_open_past_24mths',
 'addr_state',
 'all_util',
 'annual_inc',
 'annual_inc_joint',
 'application_type',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'collection_recovery_fee',
 'collections_12_mths_ex_med',
 'debt_settlement_flag',
 'debt_settlement_flag_date',
 'deferral_term',
 'delinq_2yrs',
 'delinq_amnt',
 'desc',
 'dti',
 'dti_joint',
 'earliest_cr_line',
 'emp_length',
 'emp_title',
 'fico_range_high',
 'fico_range_low',
 'funded_amnt',
 'funded_amnt_inv',
 'grade',
 'hardship_amount',
 'hardship_dpd',
 'hardship_end_date',
 'hardship_flag',
 'hardship_last_payment_amount',
 'hardship_length',
 'hardship_loan_status',
 'hardship_payoff_balance_amount',
 'hardship_reason',
 'hardship_start_date',
 'hardship_status',
 'hardship_type',
 'home_ownership',
 'id',
 'il_util',
 'initial_list_status',
 'inq_fi',
 'inq_last_12m',
 'inq_last_6mths',
 'installment',
 'int_rate',
 'issue_d',
 'last_credit_pull_d',
 'last_fico_range_hi

In [64]:
avail_cols.sort()
avail_cols

['accNowDelinq',
 'accOpenPast24Mths',
 'acceptD',
 'addrState',
 'addrZip',
 'allUtil',
 'annualInc',
 'annualIncJoint',
 'applicationType',
 'avgCurBal',
 'bcOpenToBuy',
 'bcUtil',
 'chargeoffWithin12Mths',
 'collections12MthsExMed',
 'creditPullD',
 'delinq2Yrs',
 'delinqAmnt',
 'desc',
 'disbursementMethod',
 'dti',
 'dtiJoint',
 'earliestCrLine',
 'empLength',
 'empTitle',
 'expD',
 'expDefaultRate',
 'ficoRangeHigh',
 'ficoRangeLow',
 'fundedAmount',
 'grade',
 'homeOwnership',
 'housingPayment',
 'iLUtil',
 'id',
 'ilsExpD',
 'initialListStatus',
 'inqFi',
 'inqLast12m',
 'inqLast6Mths',
 'installment',
 'intRate',
 'investorCount',
 'isIncV',
 'isIncVJoint',
 'listD',
 'loanAmount',
 'maxBalBc',
 'memberId',
 'moSinOldIlAcct',
 'moSinOldRevTlOp',
 'moSinRcntRevTlOp',
 'moSinRcntTl',
 'mortAcc',
 'mtgPayment',
 'mthsSinceLastDelinq',
 'mthsSinceLastMajorDerog',
 'mthsSinceLastRecord',
 'mthsSinceRcntIl',
 'mthsSinceRecentBc',
 'mthsSinceRecentBcDlq',
 'mthsSinceRecentInq',
 'mth

In [106]:
df[api_cols + cols_of_interest].to_csv('data/from_api.csv')

In [134]:
df.to_csv('data/from_api2.csv')

In [107]:
df = df[api_cols + cols_of_interest]

In [143]:
df = pd.read_csv('data/from_api2.csv')

In [None]:
df.issue_d.min()

In [146]:
df[df['term']==36]

Unnamed: 0.1,Unnamed: 0,id,member_id,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,desc,purpose,zip_code,addr_state,initial_list_status,emp_title,acc_now_delinq,acc_open_past_24mths,bc_open_to_buy,percent_bc_gt_75,bc_util,dti,...,max_bal_bc,all_util,inq_fi,total_cu_tl,inq_last_12m,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,revol_bal_joint,open_act_il,sec_app_open_act_il,issue_d,loan_status,total_pymnt,total_rec_int,total_rec_late_fee,total_rec_prncp
0,0,145647242,,9600,36,12.98,323.37,B,B5,,MORTGAGE,35704.0,Not Verified,,home_improvement,401xx,KY,w,,0,3.0,3452.0,0.0,17.8,0.84,...,748.0,12.0,0.0,0.0,1.0,,,,,,,,,,,,,0.0,,2018-12-01,Current,1317.720000,425.24,0.00,892.48
1,1,145248657,,4000,36,23.40,155.68,E,E1,3 years,RENT,90000.0,Source Verified,,debt_consolidation,070xx,NJ,w,Security,0,15.0,20174.0,0.0,7.9,26.33,...,1353.0,68.0,4.0,0.0,4.0,,,,,,,,,,,,,4.0,,2018-12-01,Current,770.600000,366.75,0.00,403.85
2,2,145640422,,2500,36,13.56,84.92,C,C1,10+ years,RENT,55000.0,Not Verified,,debt_consolidation,109xx,NY,w,Chef,0,9.0,34360.0,0.0,5.9,18.24,...,2137.0,28.0,1.0,11.0,2.0,,,,,,,,,,,,,2.0,,2018-12-01,Current,421.780000,131.95,0.00,289.83
4,4,145638579,,5000,36,17.97,180.69,D,D1,6 years,MORTGAGE,59280.0,Source Verified,,debt_consolidation,490xx,MI,w,Administrative,0,4.0,13800.0,0.0,0.0,10.51,...,0.0,35.0,1.0,5.0,0.0,,,,,,,,,,,,,1.0,,2018-12-01,Current,715.270000,282.84,0.00,432.43
5,5,145217616,,10000,36,10.33,324.23,B,B1,< 1 year,MORTGAGE,280000.0,Not Verified,,debt_consolidation,974xx,OR,w,,0,7.0,11897.0,28.6,43.1,6.15,...,4923.0,46.0,2.0,7.0,1.0,,,,,,,,,,,,,3.0,,2018-12-01,Current,1612.540000,401.13,0.00,1211.41
7,7,144858623,,8000,36,23.40,311.35,E,E1,10+ years,OWN,43000.0,Source Verified,,debt_consolidation,357xx,AL,w,Manager,0,2.0,126.0,100.0,94.5,33.24,...,2174.0,72.0,1.0,1.0,1.0,,,,,,,,,,,,,3.0,,2018-12-01,Current,1541.150000,733.51,0.00,807.64
14,14,145233976,,13000,36,23.40,505.95,E,E1,2 years,MORTGAGE,90000.0,Verified,,other,191xx,PA,w,Sale Representative,0,4.0,9503.0,66.7,83.8,39.73,...,18037.0,64.0,2.0,0.0,1.0,,,,,,,,,,,,,7.0,,2018-12-01,Current,2504.400000,1191.95,0.00,1312.45
15,15,145509846,,9600,36,23.40,373.62,E,E1,9 years,RENT,65000.0,Not Verified,,credit_card,265xx,WV,f,driver coordinator,0,1.0,232.0,85.7,96.9,23.01,...,2264.0,35.0,0.0,0.0,0.0,,,,,,,,,,,,,2.0,,2018-12-01,Current,1849.380000,880.21,0.00,969.17
16,16,145399386,,3500,36,20.89,131.67,D,D4,10+ years,MORTGAGE,40000.0,Source Verified,,car,078xx,NJ,w,gas attendant,0,3.0,3683.0,0.0,26.3,9.09,...,1317.0,87.0,1.0,0.0,3.0,630.0,634.0,May-2003,0.0,3.0,6.0,47.9,21.0,0.0,0.0,46.0,6902.0,1.0,0.0,2018-12-01,Current,652.260000,286.02,0.00,366.24
20,20,145621687,,24000,36,15.02,832.21,C,C3,3 years,OWN,105000.0,Not Verified,,credit_card,711xx,LA,w,MAINTENANCE PLANNER,0,13.0,7774.0,50.0,65.4,22.14,...,6618.0,71.0,3.0,4.0,4.0,,,,,,,,,,,,,2.0,,2018-12-01,Current,4131.010000,1404.56,0.00,2726.45


In [144]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,member_id,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,desc,purpose,zip_code,addr_state,initial_list_status,emp_title,acc_now_delinq,acc_open_past_24mths,bc_open_to_buy,percent_bc_gt_75,bc_util,dti,...,max_bal_bc,all_util,inq_fi,total_cu_tl,inq_last_12m,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,revol_bal_joint,open_act_il,sec_app_open_act_il,issue_d,loan_status,total_pymnt,total_rec_int,total_rec_late_fee,total_rec_prncp
0,0,145647242,,9600,36,12.98,323.37,B,B5,,MORTGAGE,35704.0,Not Verified,,home_improvement,401xx,KY,w,,0,3.0,3452.0,0.0,17.8,0.84,...,748.0,12.0,0.0,0.0,1.0,,,,,,,,,,,,,0.0,,2018-12-01,Current,1317.72,425.24,0.0,892.48
1,1,145248657,,4000,36,23.4,155.68,E,E1,3 years,RENT,90000.0,Source Verified,,debt_consolidation,070xx,NJ,w,Security,0,15.0,20174.0,0.0,7.9,26.33,...,1353.0,68.0,4.0,0.0,4.0,,,,,,,,,,,,,4.0,,2018-12-01,Current,770.6,366.75,0.0,403.85
2,2,145640422,,2500,36,13.56,84.92,C,C1,10+ years,RENT,55000.0,Not Verified,,debt_consolidation,109xx,NY,w,Chef,0,9.0,34360.0,0.0,5.9,18.24,...,2137.0,28.0,1.0,11.0,2.0,,,,,,,,,,,,,2.0,,2018-12-01,Current,421.78,131.95,0.0,289.83
3,3,145631930,,30000,60,18.94,777.23,D,D2,10+ years,MORTGAGE,90000.0,Source Verified,,debt_consolidation,713xx,LA,w,Postmaster,0,10.0,13761.0,0.0,8.3,26.52,...,998.0,57.0,2.0,15.0,2.0,,,,,,,,,,,,,4.0,,2018-12-01,Current,11338.8,2210.86,0.0,9127.94
4,4,145638579,,5000,36,17.97,180.69,D,D1,6 years,MORTGAGE,59280.0,Source Verified,,debt_consolidation,490xx,MI,w,Administrative,0,4.0,13800.0,0.0,0.0,10.51,...,0.0,35.0,1.0,5.0,0.0,,,,,,,,,,,,,1.0,,2018-12-01,Current,715.27,282.84,0.0,432.43


In [145]:
df.shape

(2373594, 112)

In [12]:
df.to_csv(data_path/'prelim-clean.csv', index=False)

march 2019

test set march 2015 -> march 2016
val set march 2014 -> feb 2015
train set begin -> feb 2014

In [147]:
df = df[df['term']==36]

In [148]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,member_id,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,desc,purpose,zip_code,addr_state,initial_list_status,emp_title,acc_now_delinq,acc_open_past_24mths,bc_open_to_buy,percent_bc_gt_75,bc_util,dti,...,max_bal_bc,all_util,inq_fi,total_cu_tl,inq_last_12m,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,revol_bal_joint,open_act_il,sec_app_open_act_il,issue_d,loan_status,total_pymnt,total_rec_int,total_rec_late_fee,total_rec_prncp
0,0,145647242,,9600,36,12.98,323.37,B,B5,,MORTGAGE,35704.0,Not Verified,,home_improvement,401xx,KY,w,,0,3.0,3452.0,0.0,17.8,0.84,...,748.0,12.0,0.0,0.0,1.0,,,,,,,,,,,,,0.0,,2018-12-01,Current,1317.72,425.24,0.0,892.48
1,1,145248657,,4000,36,23.4,155.68,E,E1,3 years,RENT,90000.0,Source Verified,,debt_consolidation,070xx,NJ,w,Security,0,15.0,20174.0,0.0,7.9,26.33,...,1353.0,68.0,4.0,0.0,4.0,,,,,,,,,,,,,4.0,,2018-12-01,Current,770.6,366.75,0.0,403.85
2,2,145640422,,2500,36,13.56,84.92,C,C1,10+ years,RENT,55000.0,Not Verified,,debt_consolidation,109xx,NY,w,Chef,0,9.0,34360.0,0.0,5.9,18.24,...,2137.0,28.0,1.0,11.0,2.0,,,,,,,,,,,,,2.0,,2018-12-01,Current,421.78,131.95,0.0,289.83
4,4,145638579,,5000,36,17.97,180.69,D,D1,6 years,MORTGAGE,59280.0,Source Verified,,debt_consolidation,490xx,MI,w,Administrative,0,4.0,13800.0,0.0,0.0,10.51,...,0.0,35.0,1.0,5.0,0.0,,,,,,,,,,,,,1.0,,2018-12-01,Current,715.27,282.84,0.0,432.43
5,5,145217616,,10000,36,10.33,324.23,B,B1,< 1 year,MORTGAGE,280000.0,Not Verified,,debt_consolidation,974xx,OR,w,,0,7.0,11897.0,28.6,43.1,6.15,...,4923.0,46.0,2.0,7.0,1.0,,,,,,,,,,,,,3.0,,2018-12-01,Current,1612.54,401.13,0.0,1211.41


In [149]:
gc.collect()

72

In [152]:
df['issue_d'] = pd.to_datetime(df['issue_d'])

In [153]:
train_df = df[df['issue_d'].dt.year<2014]
val_df = df[df['issue_d'].dt.year==2014]
test_df = df[df['issue_d'].dt.year==2015]

In [154]:
df['loan_status'].value_counts()

Fully Paid            908443
Current               587644
Charged Off           170920
Late (31-120 days)     12153
In Grace Period         3854
Late (16-30 days)       2720
Default                   11
Name: loan_status, dtype: int64

In [155]:
val_df.loan_status.value_counts()

Fully Paid     140255
Charged Off     22315
Name: loan_status, dtype: int64

In [156]:
train_df = train_df[(train_df['loan_status']=='Fully Paid')|(train_df['loan_status']=='Charged Off')]
val_df = val_df[(val_df['loan_status']=='Fully Paid')|(val_df['loan_status']=='Charged Off')]
test_df = test_df[(test_df['loan_status']=='Fully Paid')|(test_df['loan_status']=='Charged Off')]

In [157]:
train_df.loan_status.value_counts()

Fully Paid     151480
Charged Off     21508
Name: loan_status, dtype: int64

In [158]:
val_df.loan_status.value_counts()

Fully Paid     140255
Charged Off     22315
Name: loan_status, dtype: int64

In [159]:
test_df.loan_status.value_counts()

Fully Paid     240978
Charged Off     42171
Name: loan_status, dtype: int64

In [160]:
train_df.to_pickle('data/api_train_df.pkl')
val_df.to_pickle('data/api_val_df.pkl')
test_df.to_pickle('data/api_test_df.pkl')

In [None]:
from lightgbm import LGBMClassifier

In [None]:
train_df = df[]

In [None]:
# reset cols to those available when investing in loan
# predict if a loan will be paid off within 12 months of issuance