This notebook will clean data to use for LightGBM models.

In [1]:
# Suppress every warning raised from here on, so later cells run quietly.
# Remove the filter when debugging: it also hides deprecation warnings and
# pandas warnings.
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

In [2]:
# Folder holding the pickled inputs; the trailing slash lets file names be
# appended to it directly.
data_path = 'data/'

# Read both splits the same way, discarding the first two columns (id info).
train_df, test_df = [
    pd.read_pickle(data_path + file_name).iloc[:, 2:]
    for file_name in ('train_adj.pkl', 'test_adj.pkl')
]

In [3]:
# Preview the first rows of the training frame as a sanity check on the load.
train_df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,desc,purpose,zip_code,addr_state,initial_list_status,emp_title,acc_now_delinq,acc_open_past_24mths,bc_open_to_buy,percent_bc_gt_75,bc_util,dti,delinq_2yrs,delinq_amnt,earliest_cr_line,...,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,revol_bal_joint,open_act_il,sec_app_open_act_il,issue_d,loan_status,total_pymnt,total_rec_int,total_rec_late_fee,total_rec_prncp,recoveries,collection_recovery_fee,last_pymnt_d,loan_length,returns,MPRIME,issue_m,CPIAUCSL,prev_mo_cpi
0,21131.68,36,10.28,717.42,B,B5,10+ years,RENT,82799.61,Verified,Borrower added on 12/31/13 > My goal is to p...,debt_consolidation,100xx,NY,f,Operations Manager,0.0,9.0,6919.61,50.0,54.6,16.73,0.0,0.0,186,...,,,,,,,,,,,2013-12-01,Fully Paid,23926.640008,3126.64,0.0,20800.0,0.0,0.0,May-2015,516.0,0.104131,3.25,12,234.719,234.1
1,12191.35,36,4.37,379.9,A,A3,3 years,MORTGAGE,98038.81,Not Verified,Borrower added on 12/31/13 > Bought a new ho...,debt_consolidation,782xx,TX,f,Systems Engineer,0.0,4.0,2479.92,100.0,83.5,12.61,0.0,0.0,123,...,,,,,,,,,,,2013-12-01,Fully Paid,13397.539998,1397.54,0.0,12000.0,0.0,0.0,Jun-2016,913.0,0.045026,3.25,12,234.719,234.1
2,27481.34,36,7.74,899.58,B,B2,10+ years,OWN,55877.04,Verified,Borrower added on 12/31/13 > Combining high ...,debt_consolidation,481xx,MI,w,Team Leadern Customer Ops & Systems,0.0,3.0,16735.68,25.0,53.9,22.87,0.0,0.0,326,...,,,,,,,,,,,2013-12-01,Fully Paid,31752.53,4702.53,0.0,27050.0,0.0,0.0,Jul-2016,943.0,0.064005,3.25,12,234.719,234.1
3,12191.35,36,7.74,399.07,B,B2,4 years,RENT,60956.77,Not Verified,Borrower added on 12/31/13 > I would like to...,debt_consolidation,281xx,NC,f,Project Manager,0.0,8.0,15458.64,0.0,15.9,4.62,0.0,0.0,48,...,,,,,,,,,,,2013-12-01,Fully Paid,13988.609996,1988.61,0.0,12000.0,0.0,0.0,Apr-2016,852.0,0.067896,3.25,12,234.719,234.1
4,4876.54,36,7.74,159.64,B,B2,2 years,MORTGAGE,40231.47,Source Verified,Borrower added on 12/31/13 > Just bought a h...,home_improvement,782xx,TX,w,Surgical Technician,0.0,0.0,21907.86,0.0,16.1,2.49,0.0,0.0,220,...,,,,,,,,,,,2013-12-01,Fully Paid,5157.519457,357.52,0.0,4800.0,0.0,0.0,Sep-2014,274.0,0.100428,3.25,12,234.719,234.1


In [4]:
# (rows, columns) of the training frame.
train_df.shape

(529873, 118)

Create list of features by removing loan performance info

In [5]:
# Columns excluded from the feature list: loan-performance fields, the issue
# date, the free-text description and the CPIAUCSL column.
to_drop = [
    'issue_d', 'desc', 'loan_status',
    'total_pymnt', 'total_rec_int', 'total_rec_late_fee', 'total_rec_prncp',
    'recoveries', 'collection_recovery_fee',
    'last_pymnt_d', 'loan_length', 'returns',
    'CPIAUCSL',
]

# Feature names: every remaining column, in the frame's own column order.
x_cols = [name for name in train_df.columns if name not in to_drop]

In [6]:
# Employment length is ordinal, so it gets integer codes that preserve the
# ordering of the labels instead of an arbitrary categorical code.
emp_len_map = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
}

# Labels without an entry in the mapping (missing values included) become NaN.
train_df['emp_length'] = train_df['emp_length'].map(emp_len_map)
test_df['emp_length'] = test_df['emp_length'].map(emp_len_map)

To encode the categorical/str columns to numbers:
1. Create a dictionary with encoding info for each column
2. For each column, with the training data create a dict mapping each category to an int
3. Apply the categorical mapping from str to ints on the train, val, and test data

Sklearn label encoder was not used because it has issues if there are values in the val set we are mapping that were not seen in the train encodings. We do not want to fit a label encoder on both train and val data because we are simulating training on historical data to validate against unseen data.

In [7]:
# Build one {category: integer code} dictionary per object-dtype feature,
# using the training data only so the encoding never sees the test set.
cat_encs = {}

for col in x_cols:
    if train_df[col].dtype != object:
        continue  # only object-dtype (string) columns are encoded
    categories = train_df[col].unique()
    try:
        # Alphabetical order keeps the codes ordinal, good for grade and subgrade.
        categories = sorted(categories)
    except TypeError:
        # Mixed types (e.g. strings together with NaN) cannot be compared.
        # sorted() leaves its input untouched when it fails, so the fallback
        # is the order of first appearance rather than a half-sorted array.
        categories = list(categories)
    cat_encs[col] = {category: code for code, category in enumerate(categories)}

# Apply the training encodings to both frames. Values without an entry in the
# dictionary (e.g. categories that only occur in the test data) become NaN.
# `na_action` is left at its default: it only accepts None or 'ignore', and
# recent pandas versions raise ValueError for any other value such as 'UNK'.
for col, encoding in cat_encs.items():
    train_df[col] = train_df[col].map(encoding)
    test_df[col] = test_df[col].map(encoding)

Some columns are missing nearly all of their values (99% or more in the training data), so remove those columns

In [8]:
# Features whose share of missing values in the training data is at least 99%.
drop_missing = [
    name for name in x_cols
    if train_df[name].isnull().mean() >= 0.99
]

# Keep the remaining features, preserving their order.
x_cols = [name for name in x_cols if name not in drop_missing]

In [9]:
# Loan-outcome columns that are stored next to the features in the saved files.
returns_info = ['issue_d','desc','loan_status', 'total_pymnt','total_rec_int',
                'total_rec_late_fee','total_rec_prncp', 'recoveries', 
                'collection_recovery_fee', 'last_pymnt_d', 
                'loan_length', 'returns']

# these are the same for every loan
to_drop = ['term','application_type','verification_status_joint',
           'sec_app_earliest_cr_line']

# Actually leave the constant columns out of the saved frames (the list above
# was previously defined but never applied). x_cols itself is not modified.
save_cols = [col for col in x_cols if col not in to_drop] + returns_info

# save data
train_df[save_cols].to_pickle(data_path+'train_lgbm.pkl')
test_df[save_cols].to_pickle(data_path+'test_lgbm.pkl')

In [10]:
# emp_length was encoded with its own ordinal mapping, so register that
# mapping alongside the other encodings before they are stored.
cat_encs.update({'emp_length': emp_len_map})

In [11]:
# Persist the category encodings to cat_encs.pickle in the working directory.
with Path('cat_encs.pickle').open('wb') as enc_file:
    pickle.dump(cat_encs, enc_file)