In [30]:
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from xgboost import XGBRegressor
from imblearn.pipeline import Pipeline

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Suppress every warning category for the remainder of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g.
# SettingWithCopyWarning) and deprecation notices raised by later cells;
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [31]:
# Read the gzip-compressed loans sample into a DataFrame; the bare name on the
# last line renders the frame as the cell output.
datasets = pd.read_csv(
    './Data/LoansData_sample (1).csv.gz',
    compression='gzip',
    encoding='utf-8',
)

datasets

Unnamed: 0.1,Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,0,38098114,,15000.0,15000.0,15000.0,60 months,12.39,336.64,C,...,,,Cash,N,,,,,,
1,1,36805548,,10400.0,10400.0,10400.0,36 months,6.99,321.08,A,...,,,Cash,N,,,,,,
2,2,37842129,,21425.0,21425.0,21425.0,60 months,15.59,516.36,D,...,,,Cash,N,,,,,,
3,3,37612354,,12800.0,12800.0,12800.0,60 months,17.14,319.08,D,...,,,Cash,N,,,,,,
4,4,37662224,,7650.0,7650.0,7650.0,36 months,13.66,260.20,C,...,,,Cash,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,22454240,,8400.0,8400.0,8400.0,36 months,9.17,267.79,B,...,,,Cash,N,,,,,,
99996,99996,11396920,,10000.0,10000.0,10000.0,36 months,12.99,336.90,C,...,,,Cash,N,,,,,,
99997,99997,8556176,,30000.0,30000.0,30000.0,60 months,20.99,811.44,E,...,,,Cash,N,,,,,,
99998,99998,24023408,,8475.0,8475.0,8475.0,36 months,24.99,336.92,F,...,,,Cash,N,,,,,,


In [32]:
# Keep only the rows whose loan_status is 'Fully Paid' or 'Charged Off'.
# The explicit .copy() makes the filtered frame independent of the frame it was
# selected from, so the column assignments and in-place drops in later cells
# operate on a real frame rather than on a selection pandas flags as a
# possible copy (SettingWithCopyWarning, which is otherwise silenced by the
# global warnings filter).
datasets = datasets.loc[
    datasets['loan_status'].isin(['Fully Paid', 'Charged Off'])
].copy()

# Relative frequency of each remaining status.
datasets['loan_status'].value_counts(normalize=True)

Fully Paid     0.812441
Charged Off    0.187559
Name: loan_status, dtype: float64

In [33]:
# Binary target column: 1 where loan_status equals 'Charged Off', else 0,
# stored as int32.
datasets['charged_off'] = datasets['loan_status'].eq('Charged Off').astype(np.int32)

In [34]:
# The textual status is now encoded in 'charged_off'; remove the original column.
datasets.drop(columns='loan_status', inplace=True)

In [35]:
# Fraction of missing values in each column, ordered from most to least sparse.
missing_fractions = (
    datasets
    .isna()
    .mean()
    .sort_values(ascending=False)
)
missing_fractions

total_cu_tl                            1.0
annual_inc_joint                       1.0
sec_app_mths_since_last_major_derog    1.0
sec_app_collections_12_mths_ex_med     1.0
sec_app_chargeoff_within_12_mths       1.0
                                      ... 
total_rec_int                          0.0
total_rec_prncp                        0.0
total_pymnt_inv                        0.0
total_pymnt                            0.0
charged_off                            0.0
Length: 151, dtype: float64

In [36]:
# Alphabetically sorted names of the columns whose missing fraction is above 0.3.
drop_list = sorted(missing_fractions[missing_fractions.gt(0.3)].index.tolist())
drop_list[:5]

['all_util',
 'annual_inc_joint',
 'debt_settlement_flag_date',
 'deferral_term',
 'desc']

In [37]:
# Remove every column listed in drop_list from the frame, in place.
datasets.drop(columns=drop_list, inplace=True)

In [38]:
# (rows, columns) of the frame after the drop_list columns were removed.
datasets.shape

(86138, 93)

In [39]:
# Whitelist of column names to retain; the next cell drops every column of
# `datasets` that is not named here.
# NOTE(review): 'loan_status' was already dropped from the frame in an earlier
# cell, so this list has one more entry than the frame will have columns.
keep_list = [
    'charged_off',
    'funded_amnt',
    'addr_state',
    'annual_inc',
    'application_type',
    'dti',
    'earliest_cr_line',
    'fico_range_high',
    'fico_range_low',
    'grade',
    'home_ownership',
    'initial_list_status',
    'installment',
    'int_rate',
    'loan_amnt',
    'loan_status',
    'mort_acc',
    'open_acc',
    'purpose',
    'revol_util',
    'sub_grade',
    'term',
    'verification_status',
    'last_pymnt_amnt',
    'num_actv_rev_tl',
    'mo_sin_rcnt_rev_tl_op',
    'mo_sin_old_rev_tl_op',
    'bc_util',
    'bc_open_to_buy',
    'avg_cur_bal',
    'acc_open_past_24mths',
]
len(keep_list)

31

In [40]:
# Column names absent from keep_list, in their current frame order.
drop_list2 = datasets.columns[~datasets.columns.isin(keep_list)].tolist()

# Drop them in place and show the resulting (rows, columns).
datasets.drop(columns=drop_list2, inplace=True)
datasets.shape

(86138, 30)

In [41]:
def _months_from_term(term_text):
    """Return the first whitespace-separated token of ``term_text`` as ``np.int8``."""
    return np.int8(term_text.split()[0])


# Replace the textual term (e.g. '60 months') with its leading number.
# NOTE(review): not re-runnable as-is; once 'term' holds numbers the helper's
# .split() call no longer applies.
datasets['term'] = datasets['term'].apply(_months_from_term)

In [42]:
# Base-10 log of (annual_inc + 1), computed with a single vectorised ufunc call
# on the whole column instead of a per-row Python lambda through .apply.
datasets['log_annual_inc'] = np.log10(datasets['annual_inc'] + 1)

# The raw income column is superseded by its log-transformed version.
datasets.drop(columns='annual_inc', inplace=True)

In [43]:
# Single FICO feature: equal-weight (0.5 / 0.5) combination of the low and high
# ends of the reported range.
datasets['fico_score'] = (
    datasets['fico_range_low'].mul(0.5)
    .add(datasets['fico_range_high'].mul(0.5))
)

# The two range columns are no longer needed once the combined score exists.
datasets.drop(columns=['fico_range_high', 'fico_range_low'], inplace=True)

In [44]:
# Boolean Series indexed by column name: True where the column dtype is object.
categorical_feature_mask = datasets.dtypes.eq(object)