Processing Variables For Modeling
==================

In [1]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt; plt.style.use("ggplot")

from collections import Counter

In [2]:
loans = pd.read_csv("../data/clean/loans.csv", sep = "^").sample(200000, random_state = 42)

In [3]:
loans.head()

Unnamed: 0,funded_amnt_inv,term,issue_d,installment,int_rate,grade,emp_title,emp_length,annual_inc,title,dti,home_ownership,zip_code,addr_state,total_rec_late_fee,application_type,total_acc,loan_status
510666,16175.0,36 months,Jan-2017,584.69,17.99%,D,Income-Revenue Specialist,10+ years,32895.0,Home improvement,34.22,OWN,731xx,OK,0.0,Joint App,26.0,Current
33663,6500.0,36 months,Mar-2016,201.87,7.39%,A,Water Treatment Plant Operator,4 years,62000.0,Car financing,8.96,RENT,147xx,NY,0.0,Individual,19.0,Fully Paid
934932,3600.0,36 months,Dec-2013,115.61,9.67%,B,Finance/Accounts Payable,2 years,40000.0,Credit card refinancing,4.11,OWN,554xx,MN,0.0,Individual,14.0,Fully Paid
238556,6000.0,36 months,Sep-2016,206.5,14.49%,C,Service Technician /Foreman,2 years,55000.0,Home improvement,0.0,OWN,752xx,TX,0.0,Individual,10.0,Current
722009,40000.0,36 months,Jul-2017,1252.91,7.97%,A,Owner/Operator,< 1 year,80000.0,Debt consolidation,17.34,MORTGAGE,119xx,NY,0.0,Joint App,36.0,Current


### 01 - Target: Loan Status

`loan_status` is the current status of the loan. This is the variable we want to predict in our machine learning model. For this variable, we are going to considerar three labels:
- 0: loans that have already been paid.
- 1: default or charged off loans.
- 2: current loans (rest of the cases), where we don't know if they are going to be paid or not.

We will use labels 0 and 1 for training and testing our model. Label 2 is going to use just for predicting.

In [4]:
loans['loan_status'].value_counts()

Current                                                95239
Fully Paid                                             79596
Charged Off                                            20569
Late (31-120 days)                                      2459
In Grace Period                                         1330
Late (16-30 days)                                        485
Does not meet the credit policy. Status:Fully Paid       225
Does not meet the credit policy. Status:Charged Off       90
Default                                                    7
Name: loan_status, dtype: int64

In [5]:
def process_loan_status(loan_status):
    
    loan_status_dict = {
    "Current": 2,
    "Fully Paid": 0,
    "Charged Off": 1,
    "Late (31-120 days)": 2,
    "In Grace Period": 2,
    "Late (16-30 days)": 2,
    "Does not meet the credit policy. Status:Fully Paid": 0,
    "Does not meet the credit policy. Status:Charged Off": 1,
    "Default": 1
    }
    
    return loan_status_dict[loan_status]

In [6]:
loans['loan_status'] = loans['loan_status'].map(process_loan_status)

In [7]:
loans['loan_status'].head(10)

510666     2
33663      0
934932     0
238556     2
722009     2
1665695    2
1106564    0
663151     2
963656     0
1764427    0
Name: loan_status, dtype: int64

In [8]:
loans = loans[loans['loan_status'] < 2]

In [9]:
loans.head()

Unnamed: 0,funded_amnt_inv,term,issue_d,installment,int_rate,grade,emp_title,emp_length,annual_inc,title,dti,home_ownership,zip_code,addr_state,total_rec_late_fee,application_type,total_acc,loan_status
33663,6500.0,36 months,Mar-2016,201.87,7.39%,A,Water Treatment Plant Operator,4 years,62000.0,Car financing,8.96,RENT,147xx,NY,0.0,Individual,19.0,0
934932,3600.0,36 months,Dec-2013,115.61,9.67%,B,Finance/Accounts Payable,2 years,40000.0,Credit card refinancing,4.11,OWN,554xx,MN,0.0,Individual,14.0,0
1106564,8000.0,36 months,Jan-2012,257.8,9.91%,B,Raytheon Technical Services Co LLC,5 years,60000.0,Debt Consolidation Loan,8.68,RENT,928xx,CA,0.0,Individual,21.0,0
963656,4275.0,36 months,Oct-2013,131.95,6.97%,A,,,40800.0,Credit card refinancing,6.24,OWN,120xx,NY,0.0,Individual,23.0,0
1764427,6600.0,36 months,Jan-2015,226.57,14.31%,C,Maintenance,9 years,34000.0,Credit card refinancing,18.28,MORTGAGE,136xx,NY,0.0,Individual,22.0,0


In [10]:
loans['loan_status'].describe()

count    100487.000000
mean          0.205658
std           0.404184
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: loan_status, dtype: float64

__Our prior value of un paid loans is about 20%__ We will keep this value in mind!

### 02 - Numeric Variables

In [11]:
loans.dtypes

funded_amnt_inv       float64
term                   object
issue_d                object
installment           float64
int_rate               object
grade                  object
emp_title              object
emp_length             object
annual_inc            float64
title                  object
dti                   float64
home_ownership         object
zip_code               object
addr_state             object
total_rec_late_fee    float64
application_type       object
total_acc             float64
loan_status             int64
dtype: object

In [18]:
numerical_variables = ["funded_amnt_inv", "installment", "int_rate", "annual_inc", "dti",
                        "total_rec_late_fee", "total_acc"]

__Interest Rate__

In [13]:
loans['int_rate']

33663        7.39%
934932       9.67%
1106564      9.91%
963656       6.97%
1764427     14.31%
1078773     15.31%
1000221      7.62%
1209281     26.06%
216095       7.89%
1023189     10.16%
936407      17.10%
991790      10.64%
1399646      7.89%
885800      15.62%
946126      13.67%
1305012     14.64%
1759195     12.99%
1538436      5.32%
1309532     13.65%
1045218     12.12%
935452      13.67%
1602541     13.33%
136540      11.49%
1119189     11.99%
1142349     20.99%
896307       5.79%
1184523      8.39%
1308408      6.62%
1727608     14.99%
386037      11.39%
            ...   
1672417     18.25%
1291108     24.08%
388569      21.49%
820018      19.03%
1485012     17.86%
121752      13.99%
1425360     17.57%
1715363     20.49%
1239606     18.99%
1050599     17.77%
296621      17.99%
1494702     13.99%
1573753      7.26%
1677705      9.17%
1654398      8.18%
1423567     15.61%
1347980     13.99%
999037      25.28%
1203787     22.15%
937543      23.70%
1019186     19.72%
565245      

In [14]:
loans['int_rate'] = loans['int_rate'].map(lambda x: float(x[:-1]))

In [15]:
loans['int_rate'].head()

33663       7.39
934932      9.67
1106564     9.91
963656      6.97
1764427    14.31
Name: int_rate, dtype: float64

In [16]:
loans.dtypes

funded_amnt_inv       float64
term                   object
issue_d                object
installment           float64
int_rate              float64
grade                  object
emp_title              object
emp_length             object
annual_inc            float64
title                  object
dti                   float64
home_ownership         object
zip_code               object
addr_state             object
total_rec_late_fee    float64
application_type       object
total_acc             float64
loan_status             int64
dtype: object

__Outliers__

In [19]:
loans[numerical_variables].describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc
count,100487.0,100487.0,100487.0,100486.0,100483.0,100487.0,100482.0
mean,14266.111581,436.601337,13.488018,75378.66,17.894028,1.116726,25.383949
std,8521.092621,256.189642,4.648421,67004.41,8.550205,8.002909,12.020481
min,0.0,21.25,5.32,0.0,0.0,0.0,2.0
25%,7975.0,251.4,9.99,45000.0,11.66,0.0,17.0
50%,12000.0,376.41,12.99,65000.0,17.4,0.0,24.0
75%,20000.0,575.51,16.29,90000.0,23.63,0.0,32.0
max,40000.0,1566.8,30.99,9225000.0,380.72,654.320001,176.0


In [20]:
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than n outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers

In [24]:
# detect outliers from numerical features 
outliers_to_drop = detect_outliers(loans,1,numerical_variables)

print("There are {} outliers from numerical features".format(len(outliers_to_drop)))

There are 468 outliers from numerical features


In [25]:
loans = loans.drop(outliers_to_drop, axis=0)

In [26]:
loans.shape

(100019, 18)

In [27]:
loans[numerical_variables].describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc
count,100019.0,100019.0,100019.0,100018.0,100015.0,100019.0,100014.0
mean,14179.428953,433.444364,13.458464,75128.99,17.892988,0.967266,25.370188
std,8426.212562,251.797823,4.60522,66761.5,8.54484,6.695473,12.013285
min,0.0,21.25,5.32,0.0,0.0,0.0,2.0
25%,7950.0,250.87,9.99,45000.0,11.67,0.0,17.0
50%,12000.0,375.43,12.99,65000.0,17.4,0.0,24.0
75%,20000.0,571.9,16.29,90000.0,23.625,0.0,32.0
max,40000.0,1411.07,30.99,9225000.0,380.72,361.9,176.0


__Dealing NA's__

In [28]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv       0
installment           0
int_rate              0
annual_inc            1
dti                   4
total_rec_late_fee    0
total_acc             5
dtype: int64

In [29]:
loans[numerical_variables].median()

funded_amnt_inv       12000.00
installment             375.43
int_rate                 12.99
annual_inc            65000.00
dti                      17.40
total_rec_late_fee        0.00
total_acc                24.00
dtype: float64

In [30]:
loans[numerical_variables] = loans[numerical_variables].fillna(loans[numerical_variables].median())

In [31]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv       0
installment           0
int_rate              0
annual_inc            0
dti                   0
total_rec_late_fee    0
total_acc             0
dtype: int64

### 03 - Categorical Variables

In [32]:
categorical_variables = ['term', 'grade', 'emp_title', 'emp_length', 'title', 
                         'home_ownership', 'zip_code', 'addr_state', 'application_type']

In [33]:
for variable in categorical_variables:
    number_of_categories = loans[variable].unique().size
    print("{}: {} categories".format(variable, number_of_categories))

term: 2 categories
grade: 7 categories
emp_title: 47771 categories
emp_length: 12 categories
title: 9456 categories
home_ownership: 6 categories
zip_code: 885 categories
addr_state: 50 categories
application_type: 2 categories


Too many categories for `emp_title`, `title` and `zip_code`. Let's take a look at these three variables:

__emp_title__:

In [9]:
loans['emp_title'].value_counts().head(10)

Teacher             3377
Manager             2970
Owner               1845
Registered Nurse    1423
RN                  1405
Supervisor          1258
Driver              1229
Sales               1158
Project Manager      999
Office Manager       891
Name: emp_title, dtype: int64

In [10]:
loans.loc[~((loans['emp_title'] == 'Teacher') |
            (loans['emp_title'] == 'Manager') | 
            (loans['emp_title'] == 'Owner')),'emp_title'] = 'Other'

In [11]:
loans['emp_title'].value_counts()

Other      191808
Teacher      3377
Manager      2970
Owner        1845
Name: emp_title, dtype: int64

__title__:

In [12]:
loans['title'].value_counts().head(20)

Debt consolidation           101317
Credit card refinancing       38913
Home improvement              11836
Other                         10446
Major purchase                 3728
Medical expenses               2171
Business                       1835
Car financing                  1807
Debt Consolidation             1781
Vacation                       1260
Moving and relocation          1198
Home buying                     779
Consolidation                   574
debt consolidation              540
Debt Consolidation Loan         449
Credit Card Consolidation       284
consolidation                   249
Personal Loan                   219
Consolidation Loan              213
Home Improvement                202
Name: title, dtype: int64

In [13]:
loans = loans.drop('title', axis=1)

__zip_code__:

In [14]:
loans['zip_code'].describe()

count     200000
unique       897
top        945xx
freq        2203
Name: zip_code, dtype: object

In [15]:
loans = loans.drop('zip_code', axis=1)

__Transform to categorical__:

In [16]:
categorical_variables = ['term', 'grade', 'emp_title', 'emp_length', 
                         'home_ownership', 'addr_state', 'application_type']

In [17]:
for variable in categorical_variables:
    loans[variable] = loans[variable].astype("category")

In [18]:
loans[categorical_variables].describe()

Unnamed: 0,term,grade,emp_title,emp_length,home_ownership,addr_state,application_type
count,200000,200000,200000,188261,200000,200000,200000
unique,2,7,4,11,6,50,2
top,36 months,C,Other,10+ years,MORTGAGE,CA,Individual
freq,143360,59166,191808,66594,98520,27955,194099


#### 3.3 - Dates

We have just one date variable, `issue_d`. We are only interested in the year of the loan and we will consider it as categorical variable:

In [19]:
loans['issue_d'] = loans['issue_d'].map(lambda x: x[4:])

In [20]:
loans['issue_d'] = loans['issue_d'].astype('category')

In [21]:
loans['issue_d'].describe()

count     200000
unique        11
top         2017
freq       50507
Name: issue_d, dtype: object

In [45]:
categorical_variables = ['issue_d', 'term', 'grade', 'emp_title', 'emp_length', 
                         'home_ownership', 'addr_state', 'application_type']

In [175]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_fscore_support

In [55]:
data_for_modeling = loans[(loans['loan_status'] == 0) |
                          (loans['loan_status'] == 1)]

In [56]:
data_for_modeling.shape

(97501, 18)

In [57]:
data_for_modeling.dtypes

funded_amnt_inv        float64
term                  category
issue_d               category
installment            float64
int_rate               float64
grade                 category
emp_title             category
emp_length            category
annual_inc             float64
dti                    float64
home_ownership        category
addr_state            category
last_pymnt_amnt        float64
total_pymnt_inv        float64
total_rec_late_fee     float64
application_type      category
total_acc              float64
loan_status           category
dtype: object

In [58]:
data_for_modeling.head()

Unnamed: 0,funded_amnt_inv,term,issue_d,installment,int_rate,grade,emp_title,emp_length,annual_inc,dti,home_ownership,addr_state,last_pymnt_amnt,total_pymnt_inv,total_rec_late_fee,application_type,total_acc,loan_status
1345261,10000.0,36 months,2015,308.73,6.99,A,Other,< 1 year,70000.0,21.79,MORTGAGE,AZ,7761.77,10543.79,0.0,Individual,23.0,0
1004346,12000.0,60 months,2013,293.67,16.29,C,Other,6 years,60000.0,18.14,MORTGAGE,FL,397.31,15720.07,0.0,Individual,29.0,0
1458620,34000.0,36 months,2015,1083.89,9.17,B,Other,10+ years,200000.0,11.05,MORTGAGE,TX,1083.89,13418.32,0.0,Individual,19.0,1
1133231,15775.0,36 months,2014,546.77,14.99,C,Other,< 1 year,36000.0,18.4,RENT,CA,5216.23,19405.98,0.0,Individual,9.0,0
1690706,16000.0,36 months,2015,523.75,10.99,B,Other,10+ years,50000.0,21.65,MORTGAGE,AL,519.25,18766.98,0.0,Individual,17.0,0


In [59]:
categorical_variables

['issue_d',
 'term',
 'grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'addr_state',
 'application_type']

In [60]:
categorical_for_modeling = ['term', 'grade', 'emp_title', 'emp_length', 'home_ownership', 
                            'application_type']

In [61]:
data_for_modeling = pd.get_dummies(data_for_modeling, columns = categorical_for_modeling)

In [62]:
data_for_modeling = data_for_modeling.drop(["issue_d", "addr_state"], axis = 1)

In [63]:
data_for_modeling.head()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,last_pymnt_amnt,total_pymnt_inv,total_rec_late_fee,total_acc,loan_status,...,emp_length_9 years,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,application_type_Individual,application_type_Joint App
1345261,10000.0,308.73,6.99,70000.0,21.79,7761.77,10543.79,0.0,23.0,0,...,0,1,0,1,0,0,0,0,1,0
1004346,12000.0,293.67,16.29,60000.0,18.14,397.31,15720.07,0.0,29.0,0,...,0,0,0,1,0,0,0,0,1,0
1458620,34000.0,1083.89,9.17,200000.0,11.05,1083.89,13418.32,0.0,19.0,1,...,0,0,0,1,0,0,0,0,1,0
1133231,15775.0,546.77,14.99,36000.0,18.4,5216.23,19405.98,0.0,9.0,0,...,0,1,0,0,0,0,0,1,1,0
1690706,16000.0,523.75,10.99,50000.0,21.65,519.25,18766.98,0.0,17.0,0,...,0,0,0,1,0,0,0,0,1,0


In [47]:
numerical_variables

['funded_amnt_inv',
 'installment',
 'int_rate',
 'annual_inc',
 'dti',
 'last_pymnt_amnt',
 'total_pymnt_inv',
 'total_rec_late_fee',
 'total_acc']

In [212]:
data_for_modeling = data_for_modeling.drop(['last_pymnt_amnt', 'total_pymnt_inv'], axis = 1)

In [213]:
data_for_modeling.head()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc,loan_status,term_ 36 months,term_ 60 months,...,emp_length_9 years,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,application_type_Individual,application_type_Joint App
1345261,10000.0,308.73,6.99,70000.0,21.79,0.0,23.0,0,1,0,...,0,1,0,1,0,0,0,0,1,0
1004346,12000.0,293.67,16.29,60000.0,18.14,0.0,29.0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
1458620,34000.0,1083.89,9.17,200000.0,11.05,0.0,19.0,1,1,0,...,0,0,0,1,0,0,0,0,1,0
1133231,15775.0,546.77,14.99,36000.0,18.4,0.0,9.0,0,1,0,...,0,1,0,0,0,0,0,1,1,0
1690706,16000.0,523.75,10.99,50000.0,21.65,0.0,17.0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
