Processing Variables For Modeling
==================

Libraries

In [1]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt; plt.style.use("ggplot")

from collections import Counter

Read Data

In [2]:
loans = pd.read_csv("../data/clean/loans.csv", sep = "^")

In [3]:
loans.head()

Unnamed: 0,funded_amnt_inv,term,issue_d,installment,int_rate,grade,emp_title,emp_length,annual_inc,title,dti,home_ownership,zip_code,addr_state,total_rec_late_fee,application_type,total_acc,loan_status
0,10000.0,60 months,Mar-2016,262.34,19.53%,D,lpn/charge nurse,4 years,52000.0,Other,15.0,OWN,317xx,GA,0.0,Individual,12.0,Fully Paid
1,35000.0,60 months,Mar-2016,941.96,20.75%,E,Coiler,3 years,85000.0,Debt consolidation,24.98,MORTGAGE,144xx,NY,0.0,Individual,19.0,Fully Paid
2,20000.0,60 months,Mar-2016,416.73,9.16%,B,Reliability Engineer,1 year,77000.0,Home improvement,13.75,MORTGAGE,606xx,IL,0.0,Individual,19.0,Current
3,17475.0,60 months,Mar-2016,384.06,11.47%,B,,,41682.0,Debt consolidation,30.06,MORTGAGE,796xx,TX,0.0,Individual,18.0,Current
4,8000.0,36 months,Mar-2016,255.0,9.16%,B,Technician,10+ years,72000.0,Debt consolidation,22.63,RENT,217xx,MD,0.0,Individual,12.0,Current


### 01 - Target: Loan Status

`loan_status` is the current status of the loan. This is the variable we want to predict in our machine learning model. For this variable, we are going to considerar three labels:
- 0: loans that have already been paid.
- 1: default or charged off loans.
- 2: current loans (rest of the cases), where we don't know if they are going to be paid or not.

We will use labels 0 and 1 for training and testing our model. Label 2 is going to use just for predicting.

In [4]:
loans['loan_status'].value_counts()

Current                                                843754
Fully Paid                                             698690
Charged Off                                            182199
Late (31-120 days)                                      21742
In Grace Period                                         11812
Late (16-30 days)                                        4423
Does not meet the credit policy. Status:Fully Paid       1988
Does not meet the credit policy. Status:Charged Off       761
Default                                                    57
Name: loan_status, dtype: int64

In [5]:
def process_loan_status(loan_status):
    
    loan_status_dict = {
    "Current": 2,
    "Fully Paid": 0,
    "Charged Off": 1,
    "Late (31-120 days)": 2,
    "In Grace Period": 2,
    "Late (16-30 days)": 2,
    "Does not meet the credit policy. Status:Fully Paid": 0,
    "Does not meet the credit policy. Status:Charged Off": 1,
    "Default": 1
    }
    
    return loan_status_dict[loan_status]

In [6]:
loans['loan_status'] = loans['loan_status'].map(process_loan_status)

In [7]:
loans['loan_status'].head(10)

0    0
1    0
2    2
3    2
4    2
5    0
6    0
7    2
8    2
9    2
Name: loan_status, dtype: int64

In [8]:
loans = loans[loans['loan_status'] < 2]

In [9]:
loans.head()

Unnamed: 0,funded_amnt_inv,term,issue_d,installment,int_rate,grade,emp_title,emp_length,annual_inc,title,dti,home_ownership,zip_code,addr_state,total_rec_late_fee,application_type,total_acc,loan_status
0,10000.0,60 months,Mar-2016,262.34,19.53%,D,lpn/charge nurse,4 years,52000.0,Other,15.0,OWN,317xx,GA,0.0,Individual,12.0,0
1,35000.0,60 months,Mar-2016,941.96,20.75%,E,Coiler,3 years,85000.0,Debt consolidation,24.98,MORTGAGE,144xx,NY,0.0,Individual,19.0,0
5,14400.0,36 months,Mar-2016,469.74,10.75%,B,Coder,10+ years,85000.0,Business,28.11,MORTGAGE,037xx,NH,0.0,Individual,33.0,0
6,18000.0,60 months,Mar-2016,400.31,11.99%,C,Registered Nurse,6 years,100000.0,Debt consolidation,20.41,MORTGAGE,986xx,WA,0.0,Individual,40.0,0
10,7250.0,36 months,Mar-2016,246.63,13.67%,C,REGISTERED NURSE,3 years,72000.0,Debt consolidation,23.93,MORTGAGE,462xx,IN,0.0,Individual,24.0,0


In [10]:
loans['loan_status'].describe()

count    883695.000000
mean          0.207104
std           0.405231
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: loan_status, dtype: float64

__Our prior value of un paid loans is about 20%__ We will keep this value in mind!

### 02 - Numeric Variables

In [11]:
loans.dtypes

funded_amnt_inv       float64
term                   object
issue_d                object
installment           float64
int_rate               object
grade                  object
emp_title              object
emp_length             object
annual_inc            float64
title                  object
dti                   float64
home_ownership         object
zip_code               object
addr_state             object
total_rec_late_fee    float64
application_type       object
total_acc             float64
loan_status             int64
dtype: object

In [12]:
numerical_variables = ["funded_amnt_inv", "installment", "int_rate", "annual_inc", "dti",
                        "total_rec_late_fee", "total_acc"]

In [13]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv        0
installment            0
int_rate               0
annual_inc             4
dti                   53
total_rec_late_fee     0
total_acc             29
dtype: int64

__Interest Rate__

In [14]:
loans['int_rate']

0           19.53%
1           20.75%
5           10.75%
6           11.99%
10          13.67%
13           5.32%
19           9.75%
20          11.47%
22           9.75%
25          12.99%
27           7.39%
28          18.99%
31          15.31%
35          19.53%
37          18.25%
42          14.46%
43           9.16%
44          13.67%
48          14.46%
57          12.99%
58          14.46%
59           5.32%
62           6.49%
63          19.53%
64           9.75%
66           7.89%
71           8.39%
72          14.46%
78           5.32%
79           5.32%
            ...   
1765393     10.49%
1765394     15.99%
1765395     13.66%
1765396      6.99%
1765397     15.99%
1765398      8.19%
1765399     11.99%
1765402     17.86%
1765403     17.86%
1765404      8.19%
1765405     15.59%
1765406     15.99%
1765408      8.19%
1765409      6.03%
1765410      6.03%
1765411      6.49%
1765412     14.99%
1765413     16.49%
1765414     14.31%
1765415      9.49%
1765416      8.67%
1765417     

In [15]:
loans['int_rate'] = loans['int_rate'].map(lambda x: float(x[:-1]))

In [16]:
loans['int_rate'].head()

0     19.53
1     20.75
5     10.75
6     11.99
10    13.67
Name: int_rate, dtype: float64

In [17]:
loans.dtypes

funded_amnt_inv       float64
term                   object
issue_d                object
installment           float64
int_rate              float64
grade                  object
emp_title              object
emp_length             object
annual_inc            float64
title                  object
dti                   float64
home_ownership         object
zip_code               object
addr_state             object
total_rec_late_fee    float64
application_type       object
total_acc             float64
loan_status             int64
dtype: object

__Outliers__

In [18]:
loans[numerical_variables].describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc
count,883695.0,883695.0,883695.0,883691.0,883642.0,883695.0,883666.0
mean,14258.36106,436.443223,13.457068,75359.64,17.921756,1.12989,25.369467
std,8532.472866,256.615587,4.641531,65133.31,9.100154,7.731221,11.987691
min,0.0,4.93,5.32,0.0,-1.0,-5.1e-09,1.0
25%,7925.0,250.33,9.99,45000.0,11.67,0.0,17.0
50%,12000.0,376.25,12.99,65000.0,17.4,0.0,24.0
75%,20000.0,576.27,16.29,90000.0,23.69,0.0,32.0
max,40000.0,1714.54,30.99,9550000.0,999.0,654.32,176.0


In [19]:
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than n outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers

In [20]:
# detect outliers from numerical features 
outliers_to_drop = detect_outliers(loans,1,numerical_variables)

print("There are {} outliers from numerical features".format(len(outliers_to_drop)))

There are 4062 outliers from numerical features


In [21]:
loans = loans.drop(outliers_to_drop, axis=0)

In [22]:
loans.shape

(879633, 18)

In [23]:
loans[numerical_variables].describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc
count,879633.0,879633.0,879633.0,879629.0,879581.0,879633.0,879604.0
mean,14171.467904,433.270098,13.427911,75095.74,17.917728,0.9813336,25.353935
std,8438.07892,252.202513,4.60117,64765.73,9.076448,6.72155,11.980763
min,0.0,4.93,5.32,0.0,-1.0,-5.1e-09,1.0
25%,7850.0,250.23,9.99,45000.0,11.66,0.0,17.0
50%,12000.0,375.43,12.99,65000.0,17.39,0.0,24.0
75%,20000.0,572.72,16.29,90000.0,23.68,0.0,32.0
max,40000.0,1506.65,30.99,9550000.0,999.0,402.03,176.0


__Dealing NA's__

In [24]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv        0
installment            0
int_rate               0
annual_inc             4
dti                   52
total_rec_late_fee     0
total_acc             29
dtype: int64

In [25]:
loans[numerical_variables].median()

funded_amnt_inv       12000.00
installment             375.43
int_rate                 12.99
annual_inc            65000.00
dti                      17.39
total_rec_late_fee        0.00
total_acc                24.00
dtype: float64

In [26]:
loans[numerical_variables] = loans[numerical_variables].fillna(loans[numerical_variables].median())

In [27]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv       0
installment           0
int_rate              0
annual_inc            0
dti                   0
total_rec_late_fee    0
total_acc             0
dtype: int64

### 03 - Categorical Variables

In [28]:
categorical_variables = ['term', 'grade', 'emp_title', 'emp_length', 'title', 
                         'home_ownership', 'zip_code', 'addr_state', 'application_type']

In [29]:
for variable in categorical_variables:
    number_of_categories = loans[variable].unique().size
    print("{}: {} categories".format(variable, number_of_categories))

term: 2 categories
grade: 7 categories
emp_title: 294604 categories
emp_length: 12 categories
title: 61934 categories
home_ownership: 6 categories
zip_code: 933 categories
addr_state: 51 categories
application_type: 2 categories


Too many categories for `emp_title`, `title`, `zip_code` and `addr_state`. Let's take a look at these three variables:

__Employee Title__:

In [30]:
loans['emp_title'].value_counts().head(10)

Teacher             12508
Manager             11524
Registered Nurse     5301
RN                   5155
Owner                5080
Supervisor           5059
Sales                4500
Driver               4196
Project Manager      3945
Office Manager       3288
Name: emp_title, dtype: int64

NA's

In [31]:
loans['emp_title'].isnull().sum()

52740

In [32]:
loans['emp_title'] = loans['emp_title'].fillna(method="ffill")

In [33]:
loans['emp_title'].isnull().sum()

0

In [34]:
loans['emp_title'] = loans['emp_title'].str.lower()

In [35]:
loans['emp_title'][loans['emp_title'].str.contains('manager')] = 'manager'

In [36]:
loans['manager'] = (loans['emp_title'] == 'manager').astype(int)

In [37]:
loans = loans.drop('emp_title', axis=1)

In [38]:
loans['manager'].describe()

count    879633.000000
mean          0.129146
std           0.335362
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: manager, dtype: float64

__Purpose__ of the loan:

In [39]:
loans['title'].value_counts().head(20)

Debt consolidation           406459
Credit card refinancing      147109
Home improvement              42972
Other                         36991
Debt Consolidation            15213
Major purchase                13582
Medical expenses               7648
Business                       7092
Car financing                  6371
Consolidation                  5158
Moving and relocation          4680
debt consolidation             4648
Vacation                       4596
Debt Consolidation Loan        3731
Home buying                    3044
Credit Card Consolidation      2291
consolidation                  2096
Personal Loan                  2092
Home Improvement               1721
Consolidation Loan             1713
Name: title, dtype: int64

In [40]:
loans['title'].isnull().sum()

8869

In [41]:
loans['title'] = loans['title'].fillna(method="ffill")

In [42]:
loans['title'].isnull().sum()

0

In [43]:
def loan_purpose_renaming():    
    title = loans['title'].str.lower()
    title[title.str.contains("credit card|credit")] = "credit card refinancing"
    title[title.str.contains("debt|consolidation|consolidate|refinance")] = "debt consolidation"
    title[title.str.contains("home improvement")] = "home improvement"
    title[title.str.contains("^home$|home loan")] = "home buying"
    title[title.str.contains("green")] = "green"
    title[title.str.contains("wedding")] = "wedding"
    title[title.str.contains("medical")] = "medical"
    title[title.str.contains("personal|my loan|^loan$|lending club")] = "personal"
    title[title.str.contains("business")] = "business"
    title[title.str.contains("pay(| )off")] = "payoff"
    title[title.str.contains("car loan|car financing")] = "car"
    
    return title

In [44]:
loans['title'] = loan_purpose_renaming()

In [45]:
loans['title'].value_counts()

debt consolidation                        482875
credit card refinancing                   179675
home improvement                           47978
other                                      37664
major purchase                             14081
personal                                   10188
business                                    9494
medical                                     9079
car                                         7299
payoff                                      6546
vacation                                    4861
moving and relocation                       4753
home buying                                 3880
wedding                                     2337
freedom                                     1227
green                                        536
bills                                        459
loan 1                                       347
cc loan                                      306
pool                                         293
pool loan           

In [46]:
loans['refinance'] = ((loans['title'] == 'credit card refinancing') | \
                     (loans['title'] == 'debt consolidation')).astype(int)

In [47]:
loans = loans.drop('title', axis = 1)

In [48]:
loans['refinance'].describe()

count    879633.000000
mean          0.753212
std           0.431143
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: refinance, dtype: float64

__zip_code__:

In [49]:
loans['zip_code'].describe()

count     879633
unique       933
top        945xx
freq       10179
Name: zip_code, dtype: object

In [50]:
loans['zip_code'].value_counts().head()

945xx    10179
750xx     9540
112xx     9030
606xx     8165
300xx     7811
Name: zip_code, dtype: int64

In [51]:
loans = loans.drop('zip_code', axis=1)

__addr_state__:

In [52]:
loans['addr_state'].describe()

count     879633
unique        51
top           CA
freq      133038
Name: addr_state, dtype: object

In [53]:
loans['addr_state'].value_counts().head()

CA    133038
NY     71929
TX     70740
FL     61854
IL     33406
Name: addr_state, dtype: int64

In [54]:
loans = loans.drop('addr_state', axis = 1)

In [55]:
categorical_variables = ['term', 'grade', 'emp_length', 'home_ownership', 'application_type']

In [56]:
loans[categorical_variables].describe()

Unnamed: 0,term,grade,emp_length,home_ownership,application_type
count,879633,879633,833358,879633,879633
unique,2,7,11,6,2
top,36 months,B,10+ years,MORTGAGE,Individual
freq,668168,253784,285774,437718,874543


__issue_d__

We have just one date variable, `issue_d`. We are only interested in the year of the loan and we will consider it as categorical variable:

In [57]:
loans['issue_d'] = loans['issue_d'].map(lambda x: x[4:])

In [58]:
loans['issue_d'].describe()

count     879633
unique        11
top         2015
freq      245033
Name: issue_d, dtype: object

In [59]:
loans = loans.drop('issue_d', axis = 1)

In [60]:
loans.head()

Unnamed: 0,funded_amnt_inv,term,installment,int_rate,grade,emp_length,annual_inc,dti,home_ownership,total_rec_late_fee,application_type,total_acc,loan_status,manager,refinance
0,10000.0,60 months,262.34,19.53,D,4 years,52000.0,15.0,OWN,0.0,Individual,12.0,0,0,0
1,35000.0,60 months,941.96,20.75,E,3 years,85000.0,24.98,MORTGAGE,0.0,Individual,19.0,0,0,1
5,14400.0,36 months,469.74,10.75,B,10+ years,85000.0,28.11,MORTGAGE,0.0,Individual,33.0,0,0,0
6,18000.0,60 months,400.31,11.99,C,6 years,100000.0,20.41,MORTGAGE,0.0,Individual,40.0,0,0,1
10,7250.0,36 months,246.63,13.67,C,3 years,72000.0,23.93,MORTGAGE,0.0,Individual,24.0,0,0,1


In [61]:
loans.shape

(879633, 15)

In [62]:
loans.isnull().sum()

funded_amnt_inv           0
term                      0
installment               0
int_rate                  0
grade                     0
emp_length            46275
annual_inc                0
dti                       0
home_ownership            0
total_rec_late_fee        0
application_type          0
total_acc                 0
loan_status               0
manager                   0
refinance                 0
dtype: int64

In [63]:
loans = loans.fillna(method = 'ffill')

In [64]:
loans.isnull().sum()

funded_amnt_inv       0
term                  0
installment           0
int_rate              0
grade                 0
emp_length            0
annual_inc            0
dti                   0
home_ownership        0
total_rec_late_fee    0
application_type      0
total_acc             0
loan_status           0
manager               0
refinance             0
dtype: int64

In [65]:
loans = pd.get_dummies(loans, columns = categorical_variables)

### 04 - Final Data 

In [66]:
loans.head()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc,loan_status,manager,refinance,...,emp_length_9 years,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,application_type_Individual,application_type_Joint App
0,10000.0,262.34,19.53,52000.0,15.0,0.0,12.0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,35000.0,941.96,20.75,85000.0,24.98,0.0,19.0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
5,14400.0,469.74,10.75,85000.0,28.11,0.0,33.0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
6,18000.0,400.31,11.99,100000.0,20.41,0.0,40.0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
10,7250.0,246.63,13.67,72000.0,23.93,0.0,24.0,0,0,1,...,0,0,0,1,0,0,0,0,1,0


In [67]:
loans['non_payment'] = loans['loan_status']

In [68]:
loans = loans.drop('loan_status', axis = 1)

In [69]:
loans.head()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc,manager,refinance,term_ 36 months,...,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,application_type_Individual,application_type_Joint App,non_payment
0,10000.0,262.34,19.53,52000.0,15.0,0.0,12.0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,35000.0,941.96,20.75,85000.0,24.98,0.0,19.0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
5,14400.0,469.74,10.75,85000.0,28.11,0.0,33.0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
6,18000.0,400.31,11.99,100000.0,20.41,0.0,40.0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
10,7250.0,246.63,13.67,72000.0,23.93,0.0,24.0,0,1,1,...,0,0,1,0,0,0,0,1,0,0


In [70]:
loans.shape

(879633, 38)

In [71]:
loans.dtypes

funded_amnt_inv                float64
installment                    float64
int_rate                       float64
annual_inc                     float64
dti                            float64
total_rec_late_fee             float64
total_acc                      float64
manager                          int64
refinance                        int64
term_ 36 months                  uint8
term_ 60 months                  uint8
grade_A                          uint8
grade_B                          uint8
grade_C                          uint8
grade_D                          uint8
grade_E                          uint8
grade_F                          uint8
grade_G                          uint8
emp_length_1 year                uint8
emp_length_10+ years             uint8
emp_length_2 years               uint8
emp_length_3 years               uint8
emp_length_4 years               uint8
emp_length_5 years               uint8
emp_length_6 years               uint8
emp_length_7 years       

In [72]:
loans.describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc,manager,refinance,term_ 36 months,...,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,application_type_Individual,application_type_Joint App,non_payment
count,879633.0,879633.0,879633.0,879633.0,879633.0,879633.0,879633.0,879633.0,879633.0,879633.0,...,879633.0,879633.0,879633.0,879633.0,879633.0,879633.0,879633.0,879633.0,879633.0,879633.0
mean,14171.467904,433.270098,13.427911,75095.7,17.917697,0.9813336,25.353891,0.129146,0.753212,0.759599,...,0.08396,0.000126,0.497614,5.8e-05,0.000207,0.102174,0.39982,0.994213,0.005787,0.205928
std,8438.07892,252.202513,4.60117,64765.58,9.07618,6.72155,11.980568,0.335362,0.431143,0.427327,...,0.277328,0.011233,0.499995,0.007614,0.014383,0.302878,0.489861,0.075849,0.075849,0.404378
min,0.0,4.93,5.32,0.0,-1.0,-5.1e-09,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7850.0,250.23,9.99,45000.0,11.67,0.0,17.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,12000.0,375.43,12.99,65000.0,17.39,0.0,24.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,20000.0,572.72,16.29,90000.0,23.68,0.0,32.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,40000.0,1506.65,30.99,9550000.0,999.0,402.03,176.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [73]:
loans.to_csv('../data/loans_sample_processed.csv', sep = "^", index = False)