Processing Variables For Modeling
==================

Libraries

In [1]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt; plt.style.use("ggplot")

from collections import Counter

Read Data

In [2]:
loans = pd.read_csv("../data/clean/loans.csv", sep = "^").sample(200000, random_state = 4290)

In [3]:
loans.head()

Unnamed: 0,funded_amnt_inv,term,issue_d,installment,int_rate,grade,emp_title,emp_length,annual_inc,title,dti,home_ownership,zip_code,addr_state,total_rec_late_fee,application_type,total_acc,loan_status
1496716,14375.0,60 months,Sep-2015,362.31,17.57%,D,Shop foreman,8 years,140000.0,Debt consolidation,26.55,MORTGAGE,306xx,GA,0.0,Individual,39.0,Current
566100,3000.0,36 months,Jun-2017,105.51,16.02%,C,Respiratory Therapist,6 years,50000.0,Debt consolidation,33.46,MORTGAGE,550xx,MN,0.0,Individual,22.0,Current
191845,7000.0,36 months,Apr-2016,230.74,11.47%,B,Natural Resource Manager,10+ years,102000.0,Major purchase,5.55,MORTGAGE,389xx,MS,0.0,Individual,47.0,Current
611191,8800.0,36 months,Apr-2017,276.74,8.24%,B,GIS TECHNICIAN,10+ years,42000.0,Debt consolidation,13.16,MORTGAGE,285xx,NC,0.0,Individual,31.0,Current
1737593,16475.0,60 months,Jan-2015,430.86,19.24%,E,Sr. Retirement Counselor,8 years,50000.0,Debt consolidation,32.35,MORTGAGE,374xx,TN,0.0,Individual,32.0,Current


### 01 - Target: Loan Status

`loan_status` is the current status of the loan. This is the variable we want to predict in our machine learning model. For this variable, we are going to considerar three labels:
- 0: loans that have already been paid.
- 1: default or charged off loans.
- 2: current loans (rest of the cases), where we don't know if they are going to be paid or not.

We will use labels 0 and 1 for training and testing our model. Label 2 is going to use just for predicting.

In [4]:
loans['loan_status'].value_counts()

Current                                                95158
Fully Paid                                             79394
Charged Off                                            20721
Late (31-120 days)                                      2542
In Grace Period                                         1354
Late (16-30 days)                                        514
Does not meet the credit policy. Status:Fully Paid       225
Does not meet the credit policy. Status:Charged Off       84
Default                                                    8
Name: loan_status, dtype: int64

In [5]:
def process_loan_status(loan_status):
    
    loan_status_dict = {
    "Current": 2,
    "Fully Paid": 0,
    "Charged Off": 1,
    "Late (31-120 days)": 2,
    "In Grace Period": 2,
    "Late (16-30 days)": 2,
    "Does not meet the credit policy. Status:Fully Paid": 0,
    "Does not meet the credit policy. Status:Charged Off": 1,
    "Default": 1
    }
    
    return loan_status_dict[loan_status]

In [6]:
loans['loan_status'] = loans['loan_status'].map(process_loan_status)

In [7]:
loans['loan_status'].head(10)

1496716    2
566100     2
191845     2
611191     2
1737593    2
292947     2
1738120    0
214732     2
1261074    1
199096     2
Name: loan_status, dtype: int64

In [8]:
loans = loans[loans['loan_status'] < 2]

In [9]:
loans.head()

Unnamed: 0,funded_amnt_inv,term,issue_d,installment,int_rate,grade,emp_title,emp_length,annual_inc,title,dti,home_ownership,zip_code,addr_state,total_rec_late_fee,application_type,total_acc,loan_status
1738120,25000.0,36 months,Jan-2015,777.55,7.49%,A,Senior Quality Engineer,10+ years,106157.0,Debt consolidation,9.37,MORTGAGE,922xx,CA,0.0,Individual,49.0,0
1261074,12000.0,36 months,May-2014,404.27,12.99%,C,service manager,7 years,67000.0,Debt consolidation,21.33,MORTGAGE,315xx,GA,0.0,Individual,28.0,1
985734,4800.0,36 months,Aug-2013,175.59,18.85%,D,LAVO,3 years,56000.0,Consolidate,7.62,RENT,900xx,CA,0.0,Individual,15.0,0
1167943,7900.0,36 months,Oct-2014,273.82,14.99%,C,security guard,1 year,58300.0,Debt consolidation,12.15,MORTGAGE,775xx,TX,0.0,Individual,17.0,0
1216290,10725.0,36 months,Jul-2014,385.43,17.57%,D,Teacher,1 year,36000.0,Debt consolidation,21.68,RENT,361xx,AL,19.27,Individual,18.0,0


In [10]:
loans['loan_status'].describe()

count    100432.000000
mean          0.207235
std           0.405327
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: loan_status, dtype: float64

__Our prior value of un paid loans is about 20%__ We will keep this value in mind!

### 02 - Numeric Variables

In [11]:
loans.dtypes

funded_amnt_inv       float64
term                   object
issue_d                object
installment           float64
int_rate               object
grade                  object
emp_title              object
emp_length             object
annual_inc            float64
title                  object
dti                   float64
home_ownership         object
zip_code               object
addr_state             object
total_rec_late_fee    float64
application_type       object
total_acc             float64
loan_status             int64
dtype: object

In [12]:
numerical_variables = ["funded_amnt_inv", "installment", "int_rate", "annual_inc", "dti",
                        "total_rec_late_fee", "total_acc"]

In [13]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv       0
installment           0
int_rate              0
annual_inc            0
dti                   6
total_rec_late_fee    0
total_acc             3
dtype: int64

__Interest Rate__

In [14]:
loans['int_rate']

1738120      7.49%
1261074     12.99%
985734      18.85%
1167943     14.99%
1216290     17.57%
1452944     13.33%
5668        18.25%
1267926     13.98%
1524323     10.99%
1390334     28.99%
1692298      9.99%
465932      14.99%
1310447     12.99%
1683022     13.99%
903463      13.61%
987535       9.71%
1567289     12.29%
1754244     15.59%
1312464      7.62%
1265474     12.99%
932894      14.30%
1150752     16.99%
938196      12.99%
1736743      8.19%
1673528     15.61%
1215228     12.99%
1014811     16.29%
1011153     21.98%
414345      11.49%
931802      19.20%
            ...   
1150940     17.57%
716887      12.62%
234551      24.49%
1629594     14.65%
220706      11.99%
1251407     12.99%
1294988     14.64%
708596      13.59%
1027150     15.31%
974808      22.70%
1684773     19.52%
1570475     11.53%
1137675     12.39%
1529159     12.69%
316981      12.79%
1265195      7.69%
1433435     10.99%
1105839     11.71%
1646249     14.65%
1207384      6.49%
251102      10.49%
989257      

In [15]:
loans['int_rate'] = loans['int_rate'].map(lambda x: float(x[:-1]))

In [16]:
loans['int_rate'].head()

1738120     7.49
1261074    12.99
985734     18.85
1167943    14.99
1216290    17.57
Name: int_rate, dtype: float64

In [17]:
loans.dtypes

funded_amnt_inv       float64
term                   object
issue_d                object
installment           float64
int_rate              float64
grade                  object
emp_title              object
emp_length             object
annual_inc            float64
title                  object
dti                   float64
home_ownership         object
zip_code               object
addr_state             object
total_rec_late_fee    float64
application_type       object
total_acc             float64
loan_status             int64
dtype: object

__Outliers__

In [18]:
loans[numerical_variables].describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc
count,100432.0,100432.0,100432.0,100432.0,100426.0,100432.0,100429.0
mean,14256.9269,436.587013,13.463686,75393.06,17.868144,1.126022,25.390674
std,8524.613057,257.093076,4.626455,62239.74,9.157764,7.593729,11.968376
min,0.0,16.31,5.32,0.0,0.0,0.0,1.0
25%,7950.0,251.36,9.99,45647.0,11.61,0.0,17.0
50%,12000.0,375.99,12.99,65000.0,17.33,0.0,24.0
75%,20000.0,574.3225,16.29,90000.0,23.63,0.0,32.0
max,40000.0,1584.9,30.99,9550000.0,999.0,291.9,135.0


In [19]:
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than n outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers

In [20]:
# detect outliers from numerical features 
outliers_to_drop = detect_outliers(loans,1,numerical_variables)

print("There are {} outliers from numerical features".format(len(outliers_to_drop)))

There are 1340 outliers from numerical features


In [21]:
loans = loans.drop(outliers_to_drop, axis=0)

In [22]:
loans.shape

(99092, 18)

In [23]:
loans[numerical_variables].describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc
count,99092.0,99092.0,99092.0,99092.0,99086.0,99092.0,99089.0
mean,14007.48494,427.709516,13.428513,73654.86,17.903371,0.926889,25.321489
std,8267.845456,245.497565,4.588009,58440.0,9.146164,6.328852,11.938413
min,0.0,16.31,5.32,0.0,0.0,0.0,1.0
25%,7800.0,249.55,9.99,45000.0,11.66,0.0,17.0
50%,12000.0,372.71,12.99,64322.0,17.38,0.0,24.0
75%,19925.0,564.18,16.29,90000.0,23.67,0.0,32.0
max,40000.0,1404.4,30.99,9550000.0,999.0,283.98,135.0


__Dealing NA's__

In [24]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv       0
installment           0
int_rate              0
annual_inc            0
dti                   6
total_rec_late_fee    0
total_acc             3
dtype: int64

In [25]:
loans[numerical_variables].median()

funded_amnt_inv       12000.00
installment             372.71
int_rate                 12.99
annual_inc            64322.00
dti                      17.38
total_rec_late_fee        0.00
total_acc                24.00
dtype: float64

In [26]:
loans[numerical_variables] = loans[numerical_variables].fillna(loans[numerical_variables].median())

In [27]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv       0
installment           0
int_rate              0
annual_inc            0
dti                   0
total_rec_late_fee    0
total_acc             0
dtype: int64

### 03 - Categorical Variables

In [28]:
categorical_variables = ['term', 'grade', 'emp_title', 'emp_length', 'title', 
                         'home_ownership', 'zip_code', 'addr_state', 'application_type']

In [29]:
for variable in categorical_variables:
    number_of_categories = loans[variable].unique().size
    print("{}: {} categories".format(variable, number_of_categories))

term: 2 categories
grade: 7 categories
emp_title: 47501 categories
emp_length: 12 categories
title: 9563 categories
home_ownership: 6 categories
zip_code: 879 categories
addr_state: 51 categories
application_type: 2 categories


Too many categories for `emp_title`, `title`, `zip_code` and `addr_state`. Let's take a look at these three variables:

__Employee Title__:

In [30]:
loans['emp_title'].value_counts().head(10)

Teacher             1369
Manager             1285
Registered Nurse     620
Owner                584
RN                   566
Supervisor           548
Sales                499
Driver               479
Project Manager      454
Office Manager       371
Name: emp_title, dtype: int64

NA's

In [31]:
loans['emp_title'].isnull().sum()

6019

In [32]:
loans['emp_title'] = loans['emp_title'].fillna(method="ffill")

In [33]:
loans['emp_title'].isnull().sum()

0

In [34]:
loans['emp_title'] = loans['emp_title'].str.lower()

In [35]:
loans['emp_title'][loans['emp_title'].str.contains('manager')] = 'manager'

In [36]:
loans['manager'] = (loans['emp_title'] == 'manager').astype(int)

In [37]:
loans = loans.drop('emp_title', axis=1)

In [38]:
loans['manager'].describe()

count    99092.000000
mean         0.127629
std          0.333678
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: manager, dtype: float64

__Purpose__ of the loan:

In [39]:
loans['title'].value_counts().head(20)

Debt consolidation           45800
Credit card refinancing      16721
Home improvement              4715
Other                         4164
Debt Consolidation            1716
Major purchase                1533
Medical expenses               852
Business                       740
Car financing                  685
Consolidation                  613
Vacation                       531
Moving and relocation          513
debt consolidation             489
Debt Consolidation Loan        432
Home buying                    345
Credit Card Consolidation      260
Personal Loan                  240
consolidation                  222
Home Improvement               203
Consolidation Loan             199
Name: title, dtype: int64

In [40]:
loans['title'].isnull().sum()

977

In [41]:
loans['title'] = loans['title'].fillna(method="ffill")

In [42]:
loans['title'].isnull().sum()

0

In [43]:
def loan_purpose_renaming():    
    title = loans['title'].str.lower()
    title[title.str.contains("credit card|credit")] = "credit card refinancing"
    title[title.str.contains("debt|consolidation|consolidate|refinance")] = "debt consolidation"
    title[title.str.contains("home improvement")] = "home improvement"
    title[title.str.contains("^home$|home loan")] = "home buying"
    title[title.str.contains("green")] = "green"
    title[title.str.contains("wedding")] = "wedding"
    title[title.str.contains("medical")] = "medical"
    title[title.str.contains("personal|my loan|^loan$|lending club")] = "personal"
    title[title.str.contains("business")] = "business"
    title[title.str.contains("pay(| )off")] = "payoff"
    title[title.str.contains("car loan|car financing")] = "car"
    
    return title

In [44]:
loans['title'] = loan_purpose_renaming()

In [45]:
loans['title'].value_counts()

debt consolidation                          54415
credit card refinancing                     20337
home improvement                             5249
other                                        4217
major purchase                               1575
personal                                     1168
medical                                      1024
business                                     1005
car                                           784
payoff                                        741
vacation                                      566
moving and relocation                         519
home buying                                   426
wedding                                       273
freedom                                       138
green                                          60
bills                                          59
loan 1                                         43
cc loan                                        40
pool loan                                      34


In [46]:
loans['refinance'] = ((loans['title'] == 'credit card refinancing') | \
                     (loans['title'] == 'debt consolidation')).astype(int)

In [47]:
loans = loans.drop('title', axis = 1)

In [48]:
loans['refinance'].describe()

count    99092.000000
mean         0.754370
std          0.430462
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: refinance, dtype: float64

__zip_code__:

In [49]:
loans['zip_code'].describe()

count     99092
unique      879
top       945xx
freq       1142
Name: zip_code, dtype: object

In [50]:
loans['zip_code'].value_counts().head()

945xx    1142
750xx    1048
112xx    1016
606xx     878
070xx     846
Name: zip_code, dtype: int64

In [51]:
loans = loans.drop('zip_code', axis=1)

__addr_state__:

In [52]:
loans['addr_state'].describe()

count     99092
unique       51
top          CA
freq      15025
Name: addr_state, dtype: object

In [53]:
loans['addr_state'].value_counts().head()

CA    15025
NY     8119
TX     7991
FL     6942
IL     3697
Name: addr_state, dtype: int64

In [54]:
loans = loans.drop('addr_state', axis = 1)

In [55]:
categorical_variables = ['term', 'grade', 'emp_length', 'home_ownership', 'application_type']

In [56]:
loans[categorical_variables].describe()

Unnamed: 0,term,grade,emp_length,home_ownership,application_type
count,99092,99092,93810,99092,99092
unique,2,7,11,6,2
top,36 months,B,10+ years,MORTGAGE,Individual
freq,75128,28461,32340,49219,98532


__issue_d__

We have just one date variable, `issue_d`. We are only interested in the year of the loan and we will consider it as categorical variable:

In [57]:
loans['issue_d'] = loans['issue_d'].map(lambda x: x[4:])

In [58]:
loans['issue_d'].describe()

count     99092
unique       11
top        2015
freq      27631
Name: issue_d, dtype: object

In [59]:
loans = loans.drop('issue_d', axis = 1)

In [60]:
loans.head()

Unnamed: 0,funded_amnt_inv,term,installment,int_rate,grade,emp_length,annual_inc,dti,home_ownership,total_rec_late_fee,application_type,total_acc,loan_status,manager,refinance
1738120,25000.0,36 months,777.55,7.49,A,10+ years,106157.0,9.37,MORTGAGE,0.0,Individual,49.0,0,0,1
1261074,12000.0,36 months,404.27,12.99,C,7 years,67000.0,21.33,MORTGAGE,0.0,Individual,28.0,1,1,1
985734,4800.0,36 months,175.59,18.85,D,3 years,56000.0,7.62,RENT,0.0,Individual,15.0,0,0,1
1167943,7900.0,36 months,273.82,14.99,C,1 year,58300.0,12.15,MORTGAGE,0.0,Individual,17.0,0,0,1
1216290,10725.0,36 months,385.43,17.57,D,1 year,36000.0,21.68,RENT,19.27,Individual,18.0,0,0,1


In [61]:
loans.shape

(99092, 15)

In [62]:
loans.isnull().sum()

funded_amnt_inv          0
term                     0
installment              0
int_rate                 0
grade                    0
emp_length            5282
annual_inc               0
dti                      0
home_ownership           0
total_rec_late_fee       0
application_type         0
total_acc                0
loan_status              0
manager                  0
refinance                0
dtype: int64

In [63]:
loans = loans.fillna(method = 'ffill')

In [64]:
loans.isnull().sum()

funded_amnt_inv       0
term                  0
installment           0
int_rate              0
grade                 0
emp_length            0
annual_inc            0
dti                   0
home_ownership        0
total_rec_late_fee    0
application_type      0
total_acc             0
loan_status           0
manager               0
refinance             0
dtype: int64

In [65]:
loans = pd.get_dummies(loans, columns = categorical_variables)

### 04 - Final Data 

In [66]:
loans.head()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc,loan_status,manager,refinance,...,emp_length_9 years,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,application_type_Individual,application_type_Joint App
1738120,25000.0,777.55,7.49,106157.0,9.37,0.0,49.0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
1261074,12000.0,404.27,12.99,67000.0,21.33,0.0,28.0,1,1,1,...,0,0,0,1,0,0,0,0,1,0
985734,4800.0,175.59,18.85,56000.0,7.62,0.0,15.0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
1167943,7900.0,273.82,14.99,58300.0,12.15,0.0,17.0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
1216290,10725.0,385.43,17.57,36000.0,21.68,19.27,18.0,0,0,1,...,0,0,0,0,0,0,0,1,1,0


In [67]:
loans['non_payment'] = loans['loan_status']

In [68]:
loans = loans.drop('loan_status', axis = 1)

In [69]:
loans.head()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc,manager,refinance,term_ 36 months,...,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,application_type_Individual,application_type_Joint App,non_payment
1738120,25000.0,777.55,7.49,106157.0,9.37,0.0,49.0,0,1,1,...,0,0,1,0,0,0,0,1,0,0
1261074,12000.0,404.27,12.99,67000.0,21.33,0.0,28.0,1,1,1,...,0,0,1,0,0,0,0,1,0,1
985734,4800.0,175.59,18.85,56000.0,7.62,0.0,15.0,0,1,1,...,0,0,0,0,0,0,1,1,0,0
1167943,7900.0,273.82,14.99,58300.0,12.15,0.0,17.0,0,1,1,...,0,0,1,0,0,0,0,1,0,0
1216290,10725.0,385.43,17.57,36000.0,21.68,19.27,18.0,0,1,1,...,0,0,0,0,0,0,1,1,0,0


In [70]:
loans.shape

(99092, 38)

In [71]:
loans.dtypes

funded_amnt_inv                float64
installment                    float64
int_rate                       float64
annual_inc                     float64
dti                            float64
total_rec_late_fee             float64
total_acc                      float64
manager                          int64
refinance                        int64
term_ 36 months                  uint8
term_ 60 months                  uint8
grade_A                          uint8
grade_B                          uint8
grade_C                          uint8
grade_D                          uint8
grade_E                          uint8
grade_F                          uint8
grade_G                          uint8
emp_length_1 year                uint8
emp_length_10+ years             uint8
emp_length_2 years               uint8
emp_length_3 years               uint8
emp_length_4 years               uint8
emp_length_5 years               uint8
emp_length_6 years               uint8
emp_length_7 years       

In [72]:
loans.describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc,manager,refinance,term_ 36 months,...,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,application_type_Individual,application_type_Joint App,non_payment
count,99092.0,99092.0,99092.0,99092.0,99092.0,99092.0,99092.0,99092.0,99092.0,99092.0,...,99092.0,99092.0,99092.0,99092.0,99092.0,99092.0,99092.0,99092.0,99092.0,99092.0
mean,14007.48494,427.709516,13.428513,73654.86,17.903339,0.926889,25.321449,0.127629,0.75437,0.758164,...,0.085315,0.000161,0.4967,7.1e-05,0.000222,0.102824,0.400022,0.994349,0.005651,0.20598
std,8267.845456,245.497565,4.588009,58440.0,9.145888,6.328852,11.938234,0.333678,0.430462,0.428198,...,0.279351,0.012706,0.499992,0.008405,0.014899,0.30373,0.489905,0.074963,0.074963,0.404418
min,0.0,16.31,5.32,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7800.0,249.55,9.99,45000.0,11.66,0.0,17.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,12000.0,372.71,12.99,64322.0,17.38,0.0,24.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,19925.0,564.18,16.29,90000.0,23.67,0.0,32.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,40000.0,1404.4,30.99,9550000.0,999.0,283.98,135.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [73]:
loans.to_csv('../data/loans_sample_processed.csv', sep = "^", index = False)