Processing Variables For Modeling
==================

Libraries

In [1]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt; plt.style.use("ggplot")

from collections import Counter

Read Data

In [2]:
loans = pd.read_csv("../data/clean/loans.csv", sep = "^").sample(200000, random_state = 42)

In [3]:
loans.head()

Unnamed: 0,funded_amnt_inv,term,issue_d,installment,int_rate,grade,emp_title,emp_length,annual_inc,title,dti,home_ownership,zip_code,addr_state,total_rec_late_fee,application_type,total_acc,loan_status
634766,10800.0,36 months,Apr-2017,379.65,15.99%,C,Sales,1 year,58800.0,Debt consolidation,23.41,RENT,776xx,TX,0.0,Individual,17.0,Current
1211417,12000.0,36 months,Jul-2014,378.2,8.39%,A,Retention,2 years,60000.0,Debt consolidation,5.26,OWN,302xx,GA,0.0,Individual,10.0,Fully Paid
329169,11500.0,60 months,Jul-2016,270.52,14.49%,C,,,50400.0,Debt consolidation,33.17,MORTGAGE,981xx,WA,0.0,Individual,25.0,Fully Paid
874986,9400.0,36 months,Oct-2017,305.17,10.42%,B,CNC Machinist,1 year,39360.0,Debt consolidation,28.66,RENT,067xx,CT,0.0,Individual,14.0,Current
1494916,11000.0,36 months,Sep-2015,331.27,5.32%,A,Pilot,10+ years,81184.0,Credit card refinancing,28.69,OWN,325xx,FL,0.0,Individual,28.0,Fully Paid


### 01 - Target: Loan Status

`loan_status` is the current status of the loan. This is the variable we want to predict in our machine learning model. For this variable, we are going to considerar three labels:
- 0: loans that have already been paid.
- 1: default or charged off loans.
- 2: current loans (rest of the cases), where we don't know if they are going to be paid or not.

We will use labels 0 and 1 for training and testing our model. Label 2 is going to use just for predicting.

In [4]:
loans['loan_status'].value_counts()

Current                                                95633
Fully Paid                                             79226
Charged Off                                            20522
Late (31-120 days)                                      2437
In Grace Period                                         1352
Late (16-30 days)                                        510
Does not meet the credit policy. Status:Fully Paid       222
Does not meet the credit policy. Status:Charged Off       93
Default                                                    5
Name: loan_status, dtype: int64

In [5]:
def process_loan_status(loan_status):
    
    loan_status_dict = {
    "Current": 2,
    "Fully Paid": 0,
    "Charged Off": 1,
    "Late (31-120 days)": 2,
    "In Grace Period": 2,
    "Late (16-30 days)": 2,
    "Does not meet the credit policy. Status:Fully Paid": 0,
    "Does not meet the credit policy. Status:Charged Off": 1,
    "Default": 1
    }
    
    return loan_status_dict[loan_status]

In [6]:
loans['loan_status'] = loans['loan_status'].map(process_loan_status)

In [7]:
loans['loan_status'].head(10)

634766     2
1211417    0
329169     0
874986     2
1494916    0
648273     2
402339     2
1669339    0
130885     2
562257     2
Name: loan_status, dtype: int64

In [8]:
loans = loans[loans['loan_status'] < 2]

In [9]:
loans.head()

Unnamed: 0,funded_amnt_inv,term,issue_d,installment,int_rate,grade,emp_title,emp_length,annual_inc,title,dti,home_ownership,zip_code,addr_state,total_rec_late_fee,application_type,total_acc,loan_status
1211417,12000.0,36 months,Jul-2014,378.2,8.39%,A,Retention,2 years,60000.0,Debt consolidation,5.26,OWN,302xx,GA,0.0,Individual,10.0,0
329169,11500.0,60 months,Jul-2016,270.52,14.49%,C,,,50400.0,Debt consolidation,33.17,MORTGAGE,981xx,WA,0.0,Individual,25.0,0
1494916,11000.0,36 months,Sep-2015,331.27,5.32%,A,Pilot,10+ years,81184.0,Credit card refinancing,28.69,OWN,325xx,FL,0.0,Individual,28.0,0
1669339,10000.0,60 months,Apr-2015,223.92,12.29%,C,appeals associate,7 years,48600.0,Debt consolidation,15.8,MORTGAGE,280xx,NC,0.0,Individual,24.0,0
1696382,7150.0,36 months,Mar-2015,259.39,18.25%,E,,,22450.8,Debt consolidation,22.94,RENT,184xx,PA,0.0,Individual,21.0,1


In [10]:
loans['loan_status'].describe()

count    100068.000000
mean          0.206060
std           0.404476
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: loan_status, dtype: float64

__Our prior value of un paid loans is about 20%__ We will keep this value in mind!

### 02 - Numeric Variables

In [11]:
loans.dtypes

funded_amnt_inv       float64
term                   object
issue_d                object
installment           float64
int_rate               object
grade                  object
emp_title              object
emp_length             object
annual_inc            float64
title                  object
dti                   float64
home_ownership         object
zip_code               object
addr_state             object
total_rec_late_fee    float64
application_type       object
total_acc             float64
loan_status             int64
dtype: object

In [12]:
numerical_variables = ["funded_amnt_inv", "installment", "int_rate", "annual_inc", "dti",
                        "total_rec_late_fee", "total_acc"]

In [13]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv       0
installment           0
int_rate              0
annual_inc            1
dti                   4
total_rec_late_fee    0
total_acc             3
dtype: int64

__Interest Rate__

In [14]:
loans['int_rate']

1211417      8.39%
329169      14.49%
1494916      5.32%
1669339     12.29%
1696382     18.25%
1342783     11.99%
1612327     12.69%
1336811     15.61%
1382949     17.57%
1609462     15.61%
1728038      9.49%
1243026     14.49%
1380942      7.89%
1415736     16.55%
906984      10.38%
1048877     17.77%
1382735     13.67%
613590      15.99%
1757560     17.14%
1002742     13.11%
1197292      6.03%
1085400     14.09%
1606580     21.99%
1202196     10.15%
1317034     16.59%
1159241     14.49%
1220000     14.99%
987301      18.25%
1324845     14.47%
262061      12.79%
            ...   
1629668     20.49%
152654       7.39%
1446208     13.33%
1206607     15.61%
1021704     14.33%
1028918     15.80%
1290370     12.99%
1259082     23.43%
1295241     24.50%
1287684     12.49%
706555      24.85%
922961      14.98%
475725      24.74%
1056099     17.77%
1544073      7.26%
1267061     12.99%
194975      17.27%
227950       8.39%
1018460     15.80%
893909       7.49%
1243486     19.52%
1510777     

In [15]:
loans['int_rate'] = loans['int_rate'].map(lambda x: float(x[:-1]))

In [16]:
loans['int_rate'].head()

1211417     8.39
329169     14.49
1494916     5.32
1669339    12.29
1696382    18.25
Name: int_rate, dtype: float64

In [17]:
loans.dtypes

funded_amnt_inv       float64
term                   object
issue_d                object
installment           float64
int_rate              float64
grade                  object
emp_title              object
emp_length             object
annual_inc            float64
title                  object
dti                   float64
home_ownership         object
zip_code               object
addr_state             object
total_rec_late_fee    float64
application_type       object
total_acc             float64
loan_status             int64
dtype: object

__Outliers__

In [18]:
loans[numerical_variables].describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc
count,100068.0,100068.0,100068.0,100067.0,100064.0,100068.0,100065.0
mean,14240.401546,435.564755,13.436653,75479.54,17.841266,1.099204,25.340878
std,8516.08286,255.805453,4.646925,65246.63,9.067211,7.429252,11.98068
min,0.0,16.08,5.32,0.0,0.0,0.0,1.0
25%,7925.0,250.59,9.99,45000.0,11.59,0.0,17.0
50%,12000.0,375.85,12.99,65000.0,17.28,0.0,24.0
75%,20000.0,573.06,16.29,90000.0,23.58,0.0,32.0
max,40000.0,1517.09,30.99,8300000.0,818.1,338.46,162.0


In [19]:
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than n outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers

In [20]:
# detect outliers from numerical features 
outliers_to_drop = detect_outliers(loans,1,numerical_variables)

print("There are {} outliers from numerical features".format(len(outliers_to_drop)))

There are 481 outliers from numerical features


In [21]:
loans = loans.drop(outliers_to_drop, axis=0)

In [22]:
loans.shape

(99587, 18)

In [23]:
loans[numerical_variables].describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc
count,99587.0,99587.0,99587.0,99586.0,99583.0,99587.0,99584.0
mean,14149.931486,432.283494,13.40625,75177.69,17.83899,0.951194,25.322793
std,8417.699841,251.264015,4.604616,64629.72,9.067207,6.440769,11.972187
min,0.0,16.08,5.32,0.0,0.0,0.0,1.0
25%,7850.0,250.29,9.99,45000.0,11.59,0.0,17.0
50%,12000.0,375.14,12.99,65000.0,17.28,0.0,24.0
75%,19975.0,568.7,16.29,90000.0,23.58,0.0,32.0
max,40000.0,1404.4,30.99,8300000.0,818.1,268.55,162.0


__Dealing NA's__

In [24]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv       0
installment           0
int_rate              0
annual_inc            1
dti                   4
total_rec_late_fee    0
total_acc             3
dtype: int64

In [25]:
loans[numerical_variables].median()

funded_amnt_inv       12000.00
installment             375.14
int_rate                 12.99
annual_inc            65000.00
dti                      17.28
total_rec_late_fee        0.00
total_acc                24.00
dtype: float64

In [26]:
loans[numerical_variables] = loans[numerical_variables].fillna(loans[numerical_variables].median())

In [27]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv       0
installment           0
int_rate              0
annual_inc            0
dti                   0
total_rec_late_fee    0
total_acc             0
dtype: int64

### 03 - Categorical Variables

In [28]:
categorical_variables = ['term', 'grade', 'emp_title', 'emp_length', 'title', 
                         'home_ownership', 'zip_code', 'addr_state', 'application_type']

In [29]:
for variable in categorical_variables:
    number_of_categories = loans[variable].unique().size
    print("{}: {} categories".format(variable, number_of_categories))

term: 2 categories
grade: 7 categories
emp_title: 47634 categories
emp_length: 12 categories
title: 9534 categories
home_ownership: 6 categories
zip_code: 884 categories
addr_state: 51 categories
application_type: 2 categories


Too many categories for `emp_title`, `title`, `zip_code` and `addr_state`. Let's take a look at these three variables:

__Employee Title__:

In [30]:
loans['emp_title'].value_counts().head(10)

Manager             1344
Teacher             1338
Registered Nurse     602
Supervisor           590
RN                   578
Owner                548
Sales                539
Driver               471
Project Manager      439
Office Manager       381
Name: emp_title, dtype: int64

NA's

In [31]:
loans['emp_title'].isnull().sum()

5944

In [32]:
loans['emp_title'] = loans['emp_title'].fillna(method="ffill")

In [33]:
loans['emp_title'].isnull().sum()

0

In [34]:
loans['emp_title'] = loans['emp_title'].str.lower()

In [35]:
loans['emp_title'][loans['emp_title'].str.contains('manager')] = 'manager'

In [36]:
loans['manager'] = (loans['emp_title'] == 'manager').astype(int)

In [37]:
loans = loans.drop('emp_title', axis=1)

In [38]:
loans['manager'].describe()

count    99587.000000
mean         0.128651
std          0.334815
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: manager, dtype: float64

__Purpose__ of the loan:

In [39]:
loans['title'].value_counts().head(20)

Debt consolidation           45915
Credit card refinancing      16740
Home improvement              4812
Other                         4178
Debt Consolidation            1767
Major purchase                1520
Medical expenses               907
Business                       775
Car financing                  704
Consolidation                  644
Vacation                       528
debt consolidation             524
Moving and relocation          500
Debt Consolidation Loan        394
Home buying                    344
Personal Loan                  259
Credit Card Consolidation      229
consolidation                  227
Home Improvement               206
Consolidation Loan             182
Name: title, dtype: int64

In [40]:
loans['title'].isnull().sum()

975

In [41]:
loans['title'] = loans['title'].fillna(method="ffill")

In [42]:
loans['title'].isnull().sum()

0

In [43]:
def loan_purpose_renaming():    
    title = loans['title'].str.lower()
    title[title.str.contains("credit card|credit")] = "credit card refinancing"
    title[title.str.contains("debt|consolidation|consolidate|refinance")] = "debt consolidation"
    title[title.str.contains("home improvement")] = "home improvement"
    title[title.str.contains("^home$|home loan")] = "home buying"
    title[title.str.contains("green")] = "green"
    title[title.str.contains("wedding")] = "wedding"
    title[title.str.contains("medical")] = "medical"
    title[title.str.contains("personal|my loan|^loan$|lending club")] = "personal"
    title[title.str.contains("business")] = "business"
    title[title.str.contains("pay(| )off")] = "payoff"
    title[title.str.contains("car loan|car financing")] = "car"
    
    return title

In [44]:
loans['title'] = loan_purpose_renaming()

In [45]:
loans['title'].value_counts()

debt consolidation                                                 54589
credit card refinancing                                            20370
home improvement                                                    5372
other                                                               4223
major purchase                                                      1576
personal                                                            1255
medical                                                             1073
business                                                            1057
car                                                                  822
payoff                                                               737
vacation                                                             556
moving and relocation                                                504
home buying                                                          441
wedding                                            

In [46]:
loans['refinance'] = ((loans['title'] == 'credit card refinancing') | \
                     (loans['title'] == 'debt consolidation')).astype(int)

In [47]:
loans = loans.drop('title', axis = 1)

In [48]:
loans['refinance'].describe()

count    99587.000000
mean         0.752699
std          0.431446
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: refinance, dtype: float64

__zip_code__:

In [49]:
loans['zip_code'].describe()

count     99587
unique      884
top       945xx
freq       1151
Name: zip_code, dtype: object

In [50]:
loans['zip_code'].value_counts().head()

945xx    1151
750xx    1088
112xx    1044
606xx     925
300xx     922
Name: zip_code, dtype: int64

In [51]:
loans = loans.drop('zip_code', axis=1)

__addr_state__:

In [52]:
loans['addr_state'].describe()

count     99587
unique       51
top          CA
freq      15117
Name: addr_state, dtype: object

In [53]:
loans['addr_state'].value_counts().head()

CA    15117
NY     8102
TX     8020
FL     6969
IL     3776
Name: addr_state, dtype: int64

In [54]:
loans = loans.drop('addr_state', axis = 1)

In [55]:
categorical_variables = ['term', 'grade', 'emp_length', 'home_ownership', 'application_type']

In [56]:
loans[categorical_variables].describe()

Unnamed: 0,term,grade,emp_length,home_ownership,application_type
count,99587,99587,94372,99587,99587
unique,2,7,11,6,2
top,36 months,B,10+ years,MORTGAGE,Individual
freq,75575,28866,32483,49392,99025


__issue_d__

We have just one date variable, `issue_d`. We are only interested in the year of the loan and we will consider it as categorical variable:

In [57]:
loans['issue_d'] = loans['issue_d'].map(lambda x: x[4:])

In [58]:
loans['issue_d'].describe()

count     99587
unique       11
top        2015
freq      27922
Name: issue_d, dtype: object

In [59]:
loans = loans.drop('issue_d', axis = 1)

In [60]:
loans.head()

Unnamed: 0,funded_amnt_inv,term,installment,int_rate,grade,emp_length,annual_inc,dti,home_ownership,total_rec_late_fee,application_type,total_acc,loan_status,manager,refinance
1211417,12000.0,36 months,378.2,8.39,A,2 years,60000.0,5.26,OWN,0.0,Individual,10.0,0,0,1
329169,11500.0,60 months,270.52,14.49,C,,50400.0,33.17,MORTGAGE,0.0,Individual,25.0,0,0,1
1494916,11000.0,36 months,331.27,5.32,A,10+ years,81184.0,28.69,OWN,0.0,Individual,28.0,0,0,1
1669339,10000.0,60 months,223.92,12.29,C,7 years,48600.0,15.8,MORTGAGE,0.0,Individual,24.0,0,0,1
1696382,7150.0,36 months,259.39,18.25,E,,22450.8,22.94,RENT,0.0,Individual,21.0,1,0,1


In [61]:
loans.shape

(99587, 15)

In [62]:
loans.isnull().sum()

funded_amnt_inv          0
term                     0
installment              0
int_rate                 0
grade                    0
emp_length            5215
annual_inc               0
dti                      0
home_ownership           0
total_rec_late_fee       0
application_type         0
total_acc                0
loan_status              0
manager                  0
refinance                0
dtype: int64

In [63]:
loans = loans.fillna(method = 'ffill')

In [64]:
loans.isnull().sum()

funded_amnt_inv       0
term                  0
installment           0
int_rate              0
grade                 0
emp_length            0
annual_inc            0
dti                   0
home_ownership        0
total_rec_late_fee    0
application_type      0
total_acc             0
loan_status           0
manager               0
refinance             0
dtype: int64

In [65]:
loans = pd.get_dummies(loans, columns = categorical_variables)

### 04 - Final Data 

In [66]:
loans.head()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc,loan_status,manager,refinance,...,emp_length_9 years,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,application_type_Individual,application_type_Joint App
1211417,12000.0,378.2,8.39,60000.0,5.26,0.0,10.0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
329169,11500.0,270.52,14.49,50400.0,33.17,0.0,25.0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
1494916,11000.0,331.27,5.32,81184.0,28.69,0.0,28.0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
1669339,10000.0,223.92,12.29,48600.0,15.8,0.0,24.0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
1696382,7150.0,259.39,18.25,22450.8,22.94,0.0,21.0,1,0,1,...,0,0,0,0,0,0,0,1,1,0


In [67]:
loans['non_payment'] = loans['loan_status']

In [68]:
loans = loans.drop('loan_status', axis = 1)

In [69]:
loans.head()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc,manager,refinance,term_ 36 months,...,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,application_type_Individual,application_type_Joint App,non_payment
1211417,12000.0,378.2,8.39,60000.0,5.26,0.0,10.0,0,1,1,...,0,0,0,0,0,1,0,1,0,0
329169,11500.0,270.52,14.49,50400.0,33.17,0.0,25.0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
1494916,11000.0,331.27,5.32,81184.0,28.69,0.0,28.0,0,1,1,...,0,0,0,0,0,1,0,1,0,0
1669339,10000.0,223.92,12.29,48600.0,15.8,0.0,24.0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
1696382,7150.0,259.39,18.25,22450.8,22.94,0.0,21.0,0,1,1,...,0,0,0,0,0,0,1,1,0,1


In [70]:
loans.shape

(99587, 38)

In [71]:
loans.dtypes

funded_amnt_inv                float64
installment                    float64
int_rate                       float64
annual_inc                     float64
dti                            float64
total_rec_late_fee             float64
total_acc                      float64
manager                          int64
refinance                        int64
term_ 36 months                  uint8
term_ 60 months                  uint8
grade_A                          uint8
grade_B                          uint8
grade_C                          uint8
grade_D                          uint8
grade_E                          uint8
grade_F                          uint8
grade_G                          uint8
emp_length_1 year                uint8
emp_length_10+ years             uint8
emp_length_2 years               uint8
emp_length_3 years               uint8
emp_length_4 years               uint8
emp_length_5 years               uint8
emp_length_6 years               uint8
emp_length_7 years       

In [72]:
loans.describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc,manager,refinance,term_ 36 months,...,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,application_type_Individual,application_type_Joint App,non_payment
count,99587.0,99587.0,99587.0,99587.0,99587.0,99587.0,99587.0,99587.0,99587.0,99587.0,...,99587.0,99587.0,99587.0,99587.0,99587.0,99587.0,99587.0,99587.0,99587.0,99587.0
mean,14149.931486,432.283494,13.40625,75177.59,17.838967,0.951194,25.322753,0.128651,0.752699,0.758884,...,0.083997,0.00012,0.495968,9e-05,0.000221,0.103327,0.400273,0.994357,0.005643,0.204816
std,8417.699841,251.264015,4.604616,64629.4,9.067025,6.440769,11.972009,0.334815,0.431446,0.427763,...,0.277385,0.010977,0.499986,0.009506,0.014862,0.304387,0.489956,0.07491,0.07491,0.403569
min,0.0,16.08,5.32,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7850.0,250.29,9.99,45000.0,11.59,0.0,17.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,12000.0,375.14,12.99,65000.0,17.28,0.0,24.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,19975.0,568.7,16.29,90000.0,23.58,0.0,32.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,40000.0,1404.4,30.99,8300000.0,818.1,268.55,162.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [73]:
loans.to_csv('../data/loans_sample_processed.csv', sep = "^", index = False)