Processing Variables For Modeling
==================

In [1]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from collections import Counter

In [2]:
loans = pd.read_csv("../data/clean/loans.csv", sep = "^")

### 01 - Target: Loan Status

`loan_status` is the current status of the loan. This is the variable we want to predict in our machine learning model. For this variable, we are going to considerar three labels:
- 0: loans that have already been paid.
- 1: default or charged off loans.
- 2: current loans (rest of the cases), where we don't know if they are going to be paid or not.

We will use labels 0 and 1 for training and testing our model. Label 2 is going to use just for predicting.

In [3]:
loans['loan_status'].value_counts()

Current                                                843754
Fully Paid                                             698690
Charged Off                                            182199
Late (31-120 days)                                      21742
In Grace Period                                         11812
Late (16-30 days)                                        4423
Does not meet the credit policy. Status:Fully Paid       1988
Does not meet the credit policy. Status:Charged Off       761
Default                                                    57
Name: loan_status, dtype: int64

In [4]:
def process_loan_status(loan_status):
    
    loan_status_dict = {
    "Current": 2,
    "Fully Paid": 0,
    "Charged Off": 1,
    "Late (31-120 days)": 2,
    "In Grace Period": 2,
    "Late (16-30 days)": 2,
    "Does not meet the credit policy. Status:Fully Paid": 0,
    "Does not meet the credit policy. Status:Charged Off": 1,
    "Default": 1
    }
    
    return loan_status_dict[loan_status]

In [5]:
loans['loan_status'] = loans['loan_status'].map(process_loan_status).\
                            astype('category')

In [6]:
loans['loan_status'].describe()

count     1765426
unique          3
top             2
freq       881731
Name: loan_status, dtype: int64

### 02 - Categorical Variables

In [7]:
categorical_variables = ['term', 'grade', 'emp_title', 'emp_length', 'title', 
                         'home_ownership', 'zip_code', 'addr_state', 'application_type']

In [8]:
for variable in categorical_variables:
    number_of_categories = loans[variable].unique().size
    print("{}: {} categories".format(variable, number_of_categories))

term: 2 categories
grade: 7 categories
emp_title: 449773 categories
emp_length: 12 categories
title: 63156 categories
home_ownership: 6 categories
zip_code: 954 categories
addr_state: 51 categories
application_type: 2 categories


Too many categories for `emp_title`, `title` and `zip_code`. Let's take a look at these three variables:

__emp_title__:

In [9]:
loans['emp_title'].value_counts().head(10)

Teacher             30145
Manager             26290
Owner               16496
Registered Nurse    12234
RN                  11756
Supervisor          11230
Driver              10445
Sales               10131
Project Manager      8656
Office Manager       7553
Name: emp_title, dtype: int64

In [10]:
loans.loc[~((loans['emp_title'] == 'Teacher') |
            (loans['emp_title'] == 'Manager') | 
            (loans['emp_title'] == 'Owner')),'emp_title'] = 'Other'

In [11]:
loans['emp_title'].value_counts()

Other      1692495
Teacher      30145
Manager      26290
Owner        16496
Name: emp_title, dtype: int64

__title__:

In [12]:
loans['title'].value_counts().head(20)

Debt consolidation           893651
Credit card refinancing      341989
Home improvement             104689
Other                         92691
Major purchase                33218
Medical expenses              18766
Business                      16221
Debt Consolidation            15763
Car financing                 15547
Vacation                      10942
Moving and relocation         10685
Home buying                    7284
Consolidation                  5385
debt consolidation             4837
Debt Consolidation Loan        3804
Credit Card Consolidation      2360
consolidation                  2175
Personal Loan                  2133
Consolidation Loan             1778
Home Improvement               1773
Name: title, dtype: int64

In [13]:
loans = loans.drop('title', axis=1)

__zip_code__:

In [14]:
loans['zip_code'].describe()

count     1765425
unique        953
top         945xx
freq        18858
Name: zip_code, dtype: object

In [15]:
loans = loans.drop('zip_code', axis=1)

__Transform to categorical__:

In [16]:
categorical_variables = ['term', 'grade', 'emp_title', 'emp_length', 
                         'home_ownership', 'addr_state', 'application_type']

In [17]:
for variable in categorical_variables:
    loans[variable] = loans[variable].astype("category")

In [18]:
loans[categorical_variables].describe()

Unnamed: 0,term,grade,emp_title,emp_length,home_ownership,addr_state,application_type
count,1765426,1765426,1765426,1660506,1765426,1765426,1765426
unique,2,7,4,11,6,51,2
top,36 months,C,Other,10+ years,MORTGAGE,CA,Individual
freq,1265083,523203,1692495,587623,872230,247266,1713701


#### 3.3 - Dates

We have just one date variable, `issue_d`. We are only interested in the year of the loan and we will consider it as categorical variable:

In [19]:
loans['issue_d'] = loans['issue_d'].map(lambda x: x[4:])

In [20]:
loans['issue_d'] = loans['issue_d'].astype('category')

In [21]:
loans['issue_d'].describe()

count     1765426
unique         11
top          2017
freq       443579
Name: issue_d, dtype: object

#### 3.4 - Numeric Variables

In [22]:
numerical_variables = ["funded_amnt_inv", "installment", "int_rate", "annual_inc", "dti",
                       "last_pymnt_amnt", "total_pymnt_inv", "total_rec_late_fee", "total_acc"]

The only variable we are going to process in this part is the interest rate on the loan (`int_rate`). We have to take the number without the percentage symbol and then transform to float:

__int_rate__:

In [23]:
loans['int_rate'] = loans['int_rate'].map(lambda x: float(x[:-1]))

In [24]:
loans.dtypes

funded_amnt_inv        float64
term                  category
issue_d               category
installment            float64
int_rate               float64
grade                 category
emp_title             category
emp_length            category
annual_inc             float64
dti                    float64
home_ownership        category
addr_state            category
last_pymnt_amnt        float64
total_pymnt_inv        float64
total_rec_late_fee     float64
application_type      category
total_acc              float64
loan_status           category
dtype: object

__outliers detection__:

In [25]:
loans[numerical_variables].describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,last_pymnt_amnt,total_pymnt_inv,total_rec_late_fee,total_acc
count,1765426.0,1765426.0,1765426.0,1765422.0,1764847.0,1765426.0,1765426.0,1765426.0,1765397.0
mean,14743.41,439.9808,13.19561,77402.89,18.58771,3152.905,10930.59,1.033835,24.59412
std,8888.966,261.0757,4.733918,118515.8,11.83439,5803.45,9373.107,8.223289,11.91904
min,0.0,4.93,5.32,0.0,-1.0,0.0,0.0,-9.5e-09,1.0
25%,8000.0,251.36,9.75,46000.0,12.02,300.63,3749.95,0.0,16.0
50%,12500.0,376.37,12.74,65000.0,17.87,541.64,8359.535,0.0,23.0
75%,20000.0,581.58,15.99,92000.0,24.36,2782.267,15455.18,0.0,31.0
max,40000.0,1719.83,30.99,110000000.0,999.0,42148.53,62404.47,1126.26,176.0


In [26]:
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers

In [32]:
# detect outliers from numerical features 
outliers_to_drop = detect_outliers(loans,2,numerical_variables)

print("There are {} outliers from numerical features".format(len(outliers_to_drop)))

There are 17011 outliers from numerical features


In [33]:
loans = loans.drop(outliers_to_drop, axis=0)

In [34]:
loans[numerical_variables].describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,last_pymnt_amnt,total_pymnt_inv,total_rec_late_fee,total_acc
count,1748415.0,1748415.0,1748415.0,1748411.0,1747842.0,1748415.0,1748415.0,1748415.0,1748386.0
mean,14549.81,433.0376,13.17208,76733.63,18.59645,2965.384,10660.73,0.9746508,24.53686
std,8706.615,252.2696,4.714838,118419.1,11.78862,5419.843,8989.155,7.73271,11.89776
min,0.0,4.93,5.32,0.0,-1.0,0.0,0.0,-9.5e-09,1.0
25%,8000.0,250.29,9.75,46000.0,12.02,298.75,3710.585,0.0,16.0
50%,12175.0,373.92,12.69,65000.0,17.88,533.75,8258.25,0.0,23.0
75%,20000.0,573.43,15.95,91000.0,24.37,2512.75,15155.4,0.0,31.0
max,40000.0,1598.26,30.99,110000000.0,999.0,38666.22,61513.72,742.17,176.0


#### 3.5 - Dealing with NA's

In [35]:
loans.isnull().sum()

funded_amnt_inv            0
term                       0
issue_d                    0
installment                0
int_rate                   0
grade                      0
emp_title                  0
emp_length            104538
annual_inc                 4
dti                      573
home_ownership             0
addr_state                 0
last_pymnt_amnt            0
total_pymnt_inv            0
total_rec_late_fee         0
application_type           0
total_acc                 29
loan_status                0
dtype: int64

In [37]:
loans = loans.fillna(method = 'ffill')

In [38]:
loans.isnull().sum()

funded_amnt_inv       0
term                  0
issue_d               0
installment           0
int_rate              0
grade                 0
emp_title             0
emp_length            0
annual_inc            0
dti                   0
home_ownership        0
addr_state            0
last_pymnt_amnt       0
total_pymnt_inv       0
total_rec_late_fee    0
application_type      0
total_acc             0
loan_status           0
dtype: int64

### 04 - Data for modeling

In [39]:
data_for_modeling = loans[(loans['loan_status'] == 0) |
                          (loans['loan_status'] == 1)]

__Get dummies__:

In [40]:
data_for_modeling = pd.get_dummies(data_for_modeling, columns = categorical_variables)

In [41]:
data_for_modeling.head()

Unnamed: 0,funded_amnt_inv,issue_d,installment,int_rate,annual_inc,dti,last_pymnt_amnt,total_pymnt_inv,total_rec_late_fee,total_acc,...,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,application_type_Individual,application_type_Joint App
0,10000.0,2016,262.34,19.53,52000.0,15.0,9575.49,11127.83,0.0,12.0,...,0,0,0,0,0,0,0,0,1,0
1,35000.0,2016,941.96,20.75,85000.0,24.98,509.1,37226.47,0.0,19.0,...,0,0,0,0,0,0,0,0,1,0
5,14400.0,2016,469.74,10.75,85000.0,28.11,10992.93,15673.13,0.0,33.0,...,0,0,0,0,0,0,0,0,1,0
6,18000.0,2016,400.31,11.99,100000.0,20.41,14885.49,20465.85,0.0,40.0,...,0,0,0,0,1,0,0,0,1,0
10,7250.0,2016,246.63,13.67,72000.0,23.93,6310.66,7779.43,0.0,24.0,...,0,0,0,0,0,0,0,0,1,0


In [42]:
data_for_modeling.shape

(867279, 94)

In [None]:
data_for_modeling.to_csv("../data/clean/loans_train_test.csv", sep = "^", index = False)

In [None]:
!ls -lh ../data/clean