In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt; plt.style.use("ggplot")

In [3]:
# Read the "^"-delimited cleaned loans file and keep a reproducible
# random sample of 200,000 rows (fixed random_state).
loans = (
    pd.read_csv("../../data/clean/loans.csv", sep="^")
    .sample(n=200000, random_state=4290)
)

In [4]:
# Peek at the first five sampled rows.
loans.head(n=5)

Unnamed: 0,funded_amnt_inv,term,issue_d,installment,int_rate,grade,emp_title,emp_length,annual_inc,title,dti,home_ownership,zip_code,addr_state,total_rec_late_fee,application_type,total_acc,loan_status
1496716,14375.0,60 months,Sep-2015,362.31,17.57%,D,Shop foreman,8 years,140000.0,Debt consolidation,26.55,MORTGAGE,306xx,GA,0.0,Individual,39.0,Current
566100,3000.0,36 months,Jun-2017,105.51,16.02%,C,Respiratory Therapist,6 years,50000.0,Debt consolidation,33.46,MORTGAGE,550xx,MN,0.0,Individual,22.0,Current
191845,7000.0,36 months,Apr-2016,230.74,11.47%,B,Natural Resource Manager,10+ years,102000.0,Major purchase,5.55,MORTGAGE,389xx,MS,0.0,Individual,47.0,Current
611191,8800.0,36 months,Apr-2017,276.74,8.24%,B,GIS TECHNICIAN,10+ years,42000.0,Debt consolidation,13.16,MORTGAGE,285xx,NC,0.0,Individual,31.0,Current
1737593,16475.0,60 months,Jan-2015,430.86,19.24%,E,Sr. Retirement Counselor,8 years,50000.0,Debt consolidation,32.35,MORTGAGE,374xx,TN,0.0,Individual,32.0,Current


In [5]:
# Frequency of each raw loan status in the sample.
loans.loan_status.value_counts()

Current                                                95158
Fully Paid                                             79394
Charged Off                                            20721
Late (31-120 days)                                      2542
In Grace Period                                         1354
Late (16-30 days)                                        514
Does not meet the credit policy. Status:Fully Paid       225
Does not meet the credit policy. Status:Charged Off       84
Default                                                    8
Name: loan_status, dtype: int64

`loan_status` is the current status of the loan. This is the variable we want to predict in our machine learning model. For this variable, we are going to consider two labels:
- 0: loans that have already been paid.
- 1: default or charged off loans.

For the purpose of this project, we are going to consider the __Charged Off__ and __Default__ statuses as unpaid loans (label 1) and __Fully Paid__ as paid loans (label 0). The "Does not meet the credit policy" variants of Fully Paid and Charged Off are labelled in the same way. The remaining loan statuses are going to be removed because those loans have not finished yet.

In [6]:
# Lookup table from the raw `loan_status` text to a numeric label:
#   0 -> paid, 1 -> unpaid (charged off / default), 2 -> loan still in progress.
# Defined once at module level so it is not rebuilt on every call
# (the function is applied to every row of the sample).
LOAN_STATUS_LABELS = {
    "Current": 2,
    "Fully Paid": 0,
    "Charged Off": 1,
    "Late (31-120 days)": 2,
    "In Grace Period": 2,
    "Late (16-30 days)": 2,
    "Does not meet the credit policy. Status:Fully Paid": 0,
    "Does not meet the credit policy. Status:Charged Off": 1,
    "Default": 1,
}


def process_loan_status(loan_status):
    """Translate a raw loan status string into its numeric label.

    Parameters
    ----------
    loan_status : str
        One of the status strings listed in ``LOAN_STATUS_LABELS``.

    Returns
    -------
    int
        0 for paid loans, 1 for unpaid loans (charged off or default),
        2 for loans that are still in progress.

    Raises
    ------
    KeyError
        If ``loan_status`` is not one of the known status strings.
    """
    return LOAN_STATUS_LABELS[loan_status]

In [7]:
# Replace the status text with its numeric label.
# NOTE: this overwrites the column in place, so running the cell a second time
# fails with KeyError (the labels 0/1/2 are not keys of the lookup table).
status_labels = loans['loan_status'].map(process_loan_status)
loans['loan_status'] = status_labels

In [10]:
# First ten numeric labels, to check the mapping.
loans['loan_status'].iloc[:10]

1496716    2
566100     2
191845     2
611191     2
1737593    2
292947     2
1738120    0
214732     2
1261074    1
199096     2
Name: loan_status, dtype: int64

In [13]:
# Keep only rows labelled 0 or 1; label 2 marks loans that are still in progress.
loans = loans.loc[loans['loan_status'].lt(2)]

In [14]:
# First five rows after filtering.
loans.head(n=5)

Unnamed: 0,funded_amnt_inv,term,issue_d,installment,int_rate,grade,emp_title,emp_length,annual_inc,title,dti,home_ownership,zip_code,addr_state,total_rec_late_fee,application_type,total_acc,loan_status
1738120,25000.0,36 months,Jan-2015,777.55,7.49%,A,Senior Quality Engineer,10+ years,106157.0,Debt consolidation,9.37,MORTGAGE,922xx,CA,0.0,Individual,49.0,0
1261074,12000.0,36 months,May-2014,404.27,12.99%,C,service manager,7 years,67000.0,Debt consolidation,21.33,MORTGAGE,315xx,GA,0.0,Individual,28.0,1
985734,4800.0,36 months,Aug-2013,175.59,18.85%,D,LAVO,3 years,56000.0,Consolidate,7.62,RENT,900xx,CA,0.0,Individual,15.0,0
1167943,7900.0,36 months,Oct-2014,273.82,14.99%,C,security guard,1 year,58300.0,Debt consolidation,12.15,MORTGAGE,775xx,TX,0.0,Individual,17.0,0
1216290,10725.0,36 months,Jul-2014,385.43,17.57%,D,Teacher,1 year,36000.0,Debt consolidation,21.68,RENT,361xx,AL,19.27,Individual,18.0,0


In [21]:
# Write the current `loans` frame as a "^"-delimited file, without the index.
loans.to_csv(
    '../../data/loans_sample.csv',
    sep="^",
    index=False,
)

In [17]:
# Summary statistics of the 0/1 label; the mean is the share of rows labelled 1.
loans.loan_status.describe()

count    100432.000000
mean          0.207235
std           0.405327
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: loan_status, dtype: float64

__Our prior rate of unpaid loans is about 20%.__ We will keep this value in mind!