In [64]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt; plt.style.use("ggplot")
import seaborn as sns

from matplotlib import rcParams
from collections import Counter

%matplotlib inline
# figure size
rcParams['figure.figsize'] = 11,6
# remove warnings
import warnings
warnings.filterwarnings('ignore')

In [28]:
loans = pd.read_csv("../../data/loans_sample.csv", sep = "^")

In [29]:
loans.head()

Unnamed: 0,funded_amnt_inv,term,issue_d,installment,int_rate,grade,emp_title,emp_length,annual_inc,title,dti,home_ownership,zip_code,addr_state,total_rec_late_fee,application_type,total_acc,loan_status
0,25000.0,36 months,Jan-2015,777.55,7.49%,A,Senior Quality Engineer,10+ years,106157.0,Debt consolidation,9.37,MORTGAGE,922xx,CA,0.0,Individual,49.0,0
1,12000.0,36 months,May-2014,404.27,12.99%,C,service manager,7 years,67000.0,Debt consolidation,21.33,MORTGAGE,315xx,GA,0.0,Individual,28.0,1
2,4800.0,36 months,Aug-2013,175.59,18.85%,D,LAVO,3 years,56000.0,Consolidate,7.62,RENT,900xx,CA,0.0,Individual,15.0,0
3,7900.0,36 months,Oct-2014,273.82,14.99%,C,security guard,1 year,58300.0,Debt consolidation,12.15,MORTGAGE,775xx,TX,0.0,Individual,17.0,0
4,10725.0,36 months,Jul-2014,385.43,17.57%,D,Teacher,1 year,36000.0,Debt consolidation,21.68,RENT,361xx,AL,19.27,Individual,18.0,0


In [30]:
loans.dtypes

funded_amnt_inv       float64
term                   object
issue_d                object
installment           float64
int_rate               object
grade                  object
emp_title              object
emp_length             object
annual_inc            float64
title                  object
dti                   float64
home_ownership         object
zip_code               object
addr_state             object
total_rec_late_fee    float64
application_type       object
total_acc             float64
loan_status             int64
dtype: object

In [31]:
numerical_variables = ["funded_amnt_inv", "installment", "int_rate", "annual_inc", "dti",
                       "total_rec_late_fee", "total_acc"]

__Interest Rate__

In [32]:
loans['int_rate']

0           7.49%
1          12.99%
2          18.85%
3          14.99%
4          17.57%
5          13.33%
6          18.25%
7          13.98%
8          10.99%
9          28.99%
10          9.99%
11         14.99%
12         12.99%
13         13.99%
14         13.61%
15          9.71%
16         12.29%
17         15.59%
18          7.62%
19         12.99%
20         14.30%
21         16.99%
22         12.99%
23          8.19%
24         15.61%
25         12.99%
26         16.29%
27         21.98%
28         11.49%
29         19.20%
           ...   
100402     17.57%
100403     12.62%
100404     24.49%
100405     14.65%
100406     11.99%
100407     12.99%
100408     14.64%
100409     13.59%
100410     15.31%
100411     22.70%
100412     19.52%
100413     11.53%
100414     12.39%
100415     12.69%
100416     12.79%
100417      7.69%
100418     10.99%
100419     11.71%
100420     14.65%
100421      6.49%
100422     10.49%
100423      6.03%
100424      8.39%
100425      9.17%
100426    

In [33]:
loans['int_rate'] = loans['int_rate'].map(lambda x: float(x[:-1]))

In [34]:
loans['int_rate'].head()

0     7.49
1    12.99
2    18.85
3    14.99
4    17.57
Name: int_rate, dtype: float64

In [35]:
loans.dtypes

funded_amnt_inv       float64
term                   object
issue_d                object
installment           float64
int_rate              float64
grade                  object
emp_title              object
emp_length             object
annual_inc            float64
title                  object
dti                   float64
home_ownership         object
zip_code               object
addr_state             object
total_rec_late_fee    float64
application_type       object
total_acc             float64
loan_status             int64
dtype: object

__Outliers__

In [71]:
loans[numerical_variables].describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc
count,100432.0,100432.0,100432.0,100432.0,100426.0,100432.0,100429.0
mean,14256.9269,436.587013,13.463686,75393.06,17.868144,1.126022,25.390674
std,8524.613057,257.093076,4.626455,62239.74,9.157764,7.593729,11.968376
min,0.0,16.31,5.32,0.0,0.0,0.0,1.0
25%,7950.0,251.36,9.99,45647.0,11.61,0.0,17.0
50%,12000.0,375.99,12.99,65000.0,17.33,0.0,24.0
75%,20000.0,574.3225,16.29,90000.0,23.63,0.0,32.0
max,40000.0,1584.9,30.99,9550000.0,999.0,291.9,135.0


In [69]:
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than n outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers

In [70]:
# detect outliers from numerical features 
outliers_to_drop = detect_outliers(loans,1,numerical_variables)

print("There are {} outliers from numerical features".format(len(outliers_to_drop)))

There are 1340 outliers from numerical features


In [72]:
loans = loans.drop(outliers_to_drop, axis=0)

In [73]:
loans.shape

(99092, 18)

In [74]:
loans[numerical_variables].describe()

Unnamed: 0,funded_amnt_inv,installment,int_rate,annual_inc,dti,total_rec_late_fee,total_acc
count,99092.0,99092.0,99092.0,99092.0,99086.0,99092.0,99089.0
mean,14007.48494,427.709516,13.428513,73654.86,17.903371,0.926889,25.321489
std,8267.845456,245.497565,4.588009,58440.0,9.146164,6.328852,11.938413
min,0.0,16.31,5.32,0.0,0.0,0.0,1.0
25%,7800.0,249.55,9.99,45000.0,11.66,0.0,17.0
50%,12000.0,372.71,12.99,64322.0,17.38,0.0,24.0
75%,19925.0,564.18,16.29,90000.0,23.67,0.0,32.0
max,40000.0,1404.4,30.99,9550000.0,999.0,283.98,135.0


__Dealing NA's__

In [81]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv       0
installment           0
int_rate              0
annual_inc            0
dti                   6
total_rec_late_fee    0
total_acc             3
dtype: int64

In [79]:
loans[numerical_variables].median()

funded_amnt_inv       12000.00
installment             372.71
int_rate                 12.99
annual_inc            64322.00
dti                      17.38
total_rec_late_fee        0.00
total_acc                24.00
dtype: float64

In [82]:
loans[numerical_variables] = loans[numerical_variables].fillna(loans[numerical_variables].median())

In [83]:
loans[numerical_variables].isnull().sum()

funded_amnt_inv       0
installment           0
int_rate              0
annual_inc            0
dti                   0
total_rec_late_fee    0
total_acc             0
dtype: int64