In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the
# last one, so cells such as `data.shape` / `data.columns` / `data.head(5)`
# show all three outputs.
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
from sklearn.preprocessing import StandardScaler

In [16]:
from sklearn.preprocessing import MinMaxScaler

In [20]:
from sklearn.preprocessing import RobustScaler

# CONTENTS
### investigate file
### read data
### basic statistics
### standardization (Z-score normalization)
### normalization (min-max scaler)
### robust scaler (uses median and quantiles)

### investigate file

In [5]:
# Read only the first 1000 rows: enough to inspect the file's shape and columns.
# NOTE(review): this cell reads 'lending-club-data.csv' while the "read data"
# cell reads 'lending-club-data_coursera.csv' — confirm both files exist, or
# point both cells at the same file.
data = pd.read_csv('lending-club-data.csv', nrows=1000)

In [6]:
# Each bare expression below is displayed as its own output
# (InteractiveShell.ast_node_interactivity is set to 'all' at the top).
data.shape  # (rows, columns) of the preview
data.columns  # all column names in the file
data.head(5)  # first five rows

(1000, 68)

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'is_inc_v', 'issue_d',
       'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title',
       'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans', 'bad_loans',
       'emp_length_num', 'grade_num', 'sub_gra

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1,1,1,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1,1,1,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1,1,1,0,8.25955,20141201T000000,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,0.2,1,1,1,0,8.27585,20141201T000000,0,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.9,156.46,A,A4,...,0.8,1,1,1,0,5.21533,20141201T000000,1,1,1


### read data

In [7]:
# Load only the three numeric features used in the scaling examples,
# capped at the first 30000 rows of the file.
cols = ['loan_amnt', 'int_rate', 'installment']
data = pd.read_csv(
    'lending-club-data_coursera.csv',
    usecols=cols,
    nrows=30000,
)

In [8]:
# Sanity-check the load: expect 30000 rows and exactly the three requested columns.
data.shape
data.columns
data.head(5)

(30000, 3)

Index(['loan_amnt', 'int_rate', 'installment'], dtype='object')

Unnamed: 0,loan_amnt,int_rate,installment
0,5000,10.65,162.87
1,2500,15.27,59.83
2,2400,15.96,84.33
3,10000,13.49,339.31
4,5000,7.9,156.46


In [9]:
# Drop rows containing any missing value before fitting the scalers below.
# The shape stays (30000, 3), so the loaded rows had no missing values in
# these three columns.
data = data.dropna()
data.shape

(30000, 3)

### basic statistics
these statistics differ slightly from the tutorial results

In [10]:
# Per-column count, mean, std, min, quartiles and max of the unscaled data —
# the baseline to compare each scaler's output against.
data.describe()

Unnamed: 0,loan_amnt,int_rate,installment
count,30000.0,30000.0,30000.0
mean,10948.665833,11.713979,323.060486
std,7386.545848,3.813171,212.02296
min,1000.0,5.42,19.87
25%,5000.0,7.9,164.86
50%,9600.0,11.49,277.57
75%,15000.0,14.27,424.585
max,35000.0,24.4,1305.19


### standardization (Z-score normalization)   
- center variable at 0 and standardize variance at 1     
- min and max values vary according to how spread out initial data is, highly influenced by outliers   
- result   
mean = 0   
std = 1

In [12]:
# Fit on `data` and transform it in one step. The result is a NumPy array
# with one column per input feature (column names are not carried over).
scaler = StandardScaler() 
data_scaled = scaler.fit_transform(data)

In [14]:
# Column-wise check: after standardization the means should be ~0
# (up to floating-point error) and the standard deviations 1.
data_scaled.mean(axis=0)
data_scaled.std(axis=0)

array([ 1.06107715e-16, -5.12538160e-16, -1.26950302e-16])

array([1., 1., 1.])

In [15]:
# Unlike the mean and std, the extremes are not fixed by standardization:
# they depend on how far each column's smallest/largest value lies from its mean.
print('Min values (Loan Amount, Int rate and Installment): ', data_scaled.min(axis=0))
print('Max values (Loan Amount, Int rate and Installment): ', data_scaled.max(axis=0))

Min values (Loan Amount, Int rate and Installment):  [-1.34688552 -1.65061685 -1.43001276]
Max values (Loan Amount, Int rate and Installment):  [3.25615457 3.32695104 4.63226191]


### normalization (min-max scaler)   
- data scaled to fixed range, usually 0 to 1    
- compared to standardization, will have smaller std   
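- the bounded range can suppress the effect of outliers, but the min and max are themselves set by the most extreme values, so a single outlier compresses all other values into a narrow band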
- `x_norm = (x - x_min) / (x_max - x_min)`   
- result   
values NOT centered at 0   
std != 1

In [17]:
# Rescale each column with MinMaxScaler's default settings. `scaler` and
# `data_scaled` are reused, so this replaces the standardized results above.
scaler = MinMaxScaler() 
data_scaled = scaler.fit_transform(data)

In [18]:
# With min-max scaling the column means are not 0 and the stds are not 1;
# both end up well below 1 because every value lies in [0, 1].
print('means (Loan Amount, Int rate and Installment): ', data_scaled.mean(axis=0))
print('std (Loan Amount, Int rate and Installment): ', data_scaled.std(axis=0))

means (Loan Amount, Int rate and Installment):  [0.29260782 0.33161112 0.23588716]
std (Loan Amount, Int rate and Installment):  [0.21724773 0.20090133 0.16495458]


In [19]:
# Each column's smallest value maps to 0 and its largest to 1.
print('Min values (Loan Amount, Int rate and Installment): ', data_scaled.min(axis=0))
print('Max values (Loan Amount, Int rate and Installment): ', data_scaled.max(axis=0))

Min values (Loan Amount, Int rate and Installment):  [0. 0. 0.]
Max values (Loan Amount, Int rate and Installment):  [1. 1. 1.]


### robust scaler (uses median and quantiles)
- uses feature statistics that are robust to outliers   
- `IQR = 75th percentile - 25th percentile`   
- `X_scaled = (X - X.median) / IQR`   
- result   
median = 0, but mean NOT centered at 0   
std != 1   
min, max NOT set to particular values

In [21]:
# Scale each column by its median and interquartile range (see formulas above).
# `scaler` and `data_scaled` are reused, replacing the min-max results above.
scaler = RobustScaler() 
data_scaled = scaler.fit_transform(data)

In [22]:
# The median (not the mean) is what gets centred, so the column means are
# not 0, and the stds are not 1.
print('means (Loan Amount, Int rate and Installment): ', data_scaled.mean(axis=0))
print('std (Loan Amount, Int rate and Installment): ', data_scaled.std(axis=0))

means (Loan Amount, Int rate and Installment):  [0.13486658 0.03516154 0.17514866]
std (Loan Amount, Int rate and Installment):  [0.73864227 0.59860396 0.81632275]


In [23]:
# The extremes are not pinned to any fixed range; they are expressed in
# units of each column's IQR relative to its median.
print('Min values (Loan Amount, Int rate and Installment): ', data_scaled.min(axis=0))
print('Max values (Loan Amount, Int rate and Installment): ', data_scaled.max(axis=0))

Min values (Loan Amount, Int rate and Installment):  [-0.86       -0.95290424 -0.99220329]
Max values (Loan Amount, Int rate and Installment):  [2.54       2.0266876  3.95656945]
