In [142]:
import pandas as pd
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt

%matplotlib inline

import statsmodels.api as sm
from sklearn import linear_model, datasets
from sklearn.cross_validation import train_test_split

In [143]:
url = 'Lending_Club_Stats_2015_v2.csv'
loan = pd.read_csv(url, low_memory = False)

In [144]:
loan.head()

Unnamed: 0,id,loan_amnt,term,int_rate,grade,sub_grade,emp_length,home_ownership,annual_inc,loan_status,purpose,addr_state,dti,Default_Status
0,1085,10850,36,0.18,D,D5,10+ years,MORTGAGE,47000.0,Fully Paid,home_improvement,CA,0,0
1,2406,15000,36,0.12,C,C1,7 years,OWN,97000.0,Fully Paid,house,IL,0,0
2,3565,4000,36,0.11,B,B4,2 years,OWN,36000.0,Current,car,CT,0,0
3,3713,35000,36,0.09,B,B2,< 1 year,MORTGAGE,200000.0,Current,home_improvement,CA,0,0
4,3783,24000,36,0.08,B,B1,2 years,RENT,98000.0,Current,debt_consolidation,OR,0,0


In [145]:
loan.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 421095 entries, 0 to 421094
Data columns (total 14 columns):
id                421095 non-null int64
loan_amnt         421095 non-null int64
term              421095 non-null int64
int_rate          421095 non-null float64
grade             421095 non-null object
sub_grade         421095 non-null object
emp_length        421095 non-null object
home_ownership    421095 non-null object
annual_inc        421095 non-null float64
loan_status       421095 non-null object
purpose           421095 non-null object
addr_state        421095 non-null object
dti               421095 non-null int64
Default_Status    421095 non-null int64
dtypes: float64(2), int64(5), object(7)
memory usage: 48.2+ MB


In [146]:
loan['emp_length_clean'] = loan.emp_length.str.replace('+','')
loan['emp_length_clean'] = loan.emp_length_clean.str.replace('<','')
loan['emp_length_clean'] = loan.emp_length_clean.str.replace('years','')
loan['emp_length_clean'] = loan.emp_length_clean.str.replace('year','')
loan['emp_length_clean'] = loan.emp_length_clean.str.replace('n/a','0')

In [147]:
loan.emp_length_clean.unique()

array(['10 ', '7 ', '2 ', ' 1 ', '0', '4 ', '3 ', '6 ', '5 ', '1 ', '8 ',
       '9 '], dtype=object)

In [148]:
loan['grade_clean'] = loan['grade'].map({'A':7,'B':6,'C':5,'D':4,'E':3,'F':2,'G':1})

In [149]:
loan_amnt = loan.loan_amnt
mean_loan_amnt = loan[loan.loan_amnt.notnull()].loan_amnt.mean()
loan.loan_amnt.fillna(mean_loan_amnt, inplace=True)


In [150]:
annual_inc = loan.annual_inc
mean_annual_inc = loan[loan.annual_inc.notnull()].annual_inc.mean()
loan.annual_inc.fillna(mean_annual_inc, inplace=True)

In [151]:
emp_length = loan.emp_length_clean
mean_emp_length_clean = loan[loan.emp_length_clean.notnull()].emp_length_clean.mean()
loan.emp_length_clean.fillna(mean_emp_length_clean, inplace=True)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [152]:
grade = loan.grade
mean_grade_clean = loan[loan.grade.notnull()].grade_clean.mean()
loan.grade_clean.fillna(mean_grade_clean, inplace=True)

In [153]:
loan.head()

Unnamed: 0,id,loan_amnt,term,int_rate,grade,sub_grade,emp_length,home_ownership,annual_inc,loan_status,purpose,addr_state,dti,Default_Status,emp_length_clean,grade_clean
0,1085,10850,36,0.18,D,D5,10+ years,MORTGAGE,47000.0,Fully Paid,home_improvement,CA,0,0,10,4
1,2406,15000,36,0.12,C,C1,7 years,OWN,97000.0,Fully Paid,house,IL,0,0,7,5
2,3565,4000,36,0.11,B,B4,2 years,OWN,36000.0,Current,car,CT,0,0,2,6
3,3713,35000,36,0.09,B,B2,< 1 year,MORTGAGE,200000.0,Current,home_improvement,CA,0,0,1,6
4,3783,24000,36,0.08,B,B1,2 years,RENT,98000.0,Current,debt_consolidation,OR,0,0,2,6


In [17]:
loan.home_ownership.unique().tolist()

['RENT', 'MORTGAGE', 'OWN', 'ANY']

In [18]:
home_ownership = pd.get_dummies(loan.home_ownership)
loan = loan.join(home_ownership)

In [19]:
loan.head()

Unnamed: 0,id,loan_amnt,term,int_rate,grade,sub_grade,emp_length,home_ownership,annual_inc,loan_status,purpose,addr_state,dti,Default_Status,emp_length_clean,grade_clean,ANY,MORTGAGE,OWN,RENT
0,1,20000,36 months,9.80%,B,B3,5 years,RENT,75000.0,Fully Paid,debt_consolidation,WA,23.45,0,5,6,0,0,0,1
1,2,15000,60 months,13.44%,C,C3,4 years,RENT,77213.0,Fully Paid,credit_card,FL,18.17,0,4,5,0,0,0,1
2,3,7200,36 months,11.48%,B,B5,10+ years,MORTGAGE,72500.0,Fully Paid,home_improvement,GA,8.47,0,10,6,0,1,0,0
3,4,15000,36 months,8.49%,B,B1,6 years,MORTGAGE,110000.0,Fully Paid,credit_card,NJ,13.24,0,6,6,0,1,0,0
4,5,16000,60 months,8.49%,B,B1,5 years,MORTGAGE,62000.0,Current,credit_card,CA,28.92,0,5,6,0,1,0,0


In [30]:
X_Variables_3 = ['RENT', 'MORTGAGE', 'OWN', 'ANY']
X_3 = loan[X_Variables_3]

In [31]:
X_3 = X_3.values

In [32]:
y_3 = loan['Default_Status'].values

In [33]:
clf = linear_model.LogisticRegression()

In [34]:
model_3 = clf.fit(X_3,y_3)

In [35]:
model_3.score(X_3,y_3)


0.83180517460430548

In [155]:
homeowner = pd.DataFrame(list(zip(X_Variables_3, model_3.coef_.T)))

homeowner

Unnamed: 0,0,1
0,RENT,[-0.082840161079]
1,MORTGAGE,[-0.40535641738]
2,OWN,[-0.289959289972]
3,ANY,[0.158438032719]


In [38]:
loan.purpose.unique().tolist()

['debt_consolidation',
 'credit_card',
 'home_improvement',
 'house',
 'medical',
 'other',
 'major_purchase',
 'car',
 'small_business',
 'vacation',
 'moving',
 'renewable_energy',
 'wedding',
 'educational']

In [39]:
purpose = pd.get_dummies(loan.purpose)
loan = loan.join(purpose)

In [47]:
X_Variables_2 = ['debt_consolidation', 'credit_card', 'home_improvement', 'house','medical','other','major_purchase'
                 ,'car','small_business','vacation','moving','renewable_energy','wedding','educational']
X_2 = loan[X_Variables_2]

In [48]:
X_2 = X_2.values

y_2 = loan['Default_Status'].values

In [49]:
model_2 = clf.fit(X_2,y_2)

model_2.score(X_2,y_2)

0.83180517460430548

In [158]:
Purpose = pd.DataFrame(list(zip(X_Variables_2, model_2.coef_.T)))
Purpose


Unnamed: 0,0,1
0,debt_consolidation,[-0.082840161079]
1,credit_card,[-0.40535641738]
2,home_improvement,[-0.289959289972]
3,house,[0.158438032719]
4,medical,[-0.040916365649]
5,other,[-0.0671391437716]
6,major_purchase,[-0.179284499142]
7,car,[-0.380615118894]
8,small_business,[0.32088979098]
9,vacation,[-0.151853966418]


In [51]:
loan.head()

Unnamed: 0,id,loan_amnt,term,int_rate,grade,sub_grade,emp_length,home_ownership,annual_inc,loan_status,...,home_improvement,house,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,wedding
0,1,20000,36 months,9.80%,B,B3,5 years,RENT,75000.0,Fully Paid,...,0,0,0,0,0,0,0,0,0,0
1,2,15000,60 months,13.44%,C,C3,4 years,RENT,77213.0,Fully Paid,...,0,0,0,0,0,0,0,0,0,0
2,3,7200,36 months,11.48%,B,B5,10+ years,MORTGAGE,72500.0,Fully Paid,...,1,0,0,0,0,0,0,0,0,0
3,4,15000,36 months,8.49%,B,B1,6 years,MORTGAGE,110000.0,Fully Paid,...,0,0,0,0,0,0,0,0,0,0
4,5,16000,60 months,8.49%,B,B1,5 years,MORTGAGE,62000.0,Current,...,0,0,0,0,0,0,0,0,0,0


In [178]:
employed = pd.get_dummies(loan.emp_length)
loan = loan.join(employed)

In [179]:
X_Variables_emp = ['< 1 year','1 year','2 years','3 years','4 years','5 years','6 years','7 years','8 years','9 years','10+ years']


In [180]:
x = loan[X_Variables_emp]

y = loan['Default_Status'].values

model_emp = clf.fit(x,y)

In [185]:
clf = linear_model.LogisticRegression()

model_emp = clf.fit(x,y)

In [186]:
model_emp.score(x,y)

0.83180517460430548

In [191]:
employ_df = pd.DataFrame(list(zip(x,model_emp.coef_.T)))
employ_df

Unnamed: 0,0,1
0,< 1 year,[-0.287473153855]
1,1 year,[-0.246619096844]
2,2 years,[-0.303567067747]
3,3 years,[-0.305584907324]
4,4 years,[-0.295608569165]
5,5 years,[-0.302809501994]
6,6 years,[-0.332268917485]
7,7 years,[-0.306520056964]
8,8 years,[-0.290465010487]
9,9 years,[-0.30771925425]


In [193]:
grade = pd.get_dummies(loan.grade)
loan = loan.join(grade)
loan.head()

Unnamed: 0,id,loan_amnt,term,int_rate,grade,sub_grade,emp_length,home_ownership,annual_inc,loan_status,...,9 years,< 1 year,n/a,A,B,C,D,E,F,G
0,1085,10850,36,0.18,D,D5,10+ years,MORTGAGE,47000.0,Fully Paid,...,0,0,0,0,0,0,1,0,0,0
1,2406,15000,36,0.12,C,C1,7 years,OWN,97000.0,Fully Paid,...,0,0,0,0,0,1,0,0,0,0
2,3565,4000,36,0.11,B,B4,2 years,OWN,36000.0,Current,...,0,0,0,0,1,0,0,0,0,0
3,3713,35000,36,0.09,B,B2,< 1 year,MORTGAGE,200000.0,Current,...,0,1,0,0,1,0,0,0,0,0
4,3783,24000,36,0.08,B,B1,2 years,RENT,98000.0,Current,...,0,0,0,0,1,0,0,0,0,0


In [195]:
X_Variables_grade = ['A','B','C','D','E','F','G']

x_G = loan[X_Variables_grade]

y_G = loan['Default_Status'].values

In [196]:
clf = linear_model.LogisticRegression()

model_G = clf.fit(x_G,y_G)

In [197]:
model_G.score(x_G,y_G)

0.83180517460430548

In [198]:
Grade_df = pd.DataFrame(list(zip(x_G,model_G.coef_.T)))
Grade_df

Unnamed: 0,0,1
0,A,[-1.80261320734]
1,B,[-0.978206251838]
2,C,[-0.393658741176]
3,D,[0.0537432698369]
4,E,[0.358809757624]
5,F,[0.727000869455]
6,G,[0.921722180888]


In [164]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

In [165]:
url = 'Lending_Club_Stats_2015_v2.csv'
loan2 = pd.read_csv(url, low_memory = False)

In [167]:
data = loan2.drop(["id","grade","sub_grade","emp_length","home_ownership","loan_status","purpose","addr_state"], axis=1)

In [169]:
x = data.drop("Default_Status", axis=1)
y = data['Default_Status']
print(x.shape, y.shape)

(421095, 5) (421095,)


In [171]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1, stratify=y)

In [172]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [173]:
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [174]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8318066246386402
Testing Data Score: 0.831800824515075


In [175]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]


In [177]:
logistic_regression_prediction = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
logistic_regression_prediction

Unnamed: 0,Actual,Prediction
0,0,0
1,0,0
2,0,0
3,1,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [199]:
LogReg = LogisticRegression()
LogReg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [200]:
y_pred = LogReg.predict(X_test)


In [201]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
confusion_matrix

array([[87567,     0],
       [17707,     0]])

In [52]:
from sklearn import tree
import pandas as pd

In [131]:
url = 'Lending_Club_Stats_2015_v2.csv'
loan2 = pd.read_csv(url, low_memory = False)

In [132]:
target = loan2["Default_Status"]

In [133]:
target_names = ['Default','Paid']

In [134]:
data = loan2.drop(["Default_Status","id","grade","sub_grade","emp_length","home_ownership","loan_status","purpose","addr_state"], axis=1)

In [135]:

feature_names = data.columns

data.head()

Unnamed: 0,loan_amnt,term,int_rate,annual_inc,dti
0,10850,36,0.18,47000.0,0
1,15000,36,0.12,97000.0,0
2,4000,36,0.11,36000.0,0
3,35000,36,0.09,200000.0,0
4,24000,36,0.08,98000.0,0


In [136]:
data.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 421095 entries, 0 to 421094
Data columns (total 5 columns):
loan_amnt     421095 non-null int64
term          421095 non-null int64
int_rate      421095 non-null float64
annual_inc    421095 non-null float64
dti           421095 non-null int64
dtypes: float64(2), int64(3)
memory usage: 19.3 MB


In [137]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, stratify=target)

In [138]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.73832095294184696

In [140]:
from  sklearn.ensemble  import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.79996010410927676

In [163]:
random_forest_df = pd.DataFrame(sorted(zip(rf.feature_importances_, feature_names), reverse=True))
random_forest_df

Unnamed: 0,0,1
0,0.390535,annual_inc
1,0.318606,loan_amnt
2,0.179492,dti
3,0.101447,int_rate
4,0.00992,term
