In [3]:
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.tree import export_graphviz
import graphviz
import pandas as pd


accepted_data = pd.read_csv("data/accepted_2007_to_2018Q4.csv", usecols = ['last_fico_range_high','last_fico_range_low','loan_amnt','dti','zip_code','addr_state','emp_length','loan_status'],nrows=4000)
rejected_data = pd.read_csv("data/rejected_2007_to_2018Q4.csv",usecols = ['Amount Requested', 'Debt-To-Income Ratio', 'Zip Code', 'State', 'Employment Length','Risk_Score'],nrows =4000)
num_of_accepted = len(accepted_data)
num_of_rejected = len(rejected_data)


accepted_data = accepted_data.dropna(axis=0)
rejected_data = rejected_data.dropna(axis=0)




In [4]:
#Calculate the feature from accepted data equivalent to "risk factor" in rejected data : mean of fico scores
cols = ['last_fico_range_high','last_fico_range_low']
Fico_mean = accepted_data[cols].astype(float).mean(axis=1) 

#add the new feature to accepted data
accepted_data_tmp = accepted_data.copy()
accepted_data_tmp['fico_mean'] = Fico_mean
accepted_data_new = accepted_data_tmp[['loan_amnt','dti','zip_code','addr_state','emp_length','fico_mean','loan_status']]

#Categorize accepted data by the loan status: Fully Paid =1, Charged Off, Default =0, else = remove rows
accepted_data_new.loan_status = accepted_data_new.loan_status.replace(['Fully Paid', 'Charged Off' , 'Default'] , [1,0,0])

#Remove rows that have values other than 'Fully Paid', 'Charged Off' , 'Current'
values_valid = [0,1]
accepted_data_new = accepted_data_new[accepted_data_new.loan_status.isin(values_valid)]
#print(accepted_data_new.head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [5]:
#Choosing the features from rejected-data
rejected_data_tmp = rejected_data.copy()
rejected_data_new = rejected_data_tmp[['Amount Requested', 'Debt-To-Income Ratio', 'Zip Code', 'State', 'Employment Length','Risk_Score']]

#Change the column headers to match the columns of accepted_data_new
rejected_data_new.columns = ['loan_amnt','dti','zip_code','addr_state','emp_length','fico_mean']

#Compute Fico scroe from Risk scores
rejected_data_new['fico_mean'] =  850 - rejected_data_new['fico_mean']



In [6]:
#Dealing with strings in the features
rejected_data_new['dti'] = rejected_data_new.dti.str.extract(r'(\d+)', expand=True).astype(int)
rejected_data_new['zip_code'] = rejected_data_new.zip_code.str.extract(r'(\d+)', expand=True).astype(int)
rejected_data_new['emp_length'] = rejected_data_new.emp_length.str.extract(r'(\d+)', expand=True).astype(int)

accepted_data_new['zip_code'] = accepted_data_new.zip_code.str.extract(r'(\d+)', expand=True).astype(int)
accepted_data_new['emp_length'] = accepted_data_new.emp_length.str.extract(r'(\d+)', expand=True).astype(int)

#Extract size of each accepted or rejected data
num_of_accepted = len(accepted_data_new)
num_of_rejected = len(rejected_data_new)

#Remove the "loan_status" column for the concatenation of all data
accepted_tmp = accepted_data_new[['loan_amnt','dti','zip_code','addr_state','emp_length','fico_mean']]

#Concatenate all data to do the numerization of the categorical data for all homogeneously
all_data = pd.concat([accepted_tmp,rejected_data_new])

#Change the categorical feature "addr_state" to numerical feature
address_state = all_data['addr_state'].unique().tolist()
all_data['addr_state'] = all_data['addr_state'].apply( lambda x : address_state.index(x))

#print(all_data.head())


In [7]:
#Separate accepted and rejected dat after homogenizing all features
processed_data_accepted = all_data[0:num_of_accepted]
processed_data_accepted = processed_data_accepted.astype('int')

processed_data_rejected = all_data[num_of_accepted:]
processed_data_rejected = processed_data_rejected.astype('int')

#target values for calssification: loan_status column
target = accepted_data_new.loan_status
target = target.astype('int')


In [8]:
#Shuffle data
cv = ShuffleSplit(n_splits= 10, test_size=0.4, random_state=0)

#Design a decision tree
clf = tree.DecisionTreeClassifier(max_depth=4,min_samples_leaf=6,min_samples_split=4)

#Perform cross validation and calculate the score 
score = cross_val_score(clf, processed_data_accepted, target, cv=cv)
print('cross validation scores:',score)

#Fit the model on the data to later visualize the designed tree
clf_fitted = clf.fit(processed_data_accepted, target)
predictions_accepted = clf_fitted.predict_proba(processed_data_accepted)
print(predictions_accepted)

#Visualize the tree
##Comment out if graphviz not installed
dot_data = tree.export_graphviz(clf, out_file=None,feature_names=processed_data_accepted.columns,class_names=['0','1'],filled=True, rounded=True,special_characters=True)  
graph = graphviz.Source(dot_data) 
graph
graph.render("accepted_data")

cross validation scores: [0.92529586 0.92159763 0.92899408 0.92455621 0.91568047 0.91420118
 0.91789941 0.92455621 0.92381657 0.9260355 ]
[[0.2755102  0.7244898 ]
 [0.00263852 0.99736148]
 [0.00263852 0.99736148]
 ...
 [0.00263852 0.99736148]
 [0.17692308 0.82307692]
 [0.92990654 0.07009346]]


'accepted_data.pdf'

In [9]:
#Set the constant 'c'
#Apply the designed tree on the rejected data
predictions_rejected = clf_fitted.predict_proba(processed_data_rejected)
rejected_probs  = predictions_rejected[:,1]
print(np.mean(rejected_probs))
c = (np.percentile(rejected_probs,95))+0.01
print(c)


0.20122756919972912
0.5655555555555556


In [10]:
#Compute 'utility' values
utility_accepted = predictions_accepted -c
print(utility_accepted)

utility_rejected = predictions_rejected -c
print(utility_rejected)


[[-0.29004535  0.15893424]
 [-0.56291703  0.43180592]
 [-0.56291703  0.43180592]
 ...
 [-0.56291703  0.43180592]
 [-0.38863248  0.25752137]
 [ 0.36435099 -0.4954621 ]]
[[-0.12111111 -0.01      ]
 [-0.12111111 -0.01      ]
 [ 0.36435099 -0.4954621 ]
 ...
 [ 0.36435099 -0.4954621 ]
 [ 0.36435099 -0.4954621 ]
 [-0.12111111 -0.01      ]]


In [None]:
#clf = clf.fit(X, Y)
#clf.predict_proba([[2., 2.]])