In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Load the Lending Club dataset

In [86]:
# Path of the lending club CSV, relative to the notebook's working directory.
LOANS_CSV_PATH = './lending-club-data.csv'
loans = pd.read_csv(LOANS_CSV_PATH)

# Explore some features

In [87]:
# Display the column index of the loaded frame to see which raw fields exist.
loans.columns

Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv',
       u'term', u'int_rate', u'installment', u'grade', u'sub_grade',
       u'emp_title', u'emp_length', u'home_ownership', u'annual_inc',
       u'is_inc_v', u'issue_d', u'loan_status', u'pymnt_plan', u'url', u'desc',
       u'purpose', u'title', u'zip_code', u'addr_state', u'dti',
       u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths',
       u'mths_since_last_delinq', u'mths_since_last_record', u'open_acc',
       u'pub_rec', u'revol_bal', u'revol_util', u'total_acc',
       u'initial_list_status', u'out_prncp', u'out_prncp_inv', u'total_pymnt',
       u'total_pymnt_inv', u'total_rec_prncp', u'total_rec_int',
       u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee',
       u'last_pymnt_d', u'last_pymnt_amnt', u'next_pymnt_d',
       u'last_credit_pull_d', u'collections_12_mths_ex_med',
       u'mths_since_last_major_derog', u'policy_code', u'not_compliant',
       u'status', u'inactiv

# Explore the target column

In [88]:
def _to_safe_label(bad_flag):
    """Translate a `bad_loans` value into the target encoding: +1 when it equals 0, otherwise -1."""
    if bad_flag == 0:
        return +1
    return -1


# Build the +1 / -1 target, then remove the column it was derived from.
loans['safe_loans'] = loans['bad_loans'].apply(_to_safe_label)
loans = loans.drop('bad_loans', axis=1)

In [95]:
# Count each class once and reuse the counts for both the shares and the totals.
total_loans = len(loans)
safe_count = len(loans[loans['safe_loans'] == +1])
risky_count = len(loans[loans['safe_loans'] == -1])

# NOTE: the values labelled "percentage" are fractions of the total (not multiplied by 100).
print('percentage of safe loan = ' + str(1.0 * safe_count / total_loans))
print('percentage of risky loan = ' + str(1.0 * risky_count / total_loans))

print('number of safe loan = ' + str(safe_count))
print('number of risky loan = ' + str(risky_count))

percentage of safe loan = 0.811185331996
percentage of risky loan = 0.188814668004
number of safe loan = 99457
number of risky loan = 23150


# Features for the classification algorithm

In [90]:
# Columns handed to the classifier; the trailing comment describes each one.
features = [
    'grade',                  # grade of the loan
    'sub_grade',              # sub-grade of the loan
    'short_emp',              # one year or less of employment
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home_ownership status: own, mortgage or rent
    'dti',                    # debt to income ratio
    'purpose',                # the purpose of the loan
    'term',                   # the term of the loan
    'last_delinq_none',       # has borrower had a delinquency
    'last_major_derog_none',  # has borrower had 90 day or worse rating
    'revol_util',             # percent of available credit being used
    'total_rec_late_fee',     # total late fees received to date
]

target = 'safe_loans'  # prediction target (y): +1 means safe, -1 means risky

# Keep only the model inputs and the label.
loans = loans[features + [target]]

# Columns stored with object dtype are treated as categorical.
categorical_variables = [
    column_name
    for column_dtype, column_name in zip(loans.dtypes, loans.columns)
    if column_dtype == object
]

# One-hot encode every categorical column: the original column is removed and
# its indicator columns (prefixed with the column name) are appended on the right.
for feature in categorical_variables:
    loan_data_unpacked = pd.get_dummies(loans[feature], prefix=feature).fillna(0)
    loans = pd.concat([loans.drop(feature, axis=1), loan_data_unpacked], axis=1)

In [107]:
# Load the predefined train / validation row positions.
# `typ` is documented as 'frame' or 'series'; 'series' is requested explicitly
# here instead of passing the builtin `list`, which is not a documented value.
# Presumably each file holds a flat JSON array of integer positions -- TODO confirm.
train_idx = pd.read_json('./module-5-assignment-1-train-idx.json', typ='series')
validation_idx = pd.read_json('./module-5-assignment-1-validation-idx.json', typ='series')

# `.iloc` selects by position, so the indices refer to row order in `loans`.
train_data = loans.iloc[train_idx]
validation_data = loans.iloc[validation_idx]

In [109]:
# Report the number of rows in the training split, then the validation split.
for split in (train_data, validation_data):
    print(len(split))

37224
9284


# Build a decision tree classifier

In [119]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Both trees are trained on the same inputs, so build them once.
train_features = train_data.drop(target, axis=1)
train_labels = train_data[target]

# Depth-6 tree. It is fit on the DataFrame directly (no `.as_matrix()`, which is
# deprecated in pandas) so it is trained the same way as `small_model` and the
# same way it is later scored.
decision_tree_model = DecisionTreeClassifier(max_depth=6)
decision_tree_model.fit(train_features, train_labels)

# Depth-2 tree, small enough to visualise. The fit call stays last so the cell
# displays the fitted estimator.
small_model = DecisionTreeClassifier(max_depth=2)
small_model.fit(train_features, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

# Visualizing a learned model

In [145]:
# Feature names in training-column order (every column except the label), so
# the names written into the exported tree match the columns the model was fit on.
features = [column for column in train_data.columns if column != target]

# Write `small_model` as a Graphviz DOT file. The return value of
# export_graphviz is intentionally not bound to anything: assigning it to the
# file-handle name would only shadow the open file inside the `with` block.
# NOTE(review): class_names are expected in ascending label order (-1, +1),
# i.e. risky then safe -- confirm against the scikit-learn docs.
with open("tree.dot", 'w') as dot_file:
    export_graphviz(small_model, out_file=dot_file,
                    class_names=['risky', 'safe'], feature_names=features)

# Make predictions

In [128]:
# Split the validation rows by their true label.
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

# Take the first two rows of each class as a small, hand-checkable sample.
sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

# Safe rows first, then risky rows. pd.concat is used instead of
# DataFrame.append, which is deprecated in pandas; the row order and index
# labels are the same as with append.
sample_validation_data = pd.concat([sample_validation_data_safe, sample_validation_data_risky])

sample_validation_data

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans,grade_A,grade_B,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
19,0,11,11.18,1,1,82.4,0.0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79,0,10,16.85,1,1,96.4,0.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
24,0,3,13.97,0,1,59.5,0.0,-1,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
41,0,11,16.33,1,1,62.1,0.0,-1,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Q1

In [150]:
# Model inputs for the sample: every column except the label. Using
# `drop(target, ...)` for all calls avoids relying on `features` having been
# rebound to the one-hot column names by the visualization cell.
sample_features = sample_validation_data.drop(target, axis=1)

# Predicted labels from the depth-6 tree, followed by the true labels.
# `.values` replaces the deprecated `.as_matrix()` and prints the same array.
predicted = decision_tree_model.predict(sample_features)
print(predicted)
print(sample_validation_data[target].values)

# Class probabilities per row, first from the small tree, then the depth-6 tree.
print(small_model.predict_proba(sample_features))
print(decision_tree_model.predict_proba(sample_features))

[ 1 -1 -1  1]
[ 1  1 -1 -1]
[[ 0.41896585  0.58103415]
 [ 0.59255339  0.40744661]
 [ 0.59255339  0.40744661]
 [ 0.23120112  0.76879888]]
[[ 0.34156543  0.65843457]
 [ 0.53630646  0.46369354]
 [ 0.64750958  0.35249042]
 [ 0.20789474  0.79210526]]


In [149]:
# Print the built-in documentation of the model's predict method (reference only;
# the result is not used by any later cell).
help(small_model.predict)

Help on method predict in module sklearn.tree.tree:

predict(self, X, check_input=True) method of sklearn.tree.tree.DecisionTreeClassifier instance
    Predict class or regression value for X.
    
    For a classification model, the predicted class for each sample in X is
    returned. For a regression model, the predicted value based on X is
    returned.
    
    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The input samples. Internally, it will be converted to
        ``dtype=np.float32`` and if a sparse matrix is provided
        to a sparse ``csr_matrix``.
    
    check_input : boolean, (default=True)
        Allow to bypass several input checking.
        Don't use this parameter unless you know what you do.
    
    Returns
    -------
    y : array of shape = [n_samples] or [n_samples, n_outputs]
        The predicted classes, or the predict values.



In [132]:
# Compute the share of correct predictions on the sample instead of hard-coding
# it, so the message stays true if the model or the sample changes.
sample_predictions = decision_tree_model.predict(sample_validation_data.drop(target, axis=1))
sample_accuracy = (sample_predictions == sample_validation_data[target].values).mean()
print('{:.0f}% of predictions using decision_tree_model is correct'.format(100.0 * sample_accuracy))

50% of predictions using decision_tree_model is correct


# Evaluating accuracy of the decision tree model

In [135]:
# Inputs and labels of the training split, shared by both score calls.
train_inputs = train_data.drop(target, axis=1)
train_targets = train_data[target]

# Classification accuracy of each tree on the data it was trained on.
small_model_score = small_model.score(train_inputs, train_targets)
decision_tree_score = decision_tree_model.score(train_inputs, train_targets)

print('training accuracy of small model = ' + str(small_model_score))
print('training accuracy of decision tree model = ' + str(decision_tree_score))

training accuracy of small model = 0.613502041694
training accuracy of decision tree model = 0.640527616591


## Q2

In [136]:
# Inputs and labels of the validation split, shared by both score calls.
validation_inputs = validation_data.drop(target, axis=1)
validation_targets = validation_data[target]

# Classification accuracy of each tree on the held-out validation rows.
small_model_validate_score = small_model.score(validation_inputs, validation_targets)
decision_tree_validate_score = decision_tree_model.score(validation_inputs, validation_targets)

print('validation accuracy of small model = ' + str(small_model_validate_score))
print('validation accuracy of decision tree model = ' + str(decision_tree_validate_score))

validation accuracy of small model = 0.619345109866
validation accuracy of decision tree model = 0.636148211978


# Evaluating accuracy of a complex decision tree model

In [138]:
# A deeper tree (max_depth=10) trained on the same training split.
big_model_inputs = train_data.drop(target, axis=1)
big_model_targets = train_data[target]

big_model = DecisionTreeClassifier(max_depth=10)
# Keep the fit call as the final expression so the cell displays the estimator.
big_model.fit(big_model_inputs, big_model_targets)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

## Q3

In [139]:
# Accuracy of the depth-10 tree on the training split and on the validation split.
big_model_train_score = big_model.score(train_data.drop(target, axis=1), train_data[target])
big_model_validate_score = big_model.score(validation_data.drop(target, axis=1), validation_data[target])

print('training accuracy of big model = ' + str(big_model_train_score))
print('validation accuracy of big model = ' + str(big_model_validate_score))

training accuracy of big model = 0.663845905867
validation accuracy of big model = 0.626454114606


# Quantifying the cost of mistakes

In [143]:
# Predictions of the depth-6 tree for every validation row.
predicted = decision_tree_model.predict(validation_data.drop(target, axis=1))

# False positive: true label -1 (risky) predicted as +1 (safe).
false_positive = sum(
    1 for actual, guess in zip(validation_data[target], predicted)
    if actual == -1 and guess == +1
)
# False negative: true label +1 (safe) predicted as -1 (risky).
false_negative = sum(
    1 for actual, guess in zip(validation_data[target], predicted)
    if actual == +1 and guess == -1
)

## Q4

In [144]:
# Cost charged per mistake: 20000 for each false positive, 10000 for each false negative.
FALSE_POSITIVE_COST = 20000
FALSE_NEGATIVE_COST = 10000

total_cost = false_positive * FALSE_POSITIVE_COST + false_negative * FALSE_NEGATIVE_COST
print('cost of mistakes = ' + str(total_cost))

cost of mistakes = 50390000
