## Loading Dataset

In [129]:
import pandas as pd
import numpy as np

In [130]:
# Load the LendingClub loan records from disk into `loans`,
# then preview the first five rows as the cell output.
loans = pd.read_csv(filepath_or_buffer='Dataset/lending-club-data.csv')
loans.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1.0,1.0,1.0,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1.0,1.0,1.0,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1.0,1.0,1.0,0,8.25955,20141201T000000,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,0.2,1.0,1.0,1.0,0,8.27585,20141201T000000,0,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.9,156.46,A,A4,...,0.8,1.0,1.0,1.0,0,5.21533,20141201T000000,1,1,1


## Feature Engineering

In [131]:
# Derive the prediction target `safe_loans` from `bad_loans`:
#   bad_loans == 0   -> +1 (safe)
#   anything else    -> -1 (risky)
# The source column `bad_loans` is removed afterwards so only the new label remains.
is_bad_loan = loans['bad_loans'] != 0
loans = (
    loans
    .assign(safe_loans=is_bad_loan.map({False: +1, True: -1}))
    .drop(columns='bad_loans')
)

In [132]:
# Class balance of the target: absolute counts first, then each class's
# share of all rows in `loans`.
safe_loans_counts = loans['safe_loans'].value_counts()
print(safe_loans_counts)
print(safe_loans_counts / len(loans))

 1    99457
-1    23150
Name: safe_loans, dtype: int64
 1    0.811185
-1    0.188815
Name: safe_loans, dtype: float64


In [133]:
# Input columns used by the models.
features = [
    'grade',                     # grade of the loan
    'sub_grade',                 # sub-grade of the loan
    'short_emp',                 # one year or less of employment
    'emp_length_num',            # number of years of employment
    'home_ownership',            # home_ownership status: own, mortgage or rent
    'dti',                       # debt to income ratio
    'purpose',                   # the purpose of the loan
    'term',                      # the term of the loan
    'last_delinq_none',          # has borrower had a delinquency
    'last_major_derog_none',     # has borrower had 90 day or worse rating
    'revol_util',                # percent of available credit being used
    'total_rec_late_fee',        # total late fees received to date
]

# Prediction target (y): +1 means safe, -1 means risky.
target = 'safe_loans'

# Keep only the feature columns followed by the target column (target ends up last).
selected_columns = [*features, target]
loans = loans[selected_columns]

In [134]:
# Preview the first five rows of the reduced frame (features + target only).
loans.head(n=5)

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
0,B,B2,0,11,RENT,27.65,credit_card,36 months,1,1,83.7,0.0,1
1,C,C4,1,1,RENT,1.0,car,60 months,1,1,9.4,0.0,-1
2,C,C5,0,11,RENT,8.72,small_business,36 months,1,1,98.5,0.0,1
3,C,C1,0,11,RENT,20.0,other,36 months,0,1,21.0,16.97,1
4,A,A4,0,4,RENT,11.2,wedding,36 months,1,1,28.3,0.0,1


In [135]:
# Partition `loans` by label: +1 rows are safe loans, -1 rows are risky loans.
is_safe = loans[target] == +1
is_risky = loans[target] == -1

safe_loans_raw = loans[is_safe]
risky_loans_raw = loans[is_risky]

# Report the size of each partition.
print("Number of safe loans  : %s" % len(safe_loans_raw))
print("Number of risky loans : %s" % len(risky_loans_raw))

Number of safe loans  : 99457
Number of risky loans : 23150


In [126]:
# NOTE(review): this entire cell is commented out, so no undersampling is performed;
# the one-hot encoding cell below starts from the full, class-imbalanced `loans` frame.
# If this cell is ever re-enabled on pandas >= 2.0, `risky_loans.append(safe_loans)`
# must become `pd.concat([risky_loans, safe_loans])` because `DataFrame.append` was removed.
#
# # Since there are fewer risky loans than safe loans, find the ratio of the sizes and use that percentage to undersample the safe loans.
# percentage = len(risky_loans_raw)/float(len(safe_loans_raw))

# risky_loans = risky_loans_raw

# np.random.seed(1)
# safe_loans = safe_loans_raw.sample(int(percentage*len(safe_loans_raw)))

# # Append the risky_loans with the downsampled version of safe_loans
# loans_data = risky_loans.append(safe_loans)

# # Getting the distribution in numbers of safe loans in the dataset.
# print(loans_data['safe_loans'].value_counts())
# # Calculating the proportion of safe loans in the dataset.
# print(loans_data['safe_loans'].value_counts()/len(loans_data))

In [136]:
# One-hot encode the categorical (object-dtype) columns of `loans` into `loans_data`.
#
# `loans_data` is built as a brand-new frame. The previous version aliased `loans`
# (`loans_data = loans`) and then dropped columns with `inplace=True`, which mutated
# `loans` itself: re-running the cell found no categorical columns left and silently
# produced a frame without any one-hot features. This version leaves `loans` untouched,
# so the cell can be re-run safely.

# Names of all object-dtype (categorical) columns, in their original order.
categorical_variables = [
    feat_name
    for feat_name, feat_type in loans.dtypes.items()
    if feat_type == object
]
print(categorical_variables)

# Dummy (indicator) columns for every categorical variable.
onehot_frame = pd.get_dummies(loans[categorical_variables])
# Remaining non-categorical columns, returned as a new frame (no in-place mutation).
numeric_frame = loans.drop(columns=categorical_variables)
# One-hot columns first, then the untouched columns; the target stays last.
loans_data = pd.concat([onehot_frame, numeric_frame], axis=1)

# Later cells take `columns[:-1]` as the feature set, so the target must be the last column.
assert loans_data.columns[-1] == target, "target column must be the last column of loans_data"

loans_data.head()

['grade', 'sub_grade', 'home_ownership', 'purpose', 'term']


Unnamed: 0,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,sub_grade_A1,sub_grade_A2,sub_grade_A3,...,term_ 36 months,term_ 60 months,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,11,27.65,1,1,83.7,0.0,1
1,0,0,1,0,0,0,0,0,0,0,...,0,1,1,1,1.0,1,1,9.4,0.0,-1
2,0,0,1,0,0,0,0,0,0,0,...,1,0,0,11,8.72,1,1,98.5,0.0,1
3,0,0,1,0,0,0,0,0,0,0,...,1,0,0,11,20.0,0,1,21.0,16.97,1
4,1,0,0,0,0,0,0,0,0,0,...,1,0,0,4,11.2,1,1,28.3,0.0,1


## Model Building

In [137]:
# Load the predefined train / validation row positions from JSON and use them to
# split `loans_data`. Fixed index files are used so the split matches the one from
# the original assignment (which was not implemented with pandas).
import json

# `with` guarantees each file handle is closed even if `json.load` raises.
with open('Dataset/module-5-assignment-1-train-idx.json') as train_idx_file:
    train_index = json.load(train_idx_file)

# NOTE: `test_index` holds the *validation* indices; the name is kept as-is.
with open('Dataset/module-5-assignment-1-validation-idx.json') as validation_idx_file:
    test_index = json.load(validation_idx_file)

# The indices are positional, hence `.iloc`.
train_data = loans_data.iloc[train_index]
validation_data = loans_data.iloc[test_index]

In [139]:
# Separate inputs from labels. The target is the last column of each frame,
# so every column except the last one is treated as a feature.
train_feature_columns = train_data.columns[:-1]
validation_feature_columns = validation_data.columns[:-1]

X_train, y_train = train_data[train_feature_columns], train_data[target]
X_test, y_test = validation_data[validation_feature_columns], validation_data[target]

In [168]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed so that re-fitting yields the same trees. With the default
# `random_state=None`, scikit-learn permutes the candidate features randomly at every
# split, so equally good splits can be chosen differently from one fit to the next and
# the reported accuracies drift between runs.
RANDOM_STATE = 1

# Three trees that differ only in their maximum depth.
decision_tree_model = DecisionTreeClassifier(max_depth=6, random_state=RANDOM_STATE)
small_model = DecisionTreeClassifier(max_depth=2, random_state=RANDOM_STATE)
big_model = DecisionTreeClassifier(max_depth=10, random_state=RANDOM_STATE)

# Fit every model on the same training data.
decision_tree_model.fit(X_train, y_train)
small_model.fit(X_train, y_train)
big_model.fit(X_train, y_train)

In [142]:
# Build a tiny hand-inspectable sample: the first two safe and the first two risky
# loans of the validation set (safe rows first).
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

sample_validation_data_risky = validation_risky_loans.iloc[:2]
sample_validation_data_safe = validation_safe_loans.iloc[:2]

# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` produces the same row-wise stacking and works on all versions.
sample_validation_data = pd.concat([sample_validation_data_safe, sample_validation_data_risky])
sample_validation_data

Unnamed: 0,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,sub_grade_A1,sub_grade_A2,sub_grade_A3,...,term_ 36 months,term_ 60 months,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
19,0,1,0,0,0,0,0,0,0,0,...,1,0,0,11,11.18,1,1,82.4,0.0,1
79,0,0,0,1,0,0,0,0,0,0,...,1,0,0,10,16.85,1,1,96.4,0.0,1
24,0,0,0,1,0,0,0,0,0,0,...,0,1,0,3,13.97,0,1,59.5,0.0,-1
41,1,0,0,0,0,0,0,0,0,0,...,1,0,0,11,16.33,1,1,62.1,0.0,-1


## Questions

> Question 1
> 
> What percentage of the predictions on sample_validation_data did decision_tree_model get correct?

In [147]:
# Accuracy of decision_tree_model on the four sampled validation rows.
# NOTE: `features` is rebound here from the raw column-name list to the one-hot
# encoded column index (every column except the trailing target); the cells
# below rely on this new value.
features = sample_validation_data.columns[:-1]

sample_predictions = decision_tree_model.predict(sample_validation_data[features])
sample_is_correct = sample_predictions == sample_validation_data[target]
sample_is_correct.sum() / len(sample_validation_data)

0.5

> Question 2
> 
> Which loan has the highest probability of being classified as a safe loan?

In [152]:
# Class probabilities for the sampled rows; column index 1 is the second entry of
# `decision_tree_model.classes_`, which should be +1 (safe) for -1/+1 labels.
sample_probabilities = decision_tree_model.predict_proba(sample_validation_data[features])
sample_probabilities[:, 1]

array([0.65843457, 0.46369354, 0.35249042, 0.79210526])

> Question 4
> 
> What is the accuracy of decision_tree_model on the validation set, rounded to the nearest .01 (e.g. 0.76)?

In [154]:
# Fraction of validation rows that decision_tree_model labels correctly.
validation_predictions = decision_tree_model.predict(X_test[features])
(validation_predictions == y_test).sum() / len(y_test)

0.6361482119775959

> Question 5
> 
> How does the performance of big_model on the validation set compare to decision_tree_model on the validation set? Is this a sign of overfitting?

In [170]:
# Validation accuracy of the depth-6 tree versus the depth-10 tree, printed in that order.
for model_label, fitted_model in (
    ("Decision Tree Model :", decision_tree_model),
    ("Big Model :", big_model),
):
    n_correct = (fitted_model.predict(X_test[features]) == y_test).sum()
    print(model_label, n_correct / len(y_test))

Decision Tree Model : 0.6363636363636364
Big Model : 0.6263464024127531


> Question 6
> 
> Let us assume that each mistake costs money:
> 
> * Assume a cost of \$10,000 per false negative.
> * Assume a cost of \$20,000 per false positive.
> 
> What is the total cost of mistakes made by decision_tree_model on validation_data? Please enter your answer as a plain integer, without the dollar sign or the comma separator, e.g. 3002000.


In [166]:
# Count the two kinds of mistakes decision_tree_model makes on the validation set.
predictions = decision_tree_model.predict(X_test[features])

predicted_risky, predicted_safe = predictions == -1, predictions == 1
actually_safe, actually_risky = y_test == 1, y_test == -1

# False negative: predicted risky (-1) although the loan is safe (+1).
false_negative = (predicted_risky & actually_safe).sum()
# False positive: predicted safe (+1) although the loan is risky (-1).
false_positive = (predicted_safe & actually_risky).sum()

In [167]:
# Total cost of mistakes: 10,000 per false negative plus 20,000 per false positive.
total_mistake_cost = 10000 * false_negative + 20000 * false_positive
print(total_mistake_cost)

50390000
