## Loading Dataset

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Read the Lending Club CSV and recode the label: `bad_loans == 0` becomes
# `safe_loans = +1`, anything else becomes -1. The original `bad_loans`
# column is then removed so only the recoded target remains.
loans = (
    pd.read_csv('Dataset/lending-club-data.csv')
    .assign(safe_loans=lambda frame: frame['bad_loans'].eq(0).map({True: 1, False: -1}))
    .drop(columns=['bad_loans'])
)

In [6]:
# Name of the label column (+1 = safe loan, -1 = risky loan).
target = 'safe_loans'

# Input columns handed to the models. The order is significant: it fixes the
# column order of the frame that is built from this list.
features = [
    'grade',                  # loan grade (categorical)
    'sub_grade_num',          # loan sub-grade expressed as a number from 0 to 1
    'short_emp',              # employed for one year or less
    'emp_length_num',         # years of employment
    'home_ownership',         # home ownership status: own, mortgage or rent
    'dti',                    # debt-to-income ratio
    'purpose',                # purpose of the loan
    'payment_inc_ratio',      # monthly payment divided by income
    'delinq_2yrs',            # number of delinquencies
    'delinq_2yrs_zero',       # no delinquencies in the last 2 years
    'inq_last_6mths',         # creditor inquiries in the last 6 months
    'last_delinq_none',       # whether the borrower has had a delinquency
    'last_major_derog_none',  # whether the borrower has had a 90-day-or-worse rating
    'open_acc',               # number of open credit accounts
    'pub_rec',                # number of derogatory public records
    'pub_rec_zero',           # no derogatory public records
    'revol_util',             # percent of available credit in use
    'total_rec_late_fee',     # total late fees received to date
    'int_rate',               # interest rate of the loan
    'total_rec_int',          # interest received to date
    'annual_inc',             # borrower's annual income
    'funded_amnt',            # amount committed to the loan
    'funded_amnt_inv',        # amount committed by investors
    'installment',            # monthly payment owed by the borrower
]

In [7]:
# One-hot encode the categorical features.
# Keep only the selected feature columns plus the target (target stays last).
loans = loans[features + [target]]

# Columns stored with object dtype are treated as the categorical variables.
categorical_variables = [
    feat_name
    for feat_name, feat_type in zip(loans.columns, loans.dtypes)
    if feat_type == object
]
print(categorical_variables)

# One indicator column per category value of each categorical variable.
onehot_frame = pd.get_dummies(loans[categorical_variables])

# Indicator columns first, then the remaining columns in their original order.
# A non-mutating drop is used instead of `inplace=True`: `loans` is a column
# subset of the loaded frame here, and mutating such a subset in place can
# raise pandas' SettingWithCopyWarning.
loans = pd.concat([onehot_frame, loans.drop(columns=categorical_variables)], axis=1)

loans.head()

['grade', 'home_ownership', 'purpose']


Unnamed: 0,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,...,pub_rec_zero,revol_util,total_rec_late_fee,int_rate,total_rec_int,annual_inc,funded_amnt,funded_amnt_inv,installment,safe_loans
0,0,1,0,0,0,0,0,0,0,0,...,1.0,83.7,0.0,10.65,861.07,24000.0,5000,4975,162.87,1
1,0,0,1,0,0,0,0,0,0,0,...,1.0,9.4,0.0,15.27,435.17,30000.0,2500,2500,59.83,-1
2,0,0,1,0,0,0,0,0,0,0,...,1.0,98.5,0.0,15.96,603.65,12252.0,2400,2400,84.33,1
3,0,0,1,0,0,0,0,0,0,0,...,1.0,21.0,16.97,13.49,2209.33,49200.0,10000,10000,339.31,1
4,1,0,0,0,0,0,0,0,0,0,...,1.0,28.3,0.0,7.9,631.38,36000.0,5000,5000,156.46,1


In [8]:
# Load the row positions of the train and validation splits from JSON files.
# The split is read from index files rather than generated here so that it
# follows the same split used in the assignment, which was not done in pandas.
import json

# Context managers guarantee the files are closed even if parsing fails.
with open('Dataset/module-8-assignment-1-train-idx.json') as index_file:
    train_index = json.load(index_file)
with open('Dataset/module-8-assignment-1-validation-idx.json') as index_file:
    validation_index = json.load(index_file)

# The loaded indices are used as positional row selectors.
train_data = loans.iloc[train_index]
validation_data = loans.iloc[validation_index]

In [20]:
# Discard every row that has a missing value before separating inputs and labels.
train_complete = train_data.dropna()
validation_complete = validation_data.dropna()

# Labels come from the target column; inputs are every column except the last
# one (this relies on the target being the final column of the frame).
y_train = train_complete[target]
y_test = validation_complete[target]
X_train = train_complete.iloc[:, :-1]
X_test = validation_complete.iloc[:, :-1]

In [49]:
from sklearn.ensemble import GradientBoostingClassifier

# Ensemble sizes to compare; every model uses the same tree depth.
N_ESTIMATORS_GRID = (5, 10, 50, 100, 200, 500)
MAX_DEPTH = 6
SEED = 0  # fixed seed so a fresh "Restart & Run All" rebuilds the same models


def fit_boosted_trees(n_estimators, X, y, max_depth=MAX_DEPTH, random_state=SEED):
    """Fit a gradient-boosted tree classifier.

    Parameters
    ----------
    n_estimators : int
        Number of boosting stages (trees) in the ensemble.
    X, y :
        Training inputs and labels, passed straight to ``fit``.
    max_depth : int
        Maximum depth of each individual tree.
    random_state : int
        Seed forwarded to the classifier for reproducible fits.

    Returns
    -------
    GradientBoostingClassifier
        The fitted classifier.
    """
    classifier = GradientBoostingClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    return classifier.fit(X, y)


# One fitted model per ensemble size, keyed by the number of trees.
models = {n: fit_boosted_trees(n, X_train, y_train) for n in N_ESTIMATORS_GRID}

# Individual names kept because the cells below refer to them directly.
model_5 = models[5]
model_10 = models[10]
model_50 = models[50]
model_100 = models[100]
model_200 = models[200]
model_500 = models[500]

In [22]:
# Split the validation rows by their true label.
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

# Take the first two loans of each class as a small hand-checkable sample.
sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the rows
# the same way (safe loans first, then risky loans, original index kept).
sample_validation_data = pd.concat([sample_validation_data_safe, sample_validation_data_risky])
sample_validation_data

Unnamed: 0,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,...,pub_rec_zero,revol_util,total_rec_late_fee,int_rate,total_rec_int,annual_inc,funded_amnt,funded_amnt_inv,installment,safe_loans
38153,0,0,0,0,0,1,0,0,0,0,...,1.0,66.8,0.0,15.01,1558.74,18000.0,12800,1000,443.81,1
38157,0,0,0,1,0,0,0,0,0,0,...,0.0,79.7,59.9893,12.8,1261.73,57000.0,6000,1025,201.6,1
24,0,0,0,1,0,0,0,0,0,0,...,1.0,59.5,0.0,16.77,719.11,50004.0,5000,5000,123.65,-1
41,1,0,0,0,0,0,0,1,0,0,...,1.0,62.1,0.0,8.9,696.99,100000.0,5000,5000,158.77,-1


## Questions

> Question 1
> 
> What percentage of the predictions on sample_validation_data did model_5 get correct?

In [50]:
# Fraction of the sample loans whose predicted label equals the true label
# (inputs are every column except the last, which holds the target).
(model_5.predict(sample_validation_data.iloc[:, :-1]) == sample_validation_data[target]).mean()

0.5

> Question 2
> 
> According to **model_5**, which loan is the least likely to be a safe loan?

In [51]:
# Predicted probability of the +1 (safe) class for each sample loan; the
# smallest value marks the loan least likely to be safe. Uses `model_5`
# explicitly: no cell in this notebook defines a bare `model`.
model_5.predict_proba(sample_validation_data[sample_validation_data.columns[:-1]])[:,1]

array([0.64858216, 0.5061776 , 0.66918026, 0.79395478])

> Question 3
> 
> What is the number of false positives on the validation data?

In [39]:
# A false positive is a risky loan (true label -1) predicted as safe (+1).
# Uses `model_5` explicitly (no cell defines a bare `model`) and combines the
# boolean masks with `&` rather than arithmetic multiplication.
validation_predictions = model_5.predict(validation_data[validation_data.columns[:-1]])
((validation_data[target] == -1) & (validation_predictions == 1)).sum()

2499

> Question 4
> 
> Using the same costs for false positives and false negatives as before (20,000 per false positive and 10,000 per false negative), what is the cost of the mistakes made by the boosted tree model (model_5) as evaluated on the validation data?

In [40]:
# Cost charged per mistake of each kind.
COST_FALSE_NEGATIVE = 10000
COST_FALSE_POSITIVE = 20000

# Predict once with model_5 and reuse the labels for both error counts
# (no cell defines a bare `model`, and predicting twice is wasted work).
validation_predictions = model_5.predict(validation_data[validation_data.columns[:-1]])

# False positive: risky loan (-1) predicted safe (+1).
false_positives = ((validation_data[target] == -1) & (validation_predictions == 1)).sum()
# False negative: safe loan (+1) predicted risky (-1).
false_negatives = ((validation_data[target] == 1) & (validation_predictions == -1)).sum()
print("Cost :", (COST_FALSE_NEGATIVE * false_negatives) + (COST_FALSE_POSITIVE * false_positives))

Cost : 50120000


> Question 5
> 
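> What are the grades of the top 5 loans, i.e. the 5 loans in the validation data with the highest predicted probability of being safe?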

In [44]:
# Attach model_5's predicted probability of the +1 (safe) class to a copy of
# the validation data, then show the 5 loans with the highest probability.
# Uses `model_5` explicitly: no cell in this notebook defines a bare `model`.
new_df = validation_data.copy()
new_df['probability'] = model_5.predict_proba(validation_data[validation_data.columns[:-1]])[:,1]
new_df.sort_values('probability', ascending=False).head(5)

Unnamed: 0,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,...,revol_util,total_rec_late_fee,int_rate,total_rec_int,annual_inc,funded_amnt,funded_amnt_inv,installment,safe_loans,probability
110479,1,0,0,0,0,0,0,0,0,0,...,48.9,0.0,6.49,181.56,70000.0,17000,17000,520.96,1,0.809223
41265,1,0,0,0,0,0,0,0,0,0,...,97.2,0.0,7.9,1200.22,97000.0,20000,20000,625.81,1,0.809223
40545,1,0,0,0,0,0,0,0,0,0,...,54.1,0.0,7.62,1655.81,75000.0,23000,23000,716.72,1,0.809223
40553,1,0,0,0,0,0,0,1,0,0,...,8.2,0.0,7.9,1153.24,83840.0,24000,24000,750.97,1,0.809223
2902,1,0,0,0,0,0,0,1,0,0,...,11.5,0.0,6.03,1195.53,180000.0,12500,12500,380.45,1,0.809223


> Question 6
> 
> Which model has the best accuracy on the validation_data?

In [47]:
from sklearn import metrics

# Print the accuracy on the NaN-free validation split for each ensemble,
# in order of increasing number of trees (10, 50, 100, 200, 500).
for fitted_model in (model_10, model_50, model_100, model_200, model_500):
    print(metrics.accuracy_score(y_test, fitted_model.predict(X_test)))

0.744398965962947
0.7604480827229643
0.7661568289530375
0.7698190435157259
0.767664799655321
