In [1]:
# import dependencies
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read in CSV
loan_data = pd.read_csv('drive/MyDrive/Loan_Eligibility_ML/Resources/Loan_Default.csv')

# Create dataframe
loan_df = pd.DataFrame(loan_data)
loan_df

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.376900,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,173555,2019,cf,Sex Not Available,nopre,type1,p3,l1,nopc,nob/c,...,CIB,659,EXP,55-64,to_inst,71.792763,south,direct,0,48.0
148666,173556,2019,cf,Male,nopre,type1,p1,l1,nopc,nob/c,...,CIB,569,CIB,25-34,not_inst,74.428934,south,direct,0,15.0
148667,173557,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,CIB,702,EXP,45-54,not_inst,61.332418,North,direct,0,49.0
148668,173558,2019,cf,Female,nopre,type1,p4,l1,nopc,nob/c,...,EXP,737,EXP,55-64,to_inst,70.683453,North,direct,0,29.0


In [4]:
# check initial length before cleaning
len(loan_df.index)

148670

In [5]:
# Drop columns that dont have any differing values as well as ID columns
loan_df = loan_df.drop(columns = ['construction_type', 'Secured_by', 'ID', 'Security_Type', 'year', 'open_credit', 'submission_of_application', 'Interest_rate_spread', 'Upfront_charges', 'dtir1', 'rate_of_interest'])

In [6]:
# Drop null values
clean_loan_df = loan_df.dropna(axis=0, how='any')
# clean_loan_df = loan_df.copy()

In [7]:
# check initial length before cleaning
len(clean_loan_df.index)

120526

In [8]:
# Remove instances of Gender NA
clean_loan_df.drop(clean_loan_df[clean_loan_df['Gender'] == 'Sex Not Available'].index, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.drop(clean_loan_df[clean_loan_df['Gender'] == 'Sex Not Available'].index, inplace = True)


In [9]:
clean_loan_df['LTV'].unique()

array([ 80.01968504,  69.3768997 ,  91.88654354, ...,  66.73703257,
       151.8145161 ,  41.49550706])

In [10]:
clean_loan_df

Unnamed: 0,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,business_or_commercial,loan_amount,term,Neg_ammortization,...,occupancy_type,total_units,income,credit_type,Credit_Score,co-applicant_credit_type,age,LTV,Region,Status
2,cf,Male,pre,type1,p1,l1,nob/c,406500,360.0,neg_amm,...,pr,1U,9480.0,EXP,834,CIB,35-44,80.019685,south,0
3,cf,Male,nopre,type1,p4,l1,nob/c,456500,360.0,not_neg,...,pr,1U,11880.0,EXP,587,CIB,45-54,69.376900,North,0
4,cf,Joint,pre,type1,p1,l1,nob/c,696500,360.0,not_neg,...,pr,1U,10440.0,CRIF,602,EXP,25-34,91.886544,North,0
5,cf,Joint,pre,type1,p1,l1,nob/c,706500,360.0,not_neg,...,pr,1U,10080.0,EXP,864,EXP,35-44,70.089286,North,0
6,cf,Joint,pre,type1,p3,l1,nob/c,346500,360.0,not_neg,...,pr,1U,5040.0,EXP,860,EXP,55-64,79.109589,North,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148664,cf,Joint,nopre,type2,p1,l1,b/c,156500,360.0,not_neg,...,pr,1U,4020.0,EXP,859,EXP,65-74,99.050633,central,0
148666,cf,Male,nopre,type1,p1,l1,nob/c,586500,360.0,not_neg,...,ir,4U,7140.0,CIB,569,CIB,25-34,74.428934,south,0
148667,cf,Male,nopre,type1,p4,l1,nob/c,446500,180.0,not_neg,...,pr,1U,6900.0,CIB,702,EXP,45-54,61.332418,North,0
148668,cf,Female,nopre,type1,p4,l1,nob/c,196500,180.0,not_neg,...,pr,1U,7140.0,EXP,737,EXP,55-64,70.683453,North,0


In [11]:
# Convert Approv_in_adv to 0 = pre and 1 = nopre
clean_loan_df.approv_in_adv[clean_loan_df.approv_in_adv == 'pre'] = 0
clean_loan_df.approv_in_adv[clean_loan_df.approv_in_adv == 'nopre'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.approv_in_adv[clean_loan_df.approv_in_adv == 'pre'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.approv_in_adv[clean_loan_df.approv_in_adv == 'pre'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.approv_in_adv[clean_loan_df.approv_in_adv == 'nopre'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/ind

In [12]:
# Set the loan limit to be conforming loans cf = 0 and nonconforming loans ncf = 1
clean_loan_df.loan_limit[clean_loan_df.loan_limit == 'cf'] = 0
clean_loan_df.loan_limit[clean_loan_df.loan_limit == 'ncf'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.loan_limit[clean_loan_df.loan_limit == 'cf'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.loan_limit[clean_loan_df.loan_limit == 'cf'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.loan_limit[clean_loan_df.loan_limit == 'ncf'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a

In [13]:
# set the credit worthiness to 0 and 1 so l1 = 0 and l2 = 1
clean_loan_df.Credit_Worthiness[clean_loan_df.Credit_Worthiness == 'l1'] = 0
clean_loan_df.Credit_Worthiness[clean_loan_df.Credit_Worthiness == 'l2'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.Credit_Worthiness[clean_loan_df.Credit_Worthiness == 'l1'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.Credit_Worthiness[clean_loan_df.Credit_Worthiness == 'l1'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.Credit_Worthiness[clean_loan_df.Credit_Worthiness == 'l2'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

In [14]:
# set the business or commercial to nob/c = 0 and b/c = 1
clean_loan_df.business_or_commercial[clean_loan_df.business_or_commercial == 'nob/c'] = 0
clean_loan_df.business_or_commercial[clean_loan_df.business_or_commercial == 'b/c'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.business_or_commercial[clean_loan_df.business_or_commercial == 'nob/c'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.business_or_commercial[clean_loan_df.business_or_commercial == 'nob/c'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.business_or_commercial[clean_loan_df.business_or_commercial == 'b/c'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: htt

In [15]:
clean_loan_df['age'].unique()

array(['35-44', '45-54', '25-34', '55-64', '65-74', '>74', '<25'],
      dtype=object)

In [16]:
# set Neg_ammortization to notneg = 0 and neg = 1
clean_loan_df.Neg_ammortization[clean_loan_df.Neg_ammortization == 'not_neg'] = 0
clean_loan_df.Neg_ammortization[clean_loan_df.Neg_ammortization == 'neg_amm'] = 1

# set interest_only to not_int = 0 and int_only = 1
clean_loan_df.interest_only[clean_loan_df.interest_only == 'not_int'] = 0
clean_loan_df.interest_only[clean_loan_df.interest_only == 'int_only'] = 1

# set lump_sum_payment to not_lpsm = 0 and lpsm = 1
clean_loan_df.lump_sum_payment[clean_loan_df.lump_sum_payment == 'not_lpsm'] = 0
clean_loan_df.lump_sum_payment[clean_loan_df.lump_sum_payment == 'lpsm'] = 1

# set age to values 0 - 6
clean_loan_df.age[clean_loan_df.age == '<25'] = 0
clean_loan_df.age[clean_loan_df.age == '25-34'] = 1
clean_loan_df.age[clean_loan_df.age == '35-44'] = 2
clean_loan_df.age[clean_loan_df.age == '45-54'] = 3
clean_loan_df.age[clean_loan_df.age == '55-64'] = 4
clean_loan_df.age[clean_loan_df.age == '65-74'] = 5
clean_loan_df.age[clean_loan_df.age == '>74'] = 6


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.Neg_ammortization[clean_loan_df.Neg_ammortization == 'not_neg'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.Neg_ammortization[clean_loan_df.Neg_ammortization == 'not_neg'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df.Neg_ammortization[clean_loan_df.Neg_ammortization == 'neg_amm'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org

In [17]:
# Use get_dummies to convert columns to int
clean_loan_df = pd.get_dummies(clean_loan_df,
                            columns=['Gender', 'loan_type', 'loan_purpose', 'total_units', 'credit_type', 'co-applicant_credit_type', 'Region', 'occupancy_type'], dtype=int)

In [18]:
# Remove the extra column created by get_dummies (for example in gender you had values of male, female, and joint. 
# Joint can be removed because if it is not male/female that is the only other option)
clean_loan_df = clean_loan_df.drop(columns = ['Gender_Joint', 'loan_type_type3', 'loan_purpose_p4', 'total_units_4U', 'credit_type_EXP', 'co-applicant_credit_type_EXP', 'Region_south', 'occupancy_type_sr'])



In [19]:
clean_loan_df['Status'].unique()

array([0, 1])

In [20]:
# Define features set
X = clean_loan_df.copy()
X.drop("Status", axis=1, inplace=True)
X.head()

Unnamed: 0,loan_limit,approv_in_adv,Credit_Worthiness,business_or_commercial,loan_amount,term,Neg_ammortization,interest_only,lump_sum_payment,property_value,...,total_units_3U,credit_type_CIB,credit_type_CRIF,credit_type_EQUI,co-applicant_credit_type_CIB,Region_North,Region_North-East,Region_central,occupancy_type_ir,occupancy_type_pr
2,0,0,0,0,406500,360.0,1,0,0,508000.0,...,0,0,0,0,1,0,0,0,0,1
3,0,1,0,0,456500,360.0,0,0,0,658000.0,...,0,0,0,0,1,1,0,0,0,1
4,0,0,0,0,696500,360.0,0,0,0,758000.0,...,0,0,1,0,0,1,0,0,0,1
5,0,0,0,0,706500,360.0,0,0,0,1008000.0,...,0,0,0,0,0,1,0,0,0,1
6,0,0,0,0,346500,360.0,0,0,0,438000.0,...,0,0,0,0,0,1,0,0,0,1


In [21]:
# Define target vector
y = clean_loan_df["Status"].values.reshape(-1, 1)
y = y.astype('int')
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [22]:
# split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [23]:
# Scale data
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [24]:
# Create the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [25]:
# Fit the model from decision tree classifier
model = model.fit(X_train_scaled, y_train)

In [26]:
# Making predictions using the testing data from decision tree classifier
predictions = model.predict(X_test_scaled)


In [27]:
# Calculating the confusion matrix for decision tree classifier
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [28]:
# Displaying results of decision tree classifier
print("Confusion Matrix")
# display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix
Accuracy Score : 0.797663139329806
Classification Report
              precision    recall  f1-score   support

           0       0.89      0.87      0.88     19239
           1       0.35      0.38      0.36      3441

    accuracy                           0.80     22680
   macro avg       0.62      0.63      0.62     22680
weighted avg       0.81      0.80      0.80     22680

