# Instructor Do: Decision Trees

In [64]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Loading and Preprocessing Loans Encoded Data

In [65]:
# Loading data
file_path = Path("loans_data_encoded.csv")
df_loans = pd.read_csv(file_path)
df_loans.head()

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


In [66]:
# Define features set
X = df_loans.copy()
X = X.drop("bad", axis=1)
X.head()

Unnamed: 0,amount,term,age,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,6,0,1,0,0,0,1
1,1000,30,50,7,1,0,0,0,1,0
2,1000,30,33,8,1,0,0,0,1,0
3,1000,15,27,9,0,0,0,1,0,1
4,1000,30,28,10,0,0,0,1,1,0


In [67]:
# Define target vector
y = df_loans["bad"].values.reshape(-1, 1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [68]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [69]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(375, 10)
(125, 10)
(375, 1)
(125, 1)


In [70]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [71]:
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(400, 10)
(100, 10)
(400, 1)
(100, 1)


In [72]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [73]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [74]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


## Fitting the Decision Tree Model

In [75]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [76]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

## Making Predictions Using the Tree Model

In [77]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

## Model Evaluation

In [78]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [79]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,51,33
Actual 1,21,20


Accuracy Score : 0.568
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.61      0.65        84
           1       0.38      0.49      0.43        41

    accuracy                           0.57       125
   macro avg       0.54      0.55      0.54       125
weighted avg       0.60      0.57      0.58       125



In [92]:
# Right now the  model is just saved in memory
# in order to use it in a web application we need
# to save the model to disk (to a file)
import pickle
filename = 'finalized_model_oct_23_9_03.sav'
pickle.dump(model, open(filename, 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))

# Test that you can use the model and scaler with new data

In [93]:
# Load saved model and scaler
filename = 'finalized_model_oct_23_9_03.sav'
loaded_model = pickle.load(open(filename, 'rb'))
loaded_scaler = pickle.load(open('scaler.pkl', 'rb'))

In [94]:
# Get new user inputs from web app form
# amount, term, age, month_num, BA, HS, MA, College, M, F
raw_user_input = [[1000,30,10,6,0,0,0,1,1,1]]

In [95]:
# Scale new user inputs to match pre-processing done on
# train/test data using loaded scaler
scaled_user_input = loaded_scaler.transform(raw_user_input)
scaled_user_input

array([[ 0.49582979,  0.92029442, -3.49798104, -0.23071295, -0.3785402 ,
        -0.811968  , -0.10383483,  1.06904497,  2.38671921,  0.41898519]])

In [96]:
# Get predictions using new, scaled user inputs
prediction = loaded_model.predict(scaled_user_input)
if prediction == 1:
    print("Yes, approve loan")
else:
    print("No, reject loan")

Yes, approve loan


# Resources

https://machinelearningmastery.com/how-to-save-and-load-models-and-data-preparation-in-scikit-learn-for-later-use/

https://blog.cambridgespark.com/deploying-a-machine-learning-model-to-the-web-725688b851c7