In [1]:
# import modules
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from scratchai.cart import DecisionTreeClassifier
from scratchai.linear_models import LogisticRegression
from scratchai.metrics import accuracy, precision, recall
from scratchai.plotting import plot_generalization_curve
from scratchai.preproccessing import split_data, StandardScaler

In [None]:
# Load the dataset into a dataframe
# A forward slash is used as the separator: "\B" is not a valid string escape and a
# backslash-separated relative path only resolves on Windows.
loan_data_raw = pd.read_csv("data/Bank_Personal_Loan_Modelling.csv")
loan_data_raw.head()

In [None]:
# Restrict the raw frame to the candidate predictors plus the target column
columns = [
    'Age', 'Experience', 'Income', 'Family', 'CCAvg',
    'Education', 'Mortgage', 'Securities Account', 'CD Account',
    'Personal Loan',
]
loan_data = loan_data_raw[columns]
loan_data.head()

In [None]:
# Data-quality check: missing values per column and number of fully duplicated rows
missing_vals = loan_data.isnull().sum()
dupl_rows = loan_data.duplicated().sum()

missing_report = f"Count of Missing values:\n {missing_vals} \n"
duplicate_report = f"Number of duplicate rows: {dupl_rows}"
print(missing_report, duplicate_report)

In [4]:
# Remove fully duplicated rows, keeping the first occurrence of each
loan_data = loan_data.drop_duplicates(keep = 'first')

In [None]:
# Pairwise Pearson correlations between the numeric columns
loan_data.corr(method = 'pearson', numeric_only = True)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
loan_data.describe(percentiles = [0.25, 0.5, 0.75])

In [None]:
# Class balance of the target: how many rows have Personal Loan == 1 vs == 0
loan_flag = loan_data['Personal Loan']
accepted_count = int((loan_flag == 1).sum())
declined_count = int((loan_flag == 0).sum())

print(
    f"Number of custumers who accepted the personal loan: {accepted_count}\n",
    f"Number of custumers who didn't: {declined_count}"
)

In [None]:
# Histogram of each listed feature, one figure per column
continuous_features = ['Age', 'Experience', 'Income', 'CCAvg', 'Mortgage']
for feature in continuous_features:
    fig, ax = plt.subplots()
    ax.hist(loan_data[feature])
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()
    

In [None]:
# Same histogram view for the remaining features, one figure per column
remaining_features = ('Family', 'Education', 'Securities Account', 'CD Account')
for feature in remaining_features:
    plt.figure()
    plt.hist(loan_data[feature])
    plt.gca().set(xlabel = feature, ylabel = 'Frequency')
    plt.show()

In [None]:
# Convert some variables into bins to explore whether there are any patterns.
# `assign` returns a new frame rather than writing new columns through `.loc` into a
# frame that was itself produced by column selection / drop_duplicates; this avoids
# pandas' chained-assignment (SettingWithCopyWarning) path and keeps the cell
# re-runnable with the same result.
loan_data = loan_data.assign(
    Age_group = pd.cut(
        loan_data['Age'],
        bins = [20, 40, 50, 60, 80],
        labels = ['20-40', '40-50','50-60','60-80'],
    ).astype('O'),
    Income_group = pd.cut(
        loan_data['Income'],
        bins = [5, 50, 100, 300],
        labels = ['low', 'medium', 'high'],
    ).astype('O'),
)

# Frequency of each bin label
for column in ['Age_group', 'Income_group']:
    plt.hist(loan_data[column])
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Pairwise scatter/density grid of the listed features, coloured by the target
columns = [
    'Age', 'Experience', 'Income', 'CCAvg',
    'Education', 'Mortgage', 'Personal Loan',
]
pairplot_frame = loan_data[columns]
sns.pairplot(pairplot_frame, hue = 'Personal Loan', corner = True)
plt.show()

In [10]:
# Clip outliers: cap each feature at its upper bound (lower bound is 0 for all three)
clip_upper_bounds = {'Income': 150, 'CCAvg': 5, 'Mortgage': 300}
for feature, upper_bound in clip_upper_bounds.items():
    loan_data[feature] = loan_data[feature].clip(lower = 0, upper = upper_bound)

In [None]:
# Model inputs and target; Age_group is replaced below by one indicator column per bin
input_features = ['Age_group','Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage']
label = 'Personal Loan'

age_group_dummies = pd.get_dummies(loan_data['Age_group']).astype('int')
processed_data = (
    loan_data[[*input_features, label]]
    .join(age_group_dummies)
    .drop('Age_group', axis = 1)
)
processed_data.head()

In [None]:
# Scale every model input except Age_group (already expanded into indicator columns).
# NOTE(review): `transform` is called on a freshly constructed scaler, over the whole
# dataset and before the train/validation/test split -- presumably it fits and scales
# in one step; confirm against scratchai and consider fitting on training rows only.
scaled_columns = [feature for feature in input_features if feature != 'Age_group']

scaler = StandardScaler()
processed_data = scaler.transform(processed_data, columns = scaled_columns)
processed_data.head()

In [74]:
# split the data
# Shuffle with a fixed seed so that Restart & Run All reproduces the same
# train/validation/test partition, and therefore the same metrics.
processed_data = processed_data.sample(frac = 1, random_state = 42).reset_index(drop = True)

# Two-stage split: first partition vs. the rest, then the rest is halved.
# NOTE(review): presumably split_size is the fraction given to the first returned
# frame (70% train, 15% test, 15% validation) -- confirm against scratchai.
train_data, testing_data = split_data(processed_data, split_size = 0.7)
logistic_regressor_test_data, valid_data = split_data(testing_data, split_size = 0.5)

# Feature matrices / label vectors as plain arrays
X_train, y_train = train_data.drop(label, axis = 1).values, train_data[label].values
X_valid, y_valid = valid_data.drop(label, axis = 1).values, valid_data[label].values

In [75]:
# Build and fit a logistic regression model; the validation arrays are handed to
# `fit` alongside the training arrays
fit_settings = dict(learning_rate = 0.01, batch_size = 2048, epochs = 750, reg_rate = 0.5)

logistic_regressor = LogisticRegression()
logistic_regressor.fit(X_train, y_train, X_valid = X_valid, y_valid = y_valid, **fit_settings)

In [None]:
# Plot training vs. validation loss to check that the model did not overfit.
# NOTE(review): `traning_epochs` is spelled as the attribute is accessed here;
# presumably that matches the scratchai attribute name -- verify before renaming.
plot_generalization_curve(
    logistic_regressor.training_losses,
    logistic_regressor.validation_losses,
    logistic_regressor.traning_epochs,
)

In [None]:
def evaluate_model(y_true, y_pred,*, model_name = '', metrics = None):
    """Print classification metrics for a set of predictions, one per line.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, passed to each metric together with ``y_true``.
    model_name : str, keyword-only
        Optional label printed in front of every metric name. When empty, no
        prefix (and no leading space) is printed.
    metrics : iterable of (name, callable) pairs, keyword-only
        Metrics to report. Each callable is invoked as ``func(y_true, y_pred)``
        and its result is formatted with two decimals. Defaults to accuracy,
        precision and recall from ``scratchai.metrics`` (imported in the
        notebook's import cell).

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if metrics is None:
        # Resolved at call time so the defaults are only needed when actually used.
        metrics = [('Accuracy', accuracy), ('Precision', precision), ('Recall', recall)]

    prefix = f"{model_name} " if model_name else ""
    for metric, func in metrics:
        print(f"{prefix}{metric}: {func(y_true, y_pred):.2f}")

In [None]:
# Score the logistic regression on the validation split, with `threshold` set to 0.5
logistic_regressor.threshold = 0.5
y_pred = logistic_regressor.classifie(X_valid)

evaluate_model(y_true = y_valid, y_pred = y_pred)

In [80]:
# prepare the data for the decision tree
# (unscaled features; Age_group is kept as its bin label rather than one-hot encoded)
loan_data = loan_data[input_features + [label]]

# Shuffle with a fixed seed before splitting, as the logistic-regression pipeline
# does, so the partitions do not depend on the file's row order and are reproducible.
tree_data = loan_data.sample(frac = 1, random_state = 42).reset_index(drop = True)

# split the data
train_data, testing_data = split_data(tree_data, split_size = 0.7)
tree_classifier_test_data, valid_data = split_data(testing_data, split_size = 0.5)

# Feature matrices / label vectors as plain arrays
X_train, y_train = train_data[input_features].values, train_data[label].values
X_valid, y_valid = valid_data[input_features].values, valid_data[label].values

In [81]:
# Build and fit a decision tree to the data
# (the variable name `tree_calssifier` is kept as-is because later cells refer to it)
tree_hyperparams = {'max_depth': 15, 'min_samples_split': 10}
tree_calssifier = DecisionTreeClassifier(**tree_hyperparams)
tree_calssifier.fit(X_train, y_train)

In [None]:
# Score the decision tree on its validation split
y_pred = tree_calssifier.predict(X_valid)

evaluate_model(y_true = y_valid, y_pred = y_pred)

In [None]:
# evaluate both models on testing data

# Logistic regression: held-out rows from the scaled / one-hot encoded frame.
# `.values` is applied to the labels as well, so y_test is a plain array like every
# other label vector in this notebook instead of a Series carrying a non-zero-based index.
X_test, y_test = logistic_regressor_test_data.drop(label, axis = 1).values, logistic_regressor_test_data[label].values
y_pred = logistic_regressor.classifie(X_test)

evaluate_model(y_test, y_pred, model_name = 'Logistic Regressor')

# Decision tree: held-out rows from the frame prepared for the tree.
X_test, y_test = tree_classifier_test_data[input_features].values, tree_classifier_test_data[label].values
y_pred = tree_calssifier.predict(X_test)

evaluate_model(y_test, y_pred, model_name = 'Decision tree classifier')