# Week 8

Patrick Weatherford

***


<br>

## Import Modules

Mount Google Drive to Colab session

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


<br>Copy custom packages/modules over to default path where python packages are installed for this Colab session.

[Custom Modules Repo](https://github.com/Hakuna-Patata/BU_MSDS_PTW/tree/DSC-550/Python/Custom_Modules)

In [2]:
import os
import site
import shutil

g_drive_mod_path = r"/content/drive/MyDrive/Bellevue_University/Python/Custom_Modules"

# create custom modules folder in default pkg location
colab_mod_path = site.getsitepackages()[0]  # get path where packages are installed
colab_cust_mod_path = f"{colab_mod_path}/hakuna_patata_modules"

if os.path.exists(colab_cust_mod_path):
    print('Directory not copied! Directory already exists!')
else:
    try:
        shutil.copytree(g_drive_mod_path, colab_cust_mod_path)
        print(f"{os.path.basename(g_drive_mod_path)} successfully copied from {g_drive_mod_path} to {colab_cust_mod_path}")
    except:
        print(f"Copy failed!")


Custom_Modules successfully copied from /content/drive/MyDrive/Bellevue_University/Python/Custom_Modules to /usr/local/lib/python3.7/dist-packages/hakuna_patata_modules


In [30]:
from hakuna_patata_modules import (
    api_keys,
    hp_kaggle,
    sklearn_df
)
from zipfile import ZipFile
import pandas as pd 
from pandas import DataFrame as DF
import numpy as np

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingGridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer
from sklearn.metrics import classification_report

from google.colab.data_table import DataTable
def DT(df, num_rows_per_page=10):
    return DataTable(df, num_rows_per_page=num_rows_per_page)


***

<br>

## Import & Clean Data

In [4]:
kaggle_api = hp_kaggle.kaggle_api(api_keys.Kaggle_API['username'], api_keys.Kaggle_API['key'])


In [5]:
hp_kaggle.kaggle_dataset_download(
    kaggle_api,
    r"granjithkumar/loan-approval-data-set",
    ['Loan_Train.csv'],
    r"/content/kaggle_datasets"
)


Loan_Train.csv successfully downloaded to /content/kaggle_datasets


In [6]:
origin_df = pd.read_csv('kaggle_datasets/Loan_Train.csv')

clean_df = ((origin_df)
    .drop('Loan_ID', axis=1)
    .dropna()
    .replace({'Y':1, 'N':0})
    .reset_index(drop=True)
)

DT(clean_df)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
1,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
2,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
3,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1
4,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,1
...,...,...,...,...,...,...,...,...,...,...,...,...
475,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,1
476,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,1
477,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,1
478,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,1


***

<br>

## Split Data

In [7]:
X = clean_df.drop('Loan_Status', axis=1)
y = clean_df['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25
)


(360, 360, 120, 120)

***

<br>

## Create Transformer Pipeline

In [8]:
num_cols = X_train.select_dtypes(include='number').columns.to_list()
cat_cols = X_train.select_dtypes(include=['object','category']).columns.to_list()

cat_pipe = Pipeline([
    ('get_cat_cols', sklearn_df.DFColumnExtractor(cat_cols)),
    ('to_string', sklearn_df.DFStringTransformer()),
    ('encode', sklearn_df.DFDummyTransformer())
])


num_pipe = Pipeline([
    ('get_num_cols', sklearn_df.DFColumnExtractor(num_cols))
])


preprocess_pipe = Pipeline([
    ('feature_union', sklearn_df.DFFeatureUnion([
        ('categorical_transform', cat_pipe),
        ('numeric_transform', num_pipe)
    ]))
])


***

<br>

## Create & Fit Model Pipeline

In [9]:
model_pipe = Pipeline([
    ('preprocess', preprocess_pipe),
    ('scale', sklearn_df.DFScaler(MinMaxScaler())),
    ('clf', KNeighborsClassifier(n_jobs=-1))
])


In [23]:
model_pipe.fit(X_train, y_train)

test_predictions = model_pipe.predict(X_test)

print(classification_report(y_test, test_predictions))


              precision    recall  f1-score   support

           0       0.50      0.20      0.29        40
           1       0.69      0.90      0.78        80

    accuracy                           0.67       120
   macro avg       0.60      0.55      0.53       120
weighted avg       0.63      0.67      0.62       120



***

<br>

## Hyperparameter Tuning

Create search space and grid search object.

In [24]:
n_neighbors = [i for i in range(1, 11)]

search_space = [{"clf__n_neighbors":n_neighbors}]

model_search = HalvingGridSearchCV(model_pipe, search_space, cv=5, n_jobs=-1, verbose=0)


<br>

Fit the grid search object to the training data.

In [27]:
model_search.fit(X_train, y_train)

model_search.best_params_


{'clf__n_neighbors': 9}

<br>

Results score of best found model.

In [29]:
new_test_predictions = model_search.predict(X_test)

print(classification_report(y_test, new_test_predictions))


              precision    recall  f1-score   support

           0       0.50      0.15      0.23        40
           1       0.69      0.93      0.79        80

    accuracy                           0.67       120
   macro avg       0.59      0.54      0.51       120
weighted avg       0.62      0.67      0.60       120



***

<br>

## Grid Searching for Best Model

In [37]:
## create parameter settings for search space
n_neighbors = [i for i in range(1, 11)]
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
n_estimators = [10, 100, 1000]
max_features = [1, 2, 3]
knn = [KNeighborsClassifier(n_jobs=-1)]
rf = [RandomForestClassifier(n_jobs=-1)]
logit = [LogisticRegression(solver='liblinear')]

## create search space
search_space = [
    {
        "clf":knn,
        "clf__n_neighbors":n_neighbors
    },
    {
        "clf":rf,
        "clf__n_estimators":n_estimators,
        "clf__max_features":max_features
    },
    {
        "clf":logit,
        "clf__penalty":penalty,
        "clf__C":C
    }
]

model_search = HalvingGridSearchCV(model_pipe, search_space, cv=5, n_jobs=-1, verbose=0)


In [38]:
model_search.fit(X_train, y_train)

model_search.best_params_


{'clf': LogisticRegression(penalty='l1', solver='liblinear'),
 'clf__C': 1.0,
 'clf__penalty': 'l1'}

In [39]:
best_model_predictions = model_search.predict(X_test)

print(classification_report(y_test, best_model_predictions))


              precision    recall  f1-score   support

           0       0.89      0.42      0.58        40
           1       0.77      0.97      0.86        80

    accuracy                           0.79       120
   macro avg       0.83      0.70      0.72       120
weighted avg       0.81      0.79      0.77       120



***

<br>

## Interpretation

<br>

> The K-Nearest Neighbor (KNN) model was fit to the data first with a default `n_neighbors=5`. This model prodced an accuracy of 67% and f1-scores of 78% when predicting `Y(1)` and 23% when predicting `N(0)`. The imbalance in the f1-score leads me to believe that there is imbalance in target variable classes.

> A grid search to fine tune the KNN model was then performed to find the value of `n_neighbors` which produced the best predicting model. It was found that `n_neighbors=9` was the best performing model. The improvements in results scores where very minimal and in fact worsened the prediction of `N(0)`.

> Another grid search was then performed on 3 different models, KNN, Random Forest, and Logistic Regression. Various hyperparameters where also evaluated for each of the models as well. The model with the below parameters was found to be the best model.
- Logistic Regression
- Penalty: L1
- Solver: liblinear
- C: 1
>
>This model produced an accuracy of 79% which was 12% higher than the previous 2 models. The recall when guess `N(0)` was still quiet low at 42% but was a significant jump from the previous 15%. If the imbalance in the analysis was handled different, I think that we could further improve the scores of the model.



***

<br>

## References

G Ranjith kumar. (2020). Loan Approval Data Set. Kaggle.com. https://www.kaggle.com/datasets/granjithkumar/loan-approval-data-set

‌
