In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, mean_absolute_error
from sklearn.tree import plot_tree, DecisionTreeClassifier 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

In [None]:
def log_regression_model(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and report its performance on the test split.

    A ``LogisticRegression`` with scikit-learn's default settings is fitted on
    ``(X_train, y_train)``. The function then prints the confusion matrix,
    accuracy, precision, recall and mean absolute error of the test-set
    predictions (each score rounded to two decimals) and draws the confusion
    matrix as a seaborn heatmap on the current matplotlib axes.

    Parameters
    ----------
    X_train, X_test
        Feature matrices of the training and test splits.
    y_train, y_test
        Target labels of the training and test splits. Precision and recall
        are computed with scikit-learn's default arguments.

    Returns
    -------
    tuple
        ``(y_pred, y_train_proba, y_test_proba)``: the predicted labels for
        ``X_test``, and the ``predict_proba`` output for ``X_train`` and
        ``X_test`` respectively.
    """
    # Modelling
    logistic_regression = LogisticRegression()
    logistic_regression.fit(X_train, y_train)

    y_pred = logistic_regression.predict(X_test)  # prediction of the test data
    y_train_proba = logistic_regression.predict_proba(X_train)
    # Test-set probabilities must come from X_test; scoring X_train here
    # would just return a second copy of the training probabilities.
    y_test_proba = logistic_regression.predict_proba(X_test)

    accuracy = round(accuracy_score(y_test, y_pred), 2)
    precision = round(precision_score(y_test, y_pred), 2)
    recall = round(recall_score(y_test, y_pred), 2)
    mae = round(mean_absolute_error(y_test, y_pred), 2)

    # Computed once and shared by the text report and the heatmap.
    cm = confusion_matrix(y_test, y_pred)

    print('Metrics for Logistic Regression Model')
    print('---'*15)
    print('Confusion Matrix: ')
    print(cm)
    sns.heatmap(cm, annot=True, cmap='YlGn', fmt=".0f")
    print('Accuracy: ', accuracy)
    print('Precision (TP/(TP+FP)): ', precision)
    print('Recall (TP/(TP+FN)): ', recall)
    print('MAE: ', mae)

    return (y_pred, y_train_proba, y_test_proba)

In [None]:
def knn_model(X_train, X_test, y_train, y_test, k=15):
    """Train a k-nearest-neighbours classifier and report its test-set metrics.

    The classifier uses the Euclidean metric with ``k`` neighbours. After
    fitting on the training split it predicts the test split, prints the
    confusion matrix, accuracy, precision, recall and MAE (each score rounded
    to two decimals) and draws the confusion matrix as a seaborn heatmap.

    Returns the predicted labels for ``X_test``.
    """
    classifier = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # Score everything up front so the reporting section below is pure output.
    scorers = (
        ('Accuracy: ', accuracy_score),
        ('Precision (TP/(TP+FP)): ', precision_score),
        ('Recall (TP/(TP+FN)): ', recall_score),
        ('MAE: ', mean_absolute_error),
    )
    report = [(label, round(scorer(y_test, y_pred), 2)) for label, scorer in scorers]
    cm = confusion_matrix(y_test, y_pred)

    print('Metrics for KNN Model')
    print('---'*15)
    print('Confusion Matrix: ')
    print(cm)
    sns.heatmap(cm, annot=True, cmap='YlGn',fmt=".0f")
    for label, value in report:
        print(label, value)

    return y_pred

In [None]:
# Handle the imbalanced training data by resampling it with SMOTE.
# X_train / y_train are not defined in this cell; they are expected to come
# from an earlier cell of the notebook.

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE


# Fixed random_state so the resampling is reproducible across runs.
smote = SMOTE(random_state=42)

# Resample features and target together; only the training split is passed in.
X_smote, y_smote = smote.fit_resample(X_train,y_train) # Resample the dataset

# Class counts before and after resampling, to verify the effect.
print('original dataset shape', Counter(y_train))
print('resample dataset shape', Counter(y_smote))

In [None]:
def print_plot_metrics(y_pred, model, y_true=None):
    """Print classification metrics for ``y_pred`` and plot the confusion matrix.

    Parameters
    ----------
    y_pred
        Predicted labels to evaluate.
    model
        Currently unused by the function body; kept so existing calls that
        pass it keep working.
    y_true : optional
        Ground-truth labels to score against. When omitted, the notebook-level
        ``y_test`` variable is used, which is what the function always relied
        on before this parameter existed. Passing it explicitly avoids the
        dependency on hidden notebook state.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, mae)``, each rounded to two decimals.
    """
    if y_true is None:
        # Backward-compatible fallback to the global defined elsewhere in the notebook.
        y_true = y_test

    accuracy = round(accuracy_score(y_true, y_pred), 2)
    precision = round(precision_score(y_true, y_pred), 2)
    recall = round(recall_score(y_true, y_pred), 2)
    mae = round(mean_absolute_error(y_true, y_pred), 2)

    print('---'*15)
    print(f'Accuracy: {accuracy}')
    print(f'Precision (TP/(TP+FP)):  {precision}')
    print(f'Recall (TP/(TP+FN)):  {recall}')
    print(f'MAE:  {mae}')

    sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, cmap='YlGn', fmt=".0f")

    return (accuracy, precision, recall, mae)

In [None]:
# Hyper-parameter search for an XGBoost classifier.
from xgboost import XGBClassifier

my_model = XGBClassifier()
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Optimize model parameters.
# The grid below spans 3 * 5 * 3 * 3 * 3 = 405 parameter combinations, each
# evaluated with cross-validation, so this cell is slow. It was originally run
# in Google Colab to speed up execution; the resulting best parameters are
# used in the following code.
param_grid = {'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
my_model2 = GridSearchCV(my_model, param_grid)
# Fit on X_train / y_train, the training-split names used by the other cells
# of this notebook (the previous X_Train / y_Train spelling matched no
# variable visible in the notebook).
my_model2.fit(X_train, y_train)
print(my_model2.best_params_)