In [None]:
# load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.datasets import fetch_openml

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix

# my model performance visualization file
import performance_eval 

In [None]:
# Read the MNIST table from the local CSV export.
# Later cells select a 'target' column from it and split it at row 60000.
mnist = pd.read_csv(filepath_or_buffer='./data/mnist.csv')

In [None]:
# Separate the label column from the feature columns.
y = mnist['target']
X = mnist.drop('target', axis=1)

# Positional split: rows before 60000 are used for training, the rest for testing.
train_rows, test_rows = slice(None, 60000), slice(60000, None)
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

# Binary classification target: True where the label equals 8.
# NOTE(review): assumes 'target' was parsed as a number, not a string - confirm.
y_train_8 = y_train.eq(8)

In [None]:
# Logistic regression: set up a randomized hyper-parameter search.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

lr_clf = LogisticRegression()

cv = 5  # number of cross-validation folds used by the search

# Candidate values the search samples from.
# NOTE(review): LogisticRegression() keeps its default solver here. Per the
# scikit-learn docs the default 'lbfgs' solver only supports 'l2' / no penalty,
# and 'elasticnet' additionally needs 'l1_ratio', so candidates drawing 'l1' or
# 'elasticnet' are expected to fail to fit. The string 'none' is also
# version-dependent. Confirm against the installed scikit-learn version.
hyper_params = {'penalty':['none', 'l1', 'l2', 'elasticnet'],
                # 'regularization':[],
                'fit_intercept':[True, False],
                'class_weight':['balanced', {0:1, 1:3}, {0:3, 1:1}],
                # 'l1_ratio':[]
                }

# Fixed seed: without it the sampled candidates (and therefore which model is
# "best" or "second best" in the cells below) change on every run.
rdm_grid_search_lr_clf = RandomizedSearchCV(lr_clf, hyper_params, cv=cv,
                                            return_train_score=True, random_state=42)




In [None]:
# Fit the randomized search for logistic regression on the binary "is it an 8" target.
rdm_grid_search_lr_clf.fit(X=X_train, y=y_train_8)

In [None]:
# Observed when running the search above: models failed to converge.
# List the mean cross-validated score of every sampled candidate.

lr_clf_results = rdm_grid_search_lr_clf.cv_results_

# Walk the per-candidate result arrays in lockstep.
performances_lr_clf = zip(
    *(lr_clf_results[key] for key in
      ('params', 'mean_test_score', 'std_test_score', 'rank_test_score')))

# One line per candidate: the score first, then its parameter setting.
for params, mean_test_score, _std_score, _rank in performances_lr_clf:
    print(mean_test_score, '   ', params)
    


In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.base import clone

# Copy the hyper-parameters of the best estimator found by the search.
# (This line was previously bare prose without a leading '#', which made the
# whole cell a SyntaxError.)
lr_clf_best = clone(rdm_grid_search_lr_clf.best_estimator_)

# Get the z_hat scores: out-of-fold decision_function values for every training row.
y_score_8_lr_clf = cross_val_predict(lr_clf_best, X_train, y_train_8, method="decision_function")
# now you can visualize performance using the other file (performance_eval)

In [None]:
# Out-of-fold decision scores for the second best model from the search,
# the one with 'class_weight': {0: 1, 1: 3}. Its settings are spelled out by hand here.
lr_clf_whts = LogisticRegression(penalty='l2', fit_intercept=True, class_weight={0: 1, 1: 3})
y_score_8_lr_clf_whts = cross_val_predict(lr_clf_whts, X_train, y_train_8,
                                          method="decision_function")

In [None]:
# Compare two models: the best estimator vs. the class-weighted variant.
metrics_scores = performance_eval.performance_vs_thresholds(y_true=y_train_8, y_score=y_score_8_lr_clf)
# The weighted model must be evaluated on its own scores; passing the unweighted
# scores a second time produced two identical result sets.
metrics_scores_whts = performance_eval.performance_vs_thresholds(y_true=y_train_8, y_score=y_score_8_lr_clf_whts)


performance_eval.plot_performance_curve(('fp/n', metrics_scores['fp/n']),
                                        ('tp/p', metrics_scores['tp/p']), label='Logestic Regression')

# Second curve is drawn from the weighted model's metrics, not the first model's.
performance_eval.plot_performance_curve(('fp/n', metrics_scores_whts['fp/n']),
                                        ('tp/p', metrics_scores_whts['tp/p']), label='Logestic Regression Weighted',
                                        line_only=True)

# NOTE: an earlier observation that the two ROC curves were exactly the same
# came from plotting the same data twice; re-check the plot after this fix.

In [None]:
# Compare AUC scores of the two logistic-regression variants.
from sklearn.metrics import roc_auc_score

auc_score_lr_clf, auc_score_lr_clf_whts = (
    roc_auc_score(y_true=y_train_8, y_score=scores)
    for scores in (y_score_8_lr_clf, y_score_8_lr_clf_whts)
)

# Observation: the AUC of the second best model (according to the search) was
# higher than that of the best model.

################################
############ knn  ##############
################################

In [None]:
# KNN: set up an exhaustive search over k and a randomized search over more options.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

knn_clf = KNeighborsClassifier()

cv = 5  # number of cross-validation folds used by both searches

# Exhaustive grid: odd neighbor counts 1, 3, ..., 39.
params_knn = {
    'n_neighbors':np.arange(1, 40, 2),
}

# Distributions for the randomized search.
rdm_hyper_params_knn = {
                'n_neighbors':[1, 5, 10], # would take way too long
                'weights':['uniform', 'distance'],
                # 'algorithm':['ball_tree', 'kd_tree' , 'brute', 'auto'],
                'algorithm':['ball_tree', 'kd_tree', 'auto'],

                }

# Fixed seed so the randomized search samples the same candidates on every run.
rdm_grid_search_knn_clf = RandomizedSearchCV(knn_clf, rdm_hyper_params_knn, cv=cv,
                                             return_train_score=True, random_state=42)
grid_search_knn_clf = GridSearchCV(knn_clf, params_knn, cv=cv, return_train_score=True)

In [None]:
# Fit the randomized KNN search on the binary "is it an 8" target.
rdm_grid_search_knn_clf.fit(X=X_train, y=y_train_8)

In [None]:
# Fit the exhaustive search over n_neighbors on the same training data.
grid_search_knn_clf.fit(X=X_train, y=y_train_8)

In [None]:
# Visualize performance for different k values.

# Mean cross-validated test score of each candidate; with a single-parameter
# grid the candidates are in the same order as the n_neighbors values.
scores_knn_clf = grid_search_knn_clf.cv_results_['mean_test_score']
# Read the grid values by key: dict.items() is not subscriptable in Python 3.
x_ticks = params_knn['n_neighbors']  # np.arange(1, 40, 2)
plt.plot(x_ticks, scores_knn_clf, label='KNN Performance')
plt.title('KNN Model Performance at Different K Values')
plt.xlabel('Neighbors')
# No `scoring` was passed to GridSearchCV, so the classifier's default score
# (mean accuracy) is what was recorded.
plt.ylabel('Mean CV accuracy')
plt.legend()  # without a legend the `label` above is never shown




In [None]:
# NOTE(review): this cell was headed "models failed to converge!", which looks
# copied from the logistic-regression section - confirm whether it applies to KNN.
# List the mean cross-validated score of every candidate from the randomized KNN search.

rdm_knn_clf_results = rdm_grid_search_knn_clf.cv_results_

# Walk the per-candidate result arrays in lockstep.
result_columns = ('params', 'mean_test_score', 'std_test_score', 'rank_test_score')
performances_knn_clf = zip(*(rdm_knn_clf_results[col] for col in result_columns))

# One line per candidate: the score first, then its parameter setting.
for params, mean_test_score, *_unused in performances_knn_clf:
    print(mean_test_score, '   ', params)
    


In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.base import clone

# Copy the hyper-parameters of the best estimator found by the randomized KNN search.
# (This line was previously bare prose without a leading '#', which made the
# whole cell a SyntaxError.)
knn_clf_best = clone(rdm_grid_search_knn_clf.best_estimator_)

# KNeighborsClassifier has no decision_function, so use the out-of-fold class
# probabilities instead. Columns follow the sorted class labels (False, True),
# so column 1 is the score for "is an 8".
# The result is stored under a KNN-specific name. The cell used to assign to
# y_score_8_lr_clf, which would have overwritten the logistic-regression scores
# that later cells still read under that name.
y_score_8_knn_clf = cross_val_predict(knn_clf_best, X_train, y_train_8, method="predict_proba")[:, 1]
# now you can visualize performance using the other file (performance_eval)

In [None]:
# Recompute out-of-fold decision scores for the class-weighted logistic regression,
# the one with 'class_weight': {0: 1, 1: 3}.
# NOTE(review): this repeats the earlier logistic-regression cell inside the KNN
# section - confirm it is still needed.
weighted_lr_kwargs = dict(penalty='l2', fit_intercept=True, class_weight={0: 1, 1: 3})
y_score_8_lr_clf_whts = cross_val_predict(LogisticRegression(**weighted_lr_kwargs),
                                          X_train, y_train_8, method="decision_function")

In [None]:
# Compare two models: the unweighted scores vs. the class-weighted scores.
metrics_scores = performance_eval.performance_vs_thresholds(y_true=y_train_8, y_score=y_score_8_lr_clf)
# The weighted model must be evaluated on its own scores; passing the unweighted
# scores a second time produced two identical result sets.
metrics_scores_whts = performance_eval.performance_vs_thresholds(y_true=y_train_8, y_score=y_score_8_lr_clf_whts)


performance_eval.plot_performance_curve(('fp/n', metrics_scores['fp/n']),
                                        ('tp/p', metrics_scores['tp/p']), label='Logestic Regression')

# Second curve is drawn from the weighted model's metrics, not the first model's.
performance_eval.plot_performance_curve(('fp/n', metrics_scores_whts['fp/n']),
                                        ('tp/p', metrics_scores_whts['tp/p']), label='Logestic Regression Weighted',
                                        line_only=True)

# NOTE: an earlier observation that the two ROC curves were exactly the same
# came from plotting the same data twice; re-check the plot after this fix.

In [None]:
# Compare AUC scores of the unweighted and class-weighted score vectors.
from sklearn.metrics import roc_auc_score

auc_by_model = {
    name: roc_auc_score(y_true=y_train_8, y_score=scores)
    for name, scores in (('plain', y_score_8_lr_clf), ('weighted', y_score_8_lr_clf_whts))
}
auc_score_lr_clf = auc_by_model['plain']
auc_score_lr_clf_whts = auc_by_model['weighted']

# Observation: the AUC of the second best model (according to the search) was
# higher than that of the best model.

















In [None]:
# Visualize the ROC curve for the scores stored in y_score_8_lr_clf.
metrics_scores = performance_eval.performance_vs_thresholds(y_score=y_score_8_lr_clf, y_true=y_train_8)

# Each axis is passed as a (name, values) pair.
# 'fp/n' and 'tp/p' are presumably the false- and true-positive rates - confirm in performance_eval.
roc_x = ('fp/n', metrics_scores['fp/n'])
roc_y = ('tp/p', metrics_scores['tp/p'])
performance_eval.plot_performance_curve(roc_x, roc_y, label='Logestic Regression')

# visualize other curves...