In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("creditDefault.csv")

In [4]:
targetName = "default payment next month"
df.columns.get_loc(targetName)
featureNames = df.columns[np.where(df.columns != targetName)]

X = df[featureNames].values
y = df[targetName].values

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree

k=5
knn_clf = KNeighborsClassifier(n_neighbors = k)
knn_clf.fit(X_train, y_train)

tree_clf = tree.DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [20]:
from sklearn.model_selection import cross_val_score

knn_cv_score = cross_val_score(knn_clf, X_train, y_train, cv=5)
tree_cv_score = cross_val_score(tree_clf, X_train, y_train, cv=5)

print(knn_cv_score)
print(tree_cv_score)

print("Knn cv score mean: {}".format(np.mean(knn_cv_score)))
print("Tree cv score mean: {}".format(np.mean(tree_cv_score)))

[0.74520833 0.753125   0.75166667 0.74604167 0.74604167]
[0.72166667 0.73125    0.72583333 0.73958333 0.73125   ]
Knn cv score mean: 0.7484166666666667
Tree cv score mean: 0.7299166666666667


In [31]:
from sklearn.model_selection import cross_validate

metrics_to_calculate= ['accuracy', 'roc_auc', 'precision', 'f1', 'recall']

knn_cv_scores = cross_validate(knn_clf, X_train, y_train, cv=5, scoring=metrics_to_calculate)
tree_cv_scores = cross_validate(tree_clf, X_train, y_train, cv=5, scoring=metrics_to_calculate)

print(knn_cv_scores)
print(tree_cv_scores)

{'fit_time': array([0.15960836, 0.1505971 , 0.14760494, 0.14860177, 0.14756727]), 'score_time': array([0.91950488, 0.94148326, 0.94148493, 1.01033401, 0.90964556]), 'test_accuracy': array([0.74520833, 0.753125  , 0.75166667, 0.74604167, 0.74604167]), 'test_roc_auc': array([0.59616151, 0.60175608, 0.59909434, 0.59741677, 0.59928779]), 'test_precision': array([0.35675676, 0.38649156, 0.37119675, 0.35019455, 0.35634328]), 'test_f1': array([0.24459543, 0.25798372, 0.23491656, 0.2279924 , 0.23860087]), 'test_recall': array([0.18609023, 0.19360902, 0.17183099, 0.16901408, 0.17934272])}
{'fit_time': array([0.41090274, 0.41687608, 0.4098649 , 0.40891123, 0.41788268]), 'score_time': array([0.00897765, 0.00997615, 0.00900888, 0.00997448, 0.00997353]), 'test_accuracy': array([0.725625  , 0.72854167, 0.723125  , 0.731875  , 0.72625   ]), 'test_roc_auc': array([0.61469285, 0.60816421, 0.61467843, 0.62365707, 0.61769331]), 'test_precision': array([0.3887423 , 0.38863001, 0.38601036, 0.40228873, 0.39

In [32]:
knn_accuracy_scores=np.mean(knn_cv_scores['test_accuracy'])

print(knn_accuracy_scores)

0.7484166666666667


In [33]:
tree_accuracy_scores=np.mean(tree_cv_scores['test_accuracy'])

print(tree_accuracy_scores)

0.7270833333333333


In [34]:
from sklearn.metrics import SCORERS

sorted(SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

In [38]:
from sklearn.metrics import confusion_matrix

knn_pred=knn_clf.predict(X_test)

confusion_matrix(y_test, knn_pred)

array([[4270,  417],
       [1078,  235]], dtype=int64)

In [40]:
tree_pred = tree_clf.predict(X_test)

confusion_matrix(y_test, tree_pred)

array([[3794,  893],
       [ 769,  544]], dtype=int64)