# KNN

In [2]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

import data_import
import model_evaluator
from data_preprocessing import KMeansDimensionalityReduction

In [3]:
# Load the 'Contagio' dataset through the project's data_import helper.
# The helper returns six objects, unpacked here in the order it yields them;
# presumably full features, full dataframe, and train/test feature and label
# splits — confirm against data_import.import_data_train_test.
(X_tot, df_tot,
 X_train, X_test,
 y_train, y_test) = data_import.import_data_train_test('Contagio')

In [4]:
# Build the project's KMeans-based dimensionality reducer and derive the
# reduced ("_km") versions of the full dataframe, full features and training
# features in a single fit call.
# NOTE(review): the reducer is constructed and fitted with df_tot / X_tot,
# which by name look like the complete dataset (train + test) — confirm that
# no test-set information leaks into the fitted transformation.
KMDimRed = KMeansDimensionalityReduction(df_tot)
(df_tot_km,
 X_tot_km,
 X_train_km) = KMDimRed.fit_ben_mal_kmeans(df_tot, X_tot, X_train)

# The test features go through the transform-only method of the same reducer.
X_test_km = KMDimRed.transform_ben_mal_kmeans(X_test)

In [5]:
# KNN (k=3) trained and evaluated on the original, unreduced features.
# KNeighborsClassifier and model_evaluator come from the notebook's import
# cell, so they are not re-imported here.
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)

# Predictions are reshaped to a single column before being handed to the
# model_evaluator helpers (presumably to match the shape of y_test / y_train —
# confirm against model_evaluator).
y_test_predicted = knn_clf.predict(X_test).reshape(-1, 1)
print('Accuracy on testing: %f%%' % (model_evaluator.compute_accuracy(y_test, y_test_predicted)*100))

# Training accuracy is reported next to test accuracy to expose overfitting.
y_train_predicted = knn_clf.predict(X_train).reshape(-1, 1)
print('Accuracy on training: %f%%' % (model_evaluator.compute_accuracy(y_train, y_train_predicted)*100))

print('f1-score on testing: %f%%' % (model_evaluator.compute_f1_score(y_test, y_test_predicted)*100))
print('False positives: %d' % (model_evaluator.get_false_positive(y_test, y_test_predicted)))

Accuracy on testing: 99.529936%
Accuracy on training: 99.690728%
f1-score on testing: 99.571590%
False positives: 7


In [25]:
# KNN (k=5) trained and evaluated on the KMeans-reduced features
# (X_train_km / X_test_km). Labels are the same y_train / y_test as for the
# original features.
# KNeighborsClassifier and model_evaluator come from the notebook's import
# cell, so they are not re-imported here.
# NOTE: this rebinds `knn_clf`; any later cell that reads `knn_clf` without
# constructing its own estimator will see this k=5 model.
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train_km, y_train)

# Predictions are reshaped to a single column before being handed to the
# model_evaluator helpers (presumably to match the shape of y_test / y_train —
# confirm against model_evaluator).
y_test_predicted = knn_clf.predict(X_test_km).reshape(-1, 1)
print('Accuracy on testing: %f%%' % (model_evaluator.compute_accuracy(y_test, y_test_predicted)*100))

# Training accuracy is reported next to test accuracy to expose overfitting.
y_train_predicted = knn_clf.predict(X_train_km).reshape(-1, 1)
print('Accuracy on training: %f%%' % (model_evaluator.compute_accuracy(y_train, y_train_predicted)*100))

print('f1-score on testing: %f%%' % (model_evaluator.compute_f1_score(y_test, y_test_predicted)*100))
print('False positives: %d' % (model_evaluator.get_false_positive(y_test, y_test_predicted)))

Accuracy on testing: 99.406235%
Accuracy on training: 99.449496%
f1-score on testing: 99.457505%
False positives: 4


In [7]:
# 5-fold cross-validated accuracy of KNN on the original training features.
# The estimator is constructed here instead of reusing whatever `knn_clf` the
# previously executed cell left behind: in notebook order that would be the
# k=5 model from the KMeans cell, while the execution counts suggest this cell
# was run with the k=3 model. Building it explicitly keeps the result the same
# under "Restart & Run All".
knn_clf = KNeighborsClassifier(n_neighbors=3)
kfold_scores_accuracy = model_evaluator.kfold_cross_validation(knn_clf, X_train, y_train, k=5, scoring='accuracy')
kfold_scores_accuracy

array([0.99536178, 0.99443414, 0.9938138 , 0.9938138 , 0.99257655])

In [8]:
# 5-fold cross-validated precision of KNN on the original training features.
# The estimator is constructed here (k=3) instead of relying on a `knn_clf`
# left over from another cell; the scores recorded for this cell are identical
# to the n_neighbors=3 row of the k sweep further down, so k=3 is the model it
# was run with.
# NOTE(review): the variable is named "npv" but the scoring string is
# 'precision'; whether the helper reports negative predictive value depends on
# model_evaluator.kfold_cross_validation — confirm, and rename if it does not.
knn_clf = KNeighborsClassifier(n_neighbors=3)
kfold_scores_npv = model_evaluator.kfold_cross_validation(knn_clf, X_train, y_train, k=5, scoring='precision')
kfold_scores_npv

array([0.99253225, 0.99189737, 0.98710115, 0.98789509, 0.98675035])

In [17]:
# Manual sweep over k = 1..9: for each neighbour count, run 5-fold
# cross-validation with 'precision' scoring on the original training features
# and print the per-fold scores followed by their mean.
for k_neighbors in range(1, 10):
    knn_clf = KNeighborsClassifier(n_neighbors=k_neighbors)
    kfold_scores_npv = model_evaluator.kfold_cross_validation(
        knn_clf, X_train, y_train, k=5, scoring='precision'
    )
    kfold_scores_npv_avg = np.mean(kfold_scores_npv)

    print("i=" + str(k_neighbors))
    print(kfold_scores_npv)
    print("mean=", kfold_scores_npv_avg)

i=1
[0.99727149 0.99390657 0.98977505 0.98989899 0.9922807 ]
mean= 0.9926265594367933
i=2
[0.99253731 0.99123399 0.98643148 0.98791135 0.98675958]
mean= 0.9889747418695256
i=3
[0.99253225 0.99189737 0.98710115 0.98789509 0.98675035]
mean= 0.9892352414656017
i=4
[0.99254743 0.98725687 0.98577236 0.98592493 0.98607242]
mean= 0.98751480283057
i=5
[0.99252717 0.98725687 0.98641304 0.98724832 0.98743017]
mean= 0.9881751163435079
i=6
[0.98784605 0.98725687 0.98440678 0.98460509 0.98467967]
mean= 0.9857588913921923
i=7
[0.98784605 0.98724832 0.98574338 0.98525469 0.98467967]
mean= 0.9861544220792913
i=8
[0.98322148 0.98460509 0.98507463 0.98327759 0.98263889]
mean= 0.9837635342505194
i=9
[0.98384926 0.98657718 0.98507463 0.98393574 0.98332175]
mean= 0.9845517124038947


In [18]:
# Automatic grid search over odd k in {1, 3, 5, 7, 9}, selecting the k with the
# best 5-fold mean precision computed with class 0 as the positive label.
parameters = {'n_neighbors': np.arange(1, 10, 2)}

# A fresh default estimator is used as the template instead of the `knn_clf`
# left behind by an earlier cell: GridSearchCV clones the estimator and
# overrides n_neighbors for every candidate, so the search result is the same
# while the cell no longer depends on which cell ran last.
grid = GridSearchCV(KNeighborsClassifier(), param_grid=parameters,
                    scoring=metrics.make_scorer(metrics.precision_score, pos_label=0), cv=5)

# ravel() flattens the labels to 1-D before fitting.
grid.fit(X_train, y_train.ravel())

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters:", grid.best_params_)

best mean cross-validation score: 0.992
best parameters: {'n_neighbors': 1}


In [27]:
# Automatic grid search over odd k in {1, 3, 5, 7, 9}, this time selecting the
# k with the best 5-fold mean f1-score computed with class 0 as the positive
# label.
parameters = {'n_neighbors': np.arange(1, 10, 2)}

# A fresh default estimator is used as the template instead of the `knn_clf`
# left behind by an earlier cell: GridSearchCV clones the estimator and
# overrides n_neighbors for every candidate, so the search result is the same
# while the cell no longer depends on which cell ran last.
grid = GridSearchCV(KNeighborsClassifier(), param_grid=parameters,
                    scoring=metrics.make_scorer(metrics.f1_score, pos_label=0), cv=5)

# ravel() flattens the labels to 1-D before fitting.
grid.fit(X_train, y_train.ravel())

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters:", grid.best_params_)

best mean cross-validation score: 0.995
best parameters: {'n_neighbors': 1}
