In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier 
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from termcolor import colored
from collections import Counter

In [2]:
df = pd.read_csv("../../preprocessing/data_preprocessed.csv")

In [3]:
df.head()

Unnamed: 0,a (AU),e,i (deg),w (deg),Node (deg),M (deg),q (AU),Q (AU),H (mag),MOID (AU),class
0,1.078066,0.826854,22.825495,31.382966,88.010681,215.528772,0.1867,1.97,16.9,0.034507,APO*
1,1.245304,0.335342,13.337482,276.893024,337.207958,104.155607,0.8277,1.66,15.6,0.030669,APO*
2,1.470264,0.559922,6.352995,285.852564,35.736768,174.626213,0.647,2.29,16.25,0.025795,APO*
3,1.776025,0.650141,39.832538,267.791993,356.903343,173.188556,0.6214,2.93,15.2,0.003551,APO*
4,1.874123,0.764602,1.326399,43.388048,349.694944,235.158622,0.4412,3.31,18.8,0.011645,APO*


In [4]:
Y = df['class']
X = df.drop('class', axis=1)
feature_names = X.columns
X.head()

Unnamed: 0,a (AU),e,i (deg),w (deg),Node (deg),M (deg),q (AU),Q (AU),H (mag),MOID (AU)
0,1.078066,0.826854,22.825495,31.382966,88.010681,215.528772,0.1867,1.97,16.9,0.034507
1,1.245304,0.335342,13.337482,276.893024,337.207958,104.155607,0.8277,1.66,15.6,0.030669
2,1.470264,0.559922,6.352995,285.852564,35.736768,174.626213,0.647,2.29,16.25,0.025795
3,1.776025,0.650141,39.832538,267.791993,356.903343,173.188556,0.6214,2.93,15.2,0.003551
4,1.874123,0.764602,1.326399,43.388048,349.694944,235.158622,0.4412,3.31,18.8,0.011645


In [5]:
X.shape

(1747, 10)

In [6]:
Y.shape

(1747,)

In [7]:
Y.unique()

array(['APO*', 'ATE*', 'AMO*', 'APO', 'IEO*', 'ATE'], dtype=object)

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, stratify=Y, random_state=0)

In [11]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
def knn_report(y_true, y_pred):

  confusion_matrix = metrics.confusion_matrix(y_true, y_pred)

  accuracy = metrics.accuracy_score(y_true, y_pred)
  # Izabran je micro jer je default binary i micro uzima u obzir da su sve komponente jednako bitne
  precision = metrics.precision_score(y_true, y_pred, average = 'micro')
  recall = metrics.recall_score(y_true, y_pred, average = 'micro')
  f1 = metrics.f1_score(y_true, y_pred, average = 'micro')

  print("Confusion matrix:")
  print(confusion_matrix)
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("Recall:", recall)
  print("F1 score:", f1)

In [13]:
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
Y_train_pred = knn.predict(X_train)
knn_report(Y_train, Y_train_pred)

Confusion matrix:
[[  31    0   36    0    0    0]
 [   0    0   10    0    0    0]
 [   3    0 1024    0    5    0]
 [   0    0    3    0    2    0]
 [   0    0   35    0   69    0]
 [   0    0    2    0    2    0]]
Accuracy: 0.9198036006546645
Precision: 0.9198036006546645
Recall: 0.9198036006546645
F1 score: 0.9198036006546645


In [14]:
Y_test_pred = knn.predict(X_test)
knn_report(Y_test, Y_test_pred)

Confusion matrix:
[[  2   0  27   0   0   0]
 [  0   0   4   0   0   0]
 [  4   0 436   0   4   0]
 [  0   0   1   0   1   0]
 [  0   0  28   0  17   0]
 [  0   0   0   0   1   0]]
Accuracy: 0.8666666666666667
Precision: 0.8666666666666667
Recall: 0.8666666666666667
F1 score: 0.8666666666666667


In [15]:
params = {
    'n_neighbors' : range(2,20),
    'weights' : ['uniform', 'distance'],
    'p' : [1,2]
}

In [16]:
estimator = GridSearchCV(KNeighborsClassifier(), params, cv = 4, verbose = 4)

In [17]:
estimator.fit(X_train, Y_train)

Fitting 4 folds for each of 72 candidates, totalling 288 fits
[CV 1/4] END n_neighbors=2, p=1, weights=uniform;, score=0.879 total time=   0.0s
[CV 2/4] END n_neighbors=2, p=1, weights=uniform;, score=0.856 total time=   0.0s
[CV 3/4] END n_neighbors=2, p=1, weights=uniform;, score=0.849 total time=   0.0s
[CV 4/4] END n_neighbors=2, p=1, weights=uniform;, score=0.843 total time=   0.0s
[CV 1/4] END n_neighbors=2, p=1, weights=distance;, score=0.882 total time=   0.0s
[CV 2/4] END n_neighbors=2, p=1, weights=distance;, score=0.882 total time=   0.0s
[CV 3/4] END n_neighbors=2, p=1, weights=distance;, score=0.866 total time=   0.0s
[CV 4/4] END n_neighbors=2, p=1, weights=distance;, score=0.869 total time=   0.0s
[CV 1/4] END n_neighbors=2, p=2, weights=uniform;, score=0.853 total time=   0.0s
[CV 2/4] END n_neighbors=2, p=2, weights=uniform;, score=0.833 total time=   0.0s
[CV 3/4] END n_neighbors=2, p=2, weights=uniform;, score=0.833 total time=   0.0s
[CV 4/4] END n_neighbors=2, p=2,

[CV 4/4] END n_neighbors=8, p=1, weights=distance;, score=0.898 total time=   0.0s
[CV 1/4] END n_neighbors=8, p=2, weights=uniform;, score=0.866 total time=   0.0s
[CV 2/4] END n_neighbors=8, p=2, weights=uniform;, score=0.843 total time=   0.0s
[CV 3/4] END n_neighbors=8, p=2, weights=uniform;, score=0.856 total time=   0.0s
[CV 4/4] END n_neighbors=8, p=2, weights=uniform;, score=0.875 total time=   0.0s
[CV 1/4] END n_neighbors=8, p=2, weights=distance;, score=0.892 total time=   0.0s
[CV 2/4] END n_neighbors=8, p=2, weights=distance;, score=0.869 total time=   0.0s
[CV 3/4] END n_neighbors=8, p=2, weights=distance;, score=0.859 total time=   0.0s
[CV 4/4] END n_neighbors=8, p=2, weights=distance;, score=0.885 total time=   0.0s
[CV 1/4] END n_neighbors=9, p=1, weights=uniform;, score=0.908 total time=   0.0s
[CV 2/4] END n_neighbors=9, p=1, weights=uniform;, score=0.886 total time=   0.0s
[CV 3/4] END n_neighbors=9, p=1, weights=uniform;, score=0.879 total time=   0.0s
[CV 4/4] EN

[CV 1/4] END n_neighbors=14, p=2, weights=distance;, score=0.876 total time=   0.0s
[CV 2/4] END n_neighbors=14, p=2, weights=distance;, score=0.856 total time=   0.0s
[CV 3/4] END n_neighbors=14, p=2, weights=distance;, score=0.856 total time=   0.0s
[CV 4/4] END n_neighbors=14, p=2, weights=distance;, score=0.879 total time=   0.0s
[CV 1/4] END n_neighbors=15, p=1, weights=uniform;, score=0.902 total time=   0.0s
[CV 2/4] END n_neighbors=15, p=1, weights=uniform;, score=0.876 total time=   0.0s
[CV 3/4] END n_neighbors=15, p=1, weights=uniform;, score=0.875 total time=   0.0s
[CV 4/4] END n_neighbors=15, p=1, weights=uniform;, score=0.898 total time=   0.0s
[CV 1/4] END n_neighbors=15, p=1, weights=distance;, score=0.908 total time=   0.0s
[CV 2/4] END n_neighbors=15, p=1, weights=distance;, score=0.889 total time=   0.0s
[CV 3/4] END n_neighbors=15, p=1, weights=distance;, score=0.875 total time=   0.0s
[CV 4/4] END n_neighbors=15, p=1, weights=distance;, score=0.905 total time=   0

In [18]:
estimator.best_estimator_

In [19]:
estimator.best_score_

0.9058957462766526

In [20]:
model = KNeighborsClassifier(n_neighbors=6, weights='distance', p = 1)
model.fit(X_train, Y_train)

In [21]:
Y_train_pred = model.predict(X_train)
knn_report(Y_train, Y_train_pred)

Confusion matrix:
[[  67    0    0    0    0    0]
 [   0   10    0    0    0    0]
 [   0    0 1032    0    0    0]
 [   0    0    0    5    0    0]
 [   0    0    0    0  104    0]
 [   0    0    0    0    0    4]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 score: 1.0


In [22]:
Y_test_pred = model.predict(X_test)
knn_report(Y_test, Y_test_pred)

Confusion matrix:
[[  7   0  22   0   0   0]
 [  0   0   4   0   0   0]
 [  4   0 438   0   2   0]
 [  0   0   1   0   1   0]
 [  0   0  18   0  27   0]
 [  0   0   0   0   1   0]]
Accuracy: 0.8990476190476191
Precision: 0.8990476190476191
Recall: 0.8990476190476191
F1 score: 0.8990476190476191


In [23]:
baggingKnn = BaggingClassifier(estimator=KNeighborsClassifier(), n_estimators=20)
baggingKnn.fit(X_train, Y_train)
Y_train_pred = baggingKnn.predict(X_train)
Y_test_pred = baggingKnn.predict(X_test)
knn_report(Y_train, Y_train_pred)
print("---------------------------------")
knn_report(Y_test, Y_test_pred)

Confusion matrix:
[[  22    0   45    0    0    0]
 [   0    0    9    0    1    0]
 [   0    0 1026    0    6    0]
 [   0    0    2    0    3    0]
 [   0    0   32    0   72    0]
 [   0    0    2    0    2    0]]
Accuracy: 0.9165302782324058
Precision: 0.9165302782324058
Recall: 0.9165302782324058
F1 score: 0.9165302782324058
---------------------------------
Confusion matrix:
[[  3   0  26   0   0   0]
 [  0   0   4   0   0   0]
 [  3   0 437   0   4   0]
 [  0   0   1   0   1   0]
 [  0   0  29   0  16   0]
 [  0   0   0   0   1   0]]
Accuracy: 0.8685714285714285
Precision: 0.8685714285714285
Recall: 0.8685714285714285
F1 score: 0.8685714285714285


In [24]:
smoteenn = SMOTEENN(random_state=42, sampling_strategy='auto', smote=SMOTE(sampling_strategy='auto', k_neighbors=1))
X_train_resampled, y_train_resampled = smoteenn.fit_resample(X_train, Y_train)

print(colored("Number of instances before resampling with SMOTEENN: {}. ".format(Counter(Y_train).items()), "green"))
print(colored("Number of instances after resampling with SMOTEENN: {}. ".format(Counter(y_train_resampled).items()), "blue"))

[32mNumber of instances before resampling with SMOTEENN: dict_items([('APO*', 1032), ('AMO*', 67), ('ATE*', 104), ('APO', 10), ('IEO*', 4), ('ATE', 5)]). [0m
[34mNumber of instances after resampling with SMOTEENN: dict_items([('AMO*', 1029), ('APO', 1032), ('APO*', 802), ('ATE', 1032), ('ATE*', 1032), ('IEO*', 1032)]). [0m


In [25]:
params = {
    'n_neighbors' : range(2,5),
    'weights' : ['uniform', 'distance'],
    'p' : [1,2]
}

In [26]:
estimator = GridSearchCV(KNeighborsClassifier(), params, cv = 4, verbose = 4)
estimator.fit(X_train_resampled, y_train_resampled)

Fitting 4 folds for each of 12 candidates, totalling 48 fits
[CV 1/4] END n_neighbors=2, p=1, weights=uniform;, score=0.991 total time=   0.1s
[CV 2/4] END n_neighbors=2, p=1, weights=uniform;, score=0.992 total time=   0.1s
[CV 3/4] END n_neighbors=2, p=1, weights=uniform;, score=0.992 total time=   0.1s
[CV 4/4] END n_neighbors=2, p=1, weights=uniform;, score=0.988 total time=   0.1s
[CV 1/4] END n_neighbors=2, p=1, weights=distance;, score=0.995 total time=   0.1s
[CV 2/4] END n_neighbors=2, p=1, weights=distance;, score=0.998 total time=   0.1s
[CV 3/4] END n_neighbors=2, p=1, weights=distance;, score=0.996 total time=   0.1s
[CV 4/4] END n_neighbors=2, p=1, weights=distance;, score=0.998 total time=   0.1s
[CV 1/4] END n_neighbors=2, p=2, weights=uniform;, score=0.994 total time=   0.1s
[CV 2/4] END n_neighbors=2, p=2, weights=uniform;, score=0.993 total time=   0.1s
[CV 3/4] END n_neighbors=2, p=2, weights=uniform;, score=0.996 total time=   0.1s
[CV 4/4] END n_neighbors=2, p=2, 

In [27]:
estimator.best_estimator_

In [28]:
estimator.best_score_

0.9993287463772362

In [29]:
model = KNeighborsClassifier(n_neighbors=2, weights='distance')
model.fit(X_train_resampled, y_train_resampled)

In [30]:
y_train_resampled_pred = model.predict(X_train_resampled)
knn_report(y_train_resampled, y_train_resampled_pred)

Confusion matrix:
[[1029    0    0    0    0    0]
 [   0 1032    0    0    0    0]
 [   0    0  802    0    0    0]
 [   0    0    0 1032    0    0]
 [   0    0    0    0 1032    0]
 [   0    0    0    0    0 1032]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 score: 1.0


In [31]:
Y_test_pred = model.predict(X_test)
knn_report(Y_test, Y_test_pred)

Confusion matrix:
[[ 12   0  16   0   1   0]
 [  0   0   4   0   0   0]
 [ 31   7 390   1  15   0]
 [  0   0   1   0   1   0]
 [  0   0  13   3  26   3]
 [  0   0   0   0   0   1]]
Accuracy: 0.8171428571428572
Precision: 0.8171428571428572
Recall: 0.8171428571428572
F1 score: 0.8171428571428572


In [32]:
#preprilagodio se model