In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier 
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

In [2]:
df = pd.read_csv("../../preprocessing/data_preprocessed.csv")

In [3]:
df.head()

Unnamed: 0,a (AU),e,i (deg),w (deg),Node (deg),M (deg),q (AU),Q (AU),H (mag),MOID (AU),class
0,1.078066,0.826854,22.825495,31.382966,88.010681,215.528772,0.1867,1.97,16.9,0.034507,APO*
1,1.245304,0.335342,13.337482,276.893024,337.207958,104.155607,0.8277,1.66,15.6,0.030669,APO*
2,1.470264,0.559922,6.352995,285.852564,35.736768,174.626213,0.647,2.29,16.25,0.025795,APO*
3,1.776025,0.650141,39.832538,267.791993,356.903343,173.188556,0.6214,2.93,15.2,0.003551,APO*
4,1.874123,0.764602,1.326399,43.388048,349.694944,235.158622,0.4412,3.31,18.8,0.011645,APO*


In [4]:
Y = df['class']
X = df.drop('class', axis=1)
feature_names = X.columns
X.head()

Unnamed: 0,a (AU),e,i (deg),w (deg),Node (deg),M (deg),q (AU),Q (AU),H (mag),MOID (AU)
0,1.078066,0.826854,22.825495,31.382966,88.010681,215.528772,0.1867,1.97,16.9,0.034507
1,1.245304,0.335342,13.337482,276.893024,337.207958,104.155607,0.8277,1.66,15.6,0.030669
2,1.470264,0.559922,6.352995,285.852564,35.736768,174.626213,0.647,2.29,16.25,0.025795
3,1.776025,0.650141,39.832538,267.791993,356.903343,173.188556,0.6214,2.93,15.2,0.003551
4,1.874123,0.764602,1.326399,43.388048,349.694944,235.158622,0.4412,3.31,18.8,0.011645


In [5]:
X.shape

(1747, 10)

In [6]:
Y.shape

(1747,)

In [7]:
Y.unique()

array(['APO*', 'ATE*', 'AMO*', 'APO', 'IEO*', 'ATE'], dtype=object)

In [8]:
def IQR(data, feature_names):
    
    iqr = pd.DataFrame(0, index=feature_names, columns=['lower', 'min', 'num_lower', 'upper', 'max', 'num_upper', 'percantage'])
    for name in feature_names:

        (Q1, Q3) = X[name].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        upper = Q3 + (1.5 * IQR)
        lower = Q1 - (1.5 * IQR)
        
        iqr.loc[name, 'upper'] = upper
        iqr.loc[name, 'lower'] = lower

        num_of_out_lower = (X[name] < lower).sum()
        num_of_out_upper = (X[name] > upper).sum()

        percentage = round((num_of_out_lower + num_of_out_upper) / X.shape[0] * 100)

        iqr.loc[name, 'num_lower'], iqr.loc[name, 'num_upper'], iqr.loc[name, 'percantage']  = num_of_out_lower, num_of_out_upper, percentage
        iqr.loc[name, 'min'], iqr.loc[name, 'max'] = min(X[name]), max(X[name])    
    return iqr

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, stratify=Y, random_state=0)

In [10]:
IQR(X_train, feature_names)

Unnamed: 0,lower,min,num_lower,upper,max,num_upper,percantage
a (AU),-0.104511,0.635223,0,3.546646,3.888719,1,0
e,0.041192,0.025425,1,1.020715,0.956042,0,0
i (deg),-15.987849,0.146084,0,39.647466,75.412403,65,4
w (deg),-160.797589,0.521838,0,523.014817,359.662669,0,0
Node (deg),-178.426441,0.136042,0,517.340838,359.854602,0,0
M (deg),-204.964619,0.052165,0,564.446926,359.825201,0,0
q (AU),0.138375,0.0928,10,1.414575,1.0601,0,1
Q (AU),-0.9575,0.96,0,6.3025,7.01,2,0
H (mag),15.95,14.1,25,24.35,22.4,0,1
MOID (AU),-0.02457,1e-05,0,0.070444,0.049987,0,0


In [11]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
def knn_report(y_true, y_pred):

  # Calculate the confusion matrix.
  confusion_matrix = metrics.confusion_matrix(y_true, y_pred)

  # Calculate the accuracy, precision, recall, and F1 score.
  accuracy = metrics.accuracy_score(y_true, y_pred)
  # Izabran je micro jer je default binary i micro uzima u obzir da su sve komponente jednako bitne
  precision = metrics.precision_score(y_true, y_pred, average = 'micro')
  recall = metrics.recall_score(y_true, y_pred, average = 'micro')
  f1 = metrics.f1_score(y_true, y_pred, average = 'micro')

  # Print the report.
  print("Confusion matrix:")
  print(confusion_matrix)
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("Recall:", recall)
  print("F1 score:", f1)

In [13]:
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train) # treniranje modela
Y_train_pred = knn.predict(X_train)
knn_report(Y_train, Y_train_pred)


Confusion matrix:
[[  31    0   36    0    0    0]
 [   0    0   10    0    0    0]
 [   3    0 1024    0    5    0]
 [   0    0    3    0    2    0]
 [   0    0   35    0   69    0]
 [   0    0    2    0    2    0]]
Accuracy: 0.9198036006546645
Precision: 0.9198036006546645
Recall: 0.9198036006546645
F1 score: 0.9198036006546645


In [14]:
Y_test_pred = knn.predict(X_test)
knn_report(Y_test, Y_test_pred)

Confusion matrix:
[[  2   0  27   0   0   0]
 [  0   0   4   0   0   0]
 [  4   0 436   0   4   0]
 [  0   0   1   0   1   0]
 [  0   0  28   0  17   0]
 [  0   0   0   0   1   0]]
Accuracy: 0.8666666666666667
Precision: 0.8666666666666667
Recall: 0.8666666666666667
F1 score: 0.8666666666666667


In [15]:
params = {
    'n_neighbors' : range(2,20),
    'weights' : ['uniform', 'distance']
}

In [30]:
estimator = GridSearchCV(KNeighborsClassifier(), params, cv = 4, verbose = 4)

In [31]:
estimator.fit(X_train, Y_train)

Fitting 4 folds for each of 36 candidates, totalling 144 fits
[CV 1/4] END ....n_neighbors=2, weights=uniform;, score=0.853 total time=   0.0s
[CV 2/4] END ....n_neighbors=2, weights=uniform;, score=0.833 total time=   0.0s
[CV 3/4] END ....n_neighbors=2, weights=uniform;, score=0.833 total time=   0.0s
[CV 4/4] END ....n_neighbors=2, weights=uniform;, score=0.836 total time=   0.0s
[CV 1/4] END ...n_neighbors=2, weights=distance;, score=0.882 total time=   0.0s
[CV 2/4] END ...n_neighbors=2, weights=distance;, score=0.853 total time=   0.0s
[CV 3/4] END ...n_neighbors=2, weights=distance;, score=0.849 total time=   0.0s
[CV 4/4] END ...n_neighbors=2, weights=distance;, score=0.869 total time=   0.0s
[CV 1/4] END ....n_neighbors=3, weights=uniform;, score=0.882 total time=   0.0s
[CV 2/4] END ....n_neighbors=3, weights=uniform;, score=0.866 total time=   0.0s
[CV 3/4] END ....n_neighbors=3, weights=uniform;, score=0.866 total time=   0.0s
[CV 4/4] END ....n_neighbors=3, weights=uniform

[CV 1/4] END ...n_neighbors=15, weights=uniform;, score=0.866 total time=   0.0s
[CV 2/4] END ...n_neighbors=15, weights=uniform;, score=0.856 total time=   0.0s
[CV 3/4] END ...n_neighbors=15, weights=uniform;, score=0.856 total time=   0.0s
[CV 4/4] END ...n_neighbors=15, weights=uniform;, score=0.859 total time=   0.0s
[CV 1/4] END ..n_neighbors=15, weights=distance;, score=0.873 total time=   0.0s
[CV 2/4] END ..n_neighbors=15, weights=distance;, score=0.856 total time=   0.0s
[CV 3/4] END ..n_neighbors=15, weights=distance;, score=0.856 total time=   0.0s
[CV 4/4] END ..n_neighbors=15, weights=distance;, score=0.869 total time=   0.0s
[CV 1/4] END ...n_neighbors=16, weights=uniform;, score=0.859 total time=   0.0s
[CV 2/4] END ...n_neighbors=16, weights=uniform;, score=0.853 total time=   0.0s
[CV 3/4] END ...n_neighbors=16, weights=uniform;, score=0.856 total time=   0.0s
[CV 4/4] END ...n_neighbors=16, weights=uniform;, score=0.856 total time=   0.0s
[CV 1/4] END ..n_neighbors=1

In [32]:
estimator.best_estimator_

In [33]:
estimator.best_score_

0.8829717132754741

In [34]:
model = KNeighborsClassifier(n_neighbors=6, weights='distance')
model.fit(X_train, Y_train)

In [35]:
Y_train_pred = model.predict(X_train)
knn_report(Y_train, Y_train_pred)

Confusion matrix:
[[  67    0    0    0    0    0]
 [   0   10    0    0    0    0]
 [   0    0 1032    0    0    0]
 [   0    0    0    5    0    0]
 [   0    0    0    0  104    0]
 [   0    0    0    0    0    4]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 score: 1.0


In [36]:
Y_test_pred = model.predict(X_test)
knn_report(Y_test, Y_test_pred)

Confusion matrix:
[[  2   0  27   0   0   0]
 [  0   0   4   0   0   0]
 [  4   0 435   0   5   0]
 [  0   0   1   0   1   0]
 [  0   0  26   0  19   0]
 [  0   0   0   0   1   0]]
Accuracy: 0.8685714285714285
Precision: 0.8685714285714285
Recall: 0.8685714285714285
F1 score: 0.8685714285714285


In [40]:

baggingKnn = BaggingClassifier(estimator=KNeighborsClassifier(), n_estimators=20)
baggingKnn.fit(X_train, Y_train)
Y_train_pred = baggingKnn.predict(X_train)
Y_test_pred = baggingKnn.predict(X_test)
knn_report(Y_train, Y_train_pred)
print("---------------------------------")
knn_report(Y_test, Y_test_pred)

Confusion matrix:
[[  24    0   43    0    0    0]
 [   0    0    9    0    1    0]
 [   2    0 1026    0    4    0]
 [   0    0    3    0    2    0]
 [   0    0   35    0   69    0]
 [   0    0    2    0    2    0]]
Accuracy: 0.9157119476268413
Precision: 0.9157119476268413
Recall: 0.9157119476268413
F1 score: 0.9157119476268413
---------------------------------
Confusion matrix:
[[  1   0  28   0   0   0]
 [  0   0   4   0   0   0]
 [  2   0 437   0   5   0]
 [  0   0   1   0   1   0]
 [  0   0  28   0  17   0]
 [  0   0   0   0   1   0]]
Accuracy: 0.8666666666666667
Precision: 0.8666666666666667
Recall: 0.8666666666666667
F1 score: 0.8666666666666667
