In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from pprint import pprint

# Data Load

In [3]:
iris = datasets.load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)
target = iris.target 

In [4]:
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [6]:
print('target data')
print(target)
print(' ')
print('length of target data :{}'.format(len(target)))


target data
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
 
length of target data :150


# Normal Knn

In [27]:
X_train, X_val, y_train, y_val = train_test_split(data, target)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [10]:
print('For Test')
print('Input vector labeled 0')
print(knn.predict([[5.1,	3.5,	1.4,	0.2]]))
print('Input vector labeled 2')
print(knn.predict([[6.5,	3.0,	5.2,	2.0]]))

For Test
Input vector labeled 0
[0]
Input vector labeled 2
[2]


In [36]:
original_pred  = knn.predict(X_val)
pprint(classification_report(y_val, original_pred))
original_acc = accuracy_score(original_pred, y_val)

('              precision    recall  f1-score   support\n'
 '\n'
 '           0       1.00      1.00      1.00        14\n'
 '           1       1.00      0.79      0.88        14\n'
 '           2       0.77      1.00      0.87        10\n'
 '\n'
 '    accuracy                           0.92        38\n'
 '   macro avg       0.92      0.93      0.92        38\n'
 'weighted avg       0.94      0.92      0.92        38\n')


# Variable Selection

In [34]:
def feature_iteration(data) :
  acc_vector = []

  for col in data :
    # remove one column 
    column = list(data.columns)
    column.remove(col)
    
    # make new dataset that removes one column
    adjusted_data = data[column]
    X_train, X_val, y_train, y_val = train_test_split(adjusted_data, target)

    # learning using Knn
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)
    acc_vector.append(accuracy_score(knn.predict(X_val), y_val))
    
  return acc_vector

In [35]:
acc_vector = feature_iteration(data)
pprint(acc_vector)

[0.9473684210526315, 0.9736842105263158, 0.8947368421052632, 0.9473684210526315]


# Calculate feature importance

In [45]:
for idx in range(len(acc_vector)) :
  acc_vector[idx] = 1-(acc_vector[idx]-original_acc) 

In [46]:
acc_vector

[0.9736842105263158, 0.9473684210526315, 1.026315789473684, 0.9736842105263158]

acc_vector의 accuracy가 작을수록 해당 feature의 가중치가 높음

In [50]:
sum_of_disc = sum(acc_vector)

weight = []
for acc in acc_vector :
  weight.append(acc/sum_of_disc) 

In [51]:
weight

[0.24832214765100674,
 0.24161073825503357,
 0.26174496644295303,
 0.24832214765100674]