**KNN CLASSIFIER**

https://www.datacamp.com/community/tutorials/k-nearest-neighbor-classification-scikit-learn

### **Example 1**

In [68]:
# preparing data

weather=['Sunny','Sunny','Overcast','Rainy','Rainy','Rainy','Overcast','Sunny','Sunny',
'Rainy','Sunny','Overcast','Overcast','Rainy']

temp=['Hot','Hot','Hot','Mild','Cool','Cool','Cool','Mild','Cool','Mild','Mild','Mild','Hot','Mild']

play=['No','No','Yes','Yes','Yes','No','Yes','No','Yes','Yes','Yes','Yes','Yes','No']

In [69]:
#Encoding data columns

from sklearn import preprocessing

le = preprocessing.LabelEncoder()

weather_encoded = le.fit_transform(weather)

print(weather_encoded) # we have only 3 different type of values ['Overcast', 'Rainy', 'Sunny']

# Encoding temp & play columns

temp_encoded = le.fit_transform(temp)
play_encoded = le.fit_transform(play)

print(temp_encoded)
print(play_encoded)

[2 2 0 1 1 1 0 2 2 1 2 0 0 1]
[1 1 1 2 0 0 0 2 0 2 2 2 1 2]
[0 0 1 1 1 0 1 0 1 1 1 1 1 0]


In [70]:
# just to check no.of unique values in each column
import numpy as np

print(np.unique(weather))
print(np.unique(temp))
print(np.unique(play))

['Overcast' 'Rainy' 'Sunny']
['Cool' 'Hot' 'Mild']
['No' 'Yes']


In [71]:
# Check if all columns have same no of observations

print(weather_encoded.shape[0] == temp_encoded.shape[0] == play_encoded.shape[0])

True


In [72]:
# Combining Features

features = list(zip(weather_encoded, temp_encoded))

In [73]:
# Generating Model

from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=3)

clf.fit(features, play_encoded)

clf.predict([[0,2]])

array([1], dtype=int64)

### **Example 2**

In [74]:
from sklearn.datasets import load_wine

wine = load_wine()

In [75]:
print(wine.feature_names)
print('\n')
print(wine.target_names)
print('\n')
print(wine.target)
print('\n')
print(wine.data[0:5])

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


['class_0' 'class_1' 'class_2']


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


[[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
  2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 1.120e+01 1.000e+02 2.650e+00 2.760e+00
  2.600e-01 1.280e+00 4.380e+00 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 1.860e+01 1.010e+02 2.800e+00 3.240e+00
  3.000e-01 2.810e+00 5.680e+00 1.030e+00 3.170e

In [76]:
print(wine.data.shape)
print(wine.target.shape)

(178, 13)
(178,)


In [77]:
# Split data

X = wine.data
y = wine.target

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% train & 30% test

print(  X_train.shape[0] + X_test.shape[0] == wine.data.shape[0]  )
print(  y_train.shape[0] + y_test.shape[0] == wine.target.shape[0]  )

True
True


In [78]:
# Generating Model with K=5

from sklearn.neighbors import KNeighborsClassifier

wine_knn_5 = KNeighborsClassifier(n_neighbors=5)

wine_knn_5.fit(X_train, y_train)

y_predicted_5 = wine_knn_5.predict(X_test)


# Model Evaluation

from sklearn.metrics import accuracy_score

accuracy_5 = accuracy_score(y_test, y_predicted_5)
print('Accuracy:', accuracy_5)

# 68.51%, considered as good accuracy.

Accuracy: 0.6296296296296297


In [79]:
# Re-generating Model for K=7

wine_knn_7 = KNeighborsClassifier(n_neighbors=7)

wine_knn_7.fit(X_train, y_train)

y_predicted_7 = wine_knn_7.predict(X_test)


# Model evaluation

accuracy_7 = accuracy_score(y_test, y_predicted_7)

print('Accuracy :', accuracy_7)

Accuracy : 0.6111111111111112


In [81]:
# Re-generating Model for K=9

wine_knn_9 = KNeighborsClassifier(n_neighbors=9)

wine_knn_9.fit(X_train, y_train)

y_predicted_9 = wine_knn_9.predict(X_test)

print(y_predicted_9)


# Model evaluation

accuracy_9 = accuracy_score(y_test, y_predicted_9)

print('Accuracy :', accuracy_9)

[0 1 1 0 0 0 1 1 0 1 1 0 0 0 1 1 1 2 2 0 1 2 2 1 2 1 1 2 1 2 2 0 2 1 1 0 1
 1 0 1 0 0 2 1 0 0 1 0 1 1 0 1 1 0]
Accuracy : 0.5925925925925926


In [92]:
# for the same data, LogisticRegression

from sklearn.linear_model import LogisticRegression

lgr = LogisticRegression(solver='newton-cg', random_state=1, multi_class='auto')

lgr.fit(X_train, y_train)

y_predicted_lgr = lgr.predict(X_test)

accuracy_score(y_test, y_predicted_lgr)

# not sure on solver arg for LogisticRegression()

0.9444444444444444

In [94]:
y_predicted_lgr

array([0, 1, 2, 2, 1, 0, 2, 2, 0, 1, 2, 0, 0, 0, 2, 1, 1, 2, 1, 0, 2, 1,
       1, 1, 2, 1, 2, 0, 1, 2, 2, 0, 0, 1, 2, 0, 1, 1, 0, 1, 0, 0, 0, 2,
       0, 0, 2, 0, 2, 1, 0, 2, 2, 0])