### K-Nearest Neighbour (KNN) Model


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

%matplotlib inline

In [2]:
# The Iris data set consists of the measurements of four attributes of 150 iris flowers from three types of irises.
# The typical task for the Iris data set is to classify the type of iris based on the measurements.

# The data dimensions are as follows:

#  sepal length in cm;
#  sepal width in cm;
#  petal length in cm;
#  petal width in cm;

#  class:
#    Iris Setosa
#    Iris Versicolor
#    Iris Virginica

# Column names for the Iris table: the four measurements followed by the class label.
# Intended for loading a header-less copy of the data (see the commented-out
# pd.read_table(..., header=None, names=iris_header) call); no active code visible
# in this notebook references it.
iris_header = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'response' ]


In [3]:
# Load the iris data into a DataFrame and preview the first ten rows.
# Alternative remote source (header-less file, so column names are passed explicitly):
#iris_data = pd.read_table('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', sep=',', header=None, names=iris_header)

# NOTE(review): absolute, machine-specific Windows path; the notebook will not run
# on another machine without editing this value.
iris_csv_path = 'C:\\Users\\jp\\Desktop\\testData\\IRIS.csv'
iris_data = pd.read_csv(iris_csv_path)
iris_data.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,response
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [4]:
# distinct class labels present in the response column
iris_data['response'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [5]:
# Re-display the first ten rows as a sanity check that the data loaded properly
iris_data.head(n=10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,response
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [6]:
# dimension of iris_data as (rows, columns): 150 rows, 5 columns
iris_data.shape

(150, 5)

In [7]:
# iris_data summary: per-column non-null counts and dtypes
# 4 columns are numeric (float64) and the 5th column, response, is categorical (object dtype)
iris_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
response        150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


### Machine Learning Models using Scikit Learn KNN

- K-Nearest Neighbour classification Model
- Steps in scikit-learn for model training and prediction



In [8]:
# Separate the predictors (x) from the response (y).
# Dropping the response column keeps every remaining column as a predictor, which is
# equivalent to selecting the four measurement columns explicitly.
x = iris_data.drop('response', axis='columns')
y = iris_data.loc[:, 'response']

In [11]:
# preview the predictor frame to confirm the response column is no longer present
x.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [12]:
# standardize the predictors: scale() centres each column and scales it to unit variance
from sklearn.preprocessing import scale

# Take the column labels and the row index from x itself rather than from a hard-coded
# name list, so the scaled frame stays aligned with y and cannot be silently mislabelled
# if the predictor set changes.
# NOTE(review): the scaling statistics are computed over all rows, before the
# train/test split, so test rows influence the transform. Consider fitting a
# StandardScaler on the training split only.
x_t = pd.DataFrame(scale(x), columns=x.columns, index=x.index)
x_t.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.900681,1.032057,-1.341272,-1.312977
1,-1.143017,-0.124958,-1.341272,-1.312977
2,-1.385353,0.337848,-1.398138,-1.312977
3,-1.506521,0.106445,-1.284407,-1.312977
4,-1.021849,1.26346,-1.341272,-1.312977


In [13]:
# Row-wise normalisation: with its defaults, normalize() rescales each sample (row)
# to unit L2 norm, unlike scale(), which works per column.
# The result gets default integer column labels; x_norm is not used by the later
# cells visible in this notebook.
from sklearn.preprocessing import normalize

unit_norm_rows = normalize(x)
x_norm = pd.DataFrame(unit_norm_rows)
x_norm.head()

Unnamed: 0,0,1,2,3
0,0.803773,0.551609,0.220644,0.031521
1,0.828133,0.50702,0.236609,0.033801
2,0.805333,0.548312,0.222752,0.034269
3,0.80003,0.539151,0.260879,0.034784
4,0.790965,0.569495,0.22147,0.031639


#### Scikit-learn 6-step process to build the KNN predictive model


In [13]:
#1: Import the KNN Model Classifier
from sklearn.neighbors import KNeighborsClassifier

In [14]:
#2: Initialize the Model with scikit-learn's default settings (n_neighbors=5, see the echoed repr)
# Alternative to try: knn = KNeighborsClassifier(n_neighbors=1)
knn = KNeighborsClassifier()
knn  # bare expression so the notebook echoes the estimator's parameters

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [15]:
#3: Split the standardized predictors and the response into training and testing sets.
# 40% of the rows are held out for testing; random_state pins the shuffle so the
# split is reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_t,
    y,
    test_size=0.4,
    random_state=5,
)

In [16]:
#4: Train the model on the training split only
knn_fit = knn.fit(X=x_train, y=y_train)

In [17]:
#5: prediction on the test data
# y_pred holds one predicted class label per row of x_test
y_pred = knn.predict(x_test)

In [18]:
#6: Compare the predicted response (y_pred) against the actual response (y_test)
from sklearn import metrics

# fraction of test rows whose predicted label matches the true label
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

0.9833333333333333


In [20]:
# Side-by-side table of actual vs. predicted class for the test rows.
# The row index comes from y_test, so each row keeps its original position in iris_data.
comp_df = pd.DataFrame({
    'Class_actual': y_test,
    'Class_predicted': y_pred,
})
# To list only the misclassified rows instead:
#comp_df[comp_df.Class_actual != comp_df.Class_predicted]
comp_df.head(15)

Unnamed: 0,Class_actual,Class_predicted
82,Iris-versicolor,Iris-versicolor
134,Iris-virginica,Iris-virginica
114,Iris-virginica,Iris-virginica
42,Iris-setosa,Iris-setosa
109,Iris-virginica,Iris-virginica
57,Iris-versicolor,Iris-versicolor
1,Iris-setosa,Iris-setosa
70,Iris-versicolor,Iris-versicolor
25,Iris-setosa,Iris-setosa
84,Iris-versicolor,Iris-versicolor


In [None]:
# Try different values of K (1 through 25) and record the testing accuracy for each
knn_range = list(range(1, 26))
scores = []

for k in knn_range:
    # Loop-local names, so the model and predictions created in earlier cells
    # are not overwritten by the last iteration of this sweep.
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(x_train, y_train)
    y_pred_k = knn_k.predict(x_test)
    # Store the accuracy as a float: formatted strings would be treated as
    # category labels when plotted, giving a wrongly ordered y-axis.
    scores.append(metrics.accuracy_score(y_test, y_pred_k))

# Format to two decimals for display only
print([format(score, '.2f') for score in scores])

In [None]:
# Plot the testing accuracy for the different K values.
# matplotlib is already imported and set to inline mode in the first cell, so it is
# not repeated here.

# scores may hold formatted strings; coerce to float so matplotlib draws a numeric,
# correctly ordered y-axis rather than a categorical one.
accuracy_by_k = [float(score) for score in scores]

plt.plot(knn_range, accuracy_by_k)
plt.title('KNN testing accuracy by K')
plt.xlabel('K values for knn')
plt.ylabel('testing accuracy')  # original author's note: K=8 showing 97% accuracy (not re-verified here)

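- Testing accuracy from a single train/test split can be a high-variance estimate of out-of-sample accuracy, because it depends on which rows happen to land in the test set
- K-fold cross-validation overcomes this limitation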

In [19]:
### Run a grid search to tune the KNN hyper-parameter n_neighbors
# Each candidate K from 2 to 20 is scored by 5-fold cross-validated accuracy on the
# training split; the test split is not touched here.
from sklearn.model_selection import GridSearchCV

knn_model = KNeighborsClassifier()
param_grid = {"n_neighbors": list(range(2, 21))}

optimized_knn = GridSearchCV(knn_model, param_grid, scoring='accuracy', cv=5)
optimized_knn.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [20]:
# best mean cross-validated accuracy found by the grid search
optimized_knn.best_score_

0.9666666666666667

In [21]:
# parameter setting (n_neighbors) that achieved the best cross-validated accuracy
optimized_knn.best_params_

{'n_neighbors': 7}

In [24]:
# Final model: the KNN estimator with the best K found by the grid search
# (the value reported by optimized_knn.best_params_, not a hard-coded K)
final_knn = optimized_knn.best_estimator_
# No explicit final_knn.fit(x_train, y_train) is needed: the grid search ran with
# refit=True, so best_estimator_ has already been refitted on the training data.

#### Predictive Accuracy estimate on unseen data

In [25]:
# Predictive model performance estimate (accuracy) on unseen data: the held-out test split
y_pred = final_knn.predict(x_test)
# accuracy_score takes (y_true, y_pred): ground truth first, matching the earlier
# evaluation cell. The value is the same either way, but the order matters for
# asymmetric metrics.
print(accuracy_score(y_test, y_pred))

0.9666666666666667
