In [1]:
import numpy as np
import pandas as pd

# K-Nearest Neighbors Classification (KNN)

1. Pick a value for k.
2. Search for the k observations in the training data that are "nearest" to the measurements of the unknown iris.
3. Use the most popular response value from the k nearest neighbors as the predicted response value for the unknown iris.

# Loading the data

In [2]:
# Fetch scikit-learn's iris dataset and split it into features and labels.
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target  # X: feature matrix, y: one class label per row

# One shape per line: the feature matrix first, then the label vector.
print(X.shape, y.shape, sep="\n")

(150, 4)
(150,)


# scikit-learn 4-step modelling pattern

### Step 1 : Import the class you plan to use

In [3]:
from sklearn.neighbors import KNeighborsClassifier

### Step 2 : Instantiate the estimator

* "Estimator" is scikit-learn's term for a model.

In [4]:
# Create an untrained KNN estimator that consults a single neighbour (k=1);
# no other hyperparameter is passed explicitly.
knn = KNeighborsClassifier(
    n_neighbors=1,
)
print(knn)  # show the estimator's configuration

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


### Step 3 : Fit the model with data (aka model training)

* The model learns the relationship between X and y.

In [5]:
# Train on the full dataset; the cell displays the estimator returned by fit().
knn.fit(X=X, y=y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

### Step 4 : Predict the response for a new sample

* A new sample is referred to as "out-of-sample" data.

In [6]:
# Classify one unseen flower: a single row of four measurements, wrapped in an
# outer list so that it forms a one-row table.
knn.predict(
    [[3, 5, 4, 2]],
)

array([2])

* Returns a NumPy array. The value 2 is the encoded label for the target name virginica.

In [7]:
# Pair every encoded class label with its species name.
# .tolist() converts NumPy scalars to built-in int/str, so the printed pairs read
# (0, 'setosa') rather than (np.int64(0), np.str_('setosa')) on NumPy >= 2.
print(
    list(
        zip(
            np.unique(iris.target).tolist(),
            iris.target_names.tolist(),
        )
    )
)

[(0, 'setosa'), (1, 'versicolor'), (2, 'virginica')]


* The model can predict multiple observations at once.

In [8]:
# Two unseen flowers, one row of four measurements each; reused by later cells.
x_new = (
    [3, 5, 4, 2],
    [5, 4, 3, 2],
)
knn.predict(x_new)  # one predicted label per row

array([2, 1])

## Using a different value for K

In [9]:
# Repeat the experiment with k=5 instead of k=1.
# fit() hands back the estimator itself, so construction and training are chained.
knn2 = KNeighborsClassifier(n_neighbors=5).fit(X, y)
knn2.predict(x_new)

array([1, 1])

## Using a different Classification model

In [10]:
# Same workflow with a different estimator: import, instantiate, fit, predict.
from sklearn.linear_model import LogisticRegression

# Build the model without passing any hyperparameters and train it in one step;
# fit() returns the estimator, so the chained call leaves `logreg` fitted.
# NOTE(review): LogisticRegression defaults (e.g. the solver) have changed between
# scikit-learn releases, so the prediction may differ from the recorded output —
# confirm on the target version.
logreg = LogisticRegression().fit(X, y)

# Classify the same two unseen samples that were passed to the KNN models.
logreg.predict(x_new)

array([2, 0])