In [15]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [16]:
# Load the iris table and discard the 'id' column (presumably just a row
# identifier -- it is not used as a predictor below). The drop is chained
# rather than done with inplace=True, so `iris` is built in one expression
# and never exists in a half-processed state.
iris = pd.read_csv('../iris.csv').drop(columns='id')

# Predictors: the two petal measurements. Target: the species label.
X = iris[['petal_len', 'petal_wd']]
y = iris['species']

# Hold out 30% of the rows for testing. random_state=1 makes the shuffle
# reproducible; stratify=y asks for the split to preserve the class
# proportions of y in both the training and the testing part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1, stratify=y
)

## instantiate
# The classifier votes among the 5 nearest training points.
knn = KNeighborsClassifier(n_neighbors=5)

## fit
# Kept as the last expression of the cell so the fitted estimator is echoed.
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [17]:
# Hard predictions: predict() yields one class label per row of X_test.
pred = knn.predict(X_test)

# Show only the first five labels rather than dumping the whole array.
print(pred[0:5])

['iris-virginica' 'iris-setosa' 'iris-setosa' 'iris-versicolor'
 'iris-versicolor']


In [18]:
"""
Each prediction is a species of iris and stored in a 1darray.
predict() returns an array of predicted class labels for the predictor data.
"""

'\nEach prediction is a species of iris and stored in a 1darray.\npredict() returns an array of predicted class labels for the predictor data.\n'

In [19]:
"""
Most classification algorithms implemented in scikit-learn 
provide an additional method, 'predict_proba'. 
Instead of returning a single predicted label, 
it outputs the probability of each target class in array form. 
Let’s take a look at what the predicted probabilities are for the 11th and 12th flowers:
"""

# Soft predictions: one row per test sample, one probability per class.
y_pred_prob = knn.predict_proba(X_test)

# Rows 10 and 11 are the 11th and 12th flowers of the test set.
print(y_pred_prob[slice(10, 12)])

[[1.  0.  0. ]
 [0.  0.2 0.8]]


In [20]:
# Example: for the 11th test flower (index 10) the predicted probability is 1
# for iris-setosa and 0 for both iris-versicolor and iris-virginica.
print(y_pred_prob[10, :])

[1. 0. 0.]


In [21]:
"""
For the next flower, 
there is a 20% chance that it is an iris-versicolor and an 80% chance that it is an iris-virginica. 
This tells us that, of the five nearest neighbours of the 12th flower in the testing set, 1 is an iris-versicolor and the remaining 4 are iris-virginica.
"""

# Index 11 is the 12th flower of the test set.
print(y_pred_prob[11])

[0.  0.2 0.8]


In [22]:
# Hard labels and soft probabilities for the same two flowers (11th and 12th),
# printed one after the other for comparison.
print(pred[10:12], y_pred_prob[10:12], sep='\n')

['iris-setosa' 'iris-virginica']
[[1.  0.  0. ]
 [0.  0.2 0.8]]


In [23]:
"""
*** In classification tasks, 
soft prediction returns the predicted probabilities of data points belonging to each of the classes while hard prediction outputs the labels only.
"""

'\n*** In classification tasks, \nsoft prediction returns the predicted probabilities of data points belonging to each of the classes while hard prediction outputs the labels only.\n'