In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the iris table and discard its 'id' column.
iris = pd.read_csv('../iris.csv').drop(columns='id')

# Features are the two petal measurements; the target is the species label.
X = iris.loc[:, ['petal_len', 'petal_wd']]
y = iris.loc[:, 'species']

# Stratified 70/30 split with a fixed seed.
# NOTE(review): every fit below uses the full X, y; the split is not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
    stratify=y,
)

# Candidate values of n_neighbors to try: 2 through 14.
param_grid = {'n_neighbors': np.arange(2, 15)}

# Create a new knn model and grid-search n_neighbors with cv=4.
knn2 = KNeighborsClassifier()
knn_gscv = GridSearchCV(knn2, param_grid, cv=4)
knn_gscv.fit(X, y)

# Build the final classifier with the best n_neighbors found and fit it on X, y.
best_k = knn_gscv.best_params_['n_neighbors']
knn_final = KNeighborsClassifier(n_neighbors=best_k)
knn_final.fit(X, y)

KNeighborsClassifier()

In [6]:
"""
We take some measurements of an iris and record that the length and width of its sepal are 5.84 cm and 3.06 cm, respectively, 
and the length and width of its petal are 3.76 cm and 1.20 cm, respectively. 
How do we make a prediction using the built model?

Remember that our model depends only on ['petal_len', 'petal_wd'], so the sepal measurements are not used.
"""

# Error: predict() rejects the 1D array below because it expects 2D input.
#new_data = np.array([3.76, 1.20])
#knn_final.predict(np.array(new_data))

"""
Wait, what just happened?
When we trained the model,
the data was a 2D DataFrame,
so the model expects a 2D array,
which can be a numpy array or a pandas DataFrame.
Here new_data was a 1D array, so we need to make it 2D, as the error message suggested:
"""

# One plant as [petal_len, petal_wd]; reshape the 1D vector into a single-row 2D array.
sample = np.array([3.76, 1.20])
new_data = sample.reshape(1, -1)
knn_final.predict(new_data)



array(['iris-versicolor'], dtype=object)

In [7]:
"""
Model.predict can also take a 2D list. 
For example, 
knn_final.predict([[3.76, 1.2]]) will output the same result as shown in the lesson.
"""
# A nested list (one inner list per sample) is passed to predict() directly.
nested_sample = [[3.76, 1.2]]
knn_final.predict(nested_sample)



array(['iris-versicolor'], dtype=object)

In [8]:
"""
Let's try this:
three iris plants share the same petal width, 1.20 cm, but differ in petal length: 3.76 cm, 5.25 cm, and 1.58 cm, respectively.
"""

# Three samples, one row each as [petal_len, petal_wd]; all share petal width 1.2.
petal_lengths = [3.76, 5.25, 1.58]
new_data = np.array([[length, 1.2] for length in petal_lengths])
knn_final.predict(new_data)



array(['iris-versicolor', 'iris-virginica', 'iris-setosa'], dtype=object)

In [9]:
# In classification it is common to look at the probability of each label
# for every data point rather than only the predicted label.
class_probabilities = knn_final.predict_proba(new_data)
print(class_probabilities)

"""
Each row sums to 1. 
Take the second iris, our model predicts that there is a probability of 40% that the iris would be versicolor, and 60% virginica. 
This is consistent with the label prediction: virginica.

#For classification algorithms in scikit learn, function predict_proba takes a new data point and outputs a probability for each class as a value between 0 and 1.
"""

[[0.  1.  0. ]
 [0.  0.4 0.6]
 [1.  0.  0. ]]


