In [None]:
# Begin by importing all necessary libraries
import pandas as pd
import sklearn as sk

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

You can download the iris.csv file <a href='https://www.kaggle.com/uciml/iris'>here</a>.

Put the data file in a `data` folder next to your notebook, since the code below reads it from `./data/iris.csv`. The Pandas library has an easy way to load in data, <span style="color:red">read_csv()</span>:

In [None]:
# Read the Iris CSV file into a pandas DataFrame.
data = pd.read_csv('./data/iris.csv')

# Sanity check: preview the first five rows to confirm the file parsed as expected.
print(data.head(n=5))

Because the dataset has been prepared so well, we don't need to do a lot of preprocessing. One thing we may want to do, though, is drop the "Id" column, as it is just a representation of the row the example is found on.

As this isn't helpful we could drop it from the dataset using the <span style="color:red">drop()</span> function.

We now need to define the features and labels. We can do this easily with Pandas by slicing the data table and choosing certain rows/columns with the <span style="color:red">iloc[]</span> indexer:

In [None]:
# Remove the 'Id' column; it is not used as a feature.
# Rebinding `data` (instead of dropping in place) together with errors='ignore'
# keeps this cell safe to re-run: a second run no longer raises KeyError
# when 'Id' has already been removed.
data = data.drop(columns='Id', errors='ignore')

# Pandas ".iloc" expects row_indexer, column_indexer:
# every row, every column except the last one.
# (assumes 'Species' is the last column of the CSV -- TODO confirm)
X = data.iloc[:, :-1].values
# Now let's tell the dataframe which column we want for the target/labels.
y = data['Species']

Now that we have the features and labels we want, we can split the data into training and testing sets using sklearn's handy <span style="color:red">train_test_split()</span> function:

In [None]:
# Alternate way of selecting columns by name. Label-based selection needs
# ".loc" (or plain data[[...]]); ".iloc" only accepts integer positions:
#X = data.loc[:, ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm']]

# Preview only the first few rows instead of dumping the whole dataset.
print(X[:5])
y.head()

In [None]:
# test_size is the fraction of the data set aside for the testing set (here 20%).
# random_state is a seed for the shuffle; keep it fixed to reproduce this exact split.
# train_test_split comes from sklearn.model_selection.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=27
)

# Sanity checks: the split must not lose rows, and features/labels must stay aligned.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

In [None]:
# Disabled scratch example: train_test_split applied to a toy 5x2 array.
# Left commented out because running it would rebind X, y and the
# train/test variables that the rest of the notebook relies on.
#import numpy as np
#from sklearn.model_selection import train_test_split

#X, y = np.arange(10).reshape((5, 2)), range(5)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
#X_train
#y_train
#train_test_split(y, shuffle=False)

Now we can instantiate the models. Let's try using two classifiers, a Support Vector Classifier and a K-Nearest Neighbors Classifier:

In [None]:
# K-Nearest Neighbors: n_neighbors is the number of nearby training points
# consulted when deciding which class a new point belongs to.
KNN_model = KNeighborsClassifier(n_neighbors=5)

# Support Vector Classifier, created with the library's default settings.
SVC_model = SVC()

Now let's fit the classifiers:

In [None]:
# Train both classifiers on the same training split.
SVC_model.fit(X=X_train, y=y_train)
KNN_model.fit(X=X_train, y=y_train)

The call has trained the model, so now we can predict and store the prediction in a variable:

In [None]:
# Predict labels for the held-out test features with each fitted model.
SVC_prediction = SVC_model.predict(X=X_test)
KNN_prediction = KNN_model.predict(X=X_test)

We should now evaluate how the classifier performed. There are multiple methods of evaluating a classifier's performance, and you can read more about these different methods below.

In Scikit-Learn you just pass in the predictions against the ground truth labels which were stored in your test labels:

In [None]:
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
# Accuracy is symmetric, but the confusion matrix (rows = true classes) and the
# classification report (precision vs. recall, support) are not, so the
# argument order matters.

# Accuracy score is the simplest way to evaluate
print("SVC accuracy:", accuracy_score(y_test, SVC_prediction))
print("KNN accuracy:", accuracy_score(y_test, KNN_prediction))
# But Confusion Matrix and Classification Report give more details about performance
print(confusion_matrix(y_test, SVC_prediction))
print(classification_report(y_test, KNN_prediction))

For reference, here's the output we got on the metrics:

SVC accuracy: 0.9333333333333333

KNN accuracy: 0.9666666666666667

At first glance, it seems KNN performed better. Here's the confusion matrix for SVC:

[[ 7  0  0]

 [ 0 10  1]

 [ 0  1 11]]