# Iris Plant Classification

The Iris dataset was used in R.A. Fisher's classic 1936 paper, "The Use of Multiple Measurements in Taxonomic Problems", and can also be found on the UCI Machine Learning Repository.

It includes three iris species with 50 samples each as well as some properties about each flower. One flower species is linearly separable from the other two, but the other two are not linearly separable from each other.

The columns in this dataset are:

- Id
- SepalLengthCm
- SepalWidthCm
- PetalLengthCm
- PetalWidthCm
- Species

Data source: https://www.kaggle.com/uciml/iris?select=Iris.csv

# Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the dataset

In [None]:
# Load the Iris data into a DataFrame; the CSV is read from the current working directory.
# NOTE(review): the Kaggle download linked above is named 'Iris.csv' — confirm the file
# name matches on case-sensitive file systems.
ds = pd.read_csv('iris.csv')

In [None]:
# Preview the first rows to check that the columns loaded as expected.
ds.head()

# Visualising the dataset

In [None]:
# Scatter plots for selected pairs of measurements, coloured by species.
# Each (x, y) pair is drawn on its own figure.
feature_pairs = [
    ('SepalWidthCm', 'SepalLengthCm'),
    ('PetalLengthCm', 'SepalWidthCm'),
    ('PetalLengthCm', 'PetalWidthCm'),
    ('SepalWidthCm', 'PetalWidthCm'),
]
for x_col, y_col in feature_pairs:
    sns.scatterplot(x=x_col, y=y_col, hue='Species', data=ds)
    plt.show()

In [None]:
# 2x2 grid of violin plots: one panel per measurement, grouped by species.
plt.figure(figsize=(10, 10))

measurements = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
for panel, column in enumerate(measurements, start=1):
    # Panels are numbered 1..4, filled row by row.
    plt.subplot(2, 2, panel)
    sns.violinplot(x='Species', y=column, hue='Species', data=ds)

In [None]:
# Pairwise plot matrix of the four measurements, coloured by species.
measurement_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
sns.pairplot(ds, hue='Species', vars=measurement_cols)

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 7))
# Correlate numeric columns only: 'Species' holds strings, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0 instead of silently skipping them.
numeric_ds = ds.select_dtypes(include='number')
sns.heatmap(numeric_ds.corr(), annot=True)

# Taking care of missing data

In [None]:
# Plot the null mask of the whole frame; a uniform heatmap means nothing is missing.
# We observe no missing data
missing_mask = ds.isnull()
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap="Blues")

In [None]:
# Build the feature matrix from the four flower measurements only.
# 'Id' is excluded: it is a row identifier, not a property of the flower, and
# keeping it would let the classifier learn from row position.
# NOTE(review): if the rows are ordered by species (believed true for the Kaggle
# file — confirm), 'Id' would leak the label almost perfectly.
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
X = ds[feature_columns].values
# Target vector: the species name of each sample.
y = ds['Species'].values

In [None]:
# Display the feature matrix to check its contents.
X

In [None]:
# Display the target vector (species names at this point, before encoding).
y

# Encoding Categorical Data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the species names in y with integer class labels.
# The fitted encoder is kept in `le` so labels can be mapped back to names later.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [None]:
# After encoding, the species names are replaced by integer labels:
# 'Iris-setosa' -> 0, 'Iris-versicolor' -> 1 and 'Iris-virginica' -> 2.
y

# Splitting the dataset into the training set and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed seed makes the split reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training set only, then apply the same
# transformation to both sets so the test data does not influence the scaling.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Fitting the K-Nearest Neighbours classifier to the training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier; the Minkowski metric with p = 2 is the Euclidean distance.
knn_params = {'n_neighbors': 5, 'p': 2, 'metric': 'minkowski'}
knn = KNeighborsClassifier(**knn_params)
knn.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test set with the fitted classifier

y_pred = knn.predict(X_test)

# Model Evaluation - Confusion Matrix and K-Fold Cross-Validation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

# Confusion matrix on the test set: rows are the true classes, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# Print the per-class classification report for the test-set predictions.
print(classification_report(y_test, y_pred))

In [None]:
# 10-fold cross-validation of the classifier on the training set.
# NOTE(review): X_train was standardised using all training rows before this
# point, so each fold's validation rows contributed to the scaling statistics.
accuracies = cross_val_score(knn, X_train, y_train, cv=10)
# Summarise the per-fold scores by their mean and standard deviation.
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

In [None]:
# Report the cross-validation mean and standard deviation, one value per line.
print(mean_accuracy, std_accuracy, sep='\n')