In [14]:
!/opt/venv/bin/python -m pip install --upgrade pip
# Handling pip upgrades

import pandas as pd
import numpy as np
import math
import operator

# Making plotly as the backend for pandas
!pip install plotly
pd.options.plotting.backend = "plotly"

# Setting the theme
import plotly.io as pio
import plotly.express as px
pio.templates.default = "plotly_white"

from sklearn.model_selection import train_test_split
import pprint

Requirement already up-to-date: pip in /opt/venv/lib/python3.7/site-packages (20.2.4)


In [2]:
iris_set = pd.read_csv("./iris/iris.data")
# add the column names, as it doesn't have it
attributes = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
iris_set.columns = attributes
iris_set = iris_set.sample(frac=1).reset_index(drop=True) # Shuffle
iris_set

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,4.8,3.4,1.6,0.2,Iris-setosa
1,6.3,2.9,5.6,1.8,Iris-virginica
2,5.0,3.4,1.5,0.2,Iris-setosa
3,5.5,3.5,1.3,0.2,Iris-setosa
4,7.7,2.8,6.7,2.0,Iris-virginica
...,...,...,...,...,...
144,5.0,3.3,1.4,0.2,Iris-setosa
145,5.5,2.5,4.0,1.3,Iris-versicolor
146,6.2,2.8,4.8,1.8,Iris-virginica
147,5.2,3.4,1.4,0.2,Iris-setosa


In [81]:
X = iris_set[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
X = np.array(X)
X[:3]

array([[4.8, 3.4, 1.6, 0.2],
       [6.3, 2.9, 5.6, 1.8],
       [5. , 3.4, 1.5, 0.2]])

In [83]:
Y = (iris_set['species'])
Y = np.array(Y)
Y[:3]

array(['Iris-setosa', 'Iris-virginica', 'Iris-setosa'], dtype=object)

In [8]:
print("First five rows")
print(iris_set.head())
print("*********")
print("columns",iris_set.columns)
print("*********")
print("shape:",iris_set.shape)
print("*********")
print("Size:",iris_set.size)
print("*********")
print("no of samples available for each type")
print(iris_set['species'].value_counts())
print("*********")
print(iris_set.describe())

First five rows
   sepal_length  sepal_width  petal_length  petal_width         species
0           4.8          3.4           1.6          0.2     Iris-setosa
1           6.3          2.9           5.6          1.8  Iris-virginica
2           5.0          3.4           1.5          0.2     Iris-setosa
3           5.5          3.5           1.3          0.2     Iris-setosa
4           7.7          2.8           6.7          2.0  Iris-virginica
*********
columns Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')
*********
shape: (149, 5)
*********
Size: 745
*********
no of samples available for each type
Iris-virginica     50
Iris-versicolor    50
Iris-setosa        49
Name: species, dtype: int64
*********
       sepal_length  sepal_width  petal_length  petal_width
count    149.000000   149.000000    149.000000   149.000000
mean       5.848322     3.051007      3.774497     1.205369
std        0.828594     0.433499      1.759651

In [12]:
def euclidianDistance(data1, data2, length):
    distance = 0
    for x in range(length):
        distance += np.square(data1[x] - data2[x])
    return np.sqrt(distance)

In [99]:
def knn(trainingSet, testInstance, k): 
    distances = {}
    sort = {}
    length = np.shape(testInstance)[1]
    print(length)

    # Calculating euclidean distance between each row of training data and test data
    for x in range(len(trainingSet)):
        dist = euclidianDistance(testInstance, trainingSet.iloc[x], length)
        distances[x] = dist[0]

    # Sorting them on the basis of distance
    sorted_d = sorted(distances.items(), key=operator.itemgetter(1)) #by using it we store indices also
    sorted_d1 = sorted(distances.items())
    print(sorted_d[:5])
    print(sorted_d1[:5])
    neighbors = []

    # Extracting top k neighbors
    for x in range(k):
        neighbors.append(sorted_d[x][0])
        counts = {"Iris-setosa":0,"Iris-versicolor":0,"Iris-virginica":0}    
    # Calculating the most freq class in the neighbors
    
    for x in range(len(neighbors)):
        response = trainingSet.iloc[neighbors[x]][-1]
        if response in counts:
            counts[response] += 1
        else:
            counts[response] = 1
    
    print(counts)
    sortedVotes = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    print(sortedVotes)
    
    return(sortedVotes[0][0], neighbors)

In [113]:
testSet = [[1.4, 3.6, 3.4, 1.2]]
test = pd.DataFrame(testSet)
result,neigh = knn(iris_set, test, 4)#here we gave k=4

print("And the flower is:",result)

4
[(100, 3.706750598570128), (24, 3.8065732621348567), (140, 3.817066936798463), (88, 3.8340579025361627), (17, 3.8431757701151272)]
[(0, 3.97994974842648), (1, 5.449770637375486), (2, 4.196427051671457), (3, 4.714870093650513), (4, 7.201388754955534)]
{'Iris-setosa': 2, 'Iris-versicolor': 2, 'Iris-virginica': 0}
[('Iris-setosa', 2), ('Iris-versicolor', 2), ('Iris-virginica', 0)]
And the flower is: Iris-setosa


In [85]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)

In [115]:
# calculating the predictions
print(type(testSet))
pred = []
for instance in X_test:
    result,neigh = knn(iris_set,pd.DataFrame([instance]), 4)
    pred.append(result)
pred
# print([instance])

<class 'list'>
4
[(50, 0.0), (16, 0.17320508075688787), (129, 0.360555127546399), (5, 0.3605551275463994), (86, 0.3741657386773939)]
[(0, 4.748684028233507), (1, 0.7280109889280522), (2, 4.7455242070818695), (3, 4.747630988187688), (4, 1.5588457268119895)]
{'Iris-setosa': 0, 'Iris-versicolor': 0, 'Iris-virginica': 4}
[('Iris-virginica', 4), ('Iris-setosa', 0), ('Iris-versicolor', 0)]
4
[(29, 0.0), (93, 0.26457513110645936), (25, 0.45825756949558427), (64, 0.46904157598234325), (137, 0.5099019513592784)]
[(0, 5.500909015790027), (1, 1.2165525060596445), (2, 5.492722457943784), (3, 5.479963503528103), (4, 0.6782329983125273)]
{'Iris-setosa': 0, 'Iris-versicolor': 0, 'Iris-virginica': 4}
[('Iris-virginica', 4), ('Iris-setosa', 0), ('Iris-versicolor', 0)]
4
[(145, 0.0), (46, 0.20000000000000018), (117, 0.24494897427831766), (73, 0.3000000000000001), (75, 0.30000000000000016)]
[(0, 2.875760768909681), (1, 1.8999999999999997), (2, 2.9189039038652846), (3, 3.0822070014844885), (4, 3.565108693

['Iris-virginica',
 'Iris-virginica',
 'Iris-versicolor',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-versicolor',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-virginica',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-setosa']

In [118]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         7
Iris-versicolor       1.00      1.00      1.00         6
 Iris-virginica       1.00      1.00      1.00        10

       accuracy                           1.00        23
      macro avg       1.00      1.00      1.00        23
   weighted avg       1.00      1.00      1.00        23

