## KNN implementation on Iris Data Set (Standardized)

## Library

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import math 

## Data Set

In [2]:
from sklearn.datasets import load_iris

In [3]:
iris = load_iris()

In [4]:
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [5]:
type(iris)

sklearn.utils._bunch.Bunch

In [6]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [7]:
print(iris['DESCR'])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [8]:
iris['data']

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [9]:
iris['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [10]:
iris['target_names']

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [11]:
iris['target']

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [12]:
df = pd.DataFrame(iris['data'],columns=iris['feature_names'])
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [13]:
df['target'] = iris['target']

In [14]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [15]:
df.columns 

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

In [16]:
n = len(df.columns)
n

5

In [17]:
df[df.columns[n-1]].value_counts()

0    50
1    50
2    50
Name: target, dtype: int64

In [18]:
df[df.columns[n-1]].value_counts().index

Int64Index([0, 1, 2], dtype='int64')

In [19]:
df[df.columns[n-1]].nunique()

3

In [20]:
# custom_palette = sns.color_palette("Set1",10)
# sns.palplot(custom_palette)

https://www.codecademy.com/article/seaborn-design-ii

In [21]:
# custom_palette = sns.color_palette("Set1", df[df.columns[2]].nunique()+1)
# color_dict = dict()
# markers_dict = dict()
# j = 0

# for i in df[df.columns[2]].value_counts().index:
#     color_dict[i] = custom_palette[j]
#     markers_dict[i] = 'o'
#     j = j + 1

# color_dict['Test Point'] = custom_palette[2]
# markers_dict['Test Point'] = 'X'

# # print(color_dict)
# # print(markers_dict)

# sns.scatterplot(x = df[df.columns[0]],y = df[df.columns[1]],hue=df[df.columns[2]],palette=color_dict,style=df[df.columns[2]],markers=markers_dict)
# plt.title('Full Data Points')
# plt.legend(loc=(1.05,0.75))

In [22]:
df.columns[:n-1]

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [23]:
df[df.columns[:n-1]]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


## Standardization of Data

In [24]:
original_data = df[df.columns[:n-1]]

original_data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [25]:
original_data.describe().loc[['mean','std']]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238


In [26]:
# just counting if any mean or std of any band or feature is 0

count = 0
for i in original_data.describe().loc[['mean','std']].loc['mean']:
    if i ==0 :
        count = count + 1
print(count)

count = 0
for i in original_data.describe().loc[['mean','std']].loc['std']:
    if i == 0 :
        count = count + 1
print(count)

0
0


In [27]:
# standardizing the data

l = list()
data = dict()

for i in original_data.columns:
    mean = original_data[i].mean()
    std = original_data[i].std()

    for x in original_data[i]:
        z = (x - mean) / std
        l.append(z)

    data[i] = l
    l = list()

data

{'sepal length (cm)': [-0.8976738791967672,
  -1.1392004834649543,
  -1.3807270877331426,
  -1.5014903898672372,
  -1.0184371813308608,
  -0.5353839727944845,
  -1.5014903898672372,
  -1.0184371813308608,
  -1.7430169941354243,
  -1.1392004834649543,
  -0.5353839727944845,
  -1.259963785599049,
  -1.259963785599049,
  -1.863780296269519,
  -0.05233076425810914,
  -0.1730940663922027,
  -0.5353839727944845,
  -0.8976738791967672,
  -0.1730940663922027,
  -0.8976738791967672,
  -0.5353839727944845,
  -0.8976738791967672,
  -1.5014903898672372,
  -0.8976738791967672,
  -1.259963785599049,
  -1.0184371813308608,
  -1.0184371813308608,
  -0.7769105770626726,
  -0.7769105770626726,
  -1.3807270877331426,
  -1.259963785599049,
  -0.5353839727944845,
  -0.7769105770626726,
  -0.4146206706603909,
  -1.1392004834649543,
  -1.0184371813308608,
  -0.4146206706603909,
  -1.1392004834649543,
  -1.7430169941354243,
  -0.8976738791967672,
  -1.0184371813308608,
  -1.6222536920013308,
  -1.743016994135

In [28]:
standardized_data = pd.DataFrame(data)

standardized_data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.897674,1.015602,-1.335752,-1.311052
1,-1.139200,-0.131539,-1.335752,-1.311052
2,-1.380727,0.327318,-1.392399,-1.311052
3,-1.501490,0.097889,-1.279104,-1.311052
4,-1.018437,1.245030,-1.335752,-1.311052
...,...,...,...,...
145,1.034539,-0.131539,0.816859,1.443994
146,0.551486,-1.278680,0.703564,0.919223
147,0.793012,-0.131539,0.816859,1.050416
148,0.430722,0.786174,0.930154,1.443994


In [29]:
standardized_data.describe().loc[['mean','std']]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
mean,-1.457168e-15,-1.638319e-15,-1.2923e-15,-5.543714e-16
std,1.0,1.0,1.0,1.0


In [30]:
column_name = df.columns[n-1]
standardized_data[column_name] = df[column_name]
standardized_data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,-0.897674,1.015602,-1.335752,-1.311052,0
1,-1.139200,-0.131539,-1.335752,-1.311052,0
2,-1.380727,0.327318,-1.392399,-1.311052,0
3,-1.501490,0.097889,-1.279104,-1.311052,0
4,-1.018437,1.245030,-1.335752,-1.311052,0
...,...,...,...,...,...
145,1.034539,-0.131539,0.816859,1.443994,2
146,0.551486,-1.278680,0.703564,0.919223,2
147,0.793012,-0.131539,0.816859,1.050416,2
148,0.430722,0.786174,0.930154,1.443994,2


In [31]:
df = standardized_data
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,-0.897674,1.015602,-1.335752,-1.311052,0
1,-1.139200,-0.131539,-1.335752,-1.311052,0
2,-1.380727,0.327318,-1.392399,-1.311052,0
3,-1.501490,0.097889,-1.279104,-1.311052,0
4,-1.018437,1.245030,-1.335752,-1.311052,0
...,...,...,...,...,...
145,1.034539,-0.131539,0.816859,1.443994,2
146,0.551486,-1.278680,0.703564,0.919223,2
147,0.793012,-0.131539,0.816859,1.050416,2
148,0.430722,0.786174,0.930154,1.443994,2


## Determing the value of K

In [32]:
len(df)

150

In [33]:
k = math.floor(math.sqrt(len(df)))

if k%2==0 :
    k = k + 1

print(k)

13


## Train Test split

In [34]:
train_percentage = 80
test_percentage = 100 - train_percentage

print('Train Percentage :',train_percentage)
print('Test Percentage :',test_percentage)

Train Percentage : 80
Test Percentage : 20


In [35]:
no_of_train_data = math.ceil((train_percentage * len(df)) / 100)
print('No of train data :',no_of_train_data)

no_of_test_data = len(df) - no_of_train_data
print('No of test data',no_of_test_data)

No of train data : 120
No of test data 30


In [36]:
df.head(no_of_train_data)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,-0.897674,1.015602,-1.335752,-1.311052,0
1,-1.139200,-0.131539,-1.335752,-1.311052,0
2,-1.380727,0.327318,-1.392399,-1.311052,0
3,-1.501490,0.097889,-1.279104,-1.311052,0
4,-1.018437,1.245030,-1.335752,-1.311052,0
...,...,...,...,...,...
115,0.672249,0.327318,0.873507,1.443994,2
116,0.793012,-0.131539,0.986802,0.788031,2
117,2.242172,1.703886,1.666574,1.312801,2
118,2.242172,-1.049251,1.779869,1.443994,2


In [37]:
df_train = df.head(no_of_train_data)
df_train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,-0.897674,1.015602,-1.335752,-1.311052,0
1,-1.139200,-0.131539,-1.335752,-1.311052,0
2,-1.380727,0.327318,-1.392399,-1.311052,0
3,-1.501490,0.097889,-1.279104,-1.311052,0
4,-1.018437,1.245030,-1.335752,-1.311052,0
...,...,...,...,...,...
115,0.672249,0.327318,0.873507,1.443994,2
116,0.793012,-0.131539,0.986802,0.788031,2
117,2.242172,1.703886,1.666574,1.312801,2
118,2.242172,-1.049251,1.779869,1.443994,2


In [38]:
df.tail(no_of_test_data)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
120,1.276066,0.327318,1.100097,1.443994,2
121,-0.293857,-0.590395,0.646916,1.050416,2
122,2.242172,-0.590395,1.666574,1.050416,2
123,0.551486,-0.819823,0.646916,0.788031,2
124,1.034539,0.556746,1.100097,1.181609,2
125,1.638355,0.327318,1.27004,0.788031,2
126,0.430722,-0.590395,0.590269,0.788031,2
127,0.309959,-0.131539,0.646916,0.788031,2
128,0.672249,-0.590395,1.04345,1.181609,2
129,1.638355,-0.131539,1.156745,0.525645,2


In [39]:
df_test = df.tail(no_of_test_data)
df_test

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
120,1.276066,0.327318,1.100097,1.443994,2
121,-0.293857,-0.590395,0.646916,1.050416,2
122,2.242172,-0.590395,1.666574,1.050416,2
123,0.551486,-0.819823,0.646916,0.788031,2
124,1.034539,0.556746,1.100097,1.181609,2
125,1.638355,0.327318,1.27004,0.788031,2
126,0.430722,-0.590395,0.590269,0.788031,2
127,0.309959,-0.131539,0.646916,0.788031,2
128,0.672249,-0.590395,1.04345,1.181609,2
129,1.638355,-0.131539,1.156745,0.525645,2


In [40]:
df_test = df_test.reset_index()
df_test

Unnamed: 0,index,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,120,1.276066,0.327318,1.100097,1.443994,2
1,121,-0.293857,-0.590395,0.646916,1.050416,2
2,122,2.242172,-0.590395,1.666574,1.050416,2
3,123,0.551486,-0.819823,0.646916,0.788031,2
4,124,1.034539,0.556746,1.100097,1.181609,2
5,125,1.638355,0.327318,1.27004,0.788031,2
6,126,0.430722,-0.590395,0.590269,0.788031,2
7,127,0.309959,-0.131539,0.646916,0.788031,2
8,128,0.672249,-0.590395,1.04345,1.181609,2
9,129,1.638355,-0.131539,1.156745,0.525645,2


In [41]:
df_test = df_test.drop('index',axis=1)
df_test

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,1.276066,0.327318,1.100097,1.443994,2
1,-0.293857,-0.590395,0.646916,1.050416,2
2,2.242172,-0.590395,1.666574,1.050416,2
3,0.551486,-0.819823,0.646916,0.788031,2
4,1.034539,0.556746,1.100097,1.181609,2
5,1.638355,0.327318,1.27004,0.788031,2
6,0.430722,-0.590395,0.590269,0.788031,2
7,0.309959,-0.131539,0.646916,0.788031,2
8,0.672249,-0.590395,1.04345,1.181609,2
9,1.638355,-0.131539,1.156745,0.525645,2


In [42]:
# df_temp = df.copy()
# df_temp

In [43]:
# df_temp['Cancer Present'][no_of_train_data:] = ['Test Point'] * no_of_test_data
# df_temp

In [44]:
# sns.scatterplot(x = df_temp[df_temp.columns[0]],y = df_temp[df_temp.columns[1]],hue=df_temp[df_temp.columns[2]],palette=color_dict,style=df_temp[df_temp.columns[2]],markers=markers_dict)
# plt.title('Train Data Points and Test Data Points')
# plt.legend(loc=(1.05,0.75))

# # hue without palette : sns will provide default color for each group or class in df_temp['Cancer Present]
# # huw with palette : sns will provide color we want for each group or class in df_temp['Cancer Present]
# # style without markers : sns will provide default shape for each group or class in df_temp['Cancer Present]
# # style with markers : sns will provide shape we want for each group or class in df_temp['Cancer Present]


## Calculating Euclean Distance from Test point to Train point , sorting it ascending order and then finding the nearest neighbor

In [45]:
df_train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,-0.897674,1.015602,-1.335752,-1.311052,0
1,-1.139200,-0.131539,-1.335752,-1.311052,0
2,-1.380727,0.327318,-1.392399,-1.311052,0
3,-1.501490,0.097889,-1.279104,-1.311052,0
4,-1.018437,1.245030,-1.335752,-1.311052,0
...,...,...,...,...,...
115,0.672249,0.327318,0.873507,1.443994,2
116,0.793012,-0.131539,0.986802,0.788031,2
117,2.242172,1.703886,1.666574,1.312801,2
118,2.242172,-1.049251,1.779869,1.443994,2


In [46]:
df_test

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,1.276066,0.327318,1.100097,1.443994,2
1,-0.293857,-0.590395,0.646916,1.050416,2
2,2.242172,-0.590395,1.666574,1.050416,2
3,0.551486,-0.819823,0.646916,0.788031,2
4,1.034539,0.556746,1.100097,1.181609,2
5,1.638355,0.327318,1.27004,0.788031,2
6,0.430722,-0.590395,0.590269,0.788031,2
7,0.309959,-0.131539,0.646916,0.788031,2
8,0.672249,-0.590395,1.04345,1.181609,2
9,1.638355,-0.131539,1.156745,0.525645,2


In [47]:
df_test.iloc[0]

sepal length (cm)    1.276066
sepal width (cm)     0.327318
petal length (cm)    1.100097
petal width (cm)     1.443994
target               2.000000
Name: 0, dtype: float64

In [48]:
df_train.iloc[0][0]

-0.8976738791967672

In [49]:
df_train[df.columns[n-1]][0]

0

In [50]:
n

5

In [51]:
df_train.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

In [52]:
class_name_list = list()

# calculating euclidean distance from test data to train data

for i in range(len(df_test)):
    distance_list = list()
    for j in range(len(df_train)):
        distance = np.linalg.norm(df_test.iloc[i][:n-1]-df_train.iloc[j][:n-1])
        distance_list.append((df_train[df_train.columns[n-1]][j],distance))    

    # finding the first k smallest distances without sorting and counting the class

    minimum = None
    checked = dict()
    count = dict()
    min_list = list()

    for ii in range(k):
        for jj in range(len(distance_list)):
            if jj not in checked :
                if minimum is None :
                    minimum = distance_list[jj]
                elif distance_list[jj][1] < minimum[1]:
                    minimum = distance_list[jj]
                    index = jj
        min_list.append(minimum)
        if minimum[0] not in count:
            count[minimum[0]] = 1
        else:
            count[minimum[0]] = count[minimum[0]] + 1
        checked[index] = 1
        minimum = None
    

    # finding out the most nearest class
    
    min = 0
    for ii in count:
        if count[ii] > min:
            class_name = ii
            min = count[ii]
    class_name_list.append(class_name)
    
print(class_name_list)

[2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1]


In [53]:
print(len(class_name_list))

30


In [54]:
# df_temp2 = df.copy()
# df_temp2

In [55]:
# df_temp2[df_temp2.columns[n-1]][no_of_train_data:] = class_name_list
# df_temp2

In [56]:
# color_dict

In [57]:
# sns.scatterplot(x = df_temp2[df_temp2.columns[0]],y = df_temp2[df_temp2.columns[1]],hue=df_temp2[df_temp2.columns[2]],palette=color_dict,style=df_temp2[df_temp2.columns[2]],markers=markers_dict)
# plt.title('Final Data Points')
# plt.legend(loc=(1.05,0.75))

In [58]:
# plt.figure(figsize=(15,6))

# plt.subplot(1, 2, 1) # row 1, col 2 index 1

# sns.scatterplot(x = df_temp[df_temp.columns[0]],y = df_temp[df_temp.columns[1]],hue=df_temp[df_temp.columns[2]],palette=color_dict,style=df_temp[df_temp.columns[2]],markers=markers_dict)
# plt.title('Train Data Points and Test Data Points')

# # hue without palette : sns will provide default color for each group or class in df_temp['Cancer Present]
# # huw with palette : sns will provide color we want for each group or class in df_temp['Cancer Present]
# # style without markers : sns will provide default shape for each group or class in df_temp['Cancer Present]
# # style with markers : sns will provide shape we want for each group or class in df_temp['Cancer Present]





# plt.subplot(1, 2, 2) # index 2


# sns.scatterplot(x = df_temp2[df_temp2.columns[0]],y = df_temp2[df_temp2.columns[1]],hue=df_temp2[df_temp2.columns[2]],palette=color_dict,style=df_temp2[df_temp2.columns[2]],markers=markers_dict)
# plt.title('Final Data Points')

In [59]:
right = 0
wrong = 0

for i in range(len(class_name_list)):
    if class_name_list[i] == df_test[df_test.columns[n-1]][i]:
        right = right + 1
    else:
        wrong = wrong + 1

print(right,wrong)

19 11


In [60]:
accuarcy = (right * 100) / len(class_name_list)
print(accuarcy)

63.333333333333336
