## KNN implementation on Cancer Data Set (Standardized)

## Library

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt  
import math 

## Data Set

In [2]:
from sklearn.datasets import load_breast_cancer

In [3]:
cancer = load_breast_cancer()

In [4]:
cancer

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [5]:
type(cancer)

sklearn.utils._bunch.Bunch

In [6]:
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [7]:
print(cancer['DESCR'])

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [8]:
cancer['data']

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [9]:
cancer['feature_names']

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [10]:
cancer['target_names']

array(['malignant', 'benign'], dtype='<U9')

In [11]:
cancer['target']

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [12]:
df = pd.DataFrame(cancer['data'],columns=cancer['feature_names'])
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [13]:
df['Breast Cancer Type'] = cancer['target']

In [14]:
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Breast Cancer Type
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [15]:
df.columns 

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'Breast Cancer Type'],
      dtype='object')

In [16]:
n = len(df.columns)
n

31

In [17]:
df[df.columns[n-1]].value_counts()

1    357
0    212
Name: Breast Cancer Type, dtype: int64

In [18]:
df[df.columns[n-1]].value_counts().index

Int64Index([1, 0], dtype='int64')

In [19]:
df[df.columns[n-1]].nunique()

2

In [20]:
# custom_palette = sns.color_palette("Set1",10)
# sns.palplot(custom_palette)

https://www.codecademy.com/article/seaborn-design-ii

In [21]:
# custom_palette = sns.color_palette("Set1", df[df.columns[2]].nunique()+1)
# color_dict = dict()
# markers_dict = dict()
# j = 0

# for i in df[df.columns[2]].value_counts().index:
#     color_dict[i] = custom_palette[j]
#     markers_dict[i] = 'o'
#     j = j + 1

# color_dict['Test Point'] = custom_palette[2]
# markers_dict['Test Point'] = 'X'

# # print(color_dict)
# # print(markers_dict)

# sns.scatterplot(x = df[df.columns[0]],y = df[df.columns[1]],hue=df[df.columns[2]],palette=color_dict,style=df[df.columns[2]],markers=markers_dict)
# plt.title('Full Data Points')
# plt.legend(loc=(1.05,0.75))

In [22]:
df.columns[:n-1]

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [23]:
df[df.columns[:n-1]]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


## Standardization of Data

In [24]:
original_data = df[df.columns[:n-1]]

original_data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [25]:
original_data.describe().loc[['mean','std']]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061


In [26]:
# just counting if any mean or std of any band or feature is 0

count = 0
for i in original_data.describe().loc[['mean','std']].loc['mean']:
    if i ==0 :
        count = count + 1
print(count)

count = 0
for i in original_data.describe().loc[['mean','std']].loc['std']:
    if i == 0 :
        count = count + 1
print(count)

0
0


In [27]:
# standardizing the data

l = list()
data = dict()

for i in original_data.columns:
    mean = original_data[i].mean()
    std = original_data[i].std()

    for x in original_data[i]:
        z = (x - mean) / std
        l.append(z)

    data[i] = l
    l = list()

data

{'mean radius': [1.096099529431712,
  1.8282119737343598,
  1.5784992020342323,
  -0.7682333229203782,
  1.7487579100115918,
  -0.4759558742259106,
  1.1698783028885684,
  -0.11841258747345444,
  -0.31988539191333054,
  -0.4731182290929542,
  0.5370834382393807,
  0.4689799550484367,
  1.4309416551205205,
  0.4888434709791287,
  -0.11273729720754257,
  0.1171119585618929,
  0.156838990423277,
  0.5682975347018965,
  1.6125509436297036,
  -0.16665255473370688,
  -0.29718423084968254,
  -1.3119261303947476,
  0.34412356919837295,
  1.9956330365787638,
  0.7158550816156083,
  0.8548996931304528,
  0.12846253909371713,
  1.2720335276749841,
  0.3327729886665492,
  0.9769184338475606,
  1.277708817940896,
  -0.6490522273362264,
  0.8208479515349805,
  1.45931810645008,
  0.5682975347018965,
  0.7413938878122122,
  0.0348202497061692,
  -0.3113724565144627,
  0.2448059895449131,
  -0.1836784255314425,
  -0.19502900606326676,
  -0.9016026441693102,
  1.4025652037909602,
  -0.2404313281905628,

In [28]:
standardized_data = pd.DataFrame(data)

standardized_data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,1.096100,-2.071512,1.268817,0.983510,1.567087,3.280628,2.650542,2.530249,2.215566,2.253764,...,1.885031,-1.358098,2.301575,1.999478,1.306537,2.614365,2.107672,2.294058,2.748204,1.935312
1,1.828212,-0.353322,1.684473,1.907030,-0.826235,-0.486643,-0.023825,0.547662,0.001391,-0.867889,...,1.804340,-0.368879,1.533776,1.888827,-0.375282,-0.430066,-0.146620,1.086129,-0.243675,0.280943
2,1.578499,0.455786,1.565126,1.557513,0.941382,1.052000,1.362280,2.035440,0.938859,-0.397658,...,1.510541,-0.023953,1.346291,1.455004,0.526944,1.081980,0.854222,1.953282,1.151242,0.201214
3,-0.768233,0.253509,-0.592166,-0.763792,3.280667,3.399917,1.914213,1.450431,2.864862,4.906602,...,-0.281217,0.133866,-0.249720,-0.549538,3.391291,3.889975,1.987839,2.173873,6.040726,4.930672
4,1.748758,-1.150804,1.775011,1.824624,0.280125,0.538866,1.369806,1.427237,-0.009552,-0.561956,...,1.297434,-1.465481,1.337363,1.219651,0.220362,-0.313119,0.612640,0.728618,-0.867590,-0.396751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,2.109139,0.720838,2.058974,2.341795,1.040926,0.218868,1.945573,2.318924,-0.312314,-0.930209,...,1.899514,0.117596,1.751022,2.013529,0.378033,-0.273077,0.663928,1.627719,-1.358963,-0.708467
565,1.703356,2.083301,1.614511,1.722326,0.102368,-0.017817,0.692434,1.262558,-0.217473,-1.057681,...,1.535369,2.045599,1.420690,1.493644,-0.690623,-0.394473,0.236365,0.733182,-0.531387,-0.973122
566,0.701667,2.043775,0.672084,0.577445,-0.839745,-0.038646,0.046547,0.105684,-0.808406,-0.894800,...,0.560868,1.373645,0.578492,0.427529,-0.808876,0.350427,0.326479,0.413705,-1.103578,-0.318129
567,1.836725,2.334403,1.980781,1.733693,1.524426,3.269267,3.294046,2.656528,2.135315,1.042778,...,1.959515,2.235958,2.301575,1.651717,1.429169,3.901415,3.194794,2.287972,1.917396,2.217684


In [29]:
standardized_data.describe().loc[['mean','std']]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
mean,-3.142575e-15,-6.558316e-15,-7.012551e-16,-8.339355e-16,6.083788e-15,-1.081346e-15,-3.703345e-16,9.935423e-16,-1.88855e-15,-1.424363e-15,...,-2.346102e-15,1.761138e-15,-1.214416e-15,5.919889e-16,-5.036783e-15,-2.118204e-15,6.899382e-16,-1.73265e-16,-2.454417e-15,2.438979e-15
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
column_name = df.columns[n-1]
standardized_data[column_name] = df[column_name]
standardized_data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Breast Cancer Type
0,1.096100,-2.071512,1.268817,0.983510,1.567087,3.280628,2.650542,2.530249,2.215566,2.253764,...,-1.358098,2.301575,1.999478,1.306537,2.614365,2.107672,2.294058,2.748204,1.935312,0
1,1.828212,-0.353322,1.684473,1.907030,-0.826235,-0.486643,-0.023825,0.547662,0.001391,-0.867889,...,-0.368879,1.533776,1.888827,-0.375282,-0.430066,-0.146620,1.086129,-0.243675,0.280943,0
2,1.578499,0.455786,1.565126,1.557513,0.941382,1.052000,1.362280,2.035440,0.938859,-0.397658,...,-0.023953,1.346291,1.455004,0.526944,1.081980,0.854222,1.953282,1.151242,0.201214,0
3,-0.768233,0.253509,-0.592166,-0.763792,3.280667,3.399917,1.914213,1.450431,2.864862,4.906602,...,0.133866,-0.249720,-0.549538,3.391291,3.889975,1.987839,2.173873,6.040726,4.930672,0
4,1.748758,-1.150804,1.775011,1.824624,0.280125,0.538866,1.369806,1.427237,-0.009552,-0.561956,...,-1.465481,1.337363,1.219651,0.220362,-0.313119,0.612640,0.728618,-0.867590,-0.396751,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,2.109139,0.720838,2.058974,2.341795,1.040926,0.218868,1.945573,2.318924,-0.312314,-0.930209,...,0.117596,1.751022,2.013529,0.378033,-0.273077,0.663928,1.627719,-1.358963,-0.708467,0
565,1.703356,2.083301,1.614511,1.722326,0.102368,-0.017817,0.692434,1.262558,-0.217473,-1.057681,...,2.045599,1.420690,1.493644,-0.690623,-0.394473,0.236365,0.733182,-0.531387,-0.973122,0
566,0.701667,2.043775,0.672084,0.577445,-0.839745,-0.038646,0.046547,0.105684,-0.808406,-0.894800,...,1.373645,0.578492,0.427529,-0.808876,0.350427,0.326479,0.413705,-1.103578,-0.318129,0
567,1.836725,2.334403,1.980781,1.733693,1.524426,3.269267,3.294046,2.656528,2.135315,1.042778,...,2.235958,2.301575,1.651717,1.429169,3.901415,3.194794,2.287972,1.917396,2.217684,0


In [31]:
df = standardized_data
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Breast Cancer Type
0,1.096100,-2.071512,1.268817,0.983510,1.567087,3.280628,2.650542,2.530249,2.215566,2.253764,...,-1.358098,2.301575,1.999478,1.306537,2.614365,2.107672,2.294058,2.748204,1.935312,0
1,1.828212,-0.353322,1.684473,1.907030,-0.826235,-0.486643,-0.023825,0.547662,0.001391,-0.867889,...,-0.368879,1.533776,1.888827,-0.375282,-0.430066,-0.146620,1.086129,-0.243675,0.280943,0
2,1.578499,0.455786,1.565126,1.557513,0.941382,1.052000,1.362280,2.035440,0.938859,-0.397658,...,-0.023953,1.346291,1.455004,0.526944,1.081980,0.854222,1.953282,1.151242,0.201214,0
3,-0.768233,0.253509,-0.592166,-0.763792,3.280667,3.399917,1.914213,1.450431,2.864862,4.906602,...,0.133866,-0.249720,-0.549538,3.391291,3.889975,1.987839,2.173873,6.040726,4.930672,0
4,1.748758,-1.150804,1.775011,1.824624,0.280125,0.538866,1.369806,1.427237,-0.009552,-0.561956,...,-1.465481,1.337363,1.219651,0.220362,-0.313119,0.612640,0.728618,-0.867590,-0.396751,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,2.109139,0.720838,2.058974,2.341795,1.040926,0.218868,1.945573,2.318924,-0.312314,-0.930209,...,0.117596,1.751022,2.013529,0.378033,-0.273077,0.663928,1.627719,-1.358963,-0.708467,0
565,1.703356,2.083301,1.614511,1.722326,0.102368,-0.017817,0.692434,1.262558,-0.217473,-1.057681,...,2.045599,1.420690,1.493644,-0.690623,-0.394473,0.236365,0.733182,-0.531387,-0.973122,0
566,0.701667,2.043775,0.672084,0.577445,-0.839745,-0.038646,0.046547,0.105684,-0.808406,-0.894800,...,1.373645,0.578492,0.427529,-0.808876,0.350427,0.326479,0.413705,-1.103578,-0.318129,0
567,1.836725,2.334403,1.980781,1.733693,1.524426,3.269267,3.294046,2.656528,2.135315,1.042778,...,2.235958,2.301575,1.651717,1.429169,3.901415,3.194794,2.287972,1.917396,2.217684,0


## Determing the value of K

In [32]:
len(df)

569

In [33]:
k = math.floor(math.sqrt(len(df)))

if k%2==0 :
    k = k + 1

print(k)

23


## Train Test split

In [34]:
train_percentage = 70
test_percentage = 100 - train_percentage

print('Train Percentage :',train_percentage)
print('Test Percentage :',test_percentage)

Train Percentage : 70
Test Percentage : 30


In [35]:
no_of_train_data = math.ceil((train_percentage * len(df)) / 100)
print('No of train data :',no_of_train_data)

no_of_test_data = len(df) - no_of_train_data
print('No of test data',no_of_test_data)

No of train data : 399
No of test data 170


In [36]:
df.head(no_of_train_data)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Breast Cancer Type
0,1.096100,-2.071512,1.268817,0.983510,1.567087,3.280628,2.650542,2.530249,2.215566,2.253764,...,-1.358098,2.301575,1.999478,1.306537,2.614365,2.107672,2.294058,2.748204,1.935312,0
1,1.828212,-0.353322,1.684473,1.907030,-0.826235,-0.486643,-0.023825,0.547662,0.001391,-0.867889,...,-0.368879,1.533776,1.888827,-0.375282,-0.430066,-0.146620,1.086129,-0.243675,0.280943,0
2,1.578499,0.455786,1.565126,1.557513,0.941382,1.052000,1.362280,2.035440,0.938859,-0.397658,...,-0.023953,1.346291,1.455004,0.526944,1.081980,0.854222,1.953282,1.151242,0.201214,0
3,-0.768233,0.253509,-0.592166,-0.763792,3.280667,3.399917,1.914213,1.450431,2.864862,4.906602,...,0.133866,-0.249720,-0.549538,3.391291,3.889975,1.987839,2.173873,6.040726,4.930672,0
4,1.748758,-1.150804,1.775011,1.824624,0.280125,0.538866,1.369806,1.427237,-0.009552,-0.561956,...,-1.465481,1.337363,1.219651,0.220362,-0.313119,0.612640,0.728618,-0.867590,-0.396751,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,-0.575273,-0.364947,-0.572001,-0.593011,0.464993,-0.128018,-0.513916,-0.403557,0.457358,-0.168208,...,0.019976,-0.563386,-0.563940,0.474387,-0.489175,-0.536316,-0.790269,0.239616,-0.727292,1
395,-0.019095,-0.490498,-0.091322,-0.130114,-1.131267,-0.960582,-0.777590,-0.422885,-0.622371,-0.730502,...,-0.054866,-0.322631,-0.344394,-1.128596,-0.833659,-0.899169,-0.540012,-0.610589,-0.988625,1
396,-0.175165,-0.092919,-0.159226,-0.275036,0.678301,0.196146,-0.037623,0.126044,-0.020495,-0.284349,...,0.247757,-0.295549,-0.360728,0.456868,0.017383,0.343256,0.466951,-0.379449,-0.392321,1
397,-0.376638,-0.425397,-0.367054,-0.416548,-1.131978,-0.291425,-0.186896,-0.208468,-0.866769,-0.750331,...,-0.751225,-0.492261,-0.508614,-1.621754,-0.464387,-0.393475,-0.481441,-1.475340,-0.742795,1


In [37]:
df_train = df.head(no_of_train_data)
df_train

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Breast Cancer Type
0,1.096100,-2.071512,1.268817,0.983510,1.567087,3.280628,2.650542,2.530249,2.215566,2.253764,...,-1.358098,2.301575,1.999478,1.306537,2.614365,2.107672,2.294058,2.748204,1.935312,0
1,1.828212,-0.353322,1.684473,1.907030,-0.826235,-0.486643,-0.023825,0.547662,0.001391,-0.867889,...,-0.368879,1.533776,1.888827,-0.375282,-0.430066,-0.146620,1.086129,-0.243675,0.280943,0
2,1.578499,0.455786,1.565126,1.557513,0.941382,1.052000,1.362280,2.035440,0.938859,-0.397658,...,-0.023953,1.346291,1.455004,0.526944,1.081980,0.854222,1.953282,1.151242,0.201214,0
3,-0.768233,0.253509,-0.592166,-0.763792,3.280667,3.399917,1.914213,1.450431,2.864862,4.906602,...,0.133866,-0.249720,-0.549538,3.391291,3.889975,1.987839,2.173873,6.040726,4.930672,0
4,1.748758,-1.150804,1.775011,1.824624,0.280125,0.538866,1.369806,1.427237,-0.009552,-0.561956,...,-1.465481,1.337363,1.219651,0.220362,-0.313119,0.612640,0.728618,-0.867590,-0.396751,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,-0.575273,-0.364947,-0.572001,-0.593011,0.464993,-0.128018,-0.513916,-0.403557,0.457358,-0.168208,...,0.019976,-0.563386,-0.563940,0.474387,-0.489175,-0.536316,-0.790269,0.239616,-0.727292,1
395,-0.019095,-0.490498,-0.091322,-0.130114,-1.131267,-0.960582,-0.777590,-0.422885,-0.622371,-0.730502,...,-0.054866,-0.322631,-0.344394,-1.128596,-0.833659,-0.899169,-0.540012,-0.610589,-0.988625,1
396,-0.175165,-0.092919,-0.159226,-0.275036,0.678301,0.196146,-0.037623,0.126044,-0.020495,-0.284349,...,0.247757,-0.295549,-0.360728,0.456868,0.017383,0.343256,0.466951,-0.379449,-0.392321,1
397,-0.376638,-0.425397,-0.367054,-0.416548,-1.131978,-0.291425,-0.186896,-0.208468,-0.866769,-0.750331,...,-0.751225,-0.492261,-0.508614,-1.621754,-0.464387,-0.393475,-0.481441,-1.475340,-0.742795,1


In [38]:
df.tail(no_of_test_data)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Breast Cancer Type
399,-0.660403,-0.471898,-0.687643,-0.633646,-0.390375,-0.795660,-0.756014,-0.838576,0.129062,-0.369331,...,-0.193162,-0.632726,-0.559549,-0.349003,-0.519047,-0.610133,-0.928709,-0.196801,-0.151474,1
400,1.073398,0.402310,1.334664,0.963618,1.894161,2.901932,2.886368,1.826692,1.099359,1.182997,...,0.342123,1.259988,0.743676,2.405850,2.144671,3.025590,1.244346,0.556422,1.985142,0
401,-0.623513,-1.948286,-0.651428,-0.602957,-0.543246,-0.983114,-0.786998,-0.797858,-0.768281,-1.046350,...,-0.900910,-0.583920,-0.511249,0.220362,-0.615020,-0.578976,-0.697468,-0.712419,-0.627078,1
402,-0.331236,-0.232420,-0.320550,-0.368525,-1.624721,-0.480016,-0.604985,-0.775437,0.227551,-0.539294,...,-0.173638,-0.325904,-0.454343,-1.711539,-0.142783,-0.535836,-0.738240,0.495001,-0.635383,1
403,-0.336911,-0.725325,-0.361704,-0.418537,0.172760,-0.302597,-0.700445,-0.644776,-0.279484,-0.112970,...,-0.432332,-0.522913,-0.526354,-0.664344,-0.371592,-0.437094,-0.467445,0.640473,-0.310378,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,2.109139,0.720838,2.058974,2.341795,1.040926,0.218868,1.945573,2.318924,-0.312314,-0.930209,...,0.117596,1.751022,2.013529,0.378033,-0.273077,0.663928,1.627719,-1.358963,-0.708467,0
565,1.703356,2.083301,1.614511,1.722326,0.102368,-0.017817,0.692434,1.262558,-0.217473,-1.057681,...,2.045599,1.420690,1.493644,-0.690623,-0.394473,0.236365,0.733182,-0.531387,-0.973122,0
566,0.701667,2.043775,0.672084,0.577445,-0.839745,-0.038646,0.046547,0.105684,-0.808406,-0.894800,...,1.373645,0.578492,0.427529,-0.808876,0.350427,0.326479,0.413705,-1.103578,-0.318129,0
567,1.836725,2.334403,1.980781,1.733693,1.524426,3.269267,3.294046,2.656528,2.135315,1.042778,...,2.235958,2.301575,1.651717,1.429169,3.901415,3.194794,2.287972,1.917396,2.217684,0


In [39]:
df_test = df.tail(no_of_test_data)
df_test

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Breast Cancer Type
399,-0.660403,-0.471898,-0.687643,-0.633646,-0.390375,-0.795660,-0.756014,-0.838576,0.129062,-0.369331,...,-0.193162,-0.632726,-0.559549,-0.349003,-0.519047,-0.610133,-0.928709,-0.196801,-0.151474,1
400,1.073398,0.402310,1.334664,0.963618,1.894161,2.901932,2.886368,1.826692,1.099359,1.182997,...,0.342123,1.259988,0.743676,2.405850,2.144671,3.025590,1.244346,0.556422,1.985142,0
401,-0.623513,-1.948286,-0.651428,-0.602957,-0.543246,-0.983114,-0.786998,-0.797858,-0.768281,-1.046350,...,-0.900910,-0.583920,-0.511249,0.220362,-0.615020,-0.578976,-0.697468,-0.712419,-0.627078,1
402,-0.331236,-0.232420,-0.320550,-0.368525,-1.624721,-0.480016,-0.604985,-0.775437,0.227551,-0.539294,...,-0.173638,-0.325904,-0.454343,-1.711539,-0.142783,-0.535836,-0.738240,0.495001,-0.635383,1
403,-0.336911,-0.725325,-0.361704,-0.418537,0.172760,-0.302597,-0.700445,-0.644776,-0.279484,-0.112970,...,-0.432332,-0.522913,-0.526354,-0.664344,-0.371592,-0.437094,-0.467445,0.640473,-0.310378,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,2.109139,0.720838,2.058974,2.341795,1.040926,0.218868,1.945573,2.318924,-0.312314,-0.930209,...,0.117596,1.751022,2.013529,0.378033,-0.273077,0.663928,1.627719,-1.358963,-0.708467,0
565,1.703356,2.083301,1.614511,1.722326,0.102368,-0.017817,0.692434,1.262558,-0.217473,-1.057681,...,2.045599,1.420690,1.493644,-0.690623,-0.394473,0.236365,0.733182,-0.531387,-0.973122,0
566,0.701667,2.043775,0.672084,0.577445,-0.839745,-0.038646,0.046547,0.105684,-0.808406,-0.894800,...,1.373645,0.578492,0.427529,-0.808876,0.350427,0.326479,0.413705,-1.103578,-0.318129,0
567,1.836725,2.334403,1.980781,1.733693,1.524426,3.269267,3.294046,2.656528,2.135315,1.042778,...,2.235958,2.301575,1.651717,1.429169,3.901415,3.194794,2.287972,1.917396,2.217684,0


In [40]:
df_test = df_test.reset_index()
df_test

Unnamed: 0,index,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Breast Cancer Type
0,399,-0.660403,-0.471898,-0.687643,-0.633646,-0.390375,-0.795660,-0.756014,-0.838576,0.129062,...,-0.193162,-0.632726,-0.559549,-0.349003,-0.519047,-0.610133,-0.928709,-0.196801,-0.151474,1
1,400,1.073398,0.402310,1.334664,0.963618,1.894161,2.901932,2.886368,1.826692,1.099359,...,0.342123,1.259988,0.743676,2.405850,2.144671,3.025590,1.244346,0.556422,1.985142,0
2,401,-0.623513,-1.948286,-0.651428,-0.602957,-0.543246,-0.983114,-0.786998,-0.797858,-0.768281,...,-0.900910,-0.583920,-0.511249,0.220362,-0.615020,-0.578976,-0.697468,-0.712419,-0.627078,1
3,402,-0.331236,-0.232420,-0.320550,-0.368525,-1.624721,-0.480016,-0.604985,-0.775437,0.227551,...,-0.173638,-0.325904,-0.454343,-1.711539,-0.142783,-0.535836,-0.738240,0.495001,-0.635383,1
4,403,-0.336911,-0.725325,-0.361704,-0.418537,0.172760,-0.302597,-0.700445,-0.644776,-0.279484,...,-0.432332,-0.522913,-0.526354,-0.664344,-0.371592,-0.437094,-0.467445,0.640473,-0.310378,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,564,2.109139,0.720838,2.058974,2.341795,1.040926,0.218868,1.945573,2.318924,-0.312314,...,0.117596,1.751022,2.013529,0.378033,-0.273077,0.663928,1.627719,-1.358963,-0.708467,0
166,565,1.703356,2.083301,1.614511,1.722326,0.102368,-0.017817,0.692434,1.262558,-0.217473,...,2.045599,1.420690,1.493644,-0.690623,-0.394473,0.236365,0.733182,-0.531387,-0.973122,0
167,566,0.701667,2.043775,0.672084,0.577445,-0.839745,-0.038646,0.046547,0.105684,-0.808406,...,1.373645,0.578492,0.427529,-0.808876,0.350427,0.326479,0.413705,-1.103578,-0.318129,0
168,567,1.836725,2.334403,1.980781,1.733693,1.524426,3.269267,3.294046,2.656528,2.135315,...,2.235958,2.301575,1.651717,1.429169,3.901415,3.194794,2.287972,1.917396,2.217684,0


In [41]:
df_test = df_test.drop('index',axis=1)
df_test

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Breast Cancer Type
0,-0.660403,-0.471898,-0.687643,-0.633646,-0.390375,-0.795660,-0.756014,-0.838576,0.129062,-0.369331,...,-0.193162,-0.632726,-0.559549,-0.349003,-0.519047,-0.610133,-0.928709,-0.196801,-0.151474,1
1,1.073398,0.402310,1.334664,0.963618,1.894161,2.901932,2.886368,1.826692,1.099359,1.182997,...,0.342123,1.259988,0.743676,2.405850,2.144671,3.025590,1.244346,0.556422,1.985142,0
2,-0.623513,-1.948286,-0.651428,-0.602957,-0.543246,-0.983114,-0.786998,-0.797858,-0.768281,-1.046350,...,-0.900910,-0.583920,-0.511249,0.220362,-0.615020,-0.578976,-0.697468,-0.712419,-0.627078,1
3,-0.331236,-0.232420,-0.320550,-0.368525,-1.624721,-0.480016,-0.604985,-0.775437,0.227551,-0.539294,...,-0.173638,-0.325904,-0.454343,-1.711539,-0.142783,-0.535836,-0.738240,0.495001,-0.635383,1
4,-0.336911,-0.725325,-0.361704,-0.418537,0.172760,-0.302597,-0.700445,-0.644776,-0.279484,-0.112970,...,-0.432332,-0.522913,-0.526354,-0.664344,-0.371592,-0.437094,-0.467445,0.640473,-0.310378,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,2.109139,0.720838,2.058974,2.341795,1.040926,0.218868,1.945573,2.318924,-0.312314,-0.930209,...,0.117596,1.751022,2.013529,0.378033,-0.273077,0.663928,1.627719,-1.358963,-0.708467,0
166,1.703356,2.083301,1.614511,1.722326,0.102368,-0.017817,0.692434,1.262558,-0.217473,-1.057681,...,2.045599,1.420690,1.493644,-0.690623,-0.394473,0.236365,0.733182,-0.531387,-0.973122,0
167,0.701667,2.043775,0.672084,0.577445,-0.839745,-0.038646,0.046547,0.105684,-0.808406,-0.894800,...,1.373645,0.578492,0.427529,-0.808876,0.350427,0.326479,0.413705,-1.103578,-0.318129,0
168,1.836725,2.334403,1.980781,1.733693,1.524426,3.269267,3.294046,2.656528,2.135315,1.042778,...,2.235958,2.301575,1.651717,1.429169,3.901415,3.194794,2.287972,1.917396,2.217684,0


In [42]:
# df_temp = df.copy()
# df_temp

In [43]:
# df_temp['Cancer Present'][no_of_train_data:] = ['Test Point'] * no_of_test_data
# df_temp

In [44]:
# sns.scatterplot(x = df_temp[df_temp.columns[0]],y = df_temp[df_temp.columns[1]],hue=df_temp[df_temp.columns[2]],palette=color_dict,style=df_temp[df_temp.columns[2]],markers=markers_dict)
# plt.title('Train Data Points and Test Data Points')
# plt.legend(loc=(1.05,0.75))

# # hue without palette : sns will provide default color for each group or class in df_temp['Cancer Present]
# # huw with palette : sns will provide color we want for each group or class in df_temp['Cancer Present]
# # style without markers : sns will provide default shape for each group or class in df_temp['Cancer Present]
# # style with markers : sns will provide shape we want for each group or class in df_temp['Cancer Present]


## Calculating Euclean Distance from Test point to Train point , sorting it ascending order and then finding the nearest neighbor

In [45]:
df_train

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Breast Cancer Type
0,1.096100,-2.071512,1.268817,0.983510,1.567087,3.280628,2.650542,2.530249,2.215566,2.253764,...,-1.358098,2.301575,1.999478,1.306537,2.614365,2.107672,2.294058,2.748204,1.935312,0
1,1.828212,-0.353322,1.684473,1.907030,-0.826235,-0.486643,-0.023825,0.547662,0.001391,-0.867889,...,-0.368879,1.533776,1.888827,-0.375282,-0.430066,-0.146620,1.086129,-0.243675,0.280943,0
2,1.578499,0.455786,1.565126,1.557513,0.941382,1.052000,1.362280,2.035440,0.938859,-0.397658,...,-0.023953,1.346291,1.455004,0.526944,1.081980,0.854222,1.953282,1.151242,0.201214,0
3,-0.768233,0.253509,-0.592166,-0.763792,3.280667,3.399917,1.914213,1.450431,2.864862,4.906602,...,0.133866,-0.249720,-0.549538,3.391291,3.889975,1.987839,2.173873,6.040726,4.930672,0
4,1.748758,-1.150804,1.775011,1.824624,0.280125,0.538866,1.369806,1.427237,-0.009552,-0.561956,...,-1.465481,1.337363,1.219651,0.220362,-0.313119,0.612640,0.728618,-0.867590,-0.396751,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,-0.575273,-0.364947,-0.572001,-0.593011,0.464993,-0.128018,-0.513916,-0.403557,0.457358,-0.168208,...,0.019976,-0.563386,-0.563940,0.474387,-0.489175,-0.536316,-0.790269,0.239616,-0.727292,1
395,-0.019095,-0.490498,-0.091322,-0.130114,-1.131267,-0.960582,-0.777590,-0.422885,-0.622371,-0.730502,...,-0.054866,-0.322631,-0.344394,-1.128596,-0.833659,-0.899169,-0.540012,-0.610589,-0.988625,1
396,-0.175165,-0.092919,-0.159226,-0.275036,0.678301,0.196146,-0.037623,0.126044,-0.020495,-0.284349,...,0.247757,-0.295549,-0.360728,0.456868,0.017383,0.343256,0.466951,-0.379449,-0.392321,1
397,-0.376638,-0.425397,-0.367054,-0.416548,-1.131978,-0.291425,-0.186896,-0.208468,-0.866769,-0.750331,...,-0.751225,-0.492261,-0.508614,-1.621754,-0.464387,-0.393475,-0.481441,-1.475340,-0.742795,1


In [46]:
df_test

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Breast Cancer Type
0,-0.660403,-0.471898,-0.687643,-0.633646,-0.390375,-0.795660,-0.756014,-0.838576,0.129062,-0.369331,...,-0.193162,-0.632726,-0.559549,-0.349003,-0.519047,-0.610133,-0.928709,-0.196801,-0.151474,1
1,1.073398,0.402310,1.334664,0.963618,1.894161,2.901932,2.886368,1.826692,1.099359,1.182997,...,0.342123,1.259988,0.743676,2.405850,2.144671,3.025590,1.244346,0.556422,1.985142,0
2,-0.623513,-1.948286,-0.651428,-0.602957,-0.543246,-0.983114,-0.786998,-0.797858,-0.768281,-1.046350,...,-0.900910,-0.583920,-0.511249,0.220362,-0.615020,-0.578976,-0.697468,-0.712419,-0.627078,1
3,-0.331236,-0.232420,-0.320550,-0.368525,-1.624721,-0.480016,-0.604985,-0.775437,0.227551,-0.539294,...,-0.173638,-0.325904,-0.454343,-1.711539,-0.142783,-0.535836,-0.738240,0.495001,-0.635383,1
4,-0.336911,-0.725325,-0.361704,-0.418537,0.172760,-0.302597,-0.700445,-0.644776,-0.279484,-0.112970,...,-0.432332,-0.522913,-0.526354,-0.664344,-0.371592,-0.437094,-0.467445,0.640473,-0.310378,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,2.109139,0.720838,2.058974,2.341795,1.040926,0.218868,1.945573,2.318924,-0.312314,-0.930209,...,0.117596,1.751022,2.013529,0.378033,-0.273077,0.663928,1.627719,-1.358963,-0.708467,0
166,1.703356,2.083301,1.614511,1.722326,0.102368,-0.017817,0.692434,1.262558,-0.217473,-1.057681,...,2.045599,1.420690,1.493644,-0.690623,-0.394473,0.236365,0.733182,-0.531387,-0.973122,0
167,0.701667,2.043775,0.672084,0.577445,-0.839745,-0.038646,0.046547,0.105684,-0.808406,-0.894800,...,1.373645,0.578492,0.427529,-0.808876,0.350427,0.326479,0.413705,-1.103578,-0.318129,0
168,1.836725,2.334403,1.980781,1.733693,1.524426,3.269267,3.294046,2.656528,2.135315,1.042778,...,2.235958,2.301575,1.651717,1.429169,3.901415,3.194794,2.287972,1.917396,2.217684,0


In [47]:
df_test.iloc[0]

mean radius               -0.660403
mean texture              -0.471898
mean perimeter            -0.687643
mean area                 -0.633646
mean smoothness           -0.390375
mean compactness          -0.795660
mean concavity            -0.756014
mean concave points       -0.838576
mean symmetry              0.129062
mean fractal dimension    -0.369331
radius error              -0.221310
texture error             -0.139316
perimeter error           -0.317065
area error                -0.335826
smoothness error          -0.525552
compactness error         -0.326004
concavity error           -0.367843
concave points error      -1.036927
symmetry error            -0.698287
fractal dimension error   -0.273577
worst radius              -0.583292
worst texture             -0.193162
worst perimeter           -0.632726
worst area                -0.559549
worst smoothness          -0.349003
worst compactness         -0.519047
worst concavity           -0.610133
worst concave points      -0

In [48]:
df_train.iloc[0][0]

1.096099529431712

In [49]:
df_train[df.columns[n-1]][0]

0

In [50]:
n

31

In [51]:
df_train.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'Breast Cancer Type'],
      dtype='object')

In [52]:
# distance_list = list()
# class_name_list = list()
# total_distance = 0

# # calculating euclidean distance from test data to train data

# for i in range(len(df_test)):
#     for j in range(len(df_train)):
#         for c in range(n-1):
#             distance = (df_test.iloc[i][c] - df_train.iloc[j][c])**2
#             total_distance = total_distance + distance
#         total_distance = math.sqrt(total_distance)
#         distance_list.append((df_train[df_train.columns[n-1]][j],total_distance))
#         total_distance = 0
    
    
#     # sorting all those distances

#     for ii in range(len(distance_list)):
#         for jj in range(ii+1,len(distance_list)):
#             if distance_list[jj][1] < distance_list[ii][1]:
#                 temp = distance_list[ii]
#                 distance_list[ii] = distance_list[jj]
#                 distance_list[jj] = temp
    

#     # selecting first 'k' points and then counting the number of classes
    
#     count = dict()
#     for ii in range(k):
#         if distance_list[ii][0] not in count:
#             count[distance_list[ii][0]] = 1
#         else:
#             count[distance_list[ii][0]] = count[distance_list[ii][0]] + 1
    
    
#     # finding out the most nearest class

#     min = 0
#     for ii in count:
#         if count[ii] > min:
#             class_name = ii
#             min = count[ii]
    
#     class_name_list.append(class_name)
#     distance_list = list()

# print(class_name_list)

This code takes a lot of time for giving the output since we are using nested loop here and also used bubble sort inside
the first loop which consumed too much time. and then again we had to find number of classes with another loop. so too much
use of loop made this code too much slower. it took almost 7 min to give the output.

So here i have optimized this code with eliminating the bubble sort.

here , after i got the distance list , i only took the first k smallest distance without sorting them and everytime i took
k(i) smallest number , i count the class number too in dicitonary.after that I just found the most nearest neghbor

In [53]:
class_name_list = list()

# calculating euclidean distance from test data to train data

for i in range(len(df_test)):
    distance_list = list()
    for j in range(len(df_train)):
        distance = np.linalg.norm(df_test.iloc[i]-df_train.iloc[j])
        distance_list.append((df_train[df_train.columns[n-1]][j],distance))
    # print(distance_list)
    

    
    # selecting first 'k' points with smallest distance without sorting and then counting the number of classes

    minimum = None
    checked = dict()
    count = dict()
    min_list = list()

    for ii in range(k):
        for jj in range(len(distance_list)):
            if jj not in checked :
                if minimum is None :
                    minimum = distance_list[jj]
                elif distance_list[jj][1] < minimum[1]:
                    minimum = distance_list[jj]
                    index = jj
        min_list.append(minimum)
        if minimum[0] not in count:
            count[minimum[0]] = 1
        else:
            count[minimum[0]] = count[minimum[0]] + 1
        checked[index] = 1
        minimum = None

    # print(min_list)
    # print(count)

    # finding out the most nearest class
    
    min = 0
    for ii in count:
        if count[ii] > min:
            class_name = ii
            min = count[ii]
    class_name_list.append(class_name)
print(class_name_list)

[1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1]


Most surprising fact is , this optimized cose has given the output in only 20 second which is a lot faster than previous code

In [54]:
print(len(class_name_list))

170


In [55]:
# df_temp2 = df.copy()
# df_temp2

In [56]:
# df_temp2[df_temp2.columns[n-1]][no_of_train_data:] = class_name_list
# df_temp2

In [57]:
# color_dict

In [58]:
# sns.scatterplot(x = df_temp2[df_temp2.columns[0]],y = df_temp2[df_temp2.columns[1]],hue=df_temp2[df_temp2.columns[2]],palette=color_dict,style=df_temp2[df_temp2.columns[2]],markers=markers_dict)
# plt.title('Final Data Points')
# plt.legend(loc=(1.05,0.75))

In [59]:
# plt.figure(figsize=(15,6))

# plt.subplot(1, 2, 1) # row 1, col 2 index 1

# sns.scatterplot(x = df_temp[df_temp.columns[0]],y = df_temp[df_temp.columns[1]],hue=df_temp[df_temp.columns[2]],palette=color_dict,style=df_temp[df_temp.columns[2]],markers=markers_dict)
# plt.title('Train Data Points and Test Data Points')

# # hue without palette : sns will provide default color for each group or class in df_temp['Cancer Present]
# # huw with palette : sns will provide color we want for each group or class in df_temp['Cancer Present]
# # style without markers : sns will provide default shape for each group or class in df_temp['Cancer Present]
# # style with markers : sns will provide shape we want for each group or class in df_temp['Cancer Present]





# plt.subplot(1, 2, 2) # index 2


# sns.scatterplot(x = df_temp2[df_temp2.columns[0]],y = df_temp2[df_temp2.columns[1]],hue=df_temp2[df_temp2.columns[2]],palette=color_dict,style=df_temp2[df_temp2.columns[2]],markers=markers_dict)
# plt.title('Final Data Points')

In [60]:
right = 0
wrong = 0

for i in range(len(class_name_list)):
    if class_name_list[i] == df_test[df_test.columns[n-1]][i]:
        right = right + 1
    else:
        wrong = wrong + 1

print(right,wrong)

168 2


In [61]:
accuarcy = (right * 100) / len(class_name_list)
print(accuarcy)

98.82352941176471
