In [1]:
# Importing the required libs
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
# Loading the dataset from the CSV file (path is relative to the notebook's working directory)
data = pd.read_csv('breast-cancer-wisconsin.csv')
# Bare last expression: the notebook renders the DataFrame as this cell's output
data

Unnamed: 0,Id,Clump_thickness,Uniformity_Cell_Size,Uniformity_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [3]:
# Dropping the redundant column 'Id'.
# `data` is rebound instead of mutated with inplace=True, and errors='ignore' is passed,
# so this cell is idempotent: re-running it after 'Id' is already gone no longer raises KeyError.
data = data.drop(columns=['Id'], errors='ignore')

In [4]:
# Checking the overview of the dataset: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Clump_thickness              699 non-null    int64 
 1   Uniformity_Cell_Size         699 non-null    int64 
 2   Uniformity_Cell_Shape        699 non-null    int64 
 3   Marginal_Adhesion            699 non-null    int64 
 4   Single_Epithelial_Cell_Size  699 non-null    int64 
 5   Bare_Nuclei                  699 non-null    object
 6   Bland_Chromatin              699 non-null    int64 
 7   Normal_Nucleoli              699 non-null    int64 
 8   Mitoses                      699 non-null    int64 
 9   Class                        699 non-null    int64 
dtypes: int64(9), object(1)
memory usage: 54.7+ KB


In [5]:
# Looking at how often each distinct value occurs in the feature "Bare_Nuclei"
# (the only column reported with object dtype by data.info())
data['Bare_Nuclei'].value_counts()

Bare_Nuclei
1     402
10    132
2      30
5      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: count, dtype: int64

In [6]:
# The feature "Bare_Nuclei" contains numeric data apart from the "?" placeholder, so the
# column is converted to a numeric dtype; errors='coerce' replaces values that cannot be
# parsed (the "?" entries) with NaN
data['Bare_Nuclei'] = pd.to_numeric(data['Bare_Nuclei'], errors='coerce')

In [7]:
# Checking the distinct values after the conversion; the coerced entries appear as nan
print(data['Bare_Nuclei'].unique())

[ 1. 10.  2.  4.  3.  9.  7. nan  5.  8.  6.]


In [8]:
# Rows with missing values are kept rather than dropped, so no information is thrown away:
# NaN entries are filled with the column median, then the column is cast to int64 so that
# its dtype matches the other features
bare_nuclei_median = data['Bare_Nuclei'].median()
data['Bare_Nuclei'] = data['Bare_Nuclei'].fillna(bare_nuclei_median).astype(np.int64)

In [9]:
# Counting the missing values per column to confirm that none are left
data.isna().sum()

Clump_thickness                0
Uniformity_Cell_Size           0
Uniformity_Cell_Shape          0
Marginal_Adhesion              0
Single_Epithelial_Cell_Size    0
Bare_Nuclei                    0
Bland_Chromatin                0
Normal_Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [10]:
# Looking at the dataset's summary statistics
data.describe()

Unnamed: 0,Clump_thickness,Uniformity_Cell_Size,Uniformity_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,4.41774,3.134478,3.207439,2.806867,3.216023,3.486409,3.437768,2.866953,1.589413,2.689557
std,2.815741,3.051459,2.971913,2.855379,2.2143,3.621929,2.438364,3.053634,1.715078,0.951273
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,5.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [11]:
# Separating the predictor columns (X) from the target column (y) of the cleaned dataset
feature_columns = [
    'Clump_thickness',
    'Uniformity_Cell_Size',
    'Uniformity_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
]
X = data[feature_columns]
y = data['Class']

In machine learning, K-Nearest Neighbours (KNN) is one of the simplest algorithms. It is a non-parametric algorithm used for both classification and regression tasks. Non-parametric means that it makes no assumption about the underlying distribution of the data.

The intuition behind the KNN is very straightforward. It simply calculates the distance between a sample data point and all the other training data points, which can be Euclidean or Manhattan distance. Then, it selects the k nearest data points, where k can be any integer. Finally, it assigns the sample data point to the class to which most of the k data points belong.

In the KNN algorithm, k is the number of nearest neighbors. Generally, k is chosen to be an odd number because, in a two-class problem, an odd k prevents tied votes when deciding the majority class.
When k=1, then the algorithm is known as the nearest neighbor algorithm. 

One of the methods to find the optimal k is the Elbow Method: k is varied, the model's metrics are compared across the different values, and the value of k beyond which the metric no longer changes significantly is chosen as the optimal k.

In KNN classification, the output is a class membership. If k=1, then the data point is simply assigned to the class of that single nearest neighbor.

In KNN regression, the output is some property value for the object. This value is the average of the values of k nearest neighbors.

KNN is a type of instance-based learning or lazy learning, i.e. it does not build a model from the training data points during a training phase; it simply stores them. All training data is used in the testing phase instead. This makes training faster and testing slower and costlier. So, the testing phase requires more time and memory resources.


In [13]:
def euclidean_distance(x1, x2):
    """Return the Euclidean distance between two points.

    The difference ``x1 - x2`` is squared element-wise, summed and
    square-rooted, so both arguments must support subtraction with each
    other (for example NumPy arrays of the same shape).
    """
    difference = x1 - x2
    squared_total = np.sum(difference * difference)
    return np.sqrt(squared_total)

In [14]:
class KNN:
    """K-nearest-neighbours classifier using Euclidean distance and a majority vote.

    Usage: ``clf = KNN(k=5); clf.fit(X_train, y_train); clf.predict(X_test)``.
    Training data may be NumPy arrays, nested lists or pandas objects; it is
    converted to NumPy arrays on entry so that neighbours are always looked up
    by position (a pandas Series with a shuffled index would otherwise be
    indexed by label, and iterating a DataFrame would yield column names).
    """

    def __init__(self, k=3):
        """Store the number of neighbours (k) to consider; defaults to 3.

        Raises:
            ValueError: if ``k`` is not a positive integer.
        """
        # k < 1 would leave the vote empty and only fail later, inside predict
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Store the training samples and their labels.

        No actual training happens here; the stored data is only consulted
        when predicting.

        Args:
            X: training samples, one sample per row (or a 1-D sequence of
                scalar samples).
            y: one label per training sample.

        Raises:
            ValueError: if ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()  # flatten column-shaped labels such as (n, 1)
        if len(X) == 0:
            raise ValueError("X must contain at least one training sample")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        Returns:
            A NumPy array with one predicted label per sample, in input order.
        """
        # np.asarray makes the loop go over rows even when X is a DataFrame
        samples = np.asarray(X)
        return np.array([self._predict(x) for x in samples])

    def _predict(self, x):
        """Return the most common label among the k training samples nearest to ``x``."""
        # Euclidean distance from x to every training sample at once; the reshape
        # lets 1-D training data (scalar samples) go through the same axis-1 sum
        diffs = (self.X_train - x).reshape(len(self.X_train), -1)
        distances = np.sqrt(np.sum(diffs ** 2, axis=1))
        # Sort by distance and keep the indices of the first k neighbours
        k_indices = np.argsort(distances)[:self.k]
        # Labels of the k nearest training samples, nearest first
        k_nearest_labels = self.y_train[k_indices]
        # Majority vote; on a tie Counter keeps the label that was seen first
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

In [15]:
# Splitting the dataset into train and test parts: 20% of the rows are held out for
# testing, and the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Converting the splits to NumPy arrays and making sure the label arrays are flat (1-D).
# np.asarray accepts pandas objects as well as ndarrays, so unlike .to_numpy() this cell
# can be re-run after the conversion has already happened without raising AttributeError.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

In [17]:
# Creating the KNN classifier with k=5 neighbours and fitting it;
# fit stores the training data for use at prediction time
classifier = KNN(k=5)
classifier.fit(X_train, y_train)

In [18]:
# Making predictions on the test set: one predicted class label per sample in X_test
predictions = classifier.predict(X_test)

In [19]:
# Accuracy is the fraction of test samples whose predicted label equals the true label,
# i.e. the mean of the element-wise "prediction is correct" mask
correct = predictions == y_test
accuracy = correct.mean()
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9857
