In [3]:
import numpy as np
import math
import collections
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score



In [15]:
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Both arguments are passed through ``np.array``, so tuples, lists and
    arrays are all accepted. The squared element-wise differences are
    summed over every element before the square root is taken.
    """
    delta = np.array(point1) - np.array(point2)
    squared = delta ** 2
    return np.sqrt(np.sum(squared))
def knn_predict(training_data, labels, test_point, k):
    """Predict the label of ``test_point`` by majority vote of its k nearest neighbours.

    Args:
        training_data: Sequence of feature vectors (e.g. list of lists or a
            2-D array); iterated row by row.
        labels: Sequence of class labels, one per row of ``training_data``.
        test_point: Feature vector to classify.
        k: Number of nearest neighbours that vote. If ``k`` is larger than
            the number of training samples, every sample votes.

    Returns:
        The most frequent label among the k nearest neighbours. When several
        labels share the top count, the one met first while walking the
        neighbours from nearest to farthest is returned.

    Raises:
        ValueError: If the training set is empty, if ``training_data`` and
            ``labels`` have different lengths, or if ``k`` is less than 1.
    """
    n_samples = len(training_data)
    if n_samples == 0:
        raise ValueError("training_data must contain at least one sample")
    if len(labels) != n_samples:
        raise ValueError(
            f"training_data has {n_samples} samples but labels has {len(labels)}"
        )
    # A non-positive k would otherwise be interpreted by the slice below
    # (k=0 -> no voters, k<0 -> silently drops the farthest neighbours).
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    distances = [
        (euclidean_distance(test_point, sample), label)
        for sample, label in zip(training_data, labels)
    ]
    # list.sort is stable, so samples at equal distance keep training order.
    distances.sort(key=lambda pair: pair[0])
    k_nearest_labels = [label for _, label in distances[:k]]
    label_count = collections.Counter(k_nearest_labels)
    return label_count.most_common(1)[0][0]
def knn_algorithm(training_data, labels, k, leave_one_out=False):
    """Return the percentage of samples that ``knn_predict`` labels correctly.

    Each sample in ``training_data`` is classified in turn and the prediction
    is compared with its entry in ``labels``.

    Args:
        training_data: Sequence of feature vectors.
        labels: Sequence of class labels, one per sample.
        k: Number of neighbours passed to ``knn_predict``.
        leave_one_out: If False (default, the original behaviour) every sample
            is classified against the full set, which still contains the
            sample itself at distance 0, so the score is optimistic. If True,
            the sample being classified is removed from the reference set
            first (leave-one-out evaluation).

    Returns:
        Accuracy as a float percentage in the range 0-100.

    Raises:
        ValueError: If ``training_data`` is empty, or if ``leave_one_out`` is
            requested with fewer than two samples.
    """
    n_samples = len(training_data)
    if n_samples == 0:
        raise ValueError("training_data must contain at least one sample")
    if leave_one_out and n_samples < 2:
        raise ValueError("leave_one_out evaluation needs at least two samples")

    correct_labels = 0
    for i in range(n_samples):
        if leave_one_out:
            # Rebuild the reference set by index so this works for both
            # Python lists and numpy arrays (where `+` would not concatenate).
            reference_data = [training_data[j] for j in range(n_samples) if j != i]
            reference_labels = [labels[j] for j in range(n_samples) if j != i]
        else:
            reference_data, reference_labels = training_data, labels
        predicted_label = knn_predict(reference_data, reference_labels, training_data[i], k)
        if predicted_label == labels[i]:
            correct_labels += 1
    return (correct_labels / n_samples) * 100

# Sanity check of the distance helper on one hand-picked pair of 2-D points.
print(
    f"Euclidean distance is {euclidean_distance([7, 5], [9, 9])}"
)



Euclidean distance is 4.47213595499958


In [16]:
# Toy dataset: five 2-D points in two classes, used to smoke-test knn_predict.
training_data = [[1, 2], [2, 3], [3, 44], [6, 7], [7, 8]]
training_labels = ['A', 'A', 'A', 'B', 'B']
K = 3  # number of neighbours that vote
# Pass K instead of repeating the literal so that changing K above actually
# changes the prediction below.
print(knn_predict(training_data, training_labels, [2, 3], K))

A


In [17]:
# Ten 2-D samples: rows 0-4 carry label 0, rows 5-9 carry label 1.
training_data = np.array([
    # label 0
    [2.7810836, 2.550537003],
    [1.465489372, 2.362125076],
    [3.396561688, 4.400293529],
    [1.38807019, 1.850220317],
    [3.06407232, 3.005305973],
    # label 1
    [7.627531214, 2.759262235],
    [5.332441248, 2.088626775],
    [6.922596716, 1.77106367],
    [8.675418651, -0.242068655],
    [7.673756466, 3.508563011],
])
training_labels = np.array([0] * 5 + [1] * 5)

# Score knn_algorithm on the samples above using 3 neighbours.
print(f"accuracy of the knn is {knn_algorithm(training_data,training_labels,3)}")


accuracy of the knn is 100.0


In [5]:
# Load the heart-disease CSV (relative path, resolved against the current
# working directory) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='./Heart Disease dataset/heart.csv')
print(df.head(n=5))

   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  


In [12]:
# Label-encode the string-valued columns. Each distinct value is replaced by
# its position in the array returned by ``Series.unique()`` for that column.
# NOTE(review): for columns with more than two categories these integer codes
# introduce an arbitrary ordering and spacing that a distance-based model will
# treat as meaningful -- consider one-hot encoding instead; confirm intent.

# Distinct values per categorical column.
chest_pain_unique = df['ChestPainType'].unique()
resting_ecg_unique = df['RestingECG'].unique()
angina_unique = df['ExerciseAngina'].unique()
st_slope_unique = df['ST_Slope'].unique()
sex_unique = df['Sex'].unique()

# value -> integer code lookup tables.
mapping_chestpain = {value: code for code, value in enumerate(chest_pain_unique)}
mapping_ecg = {value: code for code, value in enumerate(resting_ecg_unique)}
mapping_angina = {value: code for code, value in enumerate(angina_unique)}
mapping_st_slope = {value: code for code, value in enumerate(st_slope_unique)}
mapping_sex = {value: code for code, value in enumerate(sex_unique)}

# Overwrite each column of df with its integer codes.
df['ChestPainType'] = df['ChestPainType'].map(mapping_chestpain)
df['RestingECG'] = df['RestingECG'].map(mapping_ecg)
df['ExerciseAngina'] = df['ExerciseAngina'].map(mapping_angina)
df['ST_Slope'] = df['ST_Slope'].map(mapping_st_slope)
df['Sex'] = df['Sex'].map(mapping_sex)
print(df.head())

        Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  \
0 -1.433140    0              0   0.410909     0.825070          0   
1 -0.478484    1              1   1.491752    -0.171961          0   
2 -1.751359    0              0  -0.129513     0.770188          0   
3 -0.584556    1              2   0.302825     0.139040          0   
4  0.051881    0              1   0.951331    -0.034755          0   

   RestingECG     MaxHR  ExerciseAngina   Oldpeak  ST_Slope  HeartDisease  
0           0  1.382928               0 -0.832432         0             0  
1           0  0.754157               0  0.105664         1             1  
2           1 -1.525138               0 -0.832432         0             0  
3           0 -1.132156               1  0.574711         1             1  
4           0 -0.581981               0 -0.832432         0             0  


In [8]:
# Lets now normalise all the columns now
numerical_features = ['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])
print(df.head())


        Age Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0 -1.433140   M              0   0.410909     0.825070          0           0   
1 -0.478484   F              1   1.491752    -0.171961          0           0   
2 -1.751359   M              0  -0.129513     0.770188          0           1   
3 -0.584556   F              2   0.302825     0.139040          0           0   
4  0.051881   M              1   0.951331    -0.034755          0           0   

      MaxHR  ExerciseAngina   Oldpeak  ST_Slope  HeartDisease  
0  1.382928               0 -0.832432         0             0  
1  0.754157               0  0.105664         1             1  
2 -1.525138               0 -0.832432         0             0  
3 -1.132156               1  0.574711         1             1  
4 -0.581981               0 -0.832432         0             0  


In [13]:
# Separate the predictor columns from the target column.
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 5-nearest-neighbour classifier and score it on the held-out rows.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("\n--- Model Results ---")
print(f"Model Accuracy on Test Data: {accuracy * 100:.2f}%")


--- Model Results ---
Model Accuracy on Test Data: 85.87%
