<a href="https://colab.research.google.com/github/Manahil4/ML_Labs/blob/main/ML_Lab_03_Nearest_Neighbour.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## The Heart Dataset

File name: 'D6_Heart_Dataset_2.csv'

This dataset has been obtained from Kaggle.

The dataset contains 303 observations with 13 features and 1 class label with 0 and 1 values.
These features are discussed below:
1. age: in years
2. sex: (1 = male; 0 = female)
3. cp: chest pain type (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 4 = asymptomatic)
4. trestbps: resting blood pressure, in mm Hg on admission to the hospital
5. chol: serum cholesterol in mg/dl
6. fbs: fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
7. restecg: resting electrocardiographic results (values: 0,1,2)
8. thalach: maximum heart rate achieved
9. exang: exercise induced angina (1 = yes; 0 = no)
10. oldpeak: ST depression induced by exercise relative to rest
11. slope: the slope of the peak exercise ST segment
12. ca: number of major vessels (0-3) coloured by fluoroscopy
13. thal: (3 = normal; 6 = fixed defect; 7 = reversible defect)
14. target: the predicted attribute, diagnosis of heart disease (0 = fit; 1 = diseased)

This is a binary classification problem.
The dataset does not contain any categorical data and is already clean.

## Loading and exploring dataset

In [None]:
# Open Colab's upload widget; whatever it returns is kept in `uploaded`
from google.colab import files
uploaded = files.upload()


In [None]:
import io

import pandas as pd

# File name given in the dataset description; used only when nothing was uploaded
DEFAULT_DATA_FILE = 'D6_Heart_Dataset_2.csv'

# Reading the file into a dataframe.
# The uploaded bytes are read directly instead of a hard-coded name such as
# 'D6_Heart_Dataset_2 (1).csv', which only exists after a repeated upload and
# therefore breaks on a fresh runtime.
if uploaded:
    _, csv_bytes = next(iter(uploaded.items()))
    data = pd.read_csv(io.BytesIO(csv_bytes))
else:
    data = pd.read_csv(DEFAULT_DATA_FILE)
#Displaying the read contents
data

In [None]:
# Summary statistics of the dataframe's columns
data.describe()

In [None]:
# Displaying general info (column dtypes and non-null counts)
data.info()

In [None]:
# Checking first five rows (head() defaults to 5 rows)
data.head()

In [None]:
# Checking last ten rows
data.tail(10)

In [None]:
# Describing the target column
data["target"].describe()
# (the same call can be applied to a feature column such as 'age')

In [None]:
# Finding the distinct values that occur in the target column
data["target"].unique()
# (the same call can be applied to a feature column such as 'sex')

## Separating features and target

In [None]:
# Feature matrix: every column of the dataframe except the class label
X = data.drop(columns="target")
X

In [None]:
# separating target (the class-label column)
Y = data["target"]
Y

## Visualizing data distributions

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Distribution of the class label
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(Y, ec='purple', color='black', rwidth=0.3)
ax.set_xlabel('target')
ax.set_ylabel('Nr. of Patients')
plt.show()

In [None]:
# Distribution of the chest pain type feature
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(X['cp'], ec='black', color='blue')
ax.set_xlabel('Chest Pain Type')
ax.set_ylabel('Nr. of Patients')
plt.show()

In [None]:
# Distribution of patient age (continuous feature)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(X['age'], ec='black', color='red')
ax.set_xlabel('age')
ax.set_ylabel('Nr. of Patients')
plt.show()

In [None]:
import seaborn as sns

# Bar plot of the target against the sex column
sex_values = data['sex']
sns.barplot(x=sex_values, y=Y)
plt.show()

## Finding correlations

In [None]:
# Finding pairwise correlations between the features
X.corr() # target (HeartDisease) not included
#data.corr() # target (HeartDisease) included

In [None]:
# Absolute correlation of every column with the target, strongest first
target_corr = data.corr()["target"]
target_corr.abs().sort_values(ascending=False)

In [None]:
# Finding correlation between Age and the target
X['age'].corr(Y)

In [None]:
# Finding correlation between Age and RestingBP
X['age'].corr(X['trestbps'])

In [None]:
# Finding correlation between Age and serum cholesterol (chol)
X['age'].corr(X['chol'])

In [None]:
# Generating scatter plot of RestingBP against Age, with their correlation
# shown in the title. The plotted columns now match the title and axis labels
# (the x-axis previously used the 'cp' column while being labelled 'Age').
corr = round(X['age'].corr(X['trestbps']), 3)

plt.figure(figsize=(6, 4))
plt.scatter(x=X['age'], y=X['trestbps'], alpha=0.8, s=80, color='green')
plt.title(f'RestingBP vs Age (Correlation {corr})', fontsize=14)
plt.xlabel('Age', fontsize=14)
plt.ylabel('RestingBP', fontsize=14)
plt.show()

## Creating kNN Model - Coding the Algorithm

In [None]:
import numpy as np

# Number of observations (rows) in the feature matrix
dataset_size = len(X) # 303
dataset_size

In [None]:
# Let's select the row labelled 0 as the query point and predict its class
X.loc[0]

In [None]:
# Actual class for this row (the value to compare the prediction against)
Y[0]

In [None]:
# numpy.tile() builds a new array by repeating its input; with reps
# (dataset_size, 1) the selected row is stacked dataset_size times
# vertically and once horizontally, i.e. one copy per dataset row
np.tile(X.loc[0], (dataset_size,1))

In [None]:
# Displaying the feature matrix
X

In [None]:
# Difference between the selected row and every row of X
diff_mat = X.loc[0] - X #broadcasting
diff_mat

In [None]:
# Euclidean distance: square the differences, add them up per row, take the root
sq_diff_mat = diff_mat.pow(2).sum(axis=1)
distances = sq_diff_mat.pow(0.5)
distances

In [None]:
# argsort returns the integer indices that would sort the distances,
# so the first entries identify the nearest rows
sorted_dist_indicies = distances.argsort()
sorted_dist_indicies

In [None]:
# Reordering the class labels from nearest to farthest row.
# NOTE(review): Y[...] with integers presumably looks up index labels, which only
# coincide with the argsort positions while Y keeps its default 0..n-1 index - confirm
sorted_labels=Y[sorted_dist_indicies]
sorted_labels

In [None]:
# Keeping only the labels of the k closest rows
k_labels=sorted_labels[:3] # say k=3
k_labels

In [None]:
# The predicted class is the most common label among the k neighbours
import statistics as st
st.mode(k_labels)

In [None]:
def my_knn_classifier(inX,predictors,target,k):
    """Predict the class of ``inX`` by majority vote among its k nearest rows.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``predictors``.

    Parameters
    ----------
    inX : pandas.Series or 1-D array
        The query point, one value per feature.
    predictors : pandas.DataFrame or 2-D array
        Feature rows to search; row ``i`` is labelled by the ``i``-th entry
        of ``target`` (by position, whatever index the objects carry).
    target : pandas.Series or 1-D array
        Class labels in the same row order as ``predictors``.
    k : int
        Number of nearest neighbours that vote.

    Returns
    -------
    The most common label among the k nearest rows, as returned by
    ``statistics.mode``.

    Raises
    ------
    statistics.StatisticsError
        If no labels are selected (for example ``k == 0`` or empty data).
    """
    diff_mat = inX - predictors
    distances = ((diff_mat ** 2).sum(axis=1)) ** 0.5
    # argsort yields row *positions*. Convert to plain arrays so the labels are
    # picked positionally; indexing a Series with these integers would look up
    # index labels instead and break on shuffled data such as a train split.
    # A stable sort keeps equally distant rows in their original order.
    nearest_positions = np.argsort(np.asarray(distances), kind="stable")[:k]
    k_labels = np.asarray(target)[nearest_positions]
    return st.mode(k_labels)

In [None]:
# Classifying the row labelled 299 with k=6; this row is itself part of the
# predictors passed in, so it is among its own neighbours at distance 0
print("predicted class is ",my_knn_classifier(X.loc[299],X,Y,6))

## Splitting the Dataset into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)
# Shapes of the four pieces, in the order train/test features then train/test labels
for split_part in (X_train, X_test, Y_train, Y_test):
    print(split_part.shape)

## Creating kNN Model - using sklearn

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Creating the knn classifier with 5 neighbours
classifier1 = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Training the model on the training split
model1=classifier1.fit(X_train, Y_train)

In [None]:
# Predicting the test split, then reporting accuracy (as a percentage) and the confusion matrix
Y_pred1 = model1.predict(X_test)
accuracy_pct = metrics.accuracy_score(Y_test, Y_pred1) * 100
print("The accuracy is " + str(accuracy_pct) + "%")
print(confusion_matrix(Y_test, Y_pred1))

In [None]:
# Classification report for the test predictions, with display names for the two classes
target_names = ['class 0', 'class 1']
print(classification_report(Y_test, Y_pred1, target_names=target_names))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Initialize KNN classifier with Manhattan distance
knnManhattan = KNeighborsClassifier(n_neighbors=5, metric='manhattan')

# Fit the model
model2 = knnManhattan.fit(X_train, Y_train)

# Predict the labels for the test set
Y_predictMan = model2.predict(X_test)

# Calculate accuracy on the test set
accMat = accuracy_score(Y_test, Y_predictMan)
# The closing parenthesis was missing here, which made the whole cell a SyntaxError
print(accMat)

# **Manhattan Distance manual code**

In [None]:
import numpy as np
from scipy.stats import mode

def Manhattan(TestPt, DatasetPts, n_neighbors=1, labels=None):
    """Classify ``TestPt`` by majority vote among its nearest rows (Manhattan distance).

    Parameters
    ----------
    TestPt : pandas.Series or 1-D array
        The query point, one value per feature.
    DatasetPts : pandas.DataFrame or 2-D array
        Feature rows to search.
    n_neighbors : int, default 1
        Number of nearest rows that vote; expected to be at least 1.
    labels : pandas.Series or 1-D array, optional
        Class labels in the same row order as ``DatasetPts``. When omitted,
        the notebook-level ``Y`` is used, as in the original version; pass the
        matching labels explicitly when ``DatasetPts`` is a subset such as a
        training split.

    Returns
    -------
    The most frequent label among the nearest rows; on a tie the smallest
    label wins.
    """
    if labels is None:
        labels = Y  # fall back to the notebook-level target column

    # Calculate Manhattan distances: sum of absolute differences per row
    diff = DatasetPts - TestPt  # Broadcast subtraction
    ManDist = np.abs(diff).sum(axis=1)

    # Positions of the smallest distances (stable sort keeps tied rows in order)
    NearestIndices = np.argsort(np.asarray(ManDist), kind="stable")[:n_neighbors]

    # Pick labels by position; indexing a Series with these integers would
    # look up index labels instead and fail on a shuffled index
    NearestLabels = np.asarray(labels)[NearestIndices]

    # Majority vote. np.unique returns sorted values, and argmax takes the
    # first maximum, so ties resolve to the smallest label. This also avoids
    # `mode(...).mode[0]`, which fails when SciPy returns a scalar mode.
    values, counts = np.unique(NearestLabels, return_counts=True)
    predict = values[np.argmax(counts)]

    return predict
