## Name: Taldybayev Batyrkhan
## Group: IT3-2203
## Date: 04.02.2025

---

## Importing Libraries and reading CSV

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [2]:
data = pd.read_csv("heart.csv")
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


## Basic info about the dataset

In [3]:
data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


---

## For this dataset, I'm creating a class which can work as classificator and regressor

In [5]:
class Algorithm:
    def __init__(self, k, mode):
        self.k = k
        self.mode = mode # "classification" or "regression"

    def fit(self, X, y):
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        results = []
        X = np.array(X)
        for x in X:
            x = np.array(x)
            distances = [self.euclidean_distance(x, x_train) for x_train in self.X_train]
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = [self.y_train[i] for i in k_indices]

            if self.mode == "classification":
                label_counts = {}
                for label in k_nearest_labels:
                    label_counts[label] = label_counts.get(label, 0) + 1
                best_label = max(label_counts, key=label_counts.get)
                results.append(best_label)
            elif self.mode == "regression":
                results.append(np.mean(k_nearest_labels))
            else:
                raise("Hmm...")
        
        return results

    @staticmethod
    def euclidean_distance(x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))

---

## As the algorithm done, we can step forward to practice tasks. In this dataset, it's possible to use classificator for classify Heart Disease according to other parameters.
## So, I'm gonna start with classificator

## At first, importing scikit for comparison purposes

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

### The dataset has categorical variables, so I need to encode this

In [7]:
categorical_columns = ["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]

le = LabelEncoder()
data[categorical_columns] = data[categorical_columns].apply(le.fit_transform)

## Classificator

In [8]:
X = data.drop(columns=["HeartDisease"])
y = data["HeartDisease"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

In [9]:
classificator = Algorithm(k=5, mode="classification")
classificator.fit(X_train, y_train)

predictions = classificator.predict(X_test)

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error

cm = confusion_matrix(y_test, predictions)
print(f"{cm}\n")

accuracy = accuracy_score(y_test, predictions)*100
print(f"Accuracy: {round(accuracy, 2)}%")

[[52 26]
 [26 80]]

Accuracy: 71.74%


In [20]:
X = data.drop(columns=["HeartDisease"])
y = data["HeartDisease"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

results = {}
for k in range(1, 30):
    classificator = Algorithm(k=k, mode="classification")
    classificator.fit(X_train, y_train)
    predictions = classificator.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)*100
    results[k] = accuracy

print(results)

{1: 65.21739130434783, 2: 65.21739130434783, 3: 70.1086956521739, 4: 66.30434782608695, 5: 71.73913043478261, 6: 69.56521739130434, 7: 69.02173913043478, 8: 71.19565217391305, 9: 71.73913043478261, 10: 72.82608695652173, 11: 72.82608695652173, 12: 75.0, 13: 73.91304347826086, 14: 72.82608695652173, 15: 75.0, 16: 72.82608695652173, 17: 71.73913043478261, 18: 72.82608695652173, 19: 70.65217391304348, 20: 72.82608695652173, 21: 71.73913043478261, 22: 72.28260869565217, 23: 73.36956521739131, 24: 72.82608695652173, 25: 72.28260869565217, 26: 73.36956521739131, 27: 72.28260869565217, 28: 73.36956521739131, 29: 71.73913043478261}


In [22]:
max(results)

29

In [24]:
best = 0
for i, item in results.items():
    if item > best:
        best = item
        print('i:',i, ' best:', item)

print(best)

i: 1  best: 65.21739130434783
i: 3  best: 70.1086956521739
i: 5  best: 71.73913043478261
i: 10  best: 72.82608695652173
i: 12  best: 75.0
75.0


### Let's see what results Scikit show

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# k = 3 as first hyperparameters
classifier = KNeighborsClassifier(n_neighbors=5)

# Fitting the model
classifier.fit(X_train, y_train)

# Predicting the Test set results
predictions = classifier.predict(X_test)

In [12]:
accuracy = accuracy_score(y_test, predictions)*100
print(f"Accuracy: {round(accuracy, 2)}%")

Accuracy: 71.74%


### Same percents is great

## Regressor

In [13]:
X = data.drop(columns=["Cholesterol"])
y = data["Cholesterol"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

In [14]:
regressor = Algorithm(k=5, mode="regression")
regressor.fit(X_train, y_train)

predictions = regressor.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse:.4f}")

Mean Squared Error: 13769.8233


In [15]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")

Mean Squared Error: 13788.8907
