<a href="https://colab.research.google.com/github/MarehWilliams01/heart-ml-me/blob/main/heart_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the necessary libraries

In [None]:
#importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn import svm, metrics
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score

# Loading the dataset

In [None]:
# Mount Google Drive so files stored there are reachable from this Colab runtime.
from google.colab import drive

drive.mount('/content/drive')

# Read the CSV from Drive into a DataFrame; the path is kept in `file_path`.
file_path = '/content/drive/MyDrive/shuffled_data_with_features.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Creating the training and test data

In [None]:
# The raw file marks missing entries with '?'; turn them into NaN so pandas
# and scikit-learn treat them as missing values.
data = data.replace(to_replace='?', value=np.nan)

# Columns known to contain missing entries.
columns_with_missing = ['trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

# Cast each of those columns to a numeric dtype; anything that cannot be
# parsed becomes NaN (errors='coerce').
for col in columns_with_missing:
  data[col] = pd.to_numeric(data[col], errors='coerce')

# Features are every column except the target "num"; the target is "num".
y = data["num"]
X = data.drop(columns="num")

# 70/30 train/test split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# **filling up missing data**







> **mean value**
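>> **Don't run this before k-NN!! It fills the missing values in `data` in place, so any train/test split created from `data` afterwards has nothing left for the k-NN imputer to fill.**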



In [None]:
# Mean imputation: replace every missing value in the affected columns with
# that column's mean (NaNs are skipped when the mean is computed).
#
# The result is assigned back through `data[...] = ...` instead of calling
# `data[column].fillna(mean, inplace=True)`. The chained in-place form acts on
# a temporary column object: pandas warns about it, and with Copy-on-Write
# enabled it no longer updates `data` at all.
#
# NOTE: this overwrites the missing values in `data` itself.
column_means = data[columns_with_missing].mean()
data[columns_with_missing] = data[columns_with_missing].fillna(column_means)

#print the data head
print(data.head(10))



> **k-NN**



In [None]:
# Search over the number of neighbours `k` used by KNNImputer. For each k the
# missing values are imputed, then two classifiers are scored on the imputed
# data: an SVM (accuracy on the held-out test set) and Gaussian Naive Bayes
# (mean 10-fold cross-validation accuracy on the training set).

#defining the range of k values to test
k_values = range(1, 41)

#storing performance results (one entry per k, in the same order as k_values)
svm_scores = []
bn_scores = []

#iteration
for k in k_values:
  #initializing the KNN imputer with the current value of k
  imputer = KNNImputer(n_neighbors=k)

  #fit the imputer on the training set only, then apply that same fitted
  #imputer to the test set. Calling fit_transform on the test set would re-fit
  #the imputer on test data, so train and test would be imputed from different
  #neighbour pools.
  #The imputer returns new arrays, so no defensive copies of x_train / x_test
  #are needed; the original row index is kept on the resulting frames.
  impute_x_train = pd.DataFrame(imputer.fit_transform(x_train), columns=x_train.columns, index=x_train.index)
  impute_x_test = pd.DataFrame(imputer.transform(x_test), columns=x_test.columns, index=x_test.index)

  #training the svm model to identify the accuracies of each k
  svm_classifier = SVC()
  svm_classifier.fit(impute_x_train, y_train)
  svm_pred = svm_classifier.predict(impute_x_test)
  svm_scores.append(accuracy_score(y_test, svm_pred))

  #using bn classifier and calculating the accuracy using cross-validation
  bn_classifier = GaussianNB()
  bn_scores.append(np.mean(cross_val_score(bn_classifier, impute_x_train, y_train, cv=10)))

# Print the performance scores for each value of k
for k, svm_score, bn_score in zip(k_values, svm_scores, bn_scores):
    print(f"K = {k}: SVM Accuracy = {svm_score}, BN Accuracy = {bn_score}")

#finding the best k value: the k whose SVM and BN scores sum to the largest
#value (np.argmax returns the first such index on ties)
best_k_index = np.argmax(np.add(svm_scores, bn_scores))
best_k = k_values[best_k_index]
best_svm_score = svm_scores[best_k_index]
best_bn_score = bn_scores[best_k_index]
print(f"Best k: {best_k}")
print(f"SVM Accuracy: {best_svm_score}")
print(f"BN Accuracy: {best_bn_score}")

#printing the graph: both accuracy curves against k on a single set of axes
fig, ax = plt.subplots()
ax.plot(k_values, svm_scores, label='SVM Accuracy')
ax.plot(k_values, bn_scores, label='BN Scores')
ax.set_xlabel('k')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy Scores for Different k Values')
ax.legend()
plt.show()


>**MICE**



