In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the dataset
data = pd.read_csv("cleveland.csv")

# Binarise the target: any positive value of "num" becomes 1, everything else 0,
# then give the column a more descriptive name.
data["num"] = (data["num"] > 0).astype(int)
data = data.rename(columns={"num": "disease"})

# Missing entries are marked with '?' in the file
data = data.replace('?', pd.NA)

# Columns that held '?' were read as text. Convert them to numbers BEFORE
# dropping incomplete rows, so that anything that cannot be parsed (and is
# therefore coerced to NaN) is removed as well instead of leaking into the model.
non_numeric_columns = data.select_dtypes(exclude=['number']).columns
for col in non_numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# TODO: consider imputing missing values with the column mode instead of dropping rows
data = data.dropna()

data.head(5)


In [None]:
class MyKNN:
    """k-nearest-neighbours classifier implemented with NumPy.

    A query point is assigned the label that occurs most often among the ``k``
    training samples closest to it.  ``get_params``/``set_params`` expose the
    two hyper-parameters so the object can be tuned by parameter-search tools.

    Parameters
    ----------
    k : int, default=5
        Number of neighbours that vote on each prediction.
    dist_metric : {'euclidean', 'manhattan'}, default='euclidean'
        Distance used to rank the training samples.
    """

    def __init__(self, k=5, dist_metric='euclidean'):
        # Stored as given; validation happens when the values are actually used.
        self.k = k
        self.dist_metric = dist_metric

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features) or (n_samples,)
            Training features; a 1-D input is treated as a single feature.
        y_train : array-like of shape (n_samples,)
            Training labels.

        Returns
        -------
        MyKNN
            The fitted instance, so calls can be chained.

        Raises
        ------
        ValueError
            If the training set is empty or X and y differ in length.
        """
        X = np.asarray(X_train, dtype=float)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        y = np.asarray(y_train)
        if len(X) != len(y):
            raise ValueError(
                f"X_train and y_train must have the same length, got {len(X)} and {len(y)}."
            )
        if len(X) == 0:
            raise ValueError("Cannot fit on an empty training set.")
        # Positional NumPy arrays: neighbour indices computed in predict() can
        # then be used directly, whatever container the caller passed in.
        self.X_train = X
        self.y_train = y
        return self

    def _reduce_differences(self, diff, axis=None):
        """Turn coordinate differences into distances, summing along ``axis``."""
        if self.dist_metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=axis))
        if self.dist_metric == 'manhattan':
            return np.sum(np.abs(diff), axis=axis)
        raise ValueError(
            f"Unknown dist_metric {self.dist_metric!r}; expected 'euclidean' or 'manhattan'."
        )

    def compute_distance(self, x1, x2):
        """Return the distance between two points under ``self.dist_metric``.

        Raises
        ------
        ValueError
            If ``self.dist_metric`` is not a supported metric.
        """
        diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
        return self._reduce_differences(diff)

    def predict(self, X_test):
        """Predict a label for every sample in ``X_test``.

        Parameters
        ----------
        X_test : array-like of shape (n_queries, n_features) or (n_queries,)
            Query samples; a 1-D input is treated as a single feature.

        Returns
        -------
        numpy.ndarray
            One predicted label per query sample.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or the distance metric is unknown.
        """
        if self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k!r}.")
        queries = np.asarray(X_test, dtype=float)
        if queries.ndim == 1:
            queries = queries.reshape(-1, 1)

        y_pred = []
        for x in queries:
            # Distances from this query to every training sample in one shot
            distances = self._reduce_differences(self.X_train - x, axis=1)
            # Stable sort: equally distant samples keep their training order
            k_indices = np.argsort(distances, kind='stable')[:self.k]
            y_pred.append(self.most_common(self.y_train[k_indices]))
        # NumPy array for compatibility with evaluation functions
        return np.array(y_pred)

    def most_common(self, lst):
        """Return the most frequent label in ``lst``.

        Ties are resolved deterministically in favour of the smallest label.
        Raises ``ValueError`` when ``lst`` is empty.
        """
        counts = {}
        for label in lst:
            counts[label] = counts.get(label, 0) + 1
        top = max(counts.values())
        return min(label for label, n in counts.items() if n == top)

    def evaluate(self, X_test, y_test, pos_label=1):
        """Predict on ``X_test`` and report precision, recall and F1.

        Parameters
        ----------
        X_test : array-like
            Query samples, as accepted by ``predict``.
        y_test : array-like of shape (n_queries,)
            True labels.
        pos_label : default=1
            Label treated as the positive class.

        Returns
        -------
        tuple of float
            ``(precision, recall, f1)`` for the positive class.  A metric whose
            denominator is zero is reported as 0.0.

        Raises
        ------
        ValueError
            If ``y_test`` does not contain one label per query sample.
        """
        y_true = np.asarray(y_test)
        y_pred = self.predict(X_test)
        if len(y_true) != len(y_pred):
            raise ValueError(
                f"X_test and y_test must have the same length, got {len(y_pred)} and {len(y_true)}."
            )

        predicted_pos = y_pred == pos_label
        actual_pos = y_true == pos_label
        tp = int(np.count_nonzero(predicted_pos & actual_pos))
        fp = int(np.count_nonzero(predicted_pos & ~actual_pos))
        fn = int(np.count_nonzero(~predicted_pos & actual_pos))

        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        f1 = 2 * tp / (2 * tp + fp + fn) if tp else 0.0

        print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")
        return precision, recall, f1

    def get_params(self, deep=True):
        """Return the hyper-parameters as a dict (``deep`` is accepted but unused)."""
        return {'k': self.k, 'dist_metric': self.dist_metric}

    def set_params(self, **params):
        """Update ``k`` and/or ``dist_metric``; other keys are ignored. Returns self."""
        for name in ('k', 'dist_metric'):
            if name in params:
                setattr(self, name, params[name])
        return self

In [None]:
# Feature columns fed to the classifier (currently every predictor column)
features = [
    "age", "sex", "cp", "trestbps", "chol",
    "fbs", "restecg", "thalach", "exang",
    "oldpeak", "slope", "ca", "thal",
]
# A smaller subset can be tried instead, e.g. ["age", "cp", "chol", "ca"]

# Feature matrix and target vector as NumPy arrays.
# NOTE(review): the features are used unscaled; if the columns differ widely in
# range, the larger ones will dominate the k-NN distances - consider standardising.
X = data[features].to_numpy()
y = data["disease"].to_numpy()

# Hold out a quarter of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [None]:
# Hyper-parameter grid explored by GridSearchCV
param_grid = {
    'k': range(1, 21),
    'dist_metric': ['euclidean', 'manhattan'],
}

# Estimator whose hyper-parameters are tuned
knn = MyKNN()

# One seeded, shuffled splitter shared by every cross-validation step below, so
# that model selection, the accuracy estimate and the precision/recall/F1
# figures are all computed on exactly the same folds.
cv_num = 10
cv = KFold(n_splits=cv_num, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by accuracy
grid_search = GridSearchCV(knn, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Collect the mean accuracy of every parameter combination
results = grid_search.cv_results_
mean_test_scores = results['mean_test_score']
params = results['params']
data_results = pd.DataFrame({
    'k_values': [p['k'] for p in params],  # Extract k values
    'dist_metric': [p['dist_metric'] for p in params],  # Extract distance metrics
    'mean_accuracy': mean_test_scores  # Mean accuracy scores
})

# Plot mean accuracy against k, one line per distance metric
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=data_results, x='k_values', y='mean_accuracy',
             hue='dist_metric', marker='o', ax=ax)
ax.set_xlabel('Number of Neighbors (k)')
ax.set_ylabel('Mean Accuracy')
ax.set_title('Performance of k-NN with different distance metrics')
ax.grid(True)
# Ticks follow the searched k values instead of repeating the range by hand
ax.set_xticks(list(param_grid['k']))
ax.legend(title='Distance Metric')
plt.show()

# Perform cross-validation with the best parameters
best_params = grid_search.best_params_
best_k = best_params['k']
best_dist_metric = best_params['dist_metric']
print(f"Best parameters: {best_params}\n")

knn_best = MyKNN(k=best_k, dist_metric=best_dist_metric)
cv_scores = cross_val_score(knn_best, X_train, y_train, cv=cv, scoring='accuracy')
print(f"Cross-validated accuracy: {np.mean(cv_scores)} (std {np.std(cv_scores)})\n")

precision_scores = []
recall_scores = []
f1_scores = []

# Print precision, recall, and F1 scores for each fold during cross-validation
for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X_train)):
    X_train_fold, X_test_fold = X_train[train_idx], X_train[test_idx]
    y_train_fold, y_test_fold = y_train[train_idx], y_train[test_idx]
    knn_best.fit(X_train_fold, y_train_fold)
    y_pred_fold = knn_best.predict(X_test_fold)
    precision_fold = precision_score(y_test_fold, y_pred_fold)
    recall_fold = recall_score(y_test_fold, y_pred_fold)
    f1_fold = f1_score(y_test_fold, y_pred_fold)
    print(f"Fold {fold_idx + 1}: Precision = {precision_fold}, Recall = {recall_fold}, F1 Score = {f1_fold}")
    # Append the scores to the respective lists
    precision_scores.append(precision_fold)
    recall_scores.append(recall_fold)
    f1_scores.append(f1_fold)

# Calculate the mean precision, recall, and F1 score
mean_precision = np.mean(precision_scores)
mean_recall = np.mean(recall_scores)
mean_f1 = np.mean(f1_scores)

# Print the mean precision, recall, and F1 score
print(f"\nMean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1 Score: {mean_f1}")