In [93]:
import pandas as pd
import random
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

def knearestneighbors(file, k):
    # Load and preprocess the data
    df = pd.read_csv(file)
    df = df.rename({'num': 'disease'}, axis=1)
    df['disease'] = df['disease'].apply(lambda x: min(x, 1))  # Collapse values 1-4 to 1 (binary classification)

    # Select features and target
    X = df[['age', 'trestbps', 'chol', 'thalach']].values
    y = df['disease'].values
    
    # Initialize Nearest Neighbors model
    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')

    # Track precision, recall, and F1 scores for 10 iterations
    test_sizes = [0.1, 0.15, 0.2, 0.25, 0.3, 0.33, 0.4, 0.45, 0.5, 0.6]
    precision_scores = []
    recall_scores = []
    f1_scores = []
    
    for _ in range(10):  # 10 iterations
        # Randomly choose a test size and perform train-test split
        test_size = random.choice(test_sizes)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random.randint(1, 100))

        # Fit the Nearest Neighbors model on the training set
        nn.fit(X_train)

        # Store predictions for the test set
        y_pred = []
        for test_point in X_test:
            distances, indices = nn.kneighbors([test_point])
            nbrs = df.iloc[indices[0]]

            # Get the majority class among the k-nearest neighbors
            healthy = nbrs[nbrs['disease'] == 0].count().disease
            sick = nbrs[nbrs['disease'] == 1].count().disease
            predict = 0 if (healthy > sick) else 1
            y_pred.append(predict)

        # Evaluate the predictions: precision, recall, F1 score
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    # Report the results
    print("Precision Scores (10 iterations):", precision_scores)
    print("Recall Scores (10 iterations):", recall_scores)
    print("F1 Scores (10 iterations):", f1_scores)
    print("Mean F1 Score:", sum(f1_scores) / len(f1_scores))


# Call the function
knearestneighbors('cleveland.csv', 10)


Precision Scores (10 iterations): [np.float64(0.47058823529411764), np.float64(0.5333333333333333), np.float64(0.3695652173913043), np.float64(0.5), np.float64(0.39285714285714285), np.float64(0.4666666666666667), np.float64(0.4444444444444444), np.float64(0.40350877192982454), np.float64(0.5), np.float64(0.4430379746835443)]
Recall Scores (10 iterations): [np.float64(0.4), np.float64(0.6857142857142857), np.float64(0.3953488372093023), np.float64(0.417910447761194), np.float64(0.39285714285714285), np.float64(0.5833333333333334), np.float64(0.5333333333333333), np.float64(0.4423076923076923), np.float64(0.6190476190476191), np.float64(0.49295774647887325)]
F1 Scores (10 iterations): [np.float64(0.43243243243243246), np.float64(0.6), np.float64(0.38202247191011235), np.float64(0.45528455284552843), np.float64(0.39285714285714285), np.float64(0.5185185185185185), np.float64(0.48484848484848486), np.float64(0.42201834862385323), np.float64(0.5531914893617021), np.float64(0.46666666666666