In [96]:
import pandas as pd
import random
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

def knearestneighbors(file, k, attributes):
    # Load and preprocess the data
    df = pd.read_csv(file)
    df = df.rename({'num': 'disease'}, axis=1)
    df['disease'] = df['disease'].apply(lambda x: min(x, 1))

    # Select features and target
    X = df[attributes].values
    y = df['disease'].values
    
    # Initialize Nearest Neighbors model
    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')

    # Track precision, recall, and F1 scores for 10 iterations
    test_sizes = [0.1, 0.15, 0.2, 0.25, 0.3, 0.33, 0.4, 0.45, 0.5, 0.6]
    precision_scores = []
    recall_scores = []
    f1_scores = []
    
    for _ in range(10):  # 10 iterations
        # Randomly choose a test size and perform train-test split
        test_size = random.choice(test_sizes)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random.randint(1, 100))

        # Fit the Nearest Neighbors model on the training set
        nn.fit(X_train)

        # Store predictions for the test set
        y_pred = []
        for test_point in X_test:
            distances, indices = nn.kneighbors([test_point])
            nbrs = df.iloc[indices[0]]

            # Get the majority class among the k-nearest neighbors
            healthy = nbrs[nbrs['disease'] == 0].count().disease
            sick = nbrs[nbrs['disease'] == 1].count().disease
            predict = 0 if (healthy > sick) else 1
            y_pred.append(predict)

        # Evaluate the predictions: precision, recall, F1 score
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    # Report the results
    print("Precision Scores (10 iterations):", precision_scores)
    print("Recall Scores (10 iterations):", recall_scores)
    print("F1 Scores (10 iterations):", f1_scores)
    print("Mean F1 Score:", sum(f1_scores) / len(f1_scores))


# Call the function
knearestneighbors('cleveland.csv', 10, ['age', 'trestbps', 'chol', 'thalach'])


Precision Scores (10 iterations): [np.float64(0.4897959183673469), np.float64(0.37037037037037035), np.float64(0.4528301886792453), np.float64(0.625), np.float64(0.4166666666666667), np.float64(0.42424242424242425), np.float64(0.3448275862068966), np.float64(0.3333333333333333), np.float64(0.35714285714285715), np.float64(0.35)]
Recall Scores (10 iterations): [np.float64(0.5217391304347826), np.float64(0.6451612903225806), np.float64(0.34782608695652173), np.float64(0.6521739130434783), np.float64(0.46511627906976744), np.float64(0.4), np.float64(0.45454545454545453), np.float64(0.42105263157894735), np.float64(0.5), np.float64(0.3684210526315789)]
F1 Scores (10 iterations): [np.float64(0.5052631578947369), np.float64(0.47058823529411764), np.float64(0.39344262295081966), np.float64(0.6382978723404256), np.float64(0.43956043956043955), np.float64(0.4117647058823529), np.float64(0.39215686274509803), np.float64(0.37209302325581395), np.float64(0.4166666666666667), np.float64(0.358974358