In [2]:
import pandas as pd
import random
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

def knearestneighbors(file, k, attributes):
    # Load and preprocess the data
    df = pd.read_csv(file)
    df = df.rename({'num': 'disease'}, axis=1)
    df['disease'] = df['disease'].apply(lambda x: min(x, 1))

    # Select features and target
    X = df[attributes].values
    y = df['disease'].values
    
    # Initialize Nearest Neighbors model
    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')

    # Track precision, recall, and F1 scores for 10 iterations
    test_sizes = [0.1, 0.15, 0.2, 0.25, 0.3, 0.33, 0.4, 0.45, 0.5, 0.6]
    precision_scores = []
    recall_scores = []
    f1_scores = []
    
    for _ in range(10):  # 10 iterations
        # Randomly choose a test size and perform train-test split
        test_size = random.choice(test_sizes)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random.randint(1, 100))

        # Fit the Nearest Neighbors model on the training set
        nn.fit(X_train)

        # Store predictions for the test set
        y_pred = []
        for test_point in X_test:
            distances, indices = nn.kneighbors([test_point])
            nbrs = df.iloc[indices[0]]

            # Get the majority class among the k-nearest neighbors
            healthy = nbrs[nbrs['disease'] == 0].count().disease
            sick = nbrs[nbrs['disease'] == 1].count().disease
            predict = 0 if (healthy > sick) else 1
            y_pred.append(predict)

        # Evaluate the predictions: precision, recall, F1 score
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)


    for i, precision in enumerate(precision_scores):
        print(f"Iteration: {i + 1}")  # Start iteration count from 1 instead of 0
        print(f"Precision Score: {precision_scores[i]}")
        print(f"Recall Score: {recall_scores[i]}")
        print(f"F1 Score: {f1_scores[i]}")
        print()
    print("Mean F1 Score:", sum(f1_scores) / len(f1_scores))


# Call the function
knearestneighbors('cleveland.csv', 10, ['age', 'trestbps', 'chol', 'thalach'])


Iteration: 1
Precision Score: 0.4
Recall Score: 0.4
F1 Score: 0.4

Iteration: 2
Precision Score: 0.48484848484848486
Recall Score: 0.5161290322580645
F1 Score: 0.5

Iteration: 3
Precision Score: 0.2962962962962963
Recall Score: 0.4
F1 Score: 0.3404255319148936

Iteration: 4
Precision Score: 0.42105263157894735
Recall Score: 0.3902439024390244
F1 Score: 0.4050632911392405

Iteration: 5
Precision Score: 0.6176470588235294
Recall Score: 0.7241379310344828
F1 Score: 0.6666666666666666

Iteration: 6
Precision Score: 0.5476190476190477
Recall Score: 0.38333333333333336
F1 Score: 0.45098039215686275

Iteration: 7
Precision Score: 0.43478260869565216
Recall Score: 0.4166666666666667
F1 Score: 0.425531914893617

Iteration: 8
Precision Score: 0.30612244897959184
Recall Score: 0.45454545454545453
F1 Score: 0.36585365853658536

Iteration: 9
Precision Score: 0.4461538461538462
Recall Score: 0.47540983606557374
F1 Score: 0.4603174603174603

Iteration: 10
Precision Score: 0.33695652173913043
Recall S