In [None]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('cleveland.csv')
# Add a binary-style 'disease' column derived from 'num' (the original 'num'
# column is kept): any value above 1 is capped at 1, smaller values pass through.
df['disease'] = df['num'].clip(upper=1)
df

In [None]:
def _zscore(series):
    """Return `series` centred on its mean and scaled by its (sample) standard deviation."""
    return (series - series.mean()) / series.std()

# Put age and resting blood pressure on a common scale so that neither
# feature dominates the Euclidean distance used by the neighbour search.
df['age_s'] = _zscore(df['age'])
df['trestbps_s'] = _zscore(df['trestbps'])

In [None]:
def get_scores(k, n=50, random_state=None):
  """Score a k-nearest-neighbour majority vote on a random sample of patients.

  The neighbour index is built from the standardized age and blood-pressure
  columns of the module-level ``df``. For ``n`` randomly sampled rows, the
  ``disease`` label is predicted by a vote among the k nearest *other* rows
  (the sampled row itself never votes); ties are resolved as "sick" (1).

  Parameters
  ----------
  k : int
      Number of neighbours that vote for each sampled patient. Must be at
      most ``len(df) - 1`` because a row cannot be its own neighbour.
  n : int, default 50
      Number of patients to sample (without replacement) for evaluation.
  random_state : int, numpy Generator/RandomState or None, default None
      Forwarded to pandas' ``sample``. ``None`` keeps the previous behaviour
      of drawing from NumPy's global random state.

  Returns
  -------
  tuple
      ``(precision, recall, fscore, support)`` as returned by
      ``precision_recall_fscore_support`` with ``labels=[1]``; callers index
      each element with ``[0]`` to get the value for the "sick" class.

  Raises
  ------
  ValueError
      Raised by pandas if ``n`` exceeds the number of rows, or by
      scikit-learn if ``k`` is too large for the fitted data.
  """
  # Work on plain arrays so that everything below is positional; sklearn
  # returns row positions, which only match index labels for a RangeIndex.
  X = df[['age_s', 'trestbps_s']].to_numpy()
  y = df['disease'].to_numpy()

  # Fitting builds the index structure used for the neighbour queries.
  nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto').fit(X)

  # Sample row positions (not labels). Sampling a positional Series goes
  # through the same pandas sampling routine as df.sample(n).
  positions = (
      pd.Series(np.arange(len(df)))
      .sample(n=n, random_state=random_state)
      .to_numpy()
  )

  # Called without a query matrix, kneighbors() returns the k nearest
  # neighbours of every fitted row with the row itself excluded, so each
  # patient gets exactly k voters even when other rows are exact duplicates.
  _, indices = nn.kneighbors()
  nbr_labels = y[indices[positions]]  # shape (n, k)

  # Majority vote; a tie (healthy == sick) is predicted as sick.
  sick = (nbr_labels == 1).sum(axis=1)
  healthy = (nbr_labels == 0).sum(axis=1)
  y_pred = np.where(healthy > sick, 0, 1)

  return precision_recall_fscore_support(y[positions], y_pred, labels=[1])

# Sweep k and collect the class-1 metrics. After transposing, `scores` holds
# four tuples -- precision, recall, f-score, support -- each with one entry
# per value in `kvals`.
kvals = range(2, 250)
scores = list(zip(*(
    (precision[0], recall[0], fscore[0], support[0])
    for precision, recall, fscore, support in map(get_scores, kvals)
)))

In [None]:
# Rank every candidate k by its f-score (best first) and show the top rows.
k_options = (
    pd.DataFrame({'f score': scores[2], 'k': kvals})
    .sort_values(by='f score', ascending=False)
)
k_options.head()

In [None]:
# f-score of the "sick" class as a function of the neighbourhood size k.
fig, ax = plt.subplots()
ax.plot(kvals, scores[2])
ax.set_xlabel('k')
ax.set_ylabel('f score')