In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import random
import seaborn as sns

import matplotlib.pyplot as plt

In [None]:
# Load the Cleveland data and turn the 'num' column into a 'disease' label
# whose values are capped at 1.
df = pd.read_csv('cleveland.csv').rename(columns={'num': 'disease'})
df['disease'] = df['disease'].clip(upper=1)
display(df.head(5))

# Age distribution, coloured by disease label.
sns.displot(data=df, x='age', hue='disease')

# The same two distributions as stacked histograms with identical axis limits.
fig, (ax1, ax2) = plt.subplots(2, 1)
fig.subplots_adjust(hspace=0.5)  # a little extra space between the subplots

panels = [
    (ax1, 0, 'healthy', {}),
    (ax2, 1, 'unhealthy', {'color': 'orange'}),
]
for ax, label, title, hist_kwargs in panels:
    ax.hist(df[df.disease == label].age, **hist_kwargs)
    ax.set_ylabel('number of patients')
    ax.set_xlim(20, 80)
    ax.set_ylim(0, 50)
    ax.set_title(title)

# Only the bottom panel carries the x-axis label.
ax2.set_xlabel('age');


In [None]:
# k-NN on age alone: start from a nearest-neighbors object that returns the
# five closest rows under Euclidean distance.
nn = NearestNeighbors(n_neighbors=5, metric='euclidean', algorithm='auto')

# The search runs over the age dimension only, so each row of the
# two-dimensional input is a single-element list holding one patient's age.
X = [[age] for age in df.age]

# This builds an index data structure under the hood for query performance
fit = nn.fit(X)

# Query the neighbors of a single point: age 70.
distances, indices = fit.kneighbors([[70]])
display(distances)
display(indices)

# indices holds row positions, so iloc recovers the matching patients.
nbrs = df.iloc[indices[0]]
display(nbrs)

# Count how many of those neighbors are healthy (0) and how many are sick (1).
healthy = (nbrs.disease == 0).sum()
sick = (nbrs.disease == 1).sum()
print('healthy: {}\nsick: {}'.format(healthy, sick))

In [None]:
from sklearn.metrics import precision_recall_fscore_support


## How this cell works:
## 1. Fit a NearestNeighbors index (nn.fit(X)) on the age and trestbps values of every patient.
## 2. Draw a random sample of n patients from the data.
## 3. For each sampled patient, pull out its features (age, trestbps) and its disease label.
## 4. Look up the k nearest neighbors of each sampled patient, using patientsX (its age and trestbps) as the query.
## 5. Predict each sampled patient's label from its neighbors' labels (that loop is currently commented out below).

# k-NN on two features: age and trestbps. Six neighbors are requested here.
# NOTE(review): presumably one more than the five wanted because every sampled
# patient is itself a row of the fitted data -- confirm.
feature_cols = ['age', 'trestbps']
X = df[feature_cols].values
nn = NearestNeighbors(n_neighbors=6, metric='euclidean', algorithm='auto')

# This builds an index data structure under the hood for query performance
fit = nn.fit(X)

# Get random patients to test on, keeping their features and labels apart.
n = 50
patients = df.sample(n)
patientsX = patients[feature_cols].values
patientsy = patients[['disease']].values
display(patients, patientsX, patientsy)

# Find the k nearest neighbors of every sampled patient; each row of
# `indices` lists the row positions of one patient's neighbors.
distances, indices = fit.kneighbors(patientsX)
display(indices)

# y_pred = []
# for i in range(n):
#     # print('nearest neighbors to patient: {}:'.format(patientsX[i]))
#     nbrs = df.iloc[indices[i]]
#     # Drop the patient of interest
#     nbrs = nbrs.drop(patients.index[i])
#     # display(nbrs)

#     healthy = nbrs[nbrs.disease == 0].count().disease
#     sick = nbrs[nbrs.disease == 1].count().disease
#     predict = 0 if (healthy > sick) else 1
#     print(f'healthy: {healthy}, sick: {sick}, predicted: {predict}, actual: {patientsy[i][0]}')
#     y_pred.append(predict)

# # This is where we would compile how many patients are predicted
# # correctly. Remember:
# #    precision = tp/(tp+fp)  ("sloppiness")
# #    recall    = tp/(tp+fn)  ("What percentage did we find?")
# #    f-score - a balance between precision and recall
# #    support - number of positive labels
# (p,r,f,s) = precision_recall_fscore_support(patientsy, y_pred, labels=[0,1])
# print(f'precision={p}, recall={r}, f-score={f}, support={s}')

In [None]:
# New addition: a reusable k-NN evaluation function (testDataset) with its own imports and data cleaning.

In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [None]:
# Reload the data, rename 'num' to 'disease' and cap the label at 1.
df = pd.read_csv('cleveland.csv').rename(columns={'num': 'disease'})
df['disease'] = df['disease'].clip(upper=1)

# Entries of 'ca' and 'thal' equal to '?' are blanked out; every remaining
# entry is converted to float.
for col in ('ca', 'thal'):
    df.loc[df[col] == '?', col] = None
    df[col] = df[col].apply(lambda a: None if a is None else float(a))

display(df)
# Pairwise correlations over the rows that have no missing values.
df.dropna().corr()

In [None]:
def testDataset(dataframe, testCol, k=5, attributes=1, verbose=True):
    df = dataframe.copy()

    # Test a random set of attributes
    if type(attributes) == int:
        # If more attributes are specified than there are, just use all of them
        if attributes > len(df.columns)-1:
            attributes = len(df.columns)-1
        attributes = df[df.columns[df.columns != testCol]].sample(axis=1, n=attributes).columns

    # Test a specific set of attributes
    elif type(attributes) != list:
        print('attributes must be an integer or a list of attribute names')
        return


    # Clear out any records that don't have a valid value for one of the attributes in question
    for attribute in attributes:
        if attribute == testCol:
            print(f'Cannot predict {testCol} using {testCol}')
            return
        
        try:
            df = df.dropna(subset=attribute)
        except Exception as e:
            print(e)
            return
    
    print(f'Predicting based on the {k} nearest neighbors using {attributes}:') if verbose else None
    # display(df)

    # Standardize the data
    for attribute in attributes:
        df[attribute] = (df[attribute] - df[attribute].mean()) / df[attribute].std()

    f1Scores = []
    for j in range(10):
        # Use knn. First create a nearest neighbors object.
        nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
        
        X = df[attributes].values
        y = df[[testCol]].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)#, random_state=42)

        fit = nn.fit(X_train)

        distances, indices = fit.kneighbors(X_test)

        y_pred = []
        for i in range(len(indices)):
            nbrs = y_train[indices[i]]

            # Predict the class with the highest frequency among neighbors
            greatestValue = df[testCol].unique()[0]
            greatestValueCount = 0
            for value in df[testCol].unique():
                valueCount = [item for sublist in nbrs for item in sublist].count(value)
                if valueCount >= greatestValueCount:
                    greatestValueCount = valueCount
                    greatestValue = value

            # healthy = [item for sublist in nbrs for item in sublist].count(0)
            # sick = [item for sublist in nbrs for item in sublist].count(1)
            predict = greatestValue
            # print(healthy, sick, predict)

            y_pred.append(predict)
        
        (p,r,f,s) = precision_recall_fscore_support(y_test, y_pred)
        f1Scores.append(f)
        print(f'Test {j}: precision={p}, recall={r}, f-score={f}, support={s}') if verbose else None

    meanF1s = []
    for i in range(len(f1Scores[0])):
        f1Total = 0
        for score in f1Scores:
            f1Total += score[i]
        meanF1s.append(f1Total / len(f1Scores))

    print(f'Mean F1 scores: f-score={meanF1s}') if verbose else None

    f1sTotal = 0
    for f1 in meanF1s:
        f1sTotal += f1
    meanOfF1s = f1sTotal / len(meanF1s)
    meanF1s.append(meanOfF1s)

    print(f'Mean of mean F1 scores: f-score={meanF1s[-1]}') if verbose else None

    return meanF1s


# Sweep k from 1 to 199 on a fixed pair of attributes. For every k,
# testDataset returns [mean F1 for disease=0, mean F1 for disease=1,
# mean of those two], so scores['scores'] holds one three-element list per k.
scores = {'k': [], 'scores': []}
for k in range(1, 200):
    scores['k'].append(k)
    scores['scores'].append(testDataset(dataframe=df, testCol='disease', k=k, attributes=['oldpeak', 'cp'], verbose=False))

# plot() draws one line per entry of the returned lists; label them so the
# figure can be read without looking at the code.
lines = plt.plot(scores['k'], scores['scores'])
plt.legend(lines, ['disease = 0', 'disease = 1', 'mean of both'])
plt.xlabel('k')
plt.ylabel('f score')

# testDataset(df, 'disease', 5, ['oldpeak', 'thal', 'ca'])
# testDataset(df, 'disease', 5, ['thalach', 'ca', 'thal'])
# testDataset(df, 'disease', 5, ['oldpeak', 'age', 'cp'])
# testDataset(df, 'disease', 5, ['slope', 'thal', 'cp'])
# One verbose run with k=5 on four randomly chosen attributes.
testDataset(df, 'disease', 5, 4)