In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Part 1
## kNN using dataset on heart disease obtained from https://archive.ics.uci.edu/ml/datasets/Heart+Disease
#### Data attributes:
* age: age in years 
* sex: sex (1 = male; 0 = female) 
* cp: chest pain type 
    - Value 1: typical angina 
    - Value 2: atypical angina 
    - Value 3: non-anginal pain 
    - Value 4: asymptomatic 
* trestbps: resting blood pressure (in mm Hg on admission to the hospital) 
* chol: serum cholesterol in mg/dl
* fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
* restecg: resting electrocardiographic results 
    - Value 0: normal 
    - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 
    - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 
* thalach: maximum heart rate achieved 
* exang: exercise induced angina (1 = yes; 0 = no) 
* oldpeak: ST depression induced by exercise relative to rest
* slope: the slope of the peak exercise ST segment 
    - Value 1: upsloping 
    - Value 2: flat 
    - Value 3: downsloping 
* ca: number of major vessels (0-3) colored by fluoroscopy
* thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
* num: diagnosis of heart disease (angiographic disease status) 
    - Value 0: absence.
    - Value 1,2,3,4: presence of heart disease

### Create dataframe and modify num column to reflect presence of disease (1) or absence of disease (0)

In [None]:
# Load the Cleveland heart-disease records and expose the target column
# under the clearer name 'disease'.
df = pd.read_csv('./cleveland.csv').rename(columns={'num': 'disease'})

# Collapse the target to a presence flag: any value above 1 becomes 1,
# everything else is kept as it is.
df['disease'] = df['disease'].apply(lambda severity: 1 if severity > 1 else severity)
df

# Part 2
## kNN using dataset on diabetes 
#### Dataset obtained from https://www.kaggle.com/datasets/houcembenmansour/predict-diabetes-based-on-diagnostic-measures

### Step 1: Create dataframe 
 * ### Transform gender and diabetes columns to numeric representation (0 or 1)
 * ### Transform chol_hdl_ratio, bmi, and waist_hip_ratio columns to floats (the raw values use a comma as the decimal separator)
 * ### Drop patient_number column (essentially a second index)

In [None]:
def _decimal_comma_to_float(raw):
    """Turn a value written with a decimal comma (e.g. '3,6') into a float."""
    return float(str(raw).replace(',', '.'))


diabetes_df = pd.read_csv('./diabetes.csv')

# Binary encodings: 'No diabetes' -> 0 and 'female' -> 0; any other value -> 1.
diabetes_df['diabetes'] = diabetes_df['diabetes'].apply(lambda label: int(label != 'No diabetes'))
diabetes_df['gender'] = diabetes_df['gender'].apply(lambda sex: int(sex != 'female'))

# These three columns store numbers with a comma as the decimal mark.
for column in ('chol_hdl_ratio', 'bmi', 'waist_hip_ratio'):
    diabetes_df[column] = diabetes_df[column].apply(_decimal_comma_to_float)

# patient_number only duplicates the row index, so discard it.
diabetes_df = diabetes_df.drop(columns='patient_number')

diabetes_df.head()

### Step 2: Make standardized versions of each column in the dataframe

In [None]:
# Columns to standardise, listed in the order in which their '<name>_s'
# counterparts are appended to the dataframe.
columns_to_standardize = [
    'cholesterol', 'glucose', 'hdl_chol', 'chol_hdl_ratio', 'age', 'gender',
    'height', 'weight', 'bmi', 'systolic_bp', 'diastolic_bp', 'waist', 'hip',
    'waist_hip_ratio',
]

for column in columns_to_standardize:
    values = diabetes_df[column]
    # z-score: subtract the column mean, then divide by the column's std().
    diabetes_df[column + '_s'] = (values - values.mean()) / values.std()

diabetes_df.head()

### Step 3: Create function get_scores() 
 * ### takes a k value as input
 * ### builds a kNN model
 * ### returns the recall, precision, and f-score results 

In [None]:
def get_scores(k, random_state=None):
    """Score a k-nearest-neighbour majority vote on a random 80/20 split.

    The standardised diabetes features are split into train and test sets,
    the k nearest training patients (Euclidean distance) are looked up for
    every test patient, and the test patient is predicted to have diabetes
    unless strictly more of those neighbours are labelled 0 than 1.

    Parameters
    ----------
    k : int
        Number of neighbours consulted for each test patient.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original behaviour of drawing a fresh split on every call; pass an
        int to make a run reproducible.

    Returns
    -------
    tuple
        The ``(precision, recall, f-score, support)`` result of
        ``precision_recall_fscore_support`` restricted to ``labels=[1]``,
        so callers read each metric at index 0.
    """
    feature_columns = [
        'cholesterol_s', 'glucose_s', 'chol_hdl_ratio_s', 'bmi_s',
        'systolic_bp_s', 'diastolic_bp_s', 'waist_hip_ratio_s',
    ]
    X = diabetes_df[feature_columns].values
    # Keep the labels 1-D so neighbour labels can be gathered with a single
    # fancy-indexing step below.
    y = diabetes_df['diabetes'].values

    # X_train / y_train: training patients and their labels.
    # X_test / y_test: held-out patients and their labels.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    nn.fit(X_train)
    # indices[i] holds the positions in X_train of the k neighbours of X_test[i].
    _, indices = nn.kneighbors(X_test)

    # Row i contains the labels of the k neighbours of test patient i.
    neighbour_labels = y_train[indices]
    diabetes = (neighbour_labels == 1).sum(axis=1)
    no_diabetes = (neighbour_labels == 0).sum(axis=1)

    # Ties are resolved in favour of the positive class (1), as before.
    predictions = np.where(no_diabetes > diabetes, 0, 1)

    return precision_recall_fscore_support(y_test, predictions, labels=[1])



### Step 4: Find the optimum k value

In [None]:
# Candidate neighbourhood sizes: 5 through 29.
k_values = range(5, 30)

# For each k, call get_scores 10 times and average the f-score reported
# for the positive class.
scores = [
    np.mean([get_scores(k)[2][0] for _ in range(10)])
    for k in k_values
]

fig, ax = plt.subplots()
ax.plot(k_values, scores)
ax.set_title('k value vs f-score')
ax.set_xlabel('k value')
ax.set_ylabel('f-score')
plt.show()

### Step 5: Build 10 kNN models with a k value of 8 and report results

In [None]:
# Evaluate k = 8 ten times. The run count appears only here; the report
# below iterates over `results`, so the two can never disagree. The global
# `scores` list built in Step 4 is deliberately left untouched.
results = [get_scores(8) for _ in range(10)]

# Each result is (precision, recall, f-score, support); index 2 is the f-score.
f_scores = [result[2][0] for result in results]
mean_f = np.array(f_scores).mean()

print('Mean f-score of all models: ', mean_f, '\n')

print('Individual Model Scores: ')
for model_number, result in enumerate(results, start=1):
    print('\tModel', model_number, ': precision =', result[0][0], 'recall =', result[1][0], 'f-score =', result[2][0])