In [24]:
import numpy as np
import pandas as pd
import os

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_validate

In [25]:
WORKING_DIR = "C:/Users/joanp/Projects/UCI_Heart_Disease/"
os.chdir(WORKING_DIR)

In [26]:
# All features
# fmt off
# Don't split feature list into a feature per row
features = [
    "data_center",
    "age",
    "sex",
    "cp",
    "trestbps",
    "chol",
    "fbs",
    "restecg",
    "thalach",
    "exang",
    "oldpeak",
    "slope",
    "ca",
    "thal",
    "num",
]
# fmt: on

target = ["num"]
drop_features = ["data_center", "slope", "ca", "thal"]
imputed_features = ["chol", "fbs"]

core_features = list(set(features) - (set(imputed_features + target + drop_features)))

In [27]:
# Load Dataset
df = pd.read_csv("Data/uci_heart_disease.processed.four_databases.tsv", sep="\t")

# Select only releveant features
# Remove data from data center VA as it contains a large number of missing values in many features
df = df[df["data_center"] != "va"]

# Retain only records without NaN in the core features.
# Core features are defined manually based no or very few
# missing values
df.dropna(subset=core_features, inplace=True)

# Flag records missing "chol" values
df["chol_imputed"] = np.nan
df["chol_missing"] = 0
df.loc[df["chol"].isna(), "chol_missing"] = 1

# Flag records missing "fbl" values
df["fbs_imputed"] = np.nan
df["fbs_missing"] = 0
df.loc[df["fbs"].isna(), "fbs_missing"] = 1

df.head()

Unnamed: 0,data_center,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,chol_imputed,chol_missing,fbs_imputed,fbs_missing
0,cleveland,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,,0,,0
1,cleveland,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2,,0,,0
2,cleveland,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,,0,,0
3,cleveland,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,,0,,0
4,cleveland,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,,0,,0


 ## Impute Chol
 Train a KNeighborsRegressor to input the missing values of cholesterol

In [37]:
# Scoring methods to perform cross validation and assess the performance of Regressor
scoring_regressor = {
    "r2": "r2",
    "MAPE": "neg_mean_absolute_percentage_error",
    "MSE": "neg_mean_squared_error",
}

# Select subset of records with "chol" values to train the imputer
chol_df = df.loc[df["chol_missing"] == 0, :]


X = chol_df[core_features]  # imputer input
y = chol_df["chol"].values  # imputer target

imputer_chol = KNeighborsRegressor()
scores_regression = (
    cross_validate(imputer_chol, X, y, cv=3, scoring=scoring_regressor),
)

In [38]:
scores_regression

({'fit_time': array([0.        , 0.00392962, 0.        ]),
  'score_time': array([0.00464296, 0.        , 0.00801706]),
  'test_r2': array([-2.80182923, -1.41186027, -0.60775782]),
  'test_MAPE': array([-2.98589239e-01, -2.67778956e-01, -5.51594589e+17]),
  'test_MSE': array([-10359.728     ,  -7164.92017467, -33167.88541485])},)

## Impute fbs
Train a KNeighborsClassifier to imput the missing values of fbs

In [39]:
# Scoring methods to perform cross validation and assess the performance of Classifier
scoring_classifier = {
    "acc": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "f1": "f1",
}

# Select subset of records with "fbs" values to train the imputer
fbs_df = df.loc[df["fbs_missing"] == 0, :]

X = fbs_df[core_features]  # imputer input
y = fbs_df["fbs"].values  # imputer target

imputer_fbs = KNeighborsClassifier()
scores_classification = (
    cross_validate(imputer_fbs, X, y, cv=3, scoring=scoring_classifier),
)

In [40]:
scores_classification

({'fit_time': array([0.        , 0.00801563, 0.00804138]),
  'score_time': array([0.01604152, 0.01139331, 0.01614714]),
  'test_acc': array([0.86729858, 0.88625592, 0.88151659]),
  'test_precision': array([0.14285714, 0.        , 0.        ]),
  'test_recall': array([0.04347826, 0.        , 0.        ]),
  'test_f1': array([0.06666667, 0.        , 0.        ])},)

In [42]:
KNeighborsClassifier?

[1;31mInit signature:[0m
[0mKNeighborsClassifier[0m[1;33m([0m[1;33m
[0m    [0mn_neighbors[0m[1;33m=[0m[1;36m5[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mweights[0m[1;33m=[0m[1;34m'uniform'[0m[1;33m,[0m[1;33m
[0m    [0malgorithm[0m[1;33m=[0m[1;34m'auto'[0m[1;33m,[0m[1;33m
[0m    [0mleaf_size[0m[1;33m=[0m[1;36m30[0m[1;33m,[0m[1;33m
[0m    [0mp[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmetric[0m[1;33m=[0m[1;34m'minkowski'[0m[1;33m,[0m[1;33m
[0m    [0mmetric_params[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Classifier implementing the k-nearest neighbors vote.

Read more in the :ref:`User Guide <classification>`.

Parameters
----------
n_neighbors : int, default=5
    Number of neighbors to use by default for :meth:`kneighbors` queries.

weights 