In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier 
import pickle

In [5]:
data = pd.read_csv("LLCP2020.csv")

In [6]:
df = data.copy()
df.shape

(401958, 280)

In [7]:
variables = ['GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'EXERANY2', 'CVDINFR4', 'CVDCRHD4', 'ASTHMA3', 'HAVARTH4', '_MRACE1', 'SEXVAR', '_AGE_G', 'HEIGHT3', 'WEIGHT2', '_MICHD', 'SMOKE100']
df = df[variables]
df = df[(df['_MICHD'] == 1) | (df['_MICHD'] == 2)]
df['_MICHD'] = df['_MICHD'].replace({
    2: 0
    })
df['GENHLTH'] = df['GENHLTH'].replace({
    1: 'Excellent',
    2: 'Very good',
    3: 'Good',
    4: 'Fair', 
    5: 'Poor', 
    7: np.NaN, 
    9: np.NaN})
df[['PHYSHLTH', 'MENTHLTH']] = df[['PHYSHLTH', 'MENTHLTH']].replace({
    88: 0,
    77: np.NaN,
    99: np.NaN
})
df[['EXERANY2', 'CVDINFR4', 'CVDCRHD4', 'ASTHMA3', 'HAVARTH4', 'SMOKE100']] = df[['EXERANY2', 'CVDINFR4', 'CVDCRHD4', 'ASTHMA3', 'HAVARTH4', 'SMOKE100']].replace({
    1: 'Yes',
    2: 'No',
    7: np.NaN,
    9: np.NaN
})
df['_MRACE1'] = df['_MRACE1'].replace({
    1: 'White',
    2: 'Black',
    3: 'American Indian or Alaskan Native only',
    4: 'Asian',
    5: 'Native Hawaiian or other Pacific Islander',
    6: 'Other',
    7: 'Multiracial',
    8: 'Hispanic',
    9: np.NaN,
    77: np.NaN,
    99: np.NaN
})
df['SEXVAR'] = df['SEXVAR'].replace({
    1: 'Male',
    2: 'Female'
})
df['_AGE_G'] = df['_AGE_G'].replace({
    1: 'Age 18 to 24',
    2: 'Age 25 to 34',
    3: 'Age 35 to 44',
    4: 'Age 45 to 54',
    5: 'Age 55 to 64',
    6: 'Age 65 or older'
})
df[['HEIGHT3', 'WEIGHT2']] = df[['HEIGHT3', 'WEIGHT2']].replace({
    7777: np.NaN,
    9999: np.NaN
})
df = df.dropna()
df.shape

(329965, 15)

In [9]:
num_vars = ['PHYSHLTH', 'MENTHLTH', 'HEIGHT3', 'WEIGHT2']
df[num_cols] = StandardScaler().fit_transform(df[num_vars])
cat_vars = df[['GENHLTH', 'EXERANY2', 'CVDINFR4', 'CVDCRHD4', 'ASTHMA3', 'HAVARTH4', 'SMOKE100', '_MRACE1', 'SEXVAR', '_AGE_G']]
encoded_cat_vars = pd.DataFrame(OneHotEncoder().fit_transform(cat_vars).toarray())
df = pd.concat([df, encoded_cat_vars], axis = 1)
df = df.drop(columns = ['GENHLTH', 'EXERANY2', 'CVDINFR4', 'CVDCRHD4', 'ASTHMA3', 'HAVARTH4', 'SMOKE100', '_MRACE1', 'SEXVAR', '_AGE_G'], axis = 1)
df = df.dropna()
y = df['_MICHD']
x = df.drop(['_MICHD'], axis=1)

In [11]:
def evaluate_model(model, x_test, y_test):
    y_pred = model.predict(x_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    y_pred_proba = model.predict_proba(x_test)[::,1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)
    cm = metrics.confusion_matrix(y_test, y_pred)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'kappa': kappa, 
            'fpr': fpr, 'tpr': tpr, 'auc': auc, 'cm': cm}

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=101)
knn = KNeighborsClassifier(n_neighbors = 8)
knn.fit(x_train, y_train)
knn_eval = evaluate_model(knn, x_test, y_test)

print('Accuracy:', knn_eval['acc'])
print('Precision:', knn_eval['prec'])
print('Recall:', knn_eval['rec'])
print('F1 Score:', knn_eval['f1'])
print('Area Under Curve:', knn_eval['auc'])
print('Confusion Matrix:\n', knn_eval['cm'])



Accuracy: 0.9163633135043949
Precision: 0.25
Recall: 0.006270154066642781
F1 Score: 0.01223348479552604
Cohens Kappa Score: 0.008224744291336905
Area Under Curve: 0.5687005580901198
Confusion Matrix:
 [[61891   105]
 [ 5547    35]]


In [None]:
with open('model.pkl', "wb") as pickle_file:
    pickle.dump(knn, pickle_file)