In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_recall_curve
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
data = pd.read_csv('medical_clean.csv')
#Divide the data depending on the variable type
data_num = data.iloc[:, np.r_[14:17, 20]].copy() #Numeric variables
data_enc = data.iloc[:, np.r_[17, 18]].copy() #Non-binary Categorical Variables
data_yn = data.iloc[:, np.r_[24, 26, 27, 29:38]].copy() #Binary Categorical Variables
data_ord = data.iloc[:, [11, 25, 28]].copy() #Ordinal Categorical Variables
data_tar = data['ReAdmis'] #Target Variable

In [3]:
def meddata_preprocessing(data_num, data_enc, data_yn, data_ord, data_tar): #This is saved as a function for reusability in later tasks
    for i in range(len(data_num.columns)):
        #Compute the mean and standard deviation of each column
        mean, std = np.mean(data_num.iloc[:,i]), np.std(data_num.iloc[:,i])
        #Set the upper and lower bounds at three standard deviations from the mean
        upper, lower = mean + 3 * std, mean - 3 * std
        #Record the index for each row that contains a value outside the previously set boundaries
        drop = [inx for inx, x in enumerate(data_num.iloc[:, i]) if x < lower or x > upper]
        #Compare the list of indices to be dropped with those within the dataframe and drop those that still remain
        #within the data frame while ignoring those that were already dropped
        for d in drop:
            if d in data_num.index:
                data_num = data_num.drop(d)
    #One-hot encode categorical variables
    data_enc = pd.get_dummies(data_enc, prefix=data_enc.columns, drop_first=True)
        
    for col in range(len(data_yn.columns)):
        #Replace values of "Yes" with 1 and values of "No" with 0
        for inx, val in enumerate(data_yn.iloc[:, col]):
            if val == 'Yes':
                data_yn.iloc[inx, col] = 1
            else:
                data_yn.iloc[inx, col] = 0
    #Determine levels of ordinal variables
    scale_mapper = {
        "Area" : {
            "Rural" : 0,
            "Suburban" : 0.5,
            "Urban" : 1
        },
        "Initial_admin" : {
            "Emergency Admission" : 1,
            "Observation Admission" : 0.5,
            "Elective Admission" : 0
        },
        "Complication_risk" : {
            "Low" : 0,
            "Medium" : 0.5,
            "High" : 1
        }
    }
    #Replace values with numerical equivalents specified above
    for col in data_ord.columns:
        data_ord[col] = data_ord[col].copy().replace(scale_mapper[col])
    
    mm = MinMaxScaler() #Instantiate the MinMaxScaler method
    data_num[data_num.columns] = mm.fit_transform(data_num) #Standardize the data
    
    #The use of an inner join preserves the dropping of rows performed on data_num
    #The data_yn dataframe is converted to a numeric datatype, int32, before joining
    data_clean = data_num.copy().join(data_enc, how='inner').join(data_yn.astype('int32'), how='inner').join(data_ord, how='inner').join(data_tar, how='inner')
    
    return data_clean #Returns a fully prepared data set

In [4]:
data_clean = meddata_preprocessing(data_num, data_enc, data_yn, data_ord, data_tar)
data_clean

Unnamed: 0,Children,Age,Income,VitD_levels,Marital_Married,Marital_Never Married,Marital_Separated,Marital_Widowed,Gender_Male,Gender_Nonbinary,...,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Area,Initial_admin,Complication_risk,ReAdmis
0,0.125,0.492958,0.417305,0.562756,0,0,0,0,1,0,...,0,1,1,1,0,1,0.5,1.0,0.5,No
1,0.375,0.464789,0.225268,0.550632,1,0,0,0,0,0,...,0,0,0,0,1,0,1.0,1.0,1.0,No
2,0.375,0.492958,0.068645,0.497410,0,0,0,1,0,0,...,0,0,0,0,0,0,0.5,0.0,0.5,No
3,0.000,0.845070,0.191156,0.408150,1,0,0,0,1,0,...,0,0,0,0,1,1,0.5,0.0,0.5,No
4,0.125,0.056338,0.005097,0.460128,0,0,0,1,0,0,...,1,0,0,1,0,0,0.0,0.0,0.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.250,0.098592,0.221220,0.432505,0,0,0,1,1,0,...,0,0,1,0,1,0,1.0,1.0,0.5,No
9996,0.500,0.971831,0.071605,0.504615,0,0,0,1,1,0,...,0,0,0,0,0,1,1.0,0.0,0.5,Yes
9997,0.375,0.380282,0.317553,0.441440,0,0,1,0,0,0,...,0,0,1,1,0,0,0.0,0.0,1.0,Yes
9998,0.375,0.352113,0.142680,0.609113,0,0,0,0,1,0,...,0,1,0,0,0,0,0.0,1.0,0.5,Yes


In [5]:
X = data_clean.drop(labels=['ReAdmis'], axis=1).copy()
y = data_clean['ReAdmis'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.65      0.72      0.68      1239
         Yes       0.38      0.31      0.34       689

    accuracy                           0.57      1928
   macro avg       0.52      0.51      0.51      1928
weighted avg       0.55      0.57      0.56      1928



In [7]:
leaf_size = list(range(1, 50))
n_neighbors = list(range(1, 30))
p = [1, 2]
weights = ['uniform', 'distance']

hyperparams = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p, weights=weights)

knn_2 = KNeighborsClassifier()
clf = RandomizedSearchCV(knn_2, hyperparams, n_jobs=-1)
best_model = clf.fit(X_train, y_train)

print('Best leaf_size: ', best_model.best_estimator_.get_params()['leaf_size'])
print('Best n_neighbors: ', best_model.best_estimator_.get_params()['n_neighbors'])
print('Best p: ', best_model.best_estimator_.get_params()['p'])
print('Best weights: ', best_model.best_estimator_.get_params()['weights'])

Best leaf_size:  12
Best n_neighbors:  27
Best p:  2
Best weights:  distance


In [9]:
knn_2.fit(X_train, y_train)
y_pred_int = knn_2.predict(X_test)
print(classification_report(y_test, y_pred_int))

              precision    recall  f1-score   support

          No       0.65      0.72      0.68      1239
         Yes       0.38      0.31      0.34       689

    accuracy                           0.57      1928
   macro avg       0.52      0.51      0.51      1928
weighted avg       0.55      0.57      0.56      1928



In [10]:
leaf_size = list(range(best_model.best_estimator_.get_params()['leaf_size'] - 10, best_model.best_estimator_.get_params()['leaf_size'] + 10))
n_neighbors = list(range(best_model.best_estimator_.get_params()['n_neighbors'] - 10, best_model.best_estimator_.get_params()['n_neighbors'] + 10))
p = [1, 2]
weights = ['uniform', 'distance']

hyperparams = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p, weights=weights)

knn_3 = KNeighborsClassifier()
clf_2 = GridSearchCV(knn_3, hyperparams, n_jobs=-1)
best_model_2 = clf_2.fit(X_train, y_train)

print('Best leaf_size: ', best_model_2.best_estimator_.get_params()['leaf_size'])
print('Best n_neighbors: ', best_model_2.best_estimator_.get_params()['n_neighbors'])
print('Best p: ', best_model.best_estimator_.get_params()['p'])
print('Best weights: ', best_model.best_estimator_.get_params()['weights'])

Best leaf_size:  2
Best n_neighbors:  34
Best p:  2
Best weights:  distance


In [11]:
knn_4 = KNeighborsClassifier(n_neighbors=best_model_2.best_estimator_.get_params()['n_neighbors'], weights=best_model_2.best_estimator_.get_params()['weights'], leaf_size=best_model_2.best_estimator_.get_params()['leaf_size'], p=best_model.best_estimator_.get_params()['p'])
knn_4.fit(X_train, y_train)
y_pred_knn_4 = knn_4.predict(X_test)
print(classification_report(y_test, y_pred_knn_4))
print(confusion_matrix(y_test, y_pred_knn_4))

              precision    recall  f1-score   support

          No       0.64      0.95      0.77      1239
         Yes       0.33      0.04      0.07       689

    accuracy                           0.63      1928
   macro avg       0.49      0.50      0.42      1928
weighted avg       0.53      0.63      0.52      1928

[[1182   57]
 [ 661   28]]


In [16]:
prob_knn_4 = knn_4.predict_proba(X_test)
roc_auc_score(y_test, prob_knn_4, multi_class='ovo')

0.5015641549116118

In [None]:
data_clean.to_csv('data_clean.csv')
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')