
# Code and notes for comparing KNN imputation with iterative imputation using a Random Forest Classifier (RFC)

# Note that the cell outputs and file links have been erased, and the notebook kernel has been reset to protect confidential data


# KNN VS Iterative w/ RFC Imputation Notes


In [None]:


import numpy as np
import pandas as pd
import time

# remember to check the file path names
# (the file path was erased from this shared copy of the notebook, so the
#  CSV path has to be passed to read_csv before this cell can run)
cv19_cat_df = pd.read_csv()

# also remember to change the missing (null) values to numpy "not a number" or nan
# fillna returns a new dataframe instead of modifying the original one,
# so the result has to be assigned back for the change to take effect
cv19_cat_df = cv19_cat_df.fillna(np.nan)




# KNN Imputation


In [None]:

from sklearn.impute import KNNImputer

# construct the knn imputer with 1 nearest neighbor
# Note on copy: it has to be a real boolean. The string 'False' is non-empty and
# therefore truthy, so it never meant "do not copy", and scikit-learn releases that
# validate parameter types reject a string here when fitting.
# copy=True keeps the behaviour the string had in practice and leaves cv19_cat_df
# untouched, so its missing values are still there for any later imputation method.
knn1_imputer = KNNImputer(missing_values=np.nan, n_neighbors=1, weights='uniform', metric='nan_euclidean', copy=True)

# train (fit) the model, then transform does the imputation
# fit_transform returns a plain numpy array (the column names are lost)
cv19_cat_knn_imp = knn1_imputer.fit_transform(cv19_cat_df)

# this line creates a new dataframe with the imputed data and the original column names
cv19_cat_knn_imp_df = pd.DataFrame(cv19_cat_knn_imp, columns=cv19_cat_df.columns)





# Iterative Imputation with Random Forest Classifier (RFC) <br> as the predicting algorithm <br> Note: Iterative Imputation may take a while


In [None]:

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier

# you can choose a different predicting algorithm
# 10 iterations should be decent - more iterations will take longer
# you can also pick the random state number
# Note: the imputer's random_state is documented as controlling the imputer's own
# randomness (feature selection, imputation order, posterior sampling), so the
# random forest is given its own seed as well - otherwise the imputed values
# can change from one run to the next
# construct the iterative imputer model with RFC as the classifier
iterative_imputer = IterativeImputer(
    estimator=RandomForestClassifier(random_state=2),
    initial_strategy='most_frequent',
    max_iter=10,
    random_state=2,
)

# similar to the knn, you do training (fit) and then transform (imputing)
cv19_cat_iter_imp = iterative_imputer.fit_transform(cv19_cat_df)

# create the dataframe (fit_transform returns an array, so the column names are put back here)
cv19_cat_iter_imp_df = pd.DataFrame(cv19_cat_iter_imp, columns=cv19_cat_df.columns)

print('Done Processing!')


# Setting up the dataframe sections (slices) for testing and training



In [None]:

# the label columns (also called targets): these are what we are trying to predict
cv19_cat_label_cols = ['evol_Death', 'evol_ICU_admission', 'evol_Hospitalization', 'evol_Recovered']

# start from every column name of the original data, then take the label
# names out one by one so that only the feature names are left
cv19_cat_feature_cols = [*cv19_cat_df.columns]
for label_col in cv19_cat_label_cols:
    cv19_cat_feature_cols.remove(label_col)

# feature section (also called feature matrix or X) of each imputed dataframe
cv19_cat_knn_features = cv19_cat_knn_imp_df[cv19_cat_feature_cols]
cv19_cat_iter_features = cv19_cat_iter_imp_df[cv19_cat_feature_cols]

# label sections (or simply Y) of the knn imputed dataframe,
# unpacked in the same order as cv19_cat_label_cols
(
    cv19_cat_knn_deaths_labels,
    cv19_cat_knn_ICUs_labels,
    cv19_cat_knn_hospitalizations_labels,
    cv19_cat_knn_recovered_labels,
) = [cv19_cat_knn_imp_df[col] for col in cv19_cat_label_cols]

# label sections (Y) of the iterative imputed dataframe, same order
(
    cv19_cat_iter_deaths_labels,
    cv19_cat_iter_ICUs_labels,
    cv19_cat_iter_hospitalizations_labels,
    cv19_cat_iter_recovered_labels,
) = [cv19_cat_iter_imp_df[col] for col in cv19_cat_label_cols]

print('Done Processing!')



# Setting up the training sets and testing sets


In [None]:

from sklearn.model_selection import train_test_split

# Notes: the sets can be named anything;
# obvious (if long) names are used here to keep track of what is what

# train_test_split hands back four pieces, always in this order:
#   Features_Training, Features_Testing, Labels_Training, Labels_Testing
# and is called as
#   train_test_split(Feature_Matrix, Labels_Matrix, test_size = #.##, random_state = ##)
# test_size = 0.1 means 10% for testing and 90% for training,
# test_size = 0.2 means 20% for testing and 80% for training


def _split_80_20(feature_matrix, labels):
    """Split one feature matrix / label pair into 80% training and 20% testing (random_state 2)."""
    return train_test_split(feature_matrix, labels, test_size=0.2, random_state=2)


# training and testing sets for the knn imputed data
(knn_deaths_features_training,
 knn_deaths_features_testing,
 knn_deaths_labels_training,
 knn_deaths_labels_testing) = _split_80_20(cv19_cat_knn_features, cv19_cat_knn_deaths_labels)

(knn_ICUs_features_training,
 knn_ICUs_features_testing,
 knn_ICUs_labels_training,
 knn_ICUs_labels_testing) = _split_80_20(cv19_cat_knn_features, cv19_cat_knn_ICUs_labels)

(knn_hospitalizations_features_training,
 knn_hospitalizations_features_testing,
 knn_hospitalizations_labels_training,
 knn_hospitalizations_labels_testing) = _split_80_20(cv19_cat_knn_features, cv19_cat_knn_hospitalizations_labels)

(knn_recovered_features_training,
 knn_recovered_features_testing,
 knn_recovered_labels_training,
 knn_recovered_labels_testing) = _split_80_20(cv19_cat_knn_features, cv19_cat_knn_recovered_labels)

# training and testing sets for the iterative w/ RFC imputed data
(iter_deaths_features_training,
 iter_deaths_features_testing,
 iter_deaths_labels_training,
 iter_deaths_labels_testing) = _split_80_20(cv19_cat_iter_features, cv19_cat_iter_deaths_labels)

(iter_ICUs_features_training,
 iter_ICUs_features_testing,
 iter_ICUs_labels_training,
 iter_ICUs_labels_testing) = _split_80_20(cv19_cat_iter_features, cv19_cat_iter_ICUs_labels)

(iter_hospitalizations_features_training,
 iter_hospitalizations_features_testing,
 iter_hospitalizations_labels_training,
 iter_hospitalizations_labels_testing) = _split_80_20(cv19_cat_iter_features, cv19_cat_iter_hospitalizations_labels)

(iter_recovered_features_training,
 iter_recovered_features_testing,
 iter_recovered_labels_training,
 iter_recovered_labels_testing) = _split_80_20(cv19_cat_iter_features, cv19_cat_iter_recovered_labels)




# Training Logistic Regression Models

The code below trains and evaluates logistic regression models that predict deaths, once on the KNN-imputed data and once on the iteratively imputed data.

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score

def _fit_logistic_and_score(features_training, labels_training, features_testing, labels_testing):
    """Train a logistic regression model and evaluate it on the testing set.

    The steps are:
      1. construct the model (the max_iter parameter can be omitted)
      2. model.fit(Features_Training, Labels_Training) trains the model
      3. model.predict(Features_Testing) predicts the labels of the testing features
      4. model.predict_proba(Features_Testing) gives the class probabilities;
         Logistic Regression can do this, some algorithms cannot
      5. metrics.roc_curve(Labels_Testing, probabilities[:, 1], pos_label=1) gives
         FPR (false positive rate), TPR (true positive rate) and the thresholds
         (the decision boundaries that produce each FPR/TPR pair)
      6. metrics.auc(FPR, TPR) gives the area under the ROC curve
      7. accuracy_score(Labels_Testing, predictions) gives the accuracy

    About step 5: the probabilities come back as a list of pairs. Here the 2nd entry
    of each pair belongs to the positive label, and since "1" marks positive,
    pos_label is set to 1. If the positive label is not a number, use
    pos_label='name value of positive'; if the positive label is the 1st entry of
    the pair, use probabilities[:, 0] instead.

    Returns, in order: the fitted model, predictions, probabilities,
    FPR, TPR, thresholds, AUC, accuracy.
    """
    model = LogisticRegression(max_iter=1500)
    model.fit(features_training, labels_training)

    predicted_labels = model.predict(features_testing)
    class_probabilities = model.predict_proba(features_testing)

    fpr, tpr, thresholds = metrics.roc_curve(labels_testing, class_probabilities[:, 1], pos_label=1)
    area_under_curve = metrics.auc(fpr, tpr)
    accuracy = accuracy_score(labels_testing, predicted_labels)

    return (model, predicted_labels, class_probabilities,
            fpr, tpr, thresholds, area_under_curve, accuracy)


# deaths model on the knn imputed data
(knn_deaths_lgr_clssr,
 knn_deaths_features_predictions,
 knn_deaths_features_probabilities,
 knn_deaths_FPR,
 knn_deaths_TPR,
 knn_deaths_Thresholds,
 knn_deaths_AUC,
 knn_deaths_features_accuracy) = _fit_logistic_and_score(
    knn_deaths_features_training, knn_deaths_labels_training,
    knn_deaths_features_testing, knn_deaths_labels_testing)

# deaths model on the iterative imputed data
(iter_deaths_lgr_clssr,
 iter_deaths_features_predictions,
 iter_deaths_features_probabilities,
 iter_deaths_FPR,
 iter_deaths_TPR,
 iter_deaths_Thresholds,
 iter_deaths_AUC,
 iter_deaths_features_accuracy) = _fit_logistic_and_score(
    iter_deaths_features_training, iter_deaths_labels_training,
    iter_deaths_features_testing, iter_deaths_labels_testing)



In [None]:

# report the test-set accuracy of each deaths model, rounded to 4 decimal places
for caption, score in (
    ('knn imputation deaths accuracy score: ', knn_deaths_features_accuracy),
    ('iterative imputation deaths accuracy score: ', iter_deaths_features_accuracy),
):
    print(caption, '%0.4f' % score)




# Using matplotlib to plot graphs



In [None]:

import matplotlib.pyplot as plt

%matplotlib inline
plt.figure()

# you need FPR, TPR, and AUC to plot graphs like below.

# ROC curve of the deaths model trained on the knn imputed data
plt.plot(knn_deaths_FPR, knn_deaths_TPR, color='red', lw=3,
         label='KNN Imputated Deaths ROC Curve ( area = %0.4f)' % knn_deaths_AUC)

# ROC curve of the deaths model trained on the iterative imputed data
plt.plot(iter_deaths_FPR, iter_deaths_TPR, color='blue', lw=3,
         label='Iteratived Imputated Deaths ROC Curve ( area = %0.4f)' % iter_deaths_AUC)

# diagonal reference line: where TPR equals FPR (what random guessing would give).
# It is drawn in gray with its own legend entry so that it cannot be confused
# with the blue iterative-imputation curve.
plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--', label='Chance')


plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

plt.show()
