
# Note for general machine learning procedures
# Note that the cell outputs and file links have been erased, and the notebook kernel has been reset to protect confidential data


In [None]:


import numpy as np
import pandas as pd
import time

# Load the categorical COVID-19 dataset.
# NOTE: the file path was deliberately stripped from this shared copy of the
# notebook; pass the CSV location to read_csv() before running.
cv19_cat_df = pd.read_csv()

# Normalise missing (null) values to numpy "not a number" (NaN), which is the
# marker the KNN imputer is told to look for (missing_values=np.nan).
# fillna() returns a new DataFrame instead of modifying the frame in place, so
# the result has to be assigned back -- a bare call just throws the result away.
cv19_cat_df = cv19_cat_df.fillna(np.nan)





# KNN Imputation


In [None]:

from sklearn.impute import KNNImputer

# Construct the KNN imputer using the single nearest neighbour.
# `copy` must be a real boolean. The string 'False' is truthy (so it behaved
# like copy=True) and is rejected outright by scikit-learn releases that
# validate constructor parameters. copy=True keeps the effective behaviour and
# guarantees cv19_cat_df itself is not overwritten by the imputation.
knn1_imputer = KNNImputer(
    missing_values=np.nan,
    n_neighbors=1,
    weights='uniform',
    metric='nan_euclidean',
    copy=True,
)

# fit (train) the imputer on the data, then transform performs the imputation
cv19_cat_knn_imp = knn1_imputer.fit_transform(cv19_cat_df)

# wrap the imputed values in a new dataframe that reuses the original column names
cv19_cat_knn_imp_df = pd.DataFrame(cv19_cat_knn_imp, columns=cv19_cat_df.columns)



# Setting up the dataframe sections (slices) for testing and training the model, <br> and doing Chi Square Test 


In [None]:


# start from every column name in the original data
cv19_cat_feature_cols = list(cv19_cat_df.columns)

# strip out the label columns (also called targets -- these are what we are
# trying to predict) so that only feature names are left
for label_name in ('evol_Death', 'evol_ICU_admission', 'evol_Hospitalization', 'evol_Recovered'):
    cv19_cat_feature_cols.remove(label_name)

# feature section (also called feature matrix or X), taken from the imputed dataframe
cv19_cat_knn_features = cv19_cat_knn_imp_df[cv19_cat_feature_cols]

# label sections (or simply Y), one per outcome
cv19_cat_knn_deaths_labels = cv19_cat_knn_imp_df['evol_Death']
cv19_cat_knn_ICUs_labels = cv19_cat_knn_imp_df['evol_ICU_admission']
cv19_cat_knn_hospitalizations_labels = cv19_cat_knn_imp_df['evol_Hospitalization']
cv19_cat_knn_recovered_labels = cv19_cat_knn_imp_df['evol_Recovered']

# plain arrays for the chi square tests: the features first ...
cv19_cat_array = cv19_cat_knn_features.values

# ... then the death, ICU and hospitalization labels
cv19_cat_death_lb_array = cv19_cat_knn_deaths_labels.values
cv19_cat_ICU_lb_array = cv19_cat_knn_ICUs_labels.values
cv19_cat_hospitalizations_lb_array = cv19_cat_knn_hospitalizations_labels.values




# Chi2 Square Test (For Death Outcomes)


In [None]:

# the following code does feature selection
# based on the goodness of fit tests using chi squared tests
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# set up the selector that keeps the k = 5 best-scoring features
best5_chi2_deaths_label_test = SelectKBest(score_func=chi2, k=5)

# run/fit the selector with the features and the death labels
best5_chi2_deaths_label_test.fit(cv19_cat_array, cv19_cat_death_lb_array)

# positions of the kept features; these index the columns of cv19_cat_array
selected_deaths_features_indices_top5 = best5_chi2_deaths_label_test.get_support(indices=True)

# chi squared scores of the kept features
selected_deaths_features_scores_top5 = best5_chi2_deaths_label_test.scores_[selected_deaths_features_indices_top5]

# Translate the positions back into column names.
# cv19_cat_array comes from cv19_cat_knn_features (label columns removed), so the
# names must be read from that frame. Reading them from cv19_cat_knn_imp_df, which
# still holds the label columns, returns the wrong name for every feature located
# after a label column.
selected_deaths_features_top5 = list(
    cv19_cat_knn_features.columns[selected_deaths_features_indices_top5]
)


In [None]:

# show the kept column positions, their chi squared scores and their names,
# one per line
print(
    selected_deaths_features_indices_top5,
    selected_deaths_features_scores_top5,
    selected_deaths_features_top5,
    sep='\n',
)



# Using matplotlib to graph the Chi Square Test Results


In [None]:

import matplotlib.pyplot as plt

# Horizontal bar graph of the chi squared scores of the top five
# features selected for the death outcomes.

plt.rcdefaults()
top5_figure, top5_axes = plt.subplots()

# one bar position per selected feature
top5_y_pos = np.arange(len(selected_deaths_features_top5))
top5_axes.barh(top5_y_pos, selected_deaths_features_scores_top5, align='center', color='red')

# label every bar with its feature name and flip the axis so position 0 is drawn at the top
top5_axes.set_yticks(top5_y_pos)
top5_axes.set_yticklabels(selected_deaths_features_top5)
top5_axes.invert_yaxis()

top5_axes.set(
    xlabel='Chi Squared Test Scores',
    title='Chi Squared Test Scores of The Top Five\nFeatures of Death Outcomes Using KNN Imputation',
)
plt.show()


# Chi2 Square Test (For ICU and hospitalization Outcomes)


In [None]:

# NOTE: the positions returned by get_support() index the columns of
# cv19_cat_array, which is built from cv19_cat_knn_features (label columns
# removed). Names are therefore read from cv19_cat_knn_features.columns; reading
# them from cv19_cat_knn_imp_df.columns (which still includes the labels) returns
# the wrong name for every feature located after a label column.

# ---- ICU outcomes ----

# set up the selector that keeps the k = 5 best-scoring features
best5_chi2_ICUs_label_test = SelectKBest(score_func=chi2, k=5)

# run/fit the selector with the features and the ICU labels
best5_chi2_ICUs_label_test.fit(cv19_cat_array, cv19_cat_ICU_lb_array)

# positions of the kept features within the feature matrix
selected_ICUs_features_indices_top5 = best5_chi2_ICUs_label_test.get_support(indices=True)

# chi squared scores of the kept features
selected_ICUs_features_scores_top5 = best5_chi2_ICUs_label_test.scores_[selected_ICUs_features_indices_top5]

# names of the kept features
selected_ICUs_features_top5 = list(
    cv19_cat_knn_features.columns[selected_ICUs_features_indices_top5]
)

# ---- hospitalization outcomes ----

# set up the selector that keeps the k = 5 best-scoring features
best5_chi2_hospitalizations_label_test = SelectKBest(score_func=chi2, k=5)

# run/fit the selector with the features and the hospitalization labels
best5_chi2_hospitalizations_label_test.fit(cv19_cat_array, cv19_cat_hospitalizations_lb_array)

# positions of the kept features within the feature matrix
selected_hospitalizations_features_indices_top5 = best5_chi2_hospitalizations_label_test.get_support(indices=True)

# chi squared scores of the kept features
selected_hospitalizations_features_scores_top5 = best5_chi2_hospitalizations_label_test.scores_[selected_hospitalizations_features_indices_top5]

# names of the kept features
selected_hospitalizations_features_top5 = list(
    cv19_cat_knn_features.columns[selected_hospitalizations_features_indices_top5]
)


In [None]:

# Horizontal bar graph of the chi squared scores of the top five
# features selected for the ICU outcomes.

plt.rcdefaults()
top5_figure, top5_axes = plt.subplots()

# one bar position per selected feature
top5_y_pos = np.arange(len(selected_ICUs_features_top5))
top5_axes.barh(top5_y_pos, selected_ICUs_features_scores_top5, align='center', color='orange')

# label every bar with its feature name and flip the axis so position 0 is drawn at the top
top5_axes.set_yticks(top5_y_pos)
top5_axes.set_yticklabels(selected_ICUs_features_top5)
top5_axes.invert_yaxis()

top5_axes.set(
    xlabel='Chi Squared Test Scores',
    title='Chi Squared Test Scores of The Top Five\nFeatures of ICU Outcomes Using KNN Imputation',
)
plt.show()


In [None]:

# Horizontal bar graph of the chi squared scores of the top five
# features selected for the hospitalization outcomes.

plt.rcdefaults()
top5_figure, top5_axes = plt.subplots()

# one bar position per selected feature
top5_y_pos = np.arange(len(selected_hospitalizations_features_top5))
top5_axes.barh(top5_y_pos, selected_hospitalizations_features_scores_top5, align='center', color='yellow')

# label every bar with its feature name and flip the axis so position 0 is drawn at the top
top5_axes.set_yticks(top5_y_pos)
top5_axes.set_yticklabels(selected_hospitalizations_features_top5)
top5_axes.invert_yaxis()

top5_axes.set(
    xlabel='Chi Squared Test Scores',
    title='Chi Squared Test Scores of The Top Five\nFeatures of Hospitalizations Outcomes Using KNN Imputation',
)
plt.show()



# Setting up the training sets and testing sets


In [None]:

from sklearn.model_selection import train_test_split

# Note: you can name the sets whatever you like
# I choose to use obvious names to make things easier for myself

# the format is
# Features_Training, Features_Testing, Labels_Training, Labels_Testing = train_test_split(parameters)
# the parameters follow the format ( Feature_Matrix, Labels_Matrix, test_size = #.##, random_state = ## )
# test_size = 0.1 means 10% for testing and 90% for training, 0.2 means 20% for testing and 80% for training

# Training and testing sets for the KNN imputed data, one split per outcome.
# Every split uses the same feature matrix, keeps 20% of the rows for testing
# and fixes random_state to 2.

# death outcomes
(
    knn_deaths_features_training,
    knn_deaths_features_testing,
    knn_deaths_labels_training,
    knn_deaths_labels_testing,
) = train_test_split(
    cv19_cat_knn_features,
    cv19_cat_knn_deaths_labels,
    test_size=0.2,
    random_state=2,
)

# ICU outcomes
(
    knn_ICUs_features_training,
    knn_ICUs_features_testing,
    knn_ICUs_labels_training,
    knn_ICUs_labels_testing,
) = train_test_split(
    cv19_cat_knn_features,
    cv19_cat_knn_ICUs_labels,
    test_size=0.2,
    random_state=2,
)

# hospitalization outcomes
(
    knn_hospitalizations_features_training,
    knn_hospitalizations_features_testing,
    knn_hospitalizations_labels_training,
    knn_hospitalizations_labels_testing,
) = train_test_split(
    cv19_cat_knn_features,
    cv19_cat_knn_hospitalizations_labels,
    test_size=0.2,
    random_state=2,
)

# recovered outcomes
(
    knn_recovered_features_training,
    knn_recovered_features_testing,
    knn_recovered_labels_training,
    knn_recovered_labels_testing,
) = train_test_split(
    cv19_cat_knn_features,
    cv19_cat_knn_recovered_labels,
    test_size=0.2,
    random_state=2,
)






# Training Logistic Regression Models

The code below handles the death, ICU, and hospitalization outcomes for the KNN-imputed data.


In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score

# the order goes as follows:

# construct the model - max_iter parameter can be omitted
# model.fit (Features_Training, Labels_Training) - this trains the model
# predictions = model.predict (Features_Testing) - this will use the model
# to predict the labels of the testing features set
# probabilities = model.predict_proba (Features_Testing)
# algorithms like Logistic Regression can do this - some algorithms cannot do this
# FPR, TPR, Thresholds = metrics.roc_curve ( Labels_Testing, probabilities[:, 1], pos_label = 1 )
# AUC = metrics.auc (FPR, TPR)
# accuracy = accuracy_score (Labels_Testing, predictions)

# Note:
# FPR, TPR, Thresholds = metrics.roc_curve ( Labels_Testing, probabilities[:,1], pos_label = 1 )
# FPR is false positive rate, TPR is True positive rate, thresholds is the boundary that makes the decisions for each
# normally in the probabilities list, it is a list of pairs. in this case the 2nd of the pair is the positive label
# since "1" marks the positive class, set pos_label = 1
# if the positive label is not a number, set pos_label = 'name value of positive'
# if the positive label is the 1st of the pair, set probabilities[:,0]




# Logistic Regression for Death Outcomes.


In [None]:

# construct and train the classifier on the death-outcome training split
# (fit returns the fitted estimator itself)
knn_deaths_lgr_clssr = LogisticRegression(max_iter=1500).fit(
    knn_deaths_features_training,
    knn_deaths_labels_training,
)

# predicted labels and per-class probabilities for the testing features
knn_deaths_features_predictions = knn_deaths_lgr_clssr.predict(knn_deaths_features_testing)
knn_deaths_features_probabilities = knn_deaths_lgr_clssr.predict_proba(knn_deaths_features_testing)

# share of testing labels predicted correctly
knn_deaths_features_accuracy = accuracy_score(
    knn_deaths_labels_testing,
    knn_deaths_features_predictions,
)

# ROC curve built from the second probability column, with label 1 as positive,
# and the area under that curve
knn_deaths_FPR, knn_deaths_TPR, knn_deaths_Thresholds = metrics.roc_curve(
    knn_deaths_labels_testing,
    knn_deaths_features_probabilities[:, 1],
    pos_label=1,
)
knn_deaths_AUC = metrics.auc(knn_deaths_FPR, knn_deaths_TPR)



# Logistic Regression for ICU Outcomes.


In [None]:

# construct and train the classifier on the ICU-outcome training split
# (fit returns the fitted estimator itself)
knn_ICUs_lgr_clssr = LogisticRegression(max_iter=1500).fit(
    knn_ICUs_features_training,
    knn_ICUs_labels_training,
)

# predicted labels and per-class probabilities for the testing features
knn_ICUs_features_predictions = knn_ICUs_lgr_clssr.predict(knn_ICUs_features_testing)
knn_ICUs_features_probabilities = knn_ICUs_lgr_clssr.predict_proba(knn_ICUs_features_testing)

# share of testing labels predicted correctly
knn_ICUs_features_accuracy = accuracy_score(
    knn_ICUs_labels_testing,
    knn_ICUs_features_predictions,
)

# ROC curve built from the second probability column, with label 1 as positive,
# and the area under that curve
knn_ICUs_FPR, knn_ICUs_TPR, knn_ICUs_Thresholds = metrics.roc_curve(
    knn_ICUs_labels_testing,
    knn_ICUs_features_probabilities[:, 1],
    pos_label=1,
)
knn_ICUs_AUC = metrics.auc(knn_ICUs_FPR, knn_ICUs_TPR)



# Logistic Regression for Hospitalizations Outcomes.


In [None]:

# construct and train the classifier on the hospitalization-outcome training split
# (fit returns the fitted estimator itself)
knn_HZs_lgr_clssr = LogisticRegression(max_iter=1500).fit(
    knn_hospitalizations_features_training,
    knn_hospitalizations_labels_training,
)

# predicted labels and per-class probabilities for the testing features
knn_HZs_features_predictions = knn_HZs_lgr_clssr.predict(knn_hospitalizations_features_testing)
knn_HZs_features_probabilities = knn_HZs_lgr_clssr.predict_proba(knn_hospitalizations_features_testing)

# share of testing labels predicted correctly
knn_HZs_features_accuracy = accuracy_score(
    knn_hospitalizations_labels_testing,
    knn_HZs_features_predictions,
)

# ROC curve built from the second probability column, with label 1 as positive,
# and the area under that curve
knn_HZs_FPR, knn_HZs_TPR, knn_HZs_Thresholds = metrics.roc_curve(
    knn_hospitalizations_labels_testing,
    knn_HZs_features_probabilities[:, 1],
    pos_label=1,
)
knn_HZs_AUC = metrics.auc(knn_HZs_FPR, knn_HZs_TPR)


In [None]:

# report each accuracy score rounded to four decimal places
print('The Accuracy Score for Death Outcomes is: ', '{:0.4f}'.format(knn_deaths_features_accuracy))
print('The Accuracy Score for ICU Outcomes is: ', '{:0.4f}'.format(knn_ICUs_features_accuracy))
print('The Accuracy Score for Hospitalization Outcomes is: ', '{:0.4f}'.format(knn_HZs_features_accuracy))



# Using matplotlib to graph the False Positive Rate (FPR), True Positive Rate (TPR), and AUC (Area under the [ROC] Curve).
# Note: Algorithms that provide the predict_proba function make AUC values easier to calculate.
# If predict_proba is not available, you will need to use sklearn confusion matrix to calculate them.


In [None]:

%matplotlib inline

plt.rcParams.update({'font.size': 16})
plt.figure(figsize=(12, 8))

# Each ROC curve needs its FPR, TPR and AUC values.
# One entry per outcome: (FPR, TPR, line colour, legend label including the AUC).
roc_curves = (
    (knn_deaths_FPR, knn_deaths_TPR, 'red',
     'KNN Imputated Deaths ROC Curve ( area = %0.4f)' % knn_deaths_AUC),
    (knn_ICUs_FPR, knn_ICUs_TPR, 'orange',
     'KNN Imputated ICUs ROC Curve ( area = %0.4f)' % knn_ICUs_AUC),
    (knn_HZs_FPR, knn_HZs_TPR, 'yellow',
     'KNN Imputated Hospitalizations ROC Curve ( area = %0.4f)' % knn_HZs_AUC),
)
for curve_fpr, curve_tpr, curve_color, curve_label in roc_curves:
    plt.plot(curve_fpr, curve_tpr, color=curve_color, lw=3, label=curve_label)

# dashed diagonal drawn as the random-guess reference line
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--', label='Random Guess Line')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")

plt.show()

In [None]:



# keep only the top five chi squared features for the death outcomes
selected_deaths_features = cv19_cat_knn_imp_df[selected_deaths_features_top5]
# quick preview; a notebook only renders it when it is the last expression of the cell
selected_deaths_features.head()

# same 80/20 split and random_state as the full-feature models
(
    selected_deaths_features_train,
    selected_deaths_features_test,
    selected_deaths_labels_train,
    selected_deaths_labels_test,
) = train_test_split(
    selected_deaths_features,
    cv19_cat_knn_deaths_labels,
    test_size=0.2,
    random_state=2,
)

# construct and train the classifier on the reduced feature set
# (fit returns the fitted estimator itself)
knn5_deaths_lgr_clssr = LogisticRegression(max_iter=1500).fit(
    selected_deaths_features_train,
    selected_deaths_labels_train,
)

# predicted labels and per-class probabilities for the testing features
knn5_deaths_features_predictions = knn5_deaths_lgr_clssr.predict(selected_deaths_features_test)
knn5_deaths_features_probabilities = knn5_deaths_lgr_clssr.predict_proba(selected_deaths_features_test)

# share of testing labels predicted correctly
knn5_deaths_features_accuracy = accuracy_score(
    selected_deaths_labels_test,
    knn5_deaths_features_predictions,
)

# ROC curve built from the second probability column, with label 1 as positive,
# and the area under that curve
knn5_deaths_FPR, knn5_deaths_TPR, knn5_deaths_Thresholds = metrics.roc_curve(
    selected_deaths_labels_test,
    knn5_deaths_features_probabilities[:, 1],
    pos_label=1,
)
knn5_deaths_AUC = metrics.auc(knn5_deaths_FPR, knn5_deaths_TPR)

