
# Code Notes on General Death Outcomes<br>with a Focus on Chi Square vs. RFE
# Note that the cell outputs and file links have been erased, and the notebook kernel has been reset to protect confidential data


In [None]:

import numpy as np
import pandas as pd
import time

# Load the dataset into a DataFrame.
# NOTE: the file path was erased from this notebook to protect confidential
# data -- pass the CSV path as the first argument to read_csv before running.
cv19_cat_df = pd.read_csv()

# Change the missing (null) values to numpy "not a number" (np.nan).
# fillna returns a new DataFrame instead of modifying the original, so the
# result must be assigned back for the replacement to take effect.
cv19_cat_df = cv19_cat_df.fillna(np.nan)

# preview only the first rows rather than displaying the whole table
cv19_cat_df.head()




# KNN Imputation


In [None]:


from sklearn.impute import KNNImputer

# Construct the KNN imputer with 1 nearest neighbor.
# `copy` has to be a real boolean: the string 'False' is a non-empty string,
# so it is truthy (it never actually requested in-place imputation) and
# scikit-learn releases that validate parameter types reject it with an error.
# copy=True matches how the string was interpreted and leaves cv19_cat_df
# untouched, which matters because its column names are reused below.
knn1_imputer = KNNImputer(missing_values=np.nan, n_neighbors=1, weights='uniform', metric='nan_euclidean', copy=True)

# train (fit) the model, then transform does the imputation;
# the result is a plain array, so the column names are lost at this point
cv19_cat_knn_imp = knn1_imputer.fit_transform(cv19_cat_df)

# wrap the imputed array in a new dataframe, restoring the original column names
cv19_cat_knn_imp_df = pd.DataFrame(cv19_cat_knn_imp, columns=cv19_cat_df.columns)






# Setting up the dataframe sections (slices) for testing and training the model, <br> and doing Chi Square Test 


In [None]:


# Start from every column name of the original data, then strip out the
# label columns (also called targets: the outcomes we are trying to predict).
# list.remove raises ValueError if a label column is missing from the data.
cv19_cat_feature_cols = cv19_cat_df.columns.tolist()
for label_col in ('evol_Death', 'evol_ICU_admission', 'evol_Hospitalization', 'evol_Recovered'):
    cv19_cat_feature_cols.remove(label_col)

# feature section (feature matrix, X) and death label section (Y),
# both taken from the KNN-imputed dataframe
cv19_cat_knn_features = cv19_cat_knn_imp_df[cv19_cat_feature_cols]
cv19_cat_knn_deaths_labels = cv19_cat_knn_imp_df['evol_Death']

# plain numpy versions of X and Y, passed to the feature-selection steps
cv19_cat_array = cv19_cat_knn_features.values
cv19_cat_death_lb_array = cv19_cat_knn_deaths_labels.values




# Chi2 Square Test (For Death Outcomes)


In [None]:

# the following code does feature selection
# based on goodness-of-fit tests using the chi squared test
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# set up the chi squared selector to keep the k = 10 best-scoring features
best10_chi2_deaths_label_test = SelectKBest(score_func=chi2, k=10)

# run/fit the selector with the feature array and the death labels
best10_chi2_deaths_label_test.fit(cv19_cat_array, cv19_cat_death_lb_array)

# positions, within the feature array, of the features kept by the selector
selected_chi2_deaths_features_indices_top10 = best10_chi2_deaths_label_test.get_support(indices=True)

# chi squared scores of the kept features, in the same order as the indices
selected_chi2_deaths_features_scores_top10 = best10_chi2_deaths_label_test.scores_[selected_chi2_deaths_features_indices_top10]

# Translate the positions into feature names.
# The positions refer to columns of the feature matrix the selector was fitted
# on, so the names must come from cv19_cat_knn_features. Looking them up in
# cv19_cat_knn_imp_df (which still contains the four label columns) gives the
# wrong names whenever a label column sits before a feature column.
selected_chi2_deaths_features_top10 = list(cv19_cat_knn_features.columns[selected_chi2_deaths_features_indices_top10])


In [None]:

import matplotlib.pyplot as plt

# Horizontal bar chart of the chi squared scores
# for the top ten selected features.

plt.rcdefaults()
top10_chi2_figure, top10_chi2_axes = plt.subplots()

# one bar per selected feature, placed at positions 0..n-1
top10_chi2_y_pos = np.arange(len(selected_chi2_deaths_features_top10))
top10_chi2_axes.barh(
    top10_chi2_y_pos,
    width=selected_chi2_deaths_features_scores_top10,
    color='red',
    align='center',
)

# label every bar with its feature name, and flip the y axis so that the
# first feature in the list is drawn at the top
top10_chi2_axes.set_yticks(top10_chi2_y_pos)
top10_chi2_axes.set_yticklabels(selected_chi2_deaths_features_top10)
top10_chi2_axes.invert_yaxis()

top10_chi2_axes.set(
    xlabel='Chi Squared Test Scores',
    title='Chi Squared Test Scores of The Top Ten\nFeatures of Death Outcomes Using KNN Imputation',
)
plt.show()



# Recursive Feature Elimination (RFE) w/ Random Forest Classifier (RFC)

# Note: Although Support Vector Machines (SVM) Classifiers are recommended for this dataset, Random Forest is a good algorithm for a general dataset.

In [None]:

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# set up the RFE model to keep 10 features, eliminating one feature per step;
# the random forest has no fixed seed, so the selection can differ between runs
rfe_deaths_label_test = RFE(estimator=RandomForestClassifier(), n_features_to_select=10, step=1)

# run/fit the model with the feature array and the death labels
rfe_deaths_label_test.fit(cv19_cat_array, cv19_cat_death_lb_array)

# positions, within the feature array, of the features kept by the model
selected_rfe_deaths_features_indices_top10 = rfe_deaths_label_test.get_support(indices=True)

# RFE ranks for every feature in the feature array (not only the kept ones)
selected_rfe_deaths_features_ranks = rfe_deaths_label_test.ranking_

# Translate the positions into feature names.
# This uses the index variable defined above (the previous loop referenced an
# undefined name and raised NameError) and reads the names from the feature
# matrix, because the positions refer to its columns rather than to the full
# imputed dataframe that still contains the label columns.
selected_top10_rfe_deaths_features = list(cv19_cat_knn_features.columns[selected_rfe_deaths_features_indices_top10])

print('Done Processing!')


# Note: Unlike the Chi Squared Test, which selects features by test score,<br> Recursive Feature Elimination assigns ranks to features,<br>with 1 being the best rank. <br> Also note that, because Random Forest is used for ranking, the results may change from run to run.


In [None]:

# Table of RFE ranks, one row per feature (indexed by feature name),
# ordered from the best rank (1) downwards.
rfe_ranks_df = pd.DataFrame(
    selected_rfe_deaths_features_ranks,
    index=cv19_cat_knn_features.columns,
    columns=['Feature Rank'],
).sort_values(by='Feature Rank', ascending=True)

# show the 15 best-ranked features
rfe_ranks_df.head(15)



# Recursive Feature Elimination w/ Cross Validation (RFECV)<br>Using the Random Forest Classifier (RFC)<br> Note: RFECV selects at least k features (k is the configured minimum), so it may select more than k features.



In [None]:

from sklearn.feature_selection import RFECV

# Unlike RFE, RFECV selects AT MINIMUM k features (min_features_to_select = 8)
# and may keep more. The setup is otherwise similar to RFE; cv = 10 sets the
# number of cross validation folds, and n_jobs sets how many cpu cores handle
# those folds (-1 requests all available cores).
rfecv_for_deaths = RFECV(estimator=RandomForestClassifier(), step=1, min_features_to_select=8, cv=10, n_jobs=-1)

rfecv_for_deaths.fit(cv19_cat_array, cv19_cat_death_lb_array)

# positions, within the feature array, of the features kept by the model
rfecv_selected_top_deaths_features_indices = rfecv_for_deaths.get_support(indices=True)

# Cross validation scores per number of features tried.
# Newer scikit-learn releases expose these through cv_results_ and no longer
# provide grid_scores_; fall back to grid_scores_ on older releases.
if hasattr(rfecv_for_deaths, 'cv_results_'):
    rfecv_deaths_features_grid_scores = rfecv_for_deaths.cv_results_['mean_test_score']
else:
    rfecv_deaths_features_grid_scores = rfecv_for_deaths.grid_scores_

# RFECV ranks for every feature in the feature array
rfecv_deaths_features_rankings = rfecv_for_deaths.ranking_

# Translate the positions into feature names. The positions refer to columns
# of the feature matrix, so the names are read from cv19_cat_knn_features and
# not from the full imputed dataframe that still contains the label columns.
rfecv_selected_top_deaths_features = list(cv19_cat_knn_features.columns[rfecv_selected_top_deaths_features_indices])

print('Done Processing!')
    

In [None]:

# Table of RFECV ranks, one row per feature (indexed by feature name),
# ordered from the best rank (1) downwards.
rfecv_ranks_df = pd.DataFrame(
    rfecv_deaths_features_rankings,
    index=cv19_cat_knn_features.columns,
    columns=['Feature Rank'],
).sort_values(by='Feature Rank', ascending=True)

# show the 20 best-ranked features
rfecv_ranks_df.head(20)



# Dataframe section (slice) of the top 10 death outcome features<br>selected by Chi Squared Test


In [None]:

# keep all rows, but only the columns picked by the chi squared test
chi2_10_features = cv19_cat_knn_imp_df.loc[:, selected_chi2_deaths_features_top10]
chi2_10_features.head(n=10)



# Dataframe section (slice) of the top 10 death outcome features<br>selected by Recursive Feature Elimination method.


In [None]:

# keep all rows, but only the columns picked by recursive feature elimination
rfe_10_features = cv19_cat_knn_imp_df.loc[:, selected_top10_rfe_deaths_features]
rfe_10_features.head(n=10)



# Dataframe section (slice) of the top death outcome features<br>selected by the Recursive Feature Elimination w/ Cross Validation method. <br>Note: more than 10 features may be selected; this set is kept only as a reference for comparison with the feature sets selected by the other two methods.


In [None]:

# keep all rows, but only the columns picked by RFECV
# (the selection may hold more than 10 columns)
refcv_10_features = cv19_cat_knn_imp_df.loc[:, rfecv_selected_top_deaths_features]
refcv_10_features.head(n=10)



# Setting up the Training and Testing Sets.<br> Note: 20% testing size, 80% training, random state = 2.


In [None]:

from sklearn.model_selection import train_test_split

# Split each feature set, together with the death labels, into training and
# testing parts. Both calls use the same settings: 20% of the rows go to
# testing (test_size = 0.2) and the shuffle is seeded with random_state = 2.

# chi squared feature set
(
    chi2_10_deaths_features_training,
    chi2_10_deaths_features_testing,
    chi2_10_deaths_label_training,
    chi2_10_deaths_label_testing,
) = train_test_split(chi2_10_features, cv19_cat_knn_deaths_labels, test_size=0.2, random_state=2)

# RFE feature set
(
    rfe_10_deaths_features_training,
    rfe_10_deaths_features_testing,
    rfe_10_deaths_label_training,
    rfe_10_deaths_label_testing,
) = train_test_split(rfe_10_features, cv19_cat_knn_deaths_labels, test_size=0.2, random_state=2)




# Logistic Regression Models



In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score


def _fit_logreg_and_score(features_training, label_training, features_testing, label_testing):
    """Fit a logistic regression on the training split and score it on the testing split.

    Parameters:
        features_training, label_training: features and labels used to fit the model.
        features_testing, label_testing: features and labels used to evaluate it.

    Returns:
        A tuple (classifier, predictions, probabilities, fpr, tpr, thresholds,
        auc, accuracy), where predictions/probabilities are computed on the
        testing features, fpr/tpr/thresholds come from the ROC curve for the
        positive class (label 1), auc is the area under that curve and
        accuracy is the accuracy score of the predictions.
    """
    classifier = LogisticRegression(max_iter=1500)
    classifier.fit(features_training, label_training)
    predictions = classifier.predict(features_testing)
    probabilities = classifier.predict_proba(features_testing)
    # column 1 of predict_proba is used as the score for the positive class
    fpr, tpr, thresholds = metrics.roc_curve(label_testing, probabilities[:, 1], pos_label=1)
    auc = metrics.auc(fpr, tpr)
    accuracy = accuracy_score(label_testing, predictions)
    return classifier, predictions, probabilities, fpr, tpr, thresholds, auc, accuracy


# logistic regression on the chi squared feature set
(
    chi2_deaths_lgr_clssr,
    chi2_deaths_features_predictions,
    chi2_deaths_features_probabilities,
    chi2_deaths_FPR,
    chi2_deaths_TPR,
    chi2_deaths_Thresholds,
    chi2_deaths_AUC,
    chi2_deaths_features_accuracy,
) = _fit_logreg_and_score(
    chi2_10_deaths_features_training,
    chi2_10_deaths_label_training,
    chi2_10_deaths_features_testing,
    chi2_10_deaths_label_testing,
)

# logistic regression on the RFE feature set
(
    rfe_deaths_lgr_clssr,
    rfe_deaths_features_predictions,
    rfe_deaths_features_probabilities,
    rfe_deaths_FPR,
    rfe_deaths_TPR,
    rfe_deaths_Thresholds,
    rfe_deaths_AUC,
    rfe_deaths_features_accuracy,
) = _fit_logreg_and_score(
    rfe_10_deaths_features_training,
    rfe_10_deaths_label_training,
    rfe_10_deaths_features_testing,
    rfe_10_deaths_label_testing,
)

# each line reports its own model's accuracy (the chi square line previously
# printed the RFE accuracy by mistake)
print('Deaths Accuracy Score of Chi Square Features: ', '%0.4f' % chi2_deaths_features_accuracy, '.')
print('Deaths Accuracy Score of RFE Features: ', '%0.4f' % rfe_deaths_features_accuracy, '.')


In [None]:

%matplotlib inline

plt.rcParams['font.size'] = 16
plt.figure(figsize=(12, 8))

# Each ROC curve is drawn from its FPR and TPR arrays; the AUC is shown
# in the legend label.
for roc_fpr, roc_tpr, roc_color, roc_label in (
    (chi2_deaths_FPR, chi2_deaths_TPR, 'red',
     'Chi2 Top 10 Death Features ROC Curve ( area = %0.4f)' % chi2_deaths_AUC),
    (rfe_deaths_FPR, rfe_deaths_TPR, 'orange',
     'RFE Top 10 ROC Curve ( area = %0.4f)' % rfe_deaths_AUC),
):
    plt.plot(roc_fpr, roc_tpr, color=roc_color, lw=3, label=roc_label)

# dashed diagonal used as the random-guess reference
plt.plot([0, 1], [0, 1], color='blue', lw=2, label='Random Guess Line', linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")

plt.show()
