

# Notes on RFE and RFECV from sklearn
# Note that the cell outputs and file links have been erased, and the notebook kernel has been reset to protect confidential data

In [None]:

import numpy as np
import pandas as pd
import time

# Remember to fill in the file path: it was erased from this notebook, and
# pd.read_csv() raises a TypeError until a path is supplied.
cv19_cat_df = pd.read_csv()

# Convert the missing (null) values to numpy "not a number" (nan).
# fillna returns a new dataframe by default instead of modifying in place,
# so the result has to be assigned back for the conversion to take effect.
cv19_cat_df = cv19_cat_df.fillna(np.nan)



# KNN Imputation



In [None]:


from sklearn.impute import KNNImputer

# Construct the KNN imputer with 1 nearest neighbor.
# copy must be a real boolean: the string 'False' is truthy, and it is rejected
# outright by scikit-learn releases that validate parameter types.
# copy=True leaves cv19_cat_df untouched, which matters because the iterative
# imputer is fitted later on this same, still-incomplete dataframe.
knn1_imputer = KNNImputer(
    missing_values=np.nan,
    n_neighbors=1,
    weights='uniform',
    metric='nan_euclidean',
    copy=True,
)

# Train (fit) the model, then transform performs the imputation.
cv19_cat_knn_imp = knn1_imputer.fit_transform(cv19_cat_df)

# fit_transform returns a plain array, so rebuild a dataframe around the
# imputed values using the original column names.
cv19_cat_knn_imp_df = pd.DataFrame(cv19_cat_knn_imp, columns=cv19_cat_df.columns)



# Iterative Imputation


In [None]:

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier

# Any other predicting algorithm can be swapped in as the estimator.
# 10 iterations should be decent - more iterations will take longer.
# The random state number is a free choice as well.
# Build the iterative imputer with a Random Forest Classifier (RFC) as its estimator.
iterative_imputer = IterativeImputer(
    estimator=RandomForestClassifier(),
    initial_strategy='most_frequent',
    max_iter=10,
    random_state=2,
)

# As with the KNN imputer: fit trains the model, transform does the imputing.
cv19_cat_iter_imp = iterative_imputer.fit_transform(cv19_cat_df)

# Wrap the imputed values in a dataframe that reuses the original column names.
cv19_cat_iter_imp_df = pd.DataFrame(
    data=cv19_cat_iter_imp,
    columns=cv19_cat_df.columns,
)

print('Done Processing!')


In [None]:

# The label columns (also called targets: these are what we are trying to predict).
cv19_cat_label_cols = (
    'evol_Death',
    'evol_ICU_admission',
    'evol_Hospitalization',
    'evol_Recovered',
)

# Start from every column name of the original data, then drop the labels so
# that only the feature names remain.
cv19_cat_feature_cols = [*cv19_cat_df.columns]
for label_col in cv19_cat_label_cols:
    cv19_cat_feature_cols.remove(label_col)

# Feature section (also called feature matrix or X) of each imputed dataframe.
cv19_cat_knn_features, cv19_cat_iter_features = (
    imputed_df[cv19_cat_feature_cols]
    for imputed_df in (cv19_cat_knn_imp_df, cv19_cat_iter_imp_df)
)

# Label sections (or simply Y) of the KNN-imputed dataframe,
# unpacked in the same order as cv19_cat_label_cols.
(
    cv19_cat_knn_deaths_labels,
    cv19_cat_knn_ICUs_labels,
    cv19_cat_knn_hospitalizations_labels,
    cv19_cat_knn_recovered_labels,
) = (cv19_cat_knn_imp_df[label_col] for label_col in cv19_cat_label_cols)

# Label sections (Y) of the iteratively imputed dataframe, same order.
(
    cv19_cat_iter_deaths_labels,
    cv19_cat_iter_ICUs_labels,
    cv19_cat_iter_hospitalizations_labels,
    cv19_cat_iter_recovered_labels,
) = (cv19_cat_iter_imp_df[label_col] for label_col in cv19_cat_label_cols)



# Recursive Feature Elimination (RFE) <br> We're going to try applying RFE to the hospitalization cases
# Note: this may take a while.

In [None]:


from sklearn.feature_selection import RFE

# Set up one RFE model per imputed dataset, each keeping 5 features and
# eliminating one feature per round (step = 1).
rfe_for_knn = RFE(estimator=RandomForestClassifier(), n_features_to_select=5, step=1)
rfe_for_iter = RFE(estimator=RandomForestClassifier(), n_features_to_select=5, step=1)

# Run/fit each model with its own features and hospitalization labels.
rfe_for_knn.fit(cv19_cat_knn_features, cv19_cat_knn_hospitalizations_labels)
rfe_for_iter.fit(cv19_cat_iter_features, cv19_cat_iter_hospitalizations_labels)

# Indices of the feature columns that each model kept.
selected_top5_knn_hospitalization_features_indices = rfe_for_knn.get_support(indices=True)
selected_top5_iter_hospitalization_features_indices = rfe_for_iter.get_support(indices=True)

# Translate the indices into feature names.
# Each list is built from its OWN model's indices, since the two models may
# keep different features. The indices are positions within the feature matrix
# that was passed to fit, so they are looked up in that matrix's columns rather
# than in the full dataframe, which still contains the label columns.
selected_top5_knn_hospitalization_features = [
    cv19_cat_knn_features.columns[index]
    for index in selected_top5_knn_hospitalization_features_indices
]
selected_top5_iter_hospitalization_features = [
    cv19_cat_iter_features.columns[index]
    for index in selected_top5_iter_hospitalization_features_indices
]

print('Done Processing!')


# Note: 
The selected features are based on a ranking. <br>
The higher-ranking features are kept. <br>
In RFE, the features are eliminated iteratively; in this case, one by one (step = 1).<br>
In RFE, the features selected may depend on the predicting model/algorithm used to rank the features.<br>
Here we happen to use a Random Forest Classifier (RFC) as the model.

In [None]:

# Show the feature names kept by RFE: KNN-imputed data first, then the
# iteratively imputed data, one list per line.
print(
    selected_top5_knn_hospitalization_features,
    selected_top5_iter_hospitalization_features,
    sep='\n',
)



# Using Recursive Feature Elimination w/ Cross Validation<br>Note:<br>Just like RFE, this may take a while.<br> If parameter n_jobs = -1 is set, this means all the available cpu cores are set to run in parallel while fitting across the chosen (cross validation) folds.

In [None]:

from sklearn.feature_selection import RFECV

# Unlike RFE, RFECV is not given a fixed number of features to keep:
# min_features_to_select is only the lower bound on how many features remain.
# The setup is otherwise similar to RFE, except that the n_jobs parameter
# determines the number of cpu cores allocated to the cross validation folds.
rfecv_for_knn = RFECV(estimator=RandomForestClassifier(), step=1, min_features_to_select=3, cv=10, n_jobs=-1)
rfecv_for_iter = RFECV(estimator=RandomForestClassifier(), step=1, min_features_to_select=3, cv=10, n_jobs=-1)

# Fit each selector on its own imputed features and hospitalization labels.
rfecv_for_knn.fit(cv19_cat_knn_features, cv19_cat_knn_hospitalizations_labels)
rfecv_for_iter.fit(cv19_cat_iter_features, cv19_cat_iter_hospitalizations_labels)

# Indices of the feature columns that each RFECV selector kept.
rfecv_selected_top_knn_hospitalization_features_indices = rfecv_for_knn.get_support(indices=True)
rfecv_selected_top_iter_hospitalization_features_indices = rfecv_for_iter.get_support(indices=True)

# Translate the indices into feature names.
# Use the RFECV indices computed just above (not the earlier RFE ones), and
# build each list from its own selector, since the two may keep a different
# number of features. The indices are positions within the feature matrix that
# was passed to fit, so they are looked up in that matrix's columns rather than
# in the full dataframe, which still contains the label columns.
rfecv_selected_top_knn_hospitalization_features = [
    cv19_cat_knn_features.columns[index]
    for index in rfecv_selected_top_knn_hospitalization_features_indices
]
rfecv_selected_top_iter_hospitalization_features = [
    cv19_cat_iter_features.columns[index]
    for index in rfecv_selected_top_iter_hospitalization_features_indices
]

print('Done Processing!')

In [None]:

print(rfecv_selected_top_knn_hospitalization_features)
print(rfecv_selected_top_iter_hospitalization_features)



# Using matplotlib to visualize the RFECV with a graph


In [None]:



import matplotlib.pyplot as plt

def _rfecv_curve(selector):
    """Return (feature_counts, cv_scores) for a fitted RFECV selector.

    The scores are read from cv_results_['mean_test_score'] when the selector
    has it, because grid_scores_ has been removed from recent scikit-learn
    releases; grid_scores_ is only used as a fallback on older releases.

    The first score belongs to min_features_to_select features, not to 1, so
    the feature counts start there. Consecutive counts assume step = 1, which
    is what both selectors in this notebook use.
    """
    cv_results = getattr(selector, 'cv_results_', None)
    if cv_results is not None:
        scores = cv_results['mean_test_score']
    else:
        scores = selector.grid_scores_
    first_count = selector.min_features_to_select
    return range(first_count, first_count + len(scores)), scores


knn_feature_counts, knn_scores = _rfecv_curve(rfecv_for_knn)
iter_feature_counts, iter_scores = _rfecv_curve(rfecv_for_iter)

plt.figure(figsize=(12, 8))
plt.xlabel('Number of features selected')
plt.ylabel('CV Score (accuracy/number of correct predictions)')

# One line per imputation method, drawn on the same axes for comparison.
plt.plot(knn_feature_counts, knn_scores, color='green',
         lw=3, label='RFECV Line for KNN Imputated Data')
plt.plot(iter_feature_counts, iter_scores, color='orange',
         lw=3, label='RFECV Line for Iteratively Imputated Data')

plt.title('Recursive Feature Elimination with Cross Validation for KNN and Iteratively Imputated Data')
plt.legend(loc='lower right')
plt.show()