
# Code for Iterative Imputation Notes

# Note that the cell outputs and file links have been erased, and the notebook kernel has been reset to protect confidential data

In [None]:


import numpy as np
import pandas as pd
import time


# Load the dataset that every later cell works on.
# NOTE(review): the file path argument has been removed from this call (the
# notebook header says file links were erased). As written, pd.read_csv()
# raises a TypeError until a path or buffer is supplied again.
cv19_cat_df = pd.read_csv()

# Progress marker so the reader can tell the cell ran to completion.
print ( 'Done processing!')


In [None]:


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier

# Time the whole imputation step; the start/end marks are turned into minutes
# in a later cell.
iter_imp_start_time = time.perf_counter()

# The former `cv19_cat_df.fillna(np.nan)` statement was removed: its result was
# discarded, and replacing NaN with NaN would not change the data anyway.

# Round-robin imputation: every column with missing values is predicted from
# the remaining columns by a random forest classifier, starting from a
# most-frequent-value fill and running for at most 30 rounds.
# The forest gets its own seed because the imputer's random_state is not
# forwarded to the estimator; without it, repeated runs give different
# imputed values.
# NOTE(review): the imputer is fitted on the full table, which still contains
# the evol_* outcome columns used as labels later, and before any train/test
# split -- confirm that this leakage is acceptable for the analysis.
iterative_imputer_cat = IterativeImputer(
    estimator = RandomForestClassifier( random_state = 2 ),
    initial_strategy = 'most_frequent',
    max_iter = 30,
    random_state = 2,
)

# The result is re-wrapped into a DataFrame with the original column labels in
# the next cell.
cv19_cat_iter_imp = iterative_imputer_cat.fit_transform( cv19_cat_df )

iter_imp_end_time = time.perf_counter()

# Progress marker so the reader can tell the cell ran to completion.
print ( 'Done processing!')




In [None]:

# Wrap the imputed values back into a DataFrame, restoring both the column
# labels and the row index of the frame that was imputed, so rows can still be
# matched against cv19_cat_df even when it does not use a default index.
# NOTE(review): this raises if the imputer returned fewer columns than
# cv19_cat_df has (e.g. a column it dropped) -- confirm the shapes agree.
cv19_cat_iter_imp_df = pd.DataFrame(
    cv19_cat_iter_imp,
    columns = cv19_cat_df.columns,
    index = cv19_cat_df.index,
)



In [None]:

# perf_counter() differences are in seconds; report the imputation cost in minutes.
iter_imp_elapsed_seconds = iter_imp_end_time - iter_imp_start_time
iter_imp_time_minutes = iter_imp_elapsed_seconds / 60

iter_imp_timing_caption = 'Iterative Imputation using Random Forest Classifier Time: '
print ( iter_imp_timing_caption, str( iter_imp_time_minutes ), ' minutes' )

# Last expression of the cell: show the first five imputed rows.
cv19_cat_iter_imp_df.head( n = 5 )



In [None]:


# Outcome columns: each one is used as a prediction label, never as a feature.
cv19_outcome_cols = [
    'evol_Death',
    'evol_ICU_admission',
    'evol_Hospitalization',
    'evol_Recovered',
]

# Feature columns = every imputed column except the four outcomes.
# list.remove raises ValueError if an outcome column is missing.
cv19_cat_feature_cols = list( cv19_cat_iter_imp_df.columns )
for cv19_outcome_col in cv19_outcome_cols:
    cv19_cat_feature_cols.remove( cv19_outcome_col )

cv19_cat_imp_features = cv19_cat_iter_imp_df[ cv19_cat_feature_cols ]

# One label column per outcome, unpacked in the same order as cv19_outcome_cols.
(
    cv19_cat_imp_deaths_labels,
    cv19_cat_imp_ICUs_labels,
    cv19_cat_imp_hospitalizations_labels,
    cv19_cat_imp_recovered_labels,
) = ( cv19_cat_iter_imp_df[ cv19_outcome_col ] for cv19_outcome_col in cv19_outcome_cols )





In [None]:


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score

def _fit_logreg_and_score( features, labels, test_size = 0.2, random_state = 2, max_iter = 1500 ):
    """Split the data, fit a logistic regression and score it on the held-out rows.

    Parameters
    ----------
    features : table of predictor columns, passed to train_test_split.
    labels : outcome column aligned with ``features``; the ROC curve is
        computed with ``pos_label = 1``.
    test_size, random_state : forwarded to train_test_split.
    max_iter : forwarded to LogisticRegression.

    Returns
    -------
    tuple
        ( features_training, features_testing, labels_training, labels_testing,
          classifier, predictions, probabilities,
          FPR, TPR, thresholds, AUC, accuracy )
    """
    features_training, features_testing, labels_training, labels_testing = train_test_split(
        features, labels, test_size = test_size, random_state = random_state )

    classifier = LogisticRegression( max_iter = max_iter )
    classifier.fit( features_training, labels_training )

    predictions = classifier.predict( features_testing )
    probabilities = classifier.predict_proba( features_testing )

    # Column 1 of predict_proba is used as the score for label 1.
    # NOTE(review): assumes the fitted classes are ordered [0, 1] -- confirm.
    fpr, tpr, thresholds = metrics.roc_curve( labels_testing, probabilities[:, 1], pos_label = 1 )
    auc_value = metrics.auc( fpr, tpr )
    accuracy = accuracy_score( labels_testing, predictions )

    return ( features_training, features_testing, labels_training, labels_testing,
             classifier, predictions, probabilities,
             fpr, tpr, thresholds, auc_value, accuracy )


start_train_test_split_metrics_time = time.perf_counter()

# The same 80/20 split + logistic regression pipeline is run once per outcome.
# Every name the original cell defined is still assigned, so later cells that
# read any of them keep working.
( cv19_deaths_features_training, cv19_deaths_features_testing,
  cv19_deaths_labels_training, cv19_deaths_labels_testing,
  deaths_lgr_clssr, deaths_features_predictions, deaths_features_probabilities,
  deaths_FPR, deaths_TPR, deaths_Thresholds,
  deaths_AUC, deaths_features_accuracy ) = _fit_logreg_and_score(
    cv19_cat_imp_features, cv19_cat_imp_deaths_labels )

( cv19_ICUs_features_training, cv19_ICUs_features_testing,
  cv19_ICUs_labels_training, cv19_ICUs_labels_testing,
  ICUs_lgr_clssr, ICUs_features_predictions, ICUs_features_probabilities,
  ICUs_FPR, ICUs_TPR, ICUs_Thresholds,
  ICUs_AUC, ICUs_features_accuracy ) = _fit_logreg_and_score(
    cv19_cat_imp_features, cv19_cat_imp_ICUs_labels )

( cv19_hospitalizations_features_training, cv19_hospitalizations_features_testing,
  cv19_hospitalizations_labels_training, cv19_hospitalizations_labels_testing,
  hospitalizations_lgr_clssr, hospitalizations_features_predictions, hospitalizations_features_probabilities,
  HZs_FPR, HZs_TPR, HZs_Thresholds,
  HZs_AUC, HZs_features_accuracy ) = _fit_logreg_and_score(
    cv19_cat_imp_features, cv19_cat_imp_hospitalizations_labels )

( cv19_recovered_features_training, cv19_recovered_features_testing,
  cv19_recovered_labels_training, cv19_recovered_labels_testing,
  recovered_lgr_clssr, recovered_features_predictions, recovered_features_probabilities,
  RCV_FPR, RCV_TPR, RCV_Thresholds,
  RCV_AUC, RCV_features_accuracy ) = _fit_logreg_and_score(
    cv19_cat_imp_features, cv19_cat_imp_recovered_labels )

end_train_test_split_metrics_time = time.perf_counter()

# Progress marker so the reader can tell the cell ran to completion.
print ( 'Done processing!')
    

In [None]:

# Test-set accuracy of each outcome model, printed one per line.
accuracy_report = (
    ( 'Deaths Prediction Accuracy Score: ', deaths_features_accuracy ),
    ( 'ICUs Prediction Accuracy Score: ', ICUs_features_accuracy ),
    ( 'Hospitalization Prediction Accuracy Score: ', HZs_features_accuracy ),
    ( 'Recovery Prediction Accuracy Score: ', RCV_features_accuracy ),
)

for accuracy_caption, accuracy_value in accuracy_report:
    print ( accuracy_caption, str( accuracy_value ) )


In [None]:


import matplotlib.pyplot as plt

%matplotlib inline
# Draw the four logistic-regression ROC curves on one set of axes.
roc_fig, roc_ax = plt.subplots()

# ( false positive rates, true positive rates, line colour, legend text )
# The ICU legend text previously read "Log Reg Trees"; the model is a logistic
# regression like the other three, so the stray "Trees" was removed.
roc_curves = [
    ( deaths_FPR, deaths_TPR, 'red',
      'Deaths Log Reg ROC Curve ( area = %0.4f)' % deaths_AUC ),
    ( ICUs_FPR, ICUs_TPR, 'orange',
      'ICU Log Reg ROC Curve ( area = %0.4f)' % ICUs_AUC ),
    ( HZs_FPR, HZs_TPR, 'yellow',
      'Hospitalization Log Reg ROC Curve ( area = %0.4f)' % HZs_AUC ),
    ( RCV_FPR, RCV_TPR, 'green',
      'Recovered Log Reg ROC Curve ( area = %0.4f)' % RCV_AUC ),
]

for roc_fpr, roc_tpr, roc_color, roc_label in roc_curves:
    roc_ax.plot( roc_fpr, roc_tpr, color = roc_color, lw = 3, label = roc_label )

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
roc_ax.plot( [0, 1], [0, 1], color = 'blue', lw = 1, linestyle = '--' )

roc_ax.set_xlabel( 'False Positive Rate' )
roc_ax.set_ylabel( 'True Positive Rate' )
roc_ax.set_title( 'Receiver operating characteristic' )
roc_ax.legend( loc = "lower right" )

plt.show()

