Merge pull request #479 from VijaySingh-GSLab/branch-1
Imputation of missing values using ML models. (Enhancement and Bug fix opened in #477, #478)
mmastand committed Nov 6, 2018
2 parents 25cb3ae + 69c83b7 commit cb82b94
Showing 9 changed files with 1,328 additions and 49 deletions.
52 changes: 49 additions & 3 deletions example_classification_1.py
@@ -11,6 +11,7 @@
This code uses the diabetes sample data in datasets/data/diabetes.csv.
"""
import pandas as pd
import numpy as np

import healthcareai
import healthcareai.trained_models.trained_supervised_model as tsm_plots
@@ -41,7 +42,7 @@ def main():

# Drop columns that won't help machine learning
dataframe.drop(['PatientID'], axis=1, inplace=True)

# Step 1: Setup a healthcareai classification trainer. This prepares your data for model building
classification_trainer = healthcareai.SupervisedModelTrainer(
dataframe=dataframe,
@@ -50,7 +51,52 @@ def main():
grain_column='PatientEncounterID',
impute=True,
verbose=False)

"""
The below code demonstrate the advance features for imputation of missing values.
imputeStrategy:
'MeanMode': (default), Impute using mean and mode values of column
'RandomForest': Impute missing values in RandomForest models.(Imputed values are much more realistic)
tunedRandomForest:
True: ML to be used for imputation of missing values are tuned using grid search and K-fold cross
validation.
numeric_columns_as_categorical :
For example: GenderFLG (0,0,1,0,1,1 .... )
So in normal case pandas by default will consider this column as numeric and missing values of this column
will be imputed using MEAN value (ex. 0.78 or 1.46 ....).
Thus to explicitly mention such as categorical there is this option which can be used as below:
numeric_columns_as_categorical = 'GenderFLG'
Now imputation will be done by MODE value and final type of the column wil be np.object.
"""

# Uncomment the code below to see advanced imputation in action.
"""
# Create missing values in the GenderFLG column and convert it to a numeric type
# to demonstrate the advanced imputation features.
pd.options.mode.chained_assignment = None
dataframe['GenderFLG'].iloc[500:530] = np.NaN
dataframe['GenderFLG'].replace(to_replace=['M', 'F'], value=[0, 1], inplace=True)

classification_trainer = healthcareai.SupervisedModelTrainer(
    dataframe=dataframe,
    predicted_column='ThirtyDayReadmitFLG',
    model_type='classification',
    grain_column='PatientEncounterID',
    impute=True,
    verbose=False,
    imputeStrategy='RandomForest',
    tunedRandomForest=True,
    numeric_columns_as_categorical='GenderFLG'
)
"""

# Look at the first few rows of your dataframe after loading the data
print('\n\n-------------------[ Cleaned Dataframe ]--------------------------')
print(classification_trainer.clean_dataframe.head())
@@ -107,7 +153,7 @@ def main():
# Once you are happy with the performance of any model, you can save it for use later in predicting new data.
# File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl')
# Note the file you saved and that will be used in example_classification_2.py
trained_random_forest.save()
# trained_random_forest.save()


if __name__ == "__main__":
12 changes: 11 additions & 1 deletion example_classification_2.py
@@ -11,6 +11,7 @@
This code uses the diabetes sample data in datasets/data/diabetes.csv.
"""
import pandas as pd
import numpy as np
import sqlalchemy

import healthcareai
@@ -21,6 +22,14 @@ def main():
"""Template script for using healthcareai predict using a trained classification model."""
# Load the included diabetes sample data
prediction_dataframe = healthcareai.load_diabetes()


# Uncomment the code below if advanced imputation was used in example_classification_1,
# because the GenderFLG column was intentionally converted to a numeric type there to
# demonstrate the numeric_columns_as_categorical feature.
"""
prediction_dataframe['GenderFLG'].iloc[500:530] = np.NaN
prediction_dataframe['GenderFLG'].replace(to_replace=['M', 'F'], value=[0, 1], inplace=True)
"""

# ...or load your own data from a .csv file: Uncomment to pull data from your CSV
# prediction_dataframe = healthcareai.load_csv('path/to/your.csv')
@@ -41,7 +50,8 @@ def main():
# Load the saved model using your filename.
# File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl')
# Note the file you saved in example_classification_1.py and set that here.
trained_model = healthcareai.load_saved_model('2017-08-16T16-45-57_classification_RandomForestClassifier.pkl')
trained_model = healthcareai.load_saved_model('2018-10-09T13-53-44_classification_RandomForestClassifier_defaultImputation.pkl')
# trained_model = healthcareai.load_saved_model('2018-10-09T13-25-28_classification_RandomForestClassifier_advanceImputation.pkl')
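# Minimal usage sketch (assumption: the loaded TrainedSupervisedModel exposes a
# make_predictions helper, in the same spirit as the plotting and metrics helpers
# used below). Uncomment to score the prediction dataframe with the loaded model.
"""
predictions = trained_model.make_predictions(prediction_dataframe)
print(predictions.head())
"""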

# Any saved model can be inspected for properties such as plots, metrics, columns, etc. (More examples in the docs)
trained_model.roc_plot()
46 changes: 45 additions & 1 deletion example_regression_1.py
@@ -11,6 +11,7 @@
This code uses the diabetes sample data in datasets/data/diabetes.csv.
"""
import pandas as pd
import numpy as np

import healthcareai
import healthcareai.common.database_connections as hcai_db
@@ -37,7 +38,7 @@ def main():

# Peek at the first 5 rows of data
print(dataframe.head(5))

# Step 1: Setup a healthcareai regression trainer. This prepares your data for model building
regression_trainer = healthcareai.SupervisedModelTrainer(
dataframe=dataframe,
@@ -46,6 +47,49 @@ def main():
grain_column='PatientEncounterID',
impute=True,
verbose=False)


"""
The below code demonstrate the advance features for imputation of missing values.
imputeStrategy:
'MeanMode': (default), Impute using mean and mode values of column
'RandomForest': Impute missing values in RandomForest models. (Imputed values are much more realistic)
tunedRandomForest:
True: ML to be used for imputation of missing values are tuned using grid search and K-fold cross
validation.
numeric_columns_as_categorical :
For example: GenderFLG (0,0,1,0,1,1 .... )
So in normal case pandas by default will consider this column as numeric and missing values of this column
will be imputed using MEAN value (ex. 0.78 or 1.46 ....).
Thus to explicitly mention such as categorical there is this option which can be used as below:
numeric_columns_as_categorical = 'GenderFLG'
Now imputation will be done by MODE value and final type of the column wil be np.object.
"""

# Uncomment the code below to see advanced imputation in action.
"""
# Create missing values in the GenderFLG column and convert it to a numeric type
# to demonstrate the advanced imputation features.
pd.options.mode.chained_assignment = None
dataframe['GenderFLG'].iloc[500:530] = np.NaN
dataframe['GenderFLG'].replace(to_replace=['M', 'F'], value=[0, 1], inplace=True)

regression_trainer = healthcareai.SupervisedModelTrainer(
    dataframe=dataframe,
    predicted_column='SystolicBPNBR',
    model_type='regression',
    grain_column='PatientEncounterID',
    impute=True,
    verbose=False,
    imputeStrategy='RandomForest',
    tunedRandomForest=True,
    numeric_columns_as_categorical='GenderFLG'
)
"""

# Look at the first few rows of your dataframe after loading the data
print('\n\n-------------------[ Cleaned Dataframe ]--------------------------')
11 changes: 10 additions & 1 deletion example_regression_2.py
@@ -11,6 +11,7 @@
This code uses the diabetes sample data in datasets/data/diabetes.csv.
"""
import pandas as pd
import numpy as np

import healthcareai
import healthcareai.common.database_connections as hcai_db
@@ -20,6 +21,13 @@ def main():
"""Template script for using healthcareai predict using a trained regression model."""
# Load the included diabetes sample data
prediction_dataframe = healthcareai.load_diabetes()

# Uncomment the code below if advanced imputation was used in example_regression_1,
# because the GenderFLG column was intentionally converted to a numeric type there to
# demonstrate the numeric_columns_as_categorical feature.
"""
prediction_dataframe['GenderFLG'].iloc[500:530] = np.NaN
prediction_dataframe['GenderFLG'].replace(to_replace=['M', 'F'], value=[0, 1], inplace=True)
"""

# ...or load your own data from a .csv file: Uncomment to pull data from your CSV
# prediction_dataframe = healthcareai.load_csv('path/to/your.csv')
@@ -40,7 +48,8 @@ def main():
# Load the saved model using your filename.
# File names are timestamped and look like '2017-05-31T12-36-21_regression_LinearRegression.pkl')
# Note the file you saved in example_regression_1.py and set that here.
trained_model = healthcareai.load_saved_model('2017-08-16T16-48-02_regression_LinearRegression.pkl')
trained_model = healthcareai.load_saved_model('2018-10-09T13-56-20_regression_LinearRegression_defaultImputation.pkl')
# trained_model = healthcareai.load_saved_model('2018-10-09T13-28-40_regression_LinearRegression_advanceImputation.pkl')

# Any saved models can be inspected for properties such as metrics, columns, etc. (More examples are in the docs)
print(trained_model.metrics)
