Merge pull request #479 from VijaySingh-GSLab/branch-1
Imputation of missing values using ML models. (Enhancement and Bug fix opened in #477, #478)
mmastand committed Nov 6, 2018
2 parents 25cb3ae + 69c83b7 commit cb82b94
Showing 9 changed files with 1,328 additions and 49 deletions.
52 changes: 49 additions & 3 deletions example_classification_1.py
@@ -11,6 +11,7 @@
This code uses the diabetes sample data in datasets/data/diabetes.csv.
"""
import pandas as pd
import numpy as np

import healthcareai
import healthcareai.trained_models.trained_supervised_model as tsm_plots
@@ -41,7 +42,7 @@ def main():

# Drop columns that won't help machine learning
dataframe.drop(['PatientID'], axis=1, inplace=True)

# Step 1: Setup a healthcareai classification trainer. This prepares your data for model building
classification_trainer = healthcareai.SupervisedModelTrainer(
dataframe=dataframe,
@@ -50,7 +51,52 @@ def main():
grain_column='PatientEncounterID',
impute=True,
verbose=False)

"""
The below code demonstrate the advance features for imputation of missing values.
imputeStrategy:
'MeanMode': (default), Impute using mean and mode values of column
'RandomForest': Impute missing values in RandomForest models.(Imputed values are much more realistic)
tunedRandomForest:
True: ML to be used for imputation of missing values are tuned using grid search and K-fold cross
validation.
numeric_columns_as_categorical :
For example: GenderFLG (0,0,1,0,1,1 .... )
So in normal case pandas by default will consider this column as numeric and missing values of this column
will be imputed using MEAN value (ex. 0.78 or 1.46 ....).
Thus to explicitly mention such as categorical there is this option which can be used as below:
numeric_columns_as_categorical = 'GenderFLG'
Now imputation will be done by MODE value and final type of the column wil be np.object.
"""

# Uncomment the code below to see advanced imputation in action.
"""
# Create missing values in the GenderFLG column and convert it to a numeric type
# to demonstrate the advanced imputation features.
pd.options.mode.chained_assignment = None
dataframe['GenderFLG'].iloc[500:530] = np.NaN
dataframe['GenderFLG'].replace(to_replace=['M', 'F'], value=[0, 1], inplace=True)

classification_trainer = healthcareai.SupervisedModelTrainer(
    dataframe=dataframe,
    predicted_column='ThirtyDayReadmitFLG',
    model_type='classification',
    grain_column='PatientEncounterID',
    impute=True,
    verbose=False,
    imputeStrategy='RandomForest',
    tunedRandomForest=True,
    numeric_columns_as_categorical='GenderFLG'
)
"""

# Look at the first few rows of your dataframe after loading the data
print('\n\n-------------------[ Cleaned Dataframe ]--------------------------')
print(classification_trainer.clean_dataframe.head())
@@ -107,7 +153,7 @@ def main():
# Once you are happy with the performance of any model, you can save it for use later in predicting new data.
# File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl')
# Note the file you saved and that will be used in example_classification_2.py
trained_random_forest.save()
# trained_random_forest.save()


if __name__ == "__main__":
12 changes: 11 additions & 1 deletion example_classification_2.py
@@ -11,6 +11,7 @@
This code uses the diabetes sample data in datasets/data/diabetes.csv.
"""
import pandas as pd
import numpy as np
import sqlalchemy

import healthcareai
@@ -21,6 +22,14 @@ def main():
"""Template script for using healthcareai predict using a trained classification model."""
# Load the included diabetes sample data
prediction_dataframe = healthcareai.load_diabetes()


# Uncomment the code below if advanced imputation was used in example_classification_1,
# because the GenderFLG column was intentionally converted to a numeric type there to
# demonstrate the numeric_columns_as_categorical feature.
"""
prediction_dataframe['GenderFLG'].iloc[500:530] = np.NaN
prediction_dataframe['GenderFLG'].replace(to_replace=['M', 'F'], value=[0, 1], inplace=True)
"""

# ...or load your own data from a .csv file: Uncomment to pull data from your CSV
# prediction_dataframe = healthcareai.load_csv('path/to/your.csv')
@@ -41,7 +50,8 @@ def main():
# Load the saved model using your filename.
# File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl')
# Note the file you saved in example_classification_1.py and set that here.
trained_model = healthcareai.load_saved_model('2017-08-16T16-45-57_classification_RandomForestClassifier.pkl')
trained_model = healthcareai.load_saved_model('2018-10-09T13-53-44_classification_RandomForestClassifier_defaultImputation.pkl')
# trained_model = healthcareai.load_saved_model('2018-10-09T13-25-28_classification_RandomForestClassifier_advanceImputation.pkl')
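# Minimal usage sketch (assumption: the loaded TrainedSupervisedModel exposes a
# make_predictions helper, in the same spirit as the plotting and metrics helpers
# used below). Uncomment to score the prediction dataframe with the loaded model.
"""
predictions = trained_model.make_predictions(prediction_dataframe)
print(predictions.head())
"""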

# Any saved model can be inspected for properties such as plots, metrics, columns, etc. (More examples in the docs)
trained_model.roc_plot()
46 changes: 45 additions & 1 deletion example_regression_1.py
@@ -11,6 +11,7 @@
This code uses the diabetes sample data in datasets/data/diabetes.csv.
"""
import pandas as pd
import numpy as np

import healthcareai
import healthcareai.common.database_connections as hcai_db
@@ -37,7 +38,7 @@ def main():

# Peek at the first 5 rows of data
print(dataframe.head(5))

# Step 1: Setup a healthcareai regression trainer. This prepares your data for model building
regression_trainer = healthcareai.SupervisedModelTrainer(
dataframe=dataframe,
@@ -46,6 +47,49 @@ def main():
grain_column='PatientEncounterID',
impute=True,
verbose=False)


"""
The below code demonstrate the advance features for imputation of missing values.
imputeStrategy:
'MeanMode': (default), Impute using mean and mode values of column
'RandomForest': Impute missing values in RandomForest models. (Imputed values are much more realistic)
tunedRandomForest:
True: ML to be used for imputation of missing values are tuned using grid search and K-fold cross
validation.
numeric_columns_as_categorical :
For example: GenderFLG (0,0,1,0,1,1 .... )
So in normal case pandas by default will consider this column as numeric and missing values of this column
will be imputed using MEAN value (ex. 0.78 or 1.46 ....).
Thus to explicitly mention such as categorical there is this option which can be used as below:
numeric_columns_as_categorical = 'GenderFLG'
Now imputation will be done by MODE value and final type of the column wil be np.object.
"""

# Uncomment the code below to see advanced imputation in action.
"""
# Create missing values in the GenderFLG column and convert it to a numeric type
# to demonstrate the advanced imputation features.
pd.options.mode.chained_assignment = None
dataframe['GenderFLG'].iloc[500:530] = np.NaN
dataframe['GenderFLG'].replace(to_replace=['M', 'F'], value=[0, 1], inplace=True)

regression_trainer = healthcareai.SupervisedModelTrainer(
    dataframe=dataframe,
    predicted_column='SystolicBPNBR',
    model_type='regression',
    grain_column='PatientEncounterID',
    impute=True,
    verbose=False,
    imputeStrategy='RandomForest',
    tunedRandomForest=True,
    numeric_columns_as_categorical='GenderFLG'
)
"""

# Look at the first few rows of your dataframe after loading the data
print('\n\n-------------------[ Cleaned Dataframe ]--------------------------')
11 changes: 10 additions & 1 deletion example_regression_2.py
@@ -11,6 +11,7 @@
This code uses the diabetes sample data in datasets/data/diabetes.csv.
"""
import pandas as pd
import numpy as np

import healthcareai
import healthcareai.common.database_connections as hcai_db
@@ -20,6 +21,13 @@ def main():
"""Template script for using healthcareai predict using a trained regression model."""
# Load the included diabetes sample data
prediction_dataframe = healthcareai.load_diabetes()

# Uncomment the code below if advanced imputation was used in example_regression_1,
# because the GenderFLG column was intentionally converted to a numeric type there to
# demonstrate the numeric_columns_as_categorical feature.
"""
prediction_dataframe['GenderFLG'].iloc[500:530] = np.NaN
prediction_dataframe['GenderFLG'].replace(to_replace=['M', 'F'], value=[0, 1], inplace=True)
"""

# ...or load your own data from a .csv file: Uncomment to pull data from your CSV
# prediction_dataframe = healthcareai.load_csv('path/to/your.csv')
@@ -40,7 +48,8 @@ def main():
# Load the saved model using your filename.
# File names are timestamped and look like '2017-05-31T12-36-21_regression_LinearRegression.pkl')
# Note the file you saved in example_regression_1.py and set that here.
trained_model = healthcareai.load_saved_model('2017-08-16T16-48-02_regression_LinearRegression.pkl')
trained_model = healthcareai.load_saved_model('2018-10-09T13-56-20_regression_LinearRegression_defaultImputation.pkl')
# trained_model = healthcareai.load_saved_model('2018-10-09T13-28-40_regression_LinearRegression_advanceImputation.pkl')

# Any saved models can be inspected for properties such as metrics, columns, etc. (More examples are in the docs)
print(trained_model.metrics)
