# Using healthcareai for regression

## Load the libraries (TODO simplify this)

In [1]:
import pandas as pd
import time
import numpy as np

import healthcareai.common.file_io_utilities as io
import healthcareai.common.predict as predict
import healthcareai.tests.helpers as helpers
import healthcareai.pipelines.data_preparation as pipelines

from healthcareai.simple_mode import SimpleDevelopSupervisedModel
from healthcareai.simple_deploy_supervised_model import SimpleDeploySupervisedModel

## Load the data

In [2]:
# Locate the bundled sample CSV through the test fixture helper, then read it,
# treating the literal string 'None' as a missing value.
training_csv_path = helpers.fixture('DiabetesClincialSampleData.csv')
df = pd.read_csv(training_csv_path, na_values=['None'])

# Remove the two columns that are not useful as machine learning inputs
df = df.drop(['PatientID', 'InTestWindowFLG'], axis=1)

# Preview the first rows of the dataframe once those columns are gone
df.head()

Unnamed: 0,PatientEncounterID,SystolicBPNBR,LDLNBR,A1CNBR,GenderFLG,ThirtyDayReadmitFLG
0,1,167.0,195.0,4.2,M,N
1,2,153.0,214.0,5.0,M,N
2,3,170.0,191.0,4.0,M,N
3,4,187.0,135.0,4.4,M,N
4,5,188.0,125.0,4.3,M,N


## Train a model

In [3]:
# Record when work begins so the elapsed time can be reported after training
training_started_at = time.time()

# Step 1: Set up healthcareai for developing a regression model on `df`,
# with imputation enabled and 'PatientEncounterID' as the grain column.
hcai = SimpleDevelopSupervisedModel(
    df,
    'SystolicBPNBR',
    'regression',
    impute=True,
    grain_column='PatientEncounterID',
)

# Step 2: Train the linear regression model and report the elapsed time
# (the measurement covers the setup above as well as the training call).
trained_linear_model = hcai.linear_regression()
elapsed_seconds = time.time() - training_started_at
print('Model trained in {} seconds'.format(elapsed_seconds))

# Step 3: When the trained model looks good, pickle it to disk so that
# it can be reloaded later for predictions.
saved_model_filename = 'linear_regression_2017-04-18.pkl'
io.save_object_as_pickle(saved_model_filename, trained_linear_model)
print('model saved as {}'.format(saved_model_filename))

Training linear_regression
No randomized search. Using <class 'sklearn.linear_model.base.LinearRegression'>
{'mean_squared_error': 638.67516416733338, 'mean_absolute_error': 20.750708464344861}
Model trained in 0.038100242614746094 seconds
model saved as linear_regression_2017-04-18.pkl


## Make some predictions

### First load a new dataframe and drop some columns

In [10]:
# Load the prediction data through the same fixture helper the training cell uses,
# instead of a hard-coded relative path that only resolves from one particular
# working directory.
# na_values=['None'] already turns 'None' strings into missing values while the
# CSV is parsed, so no separate replace step is needed afterwards.
prediction_raw = pd.read_csv(
    helpers.fixture('DiabetesClincialSampleData.csv'),
    na_values=['None'])

# Drop columns that won't help machine learning (returns a new frame, so the
# raw data stays available and re-running this cell gives the same result)
prediction_features = prediction_raw.drop(['PatientID', 'InTestWindowFLG'], axis=1)

# Run through the preparation pipeline
# TODO this may have implications for null values
prediction_dataframe = pipelines.dataframe_prediction(
    prediction_features,
    'regression',
    'PatientEncounterID',
    'SystolicBPNBR',
    impute=True)

### Load the saved model and predict!

In [11]:
# Load the saved model
# NOTE(review): the file was written with save_object_as_pickle, so this
# presumably unpickles it -- only load model files you created or trust.
linear_model = io.load_saved_model(saved_model_filename)

# Make some predictions.
# The predictions are written back into prediction_dataframe below, so on a
# re-run of this cell the frame already contains 'SystolicBPNBR_predicted'.
# Exclude that column (if present) so the model always receives the same
# feature columns and the cell can safely be executed more than once.
prediction_features = prediction_dataframe.drop(
    ['SystolicBPNBR_predicted'], axis=1, errors='ignore')
predictions = linear_model.predict(prediction_features)

# Save the predictions back to your dataframe
prediction_dataframe['SystolicBPNBR_predicted'] = predictions

# Peek at the predictions
prediction_dataframe.head()

Unnamed: 0,LDLNBR,A1CNBR,GenderFLG.M,ThirtyDayReadmitFLG.Y,SystolicBPNBR_predicted
0,195.0,4.2,1,0,148.931936
1,214.0,5.0,1,0,148.03575
2,191.0,4.0,1,0,149.141533
3,135.0,4.4,1,0,149.955368
4,125.0,4.3,1,0,150.214295
