In [11]:
# Note: this notebook needs an r5.4xl instance to run.
import os
import sys
import time
import random
import warnings
import collections
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.experimental import enable_halving_search_cv  
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, HalvingRandomSearchCV 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor 

# from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet
from sklearn.preprocessing import OneHotEncoder

# Make the project-local helper modules under ../../src importable.
sys.path.append('../../src')
# NOTE(review): presumably pymysql is a runtime dependency of cb_utils — confirm,
# and consider pinning a version so re-runs install the same package.
%pip install pymysql
import cb_utils
import cb_model_utils

# Notebook-wide plotting style and DataFrame display width.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500

import sagemaker
from sagemaker.sklearn.estimator import SKLearn

# Reload edited modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

Note: you may need to restart the kernel to use updated packages.
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# configuration
# use_cache is forwarded to cb_utils.sql_query_to_df when the features are pulled.
use_cache = False

# Fixed seed so that Restart & Run All reproduces the same run. The value is the
# one recorded in this notebook's saved output; previously a new seed was drawn
# with random.randint on every execution and never applied to any RNG.
seed = 98

# Seed the global generators that this notebook's libraries may draw from.
random.seed(seed)
np.random.seed(seed)

print(f'Seed: {seed}')

Seed: 98


In [3]:
# Select everything returned by the fn_dsml_sfere_features_v2_fl() database
# function and load the result through the project helper.
query = "select * from fn_dsml_sfere_features_v2_fl();"
fl_features = cb_utils.sql_query_to_df(
    query,
    use_cache=use_cache,
    source='member_doc',
)

Pulling query from db


In [4]:
# Preview the first rows of the pulled feature table.
# NOTE(review): the rendered output contains row-level patient_id/age values —
# consider clearing this cell's output before sharing the notebook.
fl_features.head()

Unnamed: 0,sfere_id,patient_id,sfere_type_ft,payer_id_ft,group_id_ft,age_ft,transfer_bed_to_chair_ft,mobility_ft,dressing_ft,bathing_ft,eating_ft,grooming_ft,toileting_ft,turn_change_position_ft,has_incontinence_ft,incontinence_type_ft,incontinence_frequency_ft,bathing_support_days_natural_ft,dressing_support_days_natural_ft,eating_support_days_natural_ft,grooming_support_days_natural_ft,toileting_support_days_natural_ft,mobility_support_days_natural_ft,transfer_bed_to_chair_support_days_natural_ft,turn_change_position_support_days_natural_ft,bathing_transfers_support_days_natural_ft,dressing_lower_support_days_natural_ft,hair_support_days_natural_ft,mobility_outside_support_days_natural_ft,calling_friends_and_family_ft,articulating_needs_ft,meal_prep_ft,shopping_ft,medication_management_ft,finances_ft,housework_ft,transportation_ft,daily_routine_decisions_ft,comprehension_ft,member_opinion_ft,cleaning_ft,laundry_ft,change_bed_ft,clean_kitchen_ft,clean_home_ft,medical_appointments_ft,work_school_socialize_ft,driving_ft,calling_friends_and_family_support_days_natural_ft,finances_support_days_natural_ft,laundry_support_days_natural_ft,housework_support_days_natural_ft,meal_prep_support_days_natural_ft,medication_management_support_days_natural_ft,shopping_support_days_natural_ft,alert_oriented_self_ft,alert_oriented_place_ft,alert_oriented_day_time_ft,has_alz_or_dem_ft,has_acquired_cognitive_deficit_ft,has_developmental_delay_ft,has_idd_ft,has_no_cog_deficit_ft,has_traumatic_brain_injury_ft,on_ventilator_ft,using_oxygen_ft,has_paralysis_ft,has_amputations_ft,receive_wound_care_ft,has_ostomy_ft,receiving_enteral_feedings_ft,using_specialized_bed_ft,using_mobility_device_ft,has_dme_ft,mean_supports_ft,reporting_current_hrs_ft,reporting_rec_hrs_tg
0,8590,2113,1,4,2,33.0,3,3,3,3,3,3.0,3,,1,,2.0,,,,,,,,,,,,,1,1.0,1,1,1,1,1,1,,,,,,,,,,,0.0,,,,,,,,1.0,1.0,0.0,0.0,,,,,,0.0,1.0,1.0,0,0.0,1.0,1.0,1.0,1.0,0.0,,14.0,14.0
1,55517,2113,1,4,2,33.0,3,3,3,3,3,3.0,3,,1,2.0,2.0,,,,,,,,,,,,,0,0.0,1,0,0,0,0,0,0.0,1.0,1.0,,,,,,,,0.0,,,,,,,,1.0,1.0,0.0,0.0,,,,,,0.0,1.0,1.0,0,,1.0,1.0,1.0,1.0,1.0,,14.0,14.0
2,6376,2115,1,4,2,69.0,3,0,3,3,3,3.0,3,,1,,2.0,,,,,,,,,,,,,0,0.0,0,0,0,0,0,0,,,,,,,,,,,0.0,,,,,,,,1.0,1.0,1.0,0.0,,,,,,0.0,0.0,1.0,0,1.0,0.0,0.0,1.0,1.0,1.0,,20.0,20.0
3,6380,2115,1,4,2,69.0,3,0,3,3,3,3.0,3,,1,,2.0,,,,,,,,,,,,,0,0.0,0,0,0,0,0,0,,,,,,,,,,,0.0,,,,,,,,1.0,1.0,1.0,0.0,,,,,,0.0,0.0,1.0,0,1.0,0.0,0.0,1.0,1.0,1.0,,,20.0
4,76130,2115,2,4,2,69.0,3,0,3,3,3,3.0,3,,1,2.0,2.0,2.0,2.0,7.0,7.0,7.0,,,,,,,,3,,3,3,3,3,3,3,,,,,3.0,,,,,,,7.0,7.0,7.0,2.0,7.0,7.0,7.0,1.0,1.0,1.0,,0.0,0.0,0.0,1.0,0.0,,,,0,1.0,,,,,,5.75,20.0,20.0


In [5]:
# Columns exported for training, in the exact order they are written to disk.
# Order matters: the CSV is written without a header row and the training cell
# splits target and features by position (first column = target).
final_columns = [
    # target
    "reporting_rec_hrs_tg",
    # features
    "transfer_bed_to_chair_ft",
    "mobility_ft",
    "dressing_ft",
    "bathing_ft",
    "eating_ft",
    "grooming_ft",
    "toileting_ft",
    "has_incontinence_ft",
    "incontinence_type_ft",
    "incontinence_frequency_ft",
    "calling_friends_and_family_ft",
    "articulating_needs_ft",
    "meal_prep_ft",
    "shopping_ft",
    "medication_management_ft",
    "finances_ft",
    "housework_ft",
    "transportation_ft",
    "daily_routine_decisions_ft",
    "comprehension_ft",
    "member_opinion_ft",
    "laundry_ft",
    "driving_ft",
    "on_ventilator_ft",
    "using_oxygen_ft",
    "has_paralysis_ft",
    "has_amputations_ft",
    "receive_wound_care_ft",
    "has_ostomy_ft",
    "receiving_enteral_feedings_ft",
    "using_specialized_bed_ft",
    "using_mobility_device_ft",
    "has_dme_ft",
    "sfere_type_ft",
    "payer_id_ft",
    "mean_supports_ft",
    "age_ft",
    "reporting_current_hrs_ft",
]

In [6]:
# Identifier for this training-data export and the S3 bucket/key it is uploaded to.
file_suffix = '20231023_dsml_fl_v1'
training_data_bucket = 'cb-analytics-us-east-2-prd'
training_data_path = f'sagemaker/dsml/training_data/{file_suffix}/train.csv'

In [7]:
# Write the selected columns to a local CSV with neither header row nor index
# column, so the file holds values only, in final_columns order.
local_file = f'../data/{file_suffix}.csv'
fl_features.loc[:, final_columns].to_csv(
    local_file,
    index=False,
    header=False,
)

In [8]:
# Upload the local training CSV to the configured bucket and key.
cb_utils.upload_file_to_s3(
    local_file,
    training_data_bucket,
    training_data_path,
)

### Test training on file

In [9]:
# Read the header-less CSV back and re-attach the column names it was written with.
df = pd.read_csv(local_file, header=None).set_axis(final_columns, axis='columns')
df.head()

Unnamed: 0,reporting_rec_hrs_tg,transfer_bed_to_chair_ft,mobility_ft,dressing_ft,bathing_ft,eating_ft,grooming_ft,toileting_ft,has_incontinence_ft,incontinence_type_ft,incontinence_frequency_ft,calling_friends_and_family_ft,articulating_needs_ft,meal_prep_ft,shopping_ft,medication_management_ft,finances_ft,housework_ft,transportation_ft,daily_routine_decisions_ft,comprehension_ft,member_opinion_ft,laundry_ft,driving_ft,on_ventilator_ft,using_oxygen_ft,has_paralysis_ft,has_amputations_ft,receive_wound_care_ft,has_ostomy_ft,receiving_enteral_feedings_ft,using_specialized_bed_ft,using_mobility_device_ft,has_dme_ft,sfere_type_ft,payer_id_ft,mean_supports_ft,age_ft,reporting_current_hrs_ft
0,14.0,3,3,3,3,3,3.0,3,1,,2.0,1,1.0,1,1,1,1,1,1,,,,,0.0,0.0,1.0,1.0,0,0.0,1.0,1.0,1.0,1.0,0.0,1,4,,33.0,14.0
1,14.0,3,3,3,3,3,3.0,3,1,2.0,2.0,0,0.0,1,0,0,0,0,0,0.0,1.0,1.0,,0.0,0.0,1.0,1.0,0,,1.0,1.0,1.0,1.0,1.0,1,4,,33.0,14.0
2,20.0,3,0,3,3,3,3.0,3,1,,2.0,0,0.0,0,0,0,0,0,0,,,,,0.0,0.0,0.0,1.0,0,1.0,0.0,0.0,1.0,1.0,1.0,1,4,,69.0,20.0
3,20.0,3,0,3,3,3,3.0,3,1,,2.0,0,0.0,0,0,0,0,0,0,,,,,0.0,0.0,0.0,1.0,0,1.0,0.0,0.0,1.0,1.0,1.0,1,4,,69.0,
4,20.0,3,0,3,3,3,3.0,3,1,2.0,2.0,3,,3,3,3,3,3,3,,,,3.0,,,,,0,1.0,,,,,,2,4,5.75,69.0,20.0


In [10]:
# Split by position: the first column is the target, the remainder are features.
y_train = df.iloc[:, 0]
x_train = df.iloc[:, 1:]

# Continuous features; every other feature column is passed as categorical.
# The earlier hard-coded index list 0..35 was off by one relative to x_train:
# index 35 there is mean_supports_ft (a fractional value, e.g. 5.75), while
# payer_id_ft sits at index 34. Deriving the indices from column names keeps
# them correct if final_columns is ever reordered.
# NOTE(review): train_deploy.py presumably declares its own categorical indices —
# confirm it uses the same set.
numeric_features = {'mean_supports_ft', 'age_ft', 'reporting_current_hrs_ft'}
categorical_features = [
    idx for idx, col in enumerate(x_train.columns) if col not in numeric_features
]

est = HistGradientBoostingRegressor(
    categorical_features=categorical_features,
    max_iter=1000,
).fit(x_train, y_train)

### SageMaker Training

In [12]:
# Report the AWS region of the default SageMaker session.
region = sagemaker.Session().boto_region_name
print(f"AWS Region: {region}")

# Execution role used for the training job below.
# NOTE(review): the printed ARN includes the AWS account id — consider clearing
# this output before sharing the notebook.
role = sagemaker.get_execution_role()
print(f"RoleArn: {role}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
AWS Region: us-east-2
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
RoleArn: arn:aws:iam::257056996471:role/cb-sagemaker


In [13]:
# S3 key prefix, model name (derived from the export identifier) and model bucket.
prefix = 'sagemaker/dsml'
model_name = '{}_model'.format(file_suffix)
model_bucket = 'cb-analytics-exports-us-east-2-prd'

In [14]:
# Compose the s3:// URI for this model from bucket, prefix and model name,
# then display it.
s3_model_output_location = f's3://{model_bucket}/{prefix}/{model_name}'
s3_model_output_location

's3://cb-analytics-exports-us-east-2-prd/sagemaker/dsml/20231023_dsml_fl_v1_model'

In [15]:
# Configure a SageMaker scikit-learn training job that runs train_deploy.py.
# NOTE(review): s3_model_output_location is computed earlier but is not passed
# to the estimator (e.g. as output_path) — confirm whether the SDK default
# output location is intended.
sklearn_estimator = SKLearn(
    entry_point='train_deploy.py',
    framework_version='1.0-1',
    instance_type='ml.m5.xlarge',
    role=role,
)

# Launch training with the uploaded CSV as the 'train' channel.
sklearn_estimator.fit(
    {'train': 's3://{}/{}'.format(training_data_bucket, training_data_path)}
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Using provided s3_resource


INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2023-10-24-17-30-30-325


2023-10-24 17:30:30 Starting - Starting the training job...
2023-10-24 17:30:46 Starting - Preparing the instances for training......
2023-10-24 17:31:50 Downloading - Downloading input data...
2023-10-24 17:32:15 Training - Downloading the training image...
2023-10-24 17:33:01 Uploading - Uploading generated training model[34m2023-10-24 17:32:50,557 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-10-24 17:32:50,560 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-10-24 17:32:50,568 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-10-24 17:32:50,748 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-10-24 17:32:50,760 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-10-24 17:32:50,773 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gp

### Attempt serverless deploy

In [16]:
from time import gmtime, strftime

# Swap underscores for hyphens to get the name used for the deployed model
# (presumably because SageMaker resource names do not accept underscores).
deployed_model_name = '-'.join(model_name.split('_'))
print(f"Model name: {deployed_model_name}")

Model name: 20231023-dsml-fl-v1-model


In [17]:
# Serverless endpoint sizing: 2048 MB of memory, up to 100 concurrent invocations.
serverless_conf = sagemaker.serverless.serverless_inference_config.ServerlessInferenceConfig(
    max_concurrency=100,
    memory_size_in_mb=2048,
)

In [None]:
# Deploy the trained estimator behind a serverless endpoint named 'ep-<model name>'.
predictor = sklearn_estimator.deploy(
    model_name=deployed_model_name,
    endpoint_name=f'ep-{deployed_model_name}',
    serverless_inference_config=serverless_conf,
)


INFO:sagemaker:Creating model with name: 20231023-dsml-fl-v1-model
INFO:sagemaker:Creating endpoint-config with name ep-20231023-dsml-fl-v1-model
INFO:sagemaker:Creating endpoint with name ep-20231023-dsml-fl-v1-model


-------------------!

In [None]:
# Show the name of the endpoint created by the deploy call.
predictor.endpoint_name

'ep-20231023-dsml-fl-v1-model'