In [1]:
# Excel workbook with the experiment definitions; it is read below with pandas
# and later passed to eu.SaveToExperimentLog.
Experiments_file='/home/kate/Research/YearBuilt/Experiments/DwellingExperiments.xlsx'
# Sheet listing all experiments (columns used below: 'Experiment', 'Target', 'Dataset').
AllExperiments_tab='Experiments'
Experiment_name='FWaterClaims'
# Experiment configuration: the different datasets to try.
# Each line in this sheet contains the model name and the set of features used to build a dataset for SageMaker.
Experiment_tab='%s Models'%Experiment_name

# It looks like a trial name must be unique across my whole environment, not just within the experiment it belongs to,
# hence the experiment name is used as a prefix.
Trial_name='%s-PreparingTrainValidData'%Experiment_name



# The original dataset was created from a Redshift query and uploaded into S3.
bucket='kdproperty'
path_to_data_file='/Data/'


# Both values are kept as strings because they are forwarded as command-line arguments
# to preprocessing.py, which parses them as int and float respectively.
# Only rows with cal_year < split_year are kept; val_size is the validation share of the random split.
split_year='2020'
val_size='0.25'


# Instance type and count passed to the SKLearnProcessor that runs each processing job.
instance_type='ml.t3.large'
instance_count=1

In [2]:
import boto3
import sys
import time
import pandas as pd
import numpy as np

import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

from sagemaker.analytics import ExperimentAnalytics

In [3]:
# Make the project-local helper module ExperimentsUtils importable.
# NOTE(review): absolute, machine-specific path - adjust it when running on another machine.
sys.path.append('/home/kate/Research/YearBuilt/Notebooks/Property')
import ExperimentsUtils as eu

In [4]:
# Start from a clean state: delete the experiment and its trials.
# Per the original note, the output files associated with the jobs of the experiment are not deleted
# (behavior of the helper is not visible here - confirm in ExperimentsUtils).
eu.cleanup_experiment(Experiment_name)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


FWaterClaims is a new experiment. nothing to delete


In [5]:
# Register the experiment and the trial; the processing jobs launched later in the
# notebook reference both names in their experiment_config.
eu.create_experiment(Experiment_name)
eu.create_trial(Experiment_name,Trial_name)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [6]:
# Look up the target column and the source data file of this experiment in the sheet of all experiments.
# The workbook path is handed to pandas directly so that pandas opens and closes the file itself;
# an explicit open(...) handle would stay open because nothing ever closes it.
experiments = pd.read_excel(Experiments_file, sheet_name=AllExperiments_tab)

# Filter once and reuse the result for both lookups.
experiment_rows = experiments[experiments['Experiment']==Experiment_name]
if experiment_rows.empty:
    # Fail with an explicit message instead of a bare IndexError raised by .values[0].
    raise ValueError("Experiment '%s' was not found in sheet '%s' of %s"
                     % (Experiment_name, AllExperiments_tab, Experiments_file))

# If the experiment is listed more than once, the first matching row is used.
target=experiment_rows['Target'].values[0]
print(target)
data_file=experiment_rows['Dataset'].values[0]
print(data_file)

hasclaim_water
dwelling_basedata_v4.csv


In [7]:
# Load the experiment's own sheet: one row per model, holding the model name followed by its features.
# The path is passed directly (instead of an open file object that was never closed),
# so pandas takes care of closing the workbook.
models = pd.read_excel(Experiments_file, sheet_name=Experiment_tab)
models

Unnamed: 0,Model,F1,F2,F3,F4,F5,F6,F7,F8
0,FWaterClaims,usagetype_encd,cova_deductible,cova_limit,sqft,cal_year - yearbuilt,landlordind,water_risk_3_blk,constructioncd_encd


In [8]:
# Region configured for the default boto3 session.
region = boto3.session.Session().region_name
# IAM role handed to the SKLearnProcessor that runs the processing jobs.
# NOTE(review): the ARN is hard-coded and account-specific - replace it when running in another account.
role = 'arn:aws:iam::757107622481:role/service-role/AmazonSageMaker-ExecutionRole-20200819T131882'
# SageMaker session that uses the project bucket as its default bucket.
# NOTE(review): this session is not passed to the SKLearnProcessor created later in the notebook.
sagemaker_session = sagemaker.session.Session(default_bucket=bucket)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [9]:
%%writefile preprocessing.py

#Training and Validation dataset for SageMaker are the same structure: no headers, the first column is a target and the rest are features


import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

if __name__=='__main__':
    
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_file', type=str)
    parser.add_argument('--split_year', type=int)     
    parser.add_argument('--val_size', type=float)     
    parser.add_argument('--target', type=str)      
    parser.add_argument('--model', type=str)  
    parser.add_argument('--featureset', type=str)    
    args, _ = parser.parse_known_args()    
    print('Received arguments {}'.format(args))
    
    featureset=args.featureset.split(',')
    target_column=args.target
    input_data_path = os.path.join('/opt/ml/processing/input', args.data_file)
    train_data_output_path = os.path.join('/opt/ml/processing/output/training_data', 'training_%s.csv'%args.model)    
    validation_data_output_path = os.path.join('/opt/ml/processing/output/validation_data', 'validation_%s.csv'%args.model)
  
    
   
    
    print('Reading input data from {}'.format(input_data_path))
    dataset = pd.read_csv(input_data_path, error_bad_lines=False, index_col=False)
    
    
    dataset=dataset[(dataset.cal_year < args.split_year)]
    
    X = pd.DataFrame()
    for f in featureset:
        X[f]=dataset.eval(f)
    
    y=dataset.eval(target_column)
    


    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=args.val_size, random_state=42)
    
    training_dataset=pd.DataFrame({'hasclaim':y_train}).join(X_train)
    training_dataset.to_csv(train_data_output_path, header=False, index=False)


    
    validation_dataset=pd.DataFrame({'hasclaim':y_val}).join(X_val)   
    validation_dataset.to_csv(validation_data_output_path, header=False, index=False)
    
 
    

Overwriting preprocessing.py


In [10]:
# Launch one SageMaker Processing job per row of the models sheet. Every processor is
# kept in the list because the jobs are started without waiting for them to finish.
processors = []

for index, row in models.iterrows():
    model = row['Model']
    print (index, ': Creating datasets for model %s'%model)

    # Positional slice: entries 1..50 of the row are the candidate features;
    # entries whose string form is 'nan' are dropped (presumably empty spreadsheet cells).
    featureset = [x for x in row[1:51].tolist() if str(x) != 'nan']
    print(','.join(featureset))

    data_processor = SKLearnProcessor(
        framework_version='0.20.0',
        role=role,
        instance_type=instance_type,
        instance_count=instance_count,
    )

    # The raw data file the job downloads from S3.
    raw_data_input = ProcessingInput(
        source='s3://%s%s'%(bucket,path_to_data_file+data_file),
        destination='/opt/ml/processing/input',
    )

    # The two folders the job uploads back to S3, one S3 prefix per model.
    training_output = ProcessingOutput(
        output_name='training_data',
        source='/opt/ml/processing/output/training_data',
        destination='s3://%s%straining_data/%s'%(bucket,path_to_data_file,model),
    )
    validation_output = ProcessingOutput(
        output_name='validation_data',
        source='/opt/ml/processing/output/validation_data',
        destination='s3://%s%svalidation_data/%s'%(bucket,path_to_data_file,model),
    )

    # Command line for preprocessing.py; all blanks are removed from the feature list.
    script_arguments = [
        '--data_file', data_file,
        '--split_year', split_year,
        '--val_size', val_size,
        '--target', target,
        '--model', model,
        '--featureset', ','.join(featureset).replace(' ',''),
    ]

    # Attach the job to the experiment/trial; underscores in the model name are
    # replaced by dashes in the display name of the trial component.
    tracking_config = {
        'ExperimentName': Experiment_name,
        'TrialName': Trial_name,
        'TrialComponentDisplayName': '%s-%s'%(Trial_name,model.replace('_','-')),
    }

    data_processor.run(
        code='preprocessing.py',
        inputs=[raw_data_input],
        outputs=[training_output, validation_output],
        arguments=script_arguments,
        experiment_config=tracking_config,
        wait=False,
    )
    processors.append(data_processor)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


0 : Creating datasets for model FWaterClaims
usagetype_encd,cova_deductible,cova_limit ,sqft,cal_year - yearbuilt,landlordind,water_risk_3_blk,constructioncd_encd


INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2020-12-31-20-05-41-583



Job Name:  sagemaker-scikit-learn-2020-12-31-20-05-41-583
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://kdproperty/Data/dwelling_basedata_v4.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-757107622481/sagemaker-scikit-learn-2020-12-31-20-05-41-583/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'training_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://kdproperty/Data/training_data/FWaterClaims', 'LocalPath': '/opt/ml/processing/output/training_data', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validation_data', 'AppManaged': False, 'S3Output': {'S3Uri'

In [11]:
# Wait until the processing jobs launched above have completed
# (they were started with wait=False, so nothing has waited for them yet).
# NOTE(review): judging by the parameter names only - poll every 10 seconds, print every 10th
# status check, wait at most 60 minutes; confirm against ExperimentsUtils.
eu.wait_processing_jobs(processors=processors,check_every_sec=10,print_every_n_output=10,wait_min=60)

Processing job sagemaker-scikit-learn-2020-12-31-20-05-41-583 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-31-20-05-41-583 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-31-20-05-41-583 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-31-20-05-41-583 status: InProgress
Continue waiting...
All Processing Jobs are Completed


When all jobs are done, I want the list of training and validation datasets created by each job / trial component.

In [12]:
# Collect the trial components of the experiment and keep, for each model,
# the S3 locations of the training and validation data its job produced.
trial_component_analytics = ExperimentAnalytics(
    experiment_name=Experiment_name
)
trial_comp_ds = trial_component_analytics.dataframe()

# Match the trial name literally (regex=False): it is plain text, and characters such as
# '.' or '+' in a name must not be interpreted as a regular expression.
# na=False leaves out components without a display name instead of breaking the boolean mask.
belongs_to_trial = trial_comp_ds['DisplayName'].str.contains(Trial_name, regex=False, na=False)
trial_ds = trial_comp_ds[belongs_to_trial].copy()

# Recover the model name from the display name '<Trial_name>-<model with dashes>'.
# Turning every '-' back into '_' assumes the original model names contain no dashes.
trial_ds['Model'] = (
    trial_ds['DisplayName']
    .str.replace(Trial_name+'-', '', regex=False)
    .str.replace('-', '_', regex=False)
)

trial_ds=trial_ds[['Model','DisplayName','training_data - Value','validation_data - Value']]
trial_ds.columns=['Model','Trial Component','Training_data','Validation_data']
trial_ds

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Unnamed: 0,Model,Trial Component,Training_data,Validation_data
0,FWaterClaims,FWaterClaims-PreparingTrainValidData-FWaterClaims,s3://kdproperty/Data/training_data/FWaterClaims,s3://kdproperty/Data/validation_data/FWaterClaims


The Training_data and Validation_data columns contain only the S3 folder, not the actual file name. I could add the file name based on the model's name, but it is not needed: the training job will use whatever is in that S3 folder.

In [13]:
# Save the S3 locations of the created training and validation datasets into the experiment log
# under '<Experiment_name> InputData' (presumably a sheet of the workbook - confirm in ExperimentsUtils),
# so that the module that trains the models can pick them up.
eu.SaveToExperimentLog(Experiments_file, '%s InputData'%Experiment_name, trial_ds)