In [1]:
# Excel workbook that holds the experiment definitions, and the sheet listing all experiments.
Experiments_file = '/home/kate/Research/YearBuilt/Experiments/DwellingExperiments.xlsx'
AllExperiments_tab = 'Experiments'
Experiment_name = 'FMultiPol'

# Experiment configuration: the different datasets to try.
# Each row of this sheet holds a model name and the set of features used to build a dataset for SageMaker.
Experiment_tab = '{} Models'.format(Experiment_name)

# It looks like a trial name has to be unique across the whole environment,
# not only within the experiment it belongs to, hence the experiment-name prefix.
Trial_name = '{}-PreparingTrainValidData'.format(Experiment_name)

# The original dataset was created from a Redshift query and uploaded to S3.
bucket = 'kdproperty'
path_to_data_file = '/Data/'

# Kept as strings: both values are passed as command-line arguments to the processing script.
split_year = '2020'
num_folds = '10'

# Compute resources for each processing job.
instance_type = 'ml.t3.large'
instance_count = 1

In [2]:
import sys
import time
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

from sagemaker.analytics import ExperimentAnalytics

In [3]:
# Add the directory that is expected to contain ExperimentsUtils to the import path,
# then import the helper module used throughout this notebook as `eu`.
# NOTE(review): absolute, user-specific path - this cell only works where that directory exists.
sys.path.append('/home/kate/Research/YearBuilt/Notebooks/Property')
import ExperimentsUtils as eu

In [4]:
# Load the sheet that lists every experiment.
# The path is handed to pandas directly so that pandas opens and closes the file itself;
# the previous open(..., 'rb') handle was never closed.
experiments = pd.read_excel(Experiments_file, sheet_name=AllExperiments_tab)

In [5]:
# Start from a clean slate: delete the experiment and its trials if they already exist.
# Per the author's note, output files associated with the experiment's jobs are not deleted.
eu.cleanup_experiment(Experiment_name)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


FMultiPol is a new experiment. nothing to delete


In [6]:
# Register the experiment, then the data-preparation trial inside it.
# The processing jobs launched later in this notebook reference both names in their experiment_config.
eu.create_experiment(Experiment_name)
eu.create_trial(Experiment_name,Trial_name)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [7]:
# Look up the target column and the source data file configured for this experiment.
# The path is passed straight to pandas so no file handle is left open.
experiments = pd.read_excel(Experiments_file, sheet_name=AllExperiments_tab)

# Filter once and fail with a clear message if the experiment is not listed,
# instead of an opaque "index 0 is out of bounds" from .values[0].
experiment_rows = experiments[experiments['Experiment']==Experiment_name]
if experiment_rows.empty:
    raise ValueError('Experiment %s not found in sheet "%s" of %s'%(Experiment_name,AllExperiments_tab,Experiments_file))

# If the experiment is listed more than once, the first row wins (same as before).
target=experiment_rows['Target'].values[0]
print(target)
data_file=experiment_rows['Dataset'].values[0]
print(data_file)

hasclaim
dwelling_basedata_v4.csv


In [8]:
# One row per model: the model name followed by the feature columns that define its dataset.
# The path is passed straight to pandas so no file handle is left open.
models = pd.read_excel(Experiments_file, sheet_name=Experiment_tab)
models

Unnamed: 0,Model,F1,F2,F3,F4,F5,F6
0,BaseModel,usagetype_encd,cova_deductible,cova_limit,sqft,yearbuilt,
1,Landlord,usagetype_encd,cova_deductible,cova_limit,sqft,yearbuilt,landlordind
2,ActiveOtherPolicies,usagetype_encd,cova_deductible,cova_limit,sqft,yearbuilt,cnt_active_other_policies


In [9]:
# Region of the default boto3 session (not referenced again in the cells visible here).
region = boto3.session.Session().region_name
# The execution role ARN is hard-coded; get_execution_role is imported but not called in the visible cells.
# NOTE(review): presumably because the notebook runs with local credentials outside SageMaker - confirm,
# and consider moving the ARN out of the notebook.
role = 'arn:aws:iam::757107622481:role/service-role/AmazonSageMaker-ExecutionRole-20200819T131882'
# SageMaker session whose default bucket is the project bucket.
# NOTE(review): this session is not passed to SKLearnProcessor in the visible cells - confirm whether that is intended.
sagemaker_session = sagemaker.session.Session(default_bucket=bucket)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [10]:
%%writefile preprocessingStratifiedKFold.py

#Training and Validation dataset for SageMaker are the same structure: no headers, the first column is a target and the rest are features


import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

if __name__=='__main__':
    
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_file', type=str)
    parser.add_argument('--split_year', type=int)   
    parser.add_argument('--num_folds', type=int)      
    parser.add_argument('--target', type=str)      
    parser.add_argument('--model', type=str)
    parser.add_argument('--featureset', type=str)    
    args, _ = parser.parse_known_args()    
    print('Received arguments {}'.format(args))
    
    featureset=args.featureset.split(',')
    target_column=args.target
    input_data_path = os.path.join('/opt/ml/processing/input', args.data_file)
    train_data_output_path = '/opt/ml/processing/output/training_data'  
    validation_data_output_path = '/opt/ml/processing/output/validation_data'
  
    
   
    
    print('Reading input data from {}'.format(input_data_path))
    dataset = pd.read_csv(input_data_path, error_bad_lines=False, index_col=False)
    
    
    dataset=dataset[(dataset.cal_year < args.split_year)][featureset + [target_column]]
    
    X = dataset[featureset]
    y = dataset[target_column]

    #StratifiedKFold
    kfold =args.num_folds 
    skf = StratifiedKFold(n_splits=kfold, random_state=42, shuffle=True)
    
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        print(' fold: {}  of  {} : '.format(i+1, kfold))
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index] 
        
        train_data_output_path_fold = os.path.join(train_data_output_path,  'fold_%s_training_%s.csv'%(i,args.model))    
        validation_data_output_path_fold = os.path.join(validation_data_output_path, 'fold_%s_validation_%s.csv'%(i,args.model))       
        
        training_dataset=pd.DataFrame({'hasclaim':y_train}).join(X_train)
        training_dataset.to_csv(train_data_output_path_fold, header=False, index=False)
                                                   
        validation_dataset=pd.DataFrame({'hasclaim':y_valid}).join(X_valid)   
        validation_dataset.to_csv(validation_data_output_path_fold, header=False, index=False)    

Overwriting preprocessingStratifiedKFold.py


I need 10 separate datasets (one per fold) for each model. I cannot use 10 pairs of ProcessingOutput for the training and validation data because of some errors related to size (apparently not the data size).
So I save 10 files in each of the training and validation folders, with a fold_0, fold_1, etc. prefix in the file name.

In [11]:
# Launch one SageMaker Processing job per model; each job writes that model's K-fold files to S3.
# The processors are collected so that a later cell can wait for all jobs to finish.
processors=list()

for index, row in models.iterrows():
    model=row['Model']
    print (index, ': Creating datasets for model %s'%model)
    # Feature names are the cells after the model name (at most 50 of them, as before).
    # Empty cells are skipped with pd.notna rather than by comparing against the string 'nan';
    # spaces are removed here, once, so the printed list and the --featureset argument are identical,
    # and str() keeps a non-text cell from breaking the join below.
    featureset=[str(x).replace(' ','') for x in row.iloc[1:51] if pd.notna(x) and str(x).strip()]
    if not featureset:
        # Fail before paying for a job that could only fail remotely.
        raise ValueError('Model %s has no features configured in sheet "%s"'%(model,Experiment_tab))
    print(','.join(featureset))

    # A separate processor per model, so that each entry of `processors` maps to exactly one job.
    data_processor = SKLearnProcessor(framework_version='0.20.0',
                                     role=role,
                                     instance_type=instance_type,
                                     instance_count=instance_count)

    data_processor.run(code='preprocessingStratifiedKFold.py',
                        inputs=[ProcessingInput(
                        source='s3://%s%s'%(bucket,path_to_data_file+data_file),
                        destination='/opt/ml/processing/input')],
                        outputs=[ProcessingOutput(output_name='training_data', source='/opt/ml/processing/output/training_data',destination='s3://%s%straining_data/%s'%(bucket,path_to_data_file,model)),
                                 ProcessingOutput(output_name='validation_data', source='/opt/ml/processing/output/validation_data',destination='s3://%s%svalidation_data/%s'%(bucket,path_to_data_file,model)),
                                ],
                        arguments=['--data_file',data_file,
                                 '--split_year',split_year,
                                 '--num_folds',num_folds,
                                 '--target',target,
                                 '--model',model,
                                 '--featureset', ','.join(featureset)],
                       experiment_config = {
        'ExperimentName': Experiment_name ,
        'TrialName' : Trial_name,
        # Underscores in the model name are replaced by hyphens for the display name;
        # a later cell reverses this when it recovers the model name.
        'TrialComponentDisplayName' : '%s-%s'%(Trial_name,model.replace('_','-')),},
                    # Do not block here: all jobs are started first and awaited together afterwards.
                    wait=False
                     )
    processors.append(data_processor)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


0 : Creating datasets for model BaseModel
usagetype_encd,cova_deductible,cova_limit ,sqft,yearbuilt  


INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2020-12-30-22-02-17-778



Job Name:  sagemaker-scikit-learn-2020-12-30-22-02-17-778
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://kdproperty/Data/dwelling_basedata_v4.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-757107622481/sagemaker-scikit-learn-2020-12-30-22-02-17-778/input/code/preprocessingStratifiedKFold.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'training_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://kdproperty/Data/training_data/BaseModel', 'LocalPath': '/opt/ml/processing/output/training_data', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validation_data', 'AppManaged': False, 'S3Outpu

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


1 : Creating datasets for model Landlord
usagetype_encd,cova_deductible,cova_limit ,sqft,yearbuilt  ,landlordind


INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2020-12-30-22-02-19-013



Job Name:  sagemaker-scikit-learn-2020-12-30-22-02-19-013
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://kdproperty/Data/dwelling_basedata_v4.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-757107622481/sagemaker-scikit-learn-2020-12-30-22-02-19-013/input/code/preprocessingStratifiedKFold.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'training_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://kdproperty/Data/training_data/Landlord', 'LocalPath': '/opt/ml/processing/output/training_data', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validation_data', 'AppManaged': False, 'S3Output

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


2 : Creating datasets for model ActiveOtherPolicies
usagetype_encd,cova_deductible,cova_limit ,sqft,yearbuilt  ,cnt_active_other_policies


INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2020-12-30-22-02-23-544



Job Name:  sagemaker-scikit-learn-2020-12-30-22-02-23-544
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://kdproperty/Data/dwelling_basedata_v4.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-757107622481/sagemaker-scikit-learn-2020-12-30-22-02-23-544/input/code/preprocessingStratifiedKFold.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'training_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://kdproperty/Data/training_data/ActiveOtherPolicies', 'LocalPath': '/opt/ml/processing/output/training_data', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validation_data', 'AppManaged': False

In [12]:
# Wait until the processing jobs started above have completed.
# NOTE(review): judging by the parameter names, status is checked every 15 seconds, printed on every
# 6th check, and waiting stops after 60 minutes - confirm against ExperimentsUtils.
eu.wait_processing_jobs(processors=processors,check_every_sec=15,print_every_n_output=6,wait_min=60)

Processing job sagemaker-scikit-learn-2020-12-30-22-02-17-778 status: InProgress
Processing job sagemaker-scikit-learn-2020-12-30-22-02-19-013 status: InProgress
Processing job sagemaker-scikit-learn-2020-12-30-22-02-23-544 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-30-22-02-17-778 status: InProgress
Processing job sagemaker-scikit-learn-2020-12-30-22-02-19-013 status: InProgress
Processing job sagemaker-scikit-learn-2020-12-30-22-02-23-544 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-30-22-02-17-778 status: InProgress
Processing job sagemaker-scikit-learn-2020-12-30-22-02-19-013 status: InProgress
Processing job sagemaker-scikit-learn-2020-12-30-22-02-23-544 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-30-22-02-17-778 status: InProgress
Processing job sagemaker-scikit-learn-2020-12-30-22-02-19-013 status: InProgress
Processing job sagemaker-scikit-learn-2020-12-30-

When all jobs are done, I want to have the list of training and validation data created in each job/trial component.

In [13]:
# The author found no way to get analytics for one specific trial with the SageMaker Python SDK.
# An experiment may contain more than one data-preparation trial (or is the intent one trial per experiment?),
# so the trial components are filtered by DisplayName, which includes the trial name.
from sagemaker.analytics import ExperimentAnalytics
trial_component_analytics = ExperimentAnalytics(
    experiment_name=Experiment_name
)
trial_comp_ds = trial_component_analytics.dataframe()

# Match the trial name literally (regex=False) so characters such as '.' or '+' in a name are not
# interpreted as a pattern; na=False keeps components without a display name out of the selection
# instead of breaking the boolean filter.
belongs_to_trial = trial_comp_ds['DisplayName'].str.contains(Trial_name, regex=False, na=False)
trial_ds=trial_comp_ds[belongs_to_trial].copy()

# Recover the model name: drop the '<Trial_name>-' prefix, then turn the hyphens back into underscores.
# regex=False makes both replacements literal regardless of the pandas version's default.
# NOTE(review): a model name that originally contained '-' is not restored correctly by this round trip.
trial_ds['Model']=trial_ds['DisplayName'].str.replace(Trial_name+'-','',regex=False)
trial_ds['Model']=trial_ds['Model'].str.replace('-','_',regex=False)

# Keep the model, its trial component and the S3 locations of both outputs.
trial_ds=trial_ds[['Model','DisplayName','training_data - Value','validation_data - Value']]
trial_ds.columns=['Model','Trial Component','Training_data','Validation_data']
trial_ds

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Unnamed: 0,Model,Trial Component,Training_data,Validation_data
0,ActiveOtherPolicies,FMultiPol-PreparingTrainValidData-ActiveOtherP...,s3://kdproperty/Data/training_data/ActiveOther...,s3://kdproperty/Data/validation_data/ActiveOth...
1,Landlord,FMultiPol-PreparingTrainValidData-Landlord,s3://kdproperty/Data/training_data/Landlord,s3://kdproperty/Data/validation_data/Landlord
2,BaseModel,FMultiPol-PreparingTrainValidData-BaseModel,s3://kdproperty/Data/training_data/BaseModel,s3://kdproperty/Data/validation_data/BaseModel


We need the actual file names because we need to train separate models on separate folds.

In [14]:
# Expand every model row into one row per fold, pointing at that fold's training and validation files.
# Rows are produced fold by fold (outer loop) and, within a fold, in trial_ds order (inner loop).
fold_rows = [
    (
        '%s_%s'%(row['Model'],i),
        row['Trial Component'],
        '%s/fold_%s_training_%s.csv'%(row['Training_data'],i,row['Model']),
        '%s/fold_%s_validation_%s.csv'%(row['Validation_data'],i,row['Model']),
    )
    for i in range(0,int(num_folds))
    for _, row in trial_ds.iterrows()
]

trial_ds_extended = pd.DataFrame(
    fold_rows,
    columns=['Model','Trial Component','Training_data','Validation_data'],
).sort_values('Model', ascending=False)
trial_ds_extended

Unnamed: 0,Model,Trial Component,Training_data,Validation_data
28,Landlord_9,FMultiPol-PreparingTrainValidData-Landlord,s3://kdproperty/Data/training_data/Landlord/fo...,s3://kdproperty/Data/validation_data/Landlord/...
25,Landlord_8,FMultiPol-PreparingTrainValidData-Landlord,s3://kdproperty/Data/training_data/Landlord/fo...,s3://kdproperty/Data/validation_data/Landlord/...
22,Landlord_7,FMultiPol-PreparingTrainValidData-Landlord,s3://kdproperty/Data/training_data/Landlord/fo...,s3://kdproperty/Data/validation_data/Landlord/...
19,Landlord_6,FMultiPol-PreparingTrainValidData-Landlord,s3://kdproperty/Data/training_data/Landlord/fo...,s3://kdproperty/Data/validation_data/Landlord/...
16,Landlord_5,FMultiPol-PreparingTrainValidData-Landlord,s3://kdproperty/Data/training_data/Landlord/fo...,s3://kdproperty/Data/validation_data/Landlord/...
13,Landlord_4,FMultiPol-PreparingTrainValidData-Landlord,s3://kdproperty/Data/training_data/Landlord/fo...,s3://kdproperty/Data/validation_data/Landlord/...
10,Landlord_3,FMultiPol-PreparingTrainValidData-Landlord,s3://kdproperty/Data/training_data/Landlord/fo...,s3://kdproperty/Data/validation_data/Landlord/...
7,Landlord_2,FMultiPol-PreparingTrainValidData-Landlord,s3://kdproperty/Data/training_data/Landlord/fo...,s3://kdproperty/Data/validation_data/Landlord/...
4,Landlord_1,FMultiPol-PreparingTrainValidData-Landlord,s3://kdproperty/Data/training_data/Landlord/fo...,s3://kdproperty/Data/validation_data/Landlord/...
1,Landlord_0,FMultiPol-PreparingTrainValidData-Landlord,s3://kdproperty/Data/training_data/Landlord/fo...,s3://kdproperty/Data/validation_data/Landlord/...


In [15]:
# Save the S3 file names of the created training and validation datasets to the experiment log,
# so that another module can train models on them.
# NOTE(review): the second argument is presumably the sheet name ('<Experiment_name> InputData') - confirm in ExperimentsUtils.
eu.SaveToExperimentLog(Experiments_file, '%s InputData'%Experiment_name, trial_ds_extended)