In [1]:
# --- Experiment bookkeeping ---------------------------------------------------
# Excel workbook used both as the source of the experiment configuration and as the experiment log.
Experiments_file = '/home/kate/Research/YearBuilt/Experiments/Experiments.xlsx'
AllExperiments_tab = 'Experiments'
Experiment_name = 'Overfitting'

# Experiment configuration: the different datasets to try.
# Each row of this tab holds a model name and the set of features used to build a dataset for SageMaker.
Experiment_tab = '{} Models'.format(Experiment_name)

# It looks like a trial name has to be unique across the whole environment,
# not just within the experiment it belongs to.
Trial_name = '{}-PreparingTrainValidData'.format(Experiment_name)

# --- Source data --------------------------------------------------------------
# The original dataset was created from a Redshift query and uploaded into S3.
bucket = 'kdproperty'
path_to_data_file = '/Data/'
data_file = 'property_basedata_v3.csv'
target = 'hasclaim'

# Only rows with cal_year earlier than this year are used by the preprocessing script.
# Kept as a string because it is passed on as a command-line argument and as a tag value.
split_year = '2020'

# Fraction of the data assigned to training in each batch (the rest becomes validation).
# Kept as strings: they are joined into a command-line argument and used in output file names.
training_dataset_sizes = ['0.7', '0.65', '0.6', '0.55', '0.5', '0.45', '0.4', '0.35']

In [4]:
import sys
import time
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

In [5]:
# Add the project notebook folder to the import path so that the local
# ExperimentsUtils helper module (used below as `eu`) can be imported.
sys.path.append('/home/kate/Research/YearBuilt/Notebooks/Property')
import ExperimentsUtils as eu

In [6]:
# Start from a clean slate: delete the experiment and its trials.
# Per the author's note, output files associated with the experiment's jobs are NOT deleted.
eu.cleanup_experiment(Experiment_name)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials

In [8]:
# Create the experiment first, then the trial inside it; the processing jobs
# launched later in this notebook are attached to this experiment/trial pair.
eu.create_experiment(Experiment_name)
eu.create_trial(Experiment_name,Trial_name)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [9]:
# Load the experiment configuration: one row per model, with the model name in
# 'Model' and its feature names in the columns that follow.
# The workbook is opened in a `with` block so the file handle is closed as soon
# as the sheet has been read (the bare open(...) call never closed it).
with open(Experiments_file, 'rb') as experiments_workbook:
    models = pd.read_excel(experiments_workbook, sheet_name=Experiment_tab)
models

Unnamed: 0,Model,F1,F2,F3,F4,F5,F6,F7,F8,F9
0,FinalModel,roofcd_encd,usagetype_encd,cova_deductible,cova_limit,sqft,yearbuilt,landlordind,water_risk_3_blk,constructioncd_encd


In [10]:
# AWS region of the default boto3 session.
# NOTE(review): `region` is not used by any other cell visible in this notebook.
region = boto3.session.Session().region_name
# Execution role assumed by the SageMaker processing jobs. The ARN is hard-coded;
# `get_execution_role` is imported above but not used here.
role = 'arn:aws:iam::757107622481:role/service-role/AmazonSageMaker-ExecutionRole-20200819T131882'
# SageMaker session whose default bucket is the project bucket.
# NOTE(review): this session is not passed to SKLearnProcessor further down, so the
# processing jobs presumably use the SDK's own default session/bucket - confirm this is intended.
sagemaker_session = sagemaker.session.Session(default_bucket=bucket)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


'kdproperty'

In [11]:
%%writefile preprocessingBatchesToTestOverfiiting.py

#Training and Validation dataset for SageMaker are the same structure: no headers, the first column is a target and the rest are features


import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

if __name__=='__main__':
    
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_file', type=str)
    parser.add_argument('--split_year', type=int)    
    parser.add_argument('--target', type=str)     
    parser.add_argument('--model', type=str)
    parser.add_argument('--featureset', type=str)  
    parser.add_argument('--training_dataset_sizes', type=str) 
    args, _ = parser.parse_known_args()   
    
    print('Received arguments {}'.format(args))
    
    featureset=args.featureset.split(',')
    target_column=args.target
    training_dataset_sizes=args.training_dataset_sizes.split(',')
    input_data_path = os.path.join('/opt/ml/processing/input', args.data_file)
    train_data_output_path = '/opt/ml/processing/output/training_data'  
    validation_data_output_path = '/opt/ml/processing/output/validation_data'

    
   
    
    print('Reading input data from {}'.format(input_data_path))
    dataset = pd.read_csv(input_data_path, error_bad_lines=False, index_col=False)
    
    
    dataset=dataset[(dataset.cal_year < args.split_year)][featureset + [target_column]]
    
    X = dataset[featureset]
    y = dataset[target_column]
    
    for s in training_dataset_sizes:
        
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=1 - float(s), random_state=42)
       
        
        train_data_output_path_batch = os.path.join(train_data_output_path,  'batch_%s_training_%s.csv'%(s,args.model))    
        validation_data_output_path_batch = os.path.join(validation_data_output_path, 'batch_%s_validation_%s.csv'%(s,args.model))
     
        
        training_dataset=pd.DataFrame({'hasclaim':y_train}).join(X_train)
        training_dataset.to_csv(train_data_output_path_batch, header=False, index=False)
                                                   
        validation_dataset=pd.DataFrame({'hasclaim':y_val}).join(X_val)   
        validation_dataset.to_csv(validation_data_output_path_batch, header=False, index=False)    


Overwriting preprocessingBatchesToTestOverfiiting.py


I need several datasets of different sizes for each model. I cannot use N pairs of ProcessingOutput for the training and validation data because of errors that seem to be related to the size of the parameters (not the size of the data).
So I save one file per training-set size in each of the training and validation folders, with a batch_<size>_ prefix at the front of the file name (e.g. batch_0.7_, batch_0.65_, etc.).

In [13]:
# Launch one SageMaker Processing job per configured model. Jobs are started
# without waiting (wait=False) and the processors are collected for later polling.
processors = []

for index, row in models.iterrows():
    model = row['Model']
    print (index, ': Creating datasets for model %s'%model)

    # Feature names are read positionally from the columns after the first one;
    # cells left empty in the sheet show up as NaN and are dropped.
    featureset = [x for x in row.iloc[1:51].tolist() if str(x) != 'nan']
    print(','.join(featureset))

    # Tags record what this job was built from.
    job_tags = [
        {'Key': 'Model', 'Value': model},
        {'Key': 'Featureset', 'Value': ':'.join(featureset)},
        {'Key': 'split_year', 'Value': split_year},
        {'Key': 'training_dataset_sizes', 'Value': ':'.join(training_dataset_sizes)},
    ]
    data_processor = SKLearnProcessor(
        framework_version='0.20.0',
        role=role,
        instance_type='ml.t3.medium',
        instance_count=1,
        tags=job_tags,
    )

    # The raw data file is copied from S3 into the container's input folder.
    job_inputs = [
        ProcessingInput(
            source='s3://%s%s'%(bucket,path_to_data_file+data_file),
            destination='/opt/ml/processing/input',
        ),
    ]
    # Two output folders only; each one receives a file per training-set size.
    job_outputs = [
        ProcessingOutput(output_name='training_data', source='/opt/ml/processing/output/training_data'),
        ProcessingOutput(output_name='validation_data', source='/opt/ml/processing/output/validation_data'),
    ]
    # Command-line arguments for the preprocessing script; spaces are removed from the feature list.
    job_arguments = [
        '--data_file', data_file,
        '--split_year', split_year,
        '--target', target,
        '--model', model,
        '--featureset', ','.join(featureset).replace(' ',''),
        '--training_dataset_sizes', ','.join(training_dataset_sizes),
    ]
    # Underscores in the model name are replaced by hyphens in the display name.
    job_experiment_config = {
        'ExperimentName': Experiment_name,
        'TrialName': Trial_name,
        'TrialComponentDisplayName': '%s-%s'%(Trial_name,model.replace('_','-')),
    }

    data_processor.run(
        code='preprocessingBatchesToTestOverfiiting.py',
        inputs=job_inputs,
        outputs=job_outputs,
        arguments=job_arguments,
        experiment_config=job_experiment_config,
        wait=False,
    )
    processors.append(data_processor)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


0 : Creating datasets for model FinalModel
usagetype_encd,cova_deductible,cova_limit ,sqft,yearbuilt  ,landlordind,water_risk_3_blk,constructioncd_encd


INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2020-12-17-17-53-30-180



Job Name:  sagemaker-scikit-learn-2020-12-17-17-53-30-180
Inputs:  [{'InputName': 'input-1', 'S3Input': {'S3Uri': 's3://kdproperty/Data/property_basedata_v3.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-757107622481/sagemaker-scikit-learn-2020-12-17-17-53-30-180/input/code/preprocessingBatchesToTestOverfiiting.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'training_data', 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-757107622481/sagemaker-scikit-learn-2020-12-17-17-53-30-180/output/training_data', 'LocalPath': '/opt/ml/processing/output/training_data', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validation_data', 'S3Output': {'S3Uri': 

In [14]:
# Wait until all processing jobs launched above have completed.
# NOTE(review): the exact meaning of the polling arguments is defined in
# ExperimentsUtils.wait_processing_jobs; going by their names this checks every 15 seconds,
# prints every 6th status and gives up after 60 minutes - confirm against the helper.
eu.wait_processing_jobs(processors=processors,check_every_sec=15,print_every_n_output=6,wait_min=60)

Processing job sagemaker-scikit-learn-2020-12-17-17-53-30-180 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-17-17-53-30-180 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-17-17-53-30-180 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-17-17-53-30-180 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-17-17-53-30-180 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-17-17-53-30-180 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-17-17-53-30-180 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-17-17-53-30-180 status: InProgress
Continue waiting...
Processing job sagemaker-scikit-learn-2020-12-17-17-53-30-180 status: InProgress
Continue waiting...
All Processing Jobs are Completed


When all jobs are done, I want to have the list of training and validation data created in each job/trial component.

In [15]:
from sagemaker.analytics import ExperimentAnalytics

# Pull every trial component of the experiment into a DataFrame.
trial_component_analytics = ExperimentAnalytics(
    experiment_name=Experiment_name
)
trial_comp_ds = trial_component_analytics.dataframe()

# Keep only the components of this trial. The trial name is matched as a literal
# string (regex=False) so that characters such as '.' or '(' in a name are not
# interpreted as a pattern; components without a display name are skipped (na=False).
belongs_to_trial = trial_comp_ds['DisplayName'].str.contains(Trial_name, regex=False, na=False)
trial_ds = trial_comp_ds[belongs_to_trial].copy()

# Recover the model name from the display name: drop the '<Trial_name>-' prefix and
# turn hyphens back into underscores (the reverse of what was done when the job was launched).
trial_ds['Model'] = (
    trial_ds['DisplayName']
    .str.replace(Trial_name+'-', '', regex=False)
    .str.replace('-', '_', regex=False)
)

# Keep the model, the trial component and the S3 locations of its two outputs.
trial_ds = trial_ds[['Model','DisplayName','training_data - Value','validation_data - Value']]
trial_ds.columns = ['Model','Trial Component','Training_data','Validation_data']
trial_ds

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Unnamed: 0,Model,Trial Component,Training_data,Validation_data
0,FinalModel,Overfitting-PreparingTrainValidData-FinalModel,s3://sagemaker-us-west-2-757107622481/sagemake...,s3://sagemaker-us-west-2-757107622481/sagemake...


We need the actual file names because we need to train separate models on separate batches (one per training-set size).

In [16]:
# Expand every trial-component row into one row per training-set size, pointing at the
# individual CSV files that the preprocessing script wrote into the output folders.
batch_rows = [
    (
        '%s_batch_%s'%(row['Model'],s),
        row['Trial Component'],
        '%s/batch_%s_training_%s.csv'%(row['Training_data'],s,row['Model']),
        '%s/batch_%s_validation_%s.csv'%(row['Validation_data'],s,row['Model']),
    )
    for s in training_dataset_sizes
    for _, row in trial_ds.iterrows()
]
trial_ds_extended = pd.DataFrame(
    batch_rows,
    columns=['Model','Trial Component','Training_data','Validation_data'],
).sort_values('Model', ascending=False)
trial_ds_extended

Unnamed: 0,Model,Trial Component,Training_data,Validation_data
0,FinalModel_batch_0.7,Overfitting-PreparingTrainValidData-FinalModel,s3://sagemaker-us-west-2-757107622481/sagemake...,s3://sagemaker-us-west-2-757107622481/sagemake...
1,FinalModel_batch_0.65,Overfitting-PreparingTrainValidData-FinalModel,s3://sagemaker-us-west-2-757107622481/sagemake...,s3://sagemaker-us-west-2-757107622481/sagemake...
2,FinalModel_batch_0.6,Overfitting-PreparingTrainValidData-FinalModel,s3://sagemaker-us-west-2-757107622481/sagemake...,s3://sagemaker-us-west-2-757107622481/sagemake...
3,FinalModel_batch_0.55,Overfitting-PreparingTrainValidData-FinalModel,s3://sagemaker-us-west-2-757107622481/sagemake...,s3://sagemaker-us-west-2-757107622481/sagemake...
4,FinalModel_batch_0.5,Overfitting-PreparingTrainValidData-FinalModel,s3://sagemaker-us-west-2-757107622481/sagemake...,s3://sagemaker-us-west-2-757107622481/sagemake...
5,FinalModel_batch_0.45,Overfitting-PreparingTrainValidData-FinalModel,s3://sagemaker-us-west-2-757107622481/sagemake...,s3://sagemaker-us-west-2-757107622481/sagemake...
6,FinalModel_batch_0.4,Overfitting-PreparingTrainValidData-FinalModel,s3://sagemaker-us-west-2-757107622481/sagemake...,s3://sagemaker-us-west-2-757107622481/sagemake...
7,FinalModel_batch_0.35,Overfitting-PreparingTrainValidData-FinalModel,s3://sagemaker-us-west-2-757107622481/sagemake...,s3://sagemaker-us-west-2-757107622481/sagemake...


In [17]:
# Save the names of the created training and validation dataset files in S3 into the
# experiment log, so that models can be trained on them in another module.
# The per-batch table (trial_ds_extended) is saved rather than trial_ds: trial_ds only holds
# the output folder of each job, while the individual batch file names are needed for training.
eu.SaveToExperimentLog(Experiments_file, '%s InputData'%Experiment_name, trial_ds_extended)