### AWS Sagemaker Experiments Cost
Purpose: calculate the cost of experiments based on the time spent on processing, training and inference.
The code loops through the configured list of experiments, reads the names of the related jobs and extracts the run time of each one, as well as its instance type and number of instances.
Based on the configured price per hour for each instance type, the total cost of the experiments is calculated.

In [1]:
# Excel workbook that holds the experiment log.
Experiments_file = '/home/kate/Research/Property/Notebooks/Experiments/Logs/Set1-Classification.xlsx'

# Sheet listing the experiments. Columns: #, Experiment, plus other columns
# that are not used by this notebook.
Experiments_tab = 'Experiments'

# Price per hour for each instance type, taken from
# https://aws.amazon.com/sagemaker/pricing/
cost_map = {
    'ml.c5.xlarge': 0.238,
    'ml.t3.large': 0.1165,
    'ml.t3.2xlarge': 0.4659,
    'ml.m5.xlarge': 0.269,
}

In [2]:
import sys
import boto3
import time
import pandas as pd
import numpy as np
import sagemaker
from smexperiments.experiment import Experiment

from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent

In [3]:
#sys.path.append('/home/kate/Research/YearBuilt/Notebooks/Experiments')
import ExperimentsUtils as eu

1. Reading experiments from an excel file

In [4]:
# Load the experiment list from the configured sheet of the log workbook.
# The path is passed directly so pandas opens and closes the file itself
# (a handle created with open() here would never be closed).
Experiments = pd.read_excel(Experiments_file, sheet_name=Experiments_tab)

In [5]:
# Show the experiment list that was read from the workbook.
Experiments   

Unnamed: 0,#,Experiment,Objective,Status,Result,Dataset,Target
0,1,BaseFeatures,Get a basic set of features,Done,"CntCustomers,usagetype,rplcostdwel,PipeFroze,l...",property_water_claims_non_cat_fs_v5.csv,hasclaim
1,2,FeatureSet,Get full set of features,Done,"customer_cnt_active_policies,usagetype_encd,wa...",property_water_claims_non_cat_fs_v5.csv,hasclaim
2,3,BaseFeaturesPD,Partial Dependency,Done,pipe froze and replcostdwelling do not make se...,property_water_claims_non_cat_fs_v5.csv,hasclaim
3,4,LastFeatures,Check CovA Limit impact,Done,no imapct of cova limit or andlord,property_water_claims_non_cat_fs_v5.csv,hasclaim
4,5,HyperParams,,Done,Nothing ineteresting Just increased eta,property_water_claims_non_cat_fs_v5.csv,hasclaim
5,6,Reg,Researcg Regularization Parameters - alpha and...,Done,lambda 0.1 has best score but even more severe...,property_water_claims_non_cat_fs_v5.csv,hasclaim
6,7,alpha,Research alpha less then 1,Done,alpha = 0.5 is more or less Ok,property_water_claims_non_cat_fs_v5.csv,hasclaim
7,8,Final,Final set of params,In Progress,,property_water_claims_non_cat_fs_v5.csv,hasclaim


In [6]:
# Run this cell before any cell that talks to AWS.
# The role ARN is used when the notebook is run from a local machine.
# NOTE(review): neither `role` nor `region` is referenced by the other cells
# visible in this notebook; both sessions below are created with default
# settings -- confirm the default region is the intended one.
role = 'arn:aws:iam::757107622481:role/service-role/AmazonSageMaker-ExecutionRole-20200819T131882'
region = 'us-west-2'
sm_sess = sagemaker.session.Session()
# SageMaker client used for the describe_*_job calls further down.
sm = boto3.Session().client('sagemaker')

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [7]:
#Experiments summaries
from sagemaker.analytics import ExperimentAnalytics

2. Extracting run time for each job registered in the experiments

In [8]:
# Column order of the resulting JobsSummary frame.
JOB_SUMMARY_COLUMNS = [
    'Experiment', 'Job Name', 'Job Type',
    'ProcessingTimeInSeconds', 'BillableTimeInSeconds',
    'TrainingTimeInSeconds', 'TransformingTimeInSeconds',
    'InstanceType', 'InstanceCount',
]


def _elapsed_seconds(job_desc, start_key, end_key):
    """Return the seconds elapsed between two timestamps of a job description.

    Parameters:
        job_desc: dict returned by one of the describe_*_job calls.
        start_key, end_key: keys of the start and end timestamps.

    Returns:
        The difference as a float number of seconds, or 0 when either
        timestamp is missing from the description.
    """
    started = job_desc.get(start_key)
    ended = job_desc.get(end_key)
    if started is None or ended is None:
        return 0
    # total_seconds() includes whole days and correctly scaled microseconds;
    # gluing str(seconds) + '.' + str(microseconds) does neither.
    return (ended - started).total_seconds()


def _summarize_job(sm_client, source_arn):
    """Describe the job behind a trial component source ARN.

    Parameters:
        sm_client: client exposing describe_training_job,
            describe_processing_job and describe_transform_job.
        source_arn: source ARN string; the job name is the part after the
            first '/'.

    Returns:
        A dict with the job columns of JOB_SUMMARY_COLUMNS (everything except
        'Experiment'), or None when the ARN is not a training, processing or
        transform job. Fields missing from the description default to 0.
    """
    job_name = source_arn[source_arn.index('/') + 1:]
    summary = {
        'Job Name': job_name,
        'Job Type': None,
        'ProcessingTimeInSeconds': 0,
        'BillableTimeInSeconds': 0,
        'TrainingTimeInSeconds': 0,
        'TransformingTimeInSeconds': 0,
        'InstanceType': 0,
        'InstanceCount': 0,
    }

    if 'training-job' in source_arn:
        job_desc = sm_client.describe_training_job(TrainingJobName=job_name)
        resources = job_desc.get('ResourceConfig', {})
        summary['Job Type'] = 'training-job'
        summary['BillableTimeInSeconds'] = job_desc.get('BillableTimeInSeconds', 0)
        summary['TrainingTimeInSeconds'] = job_desc.get('TrainingTimeInSeconds', 0)
    elif 'processing-job' in source_arn:
        job_desc = sm_client.describe_processing_job(ProcessingJobName=job_name)
        resources = job_desc.get('ProcessingResources', {}).get('ClusterConfig', {})
        summary['Job Type'] = 'processing-job'
        summary['ProcessingTimeInSeconds'] = _elapsed_seconds(
            job_desc, 'ProcessingStartTime', 'ProcessingEndTime')
    elif 'transform-job' in source_arn:
        job_desc = sm_client.describe_transform_job(TransformJobName=job_name)
        resources = job_desc.get('TransformResources', {})
        summary['Job Type'] = 'transform-job'
        summary['TransformingTimeInSeconds'] = _elapsed_seconds(
            job_desc, 'TransformStartTime', 'TransformEndTime')
    else:
        # Unknown job kind: let the caller skip it instead of emitting a
        # partially filled row.
        return None

    summary['InstanceType'] = resources.get('InstanceType', 0)
    summary['InstanceCount'] = resources.get('InstanceCount', 0)
    return summary


# Collect one record per job registered in each configured experiment.
job_records = []
for experiment_name in Experiments['Experiment']:
    print('Processing experiment: %s'%experiment_name)
    analytics_ds = ExperimentAnalytics(experiment_name=experiment_name).dataframe()
    if analytics_ds.empty:
        continue
    # Rows without a SourceArn (NaN) have no job name to look up, so they are dropped.
    for source_arn in analytics_ds['SourceArn'].dropna():
        job_summary = _summarize_job(sm, source_arn)
        if job_summary is None:
            print('  Skipping unsupported job source: %s' % source_arn)
            continue
        job_summary['Experiment'] = experiment_name
        job_records.append(job_summary)

# Building the frame from records keeps every row internally consistent;
# `columns` fixes the column order and also covers the no-jobs case.
JobsSummary = pd.DataFrame(job_records, columns=JOB_SUMMARY_COLUMNS)
JobsSummary

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Processing experiment: BaseFeatures


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Processing experiment: FeatureSet


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Processing experiment: BaseFeaturesPD


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Processing experiment: LastFeatures


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Processing experiment: HyperParams


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Processing experiment: Reg


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Processing experiment: alpha


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Processing experiment: Final


Unnamed: 0,Experiment,Job Name,Job Type,ProcessingTimeInSeconds,BillableTimeInSeconds,TrainingTimeInSeconds,TransformingTimeInSeconds,InstanceType,InstanceCount
0,BaseFeatures,waterhfail-45-2021-05-27-22-26-19,training-job,0.0,1022,1022,0.0,ml.c5.xlarge,1
1,BaseFeatures,plumbleak-44-2021-05-27-22-26-07,training-job,0.0,1018,1018,0.0,ml.c5.xlarge,1
2,BaseFeatures,applfail-43-2021-05-27-22-25-55,training-job,0.0,1001,1001,0.0,ml.c5.xlarge,1
3,BaseFeatures,cova-limit-41-2021-05-27-22-25-31,training-job,0.0,1028,1028,0.0,ml.c5.xlarge,1
4,BaseFeatures,waterrisk-42-2021-05-27-22-25-43,training-job,0.0,1007,1007,0.0,ml.c5.xlarge,1
...,...,...,...,...,...,...,...,...,...
163,alpha,basemodel-4-2021-06-01-22-19-28,training-job,0.0,1002,1002,0.0,ml.c5.xlarge,1
164,Final,basemodel-0-2021-06-01-23-46-02,training-job,0.0,1089,1089,0.0,ml.c5.xlarge,1
165,Final,basemodel-2-2021-06-01-23-46-06,training-job,0.0,1045,1045,0.0,ml.c5.xlarge,1
166,Final,basemodel-1-2021-06-01-23-46-04,training-job,0.0,1058,1058,0.0,ml.c5.xlarge,1


3. Adding the hourly instance price and calculating the total experiments cost. Saving to the log file

In [9]:
# Charged seconds per job: processing + billable training + transform time.
# TrainingTimeInSeconds is left out on purpose; BillableTimeInSeconds is the
# training figure that enters the total.
SECONDS_PER_HOUR = 60 * 60
charged_seconds = (
    JobsSummary['ProcessingTimeInSeconds']
    + JobsSummary['BillableTimeInSeconds']
    + JobsSummary['TransformingTimeInSeconds']
)

# Instance-hours: every instance of a job runs for the whole duration.
JobsSummary['TotalTimeHrs'] = JobsSummary['InstanceCount'] * charged_seconds / SECONDS_PER_HOUR
JobsSummary['PricePerHour'] = JobsSummary['InstanceType'].map(cost_map)
JobsSummary['TotalPrice'] = JobsSummary['TotalTimeHrs'] * JobsSummary['PricePerHour']

# Instance types missing from cost_map map to NaN, and NaN prices are skipped
# when the totals are summed. Report them so the total is not silently
# under-stated.
unpriced_types = JobsSummary.loc[JobsSummary['PricePerHour'].isna(), 'InstanceType'].unique()
if len(unpriced_types) > 0:
    print('WARNING: no price configured in cost_map for instance types: %s'
          % ', '.join(str(t) for t in unpriced_types))

JobsSummary.head()

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Unnamed: 0,Experiment,Job Name,Job Type,ProcessingTimeInSeconds,BillableTimeInSeconds,TrainingTimeInSeconds,TransformingTimeInSeconds,InstanceType,InstanceCount,TotalTimeHrs,PricePerHour,TotalPrice
0,BaseFeatures,waterhfail-45-2021-05-27-22-26-19,training-job,0.0,1022,1022,0.0,ml.c5.xlarge,1,0.283889,0.238,0.067566
1,BaseFeatures,plumbleak-44-2021-05-27-22-26-07,training-job,0.0,1018,1018,0.0,ml.c5.xlarge,1,0.282778,0.238,0.067301
2,BaseFeatures,applfail-43-2021-05-27-22-25-55,training-job,0.0,1001,1001,0.0,ml.c5.xlarge,1,0.278056,0.238,0.066177
3,BaseFeatures,cova-limit-41-2021-05-27-22-25-31,training-job,0.0,1028,1028,0.0,ml.c5.xlarge,1,0.285556,0.238,0.067962
4,BaseFeatures,waterrisk-42-2021-05-27-22-25-43,training-job,0.0,1007,1007,0.0,ml.c5.xlarge,1,0.279722,0.238,0.066574


In [10]:
# Save the per-job cost summary into the experiment log workbook.
# NOTE(review): SaveToExperimentLog is defined in ExperimentsUtils (not shown here);
# presumably 'Experiments Cost' is the name of the sheet it writes -- confirm.
eu.SaveToExperimentLog(Experiments_file, 'Experiments Cost', JobsSummary)

## Total experiments time if the instances were run sequentially, not in parallel

In [11]:
# Sum of instance-hours over all jobs (each job's time multiplied by its instance count).
JobsSummary['TotalTimeHrs'].sum()

43.825051944444446

## Total experiments cost

In [12]:
# Sum of the per-job cost. Jobs whose instance type is not in cost_map have a
# NaN price and are skipped by sum(), so they do not contribute to this total.
JobsSummary['TotalPrice'].sum()

10.76598776675