In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', None)

In [2]:
## reading exported csv

# to get csv - save 'BEAM Deploy Status and Run Data' as csv
# if there is not enough permissions - save a copy and then save as csv

data = pd.read_csv("../../../beam-production/jupyter/local_files/simulations_spreadsheet.csv", parse_dates=['Time'])

# using only runs from specific data 
min_time = pd.to_datetime("2022-12-01") # yyyy-mm-dd
data = data[data['Time'] > min_time].copy()

print(f"there are roughly {len(data) / 2} runs since {min_time}")
print(f"the latest run is from {data['Time'].max()}")

data['Month Period'] = data['Time'].dt.strftime('%Y-%m')
print(f"following data periods are included: {sorted(data['Month Period'].unique())}")

data.head(3)

there are roughly 200.0 runs since 2022-12-01 00:00:00
the latest run is from 2022-12-13 10:27:36
following data periods are included: ['2022-12']


Unnamed: 0,Status,Run Name,S3 Url,Instance type,Time,Host name,Web browser,Region,Batch,Branch,Commit,Data Branch,Data Commit,Instance ID,Config,Max RAM,Stacktrace,Died Actor,Error,Warning,SigOpt Client Id,SigOpt Dev Id,Profiler,Unnamed: 23,Month Period
16996,Run Started,sfbay-freight-calibration-4-batch8_10712736+htmlrbATusers_noreply_github_com,,r5d.24xlarge,2022-12-01 07:44:13,ec2-18-224-199-181.us-east-2.compute.amazonaws.com,http://ec2-18-224-199-181.us-east-2.compute.amazonaws.com:8000,us-east-2,1ded3732,freight-develop-merge,fe1904ad0290c746ecaf39b91b1a1fafa8d68986,develop,4517e0476102c7fc115ce2a46c0a1509362c96f3,i-0286b968d8686bca3,production/sfbay/freight/sfbay-freight-calibration-4.conf,740g,,,,,LVJBLBUUDVULXEAGEQVKUNHZEUSLBDYHEQBIMKEURFEKDWMA,EZLHTZTCVNWIKGTTTEWYXIILFYAMVGSUVIBVYPOHNAZPVEJK,cpumem,,2022-12
16997,Run Started,sfbay-freight-calibration-3-batch8_10712736+htmlrbATusers_noreply_github_com,,r5d.24xlarge,2022-12-01 07:44:13,ec2-18-219-191-182.us-east-2.compute.amazonaws.com,http://ec2-18-219-191-182.us-east-2.compute.amazonaws.com:8000,us-east-2,94efe454,freight-develop-merge,fe1904ad0290c746ecaf39b91b1a1fafa8d68986,develop,4517e0476102c7fc115ce2a46c0a1509362c96f3,i-0aac5e56939e0726c,production/sfbay/freight/sfbay-freight-calibration-3.conf,740g,,,,,LVJBLBUUDVULXEAGEQVKUNHZEUSLBDYHEQBIMKEURFEKDWMA,EZLHTZTCVNWIKGTTTEWYXIILFYAMVGSUVIBVYPOHNAZPVEJK,cpumem,,2022-12
16998,Run Started,sfbay-freight-calibration-2-batch8_10712736+htmlrbATusers_noreply_github_com,,r5d.24xlarge,2022-12-01 07:44:13,ec2-18-118-145-201.us-east-2.compute.amazonaws.com,http://ec2-18-118-145-201.us-east-2.compute.amazonaws.com:8000,us-east-2,d93338e7,freight-develop-merge,fe1904ad0290c746ecaf39b91b1a1fafa8d68986,develop,4517e0476102c7fc115ce2a46c0a1509362c96f3,i-0b0c8ecc89f7933fc,production/sfbay/freight/sfbay-freight-calibration-2.conf,740g,,,,,LVJBLBUUDVULXEAGEQVKUNHZEUSLBDYHEQBIMKEURFEKDWMA,EZLHTZTCVNWIKGTTTEWYXIILFYAMVGSUVIBVYPOHNAZPVEJK,cpumem,,2022-12


In [3]:
## getting data frame with each row as one simulation

take_first_columns = ['Run Name','Month Period','Branch','Instance type']

df = data.groupby("Host name").agg(list)
for col in take_first_columns:
    df[col] = df.apply(lambda r: r[col][0], axis=1)

df['Time Start'] = df.apply(lambda r: r['Time'][0], axis=1)
df['Time Stop'] = df.apply(lambda r: r['Time'][-1], axis=1)

all_columns = set(df.columns)
taken_columns = take_first_columns + ['Time Start', 'Time Stop']

df = df[taken_columns].copy()

print(f"removed columns: {list(sorted(all_columns - set(taken_columns)))}")

# fix for some wierd shift in the spreadsheet for few rows
for v in ['ec2-18-221-208-40.us-east-2.compute.amazonaws.com','ec2-3-144-69-95.us-east-2.compute.amazonaws.com','ec2-52-15-53-101.us-east-2.compute.amazonaws.com']:
    df.replace(to_replace=v, value='r5d.24xlarge', inplace=True)

df['duration_hours'] = (df['Time Stop'] - df['Time Start']).astype('timedelta64[h]')

df.head(3)



Unnamed: 0_level_0,Run Name,Month Period,Branch,Instance type,Time Start,Time Stop,duration_hours
Host name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ec2-13-58-102-130.us-east-2.compute.amazonaws.com,sfbay-freight-calibration-5-batch8_10712736+htmlrbATusers_noreply_github_com,2022-12,freight-develop-merge,r5d.24xlarge,2022-12-01 07:44:14,2022-12-01 23:26:03,15.0
ec2-13-58-111-254.us-east-2.compute.amazonaws.com,sfbay-micro-mobility-0.3pop-cali25_j503440616ATberkeley_edu,2022-12,zn/tour-mode-with-new-stall-sampling,r5d.24xlarge,2022-12-12 17:30:25,2022-12-13 03:25:43,9.0
ec2-13-58-24-242.us-east-2.compute.amazonaws.com,sfbay-micro-mobility-0.3pop-cali13_j503440616ATberkeley_edu,2022-12,zn/tour-mode-with-new-stall-sampling,r5d.24xlarge,2022-12-13 10:09:58,2022-12-13 10:09:58,0.0


In [4]:
## calculating a price in USD of each simulation

instance_to_price = {
    'c5d.24xlarge' : 4.608,
    'c6a.24xlarge' : 3.672,
    'hpc6a.48xlarge' : 2.88,
    'm4.16xlarge' : 3.2,
    'm5.12xlarge' : 2.304,
    'm5.24xlarge' : 4.608,
    'm5d.24xlarge' : 5.424,
    'r5.24xlarge' : 6.048,
    'r5.2xlarge' : 0.504,
    'r5.4xlarge' : 1.008,
    'r5.8xlarge' : 2.016,
    'r5.large' : 0.126,
    'r5.xlarge' : 0.252,
    'r5d.12xlarge' : 3.456,
    'r5d.16xlarge' : 4.608,
    'r5d.24xlarge' : 6.912,
    't2.medium' : 0.0464                 
}

# for instance_type in sorted(instance_to_price.keys()):
#     print(f"'{instance_type}' : {instance_to_price[instance_type]},")

missing_instance_types = set()
def get_price(row):
    instance_type = row['Instance type']
    if instance_type in instance_to_price :
        return instance_to_price[instance_type]

    missing_instance_types.add(instance_type)
    return 0.0

df['aws_price_cost'] = df.apply(get_price, axis=1)

if len(missing_instance_types) > 0:
    print(f"Can't find price for {len(missing_instance_types)} instance types.")
    for missing_instance in missing_instance_types:
        print(f"'{missing_instance}': ,")
    
df['cost'] = df['duration_hours'] * df['aws_price_cost']
total_cost = int(df['cost'].sum())

def print_total_info():
    dt_interval = f"from {min_time.strftime('%Y-%m-%d')} to {data['Time'].max().strftime('%Y-%m-%d')}"
    print(f"There are {len(df)} simulations {dt_interval}")
    print(f"The total cost of all instances time is ${total_cost}")

print_total_info()
    
df.head(3)

There are 241 simulations from 2022-12-01 to 2022-12-13
The total cost of all instances time is $13103


Unnamed: 0_level_0,Run Name,Month Period,Branch,Instance type,Time Start,Time Stop,duration_hours,aws_price_cost,cost
Host name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ec2-13-58-102-130.us-east-2.compute.amazonaws.com,sfbay-freight-calibration-5-batch8_10712736+htmlrbATusers_noreply_github_com,2022-12,freight-develop-merge,r5d.24xlarge,2022-12-01 07:44:14,2022-12-01 23:26:03,15.0,6.912,103.68
ec2-13-58-111-254.us-east-2.compute.amazonaws.com,sfbay-micro-mobility-0.3pop-cali25_j503440616ATberkeley_edu,2022-12,zn/tour-mode-with-new-stall-sampling,r5d.24xlarge,2022-12-12 17:30:25,2022-12-13 03:25:43,9.0,6.912,62.208
ec2-13-58-24-242.us-east-2.compute.amazonaws.com,sfbay-micro-mobility-0.3pop-cali13_j503440616ATberkeley_edu,2022-12,zn/tour-mode-with-new-stall-sampling,r5d.24xlarge,2022-12-13 10:09:58,2022-12-13 10:09:58,0.0,6.912,0.0


In [5]:
## grouping simulations by projects

def get_owner(row):
    run_name = row['Run Name']
    if '/' in run_name:
        return run_name.split('/')[0]
    return run_name


def get_branch_owner(row):
    branch = row['Branch'].split('/')
    if len(branch) > 1:
        return branch[0]
    return branch


def get_project(row):
    owner = get_owner(row)
    branch_owner = get_branch_owner(row)
    project = f"{owner} | {branch_owner}".lower()
    
    if 'new-york' in project:
        return "NYC"
    if 'freight' in project:
        return "Freight"
    if 'micro-mobility' in project or 'micromobility' in project and 'j503440616atberkeley_edu' in project:
        return "micro-mobility by Xuan"
    if 'shared' in project and 'j503440616atberkeley_edu' in project:
        return "shared fleet by Xuan"
    if 'profiling' in project:
        return "profiling"
    
    return project
    
print_total_info()

df["project"] = df.apply(get_project, axis=1)
df_sum = (df.groupby("project")['cost'].sum() / total_cost).reset_index().sort_values("cost", ascending=False)
df_sum

There are 241 simulations from 2022-12-01 to 2022-12-13
The total cost of all instances time is $13103


Unnamed: 0,project,cost
1,Freight,0.50061
2,NYC,0.190608
7,micro-mobility by Xuan,0.181464
9,shared fleet by Xuan,0.08704
8,profiling,0.025084
3,alex-vv | alex-vv,0.012127
5,edward | edward,0.002572
6,inm | hl,0.000528
0,0.1pop-totalbike-cali13_j503440616atberkeley_edu | xuan,0.0
4,edward | ['develop'],0.0
