In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Raise pandas' display limits (rows, columns, console width) for the tables shown in this notebook.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

In [3]:
## loading the exported csv

# How to get the csv: save 'BEAM Deploy Status and Run Data' as csv.
# Without sufficient permissions, save a copy of the sheet first and export that copy as csv.
csv_path = "../../../beam-production/jupyter/local_files/Copy of BEAM Deploy Status and Run Data - BEAM Instances.csv"
data = pd.read_csv(csv_path, parse_dates=['Time'])

# keep only the rows logged strictly after the cut-off date
min_time = pd.to_datetime("2022-03-01") # yyyy-mm-dd
is_after_cutoff = data['Time'] > min_time
data = data.loc[is_after_cutoff].copy()

# NOTE(review): the row count is halved, presumably because each run logs about two rows - confirm
print(f"there are roughly {len(data) / 2} runs since {min_time}")
print(f"the latest run is from {data['Time'].max()}")

# year-month label of every row, e.g. '2022-03'
data['Month Period'] = data['Time'].dt.strftime('%Y-%m')
included_periods = sorted(data['Month Period'].unique())
print(f"following data periods are included: {included_periods}")

data.head(3)

there are roughly 898.0 runs since 2022-03-01 00:00:00
the latest run is from 2022-05-24 14:01:46
following data periods are included: ['2022-03', '2022-04', '2022-05']


Unnamed: 0,Status,Run Name,Instance ID,Instance type,Time,Host name,Web browser,Region,Batch,Branch,Commit,S3 Url,Config,Max RAM,Stacktrace,Died Actor,Error,Warning,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Month Period
10866,Run Started,gemini-scenario-4-constrained-scaledUp-25perce...,i-0e687a480c996abea,r5d.24xlarge,2022-03-01 03:10:41,ec2-52-15-53-54.us-east-2.compute.amazonaws.com,http://ec2-52-15-53-54.us-east-2.compute.amazo...,us-east-2,a6523d0b,gemini-develop,360a0e6dd136cf70f6622c33505d39ff75d59638,,production/sfbay/gemini/gemini-scenario-4-cons...,740g,,,,,LVJBLBUUDVULXEAGEQVKUNHZEUSLBDYHEQBIMKEURFEKDWMA,EZLHTZTCVNWIKGTTTEWYXIILFYAMVGSUVIBVYPOHNAZPVEJK,develop,,2022-03
10867,Run Started,gemini-scenario-4-constrained-scaledUp-25perce...,i-09ab6664da0c975d6,r5d.24xlarge,2022-03-01 03:54:28,ec2-18-119-162-84.us-east-2.compute.amazonaws.com,http://ec2-18-119-162-84.us-east-2.compute.ama...,us-east-2,f44402b2,gemini-develop,360a0e6dd136cf70f6622c33505d39ff75d59638,,production/sfbay/gemini/gemini-scenario-4-scal...,740g,,,,,LVJBLBUUDVULXEAGEQVKUNHZEUSLBDYHEQBIMKEURFEKDWMA,EZLHTZTCVNWIKGTTTEWYXIILFYAMVGSUVIBVYPOHNAZPVEJK,develop,,2022-03
10868,Run Started,gemini-scenario-4-constrained-scaledUp-40perce...,i-077efab479b67af52,r5d.24xlarge,2022-03-01 03:54:29,ec2-3-145-20-131.us-east-2.compute.amazonaws.com,http://ec2-3-145-20-131.us-east-2.compute.amaz...,us-east-2,7d851334,gemini-develop,360a0e6dd136cf70f6622c33505d39ff75d59638,,production/sfbay/gemini/gemini-scenario-4-scal...,740g,,,,,LVJBLBUUDVULXEAGEQVKUNHZEUSLBDYHEQBIMKEURFEKDWMA,EZLHTZTCVNWIKGTTTEWYXIILFYAMVGSUVIBVYPOHNAZPVEJK,develop,,2022-03


In [4]:
## getting data frame with each row as one simulation

take_first_columns = ['Run Name','Month Period','Branch','Instance type']

# All rows logged for the same host name are treated as one simulation.
rows_by_host = data.groupby("Host name")

# For the descriptive columns keep the value from the first row of every host
# (first in csv order; groupby keeps the row order inside each group).
df = rows_by_host[take_first_columns].agg(lambda values: values.iloc[0])

# Earliest / latest timestamp per host. min/max give the right start and stop
# even if the rows of a host are not in chronological order in the csv.
df['Time Start'] = rows_by_host['Time'].min()
df['Time Stop'] = rows_by_host['Time'].max()

taken_columns = take_first_columns + ['Time Start', 'Time Stop']
# every column that exists after grouping: the csv columns without the grouping key,
# plus the two derived time columns
all_columns = (set(data.columns) - {"Host name"}) | {'Time Start', 'Time Stop'}

print(f"removed columns: {list(sorted(all_columns - set(taken_columns)))}")

# fix for some weird shift in the spreadsheet for a few rows:
# cells holding one of these host names are replaced with the instance type
shifted_host_names = [
    'ec2-18-221-208-40.us-east-2.compute.amazonaws.com',
    'ec2-3-144-69-95.us-east-2.compute.amazonaws.com',
    'ec2-52-15-53-101.us-east-2.compute.amazonaws.com',
]
for shifted_host_name in shifted_host_names:
    df = df.replace(to_replace=shifted_host_name, value='r5d.24xlarge')

# Whole elapsed hours as float (rounded down). `.astype('timedelta64[h]')` produced the
# same values on pandas 1.x but raises on pandas 2.x, which only supports s/ms/us/ns units.
# NOTE(review): a run shorter than one hour gets duration 0 here, so anything
# multiplied by duration_hours is 0 for such runs - confirm this is intended.
elapsed = df['Time Stop'] - df['Time Start']
df['duration_hours'] = elapsed.dt.total_seconds() // 3600

df.head(3)



Unnamed: 0_level_0,Run Name,Month Period,Branch,Instance type,Time Start,Time Stop,duration_hours
Host name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ec2-13-58-100-43.us-east-2.compute.amazonaws.com,inm/nyc-collaboration-plans-generation_irishwi...,2022-05,inm/nyc-code-change-with-plans-generation,r5d.24xlarge,2022-05-13 17:53:09,2022-05-13 17:53:09,0.0
ec2-13-58-112-199.us-east-2.compute.amazonaws.com,rrp/gemini-calibration-23_heyrutvikATgmail_com,2022-05,rrp/hl/gemini-develop-overnight,r5d.24xlarge,2022-05-16 18:44:20,2022-05-16 20:53:45,2.0
ec2-13-58-116-208.us-east-2.compute.amazonaws.com,rrp/gemini-calibration-6k-095_heyrutvikATgmail...,2022-04,rrp/hl/gemini-develop-enroute-overnight-charging,r5d.24xlarge,2022-04-25 12:22:47,2022-04-25 13:08:56,0.0


In [5]:
## just a check
# Lists the distinct values found in the 'Instance type' column,
# so unexpected entries can be noticed by eye.
df['Instance type'].unique()

array(['r5d.24xlarge', 'r5.xlarge', 'r5.24xlarge', 'r5.8xlarge',
       'm5.24xlarge', 'r5.2xlarge', 'm4.16xlarge', 'm5d.24xlarge'],
      dtype=object)

In [6]:
## calculating a price in USD of each simulation

# Price of every known instance type, keyed by the value of the 'Instance type' column.
instance_to_price = {
    'r5d.24xlarge': 6.912,
    'm5d.24xlarge': 5.424,
    'r5.xlarge': 0.252,
    'r5.24xlarge': 6.048,
    'r5.8xlarge': 2.016,
    'm5.24xlarge': 4.608,
    'r5.2xlarge': 0.504,
    'm4.16xlarge': 3.20,
}

# Instance types passed to get_price that have no entry in instance_to_price.
missing_instance_types = set()


def get_price(row):
    """Return the price listed for the instance type of ``row``.

    Reads ``row['Instance type']`` and looks it up in ``instance_to_price``.
    An unknown instance type is recorded in ``missing_instance_types`` and
    priced at 0.0.
    """
    instance_type = row['Instance type']
    try:
        return instance_to_price[instance_type]
    except KeyError:
        # remember the unknown type so it can be reported after all rows are priced
        missing_instance_types.add(instance_type)
        return 0.0

# price of every simulation, looked up row by row from its instance type
df['aws_price_cost'] = df.apply(get_price, axis=1)

# report the instance types for which get_price had no price (those rows are priced at 0.0)
if missing_instance_types:
    print(f"Can't find price for following instances: {missing_instance_types}")

# cost of a simulation = price * duration in hours
df['cost'] = df['aws_price_cost'] * df['duration_hours']
rounded_total = int(df['cost'].sum())
print(f"The total cost of all instances from df: {rounded_total}")
df.head(3)

The total cost of all instances from df: 32355


Unnamed: 0_level_0,Run Name,Month Period,Branch,Instance type,Time Start,Time Stop,duration_hours,aws_price_cost,cost
Host name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ec2-13-58-100-43.us-east-2.compute.amazonaws.com,inm/nyc-collaboration-plans-generation_irishwi...,2022-05,inm/nyc-code-change-with-plans-generation,r5d.24xlarge,2022-05-13 17:53:09,2022-05-13 17:53:09,0.0,6.912,0.0
ec2-13-58-112-199.us-east-2.compute.amazonaws.com,rrp/gemini-calibration-23_heyrutvikATgmail_com,2022-05,rrp/hl/gemini-develop-overnight,r5d.24xlarge,2022-05-16 18:44:20,2022-05-16 20:53:45,2.0,6.912,13.824
ec2-13-58-116-208.us-east-2.compute.amazonaws.com,rrp/gemini-calibration-6k-095_heyrutvikATgmail...,2022-04,rrp/hl/gemini-develop-enroute-overnight-charging,r5d.24xlarge,2022-04-25 12:22:47,2022-04-25 13:08:56,0.0,6.912,0.0


In [17]:
## grouping simulation costs by project ("run-name owner | branch owner")

def get_owner(row):
    """Return the owner prefix of a run name, or "??" when there is none.

    The owner is the text before the first '/' in ``row['Run Name']``,
    e.g. ``"rrp/gemini-calibration-23"`` -> ``"rrp"``. A run name without '/'
    or a non-string value (such as the NaN pandas uses for an empty csv cell)
    yields the placeholder "??" instead of raising.
    """
    run_name = row['Run Name']
    # empty spreadsheet cells arrive as float NaN, which does not support `in` / split
    if not isinstance(run_name, str) or '/' not in run_name:
        return "??"
    return run_name.split('/', 1)[0]

# df['owner of run'] = df.apply(get_owner, axis=1)

def get_branch_owner(row):
    """Return the owner prefix of a branch name, or "??" when there is none.

    The owner is the text before the first '/' in ``row['Branch']``,
    e.g. ``"rrp/hl/gemini-develop-overnight"`` -> ``"rrp"``. A branch without '/'
    or a non-string value (such as the NaN pandas uses for an empty csv cell)
    yields the placeholder "??" instead of raising.
    """
    branch = row['Branch']
    # empty spreadsheet cells arrive as float NaN, which has no string methods
    if not isinstance(branch, str):
        return "??"
    owner, separator, _ = branch.partition('/')
    # an empty separator means the branch contains no '/'
    return owner if separator else "??"

def get_project(row):
    """Build the project label "<run-name owner> | <branch owner>" for ``row``.

    Both parts come from ``get_owner`` and ``get_branch_owner`` respectively.
    """
    return f"{get_owner(row)} | {get_branch_owner(row)}"
    

df["project"] = df.apply(get_project, axis=1)

# Share of the total cost per project, largest first.
# The total is computed from df instead of being a hard-coded figure,
# so the shares still add up to 1 when the input csv or the cut-off date changes.
total_cost = df['cost'].sum()
cost_by_project = df.groupby("project")['cost'].sum()
df_sum = (cost_by_project / total_cost).reset_index().sort_values("cost", ascending=False)
df_sum

Unnamed: 0,project,cost
5,?? | zn,0.235714
12,rrp | hl,0.175177
2,?? | hl,0.152532
1,?? | Xuan,0.131596
13,rrp | rrp,0.087588
11,rrp | ??,0.064089
0,?? | ??,0.049776
10,inm | zn,0.036958
3,?? | rrp,0.030585
6,do | ??,0.025913
