# Import and preprocess data
## aws_historical_data_new

In [1]:
import pandas as pd
from pandasql import sqldf

pd.set_option('display.max_columns', 500)

def storage_type(row):
    if pd.isna(row['storage_size']):
        return 'EBS'
    elif row['storage_size'] == 0:
        return 'EBS'
    elif row['storage_nvme_ssd'] and not pd.isna(row['storage_nvme_ssd']):
        return 'NVMe'
    elif row['storage_ssd'] and not pd.isna(row['storage_ssd']):
        return 'SSD'
    else:
        return 'HDD'

# original also checks for instance type starts with a1
# but these instances are already filtered
def CPU_brand(row):
    if 'AMD' in row['physical_processor']:
        return 'AMD'
    elif 'Intel' in row['physical_processor']:
        return 'Intel'
    else:
        return '?'

def aws_data_historical_new_load():
    dates = pd.read_csv('../data/historical-data-times.csv')
    data = pd.read_csv('../data/historical-data-raw.csv', skipinitialspace=True)

    # translation of sql query where clause
    data.dropna(subset=['vCPU'], inplace=True)
    data.drop(data[data['generation'] != 'current'].index, inplace=True)
    data.drop(data[data['network_performance'].isin(['High', 'Moderate', 'Low', 'Very Low', 'Very High'])].index, inplace=True)
    data.drop(data[data['GPU'].notna() & data['GPU'] != 0].index, inplace=True)
    data.drop(data[data['FPGA'].notna() & data['FPGA'] != 0].index, inplace=True)
    data.drop(data[data['instance_type'].str.contains('^a1', regex=True)].index, inplace=True) # ARM
    data.drop(data[data['instance_type'].str.contains('.metal$', regex=True)].index, inplace=True)
    data.drop(data[data['instance_type'].str.contains('^t2', regex=True)].index, inplace=True) # burst
    data.drop(data[data['instance_type'].str.contains('^t3', regex=True)].index, inplace=True) # burst
    # translation of sql query join
    data = data.join(dates.set_index('entry'), on='entry', how='inner')

    # translation of sql query new fields
    data['storage_type'] = data.apply(storage_type, axis=1)
    # why are we doing this here if we're just dropping the columns later?
    # keep the columns anyway because then we don't need complicaetd regular expressions later
    split_instance_types = data['instance_type'].str.split('.', n=1, expand=True)
    split_instance_types = split_instance_types.rename(columns={0: 'instance_prefix', 1: 'instance_suffix'})
    data = pd.concat([data, split_instance_types], axis=1)
    data['physical_processor'] = data['physical_processor'].astype(str)
    data['CPU_brand'] = data.apply(CPU_brand, axis=1)

    # translation of transmute
    data['id'] = data['instance_type'].astype(str)
    data['vcpu_value_count'] = data['vCPU']
    data['memory_value_gib'] = data['memory']
    data['clockSpeed_value_ghz'] = pd.to_numeric(data['clock_speed_ghz'].str.replace('GHz', '', case=False))
    data['storage_sum_gib'] = data.apply(lambda row: row['storage_size'] * row['storage_devices'], axis=1)
    data['storage_sum_gib'] = data['storage_sum_gib'].fillna(0)
    data['storage_count'] = data['storage_devices'].fillna(value=0)
    # keep storage_type
    # keep network_performance
    data['network_performance_value_Gib'] = data['network_performance'].str.replace("Up to", "", case=False)
    data['network_performance_value_Gib'] = data['network_performance_value_Gib'].str.replace("Gigabit", "", case=False)
    data['network_performance_value_Gib'] = data['network_performance_value_Gib'].str.replace("Gbps", "", case=False)
    data['network_performance_value_Gib'] = data['network_performance_value_Gib'].str.replace("Gpbs", "", case=False)
    data['network_performance_value_Gib'] = pd.to_numeric(data['network_performance_value_Gib'])
    data['is_guaranteed'] = ~data["network_performance"].str.contains("Up to", case=False)
    data['cost_ondemand_value_usdph'] = data['pricing']
    data['processorName'] = data['physical_processor']
    data['clockSpeed_text'] = data['clock_speed_ghz']
    data['region_name'] = "us-east-1"
    data['join_entry'] = data['entry'] - 1
    data['join_time'] = data['time']
    
    data = data[['id', 'vcpu_value_count', 'memory_value_gib', 'clockSpeed_value_ghz', 'storage_sum_gib',
                 'storage_count', 'storage_type', 'network_performance', 'network_performance_value_Gib',
                 'is_guaranteed', 'cost_ondemand_value_usdph', 'processorName', 'clockSpeed_text', 'region_name',
                 'join_entry', 'join_time',
                 'instance_prefix', 'instance_suffix']] # added by me
    return data


aws_data_historical_new = aws_data_historical_new_load()
aws_data_historical_new

Unnamed: 0,id,vcpu_value_count,memory_value_gib,clockSpeed_value_ghz,storage_sum_gib,storage_count,storage_type,network_performance,network_performance_value_Gib,is_guaranteed,cost_ondemand_value_usdph,processorName,clockSpeed_text,region_name,join_entry,join_time,instance_prefix,instance_suffix
76,m4.10xlarge,40.0,160.0,,0.0,0.0,EBS,10 Gigabit,10,True,2.520,,,us-east-1,24,2015.59,m4,10xlarge
85,c4.8xlarge,36.0,60.0,,0.0,0.0,EBS,10 Gigabit,10,True,1.763,,,us-east-1,24,2015.59,c4,8xlarge
90,c3.8xlarge,32.0,60.0,,640.0,2.0,SSD,10 Gigabit,10,True,1.680,,,us-east-1,24,2015.59,c3,8xlarge
92,g2.8xlarge,32.0,60.0,,240.0,2.0,SSD,10 Gigabit,10,True,2.600,,,us-east-1,24,2015.59,g2,8xlarge
97,r3.8xlarge,32.0,244.0,,640.0,2.0,SSD,10 Gigabit,10,True,2.800,,,us-east-1,24,2015.59,r3,8xlarge
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8868,m5n.8xlarge,32.0,128.0,3.1,0.0,0.0,EBS,25 Gigabit,25,True,1.904,Intel Xeon Platinum 8259 (Cascade Lake),3.1 GHz,us-east-1,102,2020.42,m5n,8xlarge
8869,m5a.24xlarge,96.0,384.0,2.5,0.0,0.0,EBS,20 Gigabit,20,True,4.128,AMD EPYC 7571,2.5 GHz,us-east-1,102,2020.42,m5a,24xlarge
8891,m6g.xlarge,4.0,16.0,,0.0,0.0,EBS,Up to 10 Gigabit,10,False,0.154,AWS Graviton2 Processor,,us-east-1,102,2020.42,m6g,xlarge
8892,m6g.12xlarge,48.0,192.0,,0.0,0.0,EBS,20 Gigabit,20,True,1.848,AWS Graviton2 Processor,,us-east-1,102,2020.42,m6g,12xlarge


## aws_data_all

In [2]:
def aws_data_cleanup(data):
    data.loc[pd.isna(data['clock_ghz']), ['loading_comment', 'clock_ghz']] = ["Clock speed unkown, assuming default value of 2.5 GHz", 2.5]
    return data

def aws_data_normalize(data):
    # translation of transmute
    # keep id
    data['memory_Gib'] = data['memory_value_gib']
    data['vcpu_count'] = data['vcpu_value_count']
    data['clock_ghz'] = data['clockSpeed_value_ghz']
    data['storage_Gib'] = data['storage_sum_gib']
    # keep storage_count
    # keep storage_type
    data['network_Gbps'] = data['network_performance_value_Gib']
    data['network_is_steady'] = data['is_guaranteed']
    data['cost_usdph'] = data['cost_ondemand_value_usdph']
    data['meta_region_name'] = data['region_name']
    data['meta_join_entry'] = data['join_entry']
    data['meta_join_time'] = data['join_time']
    data['loading_comment'] = ""
    data = data[['id', 'memory_Gib', 'vcpu_count', 'clock_ghz', 'storage_Gib', 'storage_count', 'storage_type',
                'network_Gbps', 'network_is_steady', 'cost_usdph', 'meta_region_name', 'meta_join_entry',
                'meta_join_time', 'loading_comment',
                 'instance_prefix', 'instance_suffix']] # added by me
    # cleanup function
    data = aws_data_cleanup(data)
    # join  with commits
    commits = pd.read_csv('../data/ec2-instances.info-commit-mapping.csv')
    data = data.join(commits.set_index('join.entry'), on='meta_join_entry', how='inner')
    return data

def aws_data_enhance_ids(data):
    # use instance_prefix and instance_suffix columns instead of complicated regular expressions
    data['id_prefix'] = data['instance_prefix']
    # first numbers of instance_suffix.
    # originally only 1-9: 10 -> 1
    # doesn't make sense to me
    data['id_numstr'] = data['instance_suffix'].str.extract(r'^(\d+)', expand=False)
    data['id_number'] = pd.to_numeric(data['id_numstr'], errors='coerce').fillna(0)
    data = data.drop(columns=['instance_prefix', 'instance_suffix'])
    return data

def add_slice_info(data, largest):
    def f(row):
        largest_index = largest[row['meta_join_entry']][row['id_prefix']]
        largest_row = data.loc[largest_index]
        # id_slice
        if not row['id_number'] == 0:
            row['id_slice'] = row['id_number']
        elif 'metal' in row['id']:
            row['id_slice'] = largest_row['id_number']
        elif 'xlarge' in row['id']:
            row['id_slice'] = 1
        else:
            row['id_slice'] = 0.5

        # id_slice_factor
        if (largest_row['id_number'] == 0):
            row['id_slice_factor'] = 1
        else:
            row['id_slice_factor'] = row['id_slice'] / largest_row['id_number']  

        row['id_slice_of'] = largest_index
        row['id_slice_net'] = largest_row['network_Gbps'] * row['id_slice_factor']
        row['id_slice_sto'] = largest_row['storage_count']
        return(row)
    return f

def aws_data_with_prefixes(data):
    grouped_data = data.groupby(['meta_join_entry', 'id_prefix'])
    # find largest per group
    largest = grouped_data['id_number'].idxmax()
    data = data.apply(add_slice_info(data, largest), axis=1)
    return data

def aws_data_all_transform(data):
    data = aws_data_normalize(data)
    data = aws_data_enhance_ids(data)
    data = aws_data_with_prefixes(data)
    data['meta_origin'] = 'instances.json'
    return data
    
aws_data_all = aws_data_all_transform(aws_data_historical_new)
aws_data_all

Unnamed: 0,id,memory_Gib,vcpu_count,clock_ghz,storage_Gib,storage_count,storage_type,network_Gbps,network_is_steady,cost_usdph,meta_region_name,meta_join_entry,meta_join_time,loading_comment,commit.hash,commit.date,commit.msg,id_prefix,id_numstr,id_number,id_slice,id_slice_factor,id_slice_of,id_slice_net,id_slice_sto,meta_origin
76,m4.10xlarge,160.0,40.0,2.5,0.0,0.0,EBS,10,True,2.520,us-east-1,24,2015.59,"Clock speed unkown, assuming default value of ...",310ac56,2015-08-02,Pull in and parse reserved instance pricing data,m4,10,10.0,10.0,1.000000,76,10.000000,0.0,instances.json
85,c4.8xlarge,60.0,36.0,2.5,0.0,0.0,EBS,10,True,1.763,us-east-1,24,2015.59,"Clock speed unkown, assuming default value of ...",310ac56,2015-08-02,Pull in and parse reserved instance pricing data,c4,8,8.0,8.0,1.000000,85,10.000000,0.0,instances.json
90,c3.8xlarge,60.0,32.0,2.5,640.0,2.0,SSD,10,True,1.680,us-east-1,24,2015.59,"Clock speed unkown, assuming default value of ...",310ac56,2015-08-02,Pull in and parse reserved instance pricing data,c3,8,8.0,8.0,1.000000,90,10.000000,2.0,instances.json
92,g2.8xlarge,60.0,32.0,2.5,240.0,2.0,SSD,10,True,2.600,us-east-1,24,2015.59,"Clock speed unkown, assuming default value of ...",310ac56,2015-08-02,Pull in and parse reserved instance pricing data,g2,8,8.0,8.0,1.000000,92,10.000000,2.0,instances.json
97,r3.8xlarge,244.0,32.0,2.5,640.0,2.0,SSD,10,True,2.800,us-east-1,24,2015.59,"Clock speed unkown, assuming default value of ...",310ac56,2015-08-02,Pull in and parse reserved instance pricing data,r3,8,8.0,8.0,1.000000,97,10.000000,2.0,instances.json
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8868,m5n.8xlarge,128.0,32.0,3.1,0.0,0.0,EBS,25,True,1.904,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,m5n,8,8.0,8.0,0.333333,8767,33.333333,0.0,instances.json
8869,m5a.24xlarge,384.0,96.0,2.5,0.0,0.0,EBS,20,True,4.128,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,m5a,24,24.0,24.0,1.000000,8869,20.000000,0.0,instances.json
8891,m6g.xlarge,16.0,4.0,2.5,0.0,0.0,EBS,10,False,0.154,us-east-1,102,2020.42,"Clock speed unkown, assuming default value of ...",70b380e,2020-06-02,Rebuild,m6g,,0.0,1.0,0.062500,8742,1.562500,0.0,instances.json
8892,m6g.12xlarge,192.0,48.0,2.5,0.0,0.0,EBS,20,True,1.848,us-east-1,102,2020.42,"Clock speed unkown, assuming default value of ...",70b380e,2020-06-02,Rebuild,m6g,12,12.0,12.0,0.750000,8742,18.750000,0.0,instances.json


## aws_data_all_by_date

In [3]:
def aws_data_all_by_date_transform(data):
    data['meta_group'] = data['commit.date'].astype(str) + ' | ' + data['meta_join_entry'].astype(str) + \
                            ' | ' + data['meta_origin']
    data.sort_values(by=['meta_join_entry', 'meta_origin'], ascending=False, inplace=True)
    return data

aws_data_all_by_date = aws_data_all_by_date_transform(aws_data_all)
aws_data_all_by_date

Unnamed: 0,id,memory_Gib,vcpu_count,clock_ghz,storage_Gib,storage_count,storage_type,network_Gbps,network_is_steady,cost_usdph,meta_region_name,meta_join_entry,meta_join_time,loading_comment,commit.hash,commit.date,commit.msg,id_prefix,id_numstr,id_number,id_slice,id_slice_factor,id_slice_of,id_slice_net,id_slice_sto,meta_origin,meta_group
8620,c5d.xlarge,8.0,4.0,3.0,100.0,1.0,NVMe,10,False,0.192,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,c5d,,0.0,1.0,0.041667,8698,1.041667,4.0,instances.json,2020-06-02 | 102 | instances.json
8621,m5dn.xlarge,16.0,4.0,3.1,150.0,1.0,NVMe,25,False,0.272,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,m5dn,,0.0,1.0,0.041667,8728,4.166667,4.0,instances.json,2020-06-02 | 102 | instances.json
8623,m5a.2xlarge,32.0,8.0,2.5,0.0,0.0,EBS,10,False,0.344,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,m5a,2,2.0,2.0,0.083333,8869,1.666667,0.0,instances.json,2020-06-02 | 102 | instances.json
8624,r5n.12xlarge,384.0,48.0,2.5,0.0,0.0,EBS,50,True,3.576,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,r5n,12,12.0,12.0,0.500000,8646,50.000000,0.0,instances.json,2020-06-02 | 102 | instances.json
8625,c5.9xlarge,72.0,36.0,3.0,0.0,0.0,EBS,10,True,1.530,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,c5,9,9.0,9.0,0.375000,8635,9.375000,0.0,instances.json,2020-06-02 | 102 | instances.json
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,c3.8xlarge,60.0,32.0,2.5,640.0,2.0,SSD,10,True,1.680,us-east-1,24,2015.59,"Clock speed unkown, assuming default value of ...",310ac56,2015-08-02,Pull in and parse reserved instance pricing data,c3,8,8.0,8.0,1.000000,90,10.000000,2.0,instances.json,2015-08-02 | 24 | instances.json
92,g2.8xlarge,60.0,32.0,2.5,240.0,2.0,SSD,10,True,2.600,us-east-1,24,2015.59,"Clock speed unkown, assuming default value of ...",310ac56,2015-08-02,Pull in and parse reserved instance pricing data,g2,8,8.0,8.0,1.000000,92,10.000000,2.0,instances.json,2015-08-02 | 24 | instances.json
97,r3.8xlarge,244.0,32.0,2.5,640.0,2.0,SSD,10,True,2.800,us-east-1,24,2015.59,"Clock speed unkown, assuming default value of ...",310ac56,2015-08-02,Pull in and parse reserved instance pricing data,r3,8,8.0,8.0,1.000000,97,10.000000,2.0,instances.json,2015-08-02 | 24 | instances.json
101,i2.8xlarge,244.0,32.0,2.5,6400.0,8.0,SSD,10,True,6.820,us-east-1,24,2015.59,"Clock speed unkown, assuming default value of ...",310ac56,2015-08-02,Pull in and parse reserved instance pricing data,i2,8,8.0,8.0,1.000000,101,10.000000,8.0,instances.json,2015-08-02 | 24 | instances.json


## ui_instance_sets

In [4]:
def ui_instance_sets_transform(data):
    # do something
    return data
# not really needed right now. we can get instSet.all directly from aws_data_all_by_date
#ui_instance_sets = ui_instance_sets_transform(aws_data_all_by_date)
#ui_instance_sets

## instSet_all

In [5]:
input_instanceSet = '2020-06-02 | 102 | instances.json'

def instSet_all_transform(data):
    grouped_data = data.groupby(['meta_group']) 
    data = grouped_data.get_group(input_instanceSet)
    return data

instSet_all = instSet_all_transform(aws_data_all_by_date)
instSet_all

Unnamed: 0,id,memory_Gib,vcpu_count,clock_ghz,storage_Gib,storage_count,storage_type,network_Gbps,network_is_steady,cost_usdph,meta_region_name,meta_join_entry,meta_join_time,loading_comment,commit.hash,commit.date,commit.msg,id_prefix,id_numstr,id_number,id_slice,id_slice_factor,id_slice_of,id_slice_net,id_slice_sto,meta_origin,meta_group
8620,c5d.xlarge,8.0,4.0,3.0,100.0,1.0,NVMe,10,False,0.192,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,c5d,,0.0,1.0,0.041667,8698,1.041667,4.0,instances.json,2020-06-02 | 102 | instances.json
8621,m5dn.xlarge,16.0,4.0,3.1,150.0,1.0,NVMe,25,False,0.272,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,m5dn,,0.0,1.0,0.041667,8728,4.166667,4.0,instances.json,2020-06-02 | 102 | instances.json
8623,m5a.2xlarge,32.0,8.0,2.5,0.0,0.0,EBS,10,False,0.344,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,m5a,2,2.0,2.0,0.083333,8869,1.666667,0.0,instances.json,2020-06-02 | 102 | instances.json
8624,r5n.12xlarge,384.0,48.0,2.5,0.0,0.0,EBS,50,True,3.576,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,r5n,12,12.0,12.0,0.500000,8646,50.000000,0.0,instances.json,2020-06-02 | 102 | instances.json
8625,c5.9xlarge,72.0,36.0,3.0,0.0,0.0,EBS,10,True,1.530,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,c5,9,9.0,9.0,0.375000,8635,9.375000,0.0,instances.json,2020-06-02 | 102 | instances.json
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8868,m5n.8xlarge,128.0,32.0,3.1,0.0,0.0,EBS,25,True,1.904,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,m5n,8,8.0,8.0,0.333333,8767,33.333333,0.0,instances.json,2020-06-02 | 102 | instances.json
8869,m5a.24xlarge,384.0,96.0,2.5,0.0,0.0,EBS,20,True,4.128,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,m5a,24,24.0,24.0,1.000000,8869,20.000000,0.0,instances.json,2020-06-02 | 102 | instances.json
8891,m6g.xlarge,16.0,4.0,2.5,0.0,0.0,EBS,10,False,0.154,us-east-1,102,2020.42,"Clock speed unkown, assuming default value of ...",70b380e,2020-06-02,Rebuild,m6g,,0.0,1.0,0.062500,8742,1.562500,0.0,instances.json,2020-06-02 | 102 | instances.json
8892,m6g.12xlarge,192.0,48.0,2.5,0.0,0.0,EBS,20,True,1.848,us-east-1,102,2020.42,"Clock speed unkown, assuming default value of ...",70b380e,2020-06-02,Rebuild,m6g,12,12.0,12.0,0.750000,8742,18.750000,0.0,instances.json,2020-06-02 | 102 | instances.json


## instSet_long

In [6]:
paper_inst_ids = ["c5n.18xlarge", "c5.24xlarge", "z1d.12xlarge", "c5d.24xlarge",
 "m5.24xlarge", "i3.16xlarge", "m5d.24xlarge", "m5n.24xlarge",
 "r5.24xlarge", "m5dn.24xlarge", "r5d.24xlarge", "r5n.24xlarge",
 "r5dn.24xlarge", "i3en.24xlarge", "x1e.32xlarge"]

def aws_data_filter_paper(data):
    return data.loc[data['id'].isin(paper_inst_ids)]

# only implemented default filter
instanceFilter = [aws_data_filter_paper]

def calc_net_speed(row):
    return (row['network_Gbps'] if row['network_is_steady'] else row['id_slice_net']) / 8

model_factors_bandwidth = {
    'RAM': 50,
    'NVMe': 2,
    'SSD': 0.5,
    'HDD': 0.25,
    'EBS': float('nan')
}

def model_calc_storage_speed(data):
    bws = data['storage_type'].map(model_factors_bandwidth) * data['id_slice_sto'] * data['id_slice_factor']
    return bws.fillna(data['calc_net_speed'])

def model_with_speeds(data):
    data['calc_net_speed'] = data.apply(calc_net_speed, axis=1)
    data['calc_s3_speed'] = data['calc_net_speed'] * 0.8
    data['calc_mem_speed'] = model_factors_bandwidth['RAM']
    data['calc_sto_speed'] = model_calc_storage_speed(data)
    ## no hyperthreads, assume 2 threads/core
    data['calc_cpu_real'] = data['vcpu_count'] / 2
    data['calc_mem_caching'] = data['memory_Gib'] / 2
    data['calc_sto_caching'] = data['storage_Gib'] / 2
    data['calc_mem_spooling'] = data['memory_Gib'] - data['calc_mem_caching']
    data['calc_sto_spooling'] = data['storage_Gib'] - data['calc_sto_caching']
    return data

def instSet_long_transform(data):
    for function in instanceFilter:
        data = function(data)
    data = data.dropna(axis=0)
    data = model_with_speeds(data)
    return data

instSet_long = instSet_long_transform(instSet_all)
instSet_long

Unnamed: 0,id,memory_Gib,vcpu_count,clock_ghz,storage_Gib,storage_count,storage_type,network_Gbps,network_is_steady,cost_usdph,meta_region_name,meta_join_entry,meta_join_time,loading_comment,commit.hash,commit.date,commit.msg,id_prefix,id_numstr,id_number,id_slice,id_slice_factor,id_slice_of,id_slice_net,id_slice_sto,meta_origin,meta_group,calc_net_speed,calc_s3_speed,calc_mem_speed,calc_sto_speed,calc_cpu_real,calc_mem_caching,calc_sto_caching,calc_mem_spooling,calc_sto_spooling
8627,m5.24xlarge,384.0,96.0,3.1,0.0,0.0,EBS,25,True,4.608,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,m5,24,24.0,24.0,1.0,8627,25.0,0.0,instances.json,2020-06-02 | 102 | instances.json,3.125,2.5,50,3.125,48.0,192.0,0.0,192.0,0.0
8635,c5.24xlarge,192.0,96.0,3.0,0.0,0.0,EBS,25,True,4.08,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,c5,24,24.0,24.0,1.0,8635,25.0,0.0,instances.json,2020-06-02 | 102 | instances.json,3.125,2.5,50,3.125,48.0,96.0,0.0,96.0,0.0
8646,r5n.24xlarge,768.0,96.0,2.5,0.0,0.0,EBS,100,True,7.152,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,r5n,24,24.0,24.0,1.0,8646,100.0,0.0,instances.json,2020-06-02 | 102 | instances.json,12.5,10.0,50,12.5,48.0,384.0,0.0,384.0,0.0
8656,r5d.24xlarge,768.0,96.0,3.1,3600.0,4.0,NVMe,25,True,6.912,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,r5d,24,24.0,24.0,1.0,8656,25.0,4.0,instances.json,2020-06-02 | 102 | instances.json,3.125,2.5,50,8.0,48.0,384.0,1800.0,384.0,1800.0
8673,i3en.24xlarge,768.0,96.0,3.1,60000.0,8.0,NVMe,100,True,10.848,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,i3en,24,24.0,24.0,1.0,8673,100.0,8.0,instances.json,2020-06-02 | 102 | instances.json,12.5,10.0,50,16.0,48.0,384.0,30000.0,384.0,30000.0
8674,c5n.18xlarge,192.0,72.0,3.0,0.0,0.0,EBS,100,True,3.888,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,c5n,18,18.0,18.0,1.0,8674,100.0,0.0,instances.json,2020-06-02 | 102 | instances.json,12.5,10.0,50,12.5,36.0,96.0,0.0,96.0,0.0
8683,m5d.24xlarge,384.0,96.0,3.1,3600.0,4.0,NVMe,25,True,5.424,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,m5d,24,24.0,24.0,1.0,8683,25.0,4.0,instances.json,2020-06-02 | 102 | instances.json,3.125,2.5,50,8.0,48.0,192.0,1800.0,192.0,1800.0
8698,c5d.24xlarge,192.0,96.0,3.0,3600.0,4.0,NVMe,25,True,4.608,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,c5d,24,24.0,24.0,1.0,8698,25.0,4.0,instances.json,2020-06-02 | 102 | instances.json,3.125,2.5,50,8.0,48.0,96.0,1800.0,96.0,1800.0
8712,r5.24xlarge,768.0,96.0,3.1,0.0,0.0,EBS,25,True,6.048,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,r5,24,24.0,24.0,1.0,8712,25.0,0.0,instances.json,2020-06-02 | 102 | instances.json,3.125,2.5,50,3.125,48.0,384.0,0.0,384.0,0.0
8717,x1e.32xlarge,3904.0,128.0,2.3,3840.0,2.0,SSD,25,True,26.688,us-east-1,102,2020.42,,70b380e,2020-06-02,Rebuild,x1e,32,32.0,32.0,1.0,8717,25.0,2.0,instances.json,2020-06-02 | 102 | instances.json,3.125,2.5,50,1.0,64.0,1952.0,1920.0,1952.0,1920.0


## instSet

In [7]:
def instSet_transform(data):
    data.drop(list(data.filter(regex = '^meta')), axis = 1, inplace = True)
    data.drop(list(data.filter(regex = '^commit')), axis = 1, inplace = True)
    return data

instSet = instSet_transform(instSet_long)
instSet

Unnamed: 0,id,memory_Gib,vcpu_count,clock_ghz,storage_Gib,storage_count,storage_type,network_Gbps,network_is_steady,cost_usdph,loading_comment,id_prefix,id_numstr,id_number,id_slice,id_slice_factor,id_slice_of,id_slice_net,id_slice_sto,calc_net_speed,calc_s3_speed,calc_mem_speed,calc_sto_speed,calc_cpu_real,calc_mem_caching,calc_sto_caching,calc_mem_spooling,calc_sto_spooling
8627,m5.24xlarge,384.0,96.0,3.1,0.0,0.0,EBS,25,True,4.608,,m5,24,24.0,24.0,1.0,8627,25.0,0.0,3.125,2.5,50,3.125,48.0,192.0,0.0,192.0,0.0
8635,c5.24xlarge,192.0,96.0,3.0,0.0,0.0,EBS,25,True,4.08,,c5,24,24.0,24.0,1.0,8635,25.0,0.0,3.125,2.5,50,3.125,48.0,96.0,0.0,96.0,0.0
8646,r5n.24xlarge,768.0,96.0,2.5,0.0,0.0,EBS,100,True,7.152,,r5n,24,24.0,24.0,1.0,8646,100.0,0.0,12.5,10.0,50,12.5,48.0,384.0,0.0,384.0,0.0
8656,r5d.24xlarge,768.0,96.0,3.1,3600.0,4.0,NVMe,25,True,6.912,,r5d,24,24.0,24.0,1.0,8656,25.0,4.0,3.125,2.5,50,8.0,48.0,384.0,1800.0,384.0,1800.0
8673,i3en.24xlarge,768.0,96.0,3.1,60000.0,8.0,NVMe,100,True,10.848,,i3en,24,24.0,24.0,1.0,8673,100.0,8.0,12.5,10.0,50,16.0,48.0,384.0,30000.0,384.0,30000.0
8674,c5n.18xlarge,192.0,72.0,3.0,0.0,0.0,EBS,100,True,3.888,,c5n,18,18.0,18.0,1.0,8674,100.0,0.0,12.5,10.0,50,12.5,36.0,96.0,0.0,96.0,0.0
8683,m5d.24xlarge,384.0,96.0,3.1,3600.0,4.0,NVMe,25,True,5.424,,m5d,24,24.0,24.0,1.0,8683,25.0,4.0,3.125,2.5,50,8.0,48.0,192.0,1800.0,192.0,1800.0
8698,c5d.24xlarge,192.0,96.0,3.0,3600.0,4.0,NVMe,25,True,4.608,,c5d,24,24.0,24.0,1.0,8698,25.0,4.0,3.125,2.5,50,8.0,48.0,96.0,1800.0,96.0,1800.0
8712,r5.24xlarge,768.0,96.0,3.1,0.0,0.0,EBS,25,True,6.048,,r5,24,24.0,24.0,1.0,8712,25.0,0.0,3.125,2.5,50,3.125,48.0,384.0,0.0,384.0,0.0
8717,x1e.32xlarge,3904.0,128.0,2.3,3840.0,2.0,SSD,25,True,26.688,,x1e,32,32.0,32.0,1.0,8717,25.0,2.0,3.125,2.5,50,1.0,64.0,1952.0,1920.0,1952.0,1920.0
