In [2]:
# import relevant libraries 
import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats
from scipy.interpolate import interp1d



In [3]:
# use original ec2 file
path = "../raw/AWS EC2 Carbon Footprint Dataset - EC2 Instances Dataset.csv"
df = pd.read_csv(path, decimal=',')

# create and save new csv with relevant data 
d = {'instance': df["Instance type"], 'scope3hourly': df["Instance Hourly Manufacturing Emissions (gCO₂eq)"], '0%': df["Instance @ Idle"],
     '10%': df["Instance @ 10%"], '50': df["Instance @ 50%"], '100%': df["Instance @ 100%"]}
slimdf = pd.DataFrame(data=d)
print(slimdf)
dfpath = Path(f"../processed/teads/ec2_instances.csv")
slimdf.to_csv(dfpath, index=False)


          instance  scope3hourly    0%   10%    50  100%
0        a1.medium           1.8   1.2   1.9   3.2   4.2
1         a1.large           3.7   2.4   3.8   6.4   8.5
2        a1.xlarge           7.4   4.8   7.6  12.7  17.0
3       a1.2xlarge          14.8   9.5  15.2  25.4  34.0
4       a1.4xlarge          29.6  19.0  30.3  50.9  67.9
..             ...           ...   ...   ...   ...   ...
616    db.t2.small           0.9   2.0   3.3   5.3   7.0
617   db.t2.medium           1.8   4.0   6.6  10.7  14.1
618    db.t2.large           1.8   4.8   7.8  12.3  16.5
619   db.t2.xlarge           3.6   9.6  15.7  24.6  33.0
620  db.t2.2xlarge           7.1  19.2  31.4  49.1  66.0

[621 rows x 6 columns]


In [5]:
'''save a power curve to a file for each instance type including error metrics'''

x = [0, 10, 50, 100]
y=slimdf.values[0][1:]
print(y[-1])
#max_rating=slimdf.values[-1]
#print(y)
#slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
# check that data is present and can be indexed
dfs=[]
for index, row in slimdf.iterrows():
    y = [list(row[2:].values)]
    slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)

    dfs.append(pd.DataFrame(data={'instance_type':[row[0]], 'scope3_hourly':[row[1]],'slope':[slope],'intercept':[intercept],'error':[std_err],'r_value':[r_value],'p_value':[p_value],'max_power':[row[-1]]},index=[row[0]]))
all=pd.concat(dfs)

# result implies that the power curves are not flat i.e. there is something to be gained from a granular utilisation
print("proportion of values with P <0.02: ",len(all[all['p_value']<0.05])/len(all))

# the power curves are fit well by linregress, and that non-linear methods is likely unecessary
print("proportion of values with R >0.95: ",len(all[all['r_value']>0.95])/len(all))

# most power curves have a low average deviation from the plotted data
# however there are a few with pretty high error and these should be investigated 
print("proportion of curves with error <2: ",len(all[all['error']<2])/len(all))
print("proportion of curves with error <1: ",len(all[all['error']<1])/len(all))
print("proportion of curves with error <0.1: ",len(all[all['error']<0.1])/len(all))
dfpath = Path(f"../processed/teads/instancelines.csv")
all.to_csv(dfpath, index=False)


4.2
proportion of values with P <0.02:  1.0
proportion of values with R >0.95:  1.0
proportion of curves with error <2:  0.9790660225442834
proportion of curves with error <1:  0.9597423510466989
proportion of curves with error <0.1:  0.5104669887278583


In [66]:
f = interp1d(x, y)

f2 = interp1d(x, y, kind='cubic')
print(f(x))



[[19.2 31.4 49.1 66. ]]
