In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format='retina'

mpl.rcParams['savefig.dpi'] = 300
mpl.rcParams['font.size'] = 12

plt.style.use('tableau-colorblind10')

In [2]:
df_reading = pd.read_csv("logs/power_reading_glue.csv")

In [3]:
df_reading['last_changed'] = pd.to_datetime(df_reading['last_changed'], errors='coerce')
df_reading.dropna(subset=['last_changed'], inplace=True)
df_reading.set_index('last_changed', inplace=True)

In [4]:
df_time_interval = pd.read_csv("logs/inferenceTextClass.csv")

In [5]:
df_time_interval['start_seconds']= [datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S.%f") - timedelta(hours=1, minutes=0)
                                  for start_time in df_time_interval.start_time]
df_time_interval['end_seconds'] = [datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S.%f") - timedelta(hours=1, minutes=0)
                                for end_time in df_time_interval.end_time]
df_time_interval['total_time'] = df_time_interval['end_seconds'] - df_time_interval['start_seconds']
df_time_interval['total_time'] = df_time_interval['total_time'].apply(lambda x: x.total_seconds())
df_time_interval.head()

Unnamed: 0,model,start_time,end_time,start_seconds,end_seconds,total_time
0,MRPC,2020-06-29 23:41:36.427207,2020-06-29 23:42:31.367912,2020-06-29 22:41:36.427207,2020-06-29 22:42:31.367912,54.940705
1,STS-B,2020-06-29 23:43:50.022453,2020-06-29 23:44:45.388476,2020-06-29 22:43:50.022453,2020-06-29 22:44:45.388476,55.366023
2,CoLA,2020-06-29 23:45:01.687218,2020-06-29 23:45:48.487155,2020-06-29 22:45:01.687218,2020-06-29 22:45:48.487155,46.799937


In [6]:
df_time_interval['s_date'] = [x.date().strftime("%Y-%m-%d") for x in df_time_interval["start_seconds"]]
df_time_interval['e_date'] = [x.date().strftime("%Y-%m-%d") for x in df_time_interval["end_seconds"]]
df_time_interval['s_time'] = [x.time().strftime("%H:%M:%S") for x in df_time_interval["start_seconds"]]
df_time_interval['e_time'] = [x.time().strftime("%H:%M:%S") for x in df_time_interval["end_seconds"]]
df_time_interval.head()

Unnamed: 0,model,start_time,end_time,start_seconds,end_seconds,total_time,s_date,e_date,s_time,e_time
0,MRPC,2020-06-29 23:41:36.427207,2020-06-29 23:42:31.367912,2020-06-29 22:41:36.427207,2020-06-29 22:42:31.367912,54.940705,2020-06-29,2020-06-29,22:41:36,22:42:31
1,STS-B,2020-06-29 23:43:50.022453,2020-06-29 23:44:45.388476,2020-06-29 22:43:50.022453,2020-06-29 22:44:45.388476,55.366023,2020-06-29,2020-06-29,22:43:50,22:44:45
2,CoLA,2020-06-29 23:45:01.687218,2020-06-29 23:45:48.487155,2020-06-29 22:45:01.687218,2020-06-29 22:45:48.487155,46.799937,2020-06-29,2020-06-29,22:45:01,22:45:48


In [7]:
def power_mean(s_date,e_date,s_time,e_time):
    subset_df = df_reading.loc[s_date:e_date].between_time(s_time, e_time)[['power_consumption']]
    return subset_df['power_consumption'].mean()

In [8]:
for index, row in df_time_interval.iterrows(): 
    df_time_interval.at[index,'mean_power'] = power_mean(row['s_date'],row['e_date'],
                                            row['s_time'],row['e_time'])

In [9]:
df_time_interval['power'] = df_time_interval['mean_power'] * df_time_interval['total_time']
df_time_interval.head()

Unnamed: 0,model,start_time,end_time,start_seconds,end_seconds,total_time,s_date,e_date,s_time,e_time,mean_power,power
0,MRPC,2020-06-29 23:41:36.427207,2020-06-29 23:42:31.367912,2020-06-29 22:41:36.427207,2020-06-29 22:42:31.367912,54.940705,2020-06-29,2020-06-29,22:41:36,22:42:31,135.240667,7430.217571
1,STS-B,2020-06-29 23:43:50.022453,2020-06-29 23:44:45.388476,2020-06-29 22:43:50.022453,2020-06-29 22:44:45.388476,55.366023,2020-06-29,2020-06-29,22:43:50,22:44:45,134.521,7447.89278
2,CoLA,2020-06-29 23:45:01.687218,2020-06-29 23:45:48.487155,2020-06-29 22:45:01.687218,2020-06-29 22:45:48.487155,46.799937,2020-06-29,2020-06-29,22:45:01,22:45:48,131.045187,6132.906519


In [11]:
df_time_interval['power (kWh)'] = df_time_interval['mean_power'] * df_time_interval['total_time'] / (1000 * 3600)
df_time_interval["datacenter_power"] = df_time_interval['power (kWh)'] * 1.58

df_co2 = pd.read_csv("results/co2data.csv")
uk_co2 = df_co2.loc[df_co2.country == 'United Kingdom'].impact.mean()

df_time_interval['CO2e'] = uk_co2 * df_time_interval['datacenter_power']
df_time_interval['kg CO2e/kWh'] = df_time_interval['CO2e'] /1000

In [12]:
features = ['model','power','power (kWh)','datacenter_power','kg CO2e/kWh']
df_time_interval[features]

Unnamed: 0,model,power,power (kWh),datacenter_power,kg CO2e/kWh
0,MRPC,7430.217571,0.002064,0.003261,0.002032
1,STS-B,7447.89278,0.002069,0.003269,0.002036
2,CoLA,6132.906519,0.001704,0.002692,0.001677


In [2]:
def annotate_pm_data(filename):
    df_reading = pd.read_csv(filename)
    df_reading['last_changed'] = pd.to_datetime(df_reading['last_changed'], errors='coerce')
    df_reading.dropna(subset=['last_changed'], inplace=True)
    df_reading.set_index('last_changed', inplace=True)
    return df_reading

In [32]:
def mean_power(df, s_date,e_date,s_time,e_time):
    subset_df = df.loc[s_date:e_date].between_time(s_time, e_time)[['power_consumption']]
    return subset_df['power_consumption'].mean()

In [33]:
def annotate_time_data(df_time_interval, df_reading):
    df_time_interval['start_seconds']= [datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S.%f") - timedelta(hours=1, minutes=0)
                                  for start_time in df_time_interval.start_time]
    df_time_interval['end_seconds'] = [datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S.%f") - timedelta(hours=1, minutes=0)
                                    for end_time in df_time_interval.end_time]
    df_time_interval['total_time'] = df_time_interval['end_seconds'] - df_time_interval['start_seconds']
    df_time_interval['total_time'] = df_time_interval['total_time'].apply(lambda x: x.total_seconds())
    #df_time_interval.head()
    df_time_interval['s_date'] = [x.date().strftime("%Y-%m-%d") for x in df_time_interval["start_seconds"]]
    df_time_interval['e_date'] = [x.date().strftime("%Y-%m-%d") for x in df_time_interval["end_seconds"]]
    df_time_interval['s_time'] = [x.time().strftime("%H:%M:%S") for x in df_time_interval["start_seconds"]]
    df_time_interval['e_time'] = [x.time().strftime("%H:%M:%S") for x in df_time_interval["end_seconds"]]
    #df_time_interval.head()
    for index, row in df_time_interval.iterrows(): 
        df_time_interval.at[index,'mean_power'] = mean_power(df_reading,row['s_date'],row['e_date'],
                                            row['s_time'],row['e_time'])
    df_time_interval['power'] = df_time_interval['mean_power'] * df_time_interval['total_time']
    #df_time_interval.head()
    
    df_time_interval['power (kWh)'] = df_time_interval['mean_power'] * df_time_interval['total_time'] / (1000 * 3600)
    df_time_interval["datacenter_power"] = df_time_interval['power (kWh)'] * 1.58

    df_co2 = pd.read_csv("results/co2data.csv")
    uk_co2 = df_co2.loc[df_co2.country == 'United Kingdom'].impact.mean()

    df_time_interval['CO2e'] = uk_co2 * df_time_interval['datacenter_power']
    df_time_interval['kg CO2e/kWh'] = df_time_interval['CO2e'] /1000
    return df_time_interval

In [52]:
df_squad = annotate_pm_data("results/power_reading_pm_squad.csv")

In [53]:
temp = pd.read_csv("logs/inferenceTextClass.csv")

In [54]:
df = temp.loc[temp['model'] == 'bert_squad']

In [55]:
df_new = annotate_time_data(df,df_squad)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inste

In [56]:
df_new

Unnamed: 0,model,start_time,end_time,start_seconds,end_seconds,total_time,s_date,e_date,s_time,e_time,mean_power,power,power (kWh),datacenter_power,CO2e,kg CO2e/kWh
3,bert_squad,2020-07-08 15:59:19.952964,2020-07-08 15:59:37.163396,2020-07-08 14:59:19.952964,2020-07-08 14:59:37.163396,17.210432,2020-07-08,2020-07-08,14:59:19,14:59:37,,,,,,


In [58]:
df_squad.loc['2020-06-01':'2020-07-07'].between_time('20:20:19', '20:59:37')[['power_consumption']]

Unnamed: 0_level_0,power_consumption
last_changed,Unnamed: 1_level_1
2020-06-01 20:20:20.814459+00:00,0.000
2020-06-01 20:20:23.818895+00:00,0.000
2020-06-01 20:20:26.818268+00:00,0.000
2020-06-01 20:20:29.816911+00:00,0.000
2020-06-01 20:20:32.812419+00:00,0.000
...,...
2020-06-28 20:59:24.705729+00:00,83.234
2020-06-28 20:59:27.707251+00:00,83.163
2020-06-28 20:59:30.703599+00:00,84.165
2020-06-28 20:59:33.705963+00:00,83.032
