In [31]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
from tabulate import tabulate

In [32]:
# Constants: the dimensions of the experiment grid.
# The short_* lists hold the codes used in result file paths and as dictionary
# keys; each is written in the same order as its long-form counterpart.
regions = [
    'Great Britain',
    'Germany',
    'California',
    'Texas',
    'South Africa',
    'Tokyo',
    'New South Wales',
]
short_regions = [
    'gb',
    'de',
    'ca',
    'tx',
    'zaf',
    'tyo',
    'nsw',
]
months = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
]
short_months = [
    'jan',
    'feb',
    'mar',
    'apr',
    'may',
    'jun',
    'jul',
    'aug',
    'sep',
    'oct',
    'nov',
    'dec',
]
# Workflow tasks, machines and CPU frequency governors that were measured.
tasks = [
    'bowtie2_build',
    'fastp',
    'fastqc',
    'trimgalore',
]
machines = [
    'gpg13',
    'gpg14',
    'gpg15',
    'gpg22',
    'hu26',
]
governors = [
    'performance',
    'powersave',
]

In [33]:
# Constants
# Field labels used as dictionary keys for values parsed from the summary
# files: get_data() keys every value by the label text found before the ':'
# on a line, so these strings must match the labels in those files exactly.
# Not every label below is referenced by the cells visible in this notebook.

# Energy figures (get_data() strips a 'kWh' suffix from parsed values).
ENERGY_CONSUMPTION = "Energy Consumption (exc. PUE)"
ENERGY_CONSUMPTION_PUE = "Energy Consumption (inc. PUE)"
MEMORY_CONSUMPTION = "Memory Energy Consumption (exc. PUE)"
MEMORY_CONSUMPTION_PUE = "Memory Energy Consumption (inc. PUE)"
# Emission figures (get_data() strips a 'gCO2e' suffix from parsed values).
CARBON_EMISSIONS = "Operational Carbon Emissions"
EMBODIED_CARBON_EMISSIONS = "Embodied Carbon Emissions"
TOTAL_CARBON_EMISSIONS = "Total Carbon Emissions"
# Reserved-memory figures; units are not shown here -- TODO confirm.
RES_MEM_ENERGY_CONSUMPTION = "Reserved Memory Energy Consumption"
RES_MEM_CARBON_EMISSIONS = "Reserved Memory Carbon Emissions"

In [34]:
# Parse Key Details from Summary Files
def get_data(lines):
    """Parse ``<prefix><label>: <value>`` lines into a ``{label: value}`` dict.

    Each line is stripped, split at its FIRST ':' and the first two
    characters of the label part are dropped (presumably a bullet prefix
    such as "- " -- confirm against the summary file format). The unit
    markers 'kWh' and 'gCO2e' are removed from the value, which is returned
    as a stripped string (callers convert to float where needed).

    Args:
        lines: iterable of text lines.

    Returns:
        dict mapping each label to its value string. Lines without a ':'
        (for example blank lines) are skipped.
    """
    data = {}

    for line in lines:
        # partition() splits on the first ':' only, so a value that itself
        # contains ':' (e.g. a timestamp) is kept whole instead of truncated.
        label, separator, value = line.strip().partition(':')
        if not separator:
            # Not a "label: value" line; nothing to record.
            continue

        key = label[2:].strip()
        data[key] = value.replace('kWh', '').replace('gCO2e', '').strip()

    return data


def parse_summary(filename):
    """Read a summary file and return its ``(info, data)`` dictionaries.

    The file layout is addressed by fixed position: the lines at indices
    2..4 are parsed into ``info`` and the lines at indices 7..13 into
    ``data``, both via ``get_data``.
    """
    with open(filename, 'r') as handle:
        content = list(handle)

    return (get_data(content[2:5]), get_data(content[7:14]))


def get_average(first, second, third):
    """Return the per-key arithmetic mean of three dictionaries.

    The keys are taken from ``first``; ``second`` and ``third`` must contain
    each of them (a missing key raises ``KeyError``). Values are converted
    with ``float`` before averaging, so numeric strings are accepted.
    """
    return {
        key: (float(value) + float(second[key]) + float(third[key])) / 3
        for key, value in first.items()
    }


def print_summary(data):
    """Return the reported subset of ``data`` (despite the name, nothing is printed).

    The result holds the energy, memory-energy, operational, embodied and
    total carbon entries of ``data``, in that order; a missing entry raises
    ``KeyError``.
    """
    reported_keys = (
        ENERGY_CONSUMPTION,
        MEMORY_CONSUMPTION,
        CARBON_EMISSIONS,
        EMBODIED_CARBON_EMISSIONS,
        TOTAL_CARBON_EMISSIONS,
    )
    return {key: data[key] for key in reported_keys}


def print_info(data_1, data_2, data_3):
    """Print an error message unless all three inputs compare equal.

    Always returns ``None``; nothing is printed when the inputs match.
    """
    if data_1 == data_2 == data_3:
        return

    print("[ERROR] Workflow Data does not match ...")

def report_summary_for(filename_1, filename_2, filename_3):
    """Average the data sections of three summary files and return the reported subset.

    Each file is parsed with ``parse_summary``; only the ``data`` half of
    each result is used (the ``info`` half is discarded). The three data
    dicts are averaged with ``get_average`` and reduced by ``print_summary``.
    """
    runs = [parse_summary(path)[1] for path in (filename_1, filename_2, filename_3)]
    return print_summary(get_average(*runs))

**Workflow Task Resource Scaling -- Processor Governor**

In [35]:
# Build task_stats_avg[region][month][task][machine][governor]: for every
# combination, the three iteration summaries are averaged by report_summary_for.
task_stats_avg = {}

for region in short_regions:
    by_month = task_stats_avg[region] = {}
    for month in short_months:
        by_task = by_month[month] = {}
        for task in tasks:
            by_machine = by_task[task] = {}
            for machine in machines:
                by_gov = by_machine[machine] = {}
                for gov in governors:
                    # One summary file per iteration (1, 2, 3).
                    temp_f = [f'../data/results/resource-scaling/tasks/out/{region}/{machine}-{gov}-{task}-{iteration}-{month}-{region}-2024-{month}-mid-{machine}_{gov}_linear-summary.txt' for iteration in range(1, 4)]
                    by_gov[gov] = report_summary_for(*temp_f)


In [36]:
# Build task_stats_marg[region][month][task][machine][governor] from the
# '-marg-' summary files: for every combination, the three iteration
# summaries are averaged by report_summary_for.
task_stats_marg = {}

for region in short_regions:
    by_month = task_stats_marg[region] = {}
    for month in short_months:
        by_task = by_month[month] = {}
        for task in tasks:
            by_machine = by_task[task] = {}
            for machine in machines:
                by_gov = by_machine[machine] = {}
                for gov in governors:
                    # One summary file per iteration (1, 2, 3).
                    temp_f = [f'../data/results/resource-scaling/tasks/out/{region}/{machine}-{gov}-{task}-{iteration}-{month}-{region}-2024-{month}-mid-marg-{machine}_{gov}_linear-summary.txt' for iteration in range(1, 4)]
                    by_gov[gov] = report_summary_for(*temp_f)


In [49]:
def print_stats_for_month(month, all_data_avg, all_data_marg, task_runtimes):
    """Print and return one month's per-task comparison table.

    Args:
        month: short month key (an entry of ``short_months``).
        all_data_avg: nested dict ``[region][month][task][machine][governor]``
            of summaries computed from the average-CI result files.
        all_data_marg: same structure, computed from the marginal result files.
        task_runtimes: nested dict ``[task][machine][governor]`` of runtimes.

    Returns:
        A list of rows ``[region, task, machine, governor, runtime, energy,
        average emissions, marginal emissions]`` covering every region, task,
        machine and governor. Energy is the sum of the processor and memory
        energy entries, rounded to 3 places; the other figures are rounded
        to 2 places.
    """
    headers = ['region', 'task', 'machine', 'governor', 'runtime (h)', 'energy consumption (kWh)', 'average emissions (gCO2e)', 'marginal emissions (gCO2e)']
    table_data = []

    for region in short_regions:
        for task in tasks:
            for machine in machines:
                for gov in governors:
                    # Read from the arguments rather than the notebook-level
                    # globals, so the function reports the data it was given.
                    curr = all_data_avg[region][month][task][machine][gov]
                    curr_energy = round(curr[ENERGY_CONSUMPTION] + curr[MEMORY_CONSUMPTION], 3)
                    curr_emissions = round(curr[CARBON_EMISSIONS], 2)
                    curr_runtime = round(task_runtimes[task][machine][gov], 2)
                    curr_marg = all_data_marg[region][month][task][machine][gov]
                    curr_marg_emissions = round(curr_marg[CARBON_EMISSIONS], 2)
                    table_data.append([region, task, machine, gov, curr_runtime, curr_energy, curr_emissions, curr_marg_emissions])

    print("Comparison of Tasks @ Varied Frequencies using Average CI Data")
    print(tabulate(table_data, headers, tablefmt='orgtbl'))

    return table_data


In [38]:
def get_runtime(filename):
    """Return the integer in column index 19 of the second line of a CSV file.

    The first line is skipped (presumably a header row -- confirm against
    the trace format). Fields are split on ',' with no quote handling.
    """
    with open(filename, 'r') as trace:
        rows = list(trace)

    return int(rows[1].split(',')[19])

# all_task_runtimes[task][machine][governor]: mean runtime over the three
# trace iterations, divided by 1000 and then 3600 (milliseconds to hours,
# assuming the trace column is in milliseconds -- TODO confirm).
all_task_runtimes = {}

for task in tasks:
    per_machine = all_task_runtimes[task] = {}
    for machine in machines:
        per_gov = per_machine[machine] = {}
        for gov in governors:
            filebase = f'../data/traces/tasks/{task}/{machine}-{gov}-{task}-ITER.csv'
            runtimes = [get_runtime(filebase.replace('ITER', iteration)) for iteration in ('1', '2', '3')]
            avg_runtime = sum(runtimes) / 3
            per_gov[gov] = avg_runtime / 1000 / 3600

In [58]:
# Collect, per machine / task / governor, the emissions from every monthly
# table row: all_ra_data[machine][task][governor] holds the runtime, energy
# and region of the FIRST row seen plus lists of all average and marginal
# emission values.
# NOTE(review): rows from every region and month are appended to the same
# entry, while 'region' records only the first region encountered. Means
# computed from these lists therefore span all regions -- confirm this is the
# intended aggregation and that the 'region' label is not misleading.
all_ra_data = {}

for month in short_months:
    print('\n' + month)
    month_data = print_stats_for_month(month, task_stats_avg, task_stats_marg, all_task_runtimes)

    for c_region, c_task, c_mach, c_gov, c_runtime, c_energy, c_avg_ems, c_marg_ems in month_data:
        # setdefault creates the intermediate levels on first use.
        per_gov = all_ra_data.setdefault(c_mach, {}).setdefault(c_task, {})

        if c_gov not in per_gov:
            per_gov[c_gov] = {
                'region': c_region,
                'runtime': c_runtime,
                'energy': c_energy,
                'avg emissions': [],
                'marg emissions': []
            }

        per_gov[c_gov]['avg emissions'].append(c_avg_ems)
        per_gov[c_gov]['marg emissions'].append(c_marg_ems)


jan
Comparison of Tasks @ Varied Frequencies using Average CI Data
| region   | task          | machine   | governor    |   runtime (h) |   energy consumption (kWh) |   average emissions (gCO2e) |   marginal emissions (gCO2e) |
|----------+---------------+-----------+-------------+---------------+----------------------------+-----------------------------+------------------------------|
| gb       | bowtie2_build | gpg13     | performance |          0.25 |                      0.016 |                        3.86 |                         4.29 |
| gb       | bowtie2_build | gpg13     | powersave   |          0.31 |                      0.017 |                        4.26 |                         5.22 |
| gb       | bowtie2_build | gpg14     | performance |          0.25 |                      0.016 |                        3.93 |                         4.38 |
| gb       | bowtie2_build | gpg14     | powersave   |          0.31 |                      0.018 |                        4.34

In [59]:
# Summarise all_ra_data: one row per machine / task / governor with the mean
# of the collected average and marginal emissions (rounded to 2 places), then
# print the table and write it to CSV.
headers = ['region', 'task', 'machine', 'governor', 'runtime (h)', 'energy consumption (kWh)', 'average emissions (gCO2e)', 'marginal emissions (gCO2e)']
mean_data = []

for machine in machines:
    for task in tasks:
        for gov in governors:
            entry = all_ra_data[machine][task][gov]
            avg_series = entry['avg emissions']
            marg_series = entry['marg emissions']
            mean_data.append([
                entry['region'],
                task,
                machine,
                gov,
                entry['runtime'],
                entry['energy'],
                round(sum(avg_series) / len(avg_series), 2),
                round(sum(marg_series) / len(marg_series), 2),
            ])

print(tabulate(mean_data, headers, tablefmt='orgtbl'))

with open('../data/results/resource-scaling/tasks/table_mean_data.csv', 'w') as f:
    # Header line first, then one comma-joined line per row (no quoting).
    f.writelines(','.join(str(v) for v in row) + '\n' for row in [headers] + mean_data)


| region   | task          | machine   | governor    |   runtime (h) |   energy consumption (kWh) |   average emissions (gCO2e) |   marginal emissions (gCO2e) |
|----------+---------------+-----------+-------------+---------------+----------------------------+-----------------------------+------------------------------|
| gb       | bowtie2_build | gpg13     | performance |          0.25 |                      0.016 |                        5.06 |                         6.17 |
| gb       | bowtie2_build | gpg13     | powersave   |          0.31 |                      0.017 |                        5.58 |                         7.41 |
| gb       | fastp         | gpg13     | performance |          0.14 |                      0.009 |                        3.01 |                         2.32 |
| gb       | fastp         | gpg13     | powersave   |          0.17 |                      0.01  |                        3.16 |                         2.88 |
| gb       | fastqc        | gpg13

**Workflow Resource Scaling -- Processor Governor (8 nodes)**

In [None]:
# Whole-workflow results: for every region, workflow and governor, average the
# three iteration summaries for both the regular and the '-marg-' result
# files, then tabulate and export them.
workflows = ['chipseq', 'rnaseq']
wf_stats_avg = {}
wf_stats_marg = {}

for region in short_regions:
    wf_stats_avg[region] = {}
    wf_stats_marg[region] = {}
    for wf in workflows:
        wf_stats_avg[region][wf] = {}
        wf_stats_marg[region][wf] = {}
        for gov in governors:
            temp_f = [f'../data/results/resource-scaling/workflows/out/{wf}-{gov}-{iteration}-{region}-2024-wf-scale-hu26_{gov}_linear-summary.txt' for iteration in range(1, 4)]
            wf_stats_avg[region][wf][gov] = report_summary_for(temp_f[0], temp_f[1], temp_f[2])

            temp_f2 = [f'../data/results/resource-scaling/workflows/out/{wf}-{gov}-{iteration}-{region}-2024-wf-scale-marg-hu26_{gov}_linear-summary.txt' for iteration in range(1, 4)]
            wf_stats_marg[region][wf][gov] = report_summary_for(temp_f2[0], temp_f2[1], temp_f2[2])


# Column labels must follow the order of the values in each row built below
# (region, workflow, governor, ...); otherwise the printed table and the CSV
# carry mislabelled columns.
headers = ['region', 'workflow', 'governor', 'energy consumption (kWh)', 'average emissions (gCO2e)', 'marginal emissions (gCO2e)']
mean_data = []

for region in short_regions:
    for wf in workflows:
        for gov in governors:
            entry_avg = wf_stats_avg[region][wf][gov]
            entry_marg = wf_stats_marg[region][wf][gov]
            # NOTE(review): energy here is the processor entry only and is not
            # rounded, whereas the per-task table adds the memory energy entry
            # and rounds -- confirm whether the two should agree.
            mean_data.append([region, wf, gov, entry_avg[ENERGY_CONSUMPTION], entry_avg[CARBON_EMISSIONS], entry_marg[CARBON_EMISSIONS]])

# Raw dump of the averaged summaries, kept for inspection.
print(wf_stats_avg)

print(tabulate(mean_data, headers, tablefmt='orgtbl'))

with open('../data/results/resource-scaling/workflows/table_mean_data.csv', 'w') as f:
    f.write(','.join(headers) + '\n')

    for row in mean_data:
        f.write(','.join([str(v) for v in row]) + '\n')

{'gb': {'chipseq': {'performance': {'Energy Consumption (exc. PUE)': 15.325366289047475, 'Memory Energy Consumption (exc. PUE)': 0.08621989247557209, 'Operational Carbon Emissions': 780.6798238805613, 'Embodied Carbon Emissions': 78.35105412325386, 'Total Carbon Emissions': 859.0308780038152}, 'powersave': {'Energy Consumption (exc. PUE)': 21.88665381505049, 'Memory Energy Consumption (exc. PUE)': 0.18664198596138362, 'Operational Carbon Emissions': 2313.2603686580296, 'Embodied Carbon Emissions': 231.36708755418616, 'Total Carbon Emissions': 2544.6274562122157}}, 'rnaseq': {'performance': {'Energy Consumption (exc. PUE)': 8.457312757894043, 'Memory Energy Consumption (exc. PUE)': 0.07387434759618454, 'Operational Carbon Emissions': 331.42443510779884, 'Embodied Carbon Emissions': 43.76241183740219, 'Total Carbon Emissions': 375.186846945201}, 'powersave': {'Energy Consumption (exc. PUE)': 11.197552860338371, 'Memory Energy Consumption (exc. PUE)': 0.13743071802470905, 'Operational Car