In [84]:
# Basic libraries
import shutil 
import os
import docker
import logging
import time
import concurrent.futures
from datetime import datetime
import csv
import re
import pprint
import yaml
import math
# Additional stuff for data handling and analysis
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Specific libraries for machine learning
# Feature extraction and preprocessing
from sklearn.metrics.pairwise import rbf_kernel
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Clustering
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import AgglomerativeClustering
# Dimensionality reduction and embedding
from mvlearn.embed import KMCCA
# Regression based learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [6]:
# Pipeline configurations.
RESULTS_DIR = "/usr/local/bin/results"
SCOPED_RESULTS_DIR = "/usr/local/bin/scoped_results"
CONFIG_FILE = "/usr/local/bin/scoped_results/config.yml"
FIN_CONTAINERS = "/usr/local/bin/scoped_results/died_nextflow_containers.csv"
START_CONTAINERS = "/usr/local/bin/scoped_results/started_nextflow_containers.csv"
META_DATA = "slurm-job-exporter"
DATA_SOURCE = "all"
POWER_METERING = "ebpf-mon"
POWER_STATS= "/usr/local/bin/scoped_results/task_energy_data/ebpf-mon/container_power/containers"

In [66]:
# Read in the monitoring results data.
results = "/usr/local/bin/results"
fin_containers = "/usr/local/bin/results/died_nextflow_containers.csv"
start_containers = "/usr/local/bin/results/started_nextflow_containers.csv"

for root, dirs, files in os.walk(results):
    # print(i)
    for file in files:
        if file.endswith(".csv"):
            file_path = os.path.join(root, file)
            data = pd.read_csv(file_path, index_col=0)
            # print(f"Found CSV file: {file_path}")

In [3]:
# TODO: Include scope results constants here to not read in the data sources that wont be used.
# Create a dict to for the data source and its corresponding primary key
# Identifier to primary key mapping
# name = nxf-container-name (used by cadvisor, ebpf-mon)
# container_name = nxf-container-name (used by docker-activity)
# workDir = nxf-container-workdir (used by slurm-exporter)

def readInResultsConf(config_file):
    """
    Read in the results configuration file and return a dictionary.
    """
    monitoring_config = config_file
    with open(monitoring_config, 'r') as file:
        data = yaml.load(file, Loader=yaml.FullLoader)

    filtered_sources = []
    seen = set()
    for target in data['monitoring_targets'].values():
        ds = target.get('data_sources')
        if ds:
            if isinstance(ds, dict):
                ds = [ds]
            for entry in ds:
                filtered = {k: entry[k] for k in ('identifier', 'source') if k in entry}
                if (
                    'source' in filtered and
                    filtered['source'] == 'slurm-job-exporter'
                ):
                    continue
                if 'source' in filtered and 'identifier' in filtered:
                    key = (filtered['source'], filtered['identifier'])
                    if key not in seen:
                        filtered_sources.append(filtered)
                        seen.add(key)
    pprint.pprint(filtered_sources)
    return filtered_sources

filtered_sources = readInResultsConf("/usr/local/bin/results/config.yml")

[{'identifier': 'name', 'source': 'cAdvisor'},
 {'identifier': 'name', 'source': 'ebpf-mon'},
 {'identifier': 'container_name', 'source': 'docker-activity'}]


In [7]:
# Set the scope for the results data
def resultsScope(results_dir, meta_data, data_source, power_metering):
    """
   Creates a copy of the results directory and returns the cleaned file tree depending on the users scope definition.
   Meta data, data source and power metering are mandatory scope definitions.
    """
    scoped_results_dir = shutil.copytree(results_dir, "/usr/local/bin/scoped_results", dirs_exist_ok=True)
    if data_source == 'all':
        print("Data source is set to 'all', no filtering will be applied.")
        return scoped_results_dir
    for metric in os.listdir(scoped_results_dir):
        metric_path = os.path.join(scoped_results_dir, metric)
        if not os.path.isdir(metric_path):
           continue 
    # Walk from base dir and rm all dirs that do not match the scope and the power dirs. 
        for subdir in os.listdir(metric_path):
            subdir_path = os.path.join(metric_path, subdir)
            subdir_name = os.path.basename(subdir_path)
            # print("Sub directory name:", subdir_name)
            if os.path.isdir(subdir_path) and subdir_name not in [meta_data, data_source, power_metering]:
                shutil.rmtree(subdir_path, ignore_errors=True)
    print("Successfully scoped results directory:", scoped_results_dir) 
    return scoped_results_dir

scoped_results = resultsScope(RESULTS_DIR, META_DATA, DATA_SOURCE, POWER_METERING) 

Data source is set to 'all', no filtering will be applied.


In [8]:
def split_task_timeseries_by_datasource(results_dir, datasource_identifier_map, nextflow_pattern=r"nxf-[A-Za-z0-9]{23}"):
    """
    For each data source in datasource_identifier_map, traverse the results_dir,
    and for each metric, split the time series CSVs into per-task files using the correct identifier column.
    """
    for datasource, identifier in datasource_identifier_map.items():
        for root, dirs, files in os.walk(results_dir):
            if os.path.basename(root) == datasource:
                for metric in os.listdir(root):
                    metric_path = os.path.join(root, metric)
                    if os.path.isdir(metric_path):
                        containers_dir = os.path.join(metric_path, "containers")
                        os.makedirs(containers_dir, exist_ok=True)
                        for file in os.listdir(metric_path):
                            if file.endswith(".csv"):
                                file_path = os.path.join(metric_path, file)
                                df = pd.read_csv(file_path)
                                if identifier not in df.columns:
                                    print(f"Identifier '{identifier}' not found in {file_path}, skipping.")
                                    continue
                                for task_name in df[identifier].unique():
                                    if pd.isna(task_name):
                                        continue
                                    if re.match(nextflow_pattern, str(task_name)):
                                        task_df = df[df[identifier] == task_name]
                                        out_path = os.path.join(containers_dir, f"{task_name}.csv")
                                        task_df.to_csv(out_path, index=False)
                                        # print(f"Saved data for {task_name} to {out_path}")
    print("Finished splitting time series data by data source.")

datasource_identifier_map = {d['source']: d['identifier'] for d in filtered_sources}
split_task_timeseries_by_datasource(scoped_results, datasource_identifier_map)

Finished splitting time series data by data source.


In [9]:
def report_missing_tasks_all_sources(results_dir, datasource_identifier_map, fin_containers_df, container_workdirs, nextflow_pattern=r"nxf-[A-Za-z0-9]{23}"):
    """
    For each data source, report how many tasks are missing compared to the finished containers.
    """
    workdir_containers = set(container_workdirs.keys())
    for datasource, identifier in datasource_identifier_map.items():
        found_containers = set()
        for root, dirs, files in os.walk(results_dir):
            if os.path.basename(root) == datasource:
                for metric in os.listdir(root):
                    metric_path = os.path.join(root, metric)
                    if os.path.isdir(metric_path):
                        for file in os.listdir(metric_path):
                            if file.endswith(".csv"):
                                file_path = os.path.join(metric_path, file)
                                df = pd.read_csv(file_path)
                                if identifier not in df.columns:
                                    continue
                                found_containers.update(
                                    str(name) for name in df[identifier].unique()
                                    if pd.notna(name) and re.match(nextflow_pattern, str(name))
                                )
        missing_in_source = workdir_containers - found_containers
        missing_in_workdirs = found_containers - workdir_containers
        print(f"--- {datasource} ---")
        print("Containers in monitored list but NOT in", datasource + ":", missing_in_source)
        print("Count:", len(missing_in_source))
        print("Containers in", datasource, "but NOT in monitored list:", missing_in_workdirs)
        print("Count:", len(missing_in_workdirs))
        print()
        
datasource_identifier_map = {d['source']: d['identifier'] for d in filtered_sources}
fin_containers = "/usr/local/bin/results/died_nextflow_containers.csv"
fin_containers_df = pd.read_csv(fin_containers)
container_workdirs = {row['Name']: row['WorkDir'] for idx, row in fin_containers_df.iterrows()}
report_missing_tasks_all_sources(scoped_results, datasource_identifier_map, fin_containers_df, container_workdirs)

--- cAdvisor ---
Containers in monitored list but NOT in cAdvisor: {'nxf-Yai5kgrU7pqBw8NF0amKCHBl', 'nxf-bSbMDssVdOQA80DS0hoeLY0P', 'nxf-t95097tnO1NDiAhs9ZHmVV7A', 'nxf-JGwoD4w6hYZEFydBJqoxaCR7'}
Count: 4
Containers in cAdvisor but NOT in monitored list: set()
Count: 0

--- ebpf-mon ---
Containers in monitored list but NOT in ebpf-mon: {'nxf-Yai5kgrU7pqBw8NF0amKCHBl', 'nxf-NxaSUuCkf1ChnPNT80phUox0', 'nxf-t95097tnO1NDiAhs9ZHmVV7A', 'nxf-JGwoD4w6hYZEFydBJqoxaCR7', 'nxf-bSbMDssVdOQA80DS0hoeLY0P'}
Count: 5
Containers in ebpf-mon but NOT in monitored list: set()
Count: 0

--- docker-activity ---
Containers in monitored list but NOT in docker-activity: {'nxf-Yai5kgrU7pqBw8NF0amKCHBl', 'nxf-NxaSUuCkf1ChnPNT80phUox0', 'nxf-J20Bjy5AuGQ9uEymEJRMfJGx', 'nxf-t95097tnO1NDiAhs9ZHmVV7A', 'nxf-JGwoD4w6hYZEFydBJqoxaCR7', 'nxf-bSbMDssVdOQA80DS0hoeLY0P'}
Count: 6
Containers in docker-activity but NOT in monitored list: set()
Count: 0



In [10]:
def add_workdir_to_all_task_csvs(results_dir, container_workdirs):
    """
    For every data source and metric, update each per-task CSV in 'containers' subfolders
    with the correct WorkDir from container_workdirs.
    """
    for root, dirs, files in os.walk(results_dir):
        if os.path.basename(root) == "containers":
            for file in files:
                if file.endswith(".csv"):
                    file_path = os.path.join(root, file)
                    fin_container_df = pd.read_csv(file_path)
                    container_name = os.path.splitext(file)[0]
                    if container_name in container_workdirs:
                        workdir = container_workdirs[container_name]
                        fin_container_df['WorkDir'] = workdir
                        fin_container_df.to_csv(file_path, index=False)
                        # print(f"Updated {file_path} with work directory {workdir}")

add_workdir_to_all_task_csvs(scoped_results, container_workdirs)

In [12]:
# TODO: Maybe update results path with scoped results path.
def extract_slurm_job_metadata(slurm_metadata_path, slurm_job_col="job_name"):
    """
    Extracts slurm job metadata from time-series CSVs and writes each job's data to a separate file.
    """
    for file in os.listdir(slurm_metadata_path):
        if file.endswith("slurm_job_id.csv"):
            file_path = os.path.join(slurm_metadata_path, file)
            print(f"Reading file: {file_path}")
            df = pd.read_csv(file_path)
            for job_name in df[slurm_job_col].unique():
                if pd.isna(job_name):
                    continue
                print(f"Processing job: {job_name}")
                job_df = df[df[slurm_job_col] == job_name]
                out_path = os.path.join(slurm_metadata_path, f"{job_name}.csv")
                job_df.to_csv(out_path, index=False)
                print(f"Saved data for {job_name} to {out_path}")

extract_slurm_job_metadata("/usr/local/bin/results/task_metadata/slurm-job-exporter/slurm_job_id")

Reading file: /usr/local/bin/results/task_metadata/slurm-job-exporter/slurm_job_id/slurm_job_id.csv
Processing job: nf-NFCORE_SAREK_PREPARE_GENOME_UNZIP_ALLELES_(G1000_alleles_hg38.zip)
Saved data for nf-NFCORE_SAREK_PREPARE_GENOME_UNZIP_ALLELES_(G1000_alleles_hg38.zip) to /usr/local/bin/results/task_metadata/slurm-job-exporter/slurm_job_id/nf-NFCORE_SAREK_PREPARE_GENOME_UNZIP_ALLELES_(G1000_alleles_hg38.zip).csv
Processing job: nf-NFCORE_SAREK_PREPARE_INTERVALS_TABIX_BGZIPTABIX_INTERVAL_COMBINED_(S07604624_Padded_Agilent_SureSelectXT_allexons_V6_UTR)
Saved data for nf-NFCORE_SAREK_PREPARE_INTERVALS_TABIX_BGZIPTABIX_INTERVAL_COMBINED_(S07604624_Padded_Agilent_SureSelectXT_allexons_V6_UTR) to /usr/local/bin/results/task_metadata/slurm-job-exporter/slurm_job_id/nf-NFCORE_SAREK_PREPARE_INTERVALS_TABIX_BGZIPTABIX_INTERVAL_COMBINED_(S07604624_Padded_Agilent_SureSelectXT_allexons_V6_UTR).csv
Processing job: nf-NFCORE_SAREK_PREPARE_INTERVALS_CREATE_INTERVALS_BED_(S07604624_Padded_Agilent_Sure

In [13]:
# TODO: Maybe update results path with scoped results path.
def update_finished_containers_with_nfcore_task(slurm_metadata_path, fin_containers, workdir_col='WorkDir', slurm_workdir_col='work_dir', slurm_job_col='job_name'):
    """
    Update the finished containers file with the nf-core task name (Nextflow) by matching work directories
    with slurm job metadata.
    """

    updated = False
    for file in os.listdir(slurm_metadata_path):
        if file.endswith("slurm_job_id.csv"):
            file_path = os.path.join(slurm_metadata_path, file)
            print(f"Reading file: {file_path}")
            df = pd.read_csv(file_path)
            fin_df = pd.read_csv(fin_containers)
            if workdir_col in fin_df.columns and slurm_workdir_col in df.columns:
                for idx, row in df.iterrows():
                    work_dir = row[slurm_workdir_col]
                    slurm_job = row[slurm_job_col]
                    if pd.isna(work_dir) or pd.isna(slurm_job):
                        print(f"Skipping row {idx} due to missing WorkDir or slurm_job.")
                        continue
                    # Update fin_df where WorkDir matches
                    fin_df.loc[fin_df[workdir_col] == work_dir, 'Nextflow'] = slurm_job
                # Write back the updated fin_df
                fin_df.to_csv(fin_containers, index=False)
                print(f"Updated {fin_containers} with slurm job info.")
                updated = True
            else:
                print("WorkDir or job_name column missing in DataFrames.")
    if not updated:
        print("No updates were made to the finished containers file.")

slurm_metadata_path = os.path.join(scoped_results, "task_metadata", "slurm-job-exporter", "slurm_job_id")
update_finished_containers_with_nfcore_task(slurm_metadata_path, FIN_CONTAINERS)

Reading file: /usr/local/bin/scoped_results/task_metadata/slurm-job-exporter/slurm_job_id/slurm_job_id.csv
Updated /usr/local/bin/scoped_results/died_nextflow_containers.csv with slurm job info.


In [14]:
def add_nextflow_to_all_task_csvs(results_dir, fin_containers_file, workdir_col='WorkDir', nextflow_col='Nextflow'):
    """
    For every data source and metric, update each per-task CSV in 'containers' subfolders
    with the correct Nextflow task value from the finished containers file.
    """
    fin_df = pd.read_csv(fin_containers_file)
    # Ensure WorkDir is string and stripped in fin_df
    fin_df[workdir_col] = fin_df[workdir_col].astype(str).str.strip()
    for root, dirs, files in os.walk(results_dir):
        if os.path.basename(root) == "containers":
            for file in files:
                if file.endswith(".csv"):
                    file_path = os.path.join(root, file)
                    container_df = pd.read_csv(file_path)
                    if workdir_col in container_df.columns:
                        # Ensure WorkDir is string and stripped in container_df
                        container_df[workdir_col] = container_df[workdir_col].astype(str).str.strip()
                        workdir = container_df[workdir_col].iloc[0]
                        match = fin_df[fin_df[workdir_col] == workdir]
                        if not match.empty and nextflow_col in match.columns:
                            nextflow_value = match[nextflow_col].values[0]
                            container_df[nextflow_col] = nextflow_value
                            container_df.to_csv(file_path, index=False)
                            print(f"Updated {file_path} with Nextflow value {nextflow_value}")
                        else:
                            print(f"No matching Nextflow value found for WorkDir {workdir} in {file_path}") 

add_nextflow_to_all_task_csvs(scoped_results, FIN_CONTAINERS)

Updated /usr/local/bin/scoped_results/task_disk_data/cAdvisor/container_fs_reads_bytes_total/containers/nxf-DCzpSRywCGxNp3AqAQG8NWp1.csv with Nextflow value nf-NFCORE_SAREK_SAREK_FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON_BWAMEM1_MEM_(HCC1395T)
Updated /usr/local/bin/scoped_results/task_disk_data/cAdvisor/container_fs_reads_bytes_total/containers/nxf-sAPk9MYk9lmJfaSU7zQ0wj3x.csv with Nextflow value nf-NFCORE_SAREK_SAREK_FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON_BWAMEM1_MEM_(HCC1395N)
Updated /usr/local/bin/scoped_results/task_disk_data/cAdvisor/container_fs_reads_bytes_total/containers/nxf-1XjcuKDW0JDyexpUcyj4QJVi.csv with Nextflow value nf-NFCORE_SAREK_SAREK_FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON_BWAMEM1_MEM_(HCC1395N)
Updated /usr/local/bin/scoped_results/task_disk_data/cAdvisor/container_fs_reads_bytes_total/containers/nxf-pIZPf1BAcpQS87kvUO89Lq7J.csv with Nextflow value nf-NFCORE_SAREK_SAREK_FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON_BWAMEM1_MEM_(HCC1395N)
Updated /usr/local/bin/scope

In [15]:
# TODO: If necessary deal with the numerical format of the signatures.
def build_container_temporal_signatures_scoped_sources(results_dir, fin_containers_file):
    """
    Build feature vectors for the scoped data sources and metrics by scanning every containers directory
    under every metric for every data source. Returns a dictionary of container temporal signatures.
    As the power consumption data of the workflow tasks will be used as labels to train models, it will be excluded from the temporal signatures.
    Each container will have a 'temporal_signatures' dict with keys like 'source/metric' for every metric from the scoped data source(s).
    """
    df = pd.read_csv(fin_containers_file)
    container_temporal_signatures = {}
    for idx, row in df.iterrows():
        container_temporal_signatures[row['Name']] = {
            'temporal_signatures': {}
        }

    # Feature vectors
    for root, dirs, files in os.walk(results_dir):
        if "task_energy_data" in root.split(os.sep):
            continue
        if os.path.basename(root) == "containers":
            metric_name = os.path.basename(os.path.dirname(root))
            for file in files:
                if file.endswith(".csv"):
                    file_path = os.path.join(root, file)
                    ts_container_df = pd.read_csv(file_path)
                    ts_container_df['timestamp'] = pd.to_datetime(ts_container_df['timestamp'], unit='ns')
                    ts_container_df.set_index('timestamp', inplace=True)
                    value_cols = [col for col in ts_container_df.columns if col.startswith('Value')]
                    if not value_cols:
                        print(f"Skipping {file_path} as it does not contain 'value' column.")
                        continue
                    resource_series = ts_container_df[value_cols[0]]  

                    # Feature extraction
                    peak_value = resource_series.max()
                    lowest_value = resource_series.min()
                    mean_value = resource_series.mean()
                    median_value = resource_series.median()
                    variance = resource_series.var()
                    mean_val = resource_series.mean()
                    if mean_val == 0:
                        relative_variance = 0.0  
                    else:
                        relative_variance = (resource_series.var() - mean_val**2) / (mean_val**2)
                    std_dev = resource_series.std()
                    pattern_vector = resource_series.iloc[np.round(np.linspace(0, len(resource_series) - 1, 10)).astype(int)].to_numpy()

                    # The server spec can come from the host benchmark in nextflow
                    server_spec = {
                        'GHz x Cores': "",
                        'GFlops': "",
                        'RAM': "",
                        'IOPS': "",
                        'Max Network Throughput': "",
                    }

                    feature_vector = { 
                        'peak_value': peak_value, 'lowest_value': lowest_value, 'mean': mean_value, 
                        'variance': variance
                    }

                    # feature_vector = { 
                    #     'peak_value': peak_value, 'lowest_value': lowest_value, 'mean': mean_value, 'median': median_value, 
                    #     'variance': variance,'relative_variance': relative_variance, 'std_dev':std_dev, 
                    #     'pattern_vector': pattern_vector, 'server_spec': server_spec
                    # }
                    
                    container_name = os.path.splitext(file)[0]
                    if container_name in container_temporal_signatures:
                        if feature_vector is not None and feature_vector != {}:
                            # Validation step to account for missing feature values
                            expected_keys = ['peak_value', 'lowest_value', 'mean', 'variance']
                            missing_values = [key for key in expected_keys if key not in feature_vector or feature_vector[key] is None]
                            if missing_values:
                                print(f"Warning: Missing values in feature vector for {container_name} in {metric_name}: {missing_values}")
                            if 'pattern_vector' in feature_vector:
                                if not isinstance(feature_vector['pattern_vector'],np.ndarray):
                                    print(f"WARNING: {container_name} {metric_name} pattern_vector shape: {feature_vector['pattern_vector'].shape}")
                            container_temporal_signatures[container_name]['temporal_signatures'][metric_name] = feature_vector
    pprint.pprint(container_temporal_signatures)
    return container_temporal_signatures

container_temporal_signatures = build_container_temporal_signatures_scoped_sources(scoped_results, FIN_CONTAINERS)

{'nxf-0Q4uzTd1MwW3RtoMMle3ddKg': {'temporal_signatures': {'container_blkio_device_usage_total': {'lowest_value': 4096.0,
                                                                                                 'mean': 6144.0,
                                                                                                 'peak_value': 8192.0,
                                                                                                 'variance': 4218832.093567251},
                                                          'container_cpu_user_seconds_total': {'lowest_value': 0.020038,
                                                                                               'mean': 1185.1207543114035,
                                                                                               'peak_value': 2003.171991,
                                                                                               'variance': 472841.08923190285},
                        

In [16]:
def cleanFeatureVectors(container_temporal_signatures):
    """
    Clean the feature vectors by removing containers that have no temporal signatures.
    This function modifies the input dictionary in place.
    """
    cleaned_container_temporal_signatures = container_temporal_signatures.copy()
    none_counter = 0
    to_delete = []
    for name, info in cleaned_container_temporal_signatures.items():
        if not info['temporal_signatures']:
            none_counter += 1
            # print(f"Container {name} has no temporal signatures. Will be deleted.")
            to_delete.append(name)
    print(f"Total containers with no signature for any metric: {none_counter}")

    # Delete containers with no temporal signatures.
    for name in to_delete:
        del cleaned_container_temporal_signatures[name]
    # pprint.pprint(container_temporal_signatures)

    print(f"Remaining containers after cleaning: {len(cleaned_container_temporal_signatures)}")

    all_metrics = set()
    for info in cleaned_container_temporal_signatures.values():
        all_metrics.update(info['temporal_signatures'].keys())
    all_metrics = sorted(all_metrics)
    print(f"All metrics found: {all_metrics}")

    all_feature_names = set()
    for info in cleaned_container_temporal_signatures.values():
        for metric in info['temporal_signatures'].values():
            all_feature_names.update([k for k in metric.keys()])
    all_feature_names = sorted(all_feature_names)
    
    containers_with_all_metrics = []

    for container, info in cleaned_container_temporal_signatures.items():
        if set(info['temporal_signatures'].keys()) == set(all_metrics):
            containers_with_all_metrics.append(container)
    print(f"Keeping {len(containers_with_all_metrics)} containers with all metrics.")

    # pprint.pprint(containers_with_all_metrics)
    # pprint.pprint(all_metrics)
    # pprint.pprint(all_feature_names)
    pprint.pprint(cleaned_container_temporal_signatures)
    return cleaned_container_temporal_signatures, containers_with_all_metrics, all_metrics, all_feature_names

cleaned_container_temporal_signatures, containers_with_all_metrics, all_metrics, all_feature_names = cleanFeatureVectors(container_temporal_signatures)

Total containers with no signature for any metric: 4
Remaining containers after cleaning: 47
All metrics found: ['container_blkio_device_usage_total', 'container_cpu_user_seconds_total', 'container_fs_io_current', 'container_fs_reads_bytes_total', 'container_fs_writes_bytes_total', 'container_memory_usage_bytes', 'container_weighted_cycles', 'cpuPercent', 'memoryUsage']
Keeping 33 containers with all metrics.
{'nxf-0Q4uzTd1MwW3RtoMMle3ddKg': {'temporal_signatures': {'container_blkio_device_usage_total': {'lowest_value': 4096.0,
                                                                                                 'mean': 6144.0,
                                                                                                 'peak_value': 8192.0,
                                                                                                 'variance': 4218832.093567251},
                                                          'container_cpu_user_seconds_total': {'lowest_va

### Feature Extraction for Container Metrics

#### Given

- **Containers**  
  `C = {c1, c2, …, cn}`  
  *Example:* `nxf-0X0tQJagkeWOAir2jS124FfK`, `nxf-0mUZ0M8vpF30z1CEoXjCQQbH`, …

- **Metrics**  
  `M = {container_weighted_cycles}`

- **Feature Table**

| Container Name                | lowest_value        | mean               | peak_value         | variance           |
|-------------------------------|---------------------|--------------------|--------------------|--------------------|
| nxf-0X0tQJagkeWOAir2jS124FfK  | 2.55 | 1.93 | 1.99 | 1.02  |
| nxf-0mUZ0M8vpF30z1CEoXjCQQbH  | 1.03  | 1.03| 1.03  | 0                 |

- **Features per Metric**  
  `F = {lowest_value, mean, peak_value, variance}`

##### Feature Vector

For each container `c_i` and metric `m` in `M`, extract:

$$
\mathbf{x}_i =
\begin{bmatrix}
\text{lowest\_value}(c_i, m) \\
\text{mean}(c_i, m) \\
\text{peak\_value}(c_i, m) \\
\text{variance}(c_i, m)
\end{bmatrix}
$$

##### Matrix form

$$
X =
\begin{bmatrix}
x_{1,1} & x_{1,2} & x_{1,3} & x_{1,4} \\\\
x_{2,1} & x_{2,2} & x_{2,3} & x_{2,4} \\\\
\vdots  & \vdots  & \vdots  & \vdots  \\\\
x_{n,1} & x_{n,2} & x_{n,3} & x_{n,4}
\end{bmatrix}
$$

In [41]:
def buildFeatureMatriceInput(containers_with_all_metrics, all_metrics, all_feature_names):
    """
    Build the feature matrices for the containers with all metrics.
    Returns the feature matrix and the container names.
    """

    # Generate column names for all metric-feature pairs
    full_feature_names = []
    for metric in all_metrics:
        for name in all_feature_names:
            full_feature_names.append(f"{metric}_{name}")
            
    # Those containers who have all metrics (some are not caught by all exporters)
    feature_matrix_x = []
    container_names_x = []

    for container in containers_with_all_metrics:
        info = cleaned_container_temporal_signatures[container]
        # pprint.pprint(info['temporal_signatures'])
        row = []
        for metric in all_metrics:
            feats = info['temporal_signatures'].get(metric)
            # print(f"Processing container: {container}, metric: {metric}")
            if feats:
                for name in all_feature_names:
                    value = feats.get(name)
                    if isinstance(value, np.ndarray): # handles the pattern feature being a numpy array
                        row.extend(value.tolist())
                    else:
                        row.append(value)
        feature_matrix_x.append(row)
        container_names_x.append(container)

    # Convert into feature matrix K_x
    feature_matrix_x = np.array(feature_matrix_x)
    print(f"Feature matrix shape: {feature_matrix_x.shape}")
    df = pd.DataFrame(feature_matrix_x, columns=full_feature_names)
    print(df)
    return feature_matrix_x, full_feature_names, container_names_x

feature_matrix_x, full_feature_names, container_names_x = buildFeatureMatriceInput(containers_with_all_metrics, all_metrics, all_feature_names)
pprint.pprint(full_feature_names)

Feature matrix shape: (33, 36)
    container_blkio_device_usage_total_lowest_value  \
0                                               0.0   
1                                               0.0   
2                                            4096.0   
3                                               0.0   
4                                               0.0   
5                                               0.0   
6                                               0.0   
7                                               0.0   
8                                               0.0   
9                                               0.0   
10                                              0.0   
11                                              0.0   
12                                              0.0   
13                                              0.0   
14                                              0.0   
15                                              0.0   
16                                

In [19]:
# Add power values from one chosen data source to all nextflow files for each data source.
# First just add the power values to fin_containers.
def addPowerToFinContainers(fin_containers, containers_with_all_metrics, power_stats):
    """
    Add power values to the finished containers file.
    """
    fin_df = pd.read_csv(fin_containers)
    power_stat_files = set(f[:-4] for f in os.listdir(power_stats) if f.endswith('.csv'))
    # print(power_stat_files)

    for container in containers_with_all_metrics:
        # print(container)
        if container in power_stat_files:
            power_df = pd.read_csv(os.path.join(power_stats, f"{container}.csv"))
            # print(power_df.head())
            mean_power = power_df['Value (microjoules)'].mean() if 'Value (microjoules)' in power_df.columns else None
            fin_df.loc[fin_df['Name'] == container, 'MeanPower'] = mean_power
    fin_df.to_csv(fin_containers, index=False)
    return fin_df

fin_df = addPowerToFinContainers(FIN_CONTAINERS, containers_with_all_metrics, POWER_STATS)
# pprint.pprint(fin_df)

### Feature Extraction for Container Runtime and Power

#### Given

- **Containers**  
  `C = {c1, c2, …, cn}`  
  *Example:* `nxf-0X0tQJagkeWOAir2jS124FfK`, `nxf-0mUZ0M8vpF30z1CEoXjCQQbH`, …

- **Metrics**  
  `M = {runtime, power}`

- **Feature Table**

| Container Name                | runtime (s) | power (μJ)      |
|-------------------------------|-------------|-----------------|
| nxf-0X0tQJagkeWOAir2jS124FfK  | 123.4       | 1.23        |
| nxf-0mUZ0M8vpF30z1CEoXjCQQbH  | 98.7        | 2.34       |

- **Features per Container**  
  `F = {runtime, power}`

##### Feature Vector

For each container `c_i`, extract:

$$
\mathbf{y}_i =
\begin{bmatrix}
\text{runtime}(c_i) \\
\text{power}(c_i)
\end{bmatrix}
$$

##### Matrix form

$$
Y =
\begin{bmatrix}
y_{1,1} & y_{1,2} \\\\
y_{2,1} & y_{2,2} \\\\
\vdots  & \vdots  \\\\
y_{n,1} & y_{n,2}
\end{bmatrix}
$$

Where each row corresponds to a container, and the columns are:
- `runtime`: execution time in seconds
- `power`: mean power consumption in microjoules

In [20]:
# Build feature output matrix for KCCA model.
def buildFeatureMatriceOutput(fin_df):
    """
    Build the feature matrices for the finished containers.
    Returns the feature matrix and the container names only for containers with available power values.
    """
    container_runtime_power = {}

    fin_df['LifeTime_s'] = (
        fin_df['LifeTime']
        .str.extract(r'([0-9.]+)(ms|s)', expand=True)
        .assign(
            value=lambda x: x[0].astype(float),
            seconds=lambda x: np.where(x[1] == 'ms', x['value'] / 1000, x['value'])
        )['seconds']
    )

    for idx, row in fin_df.iterrows():
        container_runtime_power[row['Name']] = {
            'runtime': row['LifeTime_s'],
            'power': row['MeanPower']
        }
        
    feature_matrix_y = []
    container_names_y = []

    for container, info in container_runtime_power.items():
        if container not in cleaned_container_temporal_signatures:
            continue
        if pd.notna(info['runtime']) and pd.notna(info['power']):
            feature_matrix_y.append([info['runtime'], info['power']])
            container_names_y.append(container)
            
    # Transform feature matrix K_y into numpy array
    feature_matrix_y = np.array(feature_matrix_y)
    print(f"Feature matrix shape: {feature_matrix_y.shape}")
    df = pd.DataFrame(feature_matrix_y, columns=['runtime', 'power'])
    print(df)

    return feature_matrix_y, container_names_y

finished_containers_dfs_with_power = addPowerToFinContainers(FIN_CONTAINERS, containers_with_all_metrics,POWER_STATS) 
feature_matrix_y = buildFeatureMatriceOutput(finished_containers_dfs_with_power)

Feature matrix shape: (33, 2)
      runtime       power
0   59.548863    6.496155
1   53.175117  225.025966
2   46.426578  240.184366
3   46.762919   24.579077
4   45.793670  256.548187
5   43.455630  248.746481
6   44.249097  220.017799
7    0.958273   27.555464
8   41.359273  186.599306
9   37.302691  195.881560
10  34.573092  191.470158
11  34.374786  254.304466
12  34.707160  253.612773
13  48.849550  279.803949
14  34.654697  279.753171
15  50.874116  369.359143
16  49.923590  403.952780
17  50.525501  348.337542
18  49.453427  344.794584
19  50.567061  390.085584
20  50.306627  319.980598
21  51.197768  232.641956
22  50.086856  346.933412
23  50.740052  326.995385
24  50.295974  328.254649
25  52.834660  304.828611
26   3.022350   42.339426
27  56.095345    8.417772
28  52.538233    0.333439
29  55.659238   46.906973
30  25.138424   65.651936
31   0.646006   27.709335
32  25.803028   13.531401


In [21]:
# Debugging output to check if the container names in X and Y match and order is the same.
container_names_x = feature_matrix_x[1]
container_names_y = feature_matrix_y[1]
print("X names:", container_names_x[:5])
print("Y names:", container_names_y[:5])
print("Length X:", len(container_names_x))
print("Length Y:", len(container_names_y))
print("All X in Y:", all(name in container_names_y for name in container_names_x))
print("All Y in X:", all(name in container_names_x for name in container_names_y))
print("Order identical:", container_names_x == container_names_y)

X names: ['nxf-mEtwQSB2fF1zwFwMdrk4AJ0W', 'nxf-9oUdO4sl7BdpAnHASfQhLlrX', 'nxf-sAPk9MYk9lmJfaSU7zQ0wj3x', 'nxf-UOzR0F8D9pclTsQSQbiMePeV', 'nxf-u9U8E8iqlCigZKyInNfL93hA']
Y names: ['nxf-mEtwQSB2fF1zwFwMdrk4AJ0W', 'nxf-9oUdO4sl7BdpAnHASfQhLlrX', 'nxf-sAPk9MYk9lmJfaSU7zQ0wj3x', 'nxf-UOzR0F8D9pclTsQSQbiMePeV', 'nxf-u9U8E8iqlCigZKyInNfL93hA']
Length X: 33
Length Y: 33
All X in Y: True
All Y in X: True
Order identical: True


#### Z-Score Transformation and Standard Scaling performed on Feature and Label Matrices

**Standard scaling** (also known as z-score normalization) is a technique used to standardize the features of a dataset so that they have the properties of a standard normal distribution with a mean of 0 and a standard deviation of 1.

##### Formula

The standard score (z-score) for a value \( x \) is calculated as:

$$
z = \frac{x - \mu}{\sigma}
$$

where:
- \( x \) is the original value,
- \( \mu \) is the mean of the training samples,
- \( \sigma \) is the standard deviation of the training samples.

##### How StandardScaler Works

- **Centering**: Subtracts the mean value of each feature so that the feature is centered around zero.
- **Scaling**: Divides each centered feature by its standard deviation so that the resulting distribution has unit variance.

This transformation is performed **independently for each feature**.
- Many machine learning algorithms assume that all features are centered around zero and have the same scale.
- Features with larger scales can dominate the objective function and negatively impact model performance.
- Standard scaling ensures that each feature contributes equally to the model.
- StandardScaler is sensitive to outliers: extreme values can affect the mean and standard deviation, leading to less robust scaling.
- For sparse data, one can disable mean centering to preserve sparsity.

In [22]:
def scaleFeatureMatrices(feature_matrix_x, feature_matrix_y):
    """
    Scale the feature matrices using StandardScaler.
    Returns the scaled feature matrices.
    """
    scaler_x = StandardScaler()
    scaler_y = StandardScaler()

    scaled_x = scaler_x.fit_transform(feature_matrix_x)
    scaled_y = scaler_y.fit_transform(feature_matrix_y)

    print(f"Scaled feature matrix X shape: {scaled_x.shape}")
    print(f"Scaled feature matrix Y shape: {scaled_y.shape}")
    
    return scaled_x, scaled_y, scaler_x, scaler_y

scaled_feature_matrix_x, scaled_feature_matrix_y, scaler_x, scaler_y = scaleFeatureMatrices(feature_matrix_x[0], feature_matrix_y[0])

Scaled feature matrix X shape: (33, 36)
Scaled feature matrix Y shape: (33, 2)


#### Train-Test Split Procedure

To evaluate machine learning models, it is common practice to split the available data into **training** and **testing** subsets. This ensures that model evaluation is performed on data not seen during training, providing an unbiased estimate of model performance.

The `train_test_split` function from scikit-learn is a utility that splits arrays or matrices into random train and test subsets. It shuffles and splits the data in a single call.

**Parameters:**
- **arrays**: Input data to split (e.g., numpy arrays, pandas DataFrames, lists). All arrays must have the same length.
- **test_size**: Proportion (float between 0.0 and 1.0) or absolute number (int) of samples to include in the test split. Default is 0.25 if not specified.
- **train_size**: Proportion or absolute number of samples to include in the train split. If None, set to the complement of `test_size`.
- **random_state**: Controls the shuffling applied to the data before splitting. Setting an integer ensures reproducibility.
- **shuffle**: Whether to shuffle the data before splitting (default: True).
- **stratify**: If not None, data is split in a stratified fashion using this as class labels.

**Returns:**
- The function returns lists or arrays containing the train-test split of the inputs, preserving the input type (e.g., numpy array, pandas DataFrame).

In [34]:
# Debug
# Check the values of the test data set against the kcca predictions before z-score normalization.
def splitFeatureMatrices(feature_matrix_x, feature_matrix_y, container_names_x, container_names_y):
    """
    Split the feature matrices into training and testing sets.
    """
    X_train, X_test, y_train, y_test, train_container_names_x, test_container_names_x, train_container_names_y, test_container_names_y = train_test_split(
        feature_matrix_x, feature_matrix_y, container_names_x, container_names_y, test_size=0.2, random_state=42
    )
    print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")
    return X_train, X_test, y_train, y_test, train_container_names_x, test_container_names_x, train_container_names_y, test_container_names_y

X_train, X_test, y_train, y_test, train_container_names_x, test_container_names_x, train_container_names_y, test_container_names_y = splitFeatureMatrices(feature_matrix_x[0], feature_matrix_y[0], feature_matrix_x[1], feature_matrix_y[1])

x_train_df = pd.DataFrame(X_train, columns=full_feature_names)
y_train_df = pd.DataFrame(y_train, columns=['runtime', 'power'])
# print("X_train DataFrame:", x_train_df)
x_test_df = pd.DataFrame(X_test, columns=full_feature_names)
# print("X_test DataFrame:", x_test_df)

y_train_df = pd.DataFrame(y_train, columns=['runtime', 'power'])
print("y_train DataFrame:", y_train_df)
y_test_df = pd.DataFrame(y_test, columns=['runtime', 'power'])
print("y_test DataFrame:", y_test_df)

ValueError: Found input variables with inconsistent numbers of samples: [33, 33, 36, 33]

In [61]:
def splitFeatureMatrices(feature_matrix_x, feature_matrix_y, container_names_x, container_names_y):
    """
    Split the feature matrices into training and testing sets.
    """
    X_train, X_test, y_train, y_test, train_container_names_x, test_container_names_x, train_container_names_y, test_container_names_y = train_test_split(
        feature_matrix_x, feature_matrix_y, container_names_x, container_names_y, test_size=0.2, random_state=42
    )
    print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")
    return X_train, X_test, y_train, y_test, train_container_names_x, test_container_names_x, train_container_names_y, test_container_names_y

X_train, X_test, y_train, y_test, train_container_names_x, test_container_names_x, train_container_names_y, test_container_names_y = splitFeatureMatrices(scaled_feature_matrix_x, scaled_feature_matrix_y, container_names_x, container_names_y)

x_train_df = pd.DataFrame(X_train, columns=full_feature_names)
x_test_df = pd.DataFrame(X_test, columns=full_feature_names)
y_train_df = pd.DataFrame(y_train, columns=['runtime', 'power'])
y_test_df = pd.DataFrame(y_test, columns=['runtime', 'power'])

Training set shape: (26, 36), Test set shape: (7, 36)


In [62]:
# Debugging
# Check variance of original feature matrices before kernelization
print("feature_matrix_x std:", np.std(scaled_feature_matrix_x))
print("feature_matrix_y std:", np.std(scaled_feature_matrix_y))

# After splitting
print("X_train (features) std:", np.std(X_train))
print("Y_train (features) std:", np.std(y_train))
print("X_test (features) std:", np.std(X_test))
print("Y_test (features) std:", np.std(y_test))

feature_matrix_x std: 0.9428090415820634
feature_matrix_y std: 1.0
X_train (features) std: 0.9049055257090838
Y_train (features) std: 0.9041293723900872
X_test (features) std: 1.055340888644453
Y_test (features) std: 1.2727008917726146


### Building the Kernel Matrix for Workflow Tasks

We build an $N \times N$ matrix $K_x$ where the $(i, j)$-th entry is the kernel evaluation $k_x(x_i, x_j)$,  
with $x_i$ and $x_j$ being the temporal signatures for tasks $i$ and $j$.

- **Each row and column** corresponds to a workflow task.
- **Each entry** $K_x[i, j] = k(x_i, x_j)$ measures the similarity between tasks $i$ and $j$ using a kernel function.
- We use the **Gaussian (RBF) kernel**, which measures similarity based on the Euclidean distance in feature space, scaled by a parameter $\sigma$.
- This kernel gives higher values when two tasks have similar temporal patterns.

In [None]:
# KCCA Model kernalizes the normalized input matrices itself.
# def computeKernelMatrices(X_train, X_test, y_train, y_test):
#     """
#     Compute the RBF kernel matrices for train and test splits.
#     Returns: K_x_train, K_x_test, K_y_train, K_y_test
#     """
#     K_x_train = rbf_kernel(X_train, X_train)
#     K_y_train = rbf_kernel(y_train, y_train)
#     K_x_test = rbf_kernel(X_test, X_train)
#     K_y_test = rbf_kernel(y_test, y_train)
#     print(f"K_x_train shape: {K_x_train.shape}, K_x_test shape: {K_x_test.shape}")
#     print(f"K_y_train shape: {K_y_train.shape}, K_y_test shape: {K_y_test.shape}")
#     return K_x_train, K_x_test, K_y_train, K_y_test

# K_x_train, K_x_test, K_y_train, K_y_test = computeKernelMatrices(X_train, X_test, y_train, y_test)

### Kernel Canonical Correlation Analysis (KCCA) Overview

The **KCCA algorithm** takes the kernel matrices \( K_x \) and \( K_y \) and solves a generalized eigenvector problem. This procedure finds subspaces in the linear space spanned by the eigenfunctions of the kernel functions such that projections onto these subspaces are **maximally correlated** [7]. Traditional Canonical Correlation Analysis (CCA) aims to find useful projections of features in each view of data by computing a weighted sum. However, due to its linearity, CCA may not extract meaningful descriptors of complex data.

Kernel MCCA (KMCCA) addresses this limitation by first projecting the data into a higher-dimensional feature space **before** performing CCA in that new space.

- We refer to these projections as the **resource usage projection** and the **metric projection**, respectively.
- If the linear space associated with the Gaussian (RBF) kernel can be interpreted as clusters in the original feature space, then KCCA finds **correlated pairs of clusters** in the resource usage vector space and the performance/power vector space.

**Workflow:**
1. **Compute kernel matrices** \( K_x \) and \( K_y \) for the resource and metric features.
2. **Fit KCCA** using the training data kernel matrices.
3. **Project data** into the maximally correlated subspaces for further analysis or prediction.

In [63]:
kmcca = KMCCA(kernel='rbf', n_components=2)
kmcca.fit([X_train, y_train])

0,1,2
,n_components,2
,kernel,'rbf'
,kernel_params,{}
,regs,
,signal_ranks,
,sval_thresh,0.001
,diag_mode,'A'
,center,True
,filter_params,False
,n_jobs,


In [64]:
# Project training and test data
X_train_proj, Y_train_proj = kmcca.transform([X_train, y_train])
X_test_proj, Y_test_proj = kmcca.transform([X_test, y_test])

In [65]:
# Inspect the projections
print("X_train_proj shape:", X_train_proj.shape)
print("Y_train_proj shape:", Y_train_proj.shape)
print("X_test_proj shape:", X_test_proj.shape)
print("Y_test_proj shape:", Y_test_proj.shape)

print("X_train_proj mean/std:", np.mean(X_train_proj), np.std(X_train_proj))
print("Y_train_proj mean/std:", np.mean(Y_train_proj), np.std(Y_train_proj))
print("First 3 rows of X_train_proj:\n", X_train_proj[:3])
print("First 3 rows of Y_train_proj:\n", Y_train_proj[:3])

X_train_proj shape: (26, 2)
Y_train_proj shape: (26, 2)
X_test_proj shape: (7, 2)
Y_test_proj shape: (7, 2)
X_train_proj mean/std: -3.752340318805217e-16 0.1386750490563072
Y_train_proj mean/std: -1.6653345369377348e-16 0.13867504905630715
First 3 rows of X_train_proj:
 [[-0.04995371  0.07769868]
 [-0.05053754 -0.21141113]
 [-0.16414332 -0.03278422]]
First 3 rows of Y_train_proj:
 [[-0.04995371  0.07769868]
 [-0.05053754 -0.21141113]
 [-0.16414332 -0.03278422]]


In [66]:
# Debug
# Evaluate the correlation between the projections for test data
corr, _ = pearsonr(X_test_proj.ravel(), Y_test_proj.ravel())
print(f"Pearson correlation coefficient between projections: {corr:.3f}")

Pearson correlation coefficient between projections: 0.117


### Predicting Power Consumption and Execution Time Using KCCA and Nearest Neighbors

The **consumed power** and **execution time** for a one-to-one mapping of clusters to servers can be estimated using a KCCA model trained offline. The process is as follows:

1. **Projection into Resource Subspace:**  
   The input vector, which includes the temporal signature of the resource usage profile (and optionally server capacity), is projected into the resource subspace learned by KCCA.

2. **Finding Nearest Neighbors:**  
   For each test sample's resource projection (`X_test_proj`), find its *k* nearest neighbors among the training projections (`X_train_proj`).  
   - This is typically done using Euclidean distance in the projected subspace.
   - In our implementation, we use `k = 3`.

3. **Inferring Metric Projections:**  
   For each test sample, collect the metric projections (`Y_train_proj`) of its *k* nearest neighbors.

4. **Weighted Sum for Prediction:**  
   Compute a weighted sum of these metric projections, where the weight for each neighbor is the inverse of its distance to the test sample (closer neighbors have more influence).

5. **Mapping Back to Original Metric Space:**  
   The weighted sum gives an estimated metric projection for the test sample.  
   - If your metrics were scaled, use the scaler's `inverse_transform` to convert the projection back to the original units (e.g., actual power and time).

6. **Selecting the Optimal Point (Optional):**  
   The optimal point of this iteration, with the minimum total power consumption, can be recorded for further analysis or scheduling.

---
- **KCCA** finds maximally correlated subspaces between resource usage and metrics, capturing nonlinear relationships.
- By using nearest neighbors in the resource subspace, you leverage the learned relationship to predict metrics for new, unseen resource profiles.
- The weighted sum ensures that predictions are more influenced by similar (closer) training

### Estimating Power and Execution Time via KCCA

The consumed power and execution time for the **one-to-one mapping** of clusters to servers can be estimated using a **KCCA model** trained offline.

Specifically:

- The **input vector** — consisting of the **temporal signature** of the resource usage profile and the **server capacity** — is projected into the **resource subspace**.
- The corresponding coordinates in the **metric subspace** are inferred using **k-nearest neighbors** (**\( k = 3 \)** in our implementation).
- The **metric projection** is then mapped back to the original **metrics**:
  - **Consumed Power**
  - **Execution Time**

A **weighted sum** of the metric projections from the \( k \) nearest neighbors is computed. The **weight** is defined as the **inverse of the distance** between projections in the subspace.

The **optimal point** — the configuration with the **minimum total power consumption** — is recorded for deployment.

Then, temporal signature of the new cluster is updated from the consolidated workloads. Such consolidation iterations stop when the clusters cannot be merged anymore since merging will incur significant interference, and/or the degradation in application performance will be intolerable.

In [67]:
# Starting point for unseen data used with KCCA model.
def predictKCCAUnseen(X_train_proj, Y_train_proj, X_test_proj, scaler_y, k=3):
    """
    Based on the projections of the trained Input features
    this func uses the unsupervised nearest neighbours algorithm
    to find the nearest points from the unseen points to the training data
    points. When the k-nearest neighbours are found a weighted average
    denotes the prediction for the unseen data.
    This function assumes that the unseen data is already scaled and prepared.
    
    Args:
        X_train_proj: Projected training resource features (n_train, n_components)
        Y_train_proj: Projected training metric features (n_train, n_components)
        X_test_proj: Projected test resource features (n_test, n_components)
        scaler_y: Fitted StandardScaler for the metric space (for inverse_transform)
        k: Number of nearest neighbors to use (default: 3)
    Returns:
        Y_pred: Predicted metrics (runtime, power) in original units for test data (n_test, 2)
    """

    nn = NearestNeighbors(n_neighbors=k, metric='euclidean')
    # Fit the model on the training projections
    nn.fit(X_train_proj)

    distances, indices = nn.kneighbors(X_test_proj)
    
    Y_pred_proj = []
    for i, (dists, idxs) in enumerate(zip(distances, indices)):
        print(f"\nTest sample {i}:")
        print(f"  Neighbor indices: {idxs}")
        print(f"  Neighbor distances: {dists}")
        actual_neighbor_values = scaler_y.inverse_transform(Y_train_proj[idxs])
        print(f"  Neighbor actual values (runtime, power):\n{actual_neighbor_values}")
        weights = 1 / (dists + 1e-8)  # Avoid division by zero
        weights /= weights.sum()
        y_pred = np.average(Y_train_proj[idxs], axis=0, weights=weights)
        Y_pred_proj.append(y_pred)
    Y_pred_proj = np.array(Y_pred_proj)
    Y_pred = scaler_y.inverse_transform(Y_pred_proj)
    print(Y_pred.shape)
    return Y_pred

# Call func on test data projections
Y_pred = predictKCCAUnseen(X_train_proj, Y_train_proj, X_test_proj, scaler_y, k=3)
df = pd.DataFrame(Y_pred, columns=['runtime', 'power'])
print("Predicted metrics for test data:")
print(df.head())


Test sample 0:
  Neighbor indices: [21 12 25]
  Neighbor distances: [0.25757548 0.47169147 0.62262336]
  Neighbor actual values (runtime, power):
[[ 49.67947404 208.87578334]
 [ 46.26360259 208.95199741]
 [ 44.0433985  204.27520267]]

Test sample 1:
  Neighbor indices: [13 10  3]
  Neighbor distances: [0.02786881 0.02987887 0.0380997 ]
  Neighbor actual values (runtime, power):
[[ 41.0166517  201.80385463]
 [ 41.32763086 204.50326628]
 [ 41.34085065 202.2226978 ]]

Test sample 2:
  Neighbor indices: [10  3 19]
  Neighbor distances: [0.03963433 0.04345446 0.05208114]
  Neighbor actual values (runtime, power):
[[ 41.32763086 204.50326628]
 [ 41.34085065 202.2226978 ]
 [ 41.34202291 200.24453794]]

Test sample 3:
  Neighbor indices: [ 0 20  8]
  Neighbor distances: [0.01027219 0.01667735 0.02377701]
  Neighbor actual values (runtime, power):
[[ 41.11928124 216.54380408]
 [ 41.46505633 218.25723027]
 [ 41.48122712 219.38680373]]

Test sample 4:
  Neighbor indices: [ 5 19 13]
  Neighbor di

#### Prediction Procedure Using k-Nearest Neighbors in Projected Space

| Step | Purpose |
|------|---------|
| `nn.kneighbors(X_test_proj)` | Find \(k\) nearest neighbors for each test sample in the projected resource space |
| Loop over test samples | For each test sample, perform the following steps: |
| `weights = 1 / (dists + 1e-8)` | Compute inverse-distance weights |
| `weights /= weights.sum()` | Normalize weights so they sum to 1 |
| `np.average(Y_train_proj[idxs], axis=0, weights=weights)` | Compute the weighted average of neighbors' metric projections |
| `Y_pred_proj.append(y_pred)` | Collect the prediction for this test sample |
| `np.array(Y_pred_proj)` | Stack all predictions into a single matrix |
| `scaler_y.inverse_transform(Y_pred_proj)` | Convert predictions back to original metric units |
| `return Y_pred` | Output the final predictions |

Actual test values I think the predictions are off and somewhat similar to each other due to lack of enough training data points.

Training set shape: (11, 4), Test set shape: (3, 4)
y_test DataFrame:     runtime       power
0  6.161708    0.297813
1  1.344221    0.008668
2  3.291482  683.055173

### Clustering for Workflow Task Consolidation

Our consolidation problem can be viewed as a **clustering problem**. Traditionally, clustering algorithms group similar objects together based on a defined similarity or distance metric. However, in our context, the objective is different:

- **Goal:** Group workflow tasks that are **dissimilar** in their resource requirements.
- **Rationale:** By consolidating tasks with dissimilar resource usage, we can minimize resource contention and interference, leading to more efficient utilization of system resources.

#### Custom Distance Measure

To achieve this, we need to define a **distance measure** that captures the **interference** between the resource requirements of workflow tasks. Instead of grouping tasks with similar profiles, our distance metric should:

- Assign **larger distances** to pairs of tasks with similar resource usage (to discourage grouping them together).
- Assign **smaller distances** to pairs of tasks with complementary or non-overlapping resource usage (to encourage their consolidation).

#### Approach

1. **Feature Extraction:**  
   Extract temporal signatures or resource usage profiles for each workflow task.

2. **Distance Metric Design:**  
   Design a distance function that reflects the potential for interference. For example, tasks with overlapping peaks in CPU, memory, or I/O usage should have a higher distance.

3. **Clustering Algorithm:**  
   Apply a clustering algorithm (e.g., k-means, hierarchical clustering, or custom algorithms) using the designed distance metric to group tasks.

4. **Consolidation:**  
   Assign tasks from different clusters to the same server or resource pool, ensuring that grouped tasks are as dissimilar as possible in their resource demands.

In [68]:
def compute_nextflow_task_peak_series(results_dir):
    """
    For every data source and metric, update each per-task CSV in 'containers' subfolders
    with the correct Nextflow task value from the finished containers file.
    """
    
    for root, dirs, files in os.walk(results_dir):
        if os.path.basename(root) == "containers":
            for file in files:
                if file.endswith(".csv"):
                    file_path = os.path.join(root, file)
                    task_df = pd.read_csv(file_path)
                    task_df['timestamp'] = pd.to_datetime(task_df['timestamp'], unit='ns')
                    task_df.set_index('timestamp', inplace=True)
                    value_cols = [col for col in task_df.columns if col.startswith('Value')]
                    if not value_cols:
                        # print(f"Skipping {file_path} as it does not contain 'value' column.")
                        continue
                    resource_series = task_df[value_cols[0]]
                    # print(f"Processing {file_path} with resource series: {resource_series.name}")
                    # Compute the peak series
                    peak_series = resource_series.resample('1s').max()
                    peak_df = peak_series.reset_index()
                    # print(peak_series.head())
                    peak_df.columns = ['timestamp','peak_value']
                    out_file = os.path.join(root, f"PEAK_Series_{file}")
                    peak_df.to_csv(out_file, index=False)
                    print(f"Updated {file_path} with peak series for distance calculation in {out_file}")
    return scoped_results

compute_nextflow_task_peak_series(scoped_results)

Updated /usr/local/bin/scoped_results/task_disk_data/cAdvisor/container_fs_reads_bytes_total/containers/nxf-DCzpSRywCGxNp3AqAQG8NWp1.csv with peak series for distance calculation in /usr/local/bin/scoped_results/task_disk_data/cAdvisor/container_fs_reads_bytes_total/containers/PEAK_Series_nxf-DCzpSRywCGxNp3AqAQG8NWp1.csv
Updated /usr/local/bin/scoped_results/task_disk_data/cAdvisor/container_fs_reads_bytes_total/containers/nxf-sAPk9MYk9lmJfaSU7zQ0wj3x.csv with peak series for distance calculation in /usr/local/bin/scoped_results/task_disk_data/cAdvisor/container_fs_reads_bytes_total/containers/PEAK_Series_nxf-sAPk9MYk9lmJfaSU7zQ0wj3x.csv
Updated /usr/local/bin/scoped_results/task_disk_data/cAdvisor/container_fs_reads_bytes_total/containers/nxf-1XjcuKDW0JDyexpUcyj4QJVi.csv with peak series for distance calculation in /usr/local/bin/scoped_results/task_disk_data/cAdvisor/container_fs_reads_bytes_total/containers/PEAK_Series_nxf-1XjcuKDW0JDyexpUcyj4QJVi.csv
Updated /usr/local/bin/scoped_r

'/usr/local/bin/scoped_results'

In [106]:
# Correlation can be NaN if two compared time series have no overlapping timestamps or if one of them has constant values.
# TODO: Include all affinity scores for all recoreded metrics somehow.
# Currently simplified for only one metric.
# For some containers one time disk reads or for short lived container the memory consumption is constant
# which hinders the correlation calculation.
# delete this task with constant peak series from the distance matrix
def computeTaskSignatureDistances(scoped_results, cleaned_container_temporal_signatures):
    """
    Compute the distances between task signatures in the feature space.
    Returns a distance matrix based on the custom distance function.
    
    Args:
        scoped_results: Result dictionary holding the peak time series for each task's metric.
    Returns:
        distance_matrix: Numpy array of distances between task signatures.
    """
    
    # TODO: Read in affinity scores from a file
    # affinity_scores = {'CpuCpu' : 0.9911813042346189, 'CpuMem': 0.9917353659941142,'FileIOCpu': 0.9919034036414974, 'FileIOFileIO': 0.8044261086909198, 'MemFileIO': 0.9676917390088071, 'MemMem': 0.9976325434871719}
    # pprint.pprint(affinity_scores) 
    # pprint.pprint(cleaned_container_temporal_signatures)
    
    # Compute pairwise distances between all tasks

    # Use the keys of cleaned_container_temporal_signatures as task identifiers
    nextflow_jobs = list(cleaned_container_temporal_signatures.keys())
    
    filtered_jobs = []
    for job in nextflow_jobs:
        peak_df = getPeakTimeSeriesForTask(job, scoped_results)
        if peak_df is not None and not peak_df['peak_value'].nunique() == 1:
            filtered_jobs.append(job)

    distance_matrix = np.full((len(filtered_jobs), len(filtered_jobs)), np.nan)

    for i in range (len(filtered_jobs)):
        for j in range (i + 1, len(filtered_jobs)):
            job_i = nextflow_jobs[i]
            job_j = nextflow_jobs[j]

            # Apply custom distance calculation
            
            # Get affinity score for the recorded metrics in the order of the config file
            # aff_score = affinity_scores.get('CpuCpu')
            
            # Get the peak time series 
            peak_df_i = getPeakTimeSeriesForTask(job_i, scoped_results)
            peak_df_j = getPeakTimeSeriesForTask(job_j, scoped_results)

            # Normalize to seconds from starting point
            if peak_df_i is not None and peak_df_j is not None:
                norm_i = normalizePeakTimeSeries(peak_df_i)
                norm_j = normalizePeakTimeSeries(peak_df_j)
                rel_time_i, peak_i = interpolatePeakTimeSeries(norm_i)
                rel_time_j, peak_j = interpolatePeakTimeSeries(norm_j)
            else:
                continue

            df_i = pd.DataFrame({'relative_time': rel_time_i, 'peak_value': peak_i})
            df_j = pd.DataFrame({'relative_time': rel_time_j, 'peak_value': peak_j})
            
            if df_i['peak_value'].nunique() == 1 or df_j['peak_value'].nunique() == 1:
                # print(f"Skipping {job_i} and {job_j} due to constant peak values.")
                continue
            
            # Align on timestamp
            merged = pd.merge(df_i, df_j, on="relative_time", suffixes=('_i', '_j'))
            if len(merged) < 2:
                # print(f"Not enough overlapping data for {job_i} and {job_j}. Skipping.")
                continue

            # Merge two peak time series only on overlapping timestamps for correlation calculation.
            peak_correlation = pearsonr(merged['peak_value_i'], merged['peak_value_j'])[0]
            print(peak_correlation)
            
            # Compute the distance based on the correlation coefficient and affinity score
            # distance = aff_score * (peak_correlation ** 2)
            distance = (peak_correlation ** 2)  # Simplified distance calculation
            print(distance)
            

            if not math.isnan(distance):
                distance_matrix[i, j] = distance
                distance_matrix[j, i] = distance
            else:
                print(f"Distance for {job_i} and {job_j} is NaN, skipping assignment.")

    return distance_matrix

# Helper to get the according peak time series for the current nextflow task.
def getPeakTimeSeriesForTask(task_name, scoped_results):
    """
    Get the peak time series for a given task name.
    """
    for root ,dirs, files in os.walk(scoped_results):
        if os.path.basename(root) == "containers":
            peak_file = os.path.join(root, f"PEAK_Series_{task_name}.csv")
            if os.path.exists(peak_file):
                return pd.read_csv(peak_file)
            else:
                # print(f"Peak time series file not found for task: {task_name}")
                return None
            
def normalizePeakTimeSeries(df):
    """
    Normalize the peak time series by scaling the 'peak_value' column.
    """

    df = df.copy()
    df['relative_time'] = (pd.to_datetime(df['timestamp']) - pd.to_datetime(df['timestamp']).iloc[0]).dt.total_seconds()

    return df

def interpolatePeakTimeSeries(df, n_points=100):
    df = df.copy()
    # Ensure rel_time is sorted
    df = df.sort_values('relative_time')
    # Interpolate peak_value to n_points
    interp_times = np.linspace(df['relative_time'].min(), df['relative_time'].max(), n_points)
    interp_values = np.interp(interp_times, df['relative_time'], df['peak_value'])
    return interp_times, interp_values

distance_matrix = computeTaskSignatureDistances(scoped_results, cleaned_container_temporal_signatures)
np.set_printoptions(suppress=False, nanstr='nan')
print(distance_matrix.shape)

  peak_correlation = pearsonr(merged['peak_value_i'], merged['peak_value_j'])[0]
  peak_correlation = pearsonr(merged['peak_value_i'], merged['peak_value_j'])[0]


nan
nan
Distance for nxf-9oUdO4sl7BdpAnHASfQhLlrX and nxf-UOzR0F8D9pclTsQSQbiMePeV is NaN, skipping assignment.
0.9432422182837985
0.889705882352941
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.9432422182837985
0.889705882352941
1.0
1.0
nan
nan
Distance for nxf-UOzR0F8D9pclTsQSQbiMePeV and nxf-1XjcuKDW0JDyexpUcyj4QJVi is NaN, skipping assignment.
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.9999999999999996
0.9999999999999991
1.0
1.0
1.0
1.0
1.0
1.0
0.8685850995582639
0.7544400751746392
1.0
1.0
0.9999999999999997
0.9999999999999993
1.0
1.0
0.9999999999999998
0.9999999999999996
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.9999999999999996
0.9999999999999991
0.9999999999999998
0.9999999999999996
0.9999999999999996
0.9999999999999991
0.4921409508495718
0.24220271550312067
0.4921409508495718
0.24220271550312067
1.0
1.0
1.0
1.0
0.9999999999999998
0.9999999999999996
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.9999999999999998
0.9999999999999996
1.0
1.0
1.0
1.0
1.0
1.0
0.9999999999999998
0.99999

In [70]:
# Run agglomerative clustering algorithm on the distance matrix
# TODO: Calculate the distance threshold based on distribution of the distances in the data.
def runAgglomerativeClustering(distance_matrix):
    """
    Run agglomerative clustering on the distance matrix.
    Returns the cluster labels for each task.
    
    Args:
        distance_matrix: Numpy array of distances between task signatures.
    Returns:
        cluster_labels: Numpy array of cluster labels for each task.
    """
    clustering = AgglomerativeClustering(n_clusters = None, metric='precomputed', linkage='average', compute_full_tree=True, compute_distances=False, distance_threshold=5.0).fit(distance_matrix)
    cluster_labels = clustering.labels_
    print(f"Number of clusters found: {len(set(cluster_labels))}")
    print(f"Cluster labels: {cluster_labels}")
    return cluster_labels

cluster_labels = runAgglomerativeClustering(distance_matrix)

Number of clusters found: 1
Cluster labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


In [None]:
# Update the  complete temporal signature feature vectors of the tasks that are considered for colocation.
def updateFeatureVectorOfColocatableTasks(cleaned_container_temporal_signatures, cluster_labels):
    """
    Update the feature vectors of the colocatable tasks by adding the cluster label.
    Returns the updated cleaned_container_temporal_signatures.
    
    Args:
        cleaned_container_temporal_signatures: Dictionary of cleaned container temporal signatures.
        cluster_labels: Numpy array of cluster labels for each task.
    Returns:
        updated_cleaned_container_temporal_signatures: Updated dictionary with cluster labels.
    """

In [None]:
# Update only the time series pattern of the feature vectors of the tasks that are considered for colocation.
def updateFeatureVectorOfColocatableTasks(cleaned_container_temporal_signatures, cluster_labels):
    """
    Update the feature vectors of the colocatable tasks by adding the cluster label.
    Returns the updated cleaned_container_temporal_signatures.
    
    Args:
        cleaned_container_temporal_signatures: Dictionary of cleaned container temporal signatures.
        cluster_labels: Numpy array of cluster labels for each task.
    Returns:
        updated_cleaned_container_temporal_signatures: Updated dictionary with cluster labels.
    """

### Random Forest Regressor Modeling
#### Some parts of the data processing are repeated here for better understandability.

In [29]:
# Build feature output matrix on runtime labels for Random Forest Regression model.
def buildFeatureMatriceOutput(fin_df):
    """
    Build the feature matrices for the finished containers.
    Returns the feature matrix and the container names for the task's runtimes.
    """
    container_runtime_power = {}

    fin_df['LifeTime_s'] = (
        fin_df['LifeTime']
        .str.extract(r'([0-9.]+)(ms|s)', expand=True)
        .assign(
            value=lambda x: x[0].astype(float),
            seconds=lambda x: np.where(x[1] == 'ms', x['value'] / 1000, x['value'])
        )['seconds']
    )

    for idx, row in fin_df.iterrows():
        container_runtime_power[row['Name']] = {
            'runtime': row['LifeTime_s'],
            # 'power': row['MeanPower']
        }
        
    feature_matrix_y = []
    container_names_y = []

    for container, info in container_runtime_power.items():
        if container not in cleaned_container_temporal_signatures:
            continue
        if pd.notna(info['runtime']):
            feature_matrix_y.append([info['runtime']])
            container_names_y.append(container)
            
    # Transform feature matrix K_y into numpy array
    feature_matrix_y = np.array(feature_matrix_y)
    print(f"Feature matrix shape: {feature_matrix_y.shape}")
    df = pd.DataFrame(feature_matrix_y, columns=['runtime'])
    print(df)

    return feature_matrix_y, container_names_y

finished_containers_dfs_with_power = addPowerToFinContainers(FIN_CONTAINERS, containers_with_all_metrics,POWER_STATS) 
reg_runtime_feature_matrix_y = buildFeatureMatriceOutput(finished_containers_dfs_with_power)

Feature matrix shape: (14, 1)
      runtime
0    3.291482
1    3.624712
2    4.641757
3    2.615504
4    2.757030
5   16.219254
6    1.906432
7    4.927431
8    4.828914
9    6.161708
10   1.252003
11   1.344221
12   1.494906
13  19.257875


In [46]:
# Build feature output matrix on power consumption labels for Random Forest Regression model.
def buildFeatureMatriceOutput(fin_df):
    """
    Build the feature matrices for the finished containers.
    Returns the feature matrix and the container names for the task's power consumption.
    """
    container_runtime_power = {}

    # fin_df['LifeTime_s'] = (
    #     fin_df['LifeTime']
    #     .str.extract(r'([0-9.]+)(ms|s)', expand=True)
    #     .assign(
    #         value=lambda x: x[0].astype(float),
    #         seconds=lambda x: np.where(x[1] == 'ms', x['value'] / 1000, x['value'])
    #     )['seconds']
    # )

    for idx, row in fin_df.iterrows():
        container_runtime_power[row['Name']] = {
            # 'runtime': row['LifeTime_s'],
            'power': row['MeanPower']
        }
        
    feature_matrix_y = []
    container_names_y = []

    for container, info in container_runtime_power.items():
        if container not in cleaned_container_temporal_signatures:
            continue
        if pd.notna(info['power']):
            feature_matrix_y.append(info['power'])
            container_names_y.append(container)
            
    # Transform feature matrix K_y into numpy array
    feature_matrix_y = np.array(feature_matrix_y)
    print(f"Feature matrix shape: {feature_matrix_y.shape}")
    df = pd.DataFrame(feature_matrix_y, columns=['power'])
    print(df)

    return feature_matrix_y, container_names_y

finished_containers_dfs_with_power = addPowerToFinContainers(FIN_CONTAINERS, containers_with_all_metrics,POWER_STATS) 
reg_power_feature_matrix_y = buildFeatureMatriceOutput(finished_containers_dfs_with_power)
pprint.pprint(reg_power_feature_matrix_y)

Feature matrix shape: (14,)
          power
0    683.055173
1     12.204298
2    107.747945
3    588.790631
4   1566.518561
5     80.176704
6      1.850064
7    189.087464
8     21.191534
9      0.297813
10     0.776517
11     0.008668
12     0.008729
13     1.580281
(array([6.83055173e+02, 1.22042981e+01, 1.07747945e+02, 5.88790631e+02,
       1.56651856e+03, 8.01767037e+01, 1.85006400e+00, 1.89087464e+02,
       2.11915336e+01, 2.97812820e-01, 7.76517000e-01, 8.66800000e-03,
       8.72900000e-03, 1.58028076e+00]),
 ['nxf-cPB62cVKMj0A2W3ZgiyXeXAy',
  'nxf-8HEIDLPLcFSgNV2onUEea8wK',
  'nxf-0X0tQJagkeWOAir2jS124FfK',
  'nxf-TrD9qyudd3YDIKNfgNkKpu9H',
  'nxf-qDilxwaxmY8uJ5TscM5fDPNc',
  'nxf-SX1AWI1RbvjBo0PJOwC1FAFw',
  'nxf-1AUOV7AhBGVUbCmee5WApTRX',
  'nxf-0mUZ0M8vpF30z1CEoXjCQQbH',
  'nxf-0pUrbbt0IplTwbj4uE7h1Lv0',
  'nxf-UY2XomSHbY5BM00lkqJ3KiSI',
  'nxf-bQCEmlIiekPOOtkHpYmKBSn7',
  'nxf-6NsMcpYNvIhqIRPUkkVmSPjV',
  'nxf-i3k55HVSqlStQlJ9rLveDORE',
  'nxf-l4UOQ6vq023FfdVkhpq6uhFB'])


In [50]:
# Scale the feature matrices for regression models with runtime output labels.
def scaleFeatureMatrices(feature_matrix_x, reg_runtime_feature_matrix_y):
    """
    Scale the feature matrices using StandardScaler.
    Returns the scaled feature matrices.
    """

    # Reshape to 2D array
    reg_runtime_y = np.array(reg_runtime_feature_matrix_y)
    print(reg_runtime_y)
    if reg_runtime_y.ndim == 1:
        reg_runtime_y = reg_runtime_y.reshape(-1,1)

    scaler_x = StandardScaler()
    scaler_y = StandardScaler()

    scaled_x = scaler_x.fit_transform(feature_matrix_x)
    scaled_y = scaler_y.fit_transform(reg_runtime_y)

    print(f"Scaled feature matrix X shape: {scaled_x.shape}")
    print(f"Scaled feature matrix Y shape: {scaled_y.shape}")
    
    return scaled_x, scaled_y, scaler_x, scaler_y

scaled_feature_matrix_x, scaled_reg_runtime_feature_matrix_y, scaler_x, reg_runtime_scaler_y = scaleFeatureMatrices(feature_matrix_x[0], reg_runtime_feature_matrix_y[0])

[[ 3.29148233]
 [ 3.62471178]
 [ 4.64175652]
 [ 2.61550419]
 [ 2.75703028]
 [16.21925428]
 [ 1.9064317 ]
 [ 4.92743082]
 [ 4.82891415]
 [ 6.16170765]
 [ 1.25200322]
 [ 1.34422121]
 [ 1.49490571]
 [19.25787532]]
Scaled feature matrix X shape: (14, 4)
Scaled feature matrix Y shape: (14, 1)


In [None]:
# Scale the feature matrices for regression models with power output labels.
def scaleFeatureMatrices(feature_matrix_x, reg_power_feature_matrix_y):
    """
    Scale the feature matrices using StandardScaler.
    Returns the scaled feature matrices.
    """

    # Reshape to 2D array
    reg_power_y = np.array(reg_power_feature_matrix_y)
    print(reg_power_y)
    if reg_power_y.ndim == 1:
        reg_power_y = reg_power_y.reshape(-1,1)

    scaler_x = StandardScaler()
    scaler_y = StandardScaler()

    scaled_x = scaler_x.fit_transform(feature_matrix_x)
    scaled_y = scaler_y.fit_transform(reg_power_y)

    print(f"Scaled feature matrix X shape: {scaled_x.shape}")
    print(f"Scaled feature matrix Y shape: {scaled_y.shape}")
    
    return scaled_x, scaled_y, scaler_x, scaler_y

scaled_feature_matrix_x, scaled_reg_power_feature_matrix_y, scaler_x, reg_power_scaler_y = scaleFeatureMatrices(feature_matrix_x[0], reg_power_feature_matrix_y[0])
# pprint.pprint(reg_power_feature_matrix_y[0])

[6.83055173e+02 1.22042981e+01 1.07747945e+02 5.88790631e+02
 1.56651856e+03 8.01767037e+01 1.85006400e+00 1.89087464e+02
 2.11915336e+01 2.97812820e-01 7.76517000e-01 8.66800000e-03
 8.72900000e-03 1.58028076e+00]
Scaled feature matrix X shape: (14, 4)
Scaled feature matrix Y shape: (14, 1)
array([6.83055173e+02, 1.22042981e+01, 1.07747945e+02, 5.88790631e+02,
       1.56651856e+03, 8.01767037e+01, 1.85006400e+00, 1.89087464e+02,
       2.11915336e+01, 2.97812820e-01, 7.76517000e-01, 8.66800000e-03,
       8.72900000e-03, 1.58028076e+00])


In [53]:
def splitFeatureMatrices(scaled_feature_matrix_x, scaled_feature_matrix_y, container_names_x, container_names_y):
    """
    Split the feature matrices into training and testing sets.
    """
    X_train, X_test, y_train, y_test, train_container_names_x, test_container_names_x, train_container_names_y, test_container_names_y = train_test_split(
        scaled_feature_matrix_x, scaled_feature_matrix_y, container_names_x, container_names_y, test_size=0.2, random_state=42
    )
    print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")
    return X_train, X_test, y_train, y_test, train_container_names_x, test_container_names_x, train_container_names_y, test_container_names_y

X_train, X_test, y_train_power, y_test_power, train_container_names_x, test_container_names_x, train_container_names_y, test_container_names_y = splitFeatureMatrices(scaled_feature_matrix_x, scaled_reg_power_feature_matrix_y, feature_matrix_x[1], feature_matrix_y[1])
# pprint.pprint(scaled_reg_power_feature_matrix_y.shape)

# x_train_df = pd.DataFrame(X_train, columns=all_feature_names)
# y_train_df = pd.DataFrame(y_train, columns=['power'])
# print("X_train DataFrame:", x_train_df)
# x_test_df = pd.DataFrame(X_test, columns=all_feature_names)
# print("X_test DataFrame:", x_test_df)

# y_train_df = pd.DataFrame(y_train, columns=['power'])
# print("y_train DataFrame:", y_train_df)
# y_test_df = pd.DataFrame(y_test, columns=['power'])
# print("y_test DataFrame:", y_test_df)

Training set shape: (11, 4), Test set shape: (3, 4)


In [56]:
def splitFeatureMatrices(scaled_feature_matrix_x, scaled_feature_matrix_y, container_names_x, container_names_y):
    """
    Split the feature matrices into training and testing sets.
    """
    X_train, X_test, y_train, y_test, train_container_names_x, test_container_names_x, train_container_names_y, test_container_names_y = train_test_split(
        scaled_feature_matrix_x, scaled_feature_matrix_y, container_names_x, container_names_y, test_size=0.2, random_state=42
    )
    print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")
    return X_train, X_test, y_train, y_test, train_container_names_x, test_container_names_x, train_container_names_y, test_container_names_y

X_train, X_test, y_train_runtime, y_test_runtime, train_container_names_x, test_container_names_x, train_container_names_y, test_container_names_y = splitFeatureMatrices(scaled_feature_matrix_x, scaled_reg_runtime_feature_matrix_y, feature_matrix_x[1], feature_matrix_y[1])
# pprint.pprint(scaled_reg_power_feature_matrix_y.shape)

# x_train_df = pd.DataFrame(X_train, columns=all_feature_names)
# y_train_df = pd.DataFrame(y_train, columns=['power'])
# print("X_train DataFrame:", x_train_df)
# x_test_df = pd.DataFrame(X_test, columns=all_feature_names)
# print("X_test DataFrame:", x_test_df)

# y_train_df = pd.DataFrame(y_train, columns=['power'])
# print("y_train DataFrame:", y_train_df)
# y_test_df = pd.DataFrame(y_test, columns=['power'])
# print("y_test DataFrame:", y_test_df)

Training set shape: (11, 4), Test set shape: (3, 4)


In [60]:
# Random Forest Regressor to predict the power of colocatable tasks
def trainPowerWithRandomForest(X, y):
    """
    Train a Random Forest regressor to predict power consumption based on the feature matrix.
    """
    
    X, y = make_regression(n_features=4, n_informative=2,
                        random_state=0, shuffle=False)

    regr = RandomForestRegressor(max_depth=2, random_state=0)

    regr.fit(X, y)

    return regr


def predictPowerWithRandomForest(regressor, test_Data):
    """
    Predict the power consumption using the trained Random Forest regressor.
    
    Args:
        regressor: Trained Random Forest regressor.
        test_data: Test data for prediction.
        
    Returns:
        Predicted power consumption values.
    """

    return regressor.predict(test_Data)

# Fit the model.
trainedPredictor = trainPowerWithRandomForest(X_train, y_train_power)
# Predict power consumption for the test data.
predicted_power = predictPowerWithRandomForest(trainedPredictor, X_test)



In [58]:
# Random Forest Regressor to predict the runtime of colocatable tasks
def trainRuntimeWithRandomForest(X, y):
    X, y = make_regression(n_features=4, n_informative=2,
                        random_state=0, shuffle=False)

    regr = RandomForestRegressor(max_depth=2, random_state=0)

    regr.fit(X, y)
    return regr
    
def predictRuntimeWithRandomForest(regressor, test_Data):
    """
    Predict the runtime using the trained Random Forest regressor.
    
    Args:
        regressor: Trained Random Forest regressor.
        test_data: Test data for prediction.
        
    Returns:
        Predicted runtime values.
    """

    return regressor.predict(test_Data)
    

# Fit the model.
trainedPredictor = trainRuntimeWithRandomForest(X_train, y_train_runtime)

# Predict the runtime for the test data.
predicted_runtime = predictRuntimeWithRandomForest(trainedPredictor, X_test)