In [86]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import docker
import logging
import time
import concurrent.futures
from datetime import datetime
import csv
import re
import pprint
from sklearn.metrics.pairwise import rbf_kernel
from mvlearn.embed import KMCCA
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.neighbors import NearestNeighbors

In [23]:
# Locations of the monitoring results tree and of the two container
# lifecycle logs (finished and started containers).
results = "/usr/local/bin/results"
fin_containers = "/usr/local/bin/results/died_nextflow_containers.csv"
start_containers = "/usr/local/bin/results/started_nextflow_containers.csv"

# Walk the whole results tree. Every CSV is parsed (first column as index)
# and its path is reported; `data` ends up holding the last file that was read.
for root, dirs, files in os.walk(results):
    csv_names = [name for name in files if name.endswith(".csv")]
    for file in csv_names:
        file_path = os.path.join(root, file)
        data = pd.read_csv(file_path, index_col=0)
        print(f"Found CSV file: {file_path}")
        

Found CSV file: /usr/local/bin/results/started_nextflow_containers.csv
Found CSV file: /usr/local/bin/results/died_nextflow_containers.csv
Found CSV file: /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/container_network_transmit_bytes_total.csv
Found CSV file: /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-bEdYAgPSeAi8wXfUSgSNRVZn.csv
Found CSV file: /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-lIjuTGpZakXR7QSqkIOKbZoX.csv
Found CSV file: /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-WmrVWwD6tZ0ReLKDwuqF0kTo.csv
Found CSV file: /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-eGWq7LRmqmNv39ec2O2uOhMY.csv
Found CSV file: /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/n

In [24]:
# Sanity check 1: list the finished containers that do not appear in the
# started-containers log.
fin_df = pd.read_csv(fin_containers, index_col=0)
start_df = pd.read_csv(start_containers, index_col=0)

started_ids = start_df['ContainerID'].values
missing_containers_containeragent = [
    finished_id
    for finished_id in fin_df['ContainerID']
    if finished_id not in started_ids
]

if missing_containers_containeragent:
    print("The following containers are missing from the started containers list:")
    for container in missing_containers_containeragent:
        print(container)
print("Amount of missing containers found by agent:", len(missing_containers_containeragent))

# Check 2: Compare finished containers with task cAdvisor containers and print the diff
    


Amount of missing containers found by agent: 0


In [25]:
# Map every finished container's name to its work directory. Later cells use
# this mapping to match cAdvisor time series to workflow tasks.
df = pd.read_csv(fin_containers)
container_workdirs = dict(zip(df['Name'], df['WorkDir']))

for name in container_workdirs:
    workdir = container_workdirs[name]
    print(f"Container {name} has work directory {workdir}")

Container nxf-M1kNzYnYqdtmxNbd7Tqsx28F has work directory /storage/nf-core/exec/work/02/7ab2644b72f1c738b286bc0ae26180
Container nxf-8MN6NSbAfmEFQ0dgsPNZDPg2 has work directory /storage/nf-core/exec/work/a3/00115258d0fed84cebcb9411618ca4
Container nxf-XGKlZVbBA0pmGOBRcNy93glF has work directory /storage/nf-core/exec/work/c4/8edb1e8437cae7b565e44b4a2c09b5
Container nxf-QM1fN1IYuPAR3my3yJwK0VaE has work directory /storage/nf-core/exec/work/48/8c6ae251b33634a1436ac719190c44
Container nxf-sYRw50ZGcahwGZ1cClf2E9i7 has work directory /storage/nf-core/exec/work/7f/69f904e70ee7c939c33f0ac1d6c1ee
Container nxf-bEdYAgPSeAi8wXfUSgSNRVZn has work directory /storage/nf-core/exec/work/58/b52bef7fc95a4a579c7383476e6aae
Container nxf-6fUrFQAkCWQN7iuY4Wf06dN2 has work directory /storage/nf-core/exec/work/f5/780b19b8a2516325365a535e7b202a
Container nxf-ziOzW254Iky8DQ4sot6k5DnJ has work directory /storage/nf-core/exec/work/88/a943cfb7ea7924e017f0c56d5477bd
Container nxf-Q0djkdLfPuewHV5KqH29iNNp has work 

In [26]:
# Split every cAdvisor metric CSV into one CSV per Nextflow container, and
# report which monitored containers never show up in the cAdvisor data.
#
# Each metric CSV is read once: the per-container files are written and the
# set of container names is collected in the same pass.
nextflow_pattern = r"nxf-[A-Za-z0-9]{23}"
results = "/usr/local/bin/results"
missing_cAdvisor_containers = []  # not populated in this cell

cAdvisor_containers = set()
for root, dirs, files in os.walk(results):
    if os.path.basename(root) != "cAdvisor":
        continue
    cAdvisor_path = root
    for metric in os.listdir(cAdvisor_path):
        metric_path = os.path.join(cAdvisor_path, metric)
        if not os.path.isdir(metric_path):
            continue
        # Per-container files go into <metric>/containers/.
        containers_dir = os.path.join(metric_path, "containers")
        os.makedirs(containers_dir, exist_ok=True)
        for file in os.listdir(metric_path):
            if not file.endswith(".csv"):
                continue
            file_path = os.path.join(metric_path, file)
            df = pd.read_csv(file_path)
            col = 'name'
            for container_name in df[col].unique():
                # Keep only rows that belong to a Nextflow task container
                # (re.match anchors at the start of the name only).
                if pd.isna(container_name) or not re.match(nextflow_pattern, str(container_name)):
                    continue
                cAdvisor_containers.add(str(container_name))
                container_df = df[df[col] == container_name]
                out_path = os.path.join(containers_dir, f"{container_name}.csv")
                container_df.to_csv(out_path, index=False)

# Compare the containers seen by cAdvisor with the finished-container list.
workdir_containers = set(container_workdirs.keys())
missing_in_workdirs = cAdvisor_containers - workdir_containers
missing_in_cadvisor = workdir_containers - cAdvisor_containers
print("Containers in monitored list but NOT in cAdvisor:", missing_in_cadvisor)
print(len(missing_in_cadvisor))
                                

Containers in monitored list but NOT in cAdvisor: {'nxf-M1kNzYnYqdtmxNbd7Tqsx28F', 'nxf-e7z8SKrpJyoxLQm1KwPQU4vS', 'nxf-e9G2aTzZXExfs0EhgOcOXbgi', 'nxf-QM1fN1IYuPAR3my3yJwK0VaE', 'nxf-zSvN3uKWx73IHCFcXTcjkJIx', 'nxf-ziOzW254Iky8DQ4sot6k5DnJ'}
6


In [27]:
# Stamp every per-container cAdvisor time series (across all metrics) with the
# work directory of its container. Files whose container name is not in
# `container_workdirs` are left untouched.
for root, dirs, files in os.walk(results):
    if os.path.basename(root) != "containers":
        continue
    for file in files:
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(root, file)
        fin_container_df = pd.read_csv(file_path)
        # The file name (without extension) is the container name.
        container_name = os.path.splitext(file)[0]
        if container_name not in container_workdirs:
            continue
        workdir = container_workdirs[container_name]
        fin_container_df['WorkDir'] = workdir
        fin_container_df.to_csv(file_path, index=False)
        print(f"Updated {file_path} with work directory {workdir}")

Updated /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-bEdYAgPSeAi8wXfUSgSNRVZn.csv with work directory /storage/nf-core/exec/work/58/b52bef7fc95a4a579c7383476e6aae
Updated /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-lIjuTGpZakXR7QSqkIOKbZoX.csv with work directory /storage/nf-core/exec/work/a7/404a25265432bb9f8f9bb99f462123
Updated /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-WmrVWwD6tZ0ReLKDwuqF0kTo.csv with work directory /storage/nf-core/exec/work/fc/b39bf10d7ddb8068491832d60da327
Updated /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-eGWq7LRmqmNv39ec2O2uOhMY.csv with work directory /storage/nf-core/exec/work/58/65457826cccaa35805374aa488368a
Updated /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-

In [28]:
# Split the slurm job metadata time series into one CSV per slurm job. The
# per-job files are written next to the source file.
slurm_metadata_path = "/usr/local/bin/results/task_metadata/slurm-job-exporter/slurm_job_id"
slurm_job_col = 'job_name'

for file in os.listdir(slurm_metadata_path):
    if not file.endswith("slurm_job_id.csv"):
        continue
    file_path = os.path.join(slurm_metadata_path, file)
    print(f"Reading file: {file_path}")
    df = pd.read_csv(file_path)

    # One pass over the frame: groupby skips rows with a missing job name and,
    # with sort=False, yields jobs in order of first appearance.
    for job_name, job_df in df.groupby(slurm_job_col, sort=False):
        print(f"Processing job: {job_name}")
        # A path separator inside the job name would point outside the
        # target directory, so it is replaced before building the file name.
        safe_name = str(job_name).replace(os.sep, "_")
        out_path = os.path.join(slurm_metadata_path, f"{safe_name}.csv")
        job_df.to_csv(out_path, index=False)
        print(f"Saved data for {job_name} to {out_path}")

Reading file: /usr/local/bin/results/task_metadata/slurm-job-exporter/slurm_job_id/slurm_job_id.csv
job_name
Processing job: nf-NFCORE_SAREK_PREPARE_GENOME_BWAMEM1_INDEX_(genome.fasta)
Saved data for nf-NFCORE_SAREK_PREPARE_GENOME_BWAMEM1_INDEX_(genome.fasta) to /usr/local/bin/results/task_metadata/slurm-job-exporter/slurm_job_id/nf-NFCORE_SAREK_PREPARE_GENOME_BWAMEM1_INDEX_(genome.fasta).csv
job_name
Processing job: nf-NFCORE_SAREK_PREPARE_INTERVALS_GATK4_INTERVALLISTTOBED_(genome)
Saved data for nf-NFCORE_SAREK_PREPARE_INTERVALS_GATK4_INTERVALLISTTOBED_(genome) to /usr/local/bin/results/task_metadata/slurm-job-exporter/slurm_job_id/nf-NFCORE_SAREK_PREPARE_INTERVALS_GATK4_INTERVALLISTTOBED_(genome).csv
job_name
Processing job: nf-NFCORE_SAREK_PREPARE_INTERVALS_CREATE_INTERVALS_BED_(genome.interval_list)
Saved data for nf-NFCORE_SAREK_PREPARE_INTERVALS_CREATE_INTERVALS_BED_(genome.interval_list) to /usr/local/bin/results/task_metadata/slurm-job-exporter/slurm_job_id/nf-NFCORE_SAREK_PRE

In [29]:
# Add the nf-core task name (the slurm job name) to the finished containers
# file by matching the container's WorkDir against the slurm job's work_dir.
# The result is written back to `fin_containers`; `fin_df` stays in the kernel.
for file in os.listdir(slurm_metadata_path):
    if not file.endswith("slurm_job_id.csv"):
        continue
    file_path = os.path.join(slurm_metadata_path, file)
    print(f"Reading file: {file_path}")
    df = pd.read_csv(file_path)
    fin_df = pd.read_csv(fin_containers)

    # Every column read below must exist; report exactly what is checked.
    if 'WorkDir' not in fin_df.columns or not {'work_dir', 'job_name'}.issubset(df.columns):
        print("WorkDir, work_dir or job_name column missing in DataFrames.")
        continue

    # Rows lacking either key cannot be matched.
    incomplete = df['work_dir'].isna() | df['job_name'].isna()
    for idx in df.index[incomplete]:
        print(f"Skipping row {idx} due to missing WorkDir or slurm_job.")

    # The source is a time series, so each (work_dir, job_name) pair repeats
    # many times. keep='last' preserves "the last row wins" when one work_dir
    # carries several job names.
    pairs = df.loc[~incomplete, ['work_dir', 'job_name']].drop_duplicates(keep='last')
    for work_dir, slurm_job in pairs.itertuples(index=False, name=None):
        fin_df.loc[fin_df['WorkDir'] == work_dir, 'Nextflow'] = slurm_job

    # Write back the updated fin_df
    fin_df.to_csv(fin_containers, index=False)
    print(f"Updated {fin_containers} with slurm job info.")

Reading file: /usr/local/bin/results/task_metadata/slurm-job-exporter/slurm_job_id/slurm_job_id.csv
Updated /usr/local/bin/results/died_nextflow_containers.csv with slurm job info.


In [30]:
# Copy the Nextflow task name into every per-container time series file (all
# `containers` directories under all metrics), matching on the work directory.
# NOTE: `fin_df` is not defined in this cell; it must already be in the kernel
# with 'WorkDir' (and 'Nextflow') columns.
for root, dirs, files in os.walk(results):
    if os.path.basename(root) != "containers":
        continue
    for file in files:
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(root, file)
        container_df = pd.read_csv(file_path)
        if 'WorkDir' not in container_df.columns:
            continue
        # The first row's WorkDir is used to identify the whole file.
        workdir = container_df['WorkDir'].iloc[0]
        match = fin_df[fin_df['WorkDir'] == workdir]
        if match.empty or 'Nextflow' not in match.columns:
            continue
        nextflow_value = match['Nextflow'].values[0]
        container_df['Nextflow'] = nextflow_value
        container_df.to_csv(file_path, index=False)
        print(f"Updated {file_path} with Nextflow value {nextflow_value}")

Updated /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-bEdYAgPSeAi8wXfUSgSNRVZn.csv with Nextflow value nf-NFCORE_SAREK_SAREK_FASTQC_(test-test_L2)
Updated /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-lIjuTGpZakXR7QSqkIOKbZoX.csv with Nextflow value nf-NFCORE_SAREK_SAREK_VCF_QC_BCFTOOLS_VCFTOOLS_VCFTOOLS_TSTV_COUNT_(test)
Updated /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-WmrVWwD6tZ0ReLKDwuqF0kTo.csv with Nextflow value nf-NFCORE_SAREK_SAREK_BAM_APPLYBQSR_GATK4_APPLYBQSR_(test)
Updated /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/containers/nxf-eGWq7LRmqmNv39ec2O2uOhMY.csv with Nextflow value nf-NFCORE_SAREK_SAREK_BAM_BASERECALIBRATOR_GATK4_BASERECALIBRATOR_(test)
Updated /usr/local/bin/results/task_network_data/cAdvisor/container_network_transmit_bytes_total/container

In [10]:
# I don't think I need reindexing or interpolation here...
# Time-series data treatments
# Convert to convenient time-series format 
# for root, dirs, files in os.walk(results):
#     if os.path.basename(root) == "containers":
#         for file in files:
#             if file.endswith(".csv"):
#                 file_path = os.path.join(root, file)
#                 # Set the timestamp column as the index
#                 ts_container_df = pd.read_csv(file_path, index_col=0)
#                 print(f"Processing time-series data for {file_path}")
#                 # Convert the timestamp column to datetime format
#                 ts_container_df['timestamp'] = pd.to_datetime(ts_container_df['timestamp'], unit='ns')
#                 # Define a uniform time grid
#                 ts_container_df = ts_container_df.set_index("timestamp")
#                 # Define time grid based on existing data range
#                 ts_start = complete_ts_df.index.min().floor("500ms")
#                 ts_end = complete_ts_df.index.max().ceil("500ms")
#                 ts_grid = pd.date_range(start=ts_start, end=ts_end, freq="500ms")

#                 # Reindex to aligned uniform grid (NaNs may appear where no original data exists)
#                 complete_ts_df = complete_ts_df.reindex(ts_grid)

#                 # Save result
#                 complete_ts_df.index.name = "timestamp"
#                 file_name = os.path.splitext(file)[0] + "_ts.csv"
#                 out_path = os.path.join(metric_path, file_name)
#                 complete_ts_df.to_csv(out_path)

#                 print(f"Saved reindexed data for {file} to {out_path}") 
            

In [31]:
# Build a temporal signature for every finished container: one small dict of
# summary statistics per cAdvisor metric.
df = pd.read_csv(fin_containers)
container_temporal_signatures = {
    name: {'temporal_signatures': {}} for name in df['Name']
}

# Statistics that currently make up the feature vector, in this order.
SIGNATURE_FEATURES = ('peak_value', 'lowest_value', 'mean', 'variance')


def summarise_resource_series(resource_series):
    """Return summary statistics for one container's metric time series.

    Parameters
    ----------
    resource_series : pandas.Series
        Non-empty series of metric samples.

    Returns
    -------
    dict
        Keys: 'peak_value', 'lowest_value', 'mean', 'median', 'variance',
        'relative_variance', 'std_dev' and 'pattern_vector' (10 samples taken
        at evenly spaced positions across the series).
    """
    mean_value = resource_series.mean()
    variance = resource_series.var()
    # Relative variance is variance / mean^2; it is defined as 0 for a zero mean.
    relative_variance = 0.0 if mean_value == 0 else variance / mean_value**2
    sample_positions = np.round(np.linspace(0, len(resource_series) - 1, 10)).astype(int)
    return {
        'peak_value': resource_series.max(),
        'lowest_value': resource_series.min(),
        'mean': mean_value,
        'median': resource_series.median(),
        'variance': variance,
        'relative_variance': relative_variance,
        'std_dev': resource_series.std(),
        'pattern_vector': resource_series.iloc[sample_positions].to_numpy(),
    }


# Feature vectors
for root, dirs, files in os.walk(results):
    if os.path.basename(root) != "containers":
        continue
    # The parent directory of `containers` is named after the metric.
    metric_name = os.path.basename(os.path.dirname(root))
    for file in files:
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(root, file)
        ts_container_df = pd.read_csv(file_path)
        ts_container_df['timestamp'] = pd.to_datetime(ts_container_df['timestamp'], unit='ns')
        ts_container_df = ts_container_df.set_index('timestamp')
        value_cols = [col for col in ts_container_df.columns if col.startswith('Value')]
        if not value_cols:
            print(f"Skipping {file_path} as it does not contain 'value' column.")
            continue
        resource_series = ts_container_df[value_cols[0]]
        if resource_series.empty:
            # Nothing to summarise; positional sampling would fail on an empty series.
            print(f"Skipping {file_path} as it contains no samples.")
            continue

        stats = summarise_resource_series(resource_series)
        # TODO: extend the feature vector with median, relative_variance, std_dev,
        # pattern_vector and a server spec (GHz x Cores, GFlops, RAM, IOPS,
        # max network throughput) taken from the host benchmark in nextflow.
        feature_vector = {key: stats[key] for key in SIGNATURE_FEATURES}

        container_name = os.path.splitext(file)[0]
        if container_name in container_temporal_signatures:
            container_temporal_signatures[container_name]['temporal_signatures'][metric_name] = feature_vector

# Containers without a signature for any metric are reported and removed.
to_delete = [
    name for name, info in container_temporal_signatures.items()
    if not info['temporal_signatures']
]
for name in to_delete:
    print(f"Container {name} has no temporal signatures.")
none_counter = len(to_delete)
print(f"Total containers with no signature for any metric: {none_counter}")

for name in to_delete:
    del container_temporal_signatures[name]
    print(f"Deleted container {name} with no temporal signatures.")

pprint.pprint(container_temporal_signatures)
# Add server spec to feature vector

# Maybe resample the missing ones or leave them or fix monitoring...

Container nxf-M1kNzYnYqdtmxNbd7Tqsx28F has no temporal signatures.
Container nxf-QM1fN1IYuPAR3my3yJwK0VaE has no temporal signatures.
Container nxf-ziOzW254Iky8DQ4sot6k5DnJ has no temporal signatures.
Container nxf-e9G2aTzZXExfs0EhgOcOXbgi has no temporal signatures.
Container nxf-e7z8SKrpJyoxLQm1KwPQU4vS has no temporal signatures.
Container nxf-zSvN3uKWx73IHCFcXTcjkJIx has no temporal signatures.
Total containers with no signature for any metric: 6
Deleted container nxf-M1kNzYnYqdtmxNbd7Tqsx28F with no temporal signatures.
Deleted container nxf-QM1fN1IYuPAR3my3yJwK0VaE with no temporal signatures.
Deleted container nxf-ziOzW254Iky8DQ4sot6k5DnJ with no temporal signatures.
Deleted container nxf-e9G2aTzZXExfs0EhgOcOXbgi with no temporal signatures.
Deleted container nxf-e7z8SKrpJyoxLQm1KwPQU4vS with no temporal signatures.
Deleted container nxf-zSvN3uKWx73IHCFcXTcjkJIx with no temporal signatures.
{'nxf-2hPDvZ6LVCczKaGw8lACL1Wc': {'temporal_signatures': {'container_cpu_user_seconds_tot

In [40]:
# Assemble the resource-usage feature matrix X: one row per workflow task
# (container) and one column per (metric, feature) pair. A later cell turns X
# into the N x N Gaussian kernel matrix K_x, whose entry K_x[i, j] = k(x_i, x_j)
# measures how similar the temporal signatures of tasks i and j are.
all_metrics = sorted({
    metric
    for info in container_temporal_signatures.values()
    for metric in info['temporal_signatures']
})
print(f"All metrics found: {all_metrics}")

all_feature_names = sorted({
    feature_name
    for info in container_temporal_signatures.values()
    for feats in info['temporal_signatures'].values()
    for feature_name in feats
})

feature_matrix_x = []
container_names_x = []

for container, info in container_temporal_signatures.items():
    row = []
    for metric in all_metrics:
        feats = info['temporal_signatures'].get(metric) or {}
        if not feats:
            print(f"No features found for container {container} in metric {metric}. Filling with zeros.")
        # Every row gets one slot per (metric, feature) so the matrix stays
        # rectangular; missing metrics or features are filled with 0.0.
        row.extend(feats.get(name, 0.0) for name in all_feature_names)
    feature_matrix_x.append(row)
    container_names_x.append(container)

# Convert the rows into a numeric 2-D array.
feature_matrix_x = np.array(feature_matrix_x, dtype=float)
print(f"Feature matrix shape: {feature_matrix_x.shape}")

All metrics found: ['container_cpu_user_seconds_total', 'container_memory_usage_bytes', 'container_network_transmit_bytes_total']
Feature matrix shape: (16, 12)


In [90]:
# Assemble the performance feature matrix Y (currently runtime only; power
# consumption is not available yet). A later cell turns Y into the N x N
# kernel matrix K_y with entries k_y(y_i, y_j).
df = pd.read_csv(fin_containers)
container_runtime_power = {}

# One "<number><unit>" component of a duration string. 'ms' must be tried
# before 'm' and 's' so that milliseconds are not misread.
_DURATION_TOKEN = re.compile(r"([0-9]+(?:\.[0-9]*)?|\.[0-9]+)(ns|us|µs|μs|ms|s|m|h)")
_SECONDS_PER_UNIT = {'h': 3600.0, 'm': 60.0, 's': 1.0}
_UNITS_PER_SECOND = {'ms': 1e3, 'us': 1e6, 'µs': 1e6, 'μs': 1e6, 'ns': 1e9}


def lifetime_to_seconds(lifetime):
    """Convert a duration string such as '586.9ms', '6.5s' or '1m6.5s' to seconds.

    All components are summed, so hours and minutes are included. Returns NaN
    for missing values and for strings with no recognisable component.
    NOTE(review): assumes LifeTime uses Go-style duration strings; confirm
    against the agent that writes the file.
    """
    if pd.isna(lifetime):
        return np.nan
    tokens = _DURATION_TOKEN.findall(str(lifetime))
    if not tokens:
        return np.nan
    total_seconds = 0.0
    for number, unit in tokens:
        if unit in _SECONDS_PER_UNIT:
            total_seconds += float(number) * _SECONDS_PER_UNIT[unit]
        else:
            total_seconds += float(number) / _UNITS_PER_SECOND[unit]
    return total_seconds


df['LifeTime_s'] = df['LifeTime'].map(lifetime_to_seconds)

for name, runtime in zip(df['Name'], df['LifeTime_s']):
    container_runtime_power[name] = {
        'runtime': runtime,
        'power': None,
    }
pprint.pprint(container_runtime_power)

# Build Y, keeping only containers that also have a temporal signature so
# that the rows of Y correspond to the rows of the resource-usage matrix.
feature_matrix_y = []
container_names_y = []

for container, info in container_runtime_power.items():
    if container not in container_temporal_signatures:
        continue
    feature_matrix_y.append([info['runtime']])
    container_names_y.append(container)

feature_matrix_y = np.array(feature_matrix_y)
print(f"Feature matrix shape: {feature_matrix_y.shape}")

{'nxf-2hPDvZ6LVCczKaGw8lACL1Wc': {'power': None, 'runtime': 6.510277},
 'nxf-400sWIGEqpeJvCmEkkHqd3WL': {'power': None, 'runtime': 1.569657394},
 'nxf-6fUrFQAkCWQN7iuY4Wf06dN2': {'power': None, 'runtime': 2.515007942},
 'nxf-8MN6NSbAfmEFQ0dgsPNZDPg2': {'power': None, 'runtime': 3.451122649},
 'nxf-M1kNzYnYqdtmxNbd7Tqsx28F': {'power': None, 'runtime': 0.5869842270000001},
 'nxf-NkrAO6X3dbCedULPe2eMFWC5': {'power': None, 'runtime': 1.676209708},
 'nxf-Q0djkdLfPuewHV5KqH29iNNp': {'power': None, 'runtime': 16.42187386},
 'nxf-QM1fN1IYuPAR3my3yJwK0VaE': {'power': None, 'runtime': 0.355722555},
 'nxf-SkRx2tZV0XvNMWjBqYMxpnOk': {'power': None, 'runtime': 0.6204614199999999},
 'nxf-UnvZj8GD2RTcYMOCE3A3uV5L': {'power': None, 'runtime': 20.274242267},
 'nxf-WmrVWwD6tZ0ReLKDwuqF0kTo': {'power': None, 'runtime': 4.854509332},
 'nxf-XGKlZVbBA0pmGOBRcNy93glF': {'power': None, 'runtime': 1.558177832},
 'nxf-bEdYAgPSeAi8wXfUSgSNRVZn': {'power': None, 'runtime': 4.980742008},
 'nxf-e7z8SKrpJyoxLQm1KwPQ

In [69]:
# Standardise both feature matrices. Each matrix gets its own scaler, which is
# kept for later use.
# NOTE(review): the scalers are fitted on all samples, before the train/test
# split in the next cell, so test samples influence the scaling.
scaler_x = StandardScaler().fit(feature_matrix_x)
scaler_y = StandardScaler().fit(feature_matrix_y)
feature_matrix_x_scaled = scaler_x.transform(feature_matrix_x)
feature_matrix_y_scaled = scaler_y.transform(feature_matrix_y)

In [70]:
# Hold out 30% of the samples as a test set (fixed seed for reproducibility).
# The split is applied to the scaled feature matrices; the kernel matrices are
# computed from the resulting parts in a later cell.
split_parts = train_test_split(
    feature_matrix_x_scaled,
    feature_matrix_y_scaled,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, Y_train, Y_test = split_parts

In [71]:
# Training kernels: RBF kernel of the training samples against themselves.
K_x_train, K_y_train = rbf_kernel(X_train), rbf_kernel(Y_train)

# Test kernels: RBF kernel of each test sample against every training sample.
K_x_test, K_y_test = rbf_kernel(X_test, X_train), rbf_kernel(Y_test, Y_train)


In [89]:
# Debugging: print the overall standard deviation of the scaled feature
# matrices, first before the split and then for each part after it.
std_report = [
    ("feature_matrix_x std:", feature_matrix_x_scaled),
    ("feature_matrix_y std:", feature_matrix_y_scaled),
    ("X_train (features) std:", X_train),
    ("Y_train (features) std:", Y_train),
    ("X_test (features) std:", X_test),
    ("Y_test (features) std:", Y_test),
]
for label, values in std_report:
    print(label, np.std(values))

feature_matrix_x std: 0.9574271077563381
feature_matrix_y std: 1.0
X_train (features) std: 0.9255264757755896
Y_train (features) std: 0.9688549632949179
X_test (features) std: 1.02405846046536
Y_test (features) std: 1.065191746779106


In [75]:
# The KCCA algorithm takes the matrices K_x and K_y and solves the generalized
# eigenvector problem. This procedure finds subspaces in the linear space
# spanned by the eigenfunctions of the kernel functions such that projections
# onto these subspaces are maximally correlated [7]. We refer to such
# projections as resource usage projection and metric projection, respectively.
# If the linear space associated with the Gaussian kernel can be understood as
# clusters in the original feature space, then KCCA finds correlated pairs of
# clusters in the resource usage vector space and the performance/power vector
# space.
#
# Fit on the training kernels; they are already computed, hence 'precomputed'.
# NOTE(review): no `regs` argument is passed, so the library default applies;
# confirm whether regularisation is needed for a training set this small.
training_kernels = [K_x_train, K_y_train]
kmcca = KMCCA(kernel='precomputed')
kmcca.fit(training_kernels)

0,1,2
,n_components,1
,kernel,'precomputed'
,kernel_params,{}
,regs,
,signal_ranks,
,sval_thresh,0.001
,diag_mode,'A'
,center,True
,filter_params,False
,n_jobs,


In [76]:
# Project both views with the fitted model, first the training kernels and
# then the test kernels (test samples against training samples).
train_projections = kmcca.transform([K_x_train, K_y_train])
X_train_proj, Y_train_proj = train_projections
test_projections = kmcca.transform([K_x_test, K_y_test])
X_test_proj, Y_test_proj = test_projections

In [85]:
# Pearson correlation between the flattened resource-usage and metric
# projections of the test samples.
x_test_flat = X_test_proj.ravel()
y_test_flat = Y_test_proj.ravel()
corr, _ = pearsonr(x_test_flat, y_test_flat)
print(f"Pearson correlation coefficient between projections: {corr:.3f}")

Pearson correlation coefficient between projections: 0.998


In [None]:
# plt.scatter(X_test_proj, Y_test_proj, alpha=0.7)
# plt.xlabel("Resource usage projection (test)")
# plt.ylabel("Metric projection (test)")
# plt.title("KCCA projections on test set")
# plt.grid(True)
# plt.show()

In [91]:
# The consumed power and execution time for the one-to-one mapping of clusters to the servers
# can be estimated using the KCCA model trained offline. To be specific, the input vector, 
# which includes the temporal signature of the resource usage profile along with the server 
# capacity, is projected into the resource subspace. We then infer the coordinates in the metric 
# subspace using the k nearest neighbors, where k = 3 in our implementation. Finally, we map the 
# metric projection back to the metrics, which are the consumed power and the execution time. 
# A weighted sum of the metric projections from the nearest k neighbors has been used, with the weight 
# to be the reverse distance between coordinates of two projections in the subspace. The optimal point 
# of this iteration with the minimum total power consumption is recorded

# Use either nearest neighbors or nearest neighbors regression to predict the power consumption and execution time
# After fitting KCCA, you have projections of your training data (X_train_proj, Y_train_proj) and test data (X_test_proj).
# X_train_proj represents the resource usage profiles in the learned subspace.
# Y_train_proj represents the corresponding metrics (e.g., runtime, power) in the metric subspace.
# Find Nearest Neighbors in Resource Subspace

# For each test sample's resource projection (X_test_proj), find its k nearest neighbors among the training projections (X_train_proj).
# This is typically done using Euclidean distance.
# Infer Metric Projection Using Neighbors

# For each test sample, collect the metric projections (Y_train_proj) of its k nearest neighbors.
# Compute a weighted sum of these metric projections, where the weight for each neighbor is the inverse of its distance to the test sample (closer neighbors have more influence).
# Map Back to Original Metric Space

# The weighted sum gives you an estimated metric projection for the test sample.
# If your metrics were scaled, use the scaler's inverse_transform to convert the projection back to the original units
# Select Optimal Point (Optional)

# KCCA finds maximally correlated subspaces between resource usage and metrics.
# By using nearest neighbors in the resource subspace, you leverage the learned relationship to predict metrics for new, unseen resource profiles.
# The weighted sum ensures that predictions are more influenced by similar (closer) training samples.