In [97]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import docker
import logging
import time
import concurrent.futures
from datetime import datetime
import csv
import re
import pprint

In [None]:
# Locations of the monitoring results and of the started/finished container listings
results = "/usr/local/bin/results"
fin_containers = "/usr/local/bin/results/died_nextflow_containers.csv"
start_containers = "/usr/local/bin/results/started_nextflow_containers.csv"

# Lazily enumerate every CSV below the results directory, in os.walk order
csv_paths = (
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk(results)
    for filename in filenames
    if filename.endswith(".csv")
)

# Load each CSV (first column as index) and report it; `data` ends up holding the last file read
for file_path in csv_paths:
    data = pd.read_csv(file_path, index_col=0)
    print(f"Found CSV file: {file_path}")
        

In [None]:
# Sanity checks

# Check 1: every finished container should also be listed among the started containers
fin_df = pd.read_csv(fin_containers, index_col=0)
start_df = pd.read_csv(start_containers, index_col=0)

# IDs of all started containers, looked up once and reused for every membership test
started_ids = start_df['ContainerID'].values
missing_containers_containeragent = [
    container_id
    for container_id in fin_df['ContainerID']
    if container_id not in started_ids
]

if len(missing_containers_containeragent) > 0:
    print("The following containers are missing from the started containers list:")
    # One ID per line
    print(*missing_containers_containeragent, sep="\n")
print("Amount of missing containers found by agent:",len(missing_containers_containeragent))

# Check 2: Compare finished containers with task cAdvisor containers and print the diff
    


Amount of missing containers found by agent: 0


In [None]:
# Map every finished container's name to its working directory for entity matching
df = pd.read_csv(fin_containers)

# Store the work directory value itself (not a 1-tuple), so it can be broadcast
# into a DataFrame column and compared against other work-directory values.
# If a name occurs more than once, the last row wins.
container_workdirs = dict(zip(df['Name'], df['WorkDir']))

for name, workdir in container_workdirs.items():
    print(f"Container {name} has work directory {workdir}")

In [46]:
# Extract each cAdvisor task in its own file and dataframe, and compare the
# containers seen by cAdvisor with the containers that have a known work directory
nextflow_pattern = r"nxf-[A-Za-z0-9]{23}"
results = "/usr/local/bin/results"
missing_cAdvisor_containers = []


def _iter_cadvisor_metric_dirs(results_dir):
    """Yield the path of every directory located directly inside a directory named `cAdvisor`.

    The whole tree below `results_dir` is walked, so several `cAdvisor`
    directories are all handled.
    """
    for root, _dirs, _files in os.walk(results_dir):
        if os.path.basename(root) != "cAdvisor":
            continue
        for metric in os.listdir(root):
            metric_path = os.path.join(root, metric)
            if os.path.isdir(metric_path):
                yield metric_path


# Single pass over all metric CSVs: write one file per matching container and
# collect the container names at the same time (each CSV is parsed only once).
cAdvisor_containers = set()
col = 'instance'
for metric_path in _iter_cadvisor_metric_dirs(results):
    containers_dir = os.path.join(metric_path, "containers")
    os.makedirs(containers_dir, exist_ok=True)
    for file in os.listdir(metric_path):
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(metric_path, file)
        df = pd.read_csv(file_path)
        if col not in df.columns:
            # Without the container column there is nothing to split on
            print(f"Skipping {file_path} as it does not contain '{col}' column.")
            continue
        for container_name in df[col].unique():
            if pd.isna(container_name):
                continue
            # re.match anchors at the start only, so longer names with this prefix also match
            if not re.match(nextflow_pattern, str(container_name)):
                continue
            cAdvisor_containers.add(str(container_name))
            container_df = df[df[col] == container_name]
            # A container present in several CSVs of one metric is overwritten by the last one
            out_path = os.path.join(containers_dir, f"{container_name}.csv")
            container_df.to_csv(out_path, index=False)

workdir_containers = set(container_workdirs.keys())
missing_in_workdirs = cAdvisor_containers - workdir_containers
missing_in_cadvisor = workdir_containers - cAdvisor_containers
print("Containers in workdirs but NOT in cAdvisor:", missing_in_cadvisor)
print(len(missing_in_cadvisor))
                                

Containers in workdirs but NOT in cAdvisor: {'nxf-2FQONitXmvrAu2CTCWJklCig', 'nxf-oe20oA7rUrEOLaxCVHOIFH3y', 'nxf-LduiJ8a5AAaJRvyCZdx0ONWc', 'nxf-sIRLu96fyj05OQ1s7nEG7K8e', 'nxf-jGeaDE2H634XKY4r0NFzynRa'}
5


In [None]:
# Annotate every per-container cAdvisor time series file (across all metrics)
# with the working directory recorded for that container
for root, dirs, files in os.walk(results):
    # Only the per-container output directories are of interest
    if os.path.basename(root) != "containers":
        continue
    for file in files:
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(root, file)
        fin_container_df = pd.read_csv(file_path)
        # The file name without extension is the container name
        container_name, _extension = os.path.splitext(file)
        if container_name not in container_workdirs:
            continue
        workdir = container_workdirs[container_name]
        fin_container_df['WorkDir'] = workdir
        fin_container_df.to_csv(file_path, index=False)
        print(f"Updated {file_path} with work directory {workdir}")

In [None]:
# Extract slurm job metadata out of time-series data and write them into separate files
slurm_metadata_path = "/usr/local/bin/results/task_metadata/slurm-job-exporter/slurm_job_id"

# Columns dropped before the per-job files are written
rm_columns = ['num_cpus', 'work_dir','job_name','value', 'instance', 'partition', 'priority', 'run_time',
    'slurm_job_pid', 'std_err', 'std_in', 'submit_time', 'threads_per_core', 'user']
# Column whose values are used as the job name for splitting.
# NOTE(review): the job name is read from 'ave_vm_size' — presumably the exported
# CSV header is shifted relative to its values; confirm against the exporter output.
slurm_job_col = 'ave_vm_size'

for file in os.listdir(slurm_metadata_path):
    if not file.endswith("slurm_job_id.csv"):
        continue
    file_path = os.path.join(slurm_metadata_path, file)
    print(f"Reading file: {file_path}")
    # errors='ignore' already tolerates absent columns, so no try/except is needed
    df = pd.read_csv(file_path).drop(columns=rm_columns, errors='ignore')

    if slurm_job_col not in df.columns:
        print(f"Skipping {file_path}: column '{slurm_job_col}' not found.")
        continue

    # Write one CSV per distinct job name next to the source file
    for job_name in df[slurm_job_col].unique():
        if pd.isna(job_name):
            continue
        print(f"Processing job: {job_name}")
        job_df = df[df[slurm_job_col] == job_name]
        out_path = os.path.join(slurm_metadata_path, f"{job_name}.csv")
        job_df.to_csv(out_path, index=False)
        print(f"Saved data for {job_name} to {out_path}")

In [None]:
# Add the nf-core task name to the finished containers file
rm_columns = ['num_cpus', 'work_dir','job_name','value', 'instance', 'partition', 'priority', 'run_time',
    'slurm_job_pid', 'std_err', 'std_in', 'submit_time', 'threads_per_core', 'user']
# Columns of the slurm metadata that are read as work directory and job name.
# NOTE(review): presumably the exported CSV header is shifted relative to its
# values, which is why these names do not describe their content — confirm.
slurm_workdir_col = 'cpu_ids'
slurm_job_name_col = 'ave_vm_size'

for file in os.listdir(slurm_metadata_path):
    if not file.endswith("slurm_job_id.csv"):
        continue
    file_path = os.path.join(slurm_metadata_path, file)
    print(f"Reading file: {file_path}")
    df = pd.read_csv(file_path).drop(columns=rm_columns, errors='ignore')

    fin_df = pd.read_csv(fin_containers)

    # Check every column that is actually read below, not only 'num_tasks',
    # so a missing column is reported instead of raising KeyError mid-loop.
    missing_slurm_cols = [
        column
        for column in ('num_tasks', slurm_workdir_col, slurm_job_name_col)
        if column not in df.columns
    ]
    if 'WorkDir' not in fin_df.columns or missing_slurm_cols:
        print("WorkDir or num_tasks column missing in DataFrames.")
        if missing_slurm_cols:
            print(f"Missing slurm metadata columns: {missing_slurm_cols}")
        continue

    for idx, row in df.iterrows():
        work_dir = row[slurm_workdir_col]
        slurm_job = row[slurm_job_name_col]
        if pd.isna(work_dir) or pd.isna(slurm_job):
            print(f"Skipping row {idx} due to missing WorkDir or slurm_job.")
            continue
        # Update fin_df where WorkDir matches; later rows overwrite earlier ones
        fin_df.loc[fin_df['WorkDir'] == work_dir, 'Nextflow'] = slurm_job

    # Write back the updated fin_df
    fin_df.to_csv(fin_containers, index=False)
    print(f"Updated {fin_containers} with slurm job info.")

In [None]:
# Copy the Nextflow value from fin_df into every per-container file, matched on WorkDir.
# All `containers` directories below the results directory are processed.
for root, dirs, files in os.walk(results):
    if os.path.basename(root) != "containers":
        continue
    for file in files:
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(root, file)
        container_df = pd.read_csv(file_path)
        # Files that were never annotated with a work directory cannot be matched
        if 'WorkDir' not in container_df.columns:
            continue
        # The work directory of the file is taken from its first row
        workdir = container_df['WorkDir'].iloc[0]
        match = fin_df[fin_df['WorkDir'] == workdir]
        if match.empty or 'Nextflow' not in match.columns:
            continue
        # First matching finished container provides the value
        nextflow_value = match['Nextflow'].values[0]
        container_df['Nextflow'] = nextflow_value
        container_df.to_csv(file_path, index=False)
        print(f"Updated {file_path} with Nextflow value {nextflow_value}")

In [65]:
# I don't think I need reindexing or interpolation here...
# Time-series data treatments
# Convert to convenient time-series format 
# for root, dirs, files in os.walk(results):
#     if os.path.basename(root) == "containers":
#         for file in files:
#             if file.endswith(".csv"):
#                 file_path = os.path.join(root, file)
#                 # Set the timestamp column as the index
#                 ts_container_df = pd.read_csv(file_path, index_col=0)
#                 print(f"Processing time-series data for {file_path}")
#                 # Convert the timestamp column to datetime format
#                 ts_container_df['timestamp'] = pd.to_datetime(ts_container_df['timestamp'], unit='ns')
#                 # Define a uniform time grid
#                 ts_container_df = ts_container_df.set_index("timestamp")
#                 # Define time grid based on existing data range
#                 ts_start = complete_ts_df.index.min().floor("500ms")
#                 ts_end = complete_ts_df.index.max().ceil("500ms")
#                 ts_grid = pd.date_range(start=ts_start, end=ts_end, freq="500ms")

#                 # Reindex to aligned uniform grid (NaNs may appear where no original data exists)
#                 complete_ts_df = complete_ts_df.reindex(ts_grid)

#                 # Save result
#                 complete_ts_df.index.name = "timestamp"
#                 file_name = os.path.splitext(file)[0] + "_ts.csv"
#                 out_path = os.path.join(metric_path, file_name)
#                 complete_ts_df.to_csv(out_path)

#                 print(f"Saved reindexed data for {file} to {out_path}") 
            

In [None]:
# Start with an empty signature collection for every finished container
df = pd.read_csv(fin_containers)
container_temporal_signatures = {
    container_name: {'temporal_signatures': {}}
    for container_name in df['Name']
}

# Feature vectors: summarise each per-container time series, once per metric.
# The metric name is the directory that contains the `containers` directory.
pattern_points = 10  # number of evenly spaced samples kept as the pattern vector

for root, dirs, files in os.walk(results):
    if os.path.basename(root) != "containers":
        continue
    metric_name = os.path.basename(os.path.dirname(root))
    for file in files:
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(root, file)
        ts_container_df = pd.read_csv(file_path)
        ts_container_df['timestamp'] = pd.to_datetime(ts_container_df['timestamp'], unit='ns')
        ts_container_df = ts_container_df.set_index('timestamp')
        if 'value' not in ts_container_df.columns:
            print(f"Skipping {file_path} as it does not contain 'value' column.")
            continue
        resource_series = ts_container_df['value']
        if resource_series.empty:
            # Positional sampling below would raise IndexError on an empty series
            print(f"Skipping {file_path} as it contains no samples.")
            continue

        # Feature extraction
        peak_value = resource_series.max()
        lowest_value = resource_series.min()
        mean_value = resource_series.mean()
        median_value = resource_series.median()
        variance = resource_series.var()
        std_dev = resource_series.std()
        # Relative variance = variance normalised by the squared mean.
        # A zero mean would divide by zero, so it is reported as 0.0.
        if mean_value == 0:
            relative_variance = 0.0
        else:
            relative_variance = variance / (mean_value**2)
        # Evenly spaced positions from the first to the last sample
        # (positions repeat when the series is shorter than pattern_points)
        sample_positions = np.round(
            np.linspace(0, len(resource_series) - 1, pattern_points)
        ).astype(int)
        pattern_vector = resource_series.iloc[sample_positions].to_numpy()

        feature_vector = {
            'peak_value': peak_value, 'lowest_value': lowest_value, 'mean': mean_value, 'median': median_value,
            'variance': variance, 'relative_variance': relative_variance, 'std_dev': std_dev,
            'pattern_vector': pattern_vector
        }

        # Only containers listed in the finished containers file receive signatures
        container_name = os.path.splitext(file)[0]
        if container_name in container_temporal_signatures:
            container_temporal_signatures[container_name]['temporal_signatures'][metric_name] = feature_vector

# Containers for which no metric produced a feature vector
containers_without_signature = [
    name
    for name, info in container_temporal_signatures.items()
    if not info['temporal_signatures']
]
for name in containers_without_signature:
    print(f"Container {name} has no temporal signatures.")
none_counter = len(containers_without_signature)
print(f"Total containers with no signature for any metric: {none_counter}")
pprint.pprint(container_temporal_signatures)

# Add server spec to feature vector

# Convert the dict into df

# Maybe resample the missing ones or leave them or fix monitoring...

Container nxf-jGeaDE2H634XKY4r0NFzynRa has no temporal signatures.
Container nxf-oe20oA7rUrEOLaxCVHOIFH3y has no temporal signatures.
Container nxf-sIRLu96fyj05OQ1s7nEG7K8e has no temporal signatures.
Container nxf-LduiJ8a5AAaJRvyCZdx0ONWc has no temporal signatures.
Container nxf-2FQONitXmvrAu2CTCWJklCig has no temporal signatures.
Total containers with no signature for any metric: 5
{'nxf-00juGI1VV2A3pN4qiDxMXeiw': {'temporal_signatures': {'container_cpu_system_seconds_total': {'lowest_value': np.float64(0.06563),
                                                                                                 'mean': np.float64(0.06563),
                                                                                                 'median': np.float64(0.06563),
                                                                                                 'pattern_vector': array([0.06563, 0.06563, 0.06563, 0.06563, 0.06563, 0.06563, 0.06563,
       0.06563, 0.06563, 0.06563]),
   

In [None]:
# Workflow Task Clustering for resource labeling

In [None]:
# Kernel Canonical Correlation Analysis