In [5]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import docker
import logging
import time
import concurrent.futures
from datetime import datetime
import csv
import re

In [None]:
# Locations of the monitoring output: the results root and the two CSV files
# listing the Nextflow containers that died and that were started.
results = "/usr/local/bin/results"
fin_containers = "/usr/local/bin/results/died_nextflow_containers.csv"
start_containers = "/usr/local/bin/results/started_nextflow_containers.csv"

# Walk the whole results tree, load every CSV that is found and report its path.
# NOTE: `data` is rebound on every hit, so only the last CSV loaded stays bound.
for root, dirs, files in os.walk(results):
    csv_names = [name for name in files if name.endswith(".csv")]
    for file in csv_names:
        file_path = os.path.join(root, file)
        data = pd.read_csv(file_path, index_col=0)
        print(f"Found CSV file: {file_path}")
        

In [None]:
# Sanity checks

# Check 1: every finished container should also be listed as started.
fin_df = pd.read_csv(fin_containers, index_col=0)
start_df = pd.read_csv(start_containers, index_col=0)

# Finished container IDs that have no counterpart among the started IDs
missing_containers = [
    container_id
    for container_id in fin_df['ContainerID']
    if container_id not in start_df['ContainerID'].values
]

if missing_containers:
    print("The following containers are missing from the started containers list:")
    # One ID per line
    print(*missing_containers, sep="\n")
print("Amount of missing containers:",len(missing_containers))

# Check 2: Compare finished containers with task cpu data
# (no code for this check follows in this cell)

Amount of missing containers: 0


In [79]:
# Build a lookup from container name to its work directory, taken from the
# 'Name' and 'WorkDir' columns of the finished-containers CSV. If a name occurs
# more than once, the last row wins.
df = pd.read_csv(fin_containers)
container_workdirs = dict(zip(df['Name'], df['WorkDir']))

# Report the mapping, one container per line
for name, workdir in container_workdirs.items():
    print(f"Container {name} has work directory {workdir}")

Container nxf-bfJFtRFRw0GIbtJCiIuzsBYd has work directory /storage/nf-core/exec/work/74/d4bd0fff2b47205292a504c29ebe8c
Container nxf-8L2S9NgKP0YAIrhQ1YAfxlhT has work directory /storage/nf-core/exec/work/b7/9463339793f7be0528f2908d5cf85c
Container nxf-7YmtU0huW8HeHSt09MlA9RgN has work directory /storage/nf-core/exec/work/03/f64cb99f85b92bb2cc40c4188ae895
Container nxf-Iy0bbDZ1tASrFdK36P54Y7Ez has work directory /storage/nf-core/exec/work/8c/cb319d859a0b685c21bebbf768a47d
Container nxf-I6XrOifaCt8ZJMdhvkzJ0jaw has work directory /storage/nf-core/exec/work/7a/154783a3be78e8480d35b35325b601
Container nxf-TRybVxfkjefOE9l8ZAg0A2V0 has work directory /storage/nf-core/exec/work/d6/a640d2e4e94a90e56fab916b0d971b
Container nxf-9KChRJSoMLn41QtCOOygsbM4 has work directory /storage/nf-core/exec/work/d1/6e2e5eadbad624f0780ec57b51416b
Container nxf-17PZ1CXKNtrXYRnSVuSXIP90 has work directory /storage/nf-core/exec/work/8e/85b15fdfa75fc29216385eca848833
Container nxf-FK09qpVhYpNuW4qgv9xCpC0F has work 

In [None]:
# # Extract each cAdvisor task in its own file and dataframe
# nextflow_pattern = r"nxf-[A-Za-z0-9]{23}"
# results = "/usr/local/bin/results"

# for root, dirs, files in os.walk(results):
#     if os.path.basename(root) == "cAdvisor":
#         cAdvisor_path = root
#         for metric in os.listdir(cAdvisor_path):
#             metric_path = os.path.join(cAdvisor_path, metric)
#             if os.path.isdir(metric_path):
#                 containers_dir = os.path.join(metric_path, "containers")
#                 os.makedirs(containers_dir, exist_ok=True)
#                 for file in os.listdir(metric_path):
#                     if file.endswith(".csv"):
#                         file_path = os.path.join(metric_path, file)
#                         print(f"Processing file: {file_path}")
#                         df = pd.read_csv(file_path, index_col=0)
#                         col = 'instance'
#                         for container_name in df[col].unique():
#                             if pd.isna(container_name):
#                                 continue
#                             if re.match(nextflow_pattern, str(container_name)):
#                                 container_df = df[df[col] == container_name]
#                                 out_path = os.path.join(containers_dir, f"{container_name}.csv")
#                                 container_df.to_csv(out_path, index=False)
#                                 # print(f"Saved data for {container_name} to {out_path}")

# Add the containers workDir to each time-series entry
#
# `cAdvisor_path` is otherwise only assigned by the extraction step, which is
# commented out, so on a fresh kernel this cell failed with NameError. The
# cAdvisor directories are therefore located here, directly under `results`.
# Requires `results` and `container_workdirs` from the earlier cells.
cadvisor_dirs = [
    walk_root
    for walk_root, _subdirs, _names in os.walk(results)
    if os.path.basename(walk_root) == "cAdvisor"
]
if not cadvisor_dirs:
    print(f"No cAdvisor directory found under {results}")

for cAdvisor_path in cadvisor_dirs:
    for root, dirs, files in os.walk(cAdvisor_path):
        # Only directories named "containers" are touched
        if os.path.basename(root) != "containers":
            continue
        for file in files:
            if not file.endswith(".csv"):
                continue
            # The file stem is looked up as the container name
            container_name = os.path.splitext(file)[0]
            if container_name not in container_workdirs:
                # Nothing to add for this file, so do not read it at all
                continue
            file_path = os.path.join(root, file)
            workdir = container_workdirs[container_name]
            fin_container_df = pd.read_csv(file_path)
            # Same work directory on every row; the file is rewritten in place
            fin_container_df['WorkDir'] = workdir
            fin_container_df.to_csv(file_path, index=False)
            print(f"Updated {file_path} with work directory {workdir}")


In [None]:
# Split the slurm time-series data into one CSV per job
slurm_metadata_path = "/usr/local/bin/results/task_metadata/slurm-job-exporter/slurm_job_id"

for file in os.listdir(slurm_metadata_path):
    if not file.endswith("slurm_job_id.csv"):
        continue

    file_path = os.path.join(slurm_metadata_path, file)
    print(f"Reading file: {file_path}")

    # Columns that are not needed downstream; names absent from the file are ignored
    rm_columns = ['num_cpus', 'work_dir','job_name','value', 'instance', 'partition', 'priority', 'run_time',
        'slurm_job_pid', 'std_err', 'std_in', 'submit_time', 'threads_per_core', 'user']
    df = pd.read_csv(file_path).drop(columns=rm_columns, errors='ignore')

    # The values of this column are used as the job names for the output files.
    # NOTE(review): the column is called 'job_state' but is treated as the job
    # name here; presumably the exported labels are shifted. Confirm.
    slurm_job_col = 'job_state'

    # Missing values are dropped up front, so no per-job NaN check is needed
    for job_name in df[slurm_job_col].dropna().unique():
        job_df = df[df[slurm_job_col] == job_name]
        out_path = os.path.join(slurm_metadata_path, f"{job_name}.csv")
        job_df.to_csv(out_path, index=False)
        print(f"Saved data for {job_name} to {out_path}")

# Write the slurm job of each finished container back into the finished-containers
# CSV by matching the container's WorkDir against the slurm time-series data.
# Requires `slurm_metadata_path` and `fin_containers` from the earlier code.
for file in os.listdir(slurm_metadata_path):
    if not file.endswith("slurm_job_id.csv"):
        continue

    file_path = os.path.join(slurm_metadata_path, file)
    print(f"Reading file: {file_path}")

    # Columns that are not needed here; names absent from the file are ignored
    rm_columns = ['num_cpus', 'work_dir','job_name','value', 'instance', 'partition', 'priority', 'run_time',
        'slurm_job_pid', 'std_err', 'std_in', 'submit_time', 'threads_per_core', 'user']
    df = pd.read_csv(file_path).drop(columns=rm_columns, errors='ignore')

    fin_df = pd.read_csv(fin_containers)

    # 'job_state' is read below as well, so it must be part of the guard;
    # previously a file without it raised KeyError inside the loop.
    slurm_columns = {'num_tasks', 'job_state'}
    if 'WorkDir' not in fin_df.columns or not slurm_columns.issubset(df.columns):
        print("WorkDir, num_tasks or job_state column missing in DataFrames.")
        continue

    # NOTE(review): the work directory is taken from 'num_tasks' and the slurm
    # job from 'job_state'; presumably the exported labels are shifted. Confirm.
    incomplete = df['num_tasks'].isna() | df['job_state'].isna()
    for idx in df.index[incomplete.to_numpy()]:
        print(f"Skipping row {idx} due to missing WorkDir or slurm_job.")

    # One entry per work directory instead of one full-frame mask per
    # time-series row. Later rows win, exactly as when the rows were applied
    # one after another.
    usable = df.loc[~incomplete]
    job_by_workdir = dict(zip(usable['num_tasks'], usable['job_state']))

    if job_by_workdir:
        matched = fin_df['WorkDir'].map(job_by_workdir)
        if 'slurm_job' in fin_df.columns:
            # Keep existing values for work directories not present in this file
            matched = matched.combine_first(fin_df['slurm_job'])
        fin_df['slurm_job'] = matched

    # Write back the updated fin_df (overwrites the input CSV in place)
    fin_df.to_csv(fin_containers, index=False)
    print(f"Updated {fin_containers} with slurm job info.")

In [None]:
# Time-series data treatments