In [56]:
import os
import re
import pandas as pd
from pathlib import Path
from datetime import timedelta
import sys

In [57]:
sys.path.append('/groups/icecube/cyan/factory/DOMification')
from Enum.Flavour import Flavour
from Enum.EnergyRange import EnergyRange

In [52]:
def parse_logfile(logfilename: str) -> "tuple[dict, int | None, float | None, float | None, float | None]":
    """Extract size, memory and timing figures from one log file.

    The file is scanned line by line for four kinds of information:

    * ``Source part file size: <x> GB|MB``  -> source size, stored in MB.
    * ``Memory: <x> GB total, <y> GB available`` -> both stored in MB.
    * The rows between the 2nd and 3rd ``+----+----+`` border lines, read as
      ``| key | <number> <unit> |``. GB values are multiplied by 1024*1024 and
      MB values by 1024 (i.e. expressed in KB); any other unit is passed
      through unscaled. Rows whose value is not ``<number> <unit>`` are skipped.
    * ``Elapsed time: <h>h <m>m <s>s`` on a line that does not contain
      ``shard`` -> total elapsed seconds. The last line that parses wins.

    Parameters
    ----------
    logfilename : str
        Path of the log file to read.

    Returns
    -------
    tuple
        ``(parsed_table, total_elapsed_time, source_file_size, memory_total,
        memory_available)``. Every element except ``parsed_table`` is ``None``
        when the corresponding line was not found in the log.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    def parse_size(size_str):
        # Expects exactly "<number> <unit>"; anything else raises ValueError
        # (wrong token count on unpacking, or a non-numeric value in float()).
        value, unit = size_str.split()
        value = float(value)
        unit = unit.strip().upper()
        if unit == "GB":
            return value * 1024 * 1024
        elif unit == "MB":
            return value * 1024
        # Unrecognised units are returned unscaled.
        return value

    def parse_elapsed(line):
        # Returns whole seconds, or None when the line is not in h/m/s form.
        match = re.search(r'Elapsed time:\s*(\d+)h\s*(\d+)m\s*(\d+)s', line)
        if match:
            h, m, s = map(int, match.groups())
            return h * 3600 + m * 60 + s
        return None

    parsed_table = {}
    total_elapsed_time = None
    source_file_size = None
    memory_total = None
    memory_available = None

    inside_table = False
    plus_line_count = 0

    # Stream the file instead of materialising every line up front.
    with open(logfilename, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()

            # Source part file size
            if "Source part file size:" in line:
                match = re.search(r"Source part file size:\s*([\d.]+)\s*(GB|MB)", line)
                if match:
                    value, unit = match.groups()
                    source_file_size = float(value)
                    if unit == "GB":
                        source_file_size *= 1024  # Convert to MB

            # Memory: 251.00 GB total, 245.71 GB available
            if "Memory:" in line:
                match = re.search(r'Memory:\s*([\d.]+)\s*GB total,\s*([\d.]+)\s*GB available', line)
                if match:
                    # GB -> MB for both figures.
                    memory_total, memory_available = (float(x) * 1024 for x in match.groups())

            # Table detection based on '+----+----+' line: the 2nd border opens
            # the data rows and the 3rd closes them; later borders are ignored.
            if line.startswith('+') and line.endswith('+'):
                plus_line_count += 1
                if plus_line_count == 2:
                    inside_table = True
                elif plus_line_count == 3:
                    inside_table = False
                continue

            if inside_table:
                match = re.match(r"\|\s*(.*?)\s*\|\s*(.*?)\s*\|", line)
                if match:
                    key, val = match.groups()
                    try:
                        parsed_table[key.strip()] = parse_size(val.strip())
                    except ValueError:
                        # Row value is not "<number> <unit>" (e.g. a bare count): skip it.
                        continue

            # Total elapsed time (excluding shard lines)
            if "Elapsed time:" in line and "shard" not in line:
                elapsed = parse_elapsed(line)
                # Only overwrite on success, so a later line in a different
                # format cannot wipe out a total that was already parsed.
                if elapsed is not None:
                    total_elapsed_time = elapsed

    return parsed_table, total_elapsed_time, source_file_size, memory_total, memory_available


In [36]:
# One log path per subdirectory 22010-22018, in ascending subdirectory order.
log_files = [
    "/groups/icecube/cyan/factory/DOMification/PMTfication/log/22010/[20250420_140528]PMTfy_22010_4_51484765.log",
    "/groups/icecube/cyan/factory/DOMification/PMTfication/log/22011/[20250419_195851]PMTfy_22011_6_51481717.log",
    "/groups/icecube/cyan/factory/DOMification/PMTfication/log/22012/[20250419_092910]PMTfy_22012_25_51478675.log",
    "/groups/icecube/cyan/factory/DOMification/PMTfication/log/22013/[20250421_064014]PMTfy_22013_1_51484768.log",
    "/groups/icecube/cyan/factory/DOMification/PMTfication/log/22014/[20250419_185724]PMTfy_22014_4_51481698.log",
    "/groups/icecube/cyan/factory/DOMification/PMTfication/log/22015/[20250419_113242]PMTfy_22015_10_51478693.log",
    "/groups/icecube/cyan/factory/DOMification/PMTfication/log/22016/[20250421_153945]PMTfy_22016_1_51484769.log",
    "/groups/icecube/cyan/factory/DOMification/PMTfication/log/22017/[20250419_114455]PMTfy_22017_2_51478697.log",
    "/groups/icecube/cyan/factory/DOMification/PMTfication/log/22018/[20250419_134348]PMTfy_22018_18_51478717.log",
]

# Per-subdirectory names, bound positionally to the entries above.
(
    log_22010,
    log_22011,
    log_22012,
    log_22013,
    log_22014,
    log_22015,
    log_22016,
    log_22017,
    log_22018,
) = log_files


In [53]:
import pandas as pd
from pathlib import Path
from datetime import timedelta

def build_dataframe(log_files):
    """Summarise a collection of log files as one DataFrame row per log.

    Each path is handed to ``parse_logfile``; the returned figures are rescaled
    by the same factors as before (source size and memory / 1024 -> GB, table
    sizes / 1024 -> MB or / 1024**2 -> GB) and labelled with the name of the
    log's parent directory, converted to ``int``.

    Parameters
    ----------
    log_files : iterable of str
        Paths of the logs to summarise. The parent directory name of every
        path must be an integer literal.

    Returns
    -------
    pandas.DataFrame
        Columns ``subdir_no``, ``source_file_size_GB``, ``total_pmtfied_size_GB``,
        ``avg_pmtfied_size_MB``, ``truth_file_size_MB``, ``elapsed_time``,
        ``elapsed_time_sec``, ``memory_total_GB`` and ``memory_available_GB``.
        The columns are present even when ``log_files`` is empty. Values that
        the log did not provide are left missing.

    Raises
    ------
    ValueError
        If a parent directory name cannot be converted to ``int``.
    """
    columns = [
        "subdir_no",
        "source_file_size_GB",
        "total_pmtfied_size_GB",
        "avg_pmtfied_size_MB",
        "truth_file_size_MB",
        "elapsed_time",
        "elapsed_time_sec",
        "memory_total_GB",
        "memory_available_GB",
    ]

    def scaled(value, divisor):
        # Only None means "not found"; a measured 0 must survive the conversion.
        return None if value is None else value / divisor

    records = []

    for path in log_files:
        parsed_table, total_elapsed_time, source_file_size, memory_total, memory_available = parse_logfile(path)

        total_pmtfied_size = parsed_table.get("Total PMTfied Size")
        avg_pmtfied_size = parsed_table.get("Avg PMTfied Shard Size")
        truth_file_size = parsed_table.get("Truth File Size")

        # The directory that contains the log names the subdirectory number.
        subdir_no = Path(path).parts[-2]

        if total_elapsed_time is None:
            elapsed_text = None
        else:
            elapsed_text = str(timedelta(seconds=int(total_elapsed_time)))

        records.append({
            "subdir_no": int(subdir_no),
            "source_file_size_GB": scaled(source_file_size, 1024),
            "total_pmtfied_size_GB": scaled(total_pmtfied_size, 1024 * 1024),
            "avg_pmtfied_size_MB": scaled(avg_pmtfied_size, 1024),
            "truth_file_size_MB": scaled(truth_file_size, 1024),
            "elapsed_time": elapsed_text,
            "elapsed_time_sec": total_elapsed_time,
            "memory_total_GB": scaled(memory_total, 1024),
            "memory_available_GB": scaled(memory_available, 1024),
        })

    # Passing the column list keeps the schema stable for an empty input.
    return pd.DataFrame(records, columns=columns)


In [54]:
# Build the summary frame from every path in log_files.
df = build_dataframe(log_files)

In [55]:
# Display the summary frame as the cell output.
df

Unnamed: 0,subdir_no,source_file_size_GB,total_pmtfied_size_GB,avg_pmtfied_size_MB,truth_file_size_MB,elapsed_time,elapsed_time_sec,memory_total_GB,memory_available_GB
0,22010,29.299297,3.579619,166.61,1254.04,16:21:14,58874,251.0,247.82
1,22011,24.863604,1.273662,62.11,134.13,5:06:04,18364,61.95,58.92
2,22012,38.62708,0.349443,23.86,10.39,2:08:45,7725,61.95,58.4
3,22013,26.909941,2.446357,192.7,717.41,8:59:01,32341,251.0,245.47
4,22014,24.601543,0.746182,63.67,79.14,3:03:34,11014,61.95,60.31
5,22015,38.640029,0.293721,21.48,9.53,1:04:57,3897,251.0,246.06
6,22016,27.834746,2.896699,164.79,1022.43,11:55:28,42928,251.0,245.71
7,22017,24.722363,0.914258,55.07,114.24,2:01:43,7303,251.0,245.84
8,22018,38.643193,0.349531,19.88,12.28,2:18:50,8330,61.95,58.19


In [68]:
def generate_latex_tabular(df):
    r"""Render the summary frame as a LaTeX ``tabular`` using ``\toprule``-style rules.

    Each row is labelled with a neutrino-flavour symbol and an energy range,
    both looked up from ``subdir_no`` through ``EnergyRange``. Rows are sorted
    by those two labels and a flavour label is printed only on the first row of
    each consecutive run.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``subdir_no``, ``source_file_size_GB``,
        ``truth_file_size_MB``, ``total_pmtfied_size_GB``,
        ``avg_pmtfied_size_MB``, ``elapsed_time`` and ``memory_available_GB``.
        The frame is left untouched; all work happens on a copy.

    Returns
    -------
    str
        The complete ``tabular`` environment, lines joined with newlines.
        Numeric cells are printed with two decimals; missing cells (``None`` /
        NaN) are printed as ``--``.
    """
    flavour_symbols = (
        (Flavour.E, r"$\nu_{e}$"),
        (Flavour.MU, r"$\nu_{\mu}$"),
        (Flavour.TAU, r"$\nu_{\tau}$"),
    )
    # (source column, printed header, format as number?) in output order.
    layout = [
        ("Flavour", "Flavour", False),
        ("Energy Range", "Energy Range", False),
        ("source_file_size_GB", "source database file [GB]", True),
        ("truth_file_size_MB", "truth file [MB]", True),
        ("total_pmtfied_size_GB", "shard files total [GB]", True),
        ("avg_pmtfied_size_MB", "average shard files [MB]", True),
        ("elapsed_time", "processing duration [hr:min:sec]", False),
        ("memory_available_GB", "machine memory [GB]", True),
    ]
    missing = "--"

    def render(value, numeric):
        # None/NaN would either crash the ':.2f' format or print as 'nan'/'None'.
        if pd.isna(value):
            return missing
        return f"{value:.2f}" if numeric else str(value)

    # Annotate flavour and energy range columns separately
    flavour_list = []
    energy_range_list = []
    for subdir in df["subdir_no"].astype(str):
        flavour = EnergyRange.get_flavour(subdir)
        energy_range = EnergyRange.get_energy_range(subdir)
        flavour_list.append(
            next((symbol for member, symbol in flavour_symbols if flavour == member), "Unknown")
        )
        energy_range_list.append(f"${energy_range.string}$" if energy_range else "Unknown")

    # Work on a copy so the caller's frame does not silently gain two columns.
    table = df.copy()
    table["Flavour"] = flavour_list
    table["Energy Range"] = energy_range_list

    # NOTE(review): this sorts the LaTeX label strings lexicographically, so the
    # backslash in \mu and \tau orders those flavours ahead of "e" — confirm
    # that this row order is the intended one.
    table = table.sort_values(by=["Flavour", "Energy Range"]).reset_index(drop=True)
    table = table[[source for source, _, _ in layout]]

    # Start LaTeX tabular; the header row is derived from the same layout as the cells.
    latex = [
        r"\begin{tabular}{llrrrrlr}",
        r"    \toprule",
        "    " + " & ".join(header for _, header, _ in layout) + r" \\",
        r"    \midrule",
    ]

    # Write rows with flavour suppression for repeated blocks
    last_flavour = None
    for values in table.itertuples(index=False, name=None):
        cells = [render(value, numeric) for value, (_, _, numeric) in zip(values, layout)]
        if values[0] == last_flavour:
            cells[0] = ""
        last_flavour = values[0]
        latex.append("    " + " & ".join(cells) + r" \\")

    latex.append(r"    \bottomrule")
    latex.append(r"\end{tabular}")

    return "\n".join(latex)


In [69]:
# Render the summary frame as LaTeX and print the raw markup
# (print, not rich display, so the backslashes appear unescaped).
latex_code = generate_latex_tabular(df)
print(latex_code)


\begin{tabular}{llrrrrlr}
    \toprule
    Flavour & Energy Range & source database file [GB] & truth file [MB] & shard files total [GB] & average shard files [MB] & processing duration [hr:min:sec] & machine memory [GB] \\
    \midrule
    $\nu_{\mu}$ & $100GeV-10TeV$ & 29.30 & 1254.04 & 3.58 & 166.61 & 16:21:14 & 247.82 \\
     & $10TeV-1PeV$ & 24.86 & 134.13 & 1.27 & 62.11 & 5:06:04 & 58.92 \\
     & $1PeV-100PeV$ & 38.63 & 10.39 & 0.35 & 23.86 & 2:08:45 & 58.40 \\
    $\nu_{\tau}$ & $100GeV-10TeV$ & 27.83 & 1022.43 & 2.90 & 164.79 & 11:55:28 & 245.71 \\
     & $10TeV-1PeV$ & 24.72 & 114.24 & 0.91 & 55.07 & 2:01:43 & 245.84 \\
     & $1PeV-100PeV$ & 38.64 & 12.28 & 0.35 & 19.88 & 2:18:50 & 58.19 \\
    $\nu_{e}$ & $100GeV-10TeV$ & 26.91 & 717.41 & 2.45 & 192.70 & 8:59:01 & 245.47 \\
     & $10TeV-1PeV$ & 24.60 & 79.14 & 0.75 & 63.67 & 3:03:34 & 60.31 \\
     & $1PeV-100PeV$ & 38.64 & 9.53 & 0.29 & 21.48 & 1:04:57 & 246.06 \\
    \bottomrule
\end{tabular}
