In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mplhep as hep
from pathlib import Path

from __future__ import annotations
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np


# Apply the mplhep CMS style sheet, then override figure size and font size on top of it
plt.style.use(hep.style.CMS)
plt.rcParams.update({"figure.figsize": (10, 6), "font.size": 12})

In [None]:
# Configuration
# Root directory that is walked recursively to find card directories with an "outs" folder.
# NOTE(review): absolute, user-specific path — adjust before running under another account or machine.
CARDS_BASE_DIR = "/home/users/lumori/bbtautau/src/bbtautau/cards/26Jan6-ggf"
# Channel keys looped over when collecting results; "combined" adds no channel suffix to file names.
CHANNELS = ["combined", "hh", "he", "hm"]


# Find all card directories with outputs
def find_output_directories(base_dir=None):
    """Find all directories that hold an ``outs`` folder with at least one ``*.txt`` file.

    Args:
        base_dir: Root directory to search recursively. When omitted (``None``) the
            notebook-level ``CARDS_BASE_DIR`` is used, which is the original behaviour.

    Returns:
        Sorted list of directory paths (strings, as produced by ``os.walk``). Each entry
        is the parent of a qualifying ``outs`` folder, not the ``outs`` folder itself.
    """
    # Resolve the default lazily so the global is only needed when no root is passed
    search_root = CARDS_BASE_DIR if base_dir is None else base_dir

    output_dirs = []
    for root, dirs, _files in os.walk(search_root):
        if "outs" not in dirs:
            continue
        # An "outs" folder without any text output does not count as a result directory
        if any((Path(root) / "outs").glob("*.txt")):
            output_dirs.append(root)

    return sorted(output_dirs)


# Discover the card directories once; later cells iterate over `output_dirs`
output_dirs = find_output_directories()
print(f"Found {len(output_dirs)} directories with outputs:")
for d in map(Path, output_dirs):
    # Show each hit relative to the search root to keep the listing short
    print(f"  - {d.relative_to(CARDS_BASE_DIR)}")

In [None]:
def parse_asymptotic_limits(file_path):
    """Parse asymptotic limits from a log file.

    Args:
        file_path: Path (``str`` or ``Path``) of the text log to read.

    Returns:
        dict: Empty when ``file_path`` does not exist. Otherwise it contains

        * ``expected_2.5`` / ``expected_16.0`` / ``expected_50.0`` / ``expected_84.0`` /
          ``expected_97.5`` / ``observed``: floats, present only for lines that were found;
        * ``convergence_issues``: list of issue tags (``"did_not_converge"``,
          ``"no_limits_found"``), always present;
        * ``status``: ``"success"`` when no issue was tagged, ``"issues"`` otherwise, or
          ``"parse_error"`` when reading or converting the file content failed.
    """
    results = {}

    if not Path(file_path).exists():
        return results

    # One regex per quantity; each captures the number following "r <"
    patterns = {
        "expected_2.5": r"Expected\s+2\.5%:\s*r\s*<\s*([0-9.]+)",
        "expected_16.0": r"Expected\s+16\.0%:\s*r\s*<\s*([0-9.]+)",
        "expected_50.0": r"Expected\s+50\.0%:\s*r\s*<\s*([0-9.]+)",
        "expected_84.0": r"Expected\s+84\.0%:\s*r\s*<\s*([0-9.]+)",
        "expected_97.5": r"Expected\s+97\.5%:\s*r\s*<\s*([0-9.]+)",
        "observed": r"Observed\s+Limit:\s*r\s*<\s*([0-9.]+)",
    }

    # Registered up front so the key exists even if parsing fails half-way
    convergence_issues = []
    results["convergence_issues"] = convergence_issues

    try:
        with open(file_path, "r") as f:
            content = f.read()

        for key, pattern in patterns.items():
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                results[key] = float(match.group(1))

        # Check for convergence issues
        if "Minimization did NOT converge" in content:
            convergence_issues.append("did_not_converge")

        # A log without a single limit line (e.g. a crashed job) must not count as a success
        if not any(key in results for key in patterns):
            convergence_issues.append("no_limits_found")

        results["status"] = "success" if not convergence_issues else "issues"

    except (OSError, ValueError) as e:
        # OSError: unreadable file; ValueError: undecodable bytes or a malformed number
        print(f"Error parsing {file_path}: {e}")
        results["status"] = "parse_error"

    return results


def parse_background_fit(file_path):
    """Parse background fit results from a log file.

    Args:
        file_path: Path (``str`` or ``Path``) of the text log to read.

    Returns:
        dict: Empty when ``file_path`` does not exist. Otherwise it contains

        * ``fit_converged``: ``True`` if "Minimization success!" appears, ``False`` if only
          "Minimization did NOT converge" appears, ``None`` if neither does;
        * ``best_fit_nll``: float, only when a "best fit NLL = <number>" line is found;
        * ``function_calls``: number of occurrences of the text "FCN" in the log;
        * ``fit_issues``: list of tags among ``"hesse_not_posdef"``, ``"migrad_failed"``,
          ``"no_covariance"``.

        If reading the file fails, the error is printed and whatever was collected up to
        that point is returned.
    """
    results = {}

    if not Path(file_path).exists():
        return results

    try:
        with open(file_path, "r") as f:
            content = f.read()

        # Check fit status (a success message takes precedence over a failure message)
        if "Minimization success!" in content:
            results["fit_converged"] = True
        elif "Minimization did NOT converge" in content:
            results["fit_converged"] = False
        else:
            results["fit_converged"] = None

        # Extract fit statistics. The number pattern accepts an optional sign, decimals and
        # an exponent, so values such as "-1.5e+03" are not truncated at the "e".
        nll_match = re.search(
            r"best fit NLL\s*=\s*([-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?)",
            content,
        )
        if nll_match:
            results["best_fit_nll"] = float(nll_match.group(1))

        # Count function calls
        # NOTE(review): this counts every literal "FCN" in the log, which is only a proxy
        # for the number of minimizer calls — confirm against the log format.
        results["function_calls"] = content.count("FCN")

        # Check for specific issues
        issues = []
        if "Hesse matrix not pos-def" in content:
            issues.append("hesse_not_posdef")
        if "MIGRAD FAILS" in content:
            issues.append("migrad_failed")
        # NOTE(review): the two phrases are matched independently anywhere in the log, so
        # they need not belong to the same message.
        if "Covariance matrix" in content and "not available" in content:
            issues.append("no_covariance")

        results["fit_issues"] = issues

    except (OSError, ValueError) as e:
        # OSError: unreadable file; ValueError: undecodable bytes or a malformed number
        print(f"Error parsing background fit {file_path}: {e}")

    return results


# Smoke-test both parsers on the first discovered card directory
if len(output_dirs) > 0:
    test_dir = Path(output_dirs[0], "outs")
    print(f"Testing parsing on: {test_dir}")

    # Asymptotic-limit logs: parse the first match, if there is one
    limit_files = list(test_dir.glob("*AsymptoticLimits.txt"))
    if len(limit_files) > 0:
        test_results = parse_asymptotic_limits(limit_files[0])
        print(f"Sample limit parsing: {test_results}")

    # Background-fit logs: same treatment
    bfit_files = list(test_dir.glob("*MultiDimFit.txt"))
    if len(bfit_files) > 0:
        test_bfit = parse_background_fit(bfit_files[0])
        print(f"Sample background fit parsing: {test_bfit}")

In [None]:
# Signal region labels: ggf, vbf, or allsigs (ggf+vbf combined)
# The empty string stands for output files whose name carries no signal-region prefix.
SIG_REGION_LABELS = ["", "ggf", "vbf", "allsigs"]


def collect_all_results():
    """Collect results from all output directories.

    For every directory in ``output_dirs`` and every (channel, signal region) combination,
    the files ``<siglabel><channel>AsymptoticLimits.txt`` and
    ``<siglabel><channel>MultiDimFit.txt`` inside ``outs`` are parsed (the channel part is
    empty for ``"combined"``). A row is kept only if at least one parser returned something.

    Returns:
        pandas.DataFrame: One row per combination with results. The columns
        ``card_directory``, ``channel`` and ``sig_region`` are always present, even when
        nothing was found, so downstream cells can index them safely; the remaining columns
        are whatever keys the two parsers produced.
    """
    id_columns = ["card_directory", "channel", "sig_region"]
    all_results = []

    for card_dir in output_dirs:
        outs_dir = Path(card_dir) / "outs"
        card_name = Path(card_dir).relative_to(CARDS_BASE_DIR)

        print(f"Processing: {card_name}")

        # Check for different channel and signal region results
        for channel in CHANNELS:
            # The combined result carries no channel suffix in its file name
            channellabel = "" if channel == "combined" else channel

            for siglabel in SIG_REGION_LABELS:
                # Build label: siglabel + channel
                label = f"{siglabel}{channellabel}"

                limit_results = parse_asymptotic_limits(outs_dir / f"{label}AsymptoticLimits.txt")
                bfit_results = parse_background_fit(outs_dir / f"{label}MultiDimFit.txt")

                # Only add if we found some results
                if not (limit_results or bfit_results):
                    continue

                all_results.append(
                    {
                        "card_directory": str(card_name),
                        "channel": channel,
                        "sig_region": siglabel,
                        **limit_results,
                        **bfit_results,
                    }
                )

    if not all_results:
        # pd.DataFrame([]) would have no columns at all and break later column lookups
        return pd.DataFrame(columns=id_columns)

    return pd.DataFrame(all_results)


# Build the results table that every later cell reads from
results_df = collect_all_results()
print(f"\nCollected results from {len(results_df)} analyses")
print(f"Columns: {results_df.columns.tolist()}")
print("\nFirst few entries:")
display(results_df.head())

In [None]:
# Summary statistics
print("=== ANALYSIS SUMMARY ===")
print(f"Total analyses: {len(results_df)}")
print(f"Card directories: {results_df['card_directory'].nunique()}")
print(f"Signal regions: {sorted(sr for sr in results_df['sig_region'].unique() if sr)}")
print(f"Channels analyzed: {sorted(results_df['channel'].unique())}")

# Status summary (dropna=False so rows without a parsed limits file are counted as well)
if "status" in results_df.columns:
    print("\n=== LIMIT CALCULATION STATUS ===")
    for status, count in results_df["status"].value_counts(dropna=False).items():
        print(f"{status}: {count}")

# Convergence summary (dropna=False so fits with unknown or missing status stay visible)
if "fit_converged" in results_df.columns:
    print("\n=== BACKGROUND FIT CONVERGENCE ===")
    for conv, count in results_df["fit_converged"].value_counts(dropna=False).items():
        print(f"{conv}: {count}")

# Expected limits summary by signal region and channel
if "expected_50.0" in results_df.columns:
    # Clean up names
    channel_name_map = {"combined": "Combined", "hh": "τ_h τ_h", "he": "τ_h e", "hm": "τ_h μ"}
    sig_region_name_map = {"ggf": "ggF", "vbf": "VBF", "all": "ggF+VBF", "allsigs": "ggF+VBF"}

    # Group by signal region and channel, then relabel columns and index for display.
    # rename(..., level=...) maps index values label by label, so two keys that share a
    # display name ("all"/"allsigs") cannot trigger a duplicate-level error.
    limits_summary = (
        results_df.groupby(["sig_region", "channel"])["expected_50.0"]
        .agg(["count", "mean", "std", "min", "max"])
        .rename(
            columns={
                "count": "Count",
                "mean": "Mean",
                "std": "Std Dev",
                "min": "Min",
                "max": "Max",
            }
        )
        .rename(index=sig_region_name_map, level="sig_region")
        .rename(index=channel_name_map, level="channel")
        .rename_axis(index=["Signal Region", "Channel"])
    )

    print("\n=== EXPECTED LIMITS (50%) BY SIGNAL REGION AND CHANNEL ===")
    display(limits_summary.round(2))

In [None]:
# Create separate transposed tables for each signal region and channel
print("=== DETAILED RESULTS BY SIGNAL REGION AND CHANNEL ===")

# Get available columns
limit_cols = ["expected_2.5", "expected_16.0", "expected_50.0", "expected_84.0", "expected_97.5"]
available_limit_cols = [col for col in limit_cols if col in results_df.columns]

status_cols = ["status", "fit_converged"]
available_status_cols = [col for col in status_cols if col in results_df.columns]

# Columns shown in every table ('channel' and 'sig_region' are constant within one table)
summary_cols = ["card_directory"] + available_limit_cols + available_status_cols

# Create table for each signal region and channel
signal_regions = sorted(sr for sr in results_df["sig_region"].unique() if sr)  # exclude empty
channels = sorted(results_df["channel"].unique())

# Define nice column names
column_name_map = {
    "expected_2.5": "Expected -2σ",
    "expected_16.0": "Expected -1σ",
    "expected_50.0": "Expected Median",
    "expected_84.0": "Expected +1σ",
    "expected_97.5": "Expected +2σ",
    "status": "Status",
    "fit_converged": "Fit Converged",
}

# Define nice channel names
channel_name_map = {"combined": "Combined", "hh": "τ_h τ_h", "he": "τ_h τ_e", "hm": "τ_h τ_μ"}

# Define nice signal region names
sig_region_name_map = {"ggf": "ggF", "vbf": "VBF", "all": "ggF+VBF", "allsigs": "ggF+VBF"}

for signal_region in signal_regions:
    sig_region_display = sig_region_name_map.get(signal_region, signal_region.upper())
    print(f"\n{'='*60}")
    print(f"Signal Region: {sig_region_display}")
    print(f"{'='*60}")

    for channel in channels:
        # Filter by both signal region and channel
        filtered_data = results_df[
            (results_df["sig_region"] == signal_region) & (results_df["channel"] == channel)
        ]

        if len(filtered_data) == 0:
            continue

        summary_table = filtered_data[summary_cols].copy()

        # Format the table - round to 2 decimal places
        for col in available_limit_cols:
            summary_table[col] = summary_table[col].round(2)

        # Clean up card_directory names (remove underscores, make prettier)
        summary_table["card_directory"] = (
            summary_table["card_directory"].str.replace("_", " ", regex=False).str.title()
        )

        # Transpose the table: one column per card directory, one row per quantity
        transposed_table = summary_table.set_index("card_directory").T

        # Rename the index (row names) to be more readable
        transposed_table.index = [
            column_name_map.get(idx, idx.replace("_", " ").title())
            for idx in transposed_table.index
        ]

        channel_display_name = channel_name_map.get(channel, channel.upper())
        print(f"\n--- {channel_display_name} Channel ---")
        display(transposed_table)

In [None]:
plt.style.use(hep.style.CMS)
hep.style.use("CMS")

from boostedhh import hh_vars

years = ["2022", "2022EE", "2023", "2023BPix"]

# The median line and both bands need all five expected-limit quantiles, so all of them
# (not just the median) must be present before plotting.
required_cols = [
    "expected_2.5",
    "expected_16.0",
    "expected_50.0",
    "expected_84.0",
    "expected_97.5",
]

if len(results_df) > 0 and all(col in results_df.columns for col in required_cols):
    # Keep only rows with full bands available
    plot_df = results_df.dropna(subset=required_cols).copy()

    # One figure is drawn per card directory
    card_dirs = sorted(plot_df["card_directory"].unique())

    # Nice names for display
    channel_name_map = {
        "combined": "Combined",
        "hh": "$τ_hτ_h$",
        "he": "$τ_hτ_e$",
        "hm": "$τ_hτ_μ$",
    }
    sig_region_name_map = {"ggf": "ggF", "vbf": "VBF", "all": "ggF+VBF", "allsigs": "ggF+VBF"}

    # Row order within each signal region (combined, hh, hm, he)
    channel_order = ["combined", "hh", "hm", "he"]

    # Brazil plot colors
    yellow = "#FFCC00"  # 95% band (2σ)
    green = "#00CC00"  # 68% band (1σ)

    bar_height = 1.0  # Adjacent bars (no white space)

    if len(plot_df) > 0:
        # Identical for every figure, so it is computed once.
        # NOTE(review): the division by 1000 presumably converts pb^-1 to fb^-1 — confirm
        # against the units of hh_vars.LUMI.
        lumi_label = f"{np.sum([hh_vars.LUMI[year] for year in years]) / 1000:.1f}"

        for card_dir in card_dirs:
            # Filter data for this card directory
            card_df = plot_df[plot_df["card_directory"] == card_dir].copy()

            # Create region labels: "sig_region / channel" (built while 'channel' is still
            # a plain string column, before it is made categorical for sorting)
            card_df["region_label"] = card_df.apply(
                lambda row: (
                    f"{sig_region_name_map.get(row['sig_region'], row['sig_region'])}"
                    f" / {channel_name_map.get(row['channel'], row['channel'])}"
                ),
                axis=1,
            )

            # Sort by sig_region then channel with the custom channel ordering
            card_df["channel"] = pd.Categorical(
                card_df["channel"], categories=channel_order, ordered=True
            )
            card_df = card_df.sort_values(["sig_region", "channel"])

            regions = card_df["region_label"].tolist()
            n_regions = len(regions)

            # Create figure; height grows with the number of rows
            fig_height = max(4, 0.6 * n_regions + 2)
            fig, ax = plt.subplots(figsize=(10, fig_height), dpi=400)

            y_pos = np.arange(n_regions)

            # Extract values
            med = card_df["expected_50.0"].values
            low1 = card_df["expected_16.0"].values
            high1 = card_df["expected_84.0"].values
            low2 = card_df["expected_2.5"].values
            high2 = card_df["expected_97.5"].values

            # Plot 95% (2σ) band - yellow
            ax.barh(
                y_pos,
                high2 - low2,
                left=low2,
                height=bar_height,
                color=yellow,
                edgecolor="none",
                label="Expected ±2σ",
                zorder=1,
            )

            # Plot 68% (1σ) band - green
            ax.barh(
                y_pos,
                high1 - low1,
                left=low1,
                height=bar_height,
                color=green,
                edgecolor="none",
                label="Expected ±1σ",
                zorder=2,
            )

            # Plot median expected - black dashed line (only the first gets a legend entry)
            for i, (y, m) in enumerate(zip(y_pos, med)):
                ax.plot(
                    [m, m],
                    [y - bar_height / 2, y + bar_height / 2],
                    color="black",
                    linestyle="--",
                    linewidth=2,
                    zorder=3,
                    label="Median expected" if i == 0 else None,
                )

            # Reference line at r=1
            ax.axvline(x=1, color="red", linestyle="-", linewidth=1.5, alpha=0.7, label="SM (r=1)")

            # Formatting - keep y labels but hide tick marks
            ax.set_yticks(y_pos)
            ax.set_yticklabels(regions, fontsize=14)

            # Add expected values below each label in smaller font
            for y, m in zip(y_pos, med):
                ax.annotate(
                    f"Exp. {m:.1f}",
                    xy=(0, y + 0.07),
                    xytext=(-10, -10),
                    textcoords="offset points",
                    ha="right",
                    va="top",
                    fontsize=11,
                    xycoords=("axes fraction", "data"),
                )

            # Raw string: the backslashes belong to the math markup, not to Python escapes
            ax.set_xlabel(r"95% CL limit on $\sigma(pp\rightarrow HH) / \sigma_{SM}$", fontsize=16)
            ax.set_ylabel("")
            ax.tick_params(axis="y", length=0)  # Hide y tick marks, keep labels
            ax.tick_params(axis="x", which="major", labelsize=14)

            ax.invert_yaxis()
            ax.set_xscale("log")
            ax.grid(True, axis="x", ls=":", alpha=0.5, which="both")

            # Legend
            ax.legend(loc="upper right", fontsize=12)

            # Printed instead of a title so the figure itself stays unlabelled
            print(card_dir)

            # CMS label
            hep.cms.label(
                ax=ax,
                label="Work in Progress",
                data=True,
                year="2022-23",
                com="13.6",
                fontsize=14,
                lumi=lumi_label,
            )

            plt.tight_layout()
            plt.show()
            # Release the high-dpi figure so a long list of card directories does not pile up
            plt.close(fig)
    else:
        print("No data to plot")
else:
    print("Insufficient data for Brazil plots")

In [None]:
# Identify problematic analyses
print("=== ISSUES AND WARNINGS REPORT ===")

# Number of "FCN" occurrences above which a fit is flagged (arbitrary threshold)
HIGH_FUNCTION_CALL_THRESHOLD = 10000

# Convergence issues
if "fit_converged" in results_df.columns:
    # .eq(False) selects explicit failures only; unknown (None) and missing values are skipped
    non_converged = results_df[results_df["fit_converged"].eq(False)]
    if len(non_converged) > 0:
        print(f"\n⚠️  NON-CONVERGED BACKGROUND FITS ({len(non_converged)})")
        for _, row in non_converged.iterrows():
            sig_region = row.get("sig_region", "")
            print(f"  - {row['card_directory']} / {sig_region} / {row['channel']}")
    else:
        print("\n✅ All background fits converged")

# Limit calculation issues
if "status" in results_df.columns:
    # Rows without a parsed limits file have a missing status and are listed here too
    failed_limits = results_df[results_df["status"] != "success"]
    if len(failed_limits) > 0:
        print(f"\n⚠️  LIMIT CALCULATION ISSUES ({len(failed_limits)})")
        for _, row in failed_limits.iterrows():
            sig_region = row.get("sig_region", "")

            # The cell holds NaN instead of a list when no issue list was recorded for the row
            raw_issues = row.get("convergence_issues", [])
            issues = ", ".join(raw_issues) if isinstance(raw_issues, (list, tuple)) else ""

            status = row["status"] if isinstance(row["status"], str) else "missing"
            print(
                f"  - {row['card_directory']} / {sig_region} / {row['channel']}: {status} ({issues})"
            )
    else:
        print("\n✅ All limit calculations successful")

# Outlier limits (unusually high or low), using the 1.5 x IQR rule on the median expected limit
if "expected_50.0" in results_df.columns:
    valid_limits = results_df.dropna(subset=["expected_50.0"])
    if len(valid_limits) > 1:
        Q1 = valid_limits["expected_50.0"].quantile(0.25)
        Q3 = valid_limits["expected_50.0"].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        outliers = valid_limits[
            (valid_limits["expected_50.0"] < lower_bound)
            | (valid_limits["expected_50.0"] > upper_bound)
        ]

        if len(outliers) > 0:
            print(f"\n📊 OUTLIER LIMITS ({len(outliers)})")
            print(f"   Normal range: {lower_bound:.3f} - {upper_bound:.3f}")
            for _, row in outliers.iterrows():
                sig_region = row.get("sig_region", "")
                print(
                    f"  - {row['card_directory']} / {sig_region} / {row['channel']}: {row['expected_50.0']:.3f}"
                )
        else:
            print("\n✅ No outlier limits detected")

# Function call warnings
if "function_calls" in results_df.columns:
    high_calls = results_df[results_df["function_calls"] > HIGH_FUNCTION_CALL_THRESHOLD]
    if len(high_calls) > 0:
        print(f"\n⏱️  HIGH FUNCTION CALL COUNT ({len(high_calls)})")
        for _, row in high_calls.iterrows():
            sig_region = row.get("sig_region", "")
            print(
                f"  - {row['card_directory']} / {sig_region} / {row['channel']}: {row['function_calls']} calls"
            )

print("\n=== END REPORT ===")

In [None]:
# Save results to CSV for further analysis
output_file = Path(CARDS_BASE_DIR) / "analysis_results_summary.csv"
results_df.to_csv(output_file, index=False)
print(f"Results saved to: {output_file}")

# Create a summary report
report_file = Path(CARDS_BASE_DIR) / "analysis_report.txt"
with open(report_file, "w") as f:
    f.write("HiggsCombine Analysis Report\n")
    f.write("=" * 30 + "\n\n")

    f.write(f"Generated: {pd.Timestamp.now()}\n\n")

    f.write("SUMMARY:\n")
    f.write(f"- Total analyses: {len(results_df)}\n")
    f.write(f"- Card directories: {results_df['card_directory'].nunique()}\n")
    f.write(f"- Channels: {', '.join(sorted(results_df['channel'].unique()))}\n\n")

    if "expected_50.0" in results_df.columns:
        combined_limits = results_df[results_df["channel"] == "combined"]["expected_50.0"].dropna()
        if len(combined_limits) > 0:
            f.write("COMBINED CHANNEL EXPECTED LIMITS:\n")
            f.write(f"- Best limit: {combined_limits.min():.3f}\n")
            f.write(f"- Median limit: {combined_limits.median():.3f}\n")
            f.write(f"- Worst limit: {combined_limits.max():.3f}\n\n")

    if "fit_converged" in results_df.columns:
        # Rate over fits whose outcome is known: rows without background-fit information
        # (missing or None) would otherwise be counted as failures in the denominator.
        known_fits = results_df["fit_converged"].dropna()
        if len(known_fits) > 0:
            conv_rate = known_fits.astype(bool).mean() * 100
            f.write(
                f"CONVERGENCE RATE: {conv_rate:.1f}% ({len(known_fits)} fits with known status)\n\n"
            )

    f.write("DETAILED RESULTS: See analysis_results_summary.csv\n")

print(f"Summary report saved to: {report_file}")
print("\n📁 Output files created:")
print(f"  - {output_file.name}")
print(f"  - {report_file.name}")