# A. Summarize Highlighted Read Counts into a CSV File
# B. Plot a Stacked Bar Graph (Top 5 in Gray, Rest as One White Box)

In [6]:
import os
import re
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# =========================
# Config & paths
# =========================
# Folders that are searched for histogram_*.csv inputs, in copy order.
SRC_DIRS = [
    "fastq_7_8_9_10_11_12/6_align_histogram",
    "fastq_1_2_3_4_5_6/4_align_histogram",
]

# Working folder for the merged CSVs and its sub-folder for generated outputs.
MERGED_DIR = Path("step_allign_histogram")   # NOTE: keep existing folder name
SUMMARY_DIR = MERGED_DIR / "summary"
MERGED_DIR.mkdir(parents=True, exist_ok=True)
SUMMARY_DIR.mkdir(parents=True, exist_ok=True)

# RNAME to highlight for each step, listed in step order (step 1 first).
_HIGHLIGHT_RNAMES = (
    "seq_0001_1",
    "seq_0002_10",
    "seq_0005_101",
    "seq_0010_1010",
    "seq_0021_10101",
    "seq_0042_101010",
    "seq_0085_1010101",
    "seq_0170_10101010",
    "seq_0341_101010101",
    "seq_0682_1010101010",
    "seq_1365_10101010101",
    "seq_2730_101010101010",
)

# Highlight mapping: a sample whose name contains the key "_NNstep" gets the
# mapped RNAME highlighted in red. Keys are "_01step" .. "_12step".
HIGHLIGHT_MAPPING = {
    f"_{step:02d}step": rname
    for step, rname in enumerate(_HIGHLIGHT_RNAMES, start=1)
}

# Number of top-ranked rows drawn as individual gradient bars.
TOP_N = 5
# Gray base color (0-255 RGB) at the dark end of the gradient.
BASE_RGB = (137, 137, 138)


# =========================
# Utilities
# =========================
def copy_histograms(sources, dest: Path):
    """Copy ``histogram_*.csv`` files from each source folder into ``dest``.

    Files are never overwritten. When the plain file name is already taken in
    ``dest`` the copy is stored as ``<stem>__from_<source folder name><ext>``;
    if that name is taken as well, a numeric suffix (``_2``, ``_3``, ...) is
    appended until a free name is found.

    Args:
        sources: Iterable of directory paths (str or Path) to search.
            Non-existent directories are reported and skipped.
        dest: Destination directory; created if it does not exist.

    Returns:
        List of destination paths written, in copy order.
    """
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)

    copied = []
    for src in sources:
        srcp = Path(src)
        if not srcp.is_dir():
            print(f"⚠️  Missing source dir: {src}")
            continue
        # Sorted so the copy order (and thus collision naming) is reproducible.
        for fn in sorted(srcp.glob("histogram_*.csv")):
            out_path = dest / fn.name
            if out_path.exists():
                base, ext = os.path.splitext(fn.name)
                tagged = f"{base}__from_{srcp.name}"
                out_path = dest / f"{tagged}{ext}"
                # Two sources may share a leaf folder name, so the tagged name
                # can collide too; keep numbering instead of overwriting.
                serial = 2
                while out_path.exists():
                    out_path = dest / f"{tagged}_{serial}{ext}"
                    serial += 1
            shutil.copy2(fn, out_path)
            copied.append(out_path)

    if copied:
        print(f"✅ Copied {len(copied)} files into {dest}")
    else:
        print("⚠️  No histogram_*.csv copied (none found in sources).")
    return copied


def extract_step_number(name: str):
    """Return the integer N from the first '_<N>step' token in ``name``.

    Used as a sort key: names without such a token yield ``float("inf")`` so
    they sort after every numbered step.
    """
    found = re.search(r"_(\d+)step", name)
    if found is None:
        return float("inf")
    return int(found.group(1))


def blend_color(base_rgb, t: float):
    """Linearly mix ``base_rgb`` with white and scale the result to [0, 1].

    ``base_rgb`` is a 0-255 RGB triple. ``t=0`` gives the base color and
    ``t=1`` gives pure white. Returns a 3-tuple of floats.
    """
    start = np.asarray(base_rgb, dtype=float)
    pure_white = np.full(3, 255.0)
    mixed = (1 - t) * start + t * pure_white
    return tuple(mixed / 255.0)


# =========================
# Step A: Ensure merged_dir has CSVs
# =========================
# Reuse histogram CSVs already present in MERGED_DIR; copy them in from
# SRC_DIRS only when the folder holds none.
if any(MERGED_DIR.glob("histogram_*.csv")):
    print(f"ℹ️  Using existing CSVs in {MERGED_DIR}")
else:
    copy_histograms(SRC_DIRS, MERGED_DIR)


# =========================
# Step B: Build highlight summary CSV (formerly 'No. 2')
# =========================
# One summary row per histogram CSV: the read count of the step's highlighted
# RNAME, the total count, the highlight share in percent, and the ratio of the
# highlight count to the second-largest count in the file.
rows = []
for fp in sorted(MERGED_DIR.glob("histogram_*.csv")):
    # The glob guarantees the "histogram_" prefix, so slice it off instead of
    # str.replace(), which would also delete the text elsewhere in the name.
    # The ".csv" extension is kept in the File column.
    file_name = fp.name[len("histogram_"):]
    try:
        df = pd.read_csv(fp)
    except Exception as e:
        # Best-effort: report the unreadable file and move on to the next one.
        print(f"❌ Read fail: {fp.name} -> {e}")
        continue

    if not {"RNAME", "Count"}.issubset(df.columns):
        print(f"⚠️ Skip (missing RNAME/Count): {fp.name}")
        continue

    # Determine the highlight target RNAME by step suffix (first match wins)
    highlight_rname = ""
    for suffix, rname in HIGHLIGHT_MAPPING.items():
        if suffix in file_name:
            highlight_rname = rname
            break

    # Non-numeric or missing counts are treated as 0.
    df["Count"] = pd.to_numeric(df["Count"], errors="coerce").fillna(0).astype(int)
    total = int(df["Count"].sum())

    hl_cnt = int(df.loc[df["RNAME"] == highlight_rname, "Count"].sum()) if highlight_rname else 0
    hl_pct = round((hl_cnt / total) * 100, 2) if total > 0 else 0.0

    # Second-largest count; with a single row that row's count is used, and
    # an empty file gives 0.
    counts_sorted = df["Count"].sort_values(ascending=False).to_numpy()
    if counts_sorted.size >= 2:
        second = int(counts_sorted[1])
    elif counts_sorted.size == 1:
        second = int(counts_sorted[0])
    else:
        second = 0
    hl_vs_second = round((hl_cnt / second), 3) if second > 0 else 0.0

    rows.append([
        file_name,                  # File
        hl_cnt,                     # Highlight_Count
        total,                      # Total_Count
        hl_pct,                     # Highlight_Percentage
        highlight_rname,            # Highlight_RNAMEs
        hl_vs_second,               # Highlight_vs_SecondTop_Ratio
        extract_step_number(file_name)  # Step_Number (for sorting)
    ])

cols = [
    "File",
    "Highlight_Count",
    "Total_Count",
    "Highlight_Percentage",
    "Highlight_RNAMEs",
    "Highlight_vs_SecondTop_Ratio",
    "Step_Number"
]
highlight_df = pd.DataFrame(rows, columns=cols)
# Step_Number is only a sort key. Sort and drop it unconditionally so the CSV
# header is the same whether or not any rows were collected. A stable sort
# keeps files that share a step number in their alphabetical input order.
highlight_df = (
    highlight_df
    .sort_values("Step_Number", kind="mergesort")
    .drop(columns="Step_Number")
)
out_csv = SUMMARY_DIR / "highlight_result.csv"
highlight_df.to_csv(out_csv, index=False)
print(f"📌 Highlight summary saved to: {out_csv}")


# =========================
# Step C: Load, normalize, and plot stacked bars (formerly 'No. 1')
# =========================
# Load every histogram CSV, turn counts into per-sample fractions, and keep one
# DataFrame per sample with rows ordered by descending Count.
sample_rname_dfs = {}
for file_path in sorted(MERGED_DIR.glob("histogram_*.csv")):
    # The glob guarantees the "histogram_" prefix; slicing the stem removes
    # exactly that prefix and the extension, unlike str.replace(), which would
    # also delete the same text in the middle of a sample name.
    sample_name = file_path.stem[len("histogram_"):]
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        # Best-effort: report the unreadable file and move on to the next one.
        print(f"⚠️  Read fail: {file_path.name} -> {e}")
        continue

    if not {"RNAME", "Count"}.issubset(df.columns):
        print(f"⚠️  Skip (missing RNAME/Count): {file_path.name}")
        continue

    # Non-numeric or missing counts are treated as 0.
    df["Count"] = pd.to_numeric(df["Count"], errors="coerce").fillna(0).astype(int)
    if df["Count"].sum() == 0:
        # Nothing to normalize; would divide by zero below.
        print(f"⚠️  Skip (sum(Count)=0): {file_path.name}")
        continue

    df["Normalized_Count"] = df["Count"] / df["Count"].sum()
    # After reset_index the index label equals the row's rank (0 = largest).
    df = df.sort_values("Count", ascending=False).reset_index(drop=True)
    df["Sample"] = sample_name
    sample_rname_dfs[sample_name] = df

if not sample_rname_dfs:
    raise SystemExit("❌ No usable histogram_*.csv found to plot.")

# Sort samples by step number in sample name
sorted_samples = sorted(sample_rname_dfs.items(), key=lambda kv: extract_step_number(kv[0]))

# Plot
fig, ax = plt.subplots(figsize=(24, 12))

for sample_name, df in sorted_samples:
    # First mapping key contained in the sample name decides the highlight.
    matched_key = next((k for k in HIGHLIGHT_MAPPING if k in sample_name), None)
    highlight_rname = HIGHLIGHT_MAPPING.get(matched_key)

    bottom = 0.0    # running top of the stack for this sample
    rest_sum = 0.0  # total height of rows that get no individual segment

    for rank, row in df.iterrows():
        rname = row["RNAME"]
        height = float(row["Normalized_Count"])

        if highlight_rname and rname == highlight_rname:
            # The highlighted RNAME is always drawn in red, whatever its rank.
            ax.bar(sample_name, height, bottom=bottom, color="red", edgecolor="black", linewidth=0.2)
            bottom += height
        elif rank < TOP_N:
            # Rank 0 gets the base gray; rank TOP_N - 1 reaches white.
            t = rank / (TOP_N - 1) if TOP_N > 1 else 0.0
            color = blend_color(BASE_RGB, t)
            ax.bar(sample_name, height, bottom=bottom, color=color, edgecolor="black", linewidth=0.2)
            bottom += height
        else:
            rest_sum += height

    # Everything outside the top ranks is collapsed into one white box on top.
    if rest_sum > 0:
        ax.bar(sample_name, rest_sum, bottom=bottom, color="white", edgecolor="black", linewidth=0.2)

# Style
ax.axhline(y=0.5, color="gray", linestyle="--", linewidth=1)
ax.set_ylabel("Normalized Count", fontsize=20)
ax.set_xlabel("Sample", fontsize=20)
# The title follows TOP_N so it stays correct if the constant is changed.
ax.set_title(f"Stacked Bar (Red = Highlight, Gray→White = Top {TOP_N}, Rest = One White Box)", fontsize=18)
ax.tick_params(axis="x", labelsize=18)
ax.tick_params(axis="y", labelsize=18)
# Operate on this figure/axes explicitly rather than pyplot's implicit
# "current" figure, which may be a different one in a notebook session.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()

# Save plot
png_path = SUMMARY_DIR / "stacked_bar_top5_gray_rest_white_box.png"
svg_path = SUMMARY_DIR / "stacked_bar_top5_gray_rest_white_box.svg"
fig.savefig(png_path, dpi=200)
fig.savefig(svg_path)
plt.close(fig)

print(f"✅ Saved:\n - PNG: {png_path}\n - SVG: {svg_path}")

✅ Copied 10 files into step_allign_histogram
📌 Highlight summary saved to: step_allign_histogram/summary/highlight_result.csv
✅ Saved:
 - PNG: step_allign_histogram/summary/stacked_bar_top5_gray_rest_white_box.png
 - SVG: step_allign_histogram/summary/stacked_bar_top5_gray_rest_white_box.svg
