# CD55 Differential Methylation Analysis — Clean Version

**Created:** 2026-02-14  
**Based on:** `20251021_DIFmC_analyze.ipynb`

Analyzes differential methylation (ΔmC) at 137 CpG sites in the CD55 gene region,  
comparing CRISPRoff-silenced vs. unedited T cells across timepoints (Days 6, 28, 35).  
Integrates LASSO and SHAP ML coefficients and exports annotated coordinate files.

**Region:** chr1:206,583,354–206,589,854 (T2T v2.0) = chr1:207,318,058–207,324,558 (hg38)  
**TSS (hg38):** 207,321,678 | **TSS (T2T):** 206,586,974

---
## Sections
1. Setup & Constants
2. Load Methylation Data
3. Compute Delta mC
4. Load ML Coefficients
5. Visualize Methylation Fractions & Delta mC
6. Visualize ML Coefficients
7. Generate CpG Coordinates
8. Merge All Data & Save Annotated CSV
9. Visualize Spatial Distribution of Delta mC
10. Export BED Files for UCSC Genome Browser

## Section 1 — Setup & Constants

In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from pathlib import Path
from datetime import date

# Utility imports
sys.path.append("/home/michalula/code/epiCausality/epiCode/utils/")
from funcs_analize_forward_reverse_extracted_mC_reads import load_padded_reads
from funcs_extract_mC_profiles_from_BAMs import get_reference_sequence

# ── Genomic constants ──────────────────────────────────────────────────────────
# Analysis window on chr1 and the CD55 TSS, given in T2T v2.0 coordinates.
# hg38 equivalents are obtained by adding the fixed shift OFFSET_HG38.
# NOTE(review): a single additive offset assumes there are no indels between the
# two assemblies inside this window — confirm against a liftOver of the region.
REGION_CHR   = "chr1"
REGION_START = 206_583_354   # T2T v2.0, 1-based inclusive
# NOTE(review): the Section 7 comment says the fetch helper calls
# fetch(chr, start-1, end-1); under a 0-based half-open convention that would
# exclude REGION_END itself — confirm whether "inclusive" holds for the end.
REGION_END   = 206_589_854   # T2T v2.0, 1-based inclusive
OFFSET_HG38  = 734_704       # hg38_pos = T2T_pos + OFFSET_HG38
TSS_T2T      = 206_586_974   # CD55 TSS (T2T v2.0)
# Consistent with the offset: TSS_T2T + OFFSET_HG38 == 207_321_678 == TSS_HG38.
TSS_HG38     = 207_321_678   # CD55 TSS (hg38) — verified correct value
PROMOTER_WIN = 1_000         # ±1,000 bp around TSS defines promoter

# ── File paths ─────────────────────────────────────────────────────────────────
# All paths are absolute and machine-specific; nothing here creates directories.
REF_GENOME = Path("/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa")
BASE       = Path("/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells")
# ML coefficient files are read from the Day 6 model directory only.
ML_DIR     = BASE / "day_6/model_data"
OUT_DIR    = Path("/home/michalula/code/epiCausality/epiCode/analyze_ont_data/compare_conditions")
# ISO date stamp; used later in the annotated CSV filename.
TODAY      = date.today().strftime("%Y-%m-%d")

# The reference-genome existence check is only reported here, not enforced.
print(f"Output directory: {OUT_DIR}")
print(f"Reference genome: {REF_GENOME}  (exists={REF_GENOME.exists()})")
print(f"Date: {TODAY}")

## Section 2 — Load Methylation Data

In [None]:
def load_condition(npy_path):
    """Load a padded-reads .npy file, validate it, return (array, fracs).

    Parameters
    ----------
    npy_path : Path or str
        Absolute path to the .npy file.

    Returns
    -------
    arr   : np.ndarray, shape (n_reads, n_cpgs)
    fracs : np.ndarray, shape (n_cpgs,)  — per-CpG methylation fraction,
        computed as the NaN-ignoring column sum divided by the total number
        of reads.

    Raises
    ------
    ValueError
        If the loaded array is not 2-D, or if it contains zero reads (the
        fraction would be an undefined 0/0).

    Notes
    -----
    NaN entries add nothing to the numerator but are still counted in the
    denominator, so a NaN is treated the same as an unmethylated call.
    NOTE(review): if NaN marks positions a read does not cover, a per-CpG
    coverage denominator (np.nanmean) may be intended — confirm.
    """
    arr = np.load(npy_path)

    # Explicit raises instead of assert: asserts vanish under `python -O`.
    if arr.ndim != 2:
        raise ValueError(f"Expected 2D array, got shape {arr.shape}")
    n_reads, n_cpgs = arr.shape
    if n_reads == 0:
        raise ValueError(
            f"No reads in {Path(npy_path).name}: shape {arr.shape}; "
            "methylation fractions are undefined")

    fracs = np.nansum(arr, axis=0) / n_reads

    print(f"  {Path(npy_path).name}")
    print(f"    shape={arr.shape}  reads={n_reads}  CpGs={n_cpgs}")
    return arr, fracs


# ── Day 6 ──────────────────────────────────────────────────────────────────────
# Each load_condition call returns (reads matrix, per-CpG fractions); both are
# kept under parallel names: <name> and <name>_fracs.
# The _mc07 / _mc0995 suffixes mirror the mCthresh0.7 / mCthresh0.995 tags in
# the filenames.
# NOTE(review): numFWD / numRVS in the filenames presumably record forward and
# reverse read counts — confirm against the shapes printed by load_condition.
print("=== Day 6 ===")
D6_UNE = BASE / "day_6/unedited/analyze_single_reads/dimelo_v2_output"
D6_CR  = BASE / "day_6/croff/analyze_single_reads/dimelo_v2_output"

# Unedited, threshold 0.7
CGs_D6_unedited_mc07,   CGs_D6_unedited_mc07_fracs   = load_condition(
    D6_UNE / "CG_137_padded_reads_day6_unedited_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10_mCthresh0.7_t2t_v2_0_chr1:206583354-206589854_2025-09-29_units_combined_numFWD490_numRVS644.npy")

# Unedited, threshold 0.995
CGs_D6_unedited_mc0995, CGs_D6_unedited_mc0995_fracs = load_condition(
    D6_UNE / "CG_137_padded_reads_day6_unedited_Tcells_mC0.995_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10_mCthresh0.995_t2t_v2_0_chr1:206583354-206589854_2025-09-29_units_combined_numFWD489_numRVS638.npy")

# CRISPRoff, threshold 0.7
CGs_D6_CRoff_mc07,      CGs_D6_CRoff_mc07_fracs      = load_condition(
    D6_CR  / "CG_137_padded_reads_day6_CRoff_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10_mCthresh0.7_t2t_v2_0_chr1:206583354-206589854_2025-09-29_units_combined_numFWD802_numRVS1480.npy")

# CRISPRoff, threshold 0.995 (used by the primary Day 6 analysis)
CGs_D6_CRoff_mc0995,    CGs_D6_CRoff_mc0995_fracs    = load_condition(
    D6_CR  / "CG_137_padded_reads_day6_CRoff_Tcells_mC0.995_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10_mCthresh0.995_t2t_v2_0_chr1:206583354-206589854_2025-09-29_units_combined_numFWD793_numRVS1449.npy")


# ── Day 28 (low coverage — warning issued in Section 3) ────────────────────────
# Day 28 arrays are loaded for the coverage check only; no Day 28 ΔmC is
# computed in Section 3.
# NOTE(review): the two unedited filenames spell the tag "modeBaseQ10" while the
# CRoff filenames use "modBaseQ10". Presumably this matches the files on disk —
# verify before "correcting" either spelling.
print("\n=== Day 28 ===")
D28_UNE = BASE / "day_28/unedited/analyze_single_reads/dimelo_v2_output"
D28_CR  = BASE / "day_28/croff/analyze_single_reads/dimelo_v2_output"

# Unedited, threshold 0.995
CGs_D28_unedited_mc0995, CGs_D28_unedited_mc0995_fracs = load_condition(
    D28_UNE / "CG_137_padded_reads_day28_unedited_Tcells_mC0.995_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modeBaseQ10_mCthresh0.995_t2t_v2_0_chr1:206583354-206589854_2025-09-29_units_combined_numFWD6_numRVS17.npy")

# Unedited, threshold 0.7
CGs_D28_unedited_mc07,   CGs_D28_unedited_mc07_fracs   = load_condition(
    D28_UNE / "CG_137_padded_reads_day28_unedited_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modeBaseQ10_mCthresh0.7_t2t_v2_0_chr1:206583354-206589854_2025-09-29_units_combined_numFWD6_numRVS18.npy")

# CRISPRoff, threshold 0.7
CGs_D28_CRoff_mc07,      CGs_D28_CRoff_mc07_fracs      = load_condition(
    D28_CR  / "CG_137_padded_reads_day28_CRoff_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10_mCthresh0.7_t2t_v2_0_chr1:206583354-206589854_2025-09-29_units_combined_numFWD6_numRVS6.npy")

# CRISPRoff, threshold 0.995
CGs_D28_CRoff_mc0995,    CGs_D28_CRoff_mc0995_fracs    = load_condition(
    D28_CR  / "CG_137_padded_reads_day28_CRoff_Tcells_mC0.995_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10_mCthresh0.995_t2t_v2_0_chr1:206583354-206589854_2025-09-29_units_combined_numFWD6_numRVS6.npy")


# ── Day 35 ─────────────────────────────────────────────────────────────────────
print("\n=== Day 35 ===")
# CRoff replica_1 (current, 2025-11-09) — same read counts as original 2025-09-29 files
D35_CR   = BASE / "day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output"
# Unedited data comes from the "part1_37h_sequenced" directory only, hence the
# _lib1 suffix on the variable names below.
D35_UNE1 = BASE / "day_35/unedited/part1_37h_sequenced/dimelo_v2_output"

# CRISPRoff, threshold 0.7
CGs_D35_CRoff_mc07,   CGs_D35_CRoff_mc07_fracs   = load_condition(
    D35_CR / "CG_137_padded_reads_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_mCthresh0.7_T2Tv2_0_chr1:206583354-206589854_2025-11-09_units_combined_numFWD104_numRVS222.npy")

# CRISPRoff, threshold 0.995
CGs_D35_CRoff_mc0995, CGs_D35_CRoff_mc0995_fracs = load_condition(
    D35_CR / "CG_137_padded_reads_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_mCthresh0.995_T2Tv2_0_chr1:206583354-206589854_2025-11-09_units_combined_numFWD104_numRVS222.npy")

# Note: Day 35 unedited .npy files carry 'CRoff' in their filename — this is a
# naming artefact from the original pipeline; the data corresponds to unedited cells.
# NOTE(review): the only evidence visible here that these are unedited reads is
# the directory name (day_35/unedited/...) — confirm the provenance.
# NOTE(review): these filenames also use the "modeBaseQ10" spelling; presumably
# that matches the files on disk.
CGs_D35_unedited_mc07_lib1,   CGs_D35_unedited_mc07_fracs_lib1   = load_condition(
    D35_UNE1 / "CG_137_padded_reads_day35_CRoff_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modeBaseQ10_mCthresh0.7_t2t_v2_0_chr1:206583354-206589854_2025-10-05_units_combined_numFWD86_numRVS115.npy")

CGs_D35_unedited_mc0995_lib1, CGs_D35_unedited_mc0995_fracs_lib1 = load_condition(
    D35_UNE1 / "CG_137_padded_reads_day35_CRoff_Tcells_mC0.995_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modeBaseQ10_mCthresh0.995_t2t_v2_0_chr1:206583354-206589854_2025-10-05_units_combined_numFWD86_numRVS114.npy")

## Section 3 — Compute Delta mC

In [None]:
# Minimum number of reads per condition before a ΔmC is considered usable.
MIN_COVERAGE = 30

# ── ΔmC = CRISPRoff fraction − unedited fraction, per CpG ─────────────────────
# Day 6 at threshold 0.995 is the primary analysis; the other two are companions.
dif_D6_mc0995 = np.subtract(CGs_D6_CRoff_mc0995_fracs,
                            CGs_D6_unedited_mc0995_fracs)
dif_D6_mc07 = np.subtract(CGs_D6_CRoff_mc07_fracs,
                          CGs_D6_unedited_mc07_fracs)
dif_D35_mc0995 = np.subtract(CGs_D35_CRoff_mc0995_fracs,
                             CGs_D35_unedited_mc0995_fracs_lib1)

# ── Day 28: report (do not compute) — read counts are checked against the floor
d28_une_n = len(CGs_D28_unedited_mc0995)
d28_cr_n = len(CGs_D28_CRoff_mc0995)
if min(d28_une_n, d28_cr_n) < MIN_COVERAGE:
    print(f"WARNING: Day 28 coverage is very low "
          f"(unedited={d28_une_n} reads, CRoff={d28_cr_n} reads).")
    print("  Day 28 delta-mC values are statistically unreliable and "
          "should NOT be used for quantitative conclusions.")

# ── Two rescalings of the Day 6 ΔmC for overlaying with ML coefficients ───────
_d6_lo = dif_D6_mc0995.min()
_d6_hi = dif_D6_mc0995.max()
# Min-max rescale onto [0, 1].
dif_D6_mc0995_norm01 = (dif_D6_mc0995 - _d6_lo) / (_d6_hi - _d6_lo)
# Sign-preserving rescale so the largest magnitude becomes 1.
dif_D6_mc0995_max1 = dif_D6_mc0995 / np.max(np.abs(dif_D6_mc0995))

print(f"Day 6 ΔmC (mc0.995): min={_d6_lo:.3f}  max={_d6_hi:.3f}")
print(f"Day 6 ΔmC (mc0.7):   min={dif_D6_mc07.min():.3f}  max={dif_D6_mc07.max():.3f}")
print(f"Day 35 ΔmC (mc0.995):min={dif_D35_mc0995.min():.3f}  max={dif_D35_mc0995.max():.3f}")

## Section 4 — Load ML Coefficients

In [None]:
# Per-CpG importance vectors saved by the Day 6 / mC-threshold-0.995 models.
lasso_abs_coefs = np.load(
    ML_DIR / "lasso_abs_coefs_silenced_unedited_Tcells_day6_mCthresh0.995_2025-10-21.npy"
)
shap_sumabs_coefs = np.load(
    ML_DIR / "shap_sumabs_coefs_silenced_unedited_Tcells_day6_mCthresh0.995_2025-10-21.npy"
)

# Both vectors must line up one-to-one with the Day 6 per-CpG fractions.
expected_shape = CGs_D6_CRoff_mc0995_fracs.shape
assert lasso_abs_coefs.shape == expected_shape, (
    f"LASSO shape {lasso_abs_coefs.shape} != fracs shape {expected_shape}"
)
assert shap_sumabs_coefs.shape == expected_shape, (
    f"SHAP shape {shap_sumabs_coefs.shape} != fracs shape {expected_shape}"
)

# Min-max rescale of the LASSO vector onto [0, 1] for overlay plots.
_lasso_lo = lasso_abs_coefs.min()
_lasso_hi = lasso_abs_coefs.max()
lasso_abs_coefs_norm = (lasso_abs_coefs - _lasso_lo) / (_lasso_hi - _lasso_lo)

_shap_lo = shap_sumabs_coefs.min()
_shap_hi = shap_sumabs_coefs.max()
print(f"LASSO coefs : shape={lasso_abs_coefs.shape}  "
      f"range=[{_lasso_lo:.4f}, {_lasso_hi:.4f}]")
print(f"SHAP coefs  : shape={shap_sumabs_coefs.shape}  "
      f"range=[{_shap_lo:.4f}, {_shap_hi:.4f}]")

## Section 5 — Visualize Methylation Fractions & Delta mC

In [None]:
def plot_fracs(fracs_dict, title="Methylation Fractions",
               figsize=(16, 5), alpha=0.6, colors=None):
    """Draw overlaid bar series of per-CpG methylation fractions.

    All conditions are drawn on one axis at the same x positions, so
    semi-transparent bars overlap rather than sit side by side.

    Parameters
    ----------
    fracs_dict : dict[str -> np.ndarray]
        Maps a legend label to its fractions array; series are drawn in
        insertion order. The first array's length sets the x-axis.
    title : str
        Axes title.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    alpha : float
        Bar opacity.
    colors : list of str, optional
        Bar colours, reused cyclically when there are more series than
        colours. A falsy value (None or empty) selects the built-in palette.
    """
    if colors:
        palette = colors
    else:
        palette = ["deepskyblue", "navy", "tomato", "darkred",
                   "mediumseagreen", "darkgreen"]

    first_series = next(iter(fracs_dict.values()))
    positions = np.arange(first_series.shape[0])

    fig, ax = plt.subplots(figsize=figsize)
    for series_idx, series_label in enumerate(fracs_dict):
        ax.bar(
            positions,
            fracs_dict[series_label],
            snap=False,
            alpha=alpha,
            label=series_label,
            color=palette[series_idx % len(palette)],
        )

    ax.set_title(title)
    ax.set_xlabel("CpG Index")
    ax.set_ylabel("Methylation Fraction")
    ax.legend()
    plt.tight_layout()
    plt.show()


def _plot_delta_mC(delta, bar_color, legend_label, plot_title):
    """Bar-plot one ΔmC vector against CpG index with a dashed zero line."""
    fig, ax = plt.subplots(figsize=(16, 5))
    ax.bar(np.arange(len(delta)), delta,
           snap=False, alpha=0.7, color=bar_color, label=legend_label)
    ax.axhline(0, color="black", linewidth=0.8, linestyle="--")
    ax.set_xlabel("CpG Index")
    ax.set_ylabel("ΔmC")
    ax.set_title(plot_title)
    ax.legend()
    plt.tight_layout()
    plt.show()
    return fig, ax


# ── Methylation fractions: CRISPRoff vs unedited ──────────────────────────────
# Day 6, threshold 0.995
plot_fracs(
    {
        "CRISPRoff (mc>0.995)": CGs_D6_CRoff_mc0995_fracs,
        "Unedited  (mc>0.995)": CGs_D6_unedited_mc0995_fracs,
    },
    title="Day 6 — CD55 CpG Methylation Fractions (mC threshold 0.995)",
    colors=["deepskyblue", "navy"],
)

# Day 6, threshold 0.7
plot_fracs(
    {
        "CRISPRoff (mc>0.7)": CGs_D6_CRoff_mc07_fracs,
        "Unedited  (mc>0.7)": CGs_D6_unedited_mc07_fracs,
    },
    title="Day 6 — CD55 CpG Methylation Fractions (mC threshold 0.7)",
    colors=["skyblue", "royalblue"],
)

# Day 35, threshold 0.995
plot_fracs(
    {
        "CRISPRoff Day 35 (mc>0.995)": CGs_D35_CRoff_mc0995_fracs,
        "Unedited  Day 35 (mc>0.995)": CGs_D35_unedited_mc0995_fracs_lib1,
    },
    title="Day 35 — CD55 CpG Methylation Fractions (mC threshold 0.995)",
    colors=["tomato", "darkred"],
)

# ── ΔmC per CpG ───────────────────────────────────────────────────────────────
# Day 6, threshold 0.995 — primary analysis
fig, ax = _plot_delta_mC(
    dif_D6_mc0995,
    "turquoise",
    "ΔmC = CRISPRoff − Unedited (mc>0.995, Day 6)",
    "Day 6 — Differential Methylation (CRISPRoff − Unedited), mC threshold 0.995",
)

# Day 6, threshold 0.7
fig, ax = _plot_delta_mC(
    dif_D6_mc07,
    "steelblue",
    "ΔmC = CRISPRoff − Unedited (mc>0.7, Day 6)",
    "Day 6 — Differential Methylation (CRISPRoff − Unedited), mC threshold 0.7",
)

In [None]:

# NOTE(review): this cell repeats, statement for statement, the two mc0.7
# figures already drawn in the previous cell of this section (fractions plot and
# ΔmC plot). It produces no new output; consider removing it, or keep it only
# if a stand-alone mc0.7 view is wanted.

# Day 6: CRoff vs Unedited, mc0.7
plot_fracs(
    {"CRISPRoff (mc>0.7)": CGs_D6_CRoff_mc07_fracs,
     "Unedited  (mc>0.7)": CGs_D6_unedited_mc07_fracs},
    title="Day 6 — CD55 CpG Methylation Fractions (mC threshold 0.7)",
    colors=["skyblue", "royalblue"])


# Delta mC: Day 6 (mc0.7)
# One bar per CpG index; the dashed line marks ΔmC = 0.
fig, ax = plt.subplots(figsize=(16, 5))
ax.bar(np.arange(len(dif_D6_mc07)), dif_D6_mc07,
       snap=False, alpha=0.7, color="steelblue",
       label="ΔmC = CRISPRoff − Unedited (mc>0.7, Day 6)")
ax.axhline(0, color="black", linewidth=0.8, linestyle="--")
ax.set_xlabel("CpG Index")
ax.set_ylabel("ΔmC")
ax.set_title("Day 6 — Differential Methylation (CRISPRoff − Unedited), mC threshold 0.7")
ax.legend()
plt.tight_layout()
plt.show()


## Section 6 — Visualize ML Coefficients

In [None]:
# Shared x positions: one slot per CpG index.
x = np.arange(len(lasso_abs_coefs))

# ── Figure 1: two stacked panels sharing the CpG-index axis ───────────────────
fig, axes = plt.subplots(2, 1, figsize=(16, 8), sharex=True)
ax_lasso, ax_overlay = axes

# Top panel: raw absolute LASSO coefficients.
ax_lasso.bar(
    x, lasso_abs_coefs,
    snap=False, alpha=0.8, color="magenta", label="LASSO |coef|",
)
ax_lasso.set_ylabel("Absolute Coefficient")
ax_lasso.set_title("LASSO Absolute Coefficients — Day 6, mC threshold 0.995")
ax_lasso.legend()

# Bottom panel: SHAP divided by a fixed factor of 20, drawn under the
# min-max-normalized LASSO so both fit on one scale.
ax_overlay.bar(
    x, shap_sumabs_coefs / 20,
    snap=False, alpha=0.7, color="darkorchid", label="SHAP Σ|values| / 20",
)
ax_overlay.bar(
    x, lasso_abs_coefs_norm,
    snap=False, alpha=0.5, color="magenta", label="LASSO |coef| normalized 0→1",
)
ax_overlay.set_xlabel("CpG Index")
ax_overlay.set_ylabel("Scaled Importance")
ax_overlay.set_title("SHAP vs LASSO (scaled for overlay comparison)")
ax_overlay.legend()

plt.tight_layout()
plt.show()

# ── Figure 2: Day 6 ΔmC with the normalized LASSO drawn on top ────────────────
fig, ax = plt.subplots(figsize=(16, 5))
ax.bar(
    x, dif_D6_mc0995,
    snap=False, alpha=0.6, color="mediumblue", label="ΔmC (Day 6, mc>0.995)",
)
ax.bar(
    x, lasso_abs_coefs_norm,
    snap=False, alpha=0.5, color="magenta", label="LASSO |coef| (normalized 0→1)",
)
ax.axhline(0, color="black", linewidth=0.8, linestyle="--")
ax.set_xlabel("CpG Index")
ax.set_ylabel("Value")
ax.set_title("ΔmC vs Normalized LASSO Coefficients — Day 6 CD55 CpGs")
ax.legend()
plt.tight_layout()
plt.show()

## Section 7 — Generate CpG Coordinates

Programmatically fetches the T2T v2.0 reference sequence and locates all CG dinucleotides.  
Regions are classified relative to **TSS (T2T) = 206,586,974**, which corresponds to the verified **TSS (hg38) = 207,321,678** used for the hg38-relative positions.

In [None]:
# Fetch reference sequence
# get_reference_sequence uses pysam.FastaFile.fetch(chr, start-1, end-1)
# NOTE(review): if that is accurate, pysam's 0-based half-open fetch returns
# 1-based positions REGION_START .. REGION_END-1, i.e. REGION_END - REGION_START
# bases with element 0 at REGION_START. The print below lets the fetched length
# be compared with that span — confirm they match.
ref_seq_list = get_reference_sequence(REF_GENOME, REGION_CHR, REGION_START, REGION_END)
# Join the returned pieces into one string; upper-case so that soft-masked
# (lowercase) bases, if present, are matched by the CG search.
ref_str = ''.join(ref_seq_list).upper()
print(f"Fetched sequence: {len(ref_str)} bp  (region span: {REGION_END - REGION_START} bp)")


def find_CpG_sites(seq, region_start, offset_hg38):
    """Find all CG dinucleotide positions in a genomic sequence.

    Matching is case-insensitive, so soft-masked (lowercase) reference
    sequence yields the same sites as its uppercase form.

    Parameters
    ----------
    seq : str
        Genomic sequence. seq[0] corresponds to region_start.
    region_start : int
        1-based T2T coordinate of seq[0].
    offset_hg38 : int
        hg38_pos = T2T_pos + offset_hg38.

    Returns
    -------
    list of dict with keys: start_T2T, end_T2T, start_hg38, end_hg38
        One entry per CG, in left-to-right order. ``start_*`` is the
        coordinate of the C and ``end_*`` the coordinate of the G
        (start + 1). Empty list if the sequence contains no CG.
    """
    # Normalise case once so lowercase input is not silently skipped.
    seq = seq.upper()

    sites = []
    # str.find scans in C; -1 signals that no further match exists.
    idx = seq.find("CG")
    while idx != -1:
        st_t2t = region_start + idx
        en_t2t = st_t2t + 1
        sites.append({
            "start_T2T":  st_t2t,
            "end_T2T":    en_t2t,
            "start_hg38": st_t2t + offset_hg38,
            "end_hg38":   en_t2t + offset_hg38,
        })
        idx = seq.find("CG", idx + 1)
    return sites


def classify_region(pos_t2t, tss_t2t=TSS_T2T, window=PROMOTER_WIN):
    """Label a T2T position by where it falls relative to the TSS window.

    Parameters
    ----------
    pos_t2t : int
        Position to classify (T2T coordinate).
    tss_t2t : int
        TSS coordinate; defaults to the module-level CD55 TSS.
    window : int
        Half-width of the promoter window; both edges are inclusive.

    Returns
    -------
    'Promoter', 'Upstream Flank', or 'Gene Body/Downstream'
    """
    promoter_lo = tss_t2t - window
    promoter_hi = tss_t2t + window

    # Guard clauses: anything below the window is upstream, anything inside it
    # (edges included) is promoter, and everything else is downstream.
    if pos_t2t < promoter_lo:
        return "Upstream Flank"
    if promoter_lo <= pos_t2t <= promoter_hi:
        return "Promoter"
    return "Gene Body/Downstream"


# Locate every CG in the fetched sequence, then annotate each site.
cpg_sites = find_CpG_sites(ref_str, REGION_START, OFFSET_HG38)
print(f"Found {len(cpg_sites)} CpG sites")

# Final column layout of the annotation table.
_CPG_COLUMN_ORDER = [
    "name", "chrom",
    "start_T2T", "end_T2T",
    "start_hg38", "end_hg38",
    "rel_to_TSS_T2T", "rel_to_TSS_hg38",
    "strand", "Region",
]

df_cpgs = pd.DataFrame(cpg_sites).assign(
    # Sites are numbered from 1 in left-to-right order.
    name=lambda d: [f"CpG_{k}" for k in range(1, len(d) + 1)],
    chrom=REGION_CHR,
    strand="+",
    # Signed distance of the C from the TSS in each assembly.
    rel_to_TSS_T2T=lambda d: d["start_T2T"] - TSS_T2T,
    rel_to_TSS_hg38=lambda d: d["start_hg38"] - TSS_HG38,
    Region=lambda d: d["start_T2T"].apply(classify_region),
)
df_cpgs = df_cpgs[_CPG_COLUMN_ORDER]

print(f"\nRegion breakdown:\n{df_cpgs['Region'].value_counts()}")
df_cpgs.head()

## Section 8 — Merge All Data & Save Annotated CSV

In [None]:
n_cpgs = len(df_cpgs)

# Every per-CpG vector that is written into the table, keyed by its output
# column name. Insertion order is the column order in the saved CSV.
_per_cpg_columns = {
    "delta_mC_D6_mc0995":  dif_D6_mc0995,
    "delta_mC_D6_mc07":    dif_D6_mc07,
    "delta_mC_D35_mc0995": dif_D35_mc0995,
    "lasso_abs_coef":      lasso_abs_coefs,
    "lasso_abs_coef_norm": lasso_abs_coefs_norm,
    "shap_sumabs_coef":    shap_sumabs_coefs,
}

# Validate ALL vectors (including Day 35 ΔmC and the normalized LASSO) before
# touching the frame, so a mismatch cannot leave it half-updated. Explicit
# raise rather than assert, which is skipped under `python -O`.
_length_mismatches = {
    _col: len(_values)
    for _col, _values in _per_cpg_columns.items()
    if len(_values) != n_cpgs
}
if _length_mismatches:
    raise ValueError(
        f"Per-CpG arrays do not match the {n_cpgs} CpGs found in reference: "
        f"{_length_mismatches}")

for _col, _values in _per_cpg_columns.items():
    df_cpgs[_col] = _values

out_csv = OUT_DIR / f"CD55_CpGs_Annotated_T2T_hg38_with_model_coefs_Tcells_day6_mCthresh0.995_{TODAY}.csv"
df_cpgs.to_csv(out_csv, index=False)
print(f"Saved: {out_csv}")
print(f"Shape: {df_cpgs.shape}")
df_cpgs.head()

## Section 9 — Visualize Spatial Distribution of Delta mC

In [None]:
# One colour per region label produced by classify_region.
region_colors = {
    "Promoter": "tab:red",
    "Upstream Flank": "tab:blue",
    "Gene Body/Downstream": "tab:green",
}


def _format_bp(val, _):
    """Tick formatter: integer base-pair position with thousands separators."""
    return f"{int(val):,}"


fig, ax = plt.subplots(figsize=(16, 6))

# One bar series per region, so each region gets its own legend entry.
for region_label, region_rows in df_cpgs.groupby("Region"):
    ax.bar(
        region_rows["start_T2T"],
        region_rows["delta_mC_D6_mc0995"],
        width=20,
        snap=False,
        alpha=0.8,
        color=region_colors[region_label],
        label=region_label,
    )

# Reference lines: TSS, both promoter-window edges, and ΔmC = 0.
promoter_left = TSS_T2T - PROMOTER_WIN
promoter_right = TSS_T2T + PROMOTER_WIN
ax.axvline(TSS_T2T, color="black", linewidth=1.5, linestyle="--", label="TSS")
# Only the left edge carries a label, so the legend shows one boundary entry.
ax.axvline(promoter_left, color="grey", linewidth=1.0,
           linestyle=":", label="Promoter boundary (±1 kb)")
ax.axvline(promoter_right, color="grey", linewidth=1.0, linestyle=":")
ax.axhline(0, color="black", linewidth=0.8)

ax.xaxis.set_major_formatter(ticker.FuncFormatter(_format_bp))
ax.set_xlabel("Genomic Position (T2T v2.0, chr1)")
ax.set_ylabel("ΔmC  (CRISPRoff − Unedited)")
ax.set_title(
    "CD55 CpG Differential Methylation vs. Genomic Position\n"
    "Day 6, mC threshold 0.995  |  T2T v2.0 coordinates"
)
ax.legend(loc="upper left")
plt.tight_layout()

# Save before show(): raster at 300 dpi plus a vector PDF.
out_png = OUT_DIR / "delta_mC_vs_genomic_position_T2T.png"
out_pdf = OUT_DIR / "delta_mC_vs_genomic_position_T2T.pdf"
plt.savefig(out_png, dpi=300)
plt.savefig(out_pdf)
print(f"Saved: {out_png}")
print(f"Saved: {out_pdf}")
plt.show()

## Section 10 — Export BED Files for UCSC Genome Browser

In [None]:
# BED format: 0-based start, 1-based end, integer score 0–1000
# Work on a copy so the BED-specific columns never leak back into df_cpgs.
# "Region" is carried along only for the summary at the end; it is not written
# to either BED file.
df_bed = df_cpgs[["chrom", "start_hg38", "end_hg38",
                   "name", "delta_mC_D6_mc0995", "strand", "Region"]].copy()
df_bed = df_bed.rename(columns={"start_hg38": "chromStart",
                                  "end_hg38":   "chromEnd",
                                  "delta_mC_D6_mc0995": "delta_mC"})
# start_hg38 / end_hg38 are the coordinates of the C and the G (end = start + 1),
# so after this shift each interval [chromStart, chromEnd) spans the 2-bp CG.
df_bed["chromStart"] -= 1   # convert to 0-based BED
# Score = ΔmC × 1000, rounded and clipped into [0, 1000].
# NOTE(review): clip(lower=0) maps every negative ΔmC (loss of methylation) to
# score 0, yet the "top DMC" selection below ranks by |ΔmC|. A strongly
# hypomethylated site can therefore be exported as a top site with score 0.
# Confirm this is intended, or derive the score from |ΔmC|.
df_bed["score"] = (
    df_bed["delta_mC"] * 1000
).round().clip(lower=0, upper=1000).astype(int)

# Full BED — all CpGs
# Six columns, tab-separated, no header line.
bed_full = OUT_DIR / "cd55_dmc_cpgs.hg38.bed"
df_bed[["chrom", "chromStart", "chromEnd", "name", "score", "strand"]].to_csv(
    bed_full, sep="\t", header=False, index=False)
print(f"Saved: {bed_full}  ({len(df_bed)} CpGs)")

# Top DMC BED — top 20% by absolute ΔmC
# The cut-off is the 80th percentile of |ΔmC|; ">=" keeps sites tied at the
# cut-off, so slightly more than 20% of sites may be selected.
threshold = np.percentile(np.abs(df_bed["delta_mC"]), 80)
df_top = df_bed[np.abs(df_bed["delta_mC"]) >= threshold].copy()
bed_top = OUT_DIR / "top_dmc_cd55.hg38.bed"
df_top[["chrom", "chromStart", "chromEnd", "name", "score", "strand"]].to_csv(
    bed_top, sep="\t", header=False, index=False)
print(f"Saved: {bed_top}  ({len(df_top)} top CpGs, |ΔmC| ≥ {threshold:.3f})")

# Per-region summary statistics of the signed ΔmC among the selected sites.
print(f"\nTop DMC by region:")
print(df_top.groupby("Region")["delta_mC"].describe().round(3))