# Generate UCSC track .bed files

for K562 ZFPoff changes

In [None]:
'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/compare/modkit_dmr/mc_07/filtered/20251124_dmr_K562_ZFPoffHigh_vs_Unedit_filtered_mC07.csv'

In [None]:
import pandas as pd

In [None]:
# Directory that holds the filtered K562 DMR tables.
dmr_dir_path = '/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/compare/modkit_dmr/mc_07/filtered/'
# Full path (string) of the ZFPoffHigh-vs-Unedit DMR CSV.
dmr_K562_ZFPoffHigh_vs_Unedit_filtered_mC07 = dmr_dir_path + '20251124_dmr_K562_ZFPoffHigh_vs_Unedit_filtered_mC07.csv'
# Display the assembled path as the cell output.
dmr_K562_ZFPoffHigh_vs_Unedit_filtered_mC07


In [None]:
# Load the DMR CSV into a DataFrame.
# NOTE(review): this rebinds the name that held the CSV *path* to the loaded
# DataFrame, so re-running this cell alone passes a DataFrame to pd.read_csv.
# Re-run the path-definition cell first, or give the path its own name.
dmr_K562_ZFPoffHigh_vs_Unedit_filtered_mC07 = pd.read_csv(dmr_K562_ZFPoffHigh_vs_Unedit_filtered_mC07)
# Display the loaded table as the cell output.
dmr_K562_ZFPoffHigh_vs_Unedit_filtered_mC07


In [None]:
#!/usr/bin/env python3

import pandas as pd
import numpy as np
from pathlib import Path


# === CONFIG ===

# Label appended to every output file name and embedded in the UCSC track
# name/description lines.
experiment_codition = "K562_ZFPoffHigh_vs_Unedit_filtered_mC07"


# Put here the exact name of your CSV file:
INPUT_CSV = '/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/compare/modkit_dmr/mc_07/filtered/20251124_dmr_K562_ZFPoffHigh_vs_Unedit_filtered_mC07.csv'
# "20251121 Dmr Filtered Day6 CRoff Vs Unedit.csv"

# Output folder (can be "." for current directory)
OUT_DIR = (
    Path("/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/compare/modkit_dmr/mc_07/filtered/")
    / "ucsc_generate_tracks"
    / "ucsc_beds"
)
# The output folder is nested two levels below the input directory, so the
# intermediate "ucsc_generate_tracks" directory must be created as well;
# without parents=True mkdir raises FileNotFoundError when it is missing.
OUT_DIR.mkdir(parents=True, exist_ok=True)


def load_dmr_table(csv_path: str) -> pd.DataFrame:
    """
    Read a DMR table from CSV and check that the core columns are present.

    The columns checked are: chrom, start_hg38, end_hg38, cg_unit_pair_id,
    effect_size and score. ('score_norm', read by the normalized-score track
    writer, is not checked here.)

    Args:
        csv_path: location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        The table exactly as parsed by pandas.

    Raises:
        ValueError: if any of the checked columns is absent; the message
            lists the absent names.
    """
    table = pd.read_csv(csv_path)

    expected_columns = (
        "chrom",
        "start_hg38",
        "end_hg38",
        "cg_unit_pair_id",
        "effect_size",
        "score",
    )
    absent = [name for name in expected_columns if name not in table.columns]
    if not absent:
        return table

    raise ValueError(f"Missing required columns in CSV: {absent}")


def make_cg_id_bed(df: pd.DataFrame,
                   experiment_codition: str,
                   out_path: Path) -> None:
    """
    Write a header-less, tab-separated BED4 file:
    chrom, start_hg38, end_hg38, cg_unit_pair_id.

    Coordinates are cast to int and the id to str before writing.
    ``experiment_codition`` is accepted for signature parity with the other
    writers but is not used in the file contents.
    """
    bed4 = df[["chrom", "start_hg38", "end_hg38", "cg_unit_pair_id"]].astype(
        {"start_hg38": int, "end_hg38": int, "cg_unit_pair_id": str}
    )
    bed4.to_csv(out_path, sep="\t", header=False, index=False)
    print(f"Written basic BED to: {out_path}")


def make_track_effectsize_color_bed(df: pd.DataFrame,
                                    experiment_codition: str,
                                    out_path: Path) -> None:
    """
    Write a BED9 custom track (itemRgb) whose colors reflect effect_size.

    - sign:       red = positive, blue = negative,
                  gray (180,180,180) = zero or NaN
    - brightness: 150..255, ~ |effect_size| divided by the 95th percentile of
                  the finite |effect_size| values, capped at 1
    - BED score:  the same normalized magnitude scaled to 0..1000

    Non-finite effect sizes are tolerated: NaN rows get magnitude 0 (gray,
    score 0) and +/-inf rows get the maximum magnitude, so the integer casts
    below never see NaN.

    Args:
        df: table with columns chrom, start_hg38, end_hg38, cg_unit_pair_id
            and effect_size.
        experiment_codition: label appended to the track name and description.
        out_path: file to (over)write; a 'track' line followed by the rows.
    """
    es = df["effect_size"].astype(float)
    abs_es = es.abs()

    # Robust scale: 95th percentile of the finite magnitudes only, so that a
    # few extreme (or infinite) values do not flatten the color range.
    finite_abs = abs_es[np.isfinite(abs_es)].to_numpy()
    p95 = np.percentile(finite_abs, 95) if len(finite_abs) else 1.0
    if p95 == 0 or np.isnan(p95):
        p95 = 1.0

    # inf / p95 clips to 1; NaN survives the clip and is mapped to 0 here so
    # the .astype(int) calls below cannot raise on missing values.
    norm = (abs_es / p95).clip(lower=0, upper=1).fillna(0.0)

    # intensity range (bright)
    min_intensity = 150
    intensity = (
        (min_intensity + norm * (255 - min_intensity)).round().astype(int).to_numpy()
    )

    pos_mask = (es > 0).to_numpy()
    neg_mask = (es < 0).to_numpy()
    # NaN compares False against 0 on both sides, so it lands here with zeros.
    neutral_mask = ~(pos_mask | neg_mask)

    neutral_gray = 180
    r = np.where(pos_mask, intensity, 0)   # positive -> shades of red
    g = np.zeros(len(df), dtype=int)
    b = np.where(neg_mask, intensity, 0)   # negative -> shades of blue
    r[neutral_mask] = neutral_gray         # zeros/NaNs -> neutral gray
    g[neutral_mask] = neutral_gray
    b[neutral_mask] = neutral_gray

    itemRgb = [f"{ri},{gi},{bi}" for ri, gi, bi in zip(r, g, b)]

    start = df["start_hg38"].astype(int)
    end = df["end_hg38"].astype(int)

    bed_score = (norm * 1000).round().astype(int)

    bed9 = pd.DataFrame(
        {
            "chrom": df["chrom"],
            "start": start,
            "end": end,
            "name": df["cg_unit_pair_id"].astype(str),
            "score": bed_score,
            "strand": ["."] * len(df),
            "thickStart": start,
            "thickEnd": end,
            "itemRgb": itemRgb,
        }
    )

    track_line = (
        'track name="DMRs_effect_size_bright_'+experiment_codition+'" '
        'description="DMRs colored by effect_size (red=positive, blue=negative; brightness~|effect_size|) '+experiment_codition+'" '
        'visibility=2 itemRgb="On"\n'
    )

    # Write the track header first, then append the rows to the same file.
    with open(out_path, "w") as f:
        f.write(track_line)
    bed9.to_csv(out_path, sep="\t", header=False, index=False, mode="a")
    print(f"Written effect_size-colored BED to: {out_path}")


def make_track_LMLR_score_color_bed(df: pd.DataFrame,
                                    experiment_codition: str,
                                    out_path: Path) -> None:
    """
    Write a BED9 custom track (itemRgb) colored by 'score' (all shades of red).

    - score <= 0 or NaN -> lowest intensity (50), BED score 0
    - largest finite score (and +inf) -> brightest red (255), BED score 1000

    Scores are clipped at 0 and divided by the largest finite score. NaN
    scores are treated as 0 so the integer casts below cannot fail on missing
    values.

    Args:
        df: table with columns chrom, start_hg38, end_hg38, cg_unit_pair_id
            and score.
        experiment_codition: label appended to the track name and description.
        out_path: file to (over)write; a 'track' line followed by the rows.
    """
    score_clipped = df["score"].astype(float).clip(lower=0)

    # Scale by the largest *finite* score: an infinite value would otherwise
    # drive every other row to 0 and itself to NaN (inf / inf).
    max_score = score_clipped[np.isfinite(score_clipped)].max()
    if max_score == 0 or np.isnan(max_score):
        max_score = 1.0

    # +inf clips to 1; NaN survives the clip and is mapped to 0.
    norm = (score_clipped / max_score).clip(upper=1).fillna(0.0)

    # 50..255 so low scores still visible
    min_intensity = 50
    red = (min_intensity + norm * (255 - min_intensity)).round().astype(int).to_numpy()

    itemRgb = [f"{ri},0,0" for ri in red]

    start = df["start_hg38"].astype(int)
    end = df["end_hg38"].astype(int)

    bed_score = (norm * 1000).round().astype(int)

    bed9 = pd.DataFrame(
        {
            "chrom": df["chrom"],
            "start": start,
            "end": end,
            "name": df["cg_unit_pair_id"].astype(str),
            "score": bed_score,
            "strand": ["."] * len(df),
            "thickStart": start,
            "thickEnd": end,
            "itemRgb": itemRgb,
        }
    )

    track_line = (
        'track name="DMRs_LMLR_score_red_gradient_'+experiment_codition+'" '
        'description="DMRs colored by LMLR score (red gradient, max score = brightest red) '+experiment_codition+'" '
        'visibility=2 itemRgb="On"\n'
    )

    # Write the track header first, then append the rows to the same file.
    with open(out_path, "w") as f:
        f.write(track_line)
    bed9.to_csv(out_path, sep="\t", header=False, index=False, mode="a")
    print(f"Written score-colored BED to: {out_path}")


def make_track_LMLR_norm_score_color_bed(df: pd.DataFrame,
                                    experiment_codition: str,
                                    out_path: Path) -> None:
    """
    Write a BED9 custom track (itemRgb) colored by 'score_norm' (shades of red).

    - score_norm <= 0 or NaN -> lowest intensity (50), BED score 0
    - largest finite score_norm (and +inf) -> brightest red (255), BED score 1000

    Values are clipped at 0 and divided by the largest finite value. NaN
    values are treated as 0 so the integer casts below cannot fail on missing
    values.

    Args:
        df: table with columns chrom, start_hg38, end_hg38, cg_unit_pair_id
            and score_norm (a missing column surfaces as KeyError).
        experiment_codition: label appended to the track name and description.
        out_path: file to (over)write; a 'track' line followed by the rows.
    """
    score_clipped = df["score_norm"].astype(float).clip(lower=0)

    # Scale by the largest *finite* value: an infinite one would otherwise
    # drive every other row to 0 and itself to NaN (inf / inf).
    max_score = score_clipped[np.isfinite(score_clipped)].max()
    if max_score == 0 or np.isnan(max_score):
        max_score = 1.0

    # +inf clips to 1; NaN survives the clip and is mapped to 0.
    norm = (score_clipped / max_score).clip(upper=1).fillna(0.0)

    # 50..255 so low scores still visible
    min_intensity = 50
    red = (min_intensity + norm * (255 - min_intensity)).round().astype(int).to_numpy()

    itemRgb = [f"{ri},0,0" for ri in red]

    start = df["start_hg38"].astype(int)
    end = df["end_hg38"].astype(int)

    bed_score = (norm * 1000).round().astype(int)

    bed9 = pd.DataFrame(
        {
            "chrom": df["chrom"],
            "start": start,
            "end": end,
            "name": df["cg_unit_pair_id"].astype(str),
            "score": bed_score,
            "strand": ["."] * len(df),
            "thickStart": start,
            "thickEnd": end,
            "itemRgb": itemRgb,
        }
    )

    track_line = (
        'track name="DMR_LMLR_norm_score_red_gradient_'+experiment_codition+'" '
        'description="DMR colored by LMLR scores normalized (red gradient, max score = brightest red) '+experiment_codition+'" '
        'visibility=2 itemRgb="On"\n'
    )

    # Write the track header first, then append the rows to the same file.
    with open(out_path, "w") as f:
        f.write(track_line)
    bed9.to_csv(out_path, sep="\t", header=False, index=False, mode="a")
    print(f"Written score-colored BED to: {out_path}")


def make_LMLR_score_only_bed(df: pd.DataFrame,
                        experiment_codition: str,
                        out_path: Path) -> None:
    """
    BED5 (no color): chrom, start_hg38, end_hg38, cg_unit_pair_id, score

    The score is clipped at 0 and truncated to an integer. Non-finite scores
    are made representable first: NaN is written as 0 and +inf as the largest
    finite score in the table, so the integer cast cannot fail.
    ``experiment_codition`` is accepted for signature parity with the other
    writers but is not used in the file contents.
    """
    score = df["score"].astype(float).clip(lower=0)

    # Cap +inf at the largest finite score (0 if there is none); for finite
    # values this clip is a no-op.
    finite_max = score[np.isfinite(score)].max()
    if np.isnan(finite_max):
        finite_max = 0.0
    score = score.clip(upper=finite_max).fillna(0.0).astype(int)

    bed = pd.DataFrame(
        {
            "chrom": df["chrom"],
            "start": df["start_hg38"].astype(int),
            "end": df["end_hg38"].astype(int),
            "name": df["cg_unit_pair_id"].astype(str),
            "score": score,
        }
    )
    bed.to_csv(out_path, sep="\t", header=False, index=False)
    print(f"Written score-only BED to: {out_path}")


# def main():
# Load the configured DMR table once, then emit every BED flavour from it.
df = load_dmr_table(INPUT_CSV)

# (writer, output-file prefix) pairs; each file is named
# <prefix><experiment_codition>.bed inside OUT_DIR and written in this order.
for _writer, _prefix in (
    (make_cg_id_bed, "dmr_basic_"),
    (make_track_effectsize_color_bed, "ucsc_dmr_effect_size_bright_"),
    (make_LMLR_score_only_bed, "dmrs_LMLR_score_only_"),
    (make_track_LMLR_score_color_bed, "ucsc_dmr_LMLR_score_red_gradient_"),
    (make_track_LMLR_norm_score_color_bed, "ucsc_dmr_LMLR_norm_score_red_gradient_"),
):
    _writer(df, experiment_codition, OUT_DIR / f"{_prefix}{experiment_codition}.bed")

# if __name__ == "__main__":
#     main()


# TODO: add today's date to the output file names and move the exact file paths inside the functions

In [None]:
df

In [None]:
experiment_codition

In [None]:
 # Preview the basic-BED output path, using the same "dmr_basic_" prefix the file is written with.
 OUT_DIR / f"dmr_basic_{experiment_codition}.bed"

In [None]:
OUT_DIR

In [None]:
# === CONFIG ===

# Label appended to output file names and UCSC track names/descriptions.
experiment_codition = "T_CRoff_vs_Unedit_D6_filtered_mC07"


# Input: T cell day-6 DMR table
# "20251121 Dmr Filtered Day6 CRoff Vs Unedit.csv"
dmr_dir_path = '/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/filtered/new_dmr_output/'
dmr_T_CRoff_vs_Unedit_D6_filtered_mC07_path = dmr_dir_path + '20251121_dmr_filtered_day6_CRoff_vs_Unedit.csv'
INPUT_CSV = dmr_T_CRoff_vs_Unedit_D6_filtered_mC07_path

# Copy of the table with the first CSV column as its index, kept under its own
# name for inspection; the track writers below use `df` instead.
dmr_T_CRoff_vs_Unedit_D6_filtered_mC07 = pd.read_csv(INPUT_CSV, index_col=0)

# Output folder (can be "." for current directory)
OUT_DIR = Path(dmr_dir_path) / "ucsc_generate_tracks"
OUT_DIR.mkdir(exist_ok=True)


df = load_dmr_table(INPUT_CSV)

# (writer, output-file prefix) pairs; each file is named
# <prefix><experiment_codition>.bed inside OUT_DIR and written in this order.
for _writer, _prefix in (
    (make_cg_id_bed, "dmr_basic_"),
    (make_track_effectsize_color_bed, "ucsc_dmr_effect_size_bright_"),
    (make_LMLR_score_only_bed, "dmrs_LMLR_score_only_"),
    (make_track_LMLR_score_color_bed, "ucsc_dmr_LMLR_score_red_gradient_"),
    (make_track_LMLR_norm_score_color_bed, "ucsc_dmr_LMLR_norm_score_red_gradient_"),
):
    _writer(df, experiment_codition, OUT_DIR / f"{_prefix}{experiment_codition}.bed")


# day 35 T cell CRoff

In [None]:
# === CONFIG ===

# Label appended to output file names and UCSC track names/descriptions.
experiment_codition = "T_CRoff_vs_Unedit_D35_filtered_mC07"


# Input: T cell day-35 DMR table.
# NOTE(review): this directory is spelled 'mc_07_filtered', whereas the day-6
# cell uses 'mc_07/filtered' — confirm the path is intended.
dmr_dir_path = '/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/'
dmr_T_CRoff_vs_Unedit_D35_filtered_mC07_path = dmr_dir_path + '20251128_dmr_day35_CRoff_vs_Unedit.csv'

INPUT_CSV = dmr_T_CRoff_vs_Unedit_D35_filtered_mC07_path

# Copy of the day-35 table with the first CSV column as its index, kept under
# its own name for inspection. (This cell previously re-read the day-6 CSV
# here, a leftover from the day-6 cell.)
dmr_T_CRoff_vs_Unedit_D35_filtered_mC07 = pd.read_csv(dmr_T_CRoff_vs_Unedit_D35_filtered_mC07_path, index_col=0)

# Output folder (can be "." for current directory)
OUT_DIR = Path(dmr_dir_path) / "ucsc_generate_tracks"
OUT_DIR.mkdir(parents=True, exist_ok=True)


df = load_dmr_table(INPUT_CSV)

# (writer, output-file prefix) pairs; each file is named
# <prefix><experiment_codition>.bed inside OUT_DIR and written in this order.
for _writer, _prefix in (
    (make_cg_id_bed, "dmr_basic_"),
    (make_track_effectsize_color_bed, "ucsc_dmr_effect_size_bright_"),
    (make_LMLR_score_only_bed, "dmrs_LMLR_score_only_"),
    (make_track_LMLR_score_color_bed, "ucsc_dmr_LMLR_score_red_gradient_"),
    (make_track_LMLR_norm_score_color_bed, "ucsc_dmr_LMLR_norm_score_red_gradient_"),
):
    _writer(df, experiment_codition, OUT_DIR / f"{_prefix}{experiment_codition}.bed")
