## Converting the data to CSV format

In [None]:
# Do NOT run this cell -- there is no need to!

import os, json, csv, gzip
from pathlib import Path

# Root directory scanned by main(); process_sample_dir() reads "<sample>/data.json" beneath it.
BASE = Path("data/m6Anet")
# Destination directory for the per-sample CSV files.
# NOTE(review): the preprocessing cell reads its input from "data/task2", not from
# this path -- confirm both locations refer to the same files (e.g. via a symlink).
OUTDIR = Path("/mnt/sdd/fast/task2")
# Create the output directory (and any missing parents); no error if it already exists.
OUTDIR.mkdir(parents=True, exist_ok=True)

def iter_json_lines(path: Path):
    """Stream parsed JSON objects from a newline-delimited JSON file.

    A path whose final suffix is ``.gz`` is read through gzip in text mode
    (undecodable bytes are replaced); any other path is opened as plain
    UTF-8 text. Blank and whitespace-only lines are skipped.
    """
    if path.suffix == ".gz":
        handle = gzip.open(path, "rt", encoding="utf-8", errors="replace")
    else:
        handle = open(path, "r", encoding="utf-8")
    with handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue
            yield json.loads(text)

def flatten_record(rec):
    """
    Expand one nested record into flat per-read rows.

    Expected shape of ``rec`` (exactly one key at every nesting level;
    anything else raises ValueError when the generator is consumed):
      { transcript_id : { position : { context7 : [ [9 features], ... ] } } }
    Each yielded row is:
      (transcript_id, position, context7, read_idx, <9 feature values>)
    A position key given as a string is converted to int; other key types
    are passed through unchanged.
    """
    [(tid, by_position)] = rec.items()
    [(raw_pos, by_context)] = by_position.items()
    position = int(raw_pos) if isinstance(raw_pos, str) else raw_pos
    [(ctx7, reads)] = by_context.items()
    for read_idx, features in enumerate(reads):
        # Index explicitly (rather than slicing) so a short feature vector
        # still raises IndexError.
        yield (tid, position, ctx7, read_idx, *(features[k] for k in range(9)))

# Column names of the per-read CSV: the sample name, the four identifying
# fields produced by flatten_record(), then the nine feature values in the
# order they appear in each read's feature vector.
# NOTE(review): the m1 / 0 / p1 suffixes presumably denote the -1 / 0 / +1
# positions around the site -- confirm against the data documentation.
HEADER = (
    ["sample", "transcript_id", "position", "context7", "read_idx"]
    + ["dwell_m1", "sd_m1", "mean_m1"]
    + ["dwell_0", "sd_0", "mean_0"]
    + ["dwell_p1", "sd_p1", "mean_p1"]
)

def process_sample_dir(sample_dir: Path):
    """Convert ``<sample_dir>/data.json`` into ``OUTDIR/<sample name>.csv``.

    The output starts with HEADER and then holds one row per read: the
    sample name (the directory name) followed by the fields yielded by
    flatten_record() for every record in the input file.

    Raises:
        FileNotFoundError: if ``data.json`` is missing. The check runs
            before the output file is opened, so a missing input no longer
            leaves a header-only CSV behind.
    """
    in_path = sample_dir / "data.json"
    if not in_path.is_file():
        raise FileNotFoundError(f"{in_path} not found")

    sample_name = sample_dir.name
    out_path = OUTDIR / f"{sample_name}.csv"
    print(f"[write] {out_path}  <=  {in_path}")

    # newline="" lets the csv module control line endings itself.
    with open(out_path, "w", newline="", encoding="utf-8") as fout:
        w = csv.writer(fout)
        w.writerow(HEADER)
        for rec in iter_json_lines(in_path):
            # One writerows() call per record instead of one writerow() per read.
            w.writerows((sample_name, *row) for row in flatten_record(rec))

def main():
    """Convert every sample directory directly under BASE, in sorted order.

    Exits via SystemExit when BASE does not exist. Entries of BASE that are
    not directories are ignored.
    """
    if not BASE.exists():
        raise SystemExit(f"{BASE} not found")
    # filter() is lazy, so each entry is checked right before it is processed.
    for sample_dir in filter(Path.is_dir, sorted(BASE.iterdir())):
        process_sample_dir(sample_dir)

# Run the conversion only when this code executes as the top-level module.
if __name__ == "__main__":
    main()


[write] /mnt/sdd/fast/task2/SGNex_A549_directRNA_replicate5_run1.csv  <=  data/m6Anet/SGNex_A549_directRNA_replicate5_run1/data.json
[write] /mnt/sdd/fast/task2/SGNex_A549_directRNA_replicate6_run1.csv  <=  data/m6Anet/SGNex_A549_directRNA_replicate6_run1/data.json
[write] /mnt/sdd/fast/task2/SGNex_Hct116_directRNA_replicate3_run1.csv  <=  data/m6Anet/SGNex_Hct116_directRNA_replicate3_run1/data.json
[write] /mnt/sdd/fast/task2/SGNex_Hct116_directRNA_replicate3_run4.csv  <=  data/m6Anet/SGNex_Hct116_directRNA_replicate3_run4/data.json
[write] /mnt/sdd/fast/task2/SGNex_Hct116_directRNA_replicate4_run3.csv  <=  data/m6Anet/SGNex_Hct116_directRNA_replicate4_run3/data.json
[write] /mnt/sdd/fast/task2/SGNex_HepG2_directRNA_replicate5_run2.csv  <=  data/m6Anet/SGNex_HepG2_directRNA_replicate5_run2/data.json
[write] /mnt/sdd/fast/task2/SGNex_HepG2_directRNA_replicate6_run1.csv  <=  data/m6Anet/SGNex_HepG2_directRNA_replicate6_run1/data.json
[write] /mnt/sdd/fast/task2/SGNex_K562_directRNA_repl

## Preprocessing the data exactly

In [None]:
# Do NOT run this cell either -- there is no need to!

import os
from pathlib import Path

# Polars reads POLARS_MAX_THREADS when its thread pool is created, so the
# variable must be set BEFORE polars is imported; setting it afterwards has
# no effect. os.cpu_count() may return None, in which case the variable is
# left alone rather than set to the literal string "None".
if os.cpu_count() is not None:
    os.environ["POLARS_MAX_THREADS"] = str(os.cpu_count())

import polars as pl  # noqa: E402  (must follow the environment setup above)
from tqdm.auto import tqdm  # noqa: E402

# Directory holding the per-read CSV files to aggregate.
IN_DIR  = Path("data/task2")
# Directory receiving the per-site feature tables; created if missing.
OUT_DIR = Path("data/processed_data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# The nine per-read feature columns that process_one_csv() casts to float
# and aggregates per site.
FEATURE_COLS = (
    ["dwell_m1", "sd_m1", "mean_m1"]
    + ["dwell_0", "sd_0", "mean_0"]
    + ["dwell_p1", "sd_p1", "mean_p1"]
)

def ctx_onehot_expr():
    """Build the 28 one-hot expressions for the ``context7`` column.

    For each of the 7 character positions and each base in "ACGT"
    (compared after upper-casing), the expression is 1 when the character
    at that position equals the base and 0 otherwise, stored as Int8.
    Columns are named ``ctx_{position * 4 + base_index}``, i.e.
    ``ctx_0`` .. ``ctx_27``.
    """
    bases = "ACGT"
    onehot = []
    for pos in range(7):
        char_at_pos = pl.col("context7").str.slice(pos, 1).str.to_uppercase()
        onehot.extend(
            (char_at_pos == base).cast(pl.Int8).alias(f"ctx_{pos * 4 + base_idx}")
            for base_idx, base in enumerate(bases)
        )
    return onehot

def process_one_csv(in_path: Path, out_dir: Path = OUT_DIR, to_parquet: bool = False, drop_all_null_sites: bool = True):
    """Aggregate one per-read CSV into per-site features and write them out.

    Reads are grouped by (sample, transcript_id, position, context7). For
    every column in FEATURE_COLS the group's mean, population standard
    deviation (ddof=0), min, max and median are computed, plus
    ``read_count``; the one-hot context columns from ctx_onehot_expr() are
    appended afterwards.

    Args:
        in_path: Per-read CSV file; its stem is used as the sample name in
            the output file name.
        out_dir: Directory receiving the output file.
        to_parquet: Write ``<sample>_site_features.parquet`` when True,
            otherwise ``<sample>_site_features.csv``.
        drop_all_null_sites: When True, drop sites for which every feature
            column has no usable value (all nine per-feature means are
            null). This flag used to be accepted but ignored.

    Returns:
        Tuple ``(sample, number_of_site_rows, out_path)``.
    """
    sample = in_path.stem

    scan = (
        pl.scan_csv(
            in_path,
            has_header=True,
            # 0 disables type inference: columns are read as strings unless
            # listed in schema_overrides.
            infer_schema_length=0,
            schema_overrides={
                "sample": pl.Utf8,
                "transcript_id": pl.Utf8,
                "position": pl.Int64,
                "context7": pl.Utf8,
            },
            null_values=[""],
        )
        .select(["sample","transcript_id","position","context7","read_idx", *FEATURE_COLS])
        # strict=False: values that cannot be parsed as floats become null
        # instead of raising.
        .with_columns([pl.col(FEATURE_COLS).cast(pl.Float64, strict=False)])
    )

    keys = ["sample","transcript_id","position","context7"]

    aggs = []
    for c in FEATURE_COLS:
        aggs += [
            pl.col(c).mean().alias(f"mean_{c}"),
            pl.col(c).std(ddof=0).alias(f"std_{c}"),
            pl.col(c).min().alias(f"min_{c}"),
            pl.col(c).max().alias(f"max_{c}"),
            pl.col(c).median().alias(f"median_{c}"),
        ]
    aggs += [pl.len().alias("read_count")]

    site_lazy = scan.group_by(keys).agg(aggs)

    if drop_all_null_sites:
        # A feature's mean is null exactly when the group has no non-null
        # value for it, so keep a site if at least one mean is present.
        site_lazy = site_lazy.filter(
            pl.any_horizontal(*[pl.col(f"mean_{c}").is_not_null() for c in FEATURE_COLS])
        )

    site = (
        site_lazy
            .with_columns(ctx_onehot_expr())
            # group_by does not guarantee row order; sort for reproducible output.
            .sort(keys)
            .collect()
    )

    if to_parquet:
        out_path = out_dir / f"{sample}_site_features.parquet"
        site.write_parquet(out_path)
    else:
        out_path = out_dir / f"{sample}_site_features.csv"
        site.write_csv(out_path)
    return sample, site.height, out_path

# Collect every per-read CSV in IN_DIR, in sorted (deterministic) order.
files = sorted(IN_DIR.glob("*.csv"))
print(f"Found {len(files)} files in {IN_DIR}")
# One (sample, site_row_count, out_path) tuple per processed file.
results = []
for p in tqdm(files, desc="Processing files"):
    # Always write CSV output; the prediction step globs "*.csv" in OUT_DIR.
    results.append(process_one_csv(p, OUT_DIR, to_parquet=False))
# Bare expression: in a notebook this displays the list as the cell's output.
results


## Using the model to predict

In [3]:
import json
from pathlib import Path
import pandas as pd
import xgboost as xgb

# Saved XGBoost model and its accompanying metadata file.
MODEL_PATH = Path("models/xgb_model.json")
META_PATH  = Path("models/metadata.json")
# Per-site feature CSVs to score, and the directory receiving the score files.
IN_DIR     = Path("data/processed_data")
OUT_DIR    = Path("results")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Create an empty classifier and load the saved model into it.
model = xgb.XGBClassifier()
model.load_model(MODEL_PATH)

with META_PATH.open() as f:
    meta = json.load(f)
# Names of the input columns selected for prediction (required metadata key).
FEATURES  = meta["feature_names"]
# Falls back to 0.5 when the metadata has no "threshold" entry.
# NOTE(review): THRESHOLD is not used anywhere in the visible code.
THRESHOLD = meta.get("threshold", 0.5)

def get_pos_series(df: pd.DataFrame) -> pd.Series:
    """Return the position column of ``df`` under the name ``transcript_position``.

    A ``transcript_position`` column is returned as-is when present;
    otherwise ``position`` is returned, renamed to ``transcript_position``.

    Raises:
        KeyError: if ``df`` has neither column.
    """
    target = "transcript_position"
    available = df.columns
    if target in available:
        return df[target]
    if "position" not in available:
        raise KeyError("Need a 'transcript_position' or 'position' column in the input.")
    return df["position"].rename(target)

# Score every per-site feature CSV in IN_DIR and write one score file per input.
for csv_path in sorted(IN_DIR.glob("*.csv")):
    print(f"[predict] {csv_path.name}")
    df = pd.read_csv(csv_path)

    # Select the model inputs by the names stored in the metadata;
    # raises KeyError if the file lacks any of those columns.
    X = df[FEATURES]
    # Column 1 of predict_proba is the probability of the positive class.
    proba = model.predict_proba(X)[:, 1]

    # Output keeps only the site identifiers and the predicted score.
    out = pd.DataFrame({
        "transcript_id": df["transcript_id"],
        "transcript_position": get_pos_series(df),
        "score": proba,
    })

    # "<input stem>_scores.csv" inside OUT_DIR, written without the index column.
    out_path = OUT_DIR / f"{csv_path.stem}_scores.csv"
    out.to_csv(out_path, index=False)
    print(f"  -> wrote {out_path}")


[predict] SGNex_A549_directRNA_replicate5_run1_site_features.csv
  -> wrote results/SGNex_A549_directRNA_replicate5_run1_site_features_scores.csv
[predict] SGNex_A549_directRNA_replicate6_run1_site_features.csv
  -> wrote results/SGNex_A549_directRNA_replicate6_run1_site_features_scores.csv
[predict] SGNex_Hct116_directRNA_replicate3_run1_site_features.csv
  -> wrote results/SGNex_Hct116_directRNA_replicate3_run1_site_features_scores.csv
[predict] SGNex_Hct116_directRNA_replicate3_run4_site_features.csv
  -> wrote results/SGNex_Hct116_directRNA_replicate3_run4_site_features_scores.csv
[predict] SGNex_Hct116_directRNA_replicate4_run3_site_features.csv
  -> wrote results/SGNex_Hct116_directRNA_replicate4_run3_site_features_scores.csv
[predict] SGNex_HepG2_directRNA_replicate5_run2_site_features.csv
  -> wrote results/SGNex_HepG2_directRNA_replicate5_run2_site_features_scores.csv
[predict] SGNex_HepG2_directRNA_replicate6_run1_site_features.csv
  -> wrote results/SGNex_HepG2_directRNA_rep