In [9]:
import os, json, gzip
import numpy as np
import xgboost as xgb
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import GroupShuffleSplit


def parse_json_line(obj):
    (tid, tdata), = obj.items()
    (pos_key, contexts), = tdata.items()
    pos = int(pos_key) if isinstance(pos_key, str) else pos_key
    (ctx7, reads), = contexts.items()
    arr = np.asarray(reads, dtype=float)
    return tid, pos, ctx7, arr

BASE_IDX = {"A":0, "C":1, "G":2, "T":3}
def onehot28(seq7: str) -> np.ndarray:
    out = np.zeros((7,4), dtype=np.int8)
    s = (seq7 or "").upper()
    for i in range(min(7, len(s))):
        j = BASE_IDX.get(s[i], -1)
        if j >= 0:
            out[i, j] = 1
    return out.ravel()

def aggregate_9(arr: np.ndarray) -> np.ndarray:
    if arr.size == 0:
        return np.zeros(45, dtype=np.float32)
    mean = arr.mean(axis=0)
    std  = arr.std(axis=0, ddof=0)
    mn   = arr.min(axis=0)
    mx   = arr.max(axis=0)
    med  = np.median(arr, axis=0)
    return np.concatenate([mean, std, mn, mx, med]).astype(np.float32, copy=False)

NUM_COLS = [
    "dwell_m1","sd_m1","mean_m1",
    "dwell_0","sd_0","mean_0",
    "dwell_p1","sd_p1","mean_p1",
]
FEATURE_NAMES = (
    [f"mean_{c}"   for c in NUM_COLS] +
    [f"std_{c}"    for c in NUM_COLS] +
    [f"min_{c}"    for c in NUM_COLS] +
    [f"max_{c}"    for c in NUM_COLS] +
    [f"median_{c}" for c in NUM_COLS] +
    [f"ctx_{i}"    for i in range(28)]
)


def build_dataset_from_json_objects(json_objects, label_dict, transcript_to_gene):
    """
    json_objects: iterable of parsed per-line dicts
    label_dict: {(transcript_id, position): 0/1}
    Returns X (N,73), y (N,), plus ids for later mapping.
    """
    X_rows, y_rows, ids = [], [], []
    for obj in json_objects:
        tid, pos, ctx7, arr = parse_json_line(obj)
        feats45 = aggregate_9(arr)
        ctx28   = onehot28(ctx7)
        X_rows.append(np.concatenate([feats45, ctx28]))
        y_rows.append(label_dict.get((tid, int(pos)), None))
        gene = transcript_to_gene.get((tid, None))
        ids.append((gene, tid, int(pos)))
    X = np.asarray(X_rows, dtype=np.float32)
    y = np.asarray(y_rows)
    return X, y, ids

def train_and_save_model(X, y, ids, model_dir="models"):
    mask = ~pd.isna(y)
    X_tr = X[mask]; y_tr = y[mask].astype(int)
    
    groups = np.array([
        str(gene) if gene is not None else tid
        for gene, tid, pos in ids
    ])[mask]

    gss = GroupShuffleSplit(test_size=0.3, random_state=42)
    train_idx, temp_idx = next(gss.split(X_tr, y_tr, groups=groups))

    gss2 = GroupShuffleSplit(test_size=0.5, random_state=42)
    val_idx, test_idx = next(gss2.split(X_tr[temp_idx], y_tr[temp_idx], groups=groups[temp_idx]))

    X_train, y_train = X_tr[train_idx], y_tr[train_idx]
    X_val,   y_val   = X_tr[temp_idx][val_idx],  y_tr[temp_idx][val_idx]
    X_test,  y_test  = X_tr[temp_idx][test_idx], y_tr[temp_idx][test_idx]

    train_genes = set(groups[train_idx])
    val_genes   = set(groups[temp_idx][val_idx])
    test_genes  = set(groups[temp_idx][test_idx])
    assert len(train_genes & val_genes) == 0, "Gene overlap between train and val!"
    assert len(train_genes & test_genes) == 0, "Gene overlap between train and test!"
    assert len(val_genes & test_genes) == 0, "Gene overlap between val and test!"
    print(f"Split complete: {len(train_genes)} train genes, {len(val_genes)} val genes, {len(test_genes)} test genes")

    clf = xgb.XGBClassifier(
        n_estimators=800,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="aucpr",
        scale_pos_weight=20,
        random_state=42,
        n_jobs=os.cpu_count(), 
    )
    clf.fit(X_train, y_train)

    val_p = clf.predict_proba(X_val)[:,1]
    tst_p = clf.predict_proba(X_test)[:,1]
    print("VAL  AUPRC:", average_precision_score(y_val, val_p), "AUROC:", roc_auc_score(y_val, val_p))
    print("TEST AUPRC:", average_precision_score(y_test, tst_p), "AUROC:", roc_auc_score(y_test, tst_p))

    Path(model_dir).mkdir(parents=True, exist_ok=True)
    clf.save_model(Path(model_dir) / "xgb_model.json")
    with open(Path(model_dir) / "metadata.json", "w") as f:
        json.dump({
            "feature_names": FEATURE_NAMES,
            "threshold": 0.5
        }, f, indent=2)
    return clf

def iter_json_lines(path: str):
    """Stream NDJSON from .json or .json.gz"""
    p = Path(path)
    if p.suffix == ".gz":
        with gzip.open(p, "rt", encoding="utf-8", errors="replace") as f:
            for line in f:
                s = line.strip()
                if s:
                    yield json.loads(s)
    else:
        with open(p, "r", encoding="utf-8") as f:
            for line in f:
                s = line.strip()
                if s:
                    yield json.loads(s)

In [10]:
df_labels = pd.read_csv("data_task1/data.info.labelled.csv")
label_dict = { (r.transcript_id, int(r.transcript_position)): int(r.label) for r in df_labels.itertuples(index=False)}
transcript_to_gene = dict(zip(df_labels['transcript_id'], df_labels['gene_id']))

train_json_iter = iter_json_lines("data_task1/dataset0.json")
X, y, ids = build_dataset_from_json_objects(train_json_iter, label_dict, transcript_to_gene)

model = train_and_save_model(X, y, ids, model_dir="models")

Split complete: 3733 train genes, 800 val genes, 800 test genes
VAL  AUPRC: 0.5373957333304621 AUROC: 0.9218604092017936
TEST AUPRC: 0.45298139823266814 AUROC: 0.9269070576824326


In [11]:
X.shape

(121838, 73)

#### Sanity Check
- generate predictions for entire dataset
- check prediction quality 

In [17]:
def predict_dataset(model, data_path, label_path=None, output_csv="predictions/predictions_dataset0.csv"):
    json_iter = iter_json_lines(data_path)
    X, y, ids = build_dataset_from_json_objects(json_iter, label_dict,transcript_to_gene)

    preds = model.predict_proba(X)[:, 1]

    pred_df = pd.DataFrame({
        "transcript_id": [tid for gene, tid, pos in ids],
        "transcript_position": [pos for gene, tid, pos in ids],
        "score": preds
    })
    pred_df.to_csv(output_csv, index=False)
    print(f"Predictions saved to {output_csv}")

    if label_path:
        mask = ~pd.isna(y)
        y_true = y[mask].astype(int)
        y_pred = preds[mask]
        auprc = average_precision_score(y_true, y_pred)
        auroc = roc_auc_score(y_true, y_pred)
        print(f"[Sanity Check on labeled data]")
        print(f"AUPRC: {auprc:.4f}, AUROC: {auroc:.4f}")
        return pred_df, auprc, auroc

    return pred_df


In [18]:
data_path = "data_task1/dataset0.json"
label_path = "data_task1/data.info.labelled.csv"
predict_dataset(model, data_path, label_path)

Predictions saved to predictions/predictions_dataset0.csv
[Sanity Check on labeled data]
AUPRC: 0.8163, AUROC: 0.9764


(          transcript_id  transcript_position     score
 0       ENST00000000233                  244  0.005141
 1       ENST00000000233                  261  0.051818
 2       ENST00000000233                  316  0.006430
 3       ENST00000000233                  332  0.080532
 4       ENST00000000233                  368  0.008524
 ...                 ...                  ...       ...
 121833  ENST00000641834                 1348  0.864682
 121834  ENST00000641834                 1429  0.069511
 121835  ENST00000641834                 1531  0.919931
 121836  ENST00000641834                 1537  0.072776
 121837  ENST00000641834                 1693  0.031175
 
 [121838 rows x 3 columns],
 np.float64(0.8162775099629662),
 np.float64(0.9763526120453562))

#### Fine Tuning