# Myllia Model: ctrl-SVD + GenePT(m3) + external_sig + TFIDF(text)
Output: Weighted-output PCA -> KernelRidge(RBF)  
Hyperparam search: LOOCV proxy using official-metric math (approx)  
Writes a submission CSV at the end  

In [13]:
import os, glob, json, pickle, zipfile, urllib.request
import numpy as np
import pandas as pd
import scipy.sparse as sp

import anndata as ad
import scanpy as sc

from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.kernel_ridge import KernelRidge
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Global seed: seeds NumPy here and is passed as random_state to the
# SVD/PCA estimators created later in this file.
SEED = 6
np.random.seed(SEED)

# Input files: per-perturbation mean expression table, validation
# pert_id -> gene mapping, and the single-cell AnnData file.
MEANS_PATH = "data/training_data_means.csv"
VALMAP_PATH = "data/pert_ids_val.csv"
# NOTE(review): this path starts with "Data/" while the two above use
# "data/" — confirm the directory casing (matters on case-sensitive
# filesystems).
H5AD_PATH   = "Data/training_cells.h5ad"

# External resources. The spelling "emebdding" matches the archive name in
# GENEPT_ZENODO_URL below, so it must not be "corrected" here.
GENEPT_DIR = "external_genept"
GENEPT_SUBDIR = os.path.join(GENEPT_DIR, "GenePT_emebdding_v2")
SIG_NPZ = os.path.join("external_sig", "external_sig_k562_essential.npz")

# Download location of the GenePT archive and the local zip target path.
GENEPT_ZENODO_URL = "https://zenodo.org/records/10833191/files/GenePT_emebdding_v2.zip?download=1"
GENEPT_ZIP = os.path.join(GENEPT_DIR, "GenePT.zip")

In [5]:
# Embedding dims (number of components kept for each feature family)
CTRL_SVD_DIM     = 128
GENEPT_PCA_DIM   = 128
SIG_DIM_EXPECTED = None  # inferred from npz
TFIDF_SVD_DIM    = 128

# Control trimming: control cells whose nCount_RNA exceeds this quantile
# are dropped before the control SVD is fitted.
DEPTH_TRIM_Q = 0.99

# Search grids: number of output PCA components (K_out) and the
# KernelRidge alpha / RBF gamma values tried in the LOOCV search.
OUT_K_GRID    = [32, 48, 64, 80]
ALPHA_GRID    = [1e-4, 1e-3, 1e-2, 1e-1]
GAMMA_GRID    = [0.01, 0.02, 0.05, 0.1, 0.2]

# Post grids (best was scale=0.85, thr=0.0 from model 2)
# SCALE multiplies the raw prediction; THR is the soft-threshold level.
SCALE_GRID = [0.75, 0.80, 0.85, 0.90, 0.95, 1.00]
THR_GRID   = [0.0, 0.01, 0.02, 0.05, 0.08]

# Weight given to the model prediction versus the baseline mean.
# With the single value 1.0 the baseline contributes nothing.
BLEND_GRID = [1.0]

Metric Math

In [4]:
def _smoothstep(t):
    """Evaluate the cubic t^2 * (3 - 2t) element-wise (or on a scalar)."""
    squared = t * t
    return squared * (3.0 - 2.0 * t)

def _gate(x, left=0.0, right=0.2):
    """Map x to a smooth 0..1 ramp between ``left`` and ``right``.

    x is rescaled linearly so that ``left`` -> 0 and ``right`` -> 1, clipped
    to [0, 1], and then passed through ``_smoothstep``.
    """
    span = right - left
    ramp = np.clip((x - left) / span, 0.0, 1.0)
    return _smoothstep(ramp)

def weighted_cosine(pred_flat, true_flat, left=0.0, right=0.2, eps=1e-12):
    """Cosine similarity between two flattened arrays with gated weights.

    Each element is weighted by the square of ``_gate`` evaluated on
    max(|pred|, |true|) for that element. Returns 0.0 when the weighted
    norm product is below ``eps``; otherwise a Python float.
    """
    p = np.asarray(pred_flat, np.float64).ravel()
    t = np.asarray(true_flat, np.float64).ravel()

    magnitude = np.maximum(np.abs(p), np.abs(t))
    gate = _gate(magnitude, left, right)
    gate_sq = gate * gate

    dot = np.sum(gate_sq * p * t)
    norm_p = np.sqrt(np.sum(gate_sq * p * p))
    norm_t = np.sqrt(np.sum(gate_sq * t * t))
    den = norm_p * norm_t

    if den < eps:
        return 0.0
    return float(dot / den)

def official_score_arrays(y_true, y_pred, w, baseline_wmae, eps=1e-12, max_log2=5.0, cos_left=0.0, cos_right=0.2):
    """Combine a log2 weighted-MAE improvement with a gated cosine factor.

    For every row, the weighted mean absolute error of ``y_pred`` is compared
    with ``baseline_wmae`` through log2(baseline / prediction), capped at
    ``max_log2``. The per-row terms are summed and multiplied by
    max(0, weighted_cosine) computed over the flattened arrays.

    Both error vectors are floored at ``eps`` before the ratio is taken.
    Returns a Python float.
    """
    truth = np.asarray(y_true, np.float64)
    guess = np.asarray(y_pred, np.float64)
    weights = np.asarray(w, np.float64)
    base_err = np.asarray(baseline_wmae, np.float64)

    # Row-wise weighted MAE, floored to keep the ratio finite.
    model_err = np.maximum(np.mean(np.abs(truth - guess) * weights, axis=1), eps)
    base_err = np.maximum(base_err, eps)

    log_gain = np.minimum(np.log2(base_err / model_err), max_log2)
    total_gain = float(np.sum(log_gain))

    cos = weighted_cosine(guess.ravel(), truth.ravel(), left=cos_left, right=cos_right, eps=eps)
    return float(total_gain * max(0.0, cos))

def proxy_weights_from_deltas(D):
    """Approximate per-gene weights from the mean absolute delta per column.

    The column means of |D| are cast to float64 and divided by their own
    mean (plus 1e-12), so the returned weights average to ~1.
    """
    mean_abs = np.abs(D).mean(axis=0).astype(np.float64)
    return mean_abs / (mean_abs.mean() + 1e-12)

def proxy_score_oof(y_true, y_pred, base_pred_per_row, w_gene):
    """Score out-of-fold predictions with ``official_score_arrays``.

    The per-gene weight vector ``w_gene`` is repeated for every row, and the
    baseline weighted MAE is computed from ``base_pred_per_row`` with the
    same weights. ``y_true`` must be 2-D.
    """
    truth = np.asarray(y_true, np.float64)
    guess = np.asarray(y_pred, np.float64)
    baseline = np.asarray(base_pred_per_row, np.float64)

    n_rows, _ = truth.shape
    row_weights = np.repeat(w_gene[None, :], n_rows, axis=0)
    baseline_err = np.mean(np.abs(truth - baseline) * row_weights, axis=1)
    return official_score_arrays(truth, guess, row_weights, baseline_err)

def soft_threshold(x, t):
    """Shrink x toward zero by t; values with |x| <= t become zero.

    When ``t <= 0`` the input object is returned unchanged (no copy).
    """
    if t <= 0:
        return x
    shrunk_magnitude = np.maximum(np.abs(x) - t, 0.0)
    direction = np.sign(x)
    return direction * shrunk_magnitude


In [6]:
def load_genept_for_union(pkl_path, union_genes, pca_dim=128, seed=6):
    """Load GenePT vectors for ``union_genes`` and reduce them with PCA.

    Genes are looked up in the pickled mapping by upper-cased symbol. A gene
    counts as missing when it has no entry or when its vector length differs
    from the first vector found; missing genes receive the mean of all found
    vectors before PCA is fitted.

    Args:
        pkl_path: Path to a pickle file holding a gene -> vector mapping
            (anything with a ``.get`` method).
        union_genes: Sequence of gene symbols; output rows follow this order.
        pca_dim: Requested number of PCA components. It is capped by the
            embedding dimension and by the number of genes, since PCA cannot
            produce more components than min(n_samples, n_features).
        seed: ``random_state`` passed to PCA.

    Returns:
        Tuple ``(E_small, gene2, fb)``: the float32 PCA-reduced matrix, a dict
        from upper-cased gene symbol to its reduced row, and the mean reduced
        row (usable as a fallback for unknown genes).

    Raises:
        ValueError: If none of ``union_genes`` is present in the mapping.
    """
    print(f"[GenePT] Loading: {pkl_path}")
    # SECURITY: pickle.load can execute arbitrary code. Only open pickle
    # files obtained from a trusted source.
    with open(pkl_path, "rb") as f:
        d = pickle.load(f)

    n_genes = len(union_genes)
    dim = None
    E = None
    found = np.zeros(n_genes, dtype=bool)

    for i, g in enumerate(union_genes):
        v = d.get(g.upper(), None)
        if v is None:
            continue
        vv = np.asarray(v, dtype=np.float32).ravel()
        if dim is None:
            # The first vector found fixes the expected embedding dimension.
            dim = vv.shape[0]
            E = np.zeros((n_genes, dim), dtype=np.float32)
        if vv.shape[0] != dim:
            continue
        E[i] = vv
        found[i] = True

    if dim is None:
        raise ValueError("GenePT dict does not contain any of your union genes.")

    missing = int(n_genes - found.sum())

    # Track missing rows explicitly instead of testing for all-zero rows, so a
    # genuine all-zero embedding is not mistaken for a missing gene.
    fallback = E[found].mean(axis=0).astype(np.float32)
    E[~found] = fallback

    print(f"[GenePT] dim={dim} missing={missing}/{n_genes}")

    n_components = min(pca_dim, dim, n_genes)
    pca = PCA(n_components=n_components, random_state=seed)
    E_small = pca.fit_transform(E).astype(np.float32)
    gene2 = {g.upper(): E_small[i] for i, g in enumerate(union_genes)}
    fb = E_small.mean(axis=0).astype(np.float32)
    return E_small, gene2, fb

In [7]:
def build_or_load_tfidf_embeddings(union_genes_upper):
    """
    Builds TF-IDF char ngram embeddings from the provided JSON summaries.
    Caches to external_genept/tfidf_combined_svd128.npz

    If the cache file already exists it is loaded and returned as-is; it is
    not checked against ``union_genes_upper``, so delete the cache after
    changing the gene list.

    Args:
        union_genes_upper: Gene symbols used as lookup keys into the NCBI and
            NCBI+UniProt summary JSON files, and as keys of the returned map.

    Returns:
        Tuple ``(mp, fb, dim)``: dict from gene to its float32 embedding row,
        the mean embedding row, and the embedding width.

    Raises:
        FileNotFoundError: If there is no cache and either summary JSON is
            missing under ``GENEPT_SUBDIR``.
    """
    out_path = os.path.join(GENEPT_DIR, f"tfidf_combined_svd{TFIDF_SVD_DIM}.npz")
    if os.path.exists(out_path):
        # np.load on an .npz returns an NpzFile that keeps the archive open;
        # the context manager closes it once the arrays have been read.
        with np.load(out_path, allow_pickle=True) as z:
            genes = [str(x).upper() for x in z["genes"]]
            emb = z["emb"].astype(np.float32)
        mp = {g: emb[i] for i, g in enumerate(genes)}
        fb = emb.mean(axis=0).astype(np.float32)
        print("[TFIDF] Loaded cached:", out_path, "shape:", emb.shape)
        return mp, fb, emb.shape[1]

    n_path = os.path.join(GENEPT_SUBDIR, "NCBI_summary_of_genes.json")
    u_path = os.path.join(GENEPT_SUBDIR, "NCBI_UniProt_summary_of_genes.json")
    if not os.path.exists(n_path) or not os.path.exists(u_path):
        raise FileNotFoundError("Missing NCBI / UniProt summary JSONs under GenePT_emebdding_v2")

    print("[TFIDF] Building embeddings from JSON summaries")

    with open(n_path, "r", encoding="utf-8") as f:
        ncbi = json.load(f)
    with open(u_path, "r", encoding="utf-8") as f:
        unip = json.load(f)

    # One document per gene: both summaries joined; genes without any text
    # contribute an empty document and are counted in `miss`.
    texts = []
    miss = 0
    for g in union_genes_upper:
        tn = ncbi.get(g, "") or ""
        tu = unip.get(g, "") or ""
        t = (str(tn) + " " + str(tu)).strip()
        if not t:
            miss += 1
        texts.append(t)

    print("[TFIDF] empty summaries:", miss, "/", len(union_genes_upper))

    vectorizer = TfidfVectorizer(
        analyzer="char_wb",
        ngram_range=(3, 5),
        min_df=2,
        max_features=200_000,
    )
    X = vectorizer.fit_transform(texts)
    # Keep the component count strictly below both matrix dimensions.
    k = min(TFIDF_SVD_DIM, X.shape[0] - 1, X.shape[1] - 1)

    svd = TruncatedSVD(n_components=k, random_state=SEED)
    E = svd.fit_transform(X).astype(np.float32)

    # Zero-pad so the embedding width is always TFIDF_SVD_DIM.
    if E.shape[1] < TFIDF_SVD_DIM:
        pad = np.zeros((E.shape[0], TFIDF_SVD_DIM - E.shape[1]), dtype=np.float32)
        E = np.hstack([E, pad])

    np.savez_compressed(out_path, genes=np.array(union_genes_upper, dtype=object), emb=E)
    print("[TFIDF] wrote:", out_path, "shape:", E.shape, "explained:", float(svd.explained_variance_ratio_.sum()))

    mp = {g: E[i] for i, g in enumerate(union_genes_upper)}
    fb = E.mean(axis=0).astype(np.float32)
    return mp, fb, E.shape[1]

In [9]:
# Load the per-perturbation mean expression table; every column other than
# "pert_symbol" is treated as an output gene.
df_means = pd.read_csv(MEANS_PATH)
gene_cols = [c for c in df_means.columns if c != "pert_symbol"]

# Rows labelled "non-targeting" are the control baseline.
base_mask = df_means["pert_symbol"].astype(str) == "non-targeting"

# Only the first baseline row is used (.iloc[0] raises if there is none).
x_base = df_means.loc[base_mask, gene_cols].iloc[0].to_numpy(np.float32)

df_train = df_means.loc[~base_mask].reset_index(drop=True)
train_genes = df_train["pert_symbol"].astype(str).tolist()

# D: training targets as deltas from the baseline, one row per perturbation.
# D_mean: the average delta across training perturbations.
D = df_train[gene_cols].to_numpy(np.float32) - x_base[None, :]
D_mean = D.mean(axis=0).astype(np.float32)

# Validation mapping: pert_id -> perturbed gene symbol.
df_valmap = pd.read_csv(VALMAP_PATH)
val_map = dict(zip(df_valmap["pert_id"].astype(str), df_valmap["pert"].astype(str)))
val_genes = list(val_map.values())

# Sorted union of output genes and all perturbed genes (train + validation);
# this ordering indexes the gene embedding matrices built later.
union_genes = sorted(set(gene_cols) | set(train_genes) | set(val_genes))
union_upper = [g.upper() for g in union_genes]

print("Train perts:", len(train_genes))
print("Output genes:", len(gene_cols))
print("Union genes:", len(union_genes))

Train perts: 80
Output genes: 5127
Union genes: 5143


In [10]:
# Per-gene proxy weights from the training deltas, and their square roots
# rescaled to mean ~1. sqrtw is what multiplies the targets before PCA.
w_gene = proxy_weights_from_deltas(D)
sqrtw = np.sqrt(w_gene).astype(np.float32)
sqrtw = sqrtw / (np.mean(sqrtw) + 1e-12)

In [11]:
adata = ad.read_h5ad(H5AD_PATH)

# Keep only non-targeting (control) cells, then drop the cells whose
# nCount_RNA lies above the DEPTH_TRIM_Q quantile.
ctrl = adata[adata.obs["sgrna_symbol"].astype(str) == "non-targeting"].copy()
q = float(np.quantile(ctrl.obs["nCount_RNA"].values, DEPTH_TRIM_Q))
ctrl = ctrl[ctrl.obs["nCount_RNA"].values <= q].copy()

# Restrict and order the variables to union_genes.
# NOTE(review): presumably every union gene is a var name in the h5ad file;
# confirm, since selecting an absent name would fail here.
ctrl = ctrl[:, union_genes].copy()

# Scale every cell to a total of 1e4 counts.
sc.pp.normalize_total(ctrl, target_sum=1e4, inplace=True)

In [14]:
# Work on a private CSR copy of the normalized control matrix.
X = ctrl.X
if not sp.issparse(X):
    X = sp.csr_matrix(X)
X = X.tocsr(copy=True)
# log2(x + 1) applied to the stored entries only; implicit zeros stay zero,
# which agrees with log2(0 + 1) = 0.
X.data = np.log2(X.data + 1.0).astype(np.float32)

In [15]:
# Scale each gene column without centering (with_mean=False keeps the matrix
# sparse), then L2-normalize each cell row.
X = StandardScaler(with_mean=False).fit_transform(X)
X = normalize(X, norm="l2", axis=1)

svd_ctrl = TruncatedSVD(n_components=CTRL_SVD_DIM, random_state=SEED)
svd_ctrl.fit(X)

# Transposed SVD components: one row per gene, one column per component.
ctrl_gene_emb = svd_ctrl.components_.T.astype(np.float32)  # (n_union, CTRL_SVD_DIM)
gene2ctrl = {g.upper(): ctrl_gene_emb[i] for i, g in enumerate(union_genes)}
# Mean gene embedding, used when a gene has no entry in gene2ctrl.
ctrl_fallback = ctrl_gene_emb.mean(axis=0).astype(np.float32)

print("ctrl gene emb:", ctrl_gene_emb.shape)

ctrl gene emb: (5143, 128)


In [16]:
def find_genept_pickles():
    """Return the normalized path of the GenePT pickle to load.

    Looks for ``*.pickle`` and ``*.pkl`` files directly under
    ``GENEPT_SUBDIR``. The model_3 gene+protein pickle is preferred (matched
    case-insensitively on the file name); candidates are sorted so that the
    choice is deterministic when several files match. If no name matches, the
    largest candidate file is used.

    Raises:
        FileNotFoundError: If no pickle file exists under ``GENEPT_SUBDIR``.
    """
    # glob returns entries in arbitrary order; sort for a reproducible pick.
    cand = sorted(
        glob.glob(os.path.join(GENEPT_SUBDIR, "*.pickle"))
        + glob.glob(os.path.join(GENEPT_SUBDIR, "*.pkl"))
    )
    if not cand:
        raise FileNotFoundError(f"No GenePT pickle found under {GENEPT_SUBDIR}")

    # Prefer the model_3 gene+protein pickle
    m3 = next(
        (
            p for p in cand
            if "gene_protein_embedding_model_3" in os.path.basename(p).lower()
        ),
        None,
    )

    # Fallback if names differ
    if m3 is None:
        m3 = max(cand, key=os.path.getsize)
    return os.path.normpath(m3)

In [17]:
# Display which GenePT pickle the lookup resolves to on this machine.
find_genept_pickles()

'external_genept\\GenePT_emebdding_v2\\GenePT_gene_protein_embedding_model_3_text.pickle'

In [18]:
# Resolve the pickle path at run time. A hard-coded path with Windows
# backslash separators would not be found on POSIX systems.
genept_m3_pkl = find_genept_pickles()
# E_m3: PCA-reduced GenePT matrix; gene2m3: gene -> row; fb_m3: mean row.
E_m3, gene2m3, fb_m3 = load_genept_for_union(genept_m3_pkl, union_genes, pca_dim=GENEPT_PCA_DIM, seed=SEED)

print("[GenePT] m3 PCA emb:", E_m3.shape)

[GenePT] Loading: external_genept\GenePT_emebdding_v2\GenePT_gene_protein_embedding_model_3_text.pickle
[GenePT] dim=3072 missing=165/5143
[GenePT] m3 PCA emb: (5143, 128)


In [20]:
# Precomputed external signature embeddings: "genes" holds the row labels
# and "emb" the embedding matrix.
sig = np.load(SIG_NPZ, allow_pickle=True)
sig_genes = [str(x).upper() for x in sig["genes"]]
sig_emb = sig["emb"].astype(np.float32)

# Upper-cased label -> embedding row (a repeated label keeps its last row),
# plus the mean row used as fallback for genes without an entry.
sig_map = {g: sig_emb[i] for i, g in enumerate(sig_genes)}
sig_fb = sig_emb.mean(axis=0).astype(np.float32)

# NOTE(review): the recorded run printed 196239 rows and coverage 1 / 80, so
# almost every training gene receives sig_fb. Confirm that the labels stored
# in the npz are gene symbols comparable to pert_symbol.
print("[SIG] emb:", sig_emb.shape, "coverage on train:", sum(g.upper() in sig_map for g in train_genes), "/", len(train_genes))

[SIG] emb: (196239, 128) coverage on train: 1 / 80


TF-IDF text embedding from NCBI + UniProt summaries

In [21]:
# tf_map: gene -> TF-IDF/SVD text embedding; tf_fb: mean embedding used as
# fallback; tf_dim: embedding width.
tf_map, tf_fb, tf_dim = build_or_load_tfidf_embeddings(union_upper)
print("[TFIDF] dim:", tf_dim)

[TFIDF] Loaded cached: external_genept\tfidf_combined_svd128.npz shape: (5143, 128)
[TFIDF] dim: 128


In [22]:
def feat_for_gene(g):
    """Build the float32 feature vector for one perturbed gene.

    The gene symbol is upper-cased and looked up in the four embedding maps;
    each lookup falls back to that family's mean vector when the gene is
    absent. The parts are concatenated in a fixed order.
    """
    key = str(g).upper()
    parts = (
        gene2ctrl.get(key, ctrl_fallback),  # ctrl SVD
        gene2m3.get(key, fb_m3),            # GenePT m3 PCA
        sig_map.get(key, sig_fb),           # external perturb signature PCA
        tf_map.get(key, tf_fb),             # TFIDF-SVD text
    )
    return np.concatenate(parts, axis=0).astype(np.float32)

# Training feature matrix: one row per training perturbation, in the same
# order as train_genes (and therefore as the rows of D).
X_feat = np.vstack([feat_for_gene(g) for g in train_genes]).astype(np.float32)

In [23]:
# Standardize every feature column. The fitted scaler is kept because
# predict_delta applies the same transform to unseen genes.
# NOTE(review): the scaler is fitted on all training rows before the LOOCV
# loop, so held-out rows influence the scaling statistics used in each fold.
fscaler = StandardScaler(with_mean=True, with_std=True)
X_feat = fscaler.fit_transform(X_feat).astype(np.float32)

print("[X] train feature matrix:", X_feat.shape)

[X] train feature matrix: (80, 512)


In [24]:
# Targets in "weighted space": each gene column of D is multiplied by sqrtw,
# so PCA on Dw emphasizes genes with larger proxy weight.
Dw = (D * sqrtw[None, :]).astype(np.float32)

In [25]:
def loocv_raw_preds(K_out, alpha, gamma):
    """Leave-one-out predictions for one (K_out, alpha, gamma) setting.

    For every training perturbation, PCA and an RBF KernelRidge are fitted
    on the remaining rows of ``Dw`` / ``X_feat`` and used to predict the
    held-out row. The mean of the remaining rows is recorded as the
    baseline for that row.

    Returns:
        Tuple ``(raw, base)`` of float32 arrays shaped like ``Dw``, both
        converted from weighted space back to the original output space by
        dividing by ``sqrtw``.
    """
    n_perts = Dw.shape[0]
    preds_w = np.zeros_like(Dw, dtype=np.float32)
    means_w = np.zeros_like(Dw, dtype=np.float32)
    positions = np.arange(n_perts)

    for held_out in range(n_perts):
        keep = positions != held_out

        targets = Dw[keep]
        means_w[held_out] = targets.mean(axis=0).astype(np.float32)

        # The component count can never exceed (number of fitted rows - 1).
        reducer = PCA(n_components=min(K_out, targets.shape[0] - 1), random_state=SEED)
        coords = reducer.fit_transform(targets)

        model = KernelRidge(alpha=alpha, kernel="rbf", gamma=gamma)
        model.fit(X_feat[keep], coords)

        coords_hat = model.predict(X_feat[held_out:held_out + 1])
        preds_w[held_out] = reducer.inverse_transform(coords_hat)[0].astype(np.float32)

    # Convert weighted space back to original output space
    raw = (preds_w / (sqrtw[None, :] + 1e-12)).astype(np.float32)
    base = (means_w / (sqrtw[None, :] + 1e-12)).astype(np.float32)
    return raw, base

# Best grid-search result so far (dict of hyperparameters plus proxy score).
# The search loop only overwrites it with a strictly better candidate, so it
# must be reset to None before the search is re-run.
best = None

In [26]:
# Exhaustive search. The expensive LOOCV pass runs once per
# (K_out, alpha, gamma); the post-processing settings (scale, thr, blend)
# are then evaluated on those same out-of-fold predictions.
for K_out in OUT_K_GRID:
    for alpha in ALPHA_GRID:
        for gamma in GAMMA_GRID:
            raw, base = loocv_raw_preds(K_out, alpha, gamma)

            for scale in SCALE_GRID:
                for thr in THR_GRID:
                    for blend in BLEND_GRID:
                        # scale -> soft-threshold -> blend with the baseline
                        pred = soft_threshold((scale * raw).astype(np.float32), thr).astype(np.float32)
                        pred = ((1.0 - blend) * base + blend * pred).astype(np.float32)

                        s = proxy_score_oof(D, pred, base, w_gene)
                        item = dict(
                            K_out=K_out,
                            alpha=float(alpha),
                            gamma=float(gamma),
                            scale=float(scale),
                            thr=float(thr),
                            blend=float(blend),
                            proxy=float(s),
                        )
                        # Keep the incumbent unless this one is strictly better.
                        if best is not None and not (item["proxy"] > best["proxy"]):
                            continue
                        best = item
                        print("Best", best)

print("\nFinal best:", best)

Best {'K_out': 32, 'alpha': 0.0001, 'gamma': 0.01, 'scale': 0.75, 'thr': 0.0, 'blend': 1.0, 'proxy': 0.5981081592545646}
Best {'K_out': 48, 'alpha': 0.0001, 'gamma': 0.01, 'scale': 0.75, 'thr': 0.0, 'blend': 1.0, 'proxy': 0.6004260299347904}
Best {'K_out': 64, 'alpha': 0.0001, 'gamma': 0.01, 'scale': 0.75, 'thr': 0.0, 'blend': 1.0, 'proxy': 0.6014028329746118}

Final best: {'K_out': 64, 'alpha': 0.0001, 'gamma': 0.01, 'scale': 0.75, 'thr': 0.0, 'blend': 1.0, 'proxy': 0.6014028329746118}


In [27]:
# Unpack the winning hyperparameters from the grid search. scale, thr and
# blend are read as globals by predict_delta.
K_out = int(best["K_out"])
alpha = float(best["alpha"])
gamma = float(best["gamma"])
scale = float(best["scale"])
thr   = float(best["thr"])
blend = float(best["blend"])

# Refit on all training perturbations: PCA of the weighted deltas, then
# kernel ridge from gene features to the PCA coordinates.
pca = PCA(n_components=min(K_out, Dw.shape[0] - 1), random_state=SEED)
C = pca.fit_transform(Dw).astype(np.float32)

reg = KernelRidge(alpha=alpha, kernel="rbf", gamma=gamma)
reg.fit(X_feat, C)

def predict_delta(gene_symbol):
    """Predict the float32 expression delta vector for one perturbed gene.

    Pipeline: gene features -> fitted feature scaler -> kernel ridge ->
    inverse PCA -> division by ``sqrtw`` to leave weighted space -> scale,
    soft-threshold and blend with ``D_mean`` using the selected settings.
    """
    feats = feat_for_gene(gene_symbol)[None, :].astype(np.float32)
    feats = fscaler.transform(feats).astype(np.float32)

    coords = reg.predict(feats)
    weighted = pca.inverse_transform(coords)[0].astype(np.float32)
    delta = (weighted / (sqrtw + 1e-12)).astype(np.float32)

    # Same post-processing order as in the grid search.
    delta = soft_threshold((scale * delta).astype(np.float32), thr).astype(np.float32)
    return ((1.0 - blend) * D_mean + blend * delta).astype(np.float32)

In [28]:
# Build one submission row for each of pert_1 .. pert_120. Ids that are not
# present in val_map fall back to the mean training delta.
rows = []
for pert_id in [f"pert_{k}" for k in range(1, 121)]:
    g = val_map.get(pert_id)
    delta = D_mean if g is None else predict_delta(g)

    # tolist() yields plain Python floats, one per output gene column.
    row = {"pert_id": pert_id}
    row.update(zip(gene_cols, np.asarray(delta, dtype=np.float64).tolist()))
    rows.append(row)

sub = pd.DataFrame(rows, columns=["pert_id"] + gene_cols)

out_path = "model_full_external_ctrl_genept_sig_tfidf.csv"
sub.to_csv(out_path, index=False)
print("Created:", out_path)

Created: model_full_external_ctrl_genept_sig_tfidf.csv


Absolute dogshit results