In [2]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc

from scipy import sparse
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.kernel_ridge import KernelRidge

Metric used in the competition

In [4]:
def _smoothstep(t):
    """Evaluate the cubic smoothstep polynomial t^2 * (3 - 2t) element-wise."""
    t_squared = t * t
    return t_squared * (3.0 - 2.0 * t)


def _gate(x, left=0.0, right=0.2):
    """Map ``x`` to a smooth 0..1 ramp: 0 at/below ``left``, 1 at/above ``right``."""
    ramp = np.clip((x - left) / (right - left), 0.0, 1.0)
    return _smoothstep(ramp)


def weighted_cosine(pred_flat, true_flat, left=0.0, right=0.2, eps=1e-12):
    """Cosine similarity between two flattened arrays with magnitude-gated weights.

    Each element is weighted by the squared gate of ``max(|pred|, |true|)``, so
    entries where both values are near zero contribute little.

    Parameters
    ----------
    pred_flat, true_flat : array-like
        Predicted and true values; both are flattened to 1-D float64.
    left, right : float
        Lower and upper edge of the smooth gate applied to the magnitudes.
    eps : float
        If the product of the two weighted norms is below ``eps`` the result is 0.0.

    Returns
    -------
    float
        The weighted cosine, or 0.0 when the denominator is (near) zero.
    """
    pred = np.asarray(pred_flat, dtype=np.float64).ravel()
    true = np.asarray(true_flat, dtype=np.float64).ravel()

    magnitude = np.maximum(np.abs(pred), np.abs(true))
    gate = _gate(magnitude, left, right)
    gate_sq = gate * gate

    # Pre-multiply once; the products below reuse these weighted vectors.
    gated_pred = gate_sq * pred
    gated_true = gate_sq * true

    dot = np.sum(gated_pred * true)
    norm_product = np.sqrt(np.sum(gated_pred * pred)) * np.sqrt(np.sum(gated_true * true))

    if norm_product < eps:
        return 0.0
    return float(dot / norm_product)

def official_score_arrays(y_true, y_pred, w, baseline_wmae, eps=1e-12, max_log2=5.0, cos_left=0.0, cos_right=0.2):
    """Score predictions as (sum of capped log2 WMAE gains) x max(0, weighted cosine).

    Parameters
    ----------
    y_true, y_pred, w : array-like
        True values, predictions and element weights; the weighted absolute error
        is averaged along axis 1 to give one WMAE per row.
    baseline_wmae : array-like
        Per-row WMAE of the baseline the prediction is compared against.
    eps : float
        Floor applied to both WMAE vectors, and passed on to ``weighted_cosine``.
    max_log2 : float
        Upper cap on each row's ``log2(baseline / prediction)`` term.
    cos_left, cos_right : float
        Gate edges forwarded to ``weighted_cosine``.

    Returns
    -------
    float
        The score rounded to 5 decimals.
    """
    truth, pred, weights, base = (
        np.asarray(arr, dtype=np.float64) for arr in (y_true, y_pred, w, baseline_wmae)
    )

    # Per-row weighted MAE; both sides are floored at eps before taking the ratio.
    row_wmae = np.maximum(np.mean(np.abs(truth - pred) * weights, axis=1), eps)
    row_base = np.maximum(base, eps)

    # Row-wise log2 improvement over the baseline, capped at max_log2.
    log_gain = np.minimum(np.log2(row_base / row_wmae), max_log2)
    total_gain = float(np.sum(log_gain))

    cosine = weighted_cosine(pred.ravel(), truth.ravel(), left=cos_left, right=cos_right, eps=eps)
    # A negative cosine zeroes the score instead of flipping its sign.
    return round(float(total_gain * max(0.0, cosine)), 5)


def proxy_weights_from_deltas(D):
    """Derive per-column weights from the mean absolute value of each column of ``D``.

    The weights are rescaled so that their mean is (up to a 1e-12 guard in the
    denominator) equal to 1, i.e. they sum to roughly the number of columns.

    Returns
    -------
    numpy.ndarray
        float64 vector with one weight per column of ``D``.
    """
    mean_abs = np.abs(D).mean(axis=0).astype(np.float64)
    return mean_abs / (mean_abs.mean() + 1e-12)

def proxy_score_oof(y_true, y_pred, base_pred_per_row, w_gene, max_log2=5.0, cos_left=0.0, cos_right=0.2):
    """Score predictions with ``official_score_arrays`` using locally built inputs.

    The per-column weight vector ``w_gene`` is repeated for every row, and the
    per-row baseline WMAE is computed from ``base_pred_per_row`` with those weights.

    Parameters
    ----------
    y_true, y_pred, base_pred_per_row : array-like, 2-D
        True values, model predictions and baseline predictions, row-aligned.
    w_gene : array-like, 1-D
        One weight per column.
    max_log2, cos_left, cos_right : float
        Forwarded unchanged to ``official_score_arrays``.

    Returns
    -------
    float
        Whatever ``official_score_arrays`` returns for these inputs.
    """
    truth = np.asarray(y_true, dtype=np.float64)
    pred = np.asarray(y_pred, dtype=np.float64)
    base_pred = np.asarray(base_pred_per_row, dtype=np.float64)
    gene_w = np.asarray(w_gene, dtype=np.float64)

    # Unpacking also enforces that the truth matrix is 2-D.
    n_rows, _ = truth.shape
    weights = np.repeat(gene_w[None, :], n_rows, axis=0)

    base_wmae = (np.abs(truth - base_pred) * weights).mean(axis=1)

    return official_score_arrays(
        y_true=truth,
        y_pred=pred,
        w=weights,
        baseline_wmae=base_wmae,
        max_log2=max_log2,
        cos_left=cos_left,
        cos_right=cos_right,
    )

In [None]:
# Global seed; also passed as random_state to the TruncatedSVD and PCA fits below.
SEED = 6
np.random.seed(SEED)

# Input files.
# NOTE(review): H5AD_PATH starts with "Data/" while the two CSV paths use "data/".
# On a case-sensitive filesystem these are different directories — confirm this is intended.
MEANS_PATH = "data/training_data_means.csv"
VALMAP_PATH = "data/pert_ids_val.csv"
H5AD_PATH = "Data/training_cells.h5ad"

EMBED_DIM = 128  # number of TruncatedSVD components used as the gene embedding
OUT_K = 48  # upper bound on the PCA components fitted to the delta matrix
ALPHA = 0.1  # KernelRidge `alpha`
GAMMA = 0.2  # KernelRidge `gamma` for the RBF kernel

DEPTH_TRIM_Q = 0.99  # control cells with nCount_RNA above this quantile are dropped

# Post-processing parameters to tune (small grid, scored on out-of-fold predictions).
BLENDS = [0.6, 0.8, 1.0]  # weight of the model prediction versus the mean-delta baseline
SCALES = [0.85, 1.0, 1.15]  # multiplier applied to the raw prediction
THRESHS = [0.0, 0.02, 0.05]  # soft-threshold levels

In [6]:
def soft_threshold(x, t):
    """Shrink ``x`` towards zero by ``t``; values with ``|x| <= t`` become 0.

    A non-positive ``t`` disables shrinkage and returns ``x`` itself (no copy).
    """
    if t <= 0:
        return x
    shrunk_magnitude = np.maximum(np.abs(x) - t, 0.0)
    return np.sign(x) * shrunk_magnitude

In [7]:
df_means = pd.read_csv(MEANS_PATH)
gene_cols = [col for col in df_means.columns if col != "pert_symbol"]

# Boolean mask marking the control ("non-targeting") row(s); the first one is the baseline.
pert_symbols = df_means["pert_symbol"].astype(str)
base_row = pert_symbols.eq("non-targeting")
x_base = df_means.loc[base_row, gene_cols].iloc[0].to_numpy(np.float32)

# Every remaining row is one training perturbation.
df_train = df_means.loc[~base_row].reset_index(drop=True)
train_genes = df_train["pert_symbol"].astype(str).tolist()

# D holds each perturbation's expression minus the baseline; D_mean is its column mean.
D = df_train[gene_cols].to_numpy(np.float32) - x_base[np.newaxis, :]
D_mean = np.mean(D, axis=0).astype(np.float32)

In [8]:
df_valmap = pd.read_csv(VALMAP_PATH)

# pert_id -> perturbed gene symbol, both as strings (later duplicates overwrite earlier ones).
val_map = {
    pert_id: gene
    for pert_id, gene in zip(df_valmap["pert_id"].astype(str), df_valmap["pert"].astype(str))
}
val_genes = [*val_map.values()]

In [9]:
adata = ad.read_h5ad(H5AD_PATH)

# Keep only the non-targeting (control) cells.
is_control = adata.obs["sgrna_symbol"].astype(str) == "non-targeting"
ctrl = adata[is_control].copy()

# Drop control cells whose total count exceeds the DEPTH_TRIM_Q quantile.
depth = ctrl.obs["nCount_RNA"].values
q = float(np.quantile(depth, DEPTH_TRIM_Q))
ctrl = ctrl[depth <= q].copy()

# Restrict to every gene that is measured, perturbed in training, or perturbed in validation.
union_genes = sorted({*gene_cols, *train_genes, *val_genes})
ctrl = ctrl[:, union_genes].copy()

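The baseline row does not match the mean of the normalized + log1p-transformed control cells (as found in analyzation.ipynb). Try both log1p and log2, and pick the transform whose control-cell mean is closest to the baseline row (smallest mean absolute difference).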

In [11]:
def baseline_match_score(log_kind):
    """Mean absolute gap between ``x_base`` and the control-cell mean under one log transform.

    The non-targeting cells of ``adata`` (restricted to ``gene_cols``) are passed
    through ``sc.pp.normalize_total`` with ``target_sum=1e4``, the stored entries of
    the sparse matrix are log-transformed, and the per-gene mean is compared with
    the baseline row ``x_base``.

    Parameters
    ----------
    log_kind : str
        ``"log1p"`` for ``log(1 + x)`` or ``"log2"`` for ``log2(1 + x)``.

    Returns
    -------
    float
        Mean absolute difference between ``x_base`` and the control mean; lower is closer.

    Raises
    ------
    ValueError
        If ``log_kind`` is not one of the two supported names. Previously any
        other value was silently treated as ``"log2"``.
    """
    log_transforms = {
        "log1p": np.log1p,
        "log2": lambda values: np.log2(values + 1.0),
    }
    # Validate before touching the data so a typo fails fast with a clear message.
    if log_kind not in log_transforms:
        raise ValueError(
            f"Unknown log_kind {log_kind!r}; expected one of {sorted(log_transforms)}"
        )
    transform = log_transforms[log_kind]

    tmp = adata[adata.obs["sgrna_symbol"].astype(str) == "non-targeting"][:, gene_cols].copy()
    sc.pp.normalize_total(tmp, target_sum=1e4, inplace=True)
    X = tmp.X.tocsr(copy=True)
    # Only stored entries are transformed; implicit zeros stay zero under both transforms.
    X.data = transform(X.data).astype(np.float32)
    base_ctrl = np.asarray(X.mean(axis=0)).ravel().astype(np.float32)
    return float(np.mean(np.abs(x_base - base_ctrl)))

# Score both candidate transforms; on a tie log1p is preferred.
score_log1p = baseline_match_score("log1p")
score_log2 = baseline_match_score("log2")
if score_log1p <= score_log2:
    log_kind = "log1p"
else:
    log_kind = "log2"
print("Chosen log:", log_kind)

Chosen log: log2


In [12]:
sc.pp.normalize_total(ctrl, target_sum=1e4, inplace=True)
X = ctrl.X.tocsr(copy=True)
# Apply the transform selected by the baseline comparison (`log_kind`) instead of
# hard-coding log2, so this cell cannot drift from the choice made above.
# Only the stored (non-zero) entries are transformed; zeros stay zero either way.
if log_kind == "log1p":
    X.data = np.log1p(X.data).astype(np.float32)
else:
    X.data = np.log2(X.data + 1.0).astype(np.float32)

In [13]:
# Scale each gene to unit variance (no centering, keeps sparsity), then L2-normalise each cell.
X = normalize(StandardScaler(with_mean=False).fit_transform(X), norm="l2", axis=1)

svd = TruncatedSVD(n_components=EMBED_DIM, random_state=SEED)
svd.fit(X)

# Rows of the transposed component matrix serve as per-gene embeddings.
gene_emb = svd.components_.T.astype(np.float32)  # (len(union_genes), EMBED_DIM)
gene2emb = dict(zip(union_genes, gene_emb))
# Mean embedding, used for any gene without an entry in gene2emb.
emb_fallback = np.mean(gene_emb, axis=0).astype(np.float32)

In [14]:
w_gene = proxy_weights_from_deltas(D).astype(np.float64)

n = D.shape[0]
oof_raw = np.zeros_like(D, dtype=np.float32)   # leave-one-out model predictions
oof_base = np.zeros_like(D, dtype=np.float32)  # leave-one-out mean-delta baseline

# The embedding of each training gene does not depend on the fold, so build the
# feature matrix once and slice it per fold instead of re-stacking it n times.
emb_train = np.vstack([gene2emb.get(g, emb_fallback) for g in train_genes]).astype(np.float32)

for i in range(n):
    # Hold out perturbation i; train on all the others.
    tr = np.ones(n, dtype=bool)
    tr[i] = False

    D_tr = D[tr]
    oof_base[i] = D_tr.mean(axis=0).astype(np.float32)

    # Compress the training deltas; the regressor predicts PCA coordinates.
    pca = PCA(n_components=min(OUT_K, D_tr.shape[0] - 1), random_state=SEED)
    C_tr = pca.fit_transform(D_tr)

    reg = KernelRidge(alpha=ALPHA, kernel="rbf", gamma=GAMMA)
    reg.fit(emb_train[tr], C_tr)

    # Predict the held-out row (kept 2-D via the slice) and map back to gene space.
    c_hat = reg.predict(emb_train[i:i + 1])
    oof_raw[i] = pca.inverse_transform(c_hat)[0].astype(np.float32)

In [15]:
# Score every (blend, scale, thr) combination on the out-of-fold predictions.
results = []
for blend in BLENDS:
    for scale in SCALES:
        scaled = (scale * oof_raw).astype(np.float32)
        for thr in THRESHS:
            pred = soft_threshold(scaled, thr).astype(np.float32)
            pred = ((1.0 - blend) * oof_base + blend * pred).astype(np.float32)

            s = proxy_score_oof(y_true=D, y_pred=pred, base_pred_per_row=oof_base, w_gene=w_gene)
            results.append({"blend": blend, "scale": scale, "thr": thr, "score": s})

# max() keeps the first entry among equal scores, i.e. the earliest grid point wins ties.
best = max(results, key=lambda entry: entry["score"], default=None)

print("Best post (proxy):", best)

Best post (proxy): {'blend': 1.0, 'scale': 0.85, 'thr': 0.0, 'score': 0.37489}


In [16]:
# Refit the target compression and the regressor on all training perturbations.
n_components = min(OUT_K, D.shape[0] - 1)
pca = PCA(n_components=n_components, random_state=SEED)
C = pca.fit_transform(D)

X_feat = np.vstack([gene2emb.get(gene, emb_fallback) for gene in train_genes]).astype(np.float32)
reg = KernelRidge(alpha=ALPHA, kernel="rbf", gamma=GAMMA).fit(X_feat, C)

# Post-processing parameters selected by the grid search.
BLEND = best["blend"]
SCALE = best["scale"]
THR = best["thr"]

In [17]:
def predict_delta(gene_symbol: str) -> np.ndarray:
    """Predict the post-processed delta vector for one perturbed gene.

    The gene's embedding (or ``emb_fallback`` if the symbol has none) is fed to the
    fitted regressor, mapped back through the fitted PCA, then scaled by ``SCALE``,
    soft-thresholded at ``THR`` and blended with ``D_mean`` using weight ``BLEND``.

    Returns
    -------
    numpy.ndarray
        float32 vector with one entry per PCA input feature.
    """
    embedding = gene2emb.get(gene_symbol, emb_fallback).astype(np.float32)
    components = reg.predict(embedding[None, :])
    raw_delta = pca.inverse_transform(components)[0].astype(np.float32)

    scaled = (SCALE * raw_delta).astype(np.float32)
    shrunk = soft_threshold(scaled, THR).astype(np.float32)
    return ((1.0 - BLEND) * D_mean + BLEND * shrunk).astype(np.float32)

In [18]:
# The submission contains one row per id pert_1 .. pert_120.
pert_ids = [f"pert_{k}" for k in range(1, 121)]

# Ids absent from the validation map fall back to the mean training delta.
# Report them so the fallback is never applied unnoticed.
missing_ids = [pert_id for pert_id in pert_ids if pert_id not in val_map]
if missing_ids:
    print(f"Warning: {len(missing_ids)} pert_id(s) not in val_map, using D_mean: {missing_ids}")

# Stack all predictions once instead of building a per-gene dict for every row.
# float64 reproduces the values obtained by calling float() on each float32 entry.
deltas = np.vstack([
    predict_delta(val_map[pert_id]) if pert_id in val_map else D_mean
    for pert_id in pert_ids
]).astype(np.float64)

sub = pd.DataFrame(deltas, columns=gene_cols)
sub.insert(0, "pert_id", pert_ids)
sub.to_csv("model2_krr_metric_tuned.csv", index=False)
print("Created model2_krr_metric_tuned.csv")

Created model2_krr_metric_tuned.csv
