In [2]:
import anndata as ad
import numpy as np
from scipy import sparse

adata = ad.read_h5ad("Data/training_cells.h5ad")  # adjust path if needed
print("adata:", adata)
print("X type:", type(adata.X), "sparse:", sparse.issparse(adata.X))

print("\nobs columns:", adata.obs.columns.tolist())
print("var shape:", adata.var.shape)
print("var_names example:", adata.var_names[:5].tolist())

# Basic group counts
sgrna_col = "sgrna_symbol" if "sgrna_symbol" in adata.obs.columns else None
if sgrna_col:
    vc = adata.obs[sgrna_col].astype(str).value_counts()
    print("\n# unique sgrna_symbol:", vc.shape[0])
    print("top 10 sgrna_symbol:\n", vc.head(10))
    print("non-targeting count:", int(vc.get("non-targeting", 0)))
else:
    print("\nNo sgrna_symbol column found. Tell me what the perturbation column is called.")


adata: AnnData object with n_obs × n_vars = 17882 × 19226
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'sgrna_id', 'sgrna_symbol', 'channel'
    var: 'features'
X type: <class 'scipy.sparse._csr.csr_matrix'> sparse: True

obs columns: ['nCount_RNA', 'nFeature_RNA', 'percent.mt', 'sgrna_id', 'sgrna_symbol', 'channel']
var shape: (19226, 1)
var_names example: ['A1BG', 'A1CF', 'A2M', 'A2ML1', 'A3GALT2']

# unique sgrna_symbol: 81
top 10 sgrna_symbol:
 sgrna_symbol
non-targeting    1026
MAPK1             447
KDM5C             428
FLNA              415
HDAC4             410
HSPA4             382
RUNX1             374
PAGR1             355
PAXIP1            350
INSIG1            348
Name: count, dtype: int64
non-targeting count: 1026


In [None]:
import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
import pandas as pd
SEED = 6
EMBED_DIM = 128

# Reload needed objects
df_means = pd.read_csv("data/training_data_means.csv")
gene_cols = [c for c in df_means.columns if c != "pert_symbol"]

adata = ad.read_h5ad("Data/training_cells.h5ad")
ctrl_mask = (adata.obs["sgrna_symbol"].astype(str) == "non-targeting")
adata_ctrl = adata[ctrl_mask].copy()
print("Control cells:", adata_ctrl.n_obs)

# Align genes with gene_cols order
varnames = adata_ctrl.var_names.astype(str).tolist()
gene_to_idx = {g:i for i,g in enumerate(varnames)}
missing = [g for g in gene_cols if g not in gene_to_idx]
print("Missing genes from h5ad vs means:", len(missing))
print("First missing (if any):", missing[:10])

ordered_idx = [gene_to_idx[g] for g in gene_cols if g in gene_to_idx]
adata_ctrl = adata_ctrl[:, ordered_idx].copy()

X = adata_ctrl.X
if sparse.issparse(X):
    X = X.tocsr(copy=True)
    X.data = np.log2(X.data + 1.0).astype(np.float32)
else:
    X = np.log2(X.astype(np.float32) + 1.0)

# SVD
svd = TruncatedSVD(n_components=EMBED_DIM, random_state=SEED)
Z = svd.fit_transform(X)

evr = svd.explained_variance_ratio_
print("\nSVD EVR:")
print("  cumulative@16 :", float(np.cumsum(evr)[15]))
print("  cumulative@32 :", float(np.cumsum(evr)[31]))
print("  cumulative@64 :", float(np.cumsum(evr)[63]))
print("  cumulative@128:", float(np.cumsum(evr)[127]))

# QC correlations if present
for col in ["nCount_RNA", "percent.mt"]:
    if col in adata_ctrl.obs.columns:
        r = np.corrcoef(Z[:,0], adata_ctrl.obs[col].to_numpy())[0,1]
        print(f"corr(SVD1, {col}):", float(r))

# Find outliers in SVD space by radius
r = np.sqrt(Z[:,0]**2 + Z[:,1]**2)
idx = np.argsort(r)[-10:][::-1]
print("\nTop 10 SVD(1,2) radius outliers:")
for j in idx:
    row = adata_ctrl.obs.iloc[j]
    extras = []
    if "nCount_RNA" in row: extras.append(f"nCount_RNA={row['nCount_RNA']}")
    if "percent.mt" in row: extras.append(f"percent.mt={row['percent.mt']}")
    print(f"  cell_index={j}  r={float(r[j]):.4f}  " + "  ".join(extras))


Control cells: 1026
Missing genes from h5ad vs means: 0
First missing (if any): []

SVD EVR:
  cumulative@16 : 0.26657429337501526
  cumulative@32 : 0.3023926913738251
  cumulative@64 : 0.35523587465286255
  cumulative@128: 0.4424440562725067
corr(SVD1, nCount_RNA): 0.9738853720362605
corr(SVD1, percent.mt): -0.10616440498751527

Top 10 SVD(1,2) radius outliers:
  cell_index=475  r=155.4046  nCount_RNA=51610.0  percent.mt=4.194923464444876
  cell_index=715  r=149.4625  nCount_RNA=50255.0  percent.mt=4.208536464033429
  cell_index=721  r=147.8066  nCount_RNA=45949.0  percent.mt=3.717164682582864
  cell_index=357  r=146.1689  nCount_RNA=45325.0  percent.mt=5.844456701599559
  cell_index=782  r=145.5938  nCount_RNA=51079.0  percent.mt=3.0521349282483996
  cell_index=402  r=142.7757  nCount_RNA=41929.0  percent.mt=7.569939659901262
  cell_index=814  r=142.0721  nCount_RNA=44351.0  percent.mt=5.251290838988974
  cell_index=942  r=142.0457  nCount_RNA=39325.0  percent.mt=5.317228226319135
  

In [4]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

df_means = pd.read_csv("data/training_data_means.csv")
gene_cols = [c for c in df_means.columns if c != "pert_symbol"]
baseline_mask = (df_means["pert_symbol"].astype(str) == "non-targeting")

x_base = df_means.loc[baseline_mask, gene_cols].mean(axis=0).to_numpy(np.float32)
df_train = df_means.loc[~baseline_mask].reset_index(drop=True)
D = df_train[gene_cols].to_numpy(np.float32) - x_base[None, :]

# How "sparse" are deltas numerically?
absD = np.abs(D)
print("abs(delta) quantiles:", np.quantile(absD, [0.5, 0.75, 0.9, 0.95, 0.99]).tolist())

# PCA explained variance on outputs
pca = PCA(n_components=min(80, D.shape[0]), random_state=0)
pca.fit(D)
cum = np.cumsum(pca.explained_variance_ratio_)
for k in [8, 16, 32, 48, 64]:
    if k <= len(cum):
        print(f"Output PCA cumulative@{k}: {float(cum[k-1])}")


abs(delta) quantiles: [0.015714123845100403, 0.039262594655156136, 0.07378450930118562, 0.10317319482564924, 0.1939082145690912]
Output PCA cumulative@8: 0.5560860633850098
Output PCA cumulative@16: 0.6922448873519897
Output PCA cumulative@32: 0.831583559513092
Output PCA cumulative@48: 0.9131585359573364
Output PCA cumulative@64: 0.966091513633728


In [5]:
import pandas as pd

df_means = pd.read_csv("data/training_data_means.csv")
gene_columns = [c for c in df_means.columns if c != "pert_symbol"]
baseline_mask = df_means["pert_symbol"].astype(str) == "non-targeting"

df_train = df_means.loc[~baseline_mask].reset_index(drop=True)
train_genes = df_train["pert_symbol"].astype(str).tolist()

missing_train = sorted({g for g in train_genes if g not in gene_columns})
print("Train genes:", len(train_genes))
print("Train genes missing from gene_columns:", len(missing_train))
print("Missing list:", missing_train)


Train genes: 80
Train genes missing from gene_columns: 8
Missing list: ['BRD4', 'CHD4', 'DNAJA3', 'INO80', 'KAT8', 'KDM4A', 'PMEL', 'SETD1A']


In [6]:
import pandas as pd

df_means = pd.read_csv("data/training_data_means.csv")
gene_columns = [c for c in df_means.columns if c != "pert_symbol"]
baseline_mask = df_means["pert_symbol"].astype(str) == "non-targeting"

df_train = df_means.loc[~baseline_mask].reset_index(drop=True)
train_genes = df_train["pert_symbol"].astype(str).tolist()

missing_train = sorted({g for g in train_genes if g not in gene_columns})
print("Train genes:", len(train_genes))
print("Train genes missing from gene_columns:", len(missing_train))
print("Missing list:", missing_train)

Train genes: 80
Train genes missing from gene_columns: 8
Missing list: ['BRD4', 'CHD4', 'DNAJA3', 'INO80', 'KAT8', 'KDM4A', 'PMEL', 'SETD1A']


In [7]:
import numpy as np
import anndata as ad
import scanpy as sc
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

SEED = 6
EMBED_DIM = 64  # try 32/64 later

df_means = pd.read_csv("data/training_data_means.csv")
gene_columns = [c for c in df_means.columns if c != "pert_symbol"]

adata = ad.read_h5ad("Data/training_cells.h5ad")
ctrl_mask = adata.obs["sgrna_symbol"].astype(str) == "non-targeting"
adata_ctrl = adata[ctrl_mask].copy()

# Keep only the 5127 target genes and order them
varnames = adata_ctrl.var_names.astype(str).tolist()
gene_to_idx = {g:i for i,g in enumerate(varnames)}
ordered_idx = [gene_to_idx[g] for g in gene_columns]
adata_ctrl = adata_ctrl[:, ordered_idx].copy()

# Optional: drop ultra-high depth outliers
q = np.quantile(adata_ctrl.obs["nCount_RNA"].values, 0.99)
keep_cells = adata_ctrl.obs["nCount_RNA"].values <= q
adata_ctrl = adata_ctrl[keep_cells].copy()
print("Control cells after 99% depth trim:", adata_ctrl.n_obs)

# Normalize and log
sc.pp.normalize_total(adata_ctrl, target_sum=1e4, inplace=True)
X = adata_ctrl.X.tocsr(copy=True)
X.data = np.log1p(X.data).astype(np.float32)

# Scale per-gene variance without densifying
scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X)

svd = TruncatedSVD(n_components=EMBED_DIM, random_state=SEED)
svd.fit(X)

Z = svd.transform(X)
obs = adata_ctrl.obs
print("corr(SVD1, nCount_RNA):", float(np.corrcoef(Z[:,0], obs["nCount_RNA"].values)[0,1]))
print("corr(SVD1, percent.mt):", float(np.corrcoef(Z[:,0], obs["percent.mt"].values)[0,1]))
print("cumEVR@32:", float(np.cumsum(svd.explained_variance_ratio_)[min(31, EMBED_DIM-1)]))

gene_emb = svd.components_.T.astype(np.float32)
gene2emb = {g: gene_emb[i] for i, g in enumerate(gene_columns)}
emb_fallback = gene_emb.mean(axis=0)


Control cells after 99% depth trim: 1015
corr(SVD1, nCount_RNA): 0.26806640619368727
corr(SVD1, percent.mt): -0.2649107023641811
cumEVR@32: 0.1406271904706955


In [8]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor

def cosine(a, b, eps=1e-9):
    na = np.linalg.norm(a)
    nb = np.linalg.norm(b)
    if na < eps or nb < eps:
        return 0.0
    return float(np.dot(a, b) / (na * nb))

def loocv_eval(D, genes, gene2emb, emb_fallback, K_out=32, alpha=10.0, blend=1.0, knn_k=None):
    n, d = D.shape
    maes, maes_base, coss, coss_base = [], [], [], []

    for i in range(n):
        tr = np.ones(n, dtype=bool)
        tr[i] = False

        D_tr = D[tr]
        D_va = D[i]
        genes_tr = [genes[j] for j in np.where(tr)[0]]
        g_va = genes[i]

        delta_base = D_tr.mean(axis=0)

        out_pca = PCA(n_components=min(K_out, D_tr.shape[0]-1), random_state=0)
        C_tr = out_pca.fit_transform(D_tr)

        X_tr = np.vstack([gene2emb.get(g, emb_fallback) for g in genes_tr]).astype(np.float32)
        x_va = gene2emb.get(g_va, emb_fallback).astype(np.float32)[None, :]

        if knn_k is None:
            reg = Ridge(alpha=alpha, random_state=0)
            reg.fit(X_tr, C_tr)
            c_hat = reg.predict(x_va)
        else:
            knn = KNeighborsRegressor(n_neighbors=knn_k, metric="cosine", algorithm="brute", weights="distance")
            knn.fit(X_tr, C_tr)
            c_hat = knn.predict(x_va)

        d_hat = out_pca.inverse_transform(c_hat)[0].astype(np.float32)
        d_hat = (1.0 - blend) * delta_base + blend * d_hat

        # baseline predictor for ratio: mean-delta
        d0 = delta_base

        mae = float(np.mean(np.abs(D_va - d_hat)))
        mae0 = float(np.mean(np.abs(D_va - d0)))
        cosv = max(0.0, cosine(D_va, d_hat))
        cos0 = max(0.0, cosine(D_va, d0))

        maes.append(mae)
        maes_base.append(mae0)
        coss.append(cosv)
        coss_base.append(cos0)

    mae_mean = float(np.mean(maes))
    mae0_mean = float(np.mean(maes_base))
    cos_mean = float(np.mean(coss))
    cos0_mean = float(np.mean(coss_base))
    ratio = mae_mean / (mae0_mean + 1e-9)

    return {
        "mae": mae_mean,
        "mae_base": mae0_mean,
        "mae_ratio": ratio,
        "cos": cos_mean,
        "cos_base": cos0_mean,
        "proxy_score": ratio * cos_mean,
    }

# Load deltas
df_means = pd.read_csv("data/training_data_means.csv")
gene_columns = [c for c in df_means.columns if c != "pert_symbol"]
baseline_mask = df_means["pert_symbol"].astype(str) == "non-targeting"
x_base = df_means.loc[baseline_mask, gene_columns].iloc[0].to_numpy(np.float32)

df_train = df_means.loc[~baseline_mask].reset_index(drop=True)
genes = df_train["pert_symbol"].astype(str).tolist()
D = df_train[gene_columns].to_numpy(np.float32) - x_base[None, :]

# Try a few configs
tests = [
    ("ridge", dict(K_out=32, alpha=10.0, blend=1.0, knn_k=None)),
    ("ridge_blend", dict(K_out=32, alpha=10.0, blend=0.7, knn_k=None)),
    ("knn5", dict(K_out=32, alpha=0.0, blend=1.0, knn_k=5)),
    ("knn10", dict(K_out=32, alpha=0.0, blend=1.0, knn_k=10)),
    ("knn10_blend", dict(K_out=32, alpha=0.0, blend=0.7, knn_k=10)),
]

for name, cfg in tests:
    out = loocv_eval(D, genes, gene2emb, emb_fallback, **cfg)
    print(name, out)


ridge {'mae': 0.028515085787512363, 'mae_base': 0.028515887749381363, 'mae_ratio': 0.9999718415976421, 'cos': 0.32098835660144687, 'cos_base': 0.32091827150434254, 'proxy_score': 0.32097931808214947}
ridge_blend {'mae': 0.028515319898724557, 'mae_base': 0.028515887749381363, 'mae_ratio': 0.999980051449148, 'cos': 0.3209677147679031, 'cos_base': 0.32091827150434254, 'proxy_score': 0.3209613119271232}
knn5 {'mae': 0.031157795782200993, 'mae_base': 0.028515887749381363, 'mae_ratio': 1.0926468417673618, 'cos': 0.21711297146539438, 'cos_base': 0.32091827150434254, 'proxy_score': 0.23722780257839052}
knn10 {'mae': 0.029851229628548026, 'mae_base': 0.028515887749381363, 'mae_ratio': 1.0468279593493517, 'cos': 0.25586538447532803, 'cos_base': 0.32091827150434254, 'proxy_score': 0.2678470382984449}
knn10_blend {'mae': 0.029170638346113265, 'mae_base': 0.028515887749381363, 'mae_ratio': 1.0229608693765895, 'cos': 0.2837439299561083, 'cos_base': 0.32091827150434254, 'proxy_score': 0.2902589372682

In [9]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

SEED = 6
EMBED_DIM = 128  # try 64/128/256

df_means = pd.read_csv("data/training_data_means.csv")
baseline_mask = df_means["pert_symbol"].astype(str) == "non-targeting"
df_train = df_means.loc[~baseline_mask].reset_index(drop=True)
train_genes = df_train["pert_symbol"].astype(str).tolist()

adata = ad.read_h5ad("Data/training_cells.h5ad")
ctrl = adata[adata.obs["sgrna_symbol"].astype(str) == "non-targeting"].copy()

# Trim crazy depth outliers (you already did 99%)
q = np.quantile(ctrl.obs["nCount_RNA"].values, 0.99)
ctrl = ctrl[ctrl.obs["nCount_RNA"].values <= q].copy()
print("Control cells:", ctrl.n_obs, "Genes:", ctrl.n_vars)

# Normalize + log1p
sc.pp.normalize_total(ctrl, target_sum=1e4, inplace=True)
X = ctrl.X.tocsr(copy=True)
X.data = np.log1p(X.data).astype(np.float32)

# Per-gene scaling (sparse-friendly)
scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X)

svd = TruncatedSVD(n_components=EMBED_DIM, random_state=SEED)
svd.fit(X)

Z = svd.transform(X)
print("corr(SVD1, nCount_RNA):", float(np.corrcoef(Z[:,0], ctrl.obs["nCount_RNA"].values)[0,1]))

# Gene embeddings for ALL var_names
gene_emb = svd.components_.T.astype(np.float32)          # (n_genes, EMBED_DIM)
var_names = ctrl.var_names.astype(str).tolist()
gene2emb_all = {g: gene_emb[i] for i, g in enumerate(var_names)}
emb_fallback = gene_emb.mean(axis=0)

# Coverage check for training perturbation genes
missing = sorted({g for g in train_genes if g not in gene2emb_all})
print("Missing training pert genes from h5ad var_names:", len(missing))
print("Missing list:", missing)

# Also check the 8 you found
check8 = ['BRD4','CHD4','DNAJA3','INO80','KAT8','KDM4A','PMEL','SETD1A']
print("8-gene embedding availability:", {g: (g in gene2emb_all) for g in check8})


Control cells: 1015 Genes: 19226
corr(SVD1, nCount_RNA): 0.490323181762857
Missing training pert genes from h5ad var_names: 0
Missing list: []
8-gene embedding availability: {'BRD4': True, 'CHD4': True, 'DNAJA3': True, 'INO80': True, 'KAT8': True, 'KDM4A': True, 'PMEL': True, 'SETD1A': True}


In [11]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge

def cosine(a, b, eps=1e-9):
    na = np.linalg.norm(a); nb = np.linalg.norm(b)
    if na < eps or nb < eps: return 0.0
    return float(np.dot(a, b) / (na * nb))

def loocv_proxy(D, genes, gene2emb, emb_fallback, K_out, model_kind, params, blend):
    n, d = D.shape
    maes, maes0, coss, coss0 = [], [], [], []

    for i in range(n):
        tr = np.ones(n, dtype=bool); tr[i] = False
        D_tr, D_va = D[tr], D[i]
        genes_tr = [genes[j] for j in np.where(tr)[0]]
        g_va = genes[i]

        base = D_tr.mean(axis=0)

        pca = PCA(n_components=min(K_out, D_tr.shape[0]-1), random_state=0)
        C_tr = pca.fit_transform(D_tr)

        X_tr = np.vstack([gene2emb.get(g, emb_fallback) for g in genes_tr]).astype(np.float32)
        x_va = gene2emb.get(g_va, emb_fallback).astype(np.float32)[None, :]

        if model_kind == "ridge":
            reg = Ridge(alpha=params["alpha"], random_state=0)
        elif model_kind == "krr_rbf":
            reg = KernelRidge(alpha=params["alpha"], kernel="rbf", gamma=params["gamma"])
        elif model_kind == "krr_poly":
            reg = KernelRidge(alpha=params["alpha"], kernel="polynomial", degree=params["degree"], gamma=params.get("gamma", 1.0), coef0=params.get("coef0", 1.0))
        else:
            raise ValueError(model_kind)

        reg.fit(X_tr, C_tr)
        c_hat = reg.predict(x_va)
        d_hat = pca.inverse_transform(c_hat)[0].astype(np.float32)

        d_hat = (1.0 - blend) * base + blend * d_hat

        mae = float(np.mean(np.abs(D_va - d_hat)))
        mae0 = float(np.mean(np.abs(D_va - base)))
        cosv = max(0.0, cosine(D_va, d_hat))
        cos0 = max(0.0, cosine(D_va, base))

        maes.append(mae); maes0.append(mae0)
        coss.append(cosv); coss0.append(cos0)

    mae_m = float(np.mean(maes)); mae0_m = float(np.mean(maes0))
    cos_m = float(np.mean(coss)); cos0_m = float(np.mean(coss0))
    ratio = mae_m / (mae0_m + 1e-9)
    return {"K_out": K_out, "model": model_kind, "params": params, "blend": blend,
            "mae_ratio": ratio, "cos": cos_m, "cos_base": cos0_m, "proxy": ratio * cos_m}

df_means = pd.read_csv("data/training_data_means.csv")
gene_cols = [c for c in df_means.columns if c != "pert_symbol"]
baseline_mask = df_means["pert_symbol"].astype(str) == "non-targeting"
x_base = df_means.loc[baseline_mask, gene_cols].iloc[0].to_numpy(np.float32)

df_train = df_means.loc[~baseline_mask].reset_index(drop=True)
genes = df_train["pert_symbol"].astype(str).tolist()
D = df_train[gene_cols].to_numpy(np.float32) - x_base[None, :]

gene2emb = gene2emb_all

grid = []
for K_out in [16, 24, 32, 48]:
    for blend in [0.6, 0.8, 1.0]:
        for alpha in [0.1, 1, 10, 100]:
            grid.append(("ridge", {"alpha": alpha}))
        for alpha in [0.01, 0.1, 1]:
            for gamma in [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]:
                grid.append(("krr_rbf", {"alpha": alpha, "gamma": gamma}))
        for alpha in [0.01, 0.1, 1]:
            for degree in [2, 3]:
                grid.append(("krr_poly", {"alpha": alpha, "degree": degree, "gamma": 1.0, "coef0": 1.0}))

results = []
for model_kind, params in grid:
    out = loocv_proxy(D, genes, gene2emb, emb_fallback, K_out, model_kind, params, blend)
    results.append(out)

# Print top 15 by proxy
results_sorted = sorted(results, key=lambda x: x["proxy"], reverse=True)
for r in results_sorted[:15]:
    print(r)


{'K_out': 48, 'model': 'krr_rbf', 'params': {'alpha': 0.1, 'gamma': 0.2}, 'blend': 1.0, 'mae_ratio': 0.9997262433387063, 'cos': 0.32172345370054245, 'cos_base': 0.32091827150434254, 'proxy': 0.3216353797619975}
{'K_out': 48, 'model': 'krr_rbf', 'params': {'alpha': 0.1, 'gamma': 0.2}, 'blend': 1.0, 'mae_ratio': 0.9997262433387063, 'cos': 0.32172345370054245, 'cos_base': 0.32091827150434254, 'proxy': 0.3216353797619975}
{'K_out': 48, 'model': 'krr_rbf', 'params': {'alpha': 0.1, 'gamma': 0.2}, 'blend': 1.0, 'mae_ratio': 0.9997262433387063, 'cos': 0.32172345370054245, 'cos_base': 0.32091827150434254, 'proxy': 0.3216353797619975}
{'K_out': 48, 'model': 'krr_rbf', 'params': {'alpha': 0.1, 'gamma': 0.2}, 'blend': 1.0, 'mae_ratio': 0.9997262433387063, 'cos': 0.32172345370054245, 'cos_base': 0.32091827150434254, 'proxy': 0.3216353797619975}
{'K_out': 48, 'model': 'krr_rbf', 'params': {'alpha': 0.1, 'gamma': 0.2}, 'blend': 1.0, 'mae_ratio': 0.9997262433387063, 'cos': 0.32172345370054245, 'cos_ba

In [12]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
from scipy import sparse

df_means = pd.read_csv("data/training_data_means.csv")
gene_cols = [c for c in df_means.columns if c != "pert_symbol"]
base_df = df_means.loc[df_means["pert_symbol"].astype(str)=="non-targeting", gene_cols].iloc[0].to_numpy(np.float32)

adata = ad.read_h5ad("Data/training_cells.h5ad")
ctrl = adata[adata.obs["sgrna_symbol"].astype(str)=="non-targeting"][:, gene_cols].copy()
X_raw = ctrl.X.tocsr()

def stats(name, vec):
    diff = base_df - vec
    print("\n", name)
    print("  mean(abs(diff)):", float(np.mean(np.abs(diff))))
    print("  max(abs(diff)) :", float(np.max(np.abs(diff))))
    print("  L2(diff)       :", float(np.linalg.norm(diff)))
    print("  corr(base_df, vec):", float(np.corrcoef(base_df, vec)[0,1]))

def mean_of_log(X, log_kind):
    X = X.tocsr(copy=True)
    if log_kind == "log1p":
        X.data = np.log1p(X.data).astype(np.float32)
    elif log_kind == "log2":
        X.data = np.log2(X.data + 1.0).astype(np.float32)
    else:
        raise ValueError
    return np.asarray(X.mean(axis=0)).ravel().astype(np.float32)

def log_of_mean(X, log_kind):
    m = np.asarray(X.mean(axis=0)).ravel().astype(np.float32)
    if log_kind == "log1p":
        return np.log1p(m).astype(np.float32)
    elif log_kind == "log2":
        return np.log2(m + 1.0).astype(np.float32)
    else:
        raise ValueError

# Variant 0: raw means (no norm, no log)
stats("raw mean", np.asarray(X_raw.mean(axis=0)).ravel().astype(np.float32))

# Normalize total first
ctrl2 = ctrl.copy()
sc.pp.normalize_total(ctrl2, target_sum=1e4, inplace=True)
Xn = ctrl2.X.tocsr()

# Mean of log
stats("norm1e4 + mean(log1p)", mean_of_log(Xn, "log1p"))
stats("norm1e4 + mean(log2)",  mean_of_log(Xn, "log2"))

# Log of mean
stats("norm1e4 + log1p(mean)", log_of_mean(Xn, "log1p"))
stats("norm1e4 + log2(mean)",  log_of_mean(Xn, "log2"))



 raw mean
  mean(abs(diff)): 2.5708224773406982
  max(abs(diff)) : 287.5828552246094
  L2(diff)       : 883.6790771484375
  corr(base_df, vec): 0.7733315105723955

 norm1e4 + mean(log1p)
  mean(abs(diff)): 0.11021880060434341
  max(abs(diff)) : 1.863661766052246
  L2(diff)       : 17.3148193359375
  corr(base_df, vec): 0.9990306464348331

 norm1e4 + mean(log2)
  mean(abs(diff)): 0.10423404723405838
  max(abs(diff)) : 0.4308958053588867
  L2(diff)       : 11.10219669342041
  corr(base_df, vec): 0.9990306489846221

 norm1e4 + log1p(mean)
  mean(abs(diff)): 0.07655708491802216
  max(abs(diff)) : 1.8306965827941895
  L2(diff)       : 15.459739685058594
  corr(base_df, vec): 0.9952569724709338

 norm1e4 + log2(mean)
  mean(abs(diff)): 0.1808294951915741
  max(abs(diff)) : 1.1035455465316772
  L2(diff)       : 17.61606788635254
  corr(base_df, vec): 0.9952569724133998
