In [39]:
import numpy as np
import anndata as ad
import scanpy as sc
from scipy.sparse import csr_matrix, issparse
from tqdm import tqdm
from multiprocessing import Pool

from sklearn.linear_model import LogisticRegression

In [27]:
adata = sc.read("adata_normalized_custom.h5ad")
adata.X = adata.layers["binarized"]

In [28]:
if issparse(adata.X):
    if not adata.X.has_sorted_indices:
        adata.X.sort_indices()

In [41]:
def process_column(i):
    return np.asarray(np.squeeze(adata.X[:,i].toarray()))

with Pool() as p:
    y = list(tqdm(p.imap(process_column, range(adata.X.shape[1])), total=adata.X.shape[1]))
y = np.array(y)

100%|██████████| 92653/92653 [05:24<00:00, 285.44it/s]


In [43]:
X = np.asarray(np.sum(adata.X, axis = 1)).astype(float)
X_res = np.zeros(adata.X.shape)

def process_column_2(i):
    model = LogisticRegression(solver='liblinear', warm_start = True).fit(X = X,y = y[i])
    return y[i] - model.predict_proba(X)[:,1]

with Pool() as p:
    X_res = np.array(list(tqdm(p.imap(process_column_2, range(adata.X.shape[1])), total=adata.X.shape[1]))).T

100%|██████████| 92653/92653 [00:42<00:00, 2162.98it/s]


In [45]:
adata.layers["binary_residual"] = X_res

In [46]:
adata.write("adata_custom_v2.h5ad")