In [1]:
# Import requirements
import numpy as np
import scanpy as sc
import sys
import os

import cupy as cp

In [2]:
# Inputs
in_file = "/home/cjnolet/covid_omics_local/Counts_lung_cells.csv"
out_dir = "."

In [3]:
import cudf
import anndata

import time
start = time.time()

## Preprocess 1 
### (load counts -> filter -> normalize)

In [4]:
%%time
# Load data
adata = cudf.read_csv(in_file)

CPU times: user 3.92 s, sys: 1.95 s, total: 5.87 s
Wall time: 5.92 s


In [5]:
adata.shape

(32738, 39779)

In [6]:
%%time
cell_name = adata["cell_name"]
mito_genes = cell_name.str.startswith("MT-")
adata = adata.set_index("cell_name")

CPU times: user 347 ms, sys: 904 µs, total: 348 ms
Wall time: 348 ms


In [7]:
%%time
gpu_matrix = adata.as_gpu_matrix()

CPU times: user 2.82 s, sys: 132 ms, total: 2.95 s
Wall time: 2.95 s


In [8]:
%%time
import cupy as cp
gpu_array = cp.asarray(gpu_matrix, dtype=cp.float32)

CPU times: user 3.31 ms, sys: 3.62 ms, total: 6.93 ms
Wall time: 5.82 ms


In [9]:
%%time
sparse_gpu_array = cp.sparse.csr_matrix(gpu_array, dtype=cp.float32)

CPU times: user 517 ms, sys: 381 ms, total: 898 ms
Wall time: 894 ms


In [10]:
del adata
del gpu_matrix

In [11]:
%%time
# Filter cells w/ 200 <= n_genes <= 6000
col_sums = sparse_gpu_array.sum(axis=1)
query = ((200 <= col_sums) & (col_sums <= 6000)).ravel()
filtered_cells = sparse_gpu_array.get()[query.get()]

CPU times: user 96.4 ms, sys: 232 ms, total: 329 ms
Wall time: 330 ms


In [12]:
%%time
## Filter cells with >15% MT reads
n_counts = filtered_cells.sum(axis=1)
percent_mito = (filtered_cells[:,mito_genes.to_array()].sum(axis=1) / n_counts).ravel()
filtered_cells = filtered_cells[np.asarray(percent_mito < 0.15).ravel(),:]

CPU times: user 83.8 ms, sys: 56.9 ms, total: 141 ms
Wall time: 140 ms


In [13]:
%%time
## Remove zero columns
thr = np.asarray(filtered_cells.sum(axis=0) > 1).ravel()
filtered_cells = filtered_cells[:,thr]

CPU times: user 156 ms, sys: 58.2 ms, total: 214 ms
Wall time: 213 ms


In [14]:
%%time
## Normlaize to sum to 1e4 and log norm
target_sum = 1e4
sums = np.asarray(target_sum / filtered_cells.sum(axis=1)).ravel()
normalized = np.asarray(filtered_cells.todense()) * sums[:, None]
normalized = np.log1p(normalized)

CPU times: user 2.83 s, sys: 1.94 s, total: 4.77 s
Wall time: 4.77 s


In [15]:
normalized.shape

(13128, 39778)

In [16]:
time.time() - start

16.662619829177856

## Preprocess 2
### (regress -> scale -> PCA) 

In [None]:
# scanpy uses a Gaussian GLM w/ identity link, which is equivalent to Linear Regression w/  OLS.
