# Generate cellxgene counts from SAM file

In [1]:
import time

import cudf
import cupy as cp
import numpy as np

# Wall-clock reference point; the total runtime reported further down is measured from here.
start = time.time()

In [2]:
%%time
df = cudf.read_csv("../data/custom_count.tsv", header=None, sep="\t")

CPU times: user 1.18 s, sys: 681 ms, total: 1.86 s
Wall time: 2 s


In [3]:
%%time
# Some light cudf preprocessing
cell_molecule = df["0"].str.split("_")
df["cell"] = cell_molecule[1]
df["molecule"] = cell_molecule[2]
df = df.rename({"1": "gene"})
df = df.drop(["0"])

CPU times: user 194 ms, sys: 194 ms, total: 387 ms
Wall time: 483 ms


In [4]:
%%time
# Count unique molecules for each gene for each cell
df = df.groupby(["cell", "gene"]).molecule.nunique().reset_index()

CPU times: user 473 ms, sys: 363 ms, total: 836 ms
Wall time: 836 ms


In [5]:
# Display the per-(cell, gene) unique-molecule counts.
df

Unnamed: 0,cell,gene,molecule
0,AAACCCAAGCGTATGG,AGL,1
1,AAACCCAAGCGTATGG,AGO4,1
2,AAACCCAAGCGTATGG,AGTRAP,3
3,AAACCCAAGCGTATGG,AHCYL1,2
4,AAACCCAAGCGTATGG,AK2,1
...,...,...,...
801252,TTTGTTGTCGCACGAC,WDR77,1
801253,TTTGTTGTCGCACGAC,WRAP73,1
801254,TTTGTTGTCGCACGAC,YBX1,5
801255,TTTGTTGTCGCACGAC,YTHDF2,1


In [6]:
# Seconds elapsed since `start` was recorded (despite its name, `stop` holds a duration, not a timestamp).
stop = time.time() - start

In [7]:
# Report the elapsed wall-clock time in seconds.
print(f"Total runtime: {stop}")

Total runtime: 3.495598316192627


In [21]:
%%time
mito_genes = df["gene"].str.startswith("MIT-")

CPU times: user 940 µs, sys: 9.07 ms, total: 10 ms
Wall time: 54.6 ms


In [22]:
%%time
mito_genes[mito_genes==True]

CPU times: user 1.41 ms, sys: 1.36 ms, total: 2.78 ms
Wall time: 1.97 ms


Series([], dtype: bool)

In [23]:
%%time
df.cell_id = df.cell.astype('category').cat.codes

CPU times: user 11.9 ms, sys: 15.8 ms, total: 27.8 ms
Wall time: 28.8 ms


In [24]:
# Inspect the integer codes assigned to the cell barcodes.
df.cell_id

0            0
1            0
2            0
3            0
4            0
          ... 
801252    5015
801253    5015
801254    5015
801255    5015
801256    5015
Length: 801257, dtype: int32

In [25]:
%%time
df.gene_id = df.gene.astype('category').cat.codes

CPU times: user 13.1 ms, sys: 12 ms, total: 25.1 ms
Wall time: 25.1 ms


In [26]:
# Display the frame again now that the id cells above have run.
df

Unnamed: 0,cell,gene,molecule
0,AAACCCAAGCGTATGG,AGL,1.0
1,AAACCCAAGCGTATGG,AGO4,1.0
2,AAACCCAAGCGTATGG,AGTRAP,3.0
3,AAACCCAAGCGTATGG,AHCYL1,2.0
4,AAACCCAAGCGTATGG,AK2,1.0
...,...,...,...
801252,TTTGTTGTCGCACGAC,WDR77,1.0
801253,TTTGTTGTCGCACGAC,WRAP73,1.0
801254,TTTGTTGTCGCACGAC,YBX1,5.0
801255,TTTGTTGTCGCACGAC,YTHDF2,1.0


In [27]:
# Category codes start at 0, so the largest code plus one gives the size of
# each matrix dimension (rows = cells, columns = genes).
n_cells, n_genes = df.cell_id.max() + 1, df.gene_id.max() + 1


In [28]:
# Number of rows the sparse matrix will have (largest cell code + 1).
n_cells

5016

In [29]:
# Number of columns the sparse matrix will have (largest gene code + 1).
n_genes

1967

In [30]:
%%time

# Convert to sparse cupy array

import cupy as cp

df.molecule = df.molecule.astype('float32')

rows = cp.array(df.cell_id.to_gpu_array())
cols = cp.array(df.gene_id.to_gpu_array())
vals = cp.array(df.molecule.to_gpu_array())

coo = cp.sparse.coo_matrix((vals, (rows, cols)), shape=(n_cells, n_genes))

CPU times: user 5.59 ms, sys: 5.51 ms, total: 11.1 ms
Wall time: 10.2 ms


In [31]:
# Sanity check: the shape should be (n_cells, n_genes).
coo.shape

(5016, 1967)

In [32]:
%%time
# Apply log1p to the sparse count matrix.
# NOTE: `coo` is overwritten with its own transform, so re-running this cell
# applies log1p a second time; rebuild `coo` first if the cell must be re-run.
coo = coo.log1p()

CPU times: user 4.98 ms, sys: 2.05 ms, total: 7.03 ms
Wall time: 78.2 ms


In [33]:
# Shape of the matrix after the log1p transform.
coo.shape

(5016, 1967)