# Generate cellxgene counts from SAM file

In [1]:
import cudf 

import time
import numpy as np

# Wall-clock reference point for the whole notebook; the elapsed-time cell near the
# end subtracts this value from a later time.time() reading.
start = time.time()

Import requested from: 'numba.numpy_support', please update to use 'numba.np.numpy_support' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba import cuda, numpy_support
Import requested from: 'numba.numpy_support', please update to use 'numba.np.numpy_support' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from cudf._lib.transform import bools_to_mask
Import requested from: 'numba.utils', please update to use 'numba.core.utils' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.utils import pysignature
Import of 'pysignature' requested from: 'numba.utils', please update to use 'numba.core.utils' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.utils import pysignature


In [2]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client

# Start a local dask_cuda cluster and connect a distributed client to it.
# Later cells submit their work through `client`.
cluster = LocalCUDACluster()
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 41151 instead
  http_address["port"], self.http_server.port


In [3]:
%%time
import dask_cudf

import glob

# Sort the matches: glob returns paths in arbitrary, filesystem-dependent order, and
# the first entry is later used to derive the metadata frame, so a stable order keeps
# partition layout and metadata selection reproducible between runs.
csv_files = sorted(glob.glob("outs/*.txt"))

def cudf_read_csv_parse(csv_file):
    """Load one tab-separated read table and return its unique (gene, cell, molecule) rows.

    The file is read without a header row. Its first column is split on "_";
    piece 1 becomes the ``cell`` column and piece 2 becomes the ``molecule``
    column (presumably cell barcode and UMI -- confirm against the upstream
    SAM export). The second column is the ``gene`` column.

    Parameters
    ----------
    csv_file : str
        Path of the tab-separated text file to read.

    Returns
    -------
    cudf.DataFrame
        Columns ``gene``, ``cell``, ``molecule`` (in that order) with exact
        duplicate rows removed.
    """
    # Name the columns explicitly instead of relying on the auto-generated header
    # labels, whose type and spelling depend on the installed cudf version.
    df = cudf.read_csv(csv_file, header=None, sep="\t", names=["read_id", "gene"])

    # expand=True requests one output column per "_"-separated piece, so the
    # integer lookups below select whole columns.
    cell_molecule = df["read_id"].str.split("_", expand=True)
    df["cell"] = cell_molecule[1]
    df["molecule"] = cell_molecule[2]

    # Select the output columns by name rather than calling rename()/drop()
    # positionally; whether a bare mapper/label list targets columns or the
    # index is not consistent across DataFrame implementations.
    df = df[["gene", "cell", "molecule"]]
    return df.drop_duplicates()

def get_meta(x):
    """Return a zero-row slice of ``x``.

    The slice is taken positionally with ``iloc``, so the result has no rows
    but is otherwise derived from ``x`` itself.
    """
    no_rows = slice(None, 0)
    return x.iloc[no_rows]

# Schedule one parse task per input file on the cluster; `dfs` holds the
# resulting futures in the same order as `csv_files`.
dfs = client.map(cudf_read_csv_parse, csv_files)

CPU times: user 267 ms, sys: 17.3 ms, total: 285 ms
Wall time: 295 ms


In [4]:
%%time
# Derive the empty schema frame on the cluster from the first parsed file.
# .result() blocks until that future has finished, so this cell waits for the
# first file to be parsed.
meta = client.submit(get_meta, dfs[0]).result()
# Combine the per-file futures into a single dask_cudf DataFrame described by `meta`.
df = dask_cudf.from_delayed(dfs, meta=meta)

CPU times: user 1.75 s, sys: 739 ms, total: 2.49 s
Wall time: 35.2 s


In [5]:
# Redistribute the rows into a fixed number of partitions (100).
df = df.repartition(npartitions=100)

In [6]:
# reset_index() is called without drop=True, so the previous index is kept as a
# regular "index" column (it appears in the head() preview that follows).
df = df.reset_index()

In [7]:
# Check the partition count of the distributed frame.
df.npartitions

100

In [9]:
# Preview the first rows of the distributed frame.
df.head()

Unnamed: 0,index,gene,cell,molecule
0,17628225,A1BG,AAACGCTTCTAGGCAT,TATAGCATCAGT
1,17628053,A1BG,AAAGGTACAACCACAT,CTGGTAATATCA
2,17628177,A1BG,AAGCATCTCGTGGGTC,ACTAGTCCTCTT
3,17628297,A1BG,ACCCTTGTCCGAGATT,CCCCTTGTCACT
4,17628034,A1BG,ACGGGTCTCAAGAGGC,TCTTACGTACGG


In [11]:
# Deduplicate on the molecule identity only. The frame also carries the "index"
# column produced by reset_index(), whose values differ from row to row; a
# full-row drop_duplicates() would therefore keep repeated
# (cell, gene, molecule) rows and inflate the per-gene molecule counts.
df = df.drop_duplicates(subset=["cell", "gene", "molecule"])

In [12]:
%%time
# Count molecules per (cell, gene) pair. The count column keeps the name
# "molecule", and reset_index() turns the group keys back into ordinary columns.
df = (
    df.groupby(["cell", "gene"])["molecule"]
    .count()
    .reset_index()
)

CPU times: user 56.5 ms, sys: 5.05 ms, total: 61.6 ms
Wall time: 68.1 ms


In [13]:
%%time
# Execute the lazy task graph and collect the result into a single in-memory
# frame (the type(df) cell further down reports a cudf DataFrame).
# NOTE(review): this rebinds `df` from the lazy collection to the computed result,
# so the cell is not idempotent -- re-running it alone calls .compute() on the
# already-computed frame.
df = df.compute()

CPU times: user 922 ms, sys: 1.15 s, total: 2.07 s
Wall time: 11.2 s


In [14]:
# Preview the per-(cell, gene) counts; the "molecule" column now holds the count.
df.head()

Unnamed: 0,cell,gene,molecule
0,AAACCCAAGCGTATGG,A2M-AS1,1
1,AAACCCAAGCGTATGG,AAAS,1
2,AAACCCAAGCGTATGG,AAGAB,1
3,AAACCCAAGCGTATGG,AAMDC,1
4,AAACCCAAGCGTATGG,AAMP,1


In [15]:
# Despite its name, `stop` holds the elapsed wall-clock seconds since `start`,
# not a timestamp.
stop = time.time() - start

In [16]:
# Report the elapsed wall-clock time in seconds.
print(f"Total runtime: {stop}")

Total runtime: 52.09818887710571


In [17]:
# Confirm that `df` is now a concrete in-memory frame rather than a lazy collection.
type(df)

cudf.core.dataframe.DataFrame

In [18]:
%%time
# Write the counts table to a Parquet file; the path is relative, so the file is
# created in the current working directory.
df.to_parquet("counts_output.parquet")

CPU times: user 82.6 ms, sys: 148 ms, total: 231 ms
Wall time: 231 ms
