# Single-cell RNA-seq analysis workflow using Scanpy on CPU

Copyright (c) 2020, NVIDIA CORPORATION. 

Licensed under the Apache License, Version 2.0 (the "License") 
you may not use this file except in compliance with the License. 
You may obtain a copy of the License at 

     http://www.apache.org/licenses/LICENSE-2.0 

Unless required by applicable law or agreed to in writing, software 
distributed under the License is distributed on an "AS IS" BASIS, 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and 
limitations under the License.

## Import requirements

In [37]:
import time

import numpy as np
import scanpy as sc

from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings('ignore', 'Expected ')
warnings.simplefilter('ignore')

## Input data

In [38]:
# Add path to input file here.
input_file = "../data/krasnow_hlca_10x_UMIs.sparse.h5ad"

## Set parameters

In [39]:
# marker genes
RIBO_GENE_PREFIX = "RPS" # Prefix for ribosomal genes to regress out
markers = ["ACE2", "TMPRSS2", "EPCAM"] # Marker genes for visualization

# filtering cells
min_genes_per_cell = 200 # Filter out cells with fewer genes than this expressed 
max_genes_per_cell = 6000 # Filter out cells with more genes than this expressed 

# filtering genes
n_top_genes = 5000 # Number of highly variable genes to retain

# PCA
n_components = 50 # Number of principal components to compute

# t-SNE
tsne_n_pcs = 20 # Number of principal components to use for t-SNE

# k-means
k = 35 # Number of clusters for k-means

# KNN
n_neighbors = 15 # Number of nearest neighbors for KNN graph
knn_n_pcs = 50 # Number of principal components to use for finding nearest neighbors

# UMAP
umap_min_dist = 0.3 
umap_spread = 1.0

# Gene ranking
ranking_n_top_genes = 50 # Number of differential genes to compute for each cluster

# Number of parallel jobs for t-SNE and k-means
n_jobs=32

In [40]:
start = time.time()

## Load and Prepare Data

In [41]:
%%time
adata = sc.read(input_file)
adata = adata.T

CPU times: user 5.12 s, sys: 741 ms, total: 5.86 s
Wall time: 5.87 s


## Preprocessing

In [42]:
preprocess_start = time.time()

### Filter

We filter the count matrix to remove cells with an extreme number of genes expressed.

In [43]:
%%time
sc.pp.filter_cells(adata, min_genes=min_genes_per_cell)
sc.pp.filter_cells(adata, max_genes=max_genes_per_cell)

CPU times: user 2.04 s, sys: 1.68 s, total: 3.72 s
Wall time: 3.72 s


Some genes will now have zero expression in all cells. We filter out such genes.

In [44]:
%%time
sc.pp.filter_genes(adata, min_cells=1)

CPU times: user 1.84 s, sys: 773 ms, total: 2.62 s
Wall time: 2.62 s


### Normalize

In [45]:
%%time
sc.pp.normalize_total(adata, target_sum=1e4)

CPU times: user 432 ms, sys: 247 ms, total: 679 ms
Wall time: 674 ms


In [46]:
%%time
sc.pp.log1p(adata)

CPU times: user 2.05 s, sys: 0 ns, total: 2.05 s
Wall time: 2.05 s


### Select Most Variable Genes

In [47]:
%%time
# Filter matrix to only variable genes
sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, flavor="cell_ranger")

# Retain marker gene expression
for marker in markers:
    adata.obs[marker+"_raw"]= adata.X[:, adata.var.index==marker].todense()

adata = adata.copy()
n_genes = sum(adata.var.highly_variable)
adata = adata[:, adata.var.highly_variable]

CPU times: user 2.82 s, sys: 8.13 ms, total: 2.83 s
Wall time: 2.81 s


### Regress out confounding factors (number of counts, ribosomal gene expression)

In [48]:
ribo_genes = adata.var_names.str.startswith(RIBO_GENE_PREFIX)

In [49]:
%%time
n_counts = np.array(adata.X.sum(axis=1))
adata.obs['percent_ribo'] = np.array(np.sum(adata[:, ribo_genes].X, axis=1)) / n_counts
adata.obs['n_counts'] = n_counts

Trying to set attribute `.obs` of view, copying.


CPU times: user 1.58 s, sys: 110 ms, total: 1.69 s
Wall time: 1.69 s


In [None]:
%%time
sc.pp.regress_out(adata, ['n_counts', 'percent_ribo'])

### Scale

In [None]:
%%time
sc.pp.scale(adata, max_value=10)

In [None]:
preprocess_time = time.time()
print("Total Preprocess time : %s" % (preprocess_time-preprocess_start))

## Cluster & Visualize

### Reduce

In [None]:
%%time
# PCA
sc.tl.pca(adata, n_comps=n_components)

### TSNE + K-means

In [None]:
%%time
sc.tl.tsne(adata, n_pcs=tsne_n_pcs, n_jobs=n_jobs)

In [None]:
%%time
kmeans = KMeans(n_clusters=k, random_state=0, n_jobs=n_jobs).fit(adata.obsm['X_pca'])
adata.obs['kmeans'] = kmeans.labels_.astype(str)

In [None]:
%%time
sc.pl.tsne(adata, color=["kmeans"])

In [None]:
%%time
sc.pl.tsne(adata, size=4, color=["ACE2_raw"], color_map="Blues", vmax=1, vmin=-0.05)
sc.pl.tsne(adata, size=4, color=["TMPRSS2_raw"], color_map="Blues", vmax=1, vmin=-0.05)
sc.pl.tsne(adata, size=4, color=["EPCAM_raw"], color_map="Reds", vmax=1,  vmin=-0.05)

### UMAP + Graph clustering

In [None]:
%%time
# KNN graph
sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=knn_n_pcs)

In [None]:
%%time
# UMAP
sc.tl.umap(adata, min_dist=umap_min_dist, spread=umap_spread)

In [None]:
%%time
# Louvain clustering
sc.tl.louvain(adata)

In [None]:
%%time
# Plot
sc.pl.umap(adata, color=["louvain"])

In [None]:
%%time
# Leiden clustering
sc.tl.leiden(adata)

In [None]:
%%time
# Plot
sc.pl.umap(adata, color=["leiden"])

In [None]:
%%time
sc.pl.umap(adata, size=4,color=["ACE2_raw"], color_map="Blues", vmax=1, vmin=-0.05)
sc.pl.umap(adata, size=4, color=["TMPRSS2_raw"], color_map="Blues", vmax=1, vmin=-0.05)
sc.pl.umap(adata, size=4, color=["EPCAM_raw"], color_map="Reds", vmax=1, vmin=-0.05)

## Differential expression analysis

In [None]:
%%time
sc.tl.rank_genes_groups(adata, groupby="louvain", n_genes=ranking_n_top_genes, groups='all', reference='rest')

In [None]:
%%time
sc.pl.rank_genes_groups(adata, n_genes=20)

## Create zoomed-in view

In [None]:
reanalysis_start = time.time()

In [None]:
%%time

adata = adata[adata.obs["EPCAM_raw"] > 0.0, :]
print(adata.X.shape)

In [None]:
%%time
sc.tl.pca(adata, n_comps=n_components)
sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=knn_n_pcs)
sc.tl.umap(adata, min_dist=umap_min_dist, spread=umap_spread)
sc.tl.leiden(adata)

In [None]:
%%time
sc.pl.umap(adata, color=["leiden"])
sc.pl.umap(adata, color=["ACE2_raw"], color_map="Blues", vmax=1, vmin=-0.05)
sc.pl.umap(adata, color=["TMPRSS2_raw"], color_map="Blues", vmax=1, vmin=-0.05)

In [None]:
reanalysis_time = time.time()
print("Total reanalysis time : %s" % (reanalysis_time-reanalysis_start))

In [None]:
print("Full time: %s" % (time.time() - start))