# Scanpy: Quality control

#DATA_TITLE:

#DATA_ALL1:


In [1]:
# create a data directory.
!mkdir -p data

# check if file exists before downloading it.
!if [ ! -f data/pbmc_1k_v2_filtered_feature_bc_matrix.h5 ]; then curl -o data/pbmc_1k_v2_filtered_feature_bc_matrix.h5 -O http://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_v2/pbmc_1k_v2_filtered_feature_bc_matrix.h5; fi

!if [ ! -f data/pbmc_1k_v3_filtered_feature_bc_matrix.h5 ]; then curl -o data/pbmc_1k_v3_filtered_feature_bc_matrix.h5 -O http://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_v3/pbmc_1k_v3_filtered_feature_bc_matrix.h5; fi

!if [ ! -f data/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5 ]; then curl -o data/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5 -O http://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5; fi

#DATA_ALL2:

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc


# Logging verbosity levels: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
# Print the versions of the packages in use, for reproducibility.
sc.logging.print_versions()


In [None]:
# Configure scanpy's figure parameters, requesting dpi=80.
sc.settings.set_figure_params(dpi=80)

#DATA_ALL3:

In [None]:
def _load_10x(h5_path, **read_kwargs):
    """Read a 10x .h5 matrix with scanpy and make its variable names unique.

    Extra keyword arguments are forwarded unchanged to ``sc.read_10x_h5``.
    """
    loaded = sc.read_10x_h5(h5_path, **read_kwargs)
    loaded.var_names_make_unique()
    return loaded


# OBS! gex_only is passed for the protein dataset so that only the gene
# expression data is read.
data_p3 = _load_10x(
    './data/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5', gex_only=True)
data_v2 = _load_10x('./data/pbmc_1k_v2_filtered_feature_bc_matrix.h5')
data_v3 = _load_10x('./data/pbmc_1k_v3_filtered_feature_bc_matrix.h5')


#OBJ_TITLE:

In [None]:
# Tag every cell with the library prep it came from before merging, so the
# origin of each cell is kept in the metadata of the combined object.
data_p3.obs['lib_prep'] = 'p3'
data_v3.obs['lib_prep'] = 'v3'
data_v2.obs['lib_prep'] = 'v2'

# Merge into one object (order: v2, v3, p3).
# NOTE(review): AnnData.concatenate is reported as deprecated in recent
# anndata releases in favour of anndata.concat - confirm against the
# installed version before switching.
adata = data_v2.concatenate(data_v3, data_p3)

# The individual datasets are no longer needed; drop them to save space.
del data_v2, data_v3, data_p3


#OBJ_SCRANPY:

In [None]:
# Number of cells contributed by each library prep.
cells_per_lib = adata.obs['lib_prep'].value_counts()
print(cells_per_lib)

# Show a summary of the merged object.
adata

#QC_TITLE:

#QC_ALL1:

#QC_ALL1.1:

#QC_1_SCANPY:


In [None]:
# Compute scanpy's standard QC metrics on the merged object. inplace=True is
# passed, so the results are attached to adata; later cells read
# obs['n_genes_by_counts'] from it.
sc.pp.calculate_qc_metrics(adata, inplace=True)

# We now have many additional data types in the obs slot:
adata

#QC_2_SCANPY:

In [None]:
# Boolean mask over genes: True for names with the 'MT-' prefix.
mito_genes = adata.var_names.str.startswith('MT-')

# Per-cell sums. X is sparse, so summing yields a matrix and `.A1` flattens
# it to a 1-D dense array.
mito_counts = adata[:, mito_genes].X.sum(axis=1).A1
total_counts = adata.X.sum(axis=1).A1

# Fraction of each cell's counts that fall in mito genes vs. all genes.
adata.obs['percent_mito'] = mito_counts / total_counts
# Total counts per cell, stored as an observation annotation.
adata.obs['n_counts'] = adata.X.sum(axis=1).A1

# Number of genes matched by the mito prefix.
print(mito_genes.sum())

#QC_ALL2: 

In [None]:
# Also calculate the fraction of counts in ribosomal protein genes, matched
# here by the 'RPS' / 'RPL' name prefixes.
ribo_genes = adata.var_names.str.startswith(("RPS","RPL"))
print(ribo_genes.sum())

# Per-cell ribo counts divided by per-cell total counts (`.A1` flattens the
# result of summing the sparse matrix).
ribo_counts = adata[:, ribo_genes].X.sum(axis=1).A1
adata.obs['percent_ribo'] = ribo_counts / adata.X.sum(axis=1).A1



#QC_2_SCANPY:

In [None]:
# Display the object again; obs now also carries percent_mito, n_counts and
# percent_ribo from the cells above.
adata

#QC_TITLE2:

#QC_ALL3:

In [None]:
# QC metrics to plot as violins, one group per library prep.
qc_metrics = ['n_genes_by_counts', 'n_counts', 'percent_mito', 'percent_ribo']
sc.pl.violin(
    adata,
    qc_metrics,
    jitter=0.4,
    groupby='lib_prep',
)

#QC_ALL4:


In [None]:
# Pairs of QC metrics to plot against each other (x, y), coloured by
# library prep. The plots are drawn in the order listed.
qc_pairs = [
    ('n_counts', 'percent_mito'),
    ('n_counts', 'n_genes_by_counts'),
    ('n_counts', 'percent_ribo'),
    ('percent_mito', 'percent_ribo'),
]
for x_metric, y_metric in qc_pairs:
    sc.pl.scatter(adata, x=x_metric, y=y_metric, color="lib_prep")

#FILTERING_TITLE:

#FILTERING_ALL0:


In [None]:
# Basic filtering thresholds passed to scanpy's cell and gene filters.
min_genes_per_cell = 200
min_cells_per_gene = 3

# Cells are filtered first, then genes, both on the same object.
sc.pp.filter_cells(adata, min_genes=min_genes_per_cell)
sc.pp.filter_genes(adata, min_cells=min_cells_per_gene)

# Remaining number of cells and genes.
print(adata.n_obs, adata.n_vars)

#FILTERING_ALL3:

In [None]:
detected_genes = adata.obs['n_genes_by_counts']
is_v2 = adata.obs['lib_prep'] == 'v2'

# v2 cells: keep those with strictly between 500 and 2000 detected genes.
keep_v2 = (detected_genes < 2000) & (detected_genes > 500) & is_v2
print(keep_v2.sum())

# Every other library prep (i.e. not 'v2'): keep cells with strictly
# between 1000 and 4100 detected genes.
keep_v3 = (detected_genes < 4100) & (detected_genes > 1000) & ~is_v2
print(keep_v3.sum())

# A cell is kept if it passes the filter for its own group.
keep = keep_v2 | keep_v3
print(keep.sum())
adata = adata[keep, :]

print("Remaining cells %d"%adata.n_obs)

#FILTERING_ALL01:


In [None]:
# Plot the highest-expressed genes, limited to the top 20 (n_top=20).
sc.pl.highest_expr_genes(adata, n_top=20)

#FILTERING_ALL02:

#FILTERING_TITLE2:

#FILTERING_ALL1:

In [None]:
# Keep only cells that pass both thresholds:
#   - mitochondrial fraction below 0.25
#   - ribosomal fraction above 0.05
low_mito = adata.obs['percent_mito'] < 0.25
enough_ribo = adata.obs['percent_ribo'] > 0.05
adata = adata[low_mito & enough_ribo, :]

print("Remaining cells %d"%adata.n_obs)

#FILTERING_ALL2:

#FILTERING_TITLE4:

#FILTERING_ALL5:

In [None]:
# Re-plot the same QC metrics on the filtered object, grouped by library prep.
filtered_qc_metrics = [
    'n_genes_by_counts',
    'n_counts',
    'percent_mito',
    'percent_ribo',
]
sc.pl.violin(adata, filtered_qc_metrics, jitter=0.4, groupby='lib_prep')

#FILTERING_TITLE5:

#FILTERING_ALL6:

In [None]:
# Boolean mask for genes whose name starts with 'MALAT1'.
malat1 = adata.var_names.str.startswith('MALAT1')
# We need to redefine mito_genes since it was first calculated on the full
# object, before low-expressed genes were removed; the old mask no longer
# lines up with the current var_names.
mito_genes = adata.var_names.str.startswith('MT-')

# Combine the masks with explicit boolean operators (logical OR, then NOT)
# rather than arithmetic addition on booleans.
remove = mito_genes | malat1
keep = ~remove

# .copy() turns the subset into a standalone object instead of a view, so
# the object that is written to disk and later modified in place
# (normalization, scaling) is not a view onto earlier subsets.
adata = adata[:, keep].copy()

print(adata.n_obs, adata.n_vars)

#FILTERING_ALL7:

#FILTERING_ALL8:

In [None]:
# Write the QC-filtered object to an .h5ad file under data/.
save_file = 'data/scanpy_qc_filtered_3pbmc.h5ad'
adata.write_h5ad(save_file)

#CELLCYCLE_TITLE:

#CELLCYCLE_ALL1:

#CELLCYCLE_1_SCANPY:


In [None]:
!if [ ! -f data/regev_lab_cell_cycle_genes.txt ]; then curl -o data/regev_lab_cell_cycle_genes.txt https://raw.githubusercontent.com/theislab/scanpy_usage/master/180209_cell_cycle/data/regev_lab_cell_cycle_genes.txt; fi
    

In [None]:
# Read one gene name per line. The `with` block guarantees the file handle
# is closed once the list has been built.
with open('./data/regev_lab_cell_cycle_genes.txt') as gene_file:
    cell_cycle_genes = [x.strip() for x in gene_file]
print(len(cell_cycle_genes))

# Split into 2 lists by position: the first 43 entries become s_genes and
# the remainder become g2m_genes.
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]

# Keep only the genes that are present in the dataset.
# NOTE(review): only cell_cycle_genes is filtered here; s_genes and g2m_genes
# keep every entry from the file, including genes absent from adata.var_names.
cell_cycle_genes = [x for x in cell_cycle_genes if x in adata.var_names]
print(len(cell_cycle_genes))

#CELLCYCLE_2_SCANPY: 

In [None]:
# Store the object as it is now, before the normalization, log and scaling
# steps below, in the .raw slot.
adata.raw = adata

# Normalize every cell to a depth of 10 000 counts.
# NOTE(review): normalize_per_cell is reported as deprecated in newer scanpy
# releases in favour of normalize_total - confirm against the installed
# version before switching, as the two may differ in side effects.
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)

# Log-transform (log1p).
sc.pp.log1p(adata)

# Scale the data.
sc.pp.scale(adata)

#CELLCYCLE_3_SCANPY: 

In [None]:
# Score each cell for cell-cycle phase using the S and G2/M gene lists built
# above. Presumably this is what adds the 'S_score' and 'G2M_score' columns
# plotted in the next cell - confirm against the scanpy docs.
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)

#CELLCYCLE_ALL2:

In [None]:
# Violin plots of the two cell-cycle scores, grouped by library prep.
phase_scores = ['S_score', 'G2M_score']
sc.pl.violin(
    adata,
    phase_scores,
    jitter=0.4,
    groupby='lib_prep',
)

#CELLCYCLE_ALL3:



In [None]:
# Display the final object.
adata