In [1]:
%load_ext autoreload
%autoreload 2

import tqdm, sys, os, time
import pandas as pd
import scipy as sp

import scanpy as sc
import anndata as ad

# Load scRNAseq data

In [3]:
# Read the cached expression matrix and swap its axes; the repr shown by this
# cell reports the resulting n_obs × n_vars shape.
# NOTE(review): presumably the cached file is stored as genes × cells, hence the
# transpose — confirm against whatever produced the cache.
adata = ad.read_h5ad("cache/data-raw-scRNAseq_10x_v3_AIBS-matrix.h5ad").T
adata

AnnData object with n_obs × n_vars = 176584 × 31053

## Load samples

In [83]:
# Per-sample metadata, indexed by the first CSV column; the index is renamed to
# 'sample_name' so it carries the same name as the barcode index built below.
metadata = pd.read_csv(
    "data/raw/scRNAseq_10x_v3_AIBS/sample_metadata.csv",
    index_col=0,
    dtype={
        'Region': 'category',
        'Lib_type': 'category',
        'Gender': 'category',
        'Donor': 'str',
    },
).rename_axis('sample_name')

# Cluster-level annotation table, indexed by 'cluster_id'.
cluster_annotation = pd.read_csv(
    "data/raw/scRNAseq_10x_v3_AIBS/cluster.annotation.csv",
    dtype={'cluster_id': 'category'},
    index_col='cluster_id',
)

# Sample -> cluster assignment. The first line of the file is skipped and the
# columns are named explicitly; each sample then picks up its cluster's
# annotation columns through a lookup on 'cluster_id'.
cluster_membership = pd.read_csv(
    "data/raw/scRNAseq_10x_v3_AIBS/cluster.membership.csv",
    skiprows=1,
    names=['sample_name', 'cluster_id'],
    index_col=0,
    dtype={'cluster_id': 'category'},
).join(cluster_annotation, on='cluster_id')

# The barcode list defines the rows of `obs`. Both joins use pandas' default
# left join on the index, so every barcode is kept and samples missing from
# either table get NaN in the corresponding columns.
obs = (
    pd.read_csv(
        "data/raw/scRNAseq_10x_v3_AIBS/barcodes.tsv.gz",
        compression='gzip',
        skiprows=1,
        names=['sample_name'],
        index_col=0,
    )
    .join(metadata)
    .join(cluster_membership)
)
obs.describe(percentiles=[])

Unnamed: 0,aggr_num,umi.counts,gene.counts,Amp_Date,Amp_PCR_cyles,Lib_Date,Lib_PCR_cycles,Cell_Capture,Lib_Cells,Mean_Reads_perCell,Median_Genes_perCell,Median_UMI_perCell,Total_Cells,Live_Cells,mapped_reads,unmapped_reads,nonconf_mapped_reads,total.reads,doublet.score,size
count,176584.0,176584.0,176584.0,176584,176584.0,176584,176584.0,176584.0,176584.0,176584.0,176584.0,176584.0,176584.0,176584.0,176584.0,176584.0,176584.0,176584.0,176584.0,94170.0
mean,6.585851,30534.110389,4452.454039,2019-01-21 23:06:34.201060096,12.0,2019-01-28 16:45:42.608616704,10.403581,4666.47587,16700.022403,130999.594499,3774.248986,17492.789551,6816.154742,4586.634208,94822.65,1943.099834,9282.95742,106048.7,0.114185,6392.821451
min,1.0,500.0,69.0,2018-11-29 00:00:00,12.0,2018-12-11 00:00:00,9.0,2960.0,8535.0,85837.0,728.0,1044.0,4289.0,2466.0,534.0,0.0,13.0,547.0,0.0,11.0
50%,7.0,19362.0,4889.0,2018-12-07 00:00:00,12.0,2018-12-11 00:00:00,10.0,5094.0,16692.0,116944.0,3636.0,10689.0,7078.0,5094.0,58635.5,1156.0,6007.5,65917.0,0.066667,5854.0
max,12.0,336286.0,12287.0,2019-04-26 00:00:00,12.0,2019-04-30 00:00:00,11.0,5898.0,25378.0,189925.0,6946.0,45605.0,9598.0,5898.0,1078829.0,24399.0,110114.0,1193540.0,0.98,17334.0
std,3.385489,34412.951488,3318.534699,,0.0,,0.592404,929.288491,5570.952733,34875.649653,2488.091004,16251.133514,1616.140661,1081.268542,109239.7,2338.474244,10286.237424,121652.1,0.145514,6210.070362


In [84]:
# Attach the joined per-sample table as the observation annotations.
# NOTE(review): this presumably relies on the rows of `obs` (barcode file order)
# matching the row order of the matrix — verify against the source of the cache.
adata.obs = obs

## Load variables

In [67]:
# Gene annotations: the features file is read without a header row; only the
# 'gene_id' and 'gene_name' columns are kept ('type' is named but not loaded).
var = pd.read_table("data/raw/scRNAseq_10x_v3_AIBS/features.tsv.gz",
                    compression='gzip', header=None,
                    names=['gene_id', 'gene_name', 'type'],
                    usecols=['gene_id', 'gene_name'])

# Index the table by gene_id instead of leaving pandas' default RangeIndex.
# With an integer index AnnData converts the labels to the strings '0', '1', ...
# (with a warning), so `adata.var_names` would carry no gene identity. A plain
# array is assigned so the index stays unnamed and 'gene_id' remains available
# as a regular column alongside 'gene_name'.
var.index = var['gene_id'].astype(str).to_numpy()
adata.var = var

## AnnData 

In [85]:
# Show the AnnData summary: dimensions plus the obs and var column names.
adata

AnnData object with n_obs × n_vars = 176584 × 31053
    obs: 'aggr_num', 'umi.counts', 'gene.counts', 'library_id', 'tube_barcode', 'Seq_batch', 'Region', 'Lib_type', 'Gender', 'Donor', 'Amp_Name', 'Amp_Date', 'Amp_PCR_cyles', 'Lib_Name', 'Lib_Date', 'Replicate_Lib', 'Lib_PCR_cycles', 'Lib_PassFail', 'Cell_Capture', 'Lib_Cells', 'Mean_Reads_perCell', 'Median_Genes_perCell', 'Median_UMI_perCell', 'Saturation', 'Live_percent', 'Total_Cells', 'Live_Cells', 'method', 'exp_component_name', 'mapped_reads', 'unmapped_reads', 'nonconf_mapped_reads', 'total.reads', 'doublet.score', 'cluster_id', 'cluster_label', 'subclass_label', 'class_label', 'cluster_color', 'size'
    var: 'gene_id', 'gene_name'

In [87]:
# Peek at the cluster-related annotation columns of the observations.
adata.obs.loc[:, ['cluster_id', 'cluster_label', 'subclass_label', 'class_label', 'cluster_color', 'size']]

Unnamed: 0_level_0,cluster_id,cluster_label,subclass_label,class_label,cluster_color,size
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAACCCAAGCTTCATG-1L8TX_181211_01_G12,42,L5 IT Tcap_2,L5 IT,Glutamatergic,#52CA74,17334.0
AAACCCAAGGCGCTTC-1L8TX_181211_01_G12,,,,,,
AAACCCAAGGCTTAAA-1L8TX_181211_01_G12,,,,,,
AAACCCAAGTGAGGTC-1L8TX_181211_01_G12,41,L5 IT Tcap_1,L5 IT,Glutamatergic,#5DDB65,7462.0
AAACCCACACCAGCCA-1L8TX_181211_01_G12,42,L5 IT Tcap_2,L5 IT,Glutamatergic,#52CA74,17334.0
...,...,...,...,...,...,...
TTTGTTGTCATTGCGA-12L8TX_190430_01_G08,41,L5 IT Tcap_1,L5 IT,Glutamatergic,#5DDB65,7462.0
TTTGTTGTCCCAACTC-12L8TX_190430_01_G08,43,L5 IT S100b,L5 IT,Glutamatergic,#00CF1E,4272.0
TTTGTTGTCCGACAGC-12L8TX_190430_01_G08,,,,,,
TTTGTTGTCTATGCCC-12L8TX_190430_01_G08,42,L5 IT Tcap_2,L5 IT,Glutamatergic,#52CA74,17334.0


# Preprocessing

In [88]:
# Run PCA on `adata` in place using the ARPACK SVD solver; the summary printed
# by the following cell shows the results under uns['pca'], obsm['X_pca'] and
# varm['PCs'].
# NOTE(review): no normalisation, log-transform or scaling step is visible in
# this notebook before PCA — confirm the cached matrix is already preprocessed,
# or that running PCA on it as-is is intended.
sc.tl.pca(adata, svd_solver='arpack')

In [89]:
# Show the AnnData summary again to confirm which fields the PCA step added.
adata

AnnData object with n_obs × n_vars = 176584 × 31053
    obs: 'aggr_num', 'umi.counts', 'gene.counts', 'library_id', 'tube_barcode', 'Seq_batch', 'Region', 'Lib_type', 'Gender', 'Donor', 'Amp_Name', 'Amp_Date', 'Amp_PCR_cyles', 'Lib_Name', 'Lib_Date', 'Replicate_Lib', 'Lib_PCR_cycles', 'Lib_PassFail', 'Cell_Capture', 'Lib_Cells', 'Mean_Reads_perCell', 'Median_Genes_perCell', 'Median_UMI_perCell', 'Saturation', 'Live_percent', 'Total_Cells', 'Live_Cells', 'method', 'exp_component_name', 'mapped_reads', 'unmapped_reads', 'nonconf_mapped_reads', 'total.reads', 'doublet.score', 'cluster_id', 'cluster_label', 'subclass_label', 'class_label', 'cluster_color', 'size'
    var: 'gene_id', 'gene_name'
    uns: 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

# Save

In [None]:
# Export the annotated object to a Loom file. write_obsm_varm=True asks AnnData
# to also write the multi-dimensional annotations (obsm / varm), which is where
# the PCA results live.
adata.write_loom("data/processed/scRNAseq_10x_v3_AIBS.loom", write_obsm_varm=True)