In [1]:
%load_ext autoreload
%autoreload 2

import tqdm, sys, os, time
import pandas as pd
import scipy as sp

import scanpy as sc
import anndata as ad

# Load snRNAseq data

In [2]:
# Read the cached expression matrix and transpose it immediately.
# NOTE(review): presumably the cache stores the matrix as genes x cells, hence
# the transpose -- confirm against the step that wrote the cache file.
adata = ad.read_h5ad(
    "cache/data-raw-snRNAseq_10x_v3_Broad-matrix.h5ad"
).transpose()
adata

AnnData object with n_obs × n_vars = 215823 × 31053

## Load samples

In [3]:
# Per-cell annotations are assembled from four files in the raw Broad snRNAseq
# directory: the barcode list, the per-sample metadata, the cluster assignment
# of each barcode, and the description of each cluster.
BROAD_RAW_DIR = "data/raw/snRNAseq_10x_v3_Broad/"

# The barcode file only supplies the row index: its own header row is skipped
# and the single column is named `sample_name`.
obs = pd.read_csv(BROAD_RAW_DIR + "barcodes.csv.gz",
                  compression='gzip', skiprows=1,
                  names=['sample_name'], index_col=0)
n_barcodes = len(obs)

# Label-like columns are forced to strings so pandas does not infer numeric
# dtypes for them. The 'Unnamed: 0' column is dropped as part of the same
# expression instead of mutating the frame in place.
metadata = pd.read_csv(BROAD_RAW_DIR + "sample_metadata.csv",
                       dtype={'dataset': 'str',
                              'QC': 'str',
                              'Gender': 'str',
                              'Allen.cluster_id': 'str',
                              'comb.QC': 'str',
                             },
                       index_col='sample_name', na_values=['nan'],
                       ).drop(columns=['Unnamed: 0'])

# barcode -> cluster_id; the file's header row is replaced by explicit names.
cluster_membership = pd.read_csv(BROAD_RAW_DIR + "cluster.membership.csv",
                                 names=['barcode', 'cluster_id'],
                                 dtype={'cluster_id': 'str'},
                                 skiprows=1, index_col=0, na_values=['nan'],)

# cluster_id -> annotation columns; cluster_id is read as a string so it matches
# the dtype used in cluster_membership.
cluster_annotation = pd.read_csv(BROAD_RAW_DIR + "cluster.annotation.csv",
                                 index_col='cluster_id',
                                 dtype={'cluster_id': 'str'}, na_values=['nan'],)

# Attach the cluster annotation to every barcode that has a cluster assignment.
cluster_membership = cluster_membership.join(cluster_annotation, on='cluster_id')

# Left joins on the barcode index: barcodes without metadata or without a
# cluster assignment keep their row and get NaN in the joined columns.
obs = obs.join(metadata).join(cluster_membership)

# A left join never drops rows, so a different length can only mean that a join
# key was duplicated in one of the right-hand tables and rows were multiplied.
# That would break the one-row-per-barcode layout needed for `adata.obs`.
assert len(obs) == n_barcodes, (
    f"joins changed the number of rows: {n_barcodes} barcodes -> {len(obs)} rows"
)

obs.describe(percentiles=[])

Unnamed: 0,nUMI,nGene,size,gene.counts,umi.counts,Broad.QC.doublet,Broad.QC.Mito,Broad.passQC,MALE,cl
count,215823.0,215823.0,174348.0,174348.0,174348.0,174348.0,174348.0,174348.0,174348.0,174348.0
mean,11933.397219,3693.444721,10881.678344,3995.986326,12818.433587,0.043944,0.02726,0.927649,0.445328,93.735609
std,9612.197936,1947.415193,8103.741188,1474.187164,7155.579677,0.111508,0.158758,0.189136,0.078705,36.085813
min,501.0,210.0,14.0,1409.4,2308.564103,0.0,0.0,0.0,0.0,1.0
50%,10228.0,3882.0,14450.0,4481.203891,13449.86817,0.01,0.0,0.99,0.448615,94.0
max,104062.0,10829.0,23971.0,7026.77933,34730.41899,1.0,1.0,1.0,1.0,161.0


In [4]:
# Attach the joined per-cell table as the observation annotations.
# NOTE(review): this presumably relies on the rows of barcodes.csv.gz being in
# the same order as the observations of the matrix -- nothing in the notebook
# checks that; confirm.
adata.obs = obs

## Load variables

In [16]:
# Gene annotations: the file has no header row, so its three columns are named
# here and only gene_id / gene_name are loaded ('type' is skipped).
# NOTE(review): the path points at the scRNAseq_10x_v3_AIBS directory although
# the matrix comes from snRNAseq_10x_v3_Broad -- presumably both datasets share
# the same gene list in the same order; confirm.
var = pd.read_csv(
    "data/raw/scRNAseq_10x_v3_AIBS/features.tsv.gz",
    sep="\t",
    compression='gzip',
    header=None,
    names=['gene_id', 'gene_name', 'type'],
    usecols=['gene_id', 'gene_name'],
)

# Index the variable table by gene name; gene_id stays as its only column.
adata.var = var.set_index("gene_name")

In [17]:
# Gene names serve as the var index, so make them unique.
# NOTE(review): how repeated names are rewritten is decided by anndata
# (presumably a numeric suffix) -- check its documentation if exact names matter.
adata.var_names_make_unique()

## AnnData 

In [18]:
# Show the AnnData summary to check which obs / var columns are now attached.
adata

AnnData object with n_obs × n_vars = 215823 × 31053
    obs: 'nUMI', 'nGene', 'dataset', 'QC', 'cluster', 'Allen.cluster_id', 'Allen.cluster_label', 'Allen.cluster_color', 'Allen.class_label', 'Allen.subclass_label', 'comb.QC', 'cluster_id', 'cluster_label', 'cluster_color', 'class_label', 'subclass_label', 'size', 'gene.counts', 'umi.counts', 'Broad.QC.doublet', 'Broad.QC.Mito', 'Broad.passQC', 'MALE', 'Comb.QC', 'cl'
    var: 'gene_id'

In [20]:
# Preview the cluster annotation columns of obs; barcodes without a cluster
# assignment show up as NaN rows.
adata.obs.loc[:, [
    'cluster_id',
    'cluster_label',
    'subclass_label',
    'class_label',
    'cluster_color',
    'size',
]]

Unnamed: 0_level_0,cluster_id,cluster_label,subclass_label,class_label,cluster_color,size
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
pBICCNsMMrMOpRMiF007d190314_AAAGAACGTTAATGAG,,,,,,
pBICCNsMMrMOpRMiF007d190314_AAAGAACTCTACGGTA,48,L2/3 IT_1,L2/3 IT,Glutamatergic,#6F836B,23971.0
pBICCNsMMrMOpRMiF007d190314_AAAGGGCAGAGGCTGT,,,,,,
pBICCNsMMrMOpRMiF007d190314_AAAGGGCAGGGCTAAC,91,Astro Aqp4_1,Astro,Non-Neuronal,#7C2D1F,16143.0
pBICCNsMMrMOpRMiF007d190314_AAAGGTAGTGGCTGAA,48,L2/3 IT_1,L2/3 IT,Glutamatergic,#6F836B,23971.0
...,...,...,...,...,...,...
pBICCNsMMrMOpRMiM006d190320_TTTGGTTGTATCGCTA,,,,,,
pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,90,Oligo Opalin_4,Oligo,Non-Neuronal,#474662,16566.0
pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,51,L5 IT S100b,L5 IT,Glutamatergic,#00CF1E,8684.0
pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,52,L5 IT Pld5,L5 IT,Glutamatergic,#3CBC92,3621.0


# Save

In [21]:
# Create the output directory if needed so the export does not fail on a fresh
# checkout where data/processed has not been created yet.
os.makedirs("data/processed", exist_ok=True)

# Export the annotated object to loom; write_obsm_varm=True asks anndata to also
# write any obsm / varm entries.
adata.write_loom("data/processed/snRNAseq_10x_v3_Broad.loom", write_obsm_varm=True)

  def twobit_to_dna(twobit: int, size: int) -> str:
  def dna_to_twobit(dna: str) -> int:
  def twobit_1hamming(twobit: int, size: int) -> List[int]:
