# Import libraries

In [1]:
%load_ext autoreload
%autoreload 2

import tqdm, sys, os, time, logging
logger = logging.getLogger("numba")
logger.setLevel(logging.ERROR)

import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse as sps

import scanpy as sc
import anndata as ad
import muon as mu

from sklearn.metrics import adjusted_rand_score as ari

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


## Load and preprocess individual datasets

### scRNA-seq A

In [3]:
ad_scrnaseq_A = mu.ad.read_loom("data/processed/scRNAseq_10x_v3_AIBS.loom", sparse=True)
ad_scrnaseq_A.obs.replace('nan', None, inplace=True)

ad_scrnaseq_A.obs.set_index("sample_name", inplace=True)
ad_scrnaseq_A.var.set_index("gene_name", inplace=True)

  def twobit_to_dna(twobit: int, size: int) -> str:
  def dna_to_twobit(dna: str) -> int:
  def twobit_1hamming(twobit: int, size: int) -> List[int]:


In [4]:
nonzero_gene_counts = pd.Series(np.array((ad_scrnaseq_A.X>0).sum(1)).flatten(),
                                index=ad_scrnaseq_A.obs.index)
ad_scrnaseq_A.obs['gene.counts'] = ad_scrnaseq_A.obs['gene.counts'].combine_first(nonzero_gene_counts)

ad_scrnaseq_A.obs['doublet.score'].fillna(0.0, inplace=True)

In [6]:
def filter_cell_A(s):
    if s['class_label'] == 'Low Quality':
        return False

    condition = True
    
    # Gene count
    if s['class_label'] == 'Non-Neuronal':
        condition = condition & (s['gene.counts'] >= 1000)
    else:
        condition = condition & (s['gene.counts'] >= 2000)

    # Doublet cells
    condition = condition & (s['doublet.score'] <= 0.3)

    return condition
    
ad_scrnaseq_A = ad_scrnaseq_A[ad_scrnaseq_A.obs.apply(filter_cell_A, axis=1), :]
ad_scrnaseq_A

View of AnnData object with n_obs × n_vars = 74132 × 31053
    obs: 'Amp_Date', 'Amp_Name', 'Amp_PCR_cyles', 'Cell_Capture', 'Donor', 'Gender', 'Lib_Cells', 'Lib_Date', 'Lib_Name', 'Lib_PCR_cycles', 'Lib_PassFail', 'Lib_type', 'Live_Cells', 'Live_percent', 'Mean_Reads_perCell', 'Median_Genes_perCell', 'Median_UMI_perCell', 'Region', 'Replicate_Lib', 'Saturation', 'Seq_batch', 'Total_Cells', 'aggr_num', 'class_label', 'cluster_color', 'cluster_id', 'cluster_label', 'doublet.score', 'exp_component_name', 'gene.counts', 'library_id', 'mapped_reads', 'method', 'nonconf_mapped_reads', 'size', 'subclass_label', 'total.reads', 'tube_barcode', 'umi.counts', 'unmapped_reads'
    var: 'gene_id'

### snRNA-seq B

In [9]:
ad_snrnaseq_B = mu.ad.read_loom("data/processed/snRNAseq_10x_v3_Broad.loom")
ad_snrnaseq_B.obs.replace('nan', None, inplace=True)

ad_snrnaseq_B.obs.set_index("sample_name", inplace=True)
ad_snrnaseq_B.var.set_index("gene_name", inplace=True)

In [10]:
ad_snrnaseq_B

AnnData object with n_obs × n_vars = 215823 × 31053
    obs: 'Allen.class_label', 'Allen.cluster_color', 'Allen.cluster_id', 'Allen.cluster_label', 'Allen.subclass_label', 'Broad.QC.Mito', 'Broad.QC.doublet', 'Broad.passQC', 'Comb.QC', 'MALE', 'QC', 'cl', 'class_label', 'cluster', 'cluster_color', 'cluster_id', 'cluster_label', 'comb.QC', 'dataset', 'gene.counts', 'nGene', 'nUMI', 'size', 'subclass_label', 'umi.counts'
    var: 'gene_id'

## Filtering low QC genes & cells

In [11]:
nonzero_gene_counts = pd.Series(np.array((ad_snrnaseq_B.X>0).sum(1)).flatten(),
                                index=ad_snrnaseq_B.obs.index)
ad_snrnaseq_B.obs['gene.counts'] = ad_snrnaseq_B.obs['gene.counts'].combine_first(nonzero_gene_counts)

ad_snrnaseq_B.obs['Broad.QC.doublet'].fillna(0.0, inplace=True)

In [12]:
def filter_cell_B(s):
    if s['class_label'] == 'Low Quality':
        return False
    
    condition = True

    # Gene count
    if s['class_label'] == 'Non-Neuronal':
        condition = condition & (s['gene.counts'] >= 500)
    else:
        condition = condition & (s['gene.counts'] >= 1000)

    # Doublet cells
    condition = condition & (s['Broad.QC.doublet'] <= 0.3)

    return condition

ad_snrnaseq_B = ad_snrnaseq_B[ad_snrnaseq_B.obs.apply(filter_cell_B, axis=1), :]
ad_snrnaseq_B

View of AnnData object with n_obs × n_vars = 184423 × 31053
    obs: 'Allen.class_label', 'Allen.cluster_color', 'Allen.cluster_id', 'Allen.cluster_label', 'Allen.subclass_label', 'Broad.QC.Mito', 'Broad.QC.doublet', 'Broad.passQC', 'Comb.QC', 'MALE', 'QC', 'cl', 'class_label', 'cluster', 'cluster_color', 'cluster_id', 'cluster_label', 'comb.QC', 'dataset', 'gene.counts', 'nGene', 'nUMI', 'size', 'subclass_label', 'umi.counts'
    var: 'gene_id'

In [10]:
assert (ad_snrnaseq_B.var.index == ad_scrnaseq_A.var.index).all()
ad_scrnaseq_A.var = ad_scrnaseq_A.var.reset_index().set_index('gene_id')
ad_snrnaseq_B.var = ad_scrnaseq_A.var

In [14]:
ad_scrnaseq_A.var.index.duplicated().any(), ad_snrnaseq_B.var.index.duplicated().any()

(False, False)

# Integrate cells from multiple dataset

In [15]:
mdata = mu.MuData({"scrna": ad_scrnaseq_A, 
                   "snrna": ad_snrnaseq_B}, 
                    axis=1)
mdata

In [20]:
mdata.X = sps.vstack([mdata['scrna'].X, mdata['snrna'].X])

In [28]:
mdata.update()
mdata

## Save

In [33]:
mdata['snrna'].obs['Comb.QC'].fillna("nan", inplace=True)

In [34]:
mdata.write_h5mu("data/processed/scRNAseq_snRNAseq_filteredQC.h5mu")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[key] = c
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[key] = c
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[key] = c
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:/