In [1]:
from typing import Dict, Iterable, Optional

import numpy as np
import torch
from torch.distributions import Normal, Poisson
from torch.distributions import kl_divergence as kld
from torch import tensor
import scanpy as sc
import anndata
import pandas as pd
import tools
import logging
# torch.autograd.set_det

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Load the inputs: the RNA AnnData, the ATAC AnnData, and the ATAC peak annotation table.
rna_data = anndata.read_h5ad("data/datasets/mouse_brain_rna.h5ad")
atac_data = anndata.read_h5ad("data/datasets/mouse_brain_atac.h5ad")
# The annotation file is tab-separated, hence sep='\t'.
atac_annotation = pd.read_csv("data/datasets/e18_mouse_brain_fresh_5k_atac_peak_annotation.tsv", sep='\t')
# Preview the first five annotation rows.
atac_annotation.head(5)



Unnamed: 0,peak,gene,distance,peak_type
0,chr1_3094399_3095523,,,intergenic
1,chr1_3113503_3114077,,,intergenic
2,chr1_3119414_3121804,,,intergenic
3,chr1_3181343_3181401,,,intergenic
4,chr1_3198296_3198611,,,intergenic


In [3]:
## Preprocess the RNA data
# Keep only genes that pass the min_cells=15 filter.
sc.pp.filter_genes(rna_data, min_cells=15)
# Store the filtered object in .raw *before* X is normalized and log-transformed below.
rna_data.raw = rna_data
# Normalize every cell to a total of 1e4, then apply log1p.
sc.pp.normalize_total(rna_data, target_sum=1e4)
sc.pp.log1p(rna_data)
# Annotate highly variable genes (adds `highly_variable` and `dispersions_norm` to .var, used below).
sc.pp.highly_variable_genes(rna_data, min_disp = 0.2)
# 'exog'  = every highly variable gene.
# 'endog' = the subset of 'exog' genes whose normalized dispersion exceeds 0.7.
rna_data.var['exog'] = rna_data.var.highly_variable.copy()
rna_data.var['endog'] = rna_data.var.exog & (rna_data.var.dispersions_norm > 0.7)
# Expose the matrix that was stored in .raw above as a 'counts' layer.
rna_data.layers['counts'] = rna_data.raw.to_adata().X.copy()
# Display the per-gene 'exog' flags.
rna_data.var['exog']

Xkr4               True
Gm1992            False
Gm19938           False
Rp1                True
Mrpl15            False
                  ...  
CAAA01118383.1    False
Vamp7             False
Tmlhe              True
4933409K07Rik     False
AC149090.1        False
Name: exog, Length: 14583, dtype: bool

In [4]:
def generate_peakid(row):
    """Build the peak identifier ``"<chr>_<start>_<end>"`` for one peak record.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` or a ``dict``)
        Must provide the keys ``'chr'``, ``'start'`` and ``'end'``.

    Returns
    -------
    str
        The three fields joined with underscores.

    Notes
    -----
    Every field is converted with ``str()`` before joining, so integer (or
    other non-string) coordinates are accepted. Plain ``+`` concatenation
    would raise ``TypeError`` for them. String inputs give the same result
    as before.
    """
    # Read the fields in a fixed order; no local is named `chr`, so the
    # builtin of that name is not shadowed.
    fields = (row['chr'], row["start"], row["end"])
    return '_'.join(str(field) for field in fields)


def get_atac_exog(row, rna_dict):
    """Report whether any gene annotated to a peak is flagged as 'exog'.

    ``row["gene"]`` is stringified and split on ``";"``. The result is
    ``True`` when at least one of the resulting names is a key of
    ``rna_dict['exog']`` whose value is truthy, and ``False`` otherwise.
    A gene field whose string form is ``'nan'`` (a missing annotation)
    always yields ``False``.
    """
    gene_field = str(row["gene"])
    if gene_field == 'nan':
        return False

    exog_flags = rna_dict['exog']
    return any(
        name in exog_flags.keys() and bool(exog_flags[name])
        for name in gene_field.split(";")
    )


def get_atac_endog(row, rna_dict):
    """Report whether any gene annotated to a peak is flagged as 'endog'.

    ``row["gene"]`` is stringified and split on ``";"``. The result is
    ``True`` when at least one of the resulting names is a key of
    ``rna_dict['endog']`` whose value is truthy, and ``False`` otherwise.
    A gene field whose string form is ``'nan'`` (a missing annotation)
    always yields ``False``.
    """
    gene_field = str(row["gene"])
    if gene_field == 'nan':
        return False

    endog_flags = rna_dict['endog']
    for name in gene_field.split(";"):
        if name in endog_flags.keys() and endog_flags[name]:
            return True
    return False



In [5]:
# Add a "<chr>_<start>_<end>" key to every row of the ATAC var table; it is the
# column the annotation table is merged on further down.
atac_data.var['peak'] =  atac_data.var.apply(generate_peakid, axis=1)
# reset_index() moves the current var index into an ordinary column (it shows up
# as `peak_id` in the output below) and leaves a default integer index behind.
atac_data.var = atac_data.var.reset_index()
atac_data.var.head()


Unnamed: 0,peak_id,chr,start,end,n_cells,peak
0,1,chr1,3094399,3095523,235,chr1_3094399_3095523
1,2,chr1,3113503,3114077,103,chr1_3113503_3114077
2,3,chr1,3119414,3121804,449,chr1_3119414_3121804
3,4,chr1,3198296,3198611,46,chr1_3198296_3198611
4,5,chr1,3210204,3210605,52,chr1_3210204_3210605


In [6]:
# Attach the per-peak annotation columns (gene, distance, peak_type) to the ATAC
# var table, joining on the shared `peak` key.
#
# how='left' keeps exactly one output row per ATAC peak, in the original row
# order, even when a peak is absent from the annotation table (its annotation
# columns are then NaN). The default inner join would drop such peaks, so the
# merged table would no longer have one row per ATAC variable.
# validate='many_to_one' makes pandas raise a MergeError if the annotation table
# lists the same peak more than once, which would otherwise duplicate var rows.
new_var = pd.merge(
    atac_data.var,
    atac_annotation,
    on='peak',
    how='left',
    validate='many_to_one',
)
atac_data.var = new_var
atac_data.var.head(10)

AnnData expects .var.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


Unnamed: 0,peak_id,chr,start,end,n_cells,peak,gene,distance,peak_type
0,1,chr1,3094399,3095523,235,chr1_3094399_3095523,,,intergenic
1,2,chr1,3113503,3114077,103,chr1_3113503_3114077,,,intergenic
2,3,chr1,3119414,3121804,449,chr1_3119414_3121804,,,intergenic
3,4,chr1,3198296,3198611,46,chr1_3198296_3198611,,,intergenic
4,5,chr1,3210204,3210605,52,chr1_3210204_3210605,,,intergenic
5,6,chr1,3292586,3292976,45,chr1_3292586_3292976,Xkr4;Gm1992,0;-173611,distal;distal
6,7,chr1,3371598,3371961,53,chr1_3371598_3371961,Xkr4;Gm1992,0;-94626,distal;distal
7,8,chr1,3399683,3400422,132,chr1_3399683_3400422,Xkr4;Gm1992,0;-66165,distal;distal
8,9,chr1,3477039,3477771,87,chr1_3477039_3477771,Xkr4;Gm1992,0;10453,distal;distal
9,10,chr1,3508598,3509041,55,chr1_3508598_3509041,Xkr4;Gm1992,0;42012,distal;distal


In [7]:
# Plain-dict view of the RNA var table: {column name -> {gene name -> value}}.
rna_dict = rna_data.var.to_dict()
newvar = atac_data.var
# Flag every peak as exog / endog from the genes annotated to it, one column
# per flag, computed row by row.
for flag_name, flag_fn in (('exog', get_atac_exog), ('endog', get_atac_endog)):
    newvar[flag_name] = newvar.apply(flag_fn, rna_dict=rna_dict, axis=1)
# Index the var table by `peak_id` and store it back on the AnnData.
atac_data.var = newvar.set_index('peak_id')

In [8]:
# Display the ATAC var table, now carrying the annotation and exog/endog columns.
atac_data.var

Unnamed: 0_level_0,chr,start,end,n_cells,peak,gene,distance,peak_type,exog,endog
peak_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,chr1,3094399,3095523,235,chr1_3094399_3095523,,,intergenic,False,False
2,chr1,3113503,3114077,103,chr1_3113503_3114077,,,intergenic,False,False
3,chr1,3119414,3121804,449,chr1_3119414_3121804,,,intergenic,False,False
4,chr1,3198296,3198611,46,chr1_3198296_3198611,,,intergenic,False,False
5,chr1,3210204,3210605,52,chr1_3210204_3210605,,,intergenic,False,False
...,...,...,...,...,...,...,...,...,...,...
123879,GL456216.1,37994,39286,266,GL456216.1_37994_39286,CAAA01118383.1,22114,distal,False,False
123880,GL456216.1,40534,41810,139,GL456216.1_40534_41810,CAAA01118383.1,24654,distal,False,False
123881,GL456216.1,49045,49376,35,GL456216.1_49045_49376,CAAA01118383.1,33165,distal,False,False
123882,JH584292.1,12573,14868,231,JH584292.1_12573_14868,Vmn2r122,9001,distal,False,False


In [9]:
## Scan the ATAC peaks for motif hits
# Set the motif-scan logger to INFO level.
tools.motif_scan.logger.setLevel(logging.INFO)
# `chrom` / `start` / `end` are given the names of the var columns that hold each
# peak's coordinates (presumably how the helper locates them; confirm in `tools`).
# NOTE(review): genome_fasta is a bare relative path, so 'mm10.fa' has to be
# reachable from the notebook's working directory.
tools.get_motif_hits_in_peaks(atac_data,
                    genome_fasta='mm10.fa',
                    chrom = 'chr', start = 'start', end = 'end')

123883it [00:01, 62607.47it/s]


In [4]:
# Rebind atac_data / rna_data to AnnData objects loaded from disk, replacing
# whatever these names held before.
# NOTE(review): "mouse_brain_atac_motifs2.h5ad" is the same path that later cells
# in this notebook write with write_h5ad, so on a fresh kernel this cell only
# works if that file already exists from an earlier session.
atac_data =  anndata.read_h5ad("data/datasets/mouse_brain_atac_motifs2.h5ad")
rna_data =  anndata.read_h5ad("data/datasets/mouse_brain_rna_exn.h5ad")


In [5]:
# Show the factor (motif) metadata for atac_data; the output below lists
# the columns id, name and parsed_name.
tools.fetch_factor_meta(atac_data)

Unnamed: 0,id,name,parsed_name
0,MA0973.1,CDF2,CDF2
1,MA1274.1,OBP3,OBP3
2,MA1559.1,SNAI3,SNAI3
3,MA0068.2,PAX4,PAX4
4,MA1063.1,TCP19,TCP19
...,...,...,...
1641,MA0999.1,ERF098,ERF098
1642,MA0930.1,ABF3,ABF3
1643,MA1589.1,ZNF140,ZNF140
1644,MA1007.1,PHYPADRAFT,PHYPADRAFT


In [6]:
# Fetch the factor hits for atac_data; per the output below the result is an
# AnnData of shape 1646 x 123883 (presumably factors x peaks; confirm in `tools`).
tools.fetch_factor_hits(atac_data)


  X = hits_matrix,


AnnData object with n_obs × n_vars = 1646 × 123883
    obs: 'id', 'name', 'parsed_name'
    var: 'chr', 'start', 'end', 'n_cells', 'peak', 'gene', 'distance', 'peak_type', 'exog', 'endog'

In [4]:
# Display the AnnData summary (shape plus obs / var / uns / obsm / varm / obsp keys).
atac_data

AnnData object with n_obs × n_vars = 3365 × 123883
    obs: 'celltype', 'batch_id', 'modality', 'latent_time', '_scvi_batch', '_scvi_labels', 'latent_decouple', 'latent_couple', 'latent_1', 'latent_2', 'latent_3', 'latent_4', 'latent_5', 'latent_6', 'latent_7', 'latent_8', 'latent_9', 'latent_10'
    var: 'chr', 'start', 'end', 'n_cells', 'peak', 'gene', 'distance', 'peak_type', 'exog', 'endog'
    uns: 'motifs', 'neighbors', 'umap'
    obsm: 'X_umap', 'latent'
    varm: 'motifs_hits'
    obsp: 'connectivities', 'distances'

In [8]:
# Save the current atac_data to disk as an .h5ad file.
atac_data.write_h5ad("data/datasets/mouse_brain_atac_motifs2.h5ad")

In [9]:
# View the entries stored under atac_data.uns['motifs'] as a table.
pd.DataFrame(atac_data.uns['motifs'])

Unnamed: 0,id,in_expr_data,name,parsed_name
0,MA0973.1,True,CDF2,CDF2
1,MA1274.1,True,OBP3,OBP3
2,MA1559.1,True,SNAI3,SNAI3
3,MA0068.2,True,PAX4,PAX4
4,MA1063.1,True,TCP19,TCP19
...,...,...,...,...
1641,MA0999.1,True,ERF098,ERF098
1642,MA0930.1,True,ABF3,ABF3
1643,MA1589.1,True,ZNF140,ZNF140
1644,MA1007.1,True,PHYPADRAFT,PHYPADRAFT


In [5]:
# Pass the RNA var names, minus FOS/JUN family genes, as the factors to use.
#
# The FOS/JUN test runs on the upper-cased name. The RNA var names are
# mixed-case gene symbols (e.g. "Xkr4"), so a case-sensitive test for
# 'FOS' / 'JUN' could never match and nothing would be excluded.
# NOTE(review): the motif `parsed_name` values shown earlier are upper-case
# while the RNA var names are mixed-case. Confirm that tools.subset_factors
# matches names case-insensitively; if it does not, pass upper-cased names here.
tools.subset_factors(
    atac_data,
    use_factors=[
        factor
        for factor in rna_data.var_names
        if not any(family in factor.upper() for family in ('FOS', 'JUN'))
    ],
)

In [6]:
# Save atac_data again, overwriting the same .h5ad path used by the earlier write.
atac_data.write_h5ad("data/datasets/mouse_brain_atac_motifs2.h5ad")

In [16]:
# Inspect the motif-hit matrix stored per variable; the output below reports a
# 123883 x 1646 sparse matrix.
atac_data.varm['motifs_hits']

<123883x1646 sparse matrix of type '<class 'numpy.float64'>'
	with 39535877 stored elements in Compressed Sparse Column format>

In [13]:
# Display the AnnData summary once more to check its contents.
atac_data

AnnData object with n_obs × n_vars = 3365 × 123883
    obs: 'celltype', 'batch_id', 'modality', 'latent_time', '_scvi_batch', '_scvi_labels', 'latent_decouple', 'latent_couple', 'latent_1', 'latent_2', 'latent_3', 'latent_4', 'latent_5', 'latent_6', 'latent_7', 'latent_8', 'latent_9', 'latent_10'
    var: 'chr', 'start', 'end', 'n_cells', 'peak', 'gene', 'distance', 'peak_type', 'exog', 'endog'
    uns: 'motifs', 'neighbors', 'umap'
    obsm: 'X_umap', 'latent'
    varm: 'motifs_hits'
    obsp: 'connectivities', 'distances'

In [None]:
### MIRA Test
