# Compare neighbourhoods

In [67]:
import scanpy as sc
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import random
import scipy as scipy

## Load data

In [5]:
# Load the mouse AnnData object from disk; the bare name on the last line
# makes its summary the cell output.
m_data = sc.read_h5ad("../data-in/mouse/anndata.h5ad")
m_data

AnnData object with n_obs × n_vars = 430339 × 23972
    obs: 'sample', 'stage', 'stage.mapped', 'celltype', 'celltype.extended', 'tube_name', 'somite_count', 'cluster', 'cluster.sub', 'louvain', 'leiden', 'celltype.clustering', 'day'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'celltype_colors', 'draw_graph', 'leiden', 'louvain', 'neighbors', 'pca', 'umap'
    obsm: 'X_draw_graph_fa', 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [89]:
# Load the rabbit AnnData object from disk; the bare name on the last line
# makes its summary the cell output.
r_data = sc.read_h5ad("../data-in/rabbit/anndata.h5ad")
r_data

AnnData object with n_obs × n_vars = 146133 × 30725
    obs: 'cell', 'barcode', 'sample', 'stage', 'batch', 'doub.density', 'doublet', 'stripped', 'cluster', 'cluster.sub', 'cluster.stage', 'cluster.theiler', 'sizeFactor', 'celltype', 'singler', 'leiden_res1', 'leiden_res2', 'leiden_res1_5', 'leiden_res2_5', 'leiden_res3', 'leiden_res5', 'leiden_res10', 'leiden_res6', 'leiden_res7', 'leiden_res8', 'anatomical_loc', 'day'
    var: 'ensembl_ids'
    uns: 'anatomical_loc_colors', 'celltype_colors', 'draw_graph', 'leiden', 'neighbors', 'singler_colors', 'umap'
    obsm: 'X_draw_graph_fa', 'X_pca', 'X_tsne', 'X_umap'
    layers: 'logcounts'
    obsp: 'connectivities', 'distances'

In [96]:
# Read the tab-separated ortholog table. Later cells use its "ref" column for
# rabbit gene IDs and its "query" column for mouse gene IDs.
# NOTE(review): the recorded output shows an "Unnamed: 0" column ahead of
# "ref" and "query", so positional column access (iloc[:, 0] / iloc[:, 1]) does
# not line up with those names -- confirm whether index_col=0 was intended.
orthologs = pd.read_csv("../data-in/orthologs.tsv", sep="\t")
orthologs

Unnamed: 0,ref,query
ENSOCUG00000000006,ENSOCUG00000000006,ENSMUSG00000026102
ENSOCUG00000000007,ENSOCUG00000000007,ENSMUSG00000028480
ENSOCUG00000000008,ENSOCUG00000000008,ENSMUSG00000070999
ENSOCUG00000000009,ENSOCUG00000000009,ENSMUSG00000028478
ENSOCUG00000000010,ENSOCUG00000000010,ENSMUSG00000028479
...,...,...
ENSOCUG00000039241,ENSOCUG00000039241,ENSMUSG00000022255
ENSOCUG00000039281,ENSOCUG00000039281,ENSMUSG00000024885
ENSOCUG00000039392,ENSOCUG00000039392,ENSMUSG00000014852
ENSOCUG00000039553,ENSOCUG00000039553,ENSMUSG00000009566


## Load neighbourhood data

In [90]:
# Read the rabbit neighbourhood matrix (Matrix Market file) as an AnnData.
# NOTE(review): the recorded output has 146133 rows, the same as r_data's
# n_obs, so rows are presumably cells and columns neighbourhoods -- confirm.
r_nhoods = sc.read_mtx("../data-out/compare_neighbourhoods/r_nhoods.mtx")
r_nhoods

AnnData object with n_obs × n_vars = 146133 × 5392

In [91]:
# Read the mouse neighbourhood matrix (Matrix Market file) as an AnnData.
# NOTE(review): the recorded output has 430339 rows, the same as m_data's
# n_obs, so rows are presumably cells and columns neighbourhoods -- confirm.
m_nhoods = sc.read_mtx("../data-out/compare_neighbourhoods/m_nhoods.mtx")
m_nhoods

AnnData object with n_obs × n_vars = 430339 × 14034

### Downsample

In [64]:
# Draw a random subset of at most 150000 mouse cells and apply the same row
# selection to the expression data and to the neighbourhood matrix.
# The population size is taken from m_data itself rather than hard-coded, and a
# dedicated, fixed-seed generator is used so that Restart & Run All selects the
# same cells every time without touching the global `random` state.
ds_cells = random.Random(0).sample(range(m_data.n_obs), min(150000, m_data.n_obs))
m_dataDS = m_data[ds_cells,:]
m_nhoodsDS = m_nhoods[ds_cells,:]
m_dataDS

  if not is_categorical(df_full[k]):


View of AnnData object with n_obs × n_vars = 150000 × 23972
    obs: 'sample', 'stage', 'stage.mapped', 'celltype', 'celltype.extended', 'tube_name', 'somite_count', 'cluster', 'cluster.sub', 'louvain', 'leiden', 'celltype.clustering', 'day'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'celltype_colors', 'draw_graph', 'leiden', 'louvain', 'neighbors', 'pca', 'umap'
    obsm: 'X_draw_graph_fa', 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

### Compute gene specificity

In [137]:
# NaN entries correspond to genes whose mean expression is zero in every group;
# such genes can be removed before calling this function.
def calcGeneSpecificity(adata,group_by=None, mask=None):
    """Compute a group-by-gene matrix of gene specificity.

    For each group ``c`` and gene ``g`` the specificity is the mean expression
    of ``g`` within ``c`` divided by the average of those group means taken
    over all non-empty groups::

        gspec[c, g] = mean[c, g] / ((1 / N) * sum_i mean[i, g])

    The normaliser is therefore computed per gene (down the group axis). The
    earlier implementation summed across genes instead, which only rescaled
    each group's row by a constant.

    Parameters
    ----------
    adata
        Object exposing ``X`` (cells x genes, sparse or dense) and, when
        ``group_by`` is used, ``obs``.
    group_by
        Name of the ``adata.obs`` column to one-hot encode into groups. Only
        used when ``mask`` is None.
    mask
        Cells x groups membership matrix (scipy sparse, array-like or
        DataFrame). Its column sums are used as the group sizes.

    Returns
    -------
    numpy.ndarray
        Dense float array of shape (n_groups, n_genes). Columns of genes with
        zero mean in every group are NaN; rows of groups without any cells
        are NaN and do not contribute to the per-gene normaliser.

    Raises
    ------
    ValueError
        If neither ``group_by`` nor ``mask`` is given.
    """
    if mask is None:
        if group_by is None:
            raise ValueError("calcGeneSpecificity requires either `group_by` or `mask`")
        mask = pd.get_dummies(adata.obs[group_by])

    # Normalise the membership matrix to float CSR so boolean dummies and
    # dense inputs behave the same in the products below.
    if scipy.sparse.issparse(mask):
        mask = scipy.sparse.csr_matrix(mask, dtype=np.float64)
    else:
        mask = scipy.sparse.csr_matrix(np.asarray(mask, dtype=np.float64))

    # ravel (not squeeze) keeps a 1-D vector even when there is a single group.
    ncells = np.asarray(mask.sum(axis=0), dtype=np.float64).ravel()
    has_cells = ncells > 0
    inv_ncells = np.zeros_like(ncells)
    inv_ncells[has_cells] = 1.0 / ncells[has_cells]

    # Per-group sums, then per-group means (groups x genes).
    ctype_sum = mask.T @ adata.X
    ctype_mean = scipy.sparse.diags(inv_ncells) @ ctype_sum
    if scipy.sparse.issparse(ctype_mean):
        ctype_mean = ctype_mean.toarray()
    ctype_mean = np.asarray(ctype_mean, dtype=np.float64)

    # Empty groups have all-zero rows here, so they add nothing to the sum and
    # are excluded from the count used for the average.
    n_groups = int(has_cells.sum())
    with np.errstate(divide="ignore", invalid="ignore"):
        gene_mean = ctype_mean.sum(axis=0, keepdims=True) / n_groups
        gspec = ctype_mean / gene_mean  # 0/0 -> NaN for non-expressed genes

    # A group without cells has no defined mean expression.
    gspec[~has_cells, :] = np.nan

    return gspec



def filterGenes(adata,orthologs,top_hvgs=None):
    """Restrict ``adata`` to expressed genes that are listed in ``orthologs``.

    Parameters
    ----------
    adata
        AnnData-like object; genes are identified by ``adata.var.index``.
    orthologs
        Collection of gene identifiers to keep (membership is tested with
        ``Index.isin``).
    top_hvgs
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The gene-subsetted object obtained by indexing ``adata`` twice.
    """
    # With inplace=False nothing is modified; element 0 of the returned pair
    # is used as the per-gene keep mask for min_counts=1.
    is_expressed = sc.pp.filter_genes(adata, min_counts=1, inplace=False)[0]
    expressed = adata[:, is_expressed]

    # Of the expressed genes, keep those whose ID occurs among the orthologs.
    in_orthologs = expressed.var.index.isin(orthologs)
    return expressed[:, in_orthologs]


def chooseCommonGenes(r_filt,m_filt,orthologs,use_hvgs=False,join_type="intersect"):
    """Select the ortholog pairs whose genes are present in both datasets.

    Parameters
    ----------
    r_filt, m_filt
        Objects exposing ``var`` (a DataFrame indexed by gene ID). When
        ``use_hvgs`` is True, ``var["highly_variable"]`` must be a boolean
        column.
    orthologs
        DataFrame of ortholog pairs. The columns named ``"ref"`` (matched
        against ``r_filt``) and ``"query"`` (matched against ``m_filt``) are
        used when both exist; otherwise the first and second columns are used
        positionally, as before.
    use_hvgs
        If True, additionally require the pair to be highly variable.
    join_type
        ``"union"`` keeps a pair if either gene is highly variable; any other
        value keeps a pair only if both genes are highly variable.

    Returns
    -------
    pandas.DataFrame
        The rows of ``orthologs`` that pass the filters, in original order.
    """
    # Address the ID columns by name, consistent with how callers index the
    # table (orthologs["ref"], common_genes["query"]). Positional access picks
    # the wrong columns when the table carries an extra leading column.
    if "ref" in orthologs.columns and "query" in orthologs.columns:
        ref_ids = orthologs["ref"]
        query_ids = orthologs["query"]
    else:
        ref_ids = orthologs.iloc[:,0]
        query_ids = orthologs.iloc[:,1]

    r_genes = r_filt.var.index
    m_genes = m_filt.var.index

    # All masks are plain positional boolean arrays over the rows of
    # `orthologs`, so no implicit index alignment is involved.
    keep = np.asarray(ref_ids.isin(r_genes)) & np.asarray(query_ids.isin(m_genes))

    if use_hvgs:
        r_hvgs = r_genes[np.asarray(r_filt.var["highly_variable"], dtype=bool)]
        m_hvgs = m_genes[np.asarray(m_filt.var["highly_variable"], dtype=bool)]
        r_is_hvg = np.asarray(ref_ids.isin(r_hvgs))
        m_is_hvg = np.asarray(query_ids.isin(m_hvgs))

        if join_type == "union":
            keep = keep & (r_is_hvg | m_is_hvg)
        else:
            keep = keep & (r_is_hvg & m_is_hvg)

    return orthologs.loc[keep, :]


In [115]:
# Log-transform the rabbit data exactly once, then flag the top 2000 highly
# variable genes and keep expressed genes that have an ortholog ("ref" column).
# The recorded output of this cell lists 'log1p' in r_data.uns although the
# freshly loaded object does not, i.e. the transform was applied in the session
# that produced the results. Guarding on that key (instead of commenting the
# call out) keeps the cell reproducible on a fresh kernel and safe to re-run.
if "log1p" not in r_data.uns:
    sc.pp.log1p(r_data)
sc.pp.highly_variable_genes(r_data,n_top_genes=2000)
r_filt = filterGenes(r_data,orthologs=orthologs["ref"])
r_filt 

  if not is_categorical(df_full[k]):


View of AnnData object with n_obs × n_vars = 146133 × 13480
    obs: 'cell', 'barcode', 'sample', 'stage', 'batch', 'doub.density', 'doublet', 'stripped', 'cluster', 'cluster.sub', 'cluster.stage', 'cluster.theiler', 'sizeFactor', 'celltype', 'singler', 'leiden_res1', 'leiden_res2', 'leiden_res1_5', 'leiden_res2_5', 'leiden_res3', 'leiden_res5', 'leiden_res10', 'leiden_res6', 'leiden_res7', 'leiden_res8', 'anatomical_loc', 'day'
    var: 'ensembl_ids', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'anatomical_loc_colors', 'celltype_colors', 'draw_graph', 'leiden', 'neighbors', 'singler_colors', 'umap', 'log1p', 'hvg'
    obsm: 'X_draw_graph_fa', 'X_pca', 'X_tsne', 'X_umap'
    layers: 'logcounts'
    obsp: 'connectivities', 'distances'

In [116]:
# Flag the top 2000 highly variable genes on the mouse data (this annotates
# m_data in place), then keep expressed genes that appear in the "query"
# column of the ortholog table. The last line displays the result.
sc.pp.highly_variable_genes(m_data,n_top_genes=2000)
m_filt = filterGenes(m_data, orthologs=orthologs["query"])
m_filt 

  if not is_categorical(df_full[k]):


View of AnnData object with n_obs × n_vars = 430339 × 13697
    obs: 'sample', 'stage', 'stage.mapped', 'celltype', 'celltype.extended', 'tube_name', 'somite_count', 'cluster', 'cluster.sub', 'louvain', 'leiden', 'celltype.clustering', 'day'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'celltype_colors', 'draw_graph', 'leiden', 'louvain', 'neighbors', 'pca', 'umap', 'hvg'
    obsm: 'X_draw_graph_fa', 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [138]:
# Pick ortholog pairs present in both datasets where at least one side is
# highly variable (join_type="union"), then subset both objects with the
# matching ID columns so their genes follow the row order of common_genes.
# Note: r_filt and m_filt are rebound here to their gene-subsetted versions.
common_genes = chooseCommonGenes(r_filt,m_filt, orthologs, use_hvgs=True, join_type="union")
r_filt = r_filt[:,common_genes["ref"]]
m_filt = m_filt[:,common_genes["query"]]

In [139]:
# Display the summary of the current rabbit subset.
r_filt

View of AnnData object with n_obs × n_vars = 146133 × 2153
    obs: 'cell', 'barcode', 'sample', 'stage', 'batch', 'doub.density', 'doublet', 'stripped', 'cluster', 'cluster.sub', 'cluster.stage', 'cluster.theiler', 'sizeFactor', 'celltype', 'singler', 'leiden_res1', 'leiden_res2', 'leiden_res1_5', 'leiden_res2_5', 'leiden_res3', 'leiden_res5', 'leiden_res10', 'leiden_res6', 'leiden_res7', 'leiden_res8', 'anatomical_loc', 'day'
    var: 'ensembl_ids', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'anatomical_loc_colors', 'celltype_colors', 'draw_graph', 'leiden', 'neighbors', 'singler_colors', 'umap', 'log1p', 'hvg'
    obsm: 'X_draw_graph_fa', 'X_pca', 'X_tsne', 'X_umap'
    layers: 'logcounts'
    obsp: 'connectivities', 'distances'

In [140]:
# Display the summary of the current mouse subset.
m_filt

View of AnnData object with n_obs × n_vars = 430339 × 2153
    obs: 'sample', 'stage', 'stage.mapped', 'celltype', 'celltype.extended', 'tube_name', 'somite_count', 'cluster', 'cluster.sub', 'louvain', 'leiden', 'celltype.clustering', 'day'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'celltype_colors', 'draw_graph', 'leiden', 'louvain', 'neighbors', 'pca', 'umap', 'hvg'
    obsm: 'X_draw_graph_fa', 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [141]:
# Gene specificity for the rabbit data, using the neighbourhood matrix as the
# cell-membership mask (one output row per column of r_nhoods.X).
r_gspec = calcGeneSpecificity(r_filt, mask=r_nhoods.X)

In [142]:
# Gene specificity for the mouse data, using the neighbourhood matrix as the
# cell-membership mask (one output row per column of m_nhoods.X).
m_gspec = calcGeneSpecificity(m_filt, mask=m_nhoods.X)

In [143]:
# Sanity check: rows should equal the number of columns of r_nhoods, columns
# the number of common genes.
r_gspec.shape

(5392, 2153)

In [144]:
# Sanity check: rows should equal the number of columns of m_nhoods, columns
# the number of common genes.
m_gspec.shape

(14034, 2153)

In [145]:
# Correlate every row of r_gspec and m_gspec with every other row. np.corrcoef
# stacks the two inputs, so the result is a square matrix over all r_gspec
# rows followed by all m_gspec rows; only the cross block is kept afterwards.
# NOTE(review): this materialises the full square matrix, which is much larger
# than the cross block that is actually used -- consider computing only that.
gspec_cor = np.corrcoef(r_gspec,m_gspec)

In [150]:
# Keep only the rabbit-vs-mouse block of the stacked correlation matrix: the
# first N rows/columns belong to r_gspec, the remaining ones to m_gspec.
# The shape guard makes the cell safe to re-run: once gspec_cor has been cut
# down to (N, n_mouse), slicing it a second time would silently corrupt it.
N = r_gspec.shape[0]
if gspec_cor.shape == (N + m_gspec.shape[0], N + m_gspec.shape[0]):
    gspec_cor = gspec_cor[0:N,N:]

In [156]:
# Write the correlation matrix as tab-separated text.
np.savetxt("../data-out/compare_neighbourhoods/gspec_cor.tsv", gspec_cor, delimiter="\t")
