# Cell2loc Genes Selection

### Packages

In [1]:
import scanpy as sc
import anndata
import numpy as np
import pandas as pd
import networkx as nx
import json
from scipy.sparse import csr_matrix, load_npz

In [8]:
# Exploratory load of the expression CSV (the same file is read again as `data`
# in the Data section) to inspect how observations are spread over subclasses.
a = pd.read_csv('/scratch/jeremy/data/cell2loc/ood_mousehippocampus.csv')

In [13]:
# Number of observations per subclass label, most frequent first.
a['subclass'].value_counts()

subclass
DG        316
CA1       157
Oligo     109
Astro      72
Thal       51
Endo       38
Inhib      31
CA3        29
Cortex     18
Micro      18
Name: count, dtype: int64

### Data

In [7]:
# Gene expression table. Besides the gene columns it carries the 'Unnamed: 0',
# 'coord_X', 'coord_Y' and 'subclass' columns that the Pipeline section reads.
data = pd.read_csv('/scratch/jeremy/data/cell2loc/ood_mousehippocampus.csv')
# Mapping from gene name to id. It is used to filter the gene columns and to
# translate the selected gene names into ids.
# NOTE(review): the ids are presumably row/column positions in the adjacency
# matrix loaded further down — confirm.
with open('/scratch/jeremy/data/cell2loc/gene2id.json','r') as f:
    gene2id = json.load(f)

### Helpers

In [3]:
def rank_genes(adata, label_key, q_val=0.1, method='t-test', n_genes=100, plot=False):
    """Rank differentially expressed genes per cluster and keep the significant ones.

    Runs ``sc.tl.rank_genes_groups`` on ``adata`` (scanpy stores the result in
    ``adata.uns['rank_genes_groups']``) and, for every cluster, keeps the ranked
    genes whose adjusted p-value is strictly below ``q_val``.

    adata: AnnData object; scanpy writes its results into ``adata.uns``.
    label_key: key of ``adata.obs`` holding the cluster labels.
    q_val: threshold applied to the adjusted p-values.
    method: statistical test forwarded to scanpy.
    n_genes: number of top-ranked genes scanpy reports per cluster.
    plot: unused; kept so that existing calls remain valid.
    return: dict mapping each cluster label to ``{gene name: adjusted p-value}``
        for the differentially expressed genes of that cluster.
    """
    sc.tl.rank_genes_groups(adata, label_key, method=method, n_genes=n_genes)

    result = adata.uns['rank_genes_groups']
    names, qvals = result['names'], result['pvals_adj']

    ranked_genes = {}
    for clust in np.unique(adata.obs[label_key].values):
        # The result arrays are structured arrays with one field per cluster.
        # Access the field by cluster name instead of by the position of the
        # label in the sorted unique labels, so a cluster can never be paired
        # with another cluster's genes if scanpy orders groups differently.
        field = str(clust)
        ranked_genes[clust] = {
            gene: q for gene, q in zip(names[field], qvals[field]) if q < q_val
        }

    return ranked_genes

### Pipeline

In [4]:
# Candidate genes: expression columns that also have an entry in gene2id.
# NOTE(review): the [1:-3] slice assumes the first column is the observation id
# and the last three are coord_X, coord_Y and subclass — confirm against the CSV.
gene_names = [col for col in data.columns[1:-3] if col in gene2id]
obs_names = data['Unnamed: 0'].to_list()

# Per-observation metadata, indexed by observation name, with a constant
# categorical 'group' column (every observation is in group 0).
obs = (
    data[['Unnamed: 0', 'coord_X', 'coord_Y', 'subclass']]
    .rename(columns={'Unnamed: 0': 'obs_name'})
    .assign(group=0)
    .astype({'group': 'category'})
    .set_index('obs_name')
)

# Dense expression matrix restricted to the candidate genes.
X = data[gene_names].to_numpy()

In [5]:
# Wrap the expression matrix in an AnnData object: columns are named after the
# candidate genes, rows after the observation ids.
adata = sc.AnnData(X=X)
adata.var_names = gene_names
adata.obs_names = obs_names
# Assigned last: this replaces the obs frame with the metadata table, which is
# indexed by the same 'Unnamed: 0' values that `obs_names` was built from.
adata.obs = obs

In [6]:
# Differentially expressed genes per subclass, kept when the adjusted p-value is
# below 0.05. n_genes=124 (top-ranked genes reported per subclass) was chosen so
# that the union over subclasses contains more than 1000 genes.
diff_exp = rank_genes(adata, 'subclass', q_val=0.05, n_genes=124)

In [7]:
# Union of the differentially expressed genes over all subclasses: a gene that is
# significant for several subclasses is kept once. The union is sorted because the
# iteration order of a set of strings changes between kernel restarts, which would
# make the saved gene list (and any index derived from it) non-reproducible.
selected_genes = sorted(
    {gene for genes_of_class in diff_exp.values() for gene in genes_of_class}
)
print(f'We selected {len(selected_genes)} genes')

We selected 1005 genes


In [8]:
# Persist the selected gene names as a JSON list. Mode 'x' is exclusive creation:
# the file is never overwritten, and this cell raises FileExistsError when it is
# re-run while the JSON file already exists.
with open('/scratch/jeremy/data/cell2loc/selected_genes_names.json','x') as f:
    json.dump(selected_genes,f)

---

### Proportion of selected-gene pairs (interactions) covered by the adjacency matrix

In [34]:
from itertools import permutations

In [35]:
# Load the adjacency matrix stored in SciPy's sparse .npz format.
# NOTE(review): the file name suggests it combines p2p and Hi-C interactions —
# confirm how it was built and which gene ordering its rows/columns follow.
adj = load_npz('/mlbiodata1/baffou/data/cell2loc/adjacency_p2p_and_hic_matrix.npz')

In [36]:
# Convert the sparse matrix to a dense NumPy array (memory grows with rows x columns).
# This cell is not idempotent: once `adj` is an ndarray it has no `.toarray()`,
# so re-run the loading cell before running this one again.
adj = adj.toarray()

In [37]:
# Reload the gene names from the JSON file written by the selection step above.
with open('/scratch/jeremy/data/cell2loc/selected_genes_names.json','r') as f:
    selected_genes = json.load(f)

In [11]:
# Translate gene names into ids with the gene2id mapping loaded in the Data section.
# Raises KeyError if a saved gene name is missing from the mapping.
selected_genes_ids = [gene2id[g] for g in selected_genes]

In [23]:
# Every ordered pair of distinct selected genes, i.e. both (i, j) and (j, i).
interactions = list(permutations(selected_genes_ids, 2))
# Split the pairs into parallel lists of first and second members so they can be
# used together as paired row / column indices.
inter_a = [first for first, _ in interactions]
inter_b = [second for _, second in interactions]

In [33]:
# Fraction of ordered selected-gene pairs that have a non-zero adjacency entry.
# The lookup must go through `adj` (the dense adjacency matrix): `a` is the
# DataFrame loaded at the top of the notebook, so indexing it here fails when the
# notebook is run top to bottom on a fresh kernel.
pair_entries = adj[inter_a, inter_b]
print(f'There is {(pair_entries != 0).sum() / len(pair_entries)} of the interaction covered by the adjacency matrix')

There is 0.13589324294860358 of the interaction covered by the adjacency matrix
