# Hi-C Gene Graph

### Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import permutations
import json
from scipy.sparse import csr_matrix, save_npz, load_npz

### Helpers

In [2]:
def format_chr(chromosome:str) -> int:
    """
    Convert a chromosome label into an integer code.

    The label is cut at its first underscore and its first three
    characters are dropped (presumably the 'chr' prefix -- TODO confirm
    against the input files). In what remains, 'Un' becomes '1',
    'X' becomes '23' and 'Y' becomes '24' before the integer conversion.

    :param chromosome: Chromosome label, e.g. 'chr7', 'chrX' or
        'chr4_JH584295_random'.
    :return: Integer chromosome code.
    :raises ValueError: If the remaining text is not an integer
        (for instance 'chrM').
    """
    label = chromosome.split('_')[0][3:]
    # Every substitution is applied, in this order, to the running label.
    for token, code in (('Un', '1'), ('X', '23'), ('Y', '24')):
        label = label.replace(token, code)
    return int(label)

def process_gene_info(gene_df:pd.DataFrame,gene_mapping:dict) -> pd.DataFrame:
    """
    """
    new_gene_df = gene_df.copy()
    new_gene_df.columns = ['gene','chr','strand','tx_start','tx_end',
                     'cds_start','cds_end','exon_cnt','exon_start',
                     'exon_end','protein_id','align_id']
    new_gene_df = new_gene_df[['gene','chr','tx_start','tx_end',]]
    new_gene_df = new_gene_df.query('chr != "chrM"').copy()
    new_gene_df['chr'] = new_gene_df['chr'].apply(lambda c: format_chr(c))
    new_gene_df['gene'] = new_gene_df['gene'].apply(lambda g: g.split('.')[0])
    new_gene_df = new_gene_df.query('gene in @gene_mapping').copy()
    new_gene_df['gene'] = new_gene_df['gene'].apply(lambda g: gene_mapping[g])
    return new_gene_df


def add_combinations(interaction_dict:dict, genes:pd.DataFrame):
    """
    Add the pairwise co-occurrences of the given genes to a running tally.

    Every ordered pair of entries of genes['gene'] is visited and stored
    under the key (smaller name, larger name). Both orderings of a pair
    land on the same key, so each pair of rows contributes 2 to its
    count. Rows sharing the same gene name produce a (name, name) key.

    :param interaction_dict: Tally mapping (gene_a, gene_b) tuples to
        counts; updated in place.
    :param genes: DataFrame with a 'gene' column.
    :return: None.
    """
    names = genes['gene'].to_list()
    for first, second in permutations(names, 2):
        pair = (first, second) if first < second else (second, first)
        interaction_dict[pair] = interaction_dict.get(pair, 0) + 1
        
        
def populate_adjacency(go:pd.DataFrame, adjacency:np.ndarray,
                gene_to_id_mapping:dict, source_key='source',
                target_key='target', importance_key='importance',
                *, symmetric:bool=False):
    """
    Populate an adjacency matrix, in place, from a weighted edge list.

    For every row of the edge list, the weight is written at
    adjacency[source id, target id]. If the same (source, target) pair
    appears several times, a single one of its weights is kept.

    :param go: Edge list in Pandas DataFrame format (one row per edge).
    :param adjacency: Adjacency numpy matrix to populate; modified in place.
    :param gene_to_id_mapping: Mapping of gene names to row/column ids.
    :param source_key: Column of the source node in the edge list.
    :param target_key: Column of the target node in the edge list.
    :param importance_key: Column holding the edge weights.
    :param symmetric: When True, also write each weight at
        adjacency[target id, source id], for edge lists that store an
        undirected edge only once. Defaults to False (one direction only).
    :raises KeyError: If a source or target name is missing from
        gene_to_id_mapping.
    """
    s_ids = [gene_to_id_mapping[gene] for gene in go[source_key]]
    t_ids = [gene_to_id_mapping[gene] for gene in go[target_key]]
    weights = go[importance_key].to_numpy()
    # Paired integer-array indexing: one assignment per (source, target) row.
    adjacency[s_ids, t_ids] = weights
    if symmetric:
        adjacency[t_ids, s_ids] = weights

### Data

In [3]:
# Data from cell2loc pipeline
# Gene <-> id lookup tables stored as JSON. id2gene is loaded here but not
# used by any of the code visible in this notebook section.
with open("/scratch/jeremy/data/cell2loc/id2gene.json") as f:
    id2gene = json.load(f)
with open("/scratch/jeremy/data/cell2loc/gene2id.json") as f:
    gene2id = json.load(f)
cell2loc_raw_gene_list = list(gene2id.keys())
# Upper-case the names so they compare equal to the upper-cased protein names below.
cell2loc_gene_list = [g.upper() for g in cell2loc_raw_gene_list]
cell2loc_num_genes = len(cell2loc_gene_list)
print(f'There are {cell2loc_num_genes} genes in the cell2loc pipeline.')
# Data from protein-protein interactions
# Gene information: one row per protein, renamed to explicit column names.
gene_info = pd.read_csv("/scratch/jeremy/data/graphs/proteins/10090.protein.info.v11.5.txt",sep='\t')
gene_info.columns = ['ENS_protein_id','protein_name','protein_size','annotation']
gene_info['protein_name'] = gene_info['protein_name'].apply(lambda p: p.upper())
# Protein id -> upper-case protein name.
gene_mapping = dict(zip(gene_info['ENS_protein_id'],gene_info['protein_name']))
# Protein links: weighted edge list, translated from protein ids to names.
# A link whose id is missing from gene_mapping raises KeyError here.
gene_links = pd.read_csv("/scratch/jeremy/data/graphs/proteins/10090.protein.links.v11.5.txt",sep=' ')
gene_links.columns = ['source','target','importance']
gene_links['source'] = gene_links['source'].apply(lambda p: gene_mapping[p])
gene_links['target'] = gene_links['target'].apply(lambda p: gene_mapping[p])
# Shifted min-max scaling, (x + 1 - min) / (max - min): the +1 keeps the
# weakest link strictly above 0, so the strongest link ends up slightly above 1.
gene_links['importance'] = (gene_links['importance'] + 1 - gene_links['importance'].min()
                           ) / (gene_links['importance'].max() - gene_links['importance'].min()) # min-max norm with smoothing
# Only names occurring in the 'source' column are collected.
p2p_gene_list = list(set(gene_links.source))
print(f'There are {len(p2p_gene_list)} genes in the Protein-Protein interaction graph.')
# Intersection of datasets
inter_gene_set = set(cell2loc_gene_list).intersection(p2p_gene_list)
print(f'There are {len(inter_gene_set)} genes that overlap.')
# Upper-case gene name -> cell2loc id. Names and ids are paired by position,
# both coming from gene2id in the same iteration order. If two raw names
# become equal once upper-cased, the later id wins.
hic_gene2id = dict(zip(cell2loc_gene_list,gene2id.values()))
# Mouse Gene Location
# Key: text after the last '.' of the protein id, with every 'P' replaced by 'T';
# value: gene name. Only genes in the overlap are kept.
# NOTE(review): presumably this turns a protein accession into the matching
# transcript accession, assuming both share the same number -- confirm.
hic_mapping = dict([(idx.split('.')[-1].replace('P','T'),gene) for 
                   idx,gene in gene_mapping.items() if gene in inter_gene_set])
gene_info_path = "/mlbiodata1/baffou/data/HiC/mm10_gene_locations.csv"
# gene_info is rebound here: from now on it holds gene locations, no longer
# the protein information table loaded above.
gene_info = pd.read_csv(gene_info_path,sep="\t")
gene_info = process_gene_info(gene_info,hic_mapping)
print(f'There are {len(gene_info)} genes in the Hi-C mouse data.')

There are 21593 genes in the cell2loc pipeline.
There are 21317 genes in the Protein-Protein interaction graph.
There are 19410 genes that overlap.
There are 11417 genes in the Hi-C mouse data.


In [4]:
# Load the TAD (domain) tables, one DataFrame per file.
tad_paths = [
    "/mlbiodata1/baffou/data/HiC/TAD/mm10/G1E-ER4.rep1-raw.domains",
    "/mlbiodata1/baffou/data/HiC/TAD/mm10/G1E-ER4.rep2-raw.domains",
    "/mlbiodata1/baffou/data/HiC/TAD/mm10/Myoblast.Doynova_2017-raw.domains",
    "/mlbiodata1/baffou/data/HiC/TAD/mm10/NPC.Bonev_2017-raw.domains",
    "/mlbiodata1/baffou/data/HiC/TAD/mm10/Neuron.Jiang_2017-raw.domains",
    "/mlbiodata1/baffou/data/HiC/TAD/mm10/Neuron_Cortical.Bonev_2017-raw.domains",
    "/mlbiodata1/baffou/data/HiC/TAD/mm10/Neuron_Setdb1_KO.Jiang_2017-raw.domains",
    "/mlbiodata1/baffou/data/HiC/TAD/mm10/mESC.Bonev_2017-raw.domains",
]
tad_list = []
for path in tad_paths:
    # Tab-separated, read without a header row: chromosome, domain start, domain end.
    tad = pd.read_csv(path, sep="\t", names=['chr','start','end'])
    # A column already parsed as int64 needs no conversion; any other dtype
    # is converted label by label with format_chr.
    if not (tad['chr'].dtype == 'int64'):
        tad['chr'] = tad['chr'].apply(format_chr)
    tad_list.append(tad)

In [5]:
# Count, over all TAD tables, how often two genes lie inside the same domain.
interactions = dict()
# The gene table does not depend on the TAD table, so split it by chromosome
# once instead of once per TAD table. Codes 1..24 are the values format_chr
# can produce for numbered chromosomes plus X (23) and Y (24).
genes_by_chr = {chromosome: gene_info[gene_info['chr'] == chromosome]
                for chromosome in range(1,25)}
for tad_df in tad_list:
    for chromosome, chr_genes in genes_by_chr.items():
        chr_tad = tad_df[tad_df['chr'] == chromosome].sort_values('start')
        for domain_start, domain_end in zip(chr_tad['start'], chr_tad['end']):
            # A gene belongs to a domain only when it lies strictly inside it.
            inside = (chr_genes['tx_start'] > domain_start) & (chr_genes['tx_end'] < domain_end)
            add_combinations(interactions, chr_genes[inside])

In [6]:
# add_combinations visits both (a,b) and (b,a) and stores them under the same
# key, so every count is twice the number of shared domains: halve it.
filtered_interactions = {pair: count // 2 for pair, count in interactions.items()}

In [7]:
# Gather the interactions into one edge list: a row per gene pair with its count.
hic_df = pd.DataFrame(list(filtered_interactions.keys()), columns=['source','target'])
hic_df['importance'] = list(filtered_interactions.values())
# Keep only the pairs that share a domain more than once.
hic_df = hic_df[hic_df['importance'] > 1].copy()
# Divide by the largest remaining count, so the strongest pair gets importance 1.
importance_max = hic_df['importance'].max()
hic_df['importance'] = hic_df['importance'].apply(lambda count: count/importance_max)

### Hi-C graph

In [8]:
# Keep the edges whose two endpoints both belong to the overlapping gene set.
source_known = hic_df['source'].isin(inter_gene_set)
target_known = hic_df['target'].isin(inter_gene_set)
cell2loc_hic_graph = hic_df[source_known & target_known]

In [9]:
# Dense square matrix of zeros with one row and one column per cell2loc gene.
cell2loc_adjacency = np.zeros((cell2loc_num_genes,cell2loc_num_genes))

In [10]:
# Write each edge's importance at [source id, target id], in place.
# NOTE(review): only that direction is written, and add_combinations stores
# every pair as (smaller name, larger name), so the resulting matrix is not
# symmetric -- confirm that downstream code expects this.
populate_adjacency(cell2loc_hic_graph,cell2loc_adjacency,hic_gene2id)

In [11]:
# Convert the dense array to a SciPy CSR sparse matrix before saving.
cell2loc_adjacency = csr_matrix(cell2loc_adjacency)

In [12]:
# Persist the sparse Hi-C adjacency matrix to disk in .npz format.
save_npz("/scratch/jeremy/data/graphs/cell2loc/adjacency_hic_matrix.npz",cell2loc_adjacency)

---