In [4]:
import os
import numpy as np
import matplotlib.pyplot as plt
import hypernetx as hnx

In [5]:
# Window-size constant; get_hit_indicators sizes its `distance` array as
# attentions * 2 - 1 entries.
attentions = 50
# attentions - 1 is the middle index of an array of length attentions * 2 - 1.
# NOTE(review): `center` is not referenced by any code visible in this section.
center = attentions - 1

def read_chiqseq_file(path):
    """Load peak intervals from a BED-like file as per-chromosome sets of bins.

    Every data line is split on whitespace: column 1 is the chromosome name,
    columns 2 and 3 are integer start/end coordinates. Both coordinates are
    floor-divided by 1000 and every bin from the start bin through the end
    bin (inclusive) is recorded for that chromosome.

    Blank lines and the header lines a BED file may carry (``#`` comments,
    ``track`` and ``browser`` lines) are skipped instead of aborting the parse.

    Args:
        path: Path of the peak file to read.

    Returns:
        dict mapping chromosome name (exactly as written in the file) to a
        set of int bin indices.

    Raises:
        IndexError: If a data line has fewer than three columns.
        ValueError: If a start/end column is not an integer.
    """
    _peaks = dict()
    with open(path) as f:
        for line in f:
            datas = line.split()
            # Nothing to parse on empty lines or BED header/annotation lines.
            if not datas or datas[0].startswith('#') or datas[0] in ('track', 'browser'):
                continue
            # Integer floor division avoids a round-trip through float.
            left, right = int(datas[1]) // 1000, int(datas[2]) // 1000
            _peaks.setdefault(datas[0], set()).update(range(left, right + 1))
    return _peaks

def get_hit_indicators(significants, peaks):
    """Keep only the significant pairs that touch at least one peak bin.

    Args:
        significants: Set of pairs; the first two items of each pair are
            converted with ``int`` and looked up in ``peaks``.
        peaks: Container of bin indices supporting the ``in`` operator.

    Returns:
        Tuple ``(distance, kept)``. ``distance`` is a zero-filled array of
        length ``attentions * 2 - 1`` that this function never modifies;
        ``kept`` is ``significants`` minus every pair whose two ends both
        miss ``peaks``.
    """
    distance = np.zeros(attentions * 2 - 1)

    unanchored = set()
    for pair in significants:
        first, second = int(pair[0]), int(pair[1])
        # A pair survives as soon as either end lands on a peak bin.
        if first in peaks or second in peaks:
            continue
        unanchored.add(pair)

    return distance, significants - unanchored

def clear(chip_seq_path, all_enhanced_sig):
    """Filter the significant pairs of chromosomes 1-22 against a peak file.

    Args:
        chip_seq_path: Path of the peak file, parsed by ``read_chiqseq_file``.
        all_enhanced_sig: Mapping from chromosome number (int, 1-22) to the
            set of significant pairs for that chromosome.

    Returns:
        dict mapping chromosome number (1-22) to the pairs kept by
        ``get_hit_indicators`` for that chromosome.

    Raises:
        KeyError: If a chromosome 1-22 is missing from ``all_enhanced_sig``,
            or ``'chr<N>'`` is missing from the parsed peak file.
    """
    # The peak file does not depend on the chromosome, so parse it once
    # instead of once per loop iteration.
    peaks_by_chromosome = read_chiqseq_file(chip_seq_path)

    enhanced_sigs = dict()
    for chromosome in range(1, 23):
        # The zero-filled distance array returned first is not used here.
        _, significants = get_hit_indicators(
            all_enhanced_sig[chromosome],
            peaks_by_chromosome['chr{}'.format(chromosome)],
        )
        enhanced_sigs[chromosome] = significants
    return enhanced_sigs

In [6]:
# Disabled alternative input (kept for reference):
# hr_sigs = np.load('../fig2/temp/GM12878_ATAC_H3K27ac_H3K4me3_2_100_all_hr_sig.npy', allow_pickle=True).item()
# .item() unwraps the 0-d object array into the saved Python object; clear()
# below indexes it by chromosome number 1-22.
# NOTE(review): allow_pickle=True unpickles the file, which can execute
# arbitrary code -- only load .npy files from a trusted source.
enhanced_sigs = np.load('../fig2/tss/temp/GM12878_ATAC_H3K27ac_H3K4me3_2_100_all_enhanced_sig.npy', allow_pickle=True).item()

# NOTE(review): machine-specific absolute path; later cells also build paths from it.
auxiliary_files_path = '/data1/lmh_data/MINE/source/GM12878'
chip_seq_path = os.path.join(auxiliary_files_path, 'CTCF_peaks.bed')
# Rebinds enhanced_sigs to the filtered result: per chromosome, only pairs
# with at least one end on a peak bin from chip_seq_path remain.
enhanced_sigs = clear(chip_seq_path, enhanced_sigs)

# Vertex numbering: each distinct '<chromosome>_<bin>' key receives the next
# free integer, starting at 1. enhanced_sig_number always holds the next
# number to hand out.
enhanced_sig_numbers = dict()
enhanced_sig_number = 1

# One entry per significant pair: 'p_<index>' -> set of its vertex numbers.
scenes = dict()
i = 0
for chr in range(1, 23):
    for enhanced_sig in enhanced_sigs[chr]:
        sig1, sig2 = int(enhanced_sig[0]), int(enhanced_sig[1])
        k1, k2 = '{}_{}'.format(chr, sig1), '{}_{}'.format(chr, sig2)

        # Register unseen keys, then replace the raw bins by vertex numbers.
        if k1 not in enhanced_sig_numbers:
            enhanced_sig_numbers[k1] = enhanced_sig_number
            enhanced_sig_number += 1
        sig1 = enhanced_sig_numbers[k1]

        if k2 not in enhanced_sig_numbers:
            enhanced_sig_numbers[k2] = enhanced_sig_number
            enhanced_sig_number += 1
        sig2 = enhanced_sig_numbers[k2]

        scenes['p_{}'.format(i)] = {sig1, sig2}
        i += 1

print(len(scenes))


7699


In [7]:
# Dump the hypergraph as text: a header line '<number of entries in scenes>
# <highest vertex number>', then one line per entry listing its vertex numbers.
# NOTE(review): this looks like the hMETIS-style .hgr layout -- confirm against
# the partitioner that consumes the file.
# The context manager guarantees the file is flushed and closed even if a
# write fails part-way through.
with open('hypernetx.hgr', 'w') as f:
    f.write('{} {}\n'.format(len(scenes.keys()), enhanced_sig_number - 1))
    for key in scenes.keys():
        f.write(' '.join(str(s) for s in scenes[key]) + '\n')

In [8]:
# Reverse lookup: vertex number -> '<chromosome>_<bin>' key.
opposite_enhanced_sig_numbers = dict()
for key, number in enhanced_sig_numbers.items():
    opposite_enhanced_sig_numbers[number] = key

In [13]:
# Read the partition assignment: line i (0-based) holds the part id of vertex
# number i + 1. The context manager closes the file even if reading fails.
with open('hypernetx.hgr.part.6', 'r') as f:
    datas = f.readlines()

# parts: part id -> chromosome -> set of bins assigned to that part.
# NOTE(review): `chr` and `bin` shadow the Python builtins for the rest of the
# notebook session; the names are kept because they are module-level bindings.
parts = dict()
for i, line in enumerate(datas):
    # Tolerate blank lines (e.g. a trailing newline) without shifting the
    # line-index -> vertex-number correspondence.
    if not line.strip():
        continue
    part = int(line)
    infos = opposite_enhanced_sig_numbers[i + 1].split('_')
    chr, bin = int(infos[0]), int(infos[1])
    parts.setdefault(part, dict()).setdefault(chr, set()).add(bin)

In [18]:
# GFF3 annotation file opened by get_tss_gene_info; located under auxiliary_files_path.
tss_file_path = os.path.join(auxiliary_files_path, 'Homo_sapiens.GRCh38.104.chr.gff3')

def get_tss_gene_info(gene_biotype='protein_coding', gff3_path=None):
    """Collect the start bin of every matching gene from a GFF3 file.

    A record is used when its type column is ``gene``, its attribute column
    contains ``gene_biotype`` as a substring, its strand is ``+`` or ``-``
    and its sequence name consists of digits only. The gene start is the
    ``start`` column on the ``+`` strand and the ``end`` column on the ``-``
    strand; it is floor-divided by 1000 to obtain the bin.

    Args:
        gene_biotype: Text that must occur in the attribute column.
            NOTE(review): this is a plain substring test over the whole
            column, not a parsed ``biotype=`` comparison.
        gff3_path: Path of the GFF3 file. Defaults to the module-level
            ``tss_file_path`` when omitted.

    Returns:
        Tuple ``(gene_location, gene_info)``:
            gene_location: dict chromosome (int) -> set of start bins.
            gene_info: dict chromosome (int) -> set of tuples
                ``(bin, gene name, gene id, attribute column)``. Records
                without ``Name=`` or ``ID=`` contribute to ``gene_location``
                only.
    """
    if gff3_path is None:
        gff3_path = tss_file_path

    gene_location = dict()
    gene_info = dict()
    with open(gff3_path, "r") as f:
        # Stream the file: annotation files are large and each record is
        # processed independently.
        for line in f:
            # Header/pragma lines are recognised by their '#' prefix rather
            # than by assuming a fixed number of leading lines.
            if line.startswith('#'):
                continue
            data = line.split('\t')
            if len(data) < 9:
                continue
            if data[2] != 'gene' or data[8].find(gene_biotype) == -1:
                continue
            if data[6] == '+':
                _location = data[3]
            elif data[6] == '-':
                _location = data[4]
            else:
                # Without a strand the gene start is undefined; skip the
                # record instead of reusing the previous record's location.
                print('error')
                continue
            if not data[0].isdigit():
                continue
            chromosome = int(data[0])
            location_bin = int(_location) // 1000
            gene_location.setdefault(chromosome, set()).add(location_bin)

            attributes = data[-1]
            _tmp = attributes.split('Name=')
            if len(_tmp) < 2:
                continue
            gene_names = _tmp[1].split(';')[0]
            _ids = attributes.split('ID=')
            if len(_ids) < 2:
                # No identifier to report; treated like a missing Name.
                continue
            gene_ids = _ids[1].split(',')[0].split(';')[0]

            gene_info.setdefault(chromosome, set()).add(
                (location_bin, gene_names, gene_ids, attributes.replace('\n', ''))
            )
    return gene_location, gene_info

def get_hit_genes(significants, gene_info, genes_loop_nums):
    """Find the genes whose bin lies within 2 bins of a significant bin.

    Args:
        significants: Iterable of bin indices.
        gene_info: Iterable of records indexed as ``[0]`` bin, ``[1]`` gene
            name, ``[2]`` gene id, ``[3]`` attribute text.
        genes_loop_nums: dict gene name -> hit count. Updated in place, one
            increment per (significant bin, record) hit.

    Returns:
        Tuple ``(genes, genes_loop_nums)``: the set of
        ``(name, id, attributes)`` triples that were hit, and the same
        counter dict that was passed in.
    """
    genes = set()
    for sig in significants:
        # |sig - bin| < 3, i.e. at most two bins apart.
        nearby = (record for record in gene_info if abs(sig - record[0]) < 3)
        for record in nearby:
            name = record[1]
            genes_loop_nums[name] = genes_loop_nums.get(name, 0) + 1
            genes.add((name, record[2], record[3]))
    return genes, genes_loop_nums

def get_hit_gene_names(significants, gene_biotype='protein_coding'):
    """Collect the genes hit by the given bins on chromosomes 1-22.

    Args:
        significants: dict chromosome number (int) -> iterable of bins.
        gene_biotype: Forwarded to ``get_tss_gene_info`` to select genes.

    Returns:
        Tuple ``(all_genes, genes_loop_nums)``: the union of the
        ``(name, id, attributes)`` triples returned by ``get_hit_genes`` for
        each chromosome, and a dict gene name -> accumulated hit count.
    """
    gene_info = get_tss_gene_info(gene_biotype)[1]
    all_genes = set()
    genes_loop_nums = dict()
    for chromosome in range(1, 23):
        # A chromosome may be absent from either mapping (no bins supplied,
        # or no gene of this biotype on it); neither case can yield a hit.
        if chromosome not in significants or chromosome not in gene_info:
            continue
        genes, genes_loop_nums = get_hit_genes(
            significants[chromosome], gene_info[chromosome], genes_loop_nums
        )
        all_genes |= genes
    return all_genes, genes_loop_nums

In [20]:
# For every partition, look up the protein-coding genes near its bins and
# print the names of the genes that were hit.
for part in parts:
    mine_hit_genes, mine_genes_loop_nums = get_hit_gene_names(
        parts[part],
        gene_biotype='protein_coding',
    )
    print(mine_genes_loop_nums.keys())

dict_keys(['CHIT1', 'NASP', 'RO60', 'SMIM12', 'FCRL6', 'PPM1J', 'HAO2', 'CDC73', 'SH3BGRL3', 'OMA1', 'DAB1', 'INSRR', 'CTSE', 'CRYBG2', 'GPR3', 'AVPR1B', 'GFI1', 'WNT2B', 'THEM4', 'CNR2', 'THRAP3', 'MMACHC', 'CCDC163', 'AHDC1', 'PERM1', 'SEMA4A', 'SLC20A1', 'ACKR3', 'FBXO41', 'BUB1', 'WIPF1', 'INPP4A', 'HDLBP', 'SEPTIN2', 'FARP2', 'ACTR3', 'PLEK', 'CD8A', 'AUP1', 'HTRA2', 'ADRA2B', 'ASTL', 'CFLAR', 'SLC1A4', 'ARL8B', 'FNDC3B', 'RUBCN', 'GOLIM4', 'PARP15', 'IL12A', 'RPL35A', 'IQCG', 'LMLN', 'GBE1', 'IRAK2', 'ALPK1', 'TIFA', 'TMEM175', 'GAK', 'IDUA', 'DGKQ', 'ART3', 'CXCL9', 'CXCL10', 'ATP10D', 'TRIM52', 'MGAT1', 'CSNK1G3', 'ZFP62', 'CCNI2', 'STING1', 'MARVELD2', 'FYB1', 'LIX1', 'OCLN', 'RMND5B', 'TRIM41', 'NHP2', 'VWA7', 'H2BC5', 'H1-4', 'VARS1', 'HSPA1B', 'PNISR', 'ZSCAN26', 'CCDC167', 'CAPN11', 'H3C10', 'H2AC13', 'HLA-DMB', 'ZSCAN31', 'BRD2', 'HLA-DMA', 'NFKBIE', 'TPMT', 'KDM1B', 'HLA-DOA', 'HLA-E', 'HLA-DPB1', 'HLA-DPA1', 'ATP6V1G2-DDX39B', 'NFKBIL1', 'ATP6V1G2', 'AIF1', 'PRRC2A', 'N

dict_keys(['ZFP69B', 'MATN1', 'GABPB2', 'CDC42SE1', 'VPS45', 'PMF1', 'PMF1-BGLAP', 'RGS1', 'ILF2', 'NPR1', 'FMOD', 'SCNM1', 'TNFAIP8L2', 'RSBN1', 'SHISA4', 'SLC27A3', 'PIK3C2B', 'DCLRE1B', 'AP4B1', 'IFI44L', 'KHDC4', 'RXFP4', 'TBCE', 'GGPS1', 'ARID4B', 'GFI1', 'TLR5', 'UCK2', 'TMCO1', 'ACKR3', 'SLC23A3', 'SEPTIN10', 'SOWAHC', 'SPATA3', 'ERMN', 'ASDURF', 'ASNSD1', 'ATOH8', 'MAL', 'P2RY14', 'XIRP1', 'NSUN3', 'DHFR2', 'CX3CR1', 'SCAP', 'CYP8B1', 'CPA3', 'HEMK1', 'RFT1', 'MB21D2', 'LTF', 'RTP3', 'CISH', 'MAPKAPK3', 'CLASP2', 'CDV3', 'GATA2', 'TLR1', 'TLR6', 'TMEM131L', 'NIPBL', 'PAIP1', 'NNT', 'PCDHGB7', 'PCDHGA11', 'FOXI1', 'ZFP62', 'RICTOR', 'REEP2', 'EGR1', 'PPARGC1B', 'PARP8', 'HLA-C', 'PRSS35', 'NUDT3', 'HLA-G', 'PAQR8', 'EFHC1', 'C6orf62', 'AKIRIN2', 'TXNDC5', 'H3C12', 'H2BC17', 'H2AC17', 'PIM1', 'HLA-DPB1', 'HLA-DPA1', 'ITPR3', 'ATAT1', 'H2AC4', 'H3C2', 'LY6G5B', 'H2BC3', 'H3C3', 'L3MBTL3', 'RIPK1', 'H4C3', 'PMPCB', 'CPED1', 'CCDC71L', 'POLR2J3', 'TYW1B', 'C7orf61', 'FZD6', 'ZNF34',