In [None]:
# Directory prefix of the result files read by this notebook. File names are
# appended with plain string concatenation, so the trailing slash is required.
result_path = 'results/simvastatin10281/'

In [None]:
class FactStore:
    """In-memory store of de-duplicated facts and the documents they occur in.

    A fact is an ordered sequence such as ``(subject_id, predicate, object_id)``.
    Every distinct fact receives one integer id; ids are handed out in
    insertion order starting at 0.

    Attributes:
        unique_fact_id_counter: the id the next new fact will receive.
        id_to_fact: fact id -> fact.
        fact_to_id: fact (as an ordered tuple) -> fact id.
        doc_to_facts: document id -> set of ids of the facts seen in it.
        all_facts: fact id -> fact (same content as ``id_to_fact``).
        chem_id_to_span: chemical id -> span; populated by callers.
        dis_id_to_span: disease id -> span; populated by callers.
        gen_id_to_span: gene id -> span; populated by callers.
    """

    def __init__(self):
        self.unique_fact_id_counter = 0
        self.id_to_fact = {}
        self.fact_to_id = {}
        self.doc_to_facts = {}
        self.all_facts = {}

        self.chem_id_to_span = {}
        self.dis_id_to_span = {}
        self.gen_id_to_span = {}

    def add_fact(self, doc_id, fact):
        """Record that ``fact`` occurs in document ``doc_id``.

        A fact that was seen before reuses its existing id; a new fact gets
        the next free id.

        Args:
            doc_id: hashable identifier of the document.
            fact: iterable of hashable elements; element order is significant.
        """
        # Key on the ordered tuple. A frozenset would drop the element order,
        # so ('a', 'r', 'b') and ('b', 'r', 'a') would share a single id and
        # the later triple would silently overwrite the earlier one.
        key = tuple(fact)
        unique_fact_id = self.fact_to_id.get(key)
        if unique_fact_id is None:
            unique_fact_id = self.unique_fact_id_counter
            self.unique_fact_id_counter += 1
            self.fact_to_id[key] = unique_fact_id
            self.all_facts[unique_fact_id] = fact
            self.id_to_fact[unique_fact_id] = fact

        self.doc_to_facts.setdefault(doc_id, set()).add(unique_fact_id)

    def print_info(self):
        """Print the sizes of all lookup tables held by the store."""
        print("---------------------------------------")
        print("Amount of ids   : {}".format(len(self.id_to_fact)))
        print("Amount of facts : {}".format(len(self.fact_to_id)))
        print("Amount of docs  : {}".format(len(self.doc_to_facts)))
        print("Known chemicals : {}".format(len(self.chem_id_to_span)))
        print("Known diseases  : {}".format(len(self.dis_id_to_span)))
        print("Known genes     : {}".format(len(self.gen_id_to_span)))
        print("---------------------------------------")

In [None]:
def _load_relation_facts(store, path, predicate, subj_id_to_span, obj_id_to_span):
    """Read one relation TSV file and register each data row as a fact.

    The first line of the file is treated as a header and skipped. For every
    other non-blank row, column 0 is used as the document id, columns 3 and 4
    as the subject id and span, and columns 5 and 6 as the object id and span.
    The fact ``(subject_id, predicate, object_id)`` is added to ``store`` and
    both span lookup tables are updated (a later row overwrites the span of an
    id seen earlier).

    Args:
        store: the FactStore that receives the facts.
        path: path of the tab-separated file to read.
        predicate: label stored in the middle position of every fact.
        subj_id_to_span: dict updated with subject id -> subject span.
        obj_id_to_span: dict updated with object id -> object span.
    """
    with open(path, 'r') as tsv_file:
        next(tsv_file, None)  # skip header
        for line in tsv_file:
            row = line.rstrip('\n')
            if not row.strip():
                # A blank line (e.g. at the end of the file) carries no fact
                # and would otherwise fail on the column lookups below.
                continue

            spl = row.split('\t')
            doc_id = spl[0]
            subj_id, subj_span = spl[3], spl[4]
            obj_id, obj_span = spl[5], spl[6]

            subj_id_to_span[subj_id] = subj_span
            obj_id_to_span[obj_id] = obj_span
            store.add_fact(doc_id, (subj_id, predicate, obj_id))


fact_store = FactStore()

# (file name, predicate, subject span table, object span table), in load order.
relation_files = [
    ('chemical_disease_association.tsv', 'c_asso_d',
     fact_store.chem_id_to_span, fact_store.dis_id_to_span),
    ('chemical_gene_interaction.tsv', 'c_inter_g',
     fact_store.chem_id_to_span, fact_store.gen_id_to_span),
    ('gene_disease_interaction.tsv', 'g_inter_d',
     fact_store.gen_id_to_span, fact_store.dis_id_to_span),
]

for file_name, predicate, subj_spans, obj_spans in relation_files:
    _load_relation_facts(fact_store, result_path + file_name, predicate,
                         subj_spans, obj_spans)
    # Report the running totals after each file.
    fact_store.print_info()


In [None]:
import gzip
import re

# Set of unordered {gene, disease} pairs read from the CTD gene-disease file.
ctd_gene_disease_inter = set()

# Open in text mode so every line is a real str. Parsing str(bytes) instead
# works on the repr of the bytes, which switches to double quotes as soon as a
# line contains an apostrophe and then defeats the prefix/suffix stripping.
with gzip.open('data/CTD_genes_diseases.tsv.gz', 'rt',
               encoding='utf-8', errors='replace') as f:
    for line in f:
        # skip comments
        if line.startswith('#'):
            continue
        # skip blank lines, they have no columns to read
        if not line.strip():
            continue

        components = line.rstrip('\r\n').split('\t')

        gene = components[1]
        disease = components[3]
        # add MESH: (prefix the disease id itself, not the gene column)
        if not disease.startswith('MESH:'):
            disease = "MESH:" + disease

        ctd_gene_disease_inter.add(frozenset((gene, disease)))

print('{} gene-disease associations read from CTD_genes_diseases'.format(len(ctd_gene_disease_inter)))

In [None]:
import gzip


# Set of unordered {chemical, disease} pairs read from the CTD chemical-disease file.
ctd_drug_disease_associations = set()

# Open in text mode so every line is a real str. Parsing str(bytes) instead
# works on the repr of the bytes, which switches to double quotes as soon as a
# line contains an apostrophe and then defeats the prefix/suffix stripping.
with gzip.open('data/CTD_chemicals_diseases.tsv.gz', 'rt',
               encoding='utf-8', errors='replace') as f:
    for line in f:
        # skip comments
        if line.startswith('#'):
            continue
        # skip blank lines, they have no columns to read
        if not line.strip():
            continue

        # split line into components
        components = line.rstrip('\r\n').split('\t')

        # add MESH: to the chemical id (column 1) and disease id (column 4)
        chemical = components[1]
        if not chemical.startswith('MESH:'):
            chemical = "MESH:" + chemical
        disease = components[4]
        if not disease.startswith('MESH:'):
            disease = "MESH:" + disease

        ctd_drug_disease_associations.add(frozenset((chemical, disease)))

print("Read {} drug-disease associations from CTD".format(len(ctd_drug_disease_associations)))

In [None]:
# combine facts at kg level
# Group all chemical-disease and chemical-gene facts by their chemical, which
# is the first element of both kinds of fact. Other facts are ignored.
chem_kg = {}
for fact in fact_store.all_facts.values():
    if 'c_asso_d' in fact[1] or 'c_inter_g' in fact[1]:
        chem_kg.setdefault(fact[0], []).append(fact)

print('KG computed')

# A chemical linked to both a disease and a gene yields the unordered
# {disease, gene} pair as a candidate gene-disease interaction.
candidates = set()
for chemical, facts_for_c in chem_kg.items():
    disease_candidates = set()
    gene_candidates = set()
    for fact in facts_for_c:
        if 'c_asso_d' in fact[1]:
            disease_candidates.add(fact[2])
        elif 'c_inter_g' in fact[1]:
            gene_candidates.add(fact[2])

    candidates.update(frozenset((dis, gene))
                      for dis in disease_candidates
                      for gene in gene_candidates)
print("Candidates computed")

# Candidates that CTD also lists count as true positives, the rest as false positives.
tp = len(candidates & ctd_gene_disease_inter)
fp = len(candidates) - tp

print('{} interactions processed by using KG'.format(len(candidates)))
print('{} of {} are correct using CTD'.format(tp, len(candidates)))

# Without any candidate the ratio is undefined; report 0.0 instead of
# failing with ZeroDivisionError.
precision = tp / (tp + fp) if candidates else 0.0
print("Precision {}".format(precision))

In [None]:
# combine facts at lg level
# Same combination as on the whole graph, but restricted to the facts of one
# document at a time: a {disease, gene} candidate is only produced when both
# facts about the shared chemical occur in the same document.
candidates = set()
for doc, fact_ids in fact_store.doc_to_facts.items():
    # Group this document's chemical facts by chemical (first element).
    chem_lg = {}
    for f_id in fact_ids:
        fact = fact_store.id_to_fact[f_id]
        if 'c_asso_d' in fact[1] or 'c_inter_g' in fact[1]:
            chem_lg.setdefault(fact[0], []).append(fact)

    for chemical, facts_for_c in chem_lg.items():
        disease_candidates = set()
        gene_candidates = set()
        for fact in facts_for_c:
            if 'c_asso_d' in fact[1]:
                disease_candidates.add(fact[2])
            elif 'c_inter_g' in fact[1]:
                gene_candidates.add(fact[2])

        candidates.update(frozenset((dis, gene))
                          for dis in disease_candidates
                          for gene in gene_candidates)

print("Candidates computed")

# Candidates that CTD also lists count as true positives, the rest as false positives.
tp = len(candidates & ctd_gene_disease_inter)
fp = len(candidates) - tp

print('{} interactions processed by using LG'.format(len(candidates)))
print('{} of {} are correct using CTD'.format(tp, len(candidates)))

# Without any candidate the ratio is undefined; report 0.0 instead of
# failing with ZeroDivisionError.
precision = tp / (tp + fp) if candidates else 0.0
print("Precision {}".format(precision))

In [None]:
# combine facts at kg level
# Group all gene-disease and chemical-gene facts by their gene. The gene is
# the first element of a gene-disease fact and the last element of a
# chemical-gene fact. Other facts are ignored.
candidates = set()
gen_kg = {}
for fact in fact_store.all_facts.values():
    if 'g_inter_d' in fact[1]:
        gen = fact[0]
    elif 'c_inter_g' in fact[1]:
        gen = fact[2]
    else:
        continue
    gen_kg.setdefault(gen, []).append(fact)

# A gene linked to both a disease and a chemical yields the unordered
# {disease, chemical} pair as a candidate drug-disease association.
for gen, facts_for_g in gen_kg.items():
    disease_candidates = set()
    chemical_candidates = set()
    for fact in facts_for_g:
        if 'g_inter_d' in fact[1]:
            disease_candidates.add(fact[2])
        elif 'c_inter_g' in fact[1]:
            chemical_candidates.add(fact[0])

    candidates.update(frozenset((dis, chem))
                      for dis in disease_candidates
                      for chem in chemical_candidates)

print("Candidates computed")

# Candidates that CTD also lists count as true positives, the rest as false positives.
tp = len(candidates & ctd_drug_disease_associations)
fp = len(candidates) - tp

print('{} interactions processed by using KG'.format(len(candidates)))
print('{} of {} are correct using CTD'.format(tp, len(candidates)))

# Without any candidate the ratio is undefined; report 0.0 instead of
# failing with ZeroDivisionError.
precision = tp / (tp + fp) if candidates else 0.0
print("Precision {}".format(precision))

In [None]:
# combine facts at lg level
# Same combination as on the whole graph, but restricted to the facts of one
# document at a time: a {disease, chemical} candidate is only produced when
# both facts about the shared gene occur in the same document.
candidates = set()
for doc, fact_ids in fact_store.doc_to_facts.items():
    # Group this document's gene facts by gene. The gene is the first element
    # of a gene-disease fact and the last element of a chemical-gene fact.
    gen_lg = {}
    for f_id in fact_ids:
        fact = fact_store.id_to_fact[f_id]
        if 'g_inter_d' in fact[1]:
            gen = fact[0]
        elif 'c_inter_g' in fact[1]:
            gen = fact[2]
        else:
            continue
        gen_lg.setdefault(gen, []).append(fact)

    for gen, facts_for_g in gen_lg.items():
        disease_candidates = set()
        chemical_candidates = set()
        for fact in facts_for_g:
            if 'g_inter_d' in fact[1]:
                disease_candidates.add(fact[2])
            elif 'c_inter_g' in fact[1]:
                chemical_candidates.add(fact[0])

        candidates.update(frozenset((dis, chem))
                          for dis in disease_candidates
                          for chem in chemical_candidates)

print("Candidates computed")

# Candidates that CTD also lists count as true positives, the rest as false positives.
tp = len(candidates & ctd_drug_disease_associations)
fp = len(candidates) - tp

print('{} interactions processed by using LG'.format(len(candidates)))
print('{} of {} are correct using CTD'.format(tp, len(candidates)))

# Without any candidate the ratio is undefined; report 0.0 instead of
# failing with ZeroDivisionError.
precision = tp / (tp + fp) if candidates else 0.0
print("Precision {}".format(precision))