In [1]:
result_path = 'results/simcyp298/'
min_support = 3

In [33]:
mesh_dict = {}

with open('data/mesh2018.tsv', 'r') as f:
    for l in f: 
        comp = l.replace('\n','').split('\t')
        mesh_id = comp[0]
        mesh_name = comp[1]
        
        mesh_dict[mesh_id] = mesh_name
        
print('Amount of mesh ids: {}'.format(len(mesh_dict)))

Amount of mesh ids: 28955


In [34]:
import gzip

gene_dict = {}

first = True
with gzip.open('data/CTD_genes.tsv.gz', 'r') as f:
    for l in f: 
        line = str(l).replace('b\'', '')
        # skip comments
        if line.startswith('#'):
            continue
        #print(line)
        comp = line.replace('\\n','').split('\\t')
        #print(comp)
        gene_id = comp[2]
        gene_name = comp[1]
     
        gene_dict[gene_id] = gene_name
        
print('Amount of gene ids: {}'.format(len(gene_dict)))

Amount of gene ids: 512977


In [35]:
doc_mapping = {}

first = True
with open(result_path+ 'doc_mapping.tsv', 'r') as f:
    for l in f:
       #skip first line
        if first:
            first = False
            continue
        comp = l.replace('\n','').split('\t')
        snorkel_doc_id = comp[0]
        pubmed_id = comp[1]
        
        doc_mapping[snorkel_doc_id] = pubmed_id
        
print('Amount of document ids: {}'.format(len(doc_mapping)))

Amount of document ids: 298


In [49]:
def replace_mesh_id_with_name(mesh_id):
    mesh_id_c = mesh_id
    if mesh_id.startswith('MESH:'):
        mesh_id_c = mesh_id.replace('MESH:','')
    
    if mesh_id_c not in mesh_dict:
        print('Error: Mesh_ID {} not in mesh dict'.format(mesh_id_c))
        return mesh_id
    
    name = mesh_dict[mesh_id_c]
    return name

def replace_gene_id_with_name(gene_id):
    if gene_id not in gene_dict:
        print('Error: Gene_ID {} not in gene dict'.format(gene_id))
        return gene_id
    
    name = gene_dict[gene_id]
    return name

def replace_snorkel_doc_id_with_pubmed_id(doc_id):
    return doc_mapping[doc_id]

In [68]:
class FactStore:
    def __init__(self):
        self.unique_fact_id_counter = 0
        self.id_to_fact = {}
        self.fact_to_id = {}
        self.doc_to_facts = {}
       
        self.chem_id_to_span = {}
        self.dis_id_to_span = {}
        self.gen_id_to_span = {}
        
    def add_fact(self, doc_id, fact):
        key = frozenset(fact)
        if key in self.fact_to_id:
            unique_fact_id = self.fact_to_id[key]
        else:
            unique_fact_id = self.unique_fact_id_counter
            self.fact_to_id[key] = unique_fact_id
            self.unique_fact_id_counter += 1


        if doc_id not in self.doc_to_facts:
            self.doc_to_facts[doc_id] = set()

        self.doc_to_facts[doc_id].add(unique_fact_id)
        self.id_to_fact[unique_fact_id] = fact
        
    def find_fact_id(fact):
        key = frozenset(fact)
        if key in self.fact_to_id:
            return self.fact_to_id[key]
        return None
        
    def print_info(self):
        print("---------------------------------------")
        print("Amount of ids   : {}".format(len(self.id_to_fact.keys())))
        print("Amount of facts : {}".format(len(self.fact_to_id.keys())))
        print("Amount of docs  : {}".format(len(self.doc_to_facts.keys())))
        print("Known chemicals : {}".format(len(self.chem_id_to_span.keys())))
        print("Known diseases  : {}".format(len(self.dis_id_to_span.keys())))
        print("Known genes     : {}".format(len(self.gen_id_to_span.keys())))
        print("---------------------------------------")
        
    def facts_to_str(self, facts):
        str_res = ""
        str_res += "["
        for f in facts:  
            if 'c_asso_d' is f[1]:
                c_name = replace_mesh_id_with_name(f[0])
                d_name = replace_mesh_id_with_name(f[2])
                str_res += '({}, associated, {})'.format(c_name, d_name) 
            if 'c_inter_g' is f[1]:
                c_name = replace_mesh_id_with_name(f[0])
                g_name = replace_gene_id_with_name(f[2])
                str_res += '({}, interacts, {})'.format(c_name, g_name)
            if 'g_inter_d' is f[1]:
                g_name = replace_gene_id_with_name(f[0])
                d_name = replace_mesh_id_with_name(f[2])
                str_res += '({}, interacts, {})'.format(g_name, d_name)


            if 'c_inhibits_g' is f[1]:
                c_name = replace_mesh_id_with_name(f[0])
                g_name = replace_gene_id_with_name(f[2])
                str_res += '({}, inhibits, {})'.format(c_name, g_name)
            if 'g_metabol_c' is f[1]:
                g_name = replace_gene_id_with_name(f[0])
                c_name = replace_mesh_id_with_name(f[2])
                str_res += '({}, metabol, {})'.format(g_name, c_name)

            str_res += ','

        str_res = str_res[0:-1] +  "]"
        return str_res

        
    def match_query_facts_in_doc_facts(self, query_facts, doc_facts):
        # store all qf substitutions 
        qf_substitutions = {}

        # all document facts must match
        for qf in query_facts:
            # allow variables in query
            if qf[0].startswith('?') or qf[2].startswith('?'):
                # look for possible substitution
                substitutions = []
                for df in doc_facts:
                    # predicates are equal?
                    if qf[1] == df[1]:
                        # is qf[0] not variable?
                        if not qf[0].startswith('?'):
                            # then both must be equal
                            if qf[0] == df[0]:
                                substitutions.append(df)
                            else:
                                # no match
                                break
                        # is qf[2] not variable?
                        if not qf[2].startswith('?'):
                            # then both must be equal
                            if qf[2] == df[2]:
                                substitutions.append(df)
                            else:
                                # no match
                                break
                # no substitution found?
                if len(substitutions) == 0:
                    return (False, {}) # query is not found in documents

                # there is at least one substitution - this fact is matched!
                qf_substitutions[qf] = substitutions
                continue # continue matching process

            # just check whether there is a direct match
            # here no substitution is necessary
            if qf not in doc_facts:
                return (False, {})
        return (True, qf_substitutions)


    def match_query_facts(self, query_facts):
        number_of_matches = 0
        matched_doc_ids = []
        # go through all documents
        for doc_id, doc_fact_ids in self.doc_to_facts.items():
            # replace all fact_ids by their their original facts
            doc_facts = []
            for dfi in doc_fact_ids:
                doc_facts.append(self.id_to_fact[dfi])

            # now match query against this facts
            (matched, subs) = self.match_query_facts_in_doc_facts(query_facts, doc_facts)
            if matched:
                # match found
                print('Match in {} (PMID: {}) with substitutions:'.format(doc_id, replace_snorkel_doc_id_with_pubmed_id(doc_id)))
                for k, v in subs.items():
                    print('\t{} is substituted by {}\n'.format(k, self.facts_to_str(v)))
                print('\n')
                number_of_matches += 1
                matched_doc_ids.append(doc_id)
        print('{} matches found!'.format(number_of_matches))
        return matched_doc_ids

In [69]:
fact_store = FactStore()


with open(result_path + 'chemical_disease_association.tsv', 'r') as f:
    first = True
    for line in f:
        if first: # skip header
            first = False
            continue
        
        spl = line.replace('\n', '').split('\t')
        doc_id = spl[0]
        sen_id = spl[1]
        chem_id = spl[3]
        chem_span = spl[4]
        dis_id = spl[5]
        dis_span = spl[6]
        
        fact_store.chem_id_to_span[chem_id] = chem_span
        fact_store.dis_id_to_span[dis_id] = dis_span
        
        #fact = (chem_span, 'cd', dis_span)      
        fact = (chem_id, 'c_asso_d', dis_id)
        fact_store.add_fact(doc_id, fact)

fact_store.print_info()

with open(result_path + 'chemical_gene_interaction.tsv', 'r') as f:
    first = True
    for line in f:
        if first: # skip header
            first = False
            continue
        
        spl = line.replace('\n', '').split('\t')
        doc_id = spl[0]
        sen_id = spl[1]
        chem_id = spl[3]
        chem_span = spl[4]
        gen_id = spl[5]
        gen_span = spl[6]
        
        fact_store.chem_id_to_span[chem_id] = chem_span
        fact_store.gen_id_to_span[gen_id] = gen_span
                
        #fact = (chem_span, 'inh', gen_span)
        fact = (chem_id, 'c_inter_g', gen_id)
        fact_store.add_fact(doc_id, fact)

        
fact_store.print_info()

with open(result_path + 'gene_disease_interaction.tsv', 'r') as f:
    first = True
    for line in f:
        if first: # skip header
            first = False
            continue
        
        spl = line.replace('\n', '').split('\t')
        doc_id = spl[0]
        sen_id = spl[1]
        gen_id = spl[3]
        gen_span = spl[4]
        dis_id = spl[5]
        dis_span = spl[6]
        
        fact_store.gen_id_to_span[gen_id] = gen_span
        fact_store.dis_id_to_span[dis_id] = dis_span
               
        #fact = (chem_span, 'inh', gen_span)
        fact = (gen_id, 'g_inter_d', dis_id)
        fact_store.add_fact(doc_id, fact)

fact_store.print_info()

with open(result_path + 'chemical_gene_inhibition.tsv', 'r') as f:
    first = True
    for line in f:
        if first: # skip header
            first = False
            continue
        
        spl = line.replace('\n', '').split('\t')
        doc_id = spl[0]
        sen_id = spl[1]
        chem_id = spl[3]
        chem_span = spl[4]
        gen_id = spl[5]
        gen_span = spl[6]
        
        fact_store.gen_id_to_span[gen_id] = gen_span
        fact_store.chem_id_to_span[chem_id] = chem_span
               
        fact = (chem_id, 'c_inhibits_g', gen_id)
        fact_store.add_fact(doc_id, fact)

        
fact_store.print_info()

with open(result_path + 'gene_chemical_metabolism.tsv', 'r') as f:
    first = True
    for line in f:
        if first: # skip header
            first = False
            continue
        
        spl = line.replace('\n', '').split('\t')
        doc_id = spl[0]
        sen_id = spl[1]
        gene_id = spl[3]
        gene_span = spl[4]
        chem_id = spl[5]
        chem_span = spl[6]
        
        fact_store.gen_id_to_span[gen_id] = gen_span
        fact_store.chem_id_to_span[chem_id] = chem_span
               
        fact = (gen_id, 'g_metabol_c', chem_id)
        fact_store.add_fact(doc_id, fact)
        
        
        
        
fact_store.print_info()


---------------------------------------
Amount of ids   : 325
Amount of facts : 325
Amount of docs  : 143
Known chemicals : 99
Known diseases  : 101
Known genes     : 0
---------------------------------------
---------------------------------------
Amount of ids   : 590
Amount of facts : 590
Amount of docs  : 236
Known chemicals : 160
Known diseases  : 101
Known genes     : 57
---------------------------------------
---------------------------------------
Amount of ids   : 733
Amount of facts : 733
Amount of docs  : 244
Known chemicals : 160
Known diseases  : 121
Known genes     : 71
---------------------------------------
---------------------------------------
Amount of ids   : 849
Amount of facts : 849
Amount of docs  : 253
Known chemicals : 169
Known diseases  : 121
Known genes     : 74
---------------------------------------
---------------------------------------
Amount of ids   : 917
Amount of facts : 917
Amount of docs  : 260
Known chemicals : 179
Known diseases  : 121
Known ge

Computing frequent occurring facts

In [4]:
to_check = []
ids_with_min_support = set()
for f_id in fact_store.id_to_fact.keys():
    support = 0
    # go through all documents
    for doc_facts in fact_store.doc_to_facts.values():
        if f_id in doc_facts:
             support += 1
    
    if support >= min_support:
        t_set = set()
        t_set.add(f_id)
        to_check.append(t_set)
        ids_with_min_support.add(f_id)
print(ids_with_min_support)

{521, 15, 28, 34, 35, 36, 42, 50, 52, 60, 66, 67, 75, 77, 596, 87, 92, 93, 94, 607, 616, 618, 625, 119, 126, 128, 671, 169, 173, 174, 176, 183, 712, 734, 735, 736, 738, 744, 747, 748, 750, 751, 752, 755, 758, 766, 770, 775, 778, 782, 785, 787, 788, 789, 792, 798, 800, 802, 815, 825, 325, 326, 337, 339, 851, 853, 854, 855, 344, 857, 346, 347, 861, 350, 862, 864, 867, 356, 870, 359, 360, 361, 362, 363, 872, 365, 874, 367, 875, 884, 374, 887, 386, 388, 389, 390, 901, 911, 407, 414, 418, 419, 420, 425, 426, 428, 429, 438, 464}


Computing frequent item sets 

In [5]:
results = []

explored_sets = set()
while to_check:
    # get fact candidate ids
    cand_ids_org = to_check.pop()
    for f_id in ids_with_min_support:
        if f_id in cand_ids_org:
            continue
        # check with this id included
        cand_ids = cand_ids_org.copy()
        cand_ids.add(f_id)
        
        # already checked this combi
        if frozenset(cand_ids) in explored_sets:
            continue
        
        #print("Starting with candidate ids: {}".format(cand_ids_org))
        # how much support does these ids have?
        support = 0
        # go through all documents
        doc_ids_supporting = []
        for doc_id, doc_facts in fact_store.doc_to_facts.items():
            included = True
            for f_id in cand_ids:
                if f_id not in doc_facts:
                    # if a fact id is not included - stop here (no support)
                    included = False
                    break
            if included:
                doc_ids_supporting.append(doc_id)
                support += 1
        
        explored_sets.add(frozenset(cand_ids))
        
        if support >= min_support:
            results.append((cand_ids.copy(), support, doc_ids_supporting))
            to_check.append(cand_ids)
            print("Support {} for {} in doc_ids: {}".format(support, cand_ids, doc_ids_supporting))



Support 4 for {884, 734} in doc_ids: ['203', '127', '284', '143']
Support 5 for {738, 884} in doc_ids: ['172', '203', '127', '211', '284']
Support 8 for {884, 798} in doc_ids: ['172', '203', '127', '181', '211', '284', '143', '287']
Support 4 for {884, 326} in doc_ids: ['172', '127', '181', '211']
Support 4 for {851, 884} in doc_ids: ['172', '203', '127', '211']
Support 4 for {867, 884} in doc_ids: ['203', '127', '284', '143']
Support 4 for {867, 884, 734} in doc_ids: ['203', '127', '284', '143']
Support 3 for {738, 867, 884} in doc_ids: ['203', '127', '284']
Support 4 for {867, 884, 798} in doc_ids: ['203', '127', '284', '143']
Support 4 for {867, 884, 734, 798} in doc_ids: ['203', '127', '284', '143']
Support 3 for {738, 867, 884, 798} in doc_ids: ['203', '127', '284']
Support 3 for {738, 867, 798, 884, 734} in doc_ids: ['203', '127', '284']
Support 3 for {738, 867, 884, 734} in doc_ids: ['203', '127', '284']
Support 4 for {738, 851, 884} in doc_ids: ['172', '203', '127', '211']
Supp

Support 3 for {874, 738, 788, 326} in doc_ids: ['117', '211', '298']
Support 3 for {874, 738, 789} in doc_ids: ['117', '211', '298']
Support 3 for {874, 788, 789} in doc_ids: ['117', '211', '298']
Support 3 for {874, 738, 788, 789} in doc_ids: ['117', '211', '298']
Support 3 for {874, 738, 788} in doc_ids: ['117', '211', '298']
Support 3 for {872, 738} in doc_ids: ['108', '218', '117']
Support 3 for {872, 851} in doc_ids: ['108', '218', '117']
Support 3 for {872, 738, 851} in doc_ids: ['108', '218', '117']
Support 3 for {851, 870} in doc_ids: ['295', '99', '127']
Support 8 for {867, 734} in doc_ids: ['201', '203', '69', '117', '127', '206', '284', '143']
Support 6 for {738, 867} in doc_ids: ['203', '69', '117', '127', '206', '284']
Support 3 for {867, 748} in doc_ids: ['201', '127', '284']
Support 3 for {792, 867} in doc_ids: ['201', '117', '127']
Support 4 for {867, 798} in doc_ids: ['203', '127', '284', '143']
Support 4 for {867, 326} in doc_ids: ['69', '117', '127', '206']
Support 5

Support 3 for {792, 851, 853, 734} in doc_ids: ['267', '117', '127']
Support 4 for {738, 770, 851, 853} in doc_ids: ['108', '60', '117', '135']
Support 3 for {738, 851, 853, 750} in doc_ids: ['70', '117', '127']
Support 3 for {738, 851, 50, 853} in doc_ids: ['47', '107', '241']
Support 3 for {36, 853, 326} in doc_ids: ['107', '228', '267']
Support 3 for {734, 853, 326} in doc_ids: ['267', '117', '127']
Support 10 for {738, 853, 326} in doc_ids: ['107', '228', '60', '70', '117', '127', '135', '188', '211', '221']
Support 3 for {744, 853, 326} in doc_ids: ['107', '112', '253']
Support 3 for {853, 326, 750} in doc_ids: ['70', '117', '127']
Support 3 for {770, 853, 326} in doc_ids: ['60', '117', '135']
Support 3 for {792, 853, 326} in doc_ids: ['267', '117', '127']
Support 3 for {792, 734, 853, 326} in doc_ids: ['267', '117', '127']
Support 3 for {770, 738, 853, 326} in doc_ids: ['60', '117', '135']
Support 3 for {738, 853, 326, 750} in doc_ids: ['70', '117', '127']
Support 3 for {744, 853

Support 3 for {738, 326, 750, 782, 851} in doc_ids: ['117', '127', '231']
Support 4 for {738, 778, 851, 326} in doc_ids: ['84', '159', '188', '298']
Support 4 for {738, 770, 851, 326} in doc_ids: ['60', '117', '135', '298']
Support 3 for {738, 851, 326, 755} in doc_ids: ['51', '159', '226']
Support 3 for {738, 851, 326, 751} in doc_ids: ['84', '29', '92']
Support 6 for {738, 851, 326, 750} in doc_ids: ['56', '70', '117', '127', '208', '231']
Support 3 for {738, 851, 326, 747} in doc_ids: ['48', '228', '127']
Support 5 for {744, 738, 851, 326} in doc_ids: ['107', '240', '206', '208', '298']
Support 3 for {738, 851, 326, 15} in doc_ids: ['9', '68', '240']
Support 7 for {738, 851, 36, 326} in doc_ids: ['48', '56', '57', '107', '126', '228', '240']
Support 4 for {738, 851, 50, 326} in doc_ids: ['35', '84', '107', '218']
Support 4 for {738, 851, 734, 326} in doc_ids: ['69', '117', '127', '206']
Support 3 for {738, 851, 326, 735} in doc_ids: ['218', '127', '195']
Support 3 for {825, 851, 521

Support 3 for {360, 738, 782} in doc_ids: ['122', '141', '127']
Support 3 for {360, 326, 782} in doc_ids: ['122', '141', '127']
Support 3 for {360, 326, 738, 782} in doc_ids: ['122', '141', '127']
Support 3 for {326, 734, 782} in doc_ids: ['141', '117', '127']
Support 6 for {326, 738, 782} in doc_ids: ['122', '141', '117', '127', '198', '231']
Support 4 for {326, 782, 750} in doc_ids: ['141', '117', '127', '231']
Support 3 for {326, 734, 782, 750} in doc_ids: ['141', '117', '127']
Support 4 for {326, 738, 782, 750} in doc_ids: ['141', '117', '127', '231']
Support 3 for {738, 326, 782, 750, 734} in doc_ids: ['141', '117', '127']
Support 3 for {326, 738, 734, 782} in doc_ids: ['141', '117', '127']
Support 3 for {734, 782, 750} in doc_ids: ['141', '117', '127']
Support 4 for {738, 782, 750} in doc_ids: ['141', '117', '127', '231']
Support 3 for {738, 734, 782, 750} in doc_ids: ['141', '117', '127']
Support 3 for {738, 734, 782} in doc_ids: ['141', '117', '127']
Support 4 for {778, 738} in

Support 4 for {616, 50} in doc_ids: ['84', '100', '160', '194']
Support 4 for {616, 365} in doc_ids: ['84', '100', '160', '194']
Support 4 for {616, 50, 365} in doc_ids: ['84', '100', '160', '194']
Support 3 for {34, 596} in doc_ids: ['20', '79', '179']
Support 3 for {464, 326} in doc_ids: ['165', '231', '254']
Support 4 for {326, 438} in doc_ids: ['141', '258', '134', '198']
Support 3 for {429, 326} in doc_ids: ['127', '165', '254']
Support 3 for {426, 429} in doc_ids: ['127', '165', '254']
Support 3 for {426, 429, 326} in doc_ids: ['127', '165', '254']
Support 3 for {428, 326} in doc_ids: ['127', '211', '263']
Support 3 for {360, 428} in doc_ids: ['127', '211', '263']
Support 3 for {360, 428, 326} in doc_ids: ['127', '211', '263']
Support 3 for {426, 326} in doc_ids: ['127', '165', '254']
Support 3 for {425, 326} in doc_ids: ['126', '135', '263']
Support 5 for {420, 326} in doc_ids: ['194', '214', '267', '127', '165']
Support 3 for {360, 420} in doc_ids: ['119', '267', '127']
Support

In [71]:
stories = sorted(results, key=lambda x: x[1], reverse=True)
    
for res, support, doc_ids in stories:
    facts = []
    for f_id in res:
        facts.append(fact_store.id_to_fact[f_id])
    print("Support {} for {}\n".format(support,fact_store.facts_to_str(facts)))

Support 67 for [("Simvastatin", inhibits, cytochrome P450 family 3 subfamily A member 4),("Simvastatin", interacts, cytochrome P450 family 3 subfamily A member 4)]

Support 65 for [(cytochrome P450 family 3 subfamily A member 4, metabol, "Simvastatin"),("Simvastatin", interacts, cytochrome P450 family 3 subfamily A member 4)]

Support 55 for [("Simvastatin", inhibits, cytochrome P450 family 3 subfamily A member 4),(cytochrome P450 family 3 subfamily A member 4, metabol, "Simvastatin")]

Support 43 for [("Simvastatin", inhibits, cytochrome P450 family 3 subfamily A member 4),(cytochrome P450 family 3 subfamily A member 4, metabol, "Simvastatin"),("Simvastatin", interacts, cytochrome P450 family 3 subfamily A member 4)]

Support 27 for [(cytochrome P450 family 3 subfamily A member 4, metabol, "Simvastatin"),(cytochrome P450 family 3 subfamily A member 4, metabol, "Lovastatin")]

Support 18 for [(cytochrome P450 family 3 subfamily A member 4, metabol, "Lovastatin"),("Simvastatin", interac

In [7]:
len(results_sorted)

755

In [81]:
filename = result_path + 'stories_supp{}.tsv'.format(min_support)
with open(filename, 'w') as f:
    f.write('{}\t{}\n'.format('support', 'frequent item set'))
    for res, support, doc_ids in stories:
        facts = set()
        for f_id in res:
            facts.add(fact_store.id_to_fact[f_id])
        f.write('{}\t{}\t{}\n'.format(support, doc_ids, facts))
print('Stories saved at {}'.format(filename))

Stories saved at results/simcyp298/stories_supp3.tsv


In [84]:
filename = result_path + 'stories_supp{}_translated.tsv'.format(min_support)
with open(filename, 'w') as f:
    for story, supp, doc_ids in stories:
        line = '{}'.format(supp)
        
        # translate id to facts
        facts = set()
        for f_id in story:
            facts.add(fact_store.id_to_fact[f_id])
        
        for event in facts:
            pred = event[1]
            
            if pred == 'c_asso_d':
                ev1 = replace_mesh_id_with_name(event[0])
                ev2 = replace_mesh_id_with_name(event[2])
            elif pred == 'c_inter_g':
                ev1 = replace_mesh_id_with_name(event[0])
                ev2 = replace_gene_id_with_name(event[2])
            elif pred == 'g_inter_d':
                ev1 = replace_gene_id_with_name(event[0])
                ev2 = replace_mesh_id_with_name(event[2])
            elif pred == 'c_inhibits_g':
                ev1 = replace_mesh_id_with_name(event[0])
                ev2 = replace_gene_id_with_name(event[2])
            elif pred == 'g_metabol_c':
                ev1 = replace_gene_id_with_name(event[0])
                ev2 = replace_mesh_id_with_name(event[2])
            
            line += '\t({},{},{})'.format(ev1, pred, ev2)
        
        # translate documument id
        translated_doc_ids = []
        for doc_id in doc_ids:
            translated_doc_ids.append(replace_snorkel_doc_id_with_pubmed_id(doc_id))
        
        line += '\t{}'.format(translated_doc_ids)
  
        line += '\n'
        f.write(line)
print('Translated stories saved at {}'.format(filename))

Error: Mesh_ID C065179 not in mesh dict
Error: Mesh_ID C065179 not in mesh dict
Error: Mesh_ID C065179 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C065179 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C065179 not in mesh dict
Error: Mesh_ID C065179 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C065179 not in mesh dict
Error: Mesh_ID C055162 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C055162 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C086276 not in mesh dict
Error: Mesh_ID C055162 not in mesh dict
Error: Mesh_ID C422923 not in mesh dict
Error: Mesh_ID C055162 not in mesh dict


In [67]:
# Sim -- asso -- Rhabdo, CYP3A4 -- meta -- Simvastatn, ?X -- inhib -- 1576
query = [('MESH:D019821', 'c_asso_d', 'MESH:D012206'), ('1576', 'g_metabol_c', 'MESH:D019821'), ('?X', 'c_inhibits_g', '1576')]


%time doc_ids = fact_store.match_query_facts(query)

print(doc_ids)

Match in 48 (PMID: 25041770)with substitutions:
	('?X', 'c_inhibits_g', '1576') is substituted by [("Simvastatin", inhibits, cytochrome P450 family 3 subfamily A member 4),("Clarithromycin", inhibits, cytochrome P450 family 3 subfamily A member 4)]



Match in 56 (PMID: 25571292)with substitutions:
Error: Mesh_ID C016904 not in mesh dict
	('?X', 'c_inhibits_g', '1576') is substituted by [("Simvastatin", inhibits, cytochrome P450 family 3 subfamily A member 4),("Verapamil", inhibits, cytochrome P450 family 3 subfamily A member 4),(MESH:C016904, inhibits, cytochrome P450 family 3 subfamily A member 4)]



Match in 57 (PMID: 25571293)with substitutions:
	('?X', 'c_inhibits_g', '1576') is substituted by [("Nelfinavir", inhibits, cytochrome P450 family 3 subfamily A member 4),("Simvastatin", inhibits, cytochrome P450 family 3 subfamily A member 4)]



Match in 107 (PMID: 16581325)with substitutions:
	('?X', 'c_inhibits_g', '1576') is substituted by [("Simvastatin", inhibits, cytochrome P450

In [55]:
for d_id in doc_ids:
    facts = []
    for f_id in fact_store.doc_to_facts[d_id]:
        facts.append(fact_store.id_to_fact[f_id])
    print("Story for PMID {}: {}\n\n".format(replace_snorkel_doc_id_with_pubmed_id(d_id), fact_store.facts_to_str(facts)))

Story for PMID 25041770: [("Simvastatin", inhibits, cytochrome P450 family 3 subfamily A member 4),("Simvastatin", associated, "Rhabdomyolysis"),("Simvastatin", interacts, cytochrome P450 family 3 subfamily A member 4),("Clarithromycin", associated, "Rhabdomyolysis"),("Clarithromycin", inhibits, cytochrome P450 family 3 subfamily A member 4),(cytochrome P450 family 3 subfamily A member 4, interacts, "Muscular Diseases"),(cytochrome P450 family 3 subfamily A member 4, metabol, "Simvastatin"),(cytochrome P450 family 3 subfamily A member 4, metabol, "Clarithromycin")]


Story for PMID 17381388: [("Creatine", associated, "Muscle Weakness"),("Erythromycin", associated, "Rhabdomyolysis"),("Simvastatin", associated, "Rhabdomyolysis"),("Creatine", associated, "Movement Disorders"),("Macrolides", associated, "Rhabdomyolysis"),("Clarithromycin", associated, "Rhabdomyolysis"),("Simvastatin", interacts, ATP binding cassette subfamily B member 1),("Simvastatin", inhibits, ATP binding cassette subfa