In [1]:
# Directory prefix (with trailing slash) that is prepended to every file name
# this notebook opens for reading or writing.
result_path = 'results/simcyp298/'
# Minimum number of documents a fact (or a set of facts) has to occur in
# before it is kept as "frequent".
min_support = 3

In [2]:
class FactStore:
    """In-memory index of facts and the documents in which they occur.

    A fact is an ordered sequence such as ``(subject_id, predicate, object_id)``.
    Each distinct fact gets a small integer id, handed out in order of first
    appearance.

    Attributes:
        unique_fact_id_counter: next id to hand out to an unseen fact.
        id_to_fact: maps a fact id to the fact (stored as a tuple).
        fact_to_id: maps a fact (as a tuple) to its id.
        doc_to_facts: maps a document id to the set of fact ids seen in it.
        chem_id_to_span, dis_id_to_span, gen_id_to_span: lookup tables from an
            entity id to a text span; they are filled by the caller, not by
            this class.
    """

    def __init__(self):
        self.unique_fact_id_counter = 0
        self.id_to_fact = {}
        self.fact_to_id = {}
        self.doc_to_facts = {}

        self.chem_id_to_span = {}
        self.dis_id_to_span = {}
        self.gen_id_to_span = {}

    def add_fact(self, doc_id, fact):
        """Register ``fact`` as occurring in document ``doc_id``.

        Args:
            doc_id: hashable identifier of the document.
            fact: ordered sequence of hashable elements describing the fact.

        Returns:
            The integer id of the fact (newly assigned or already known).
        """
        # The key has to preserve element order: a frozenset would merge
        # (a, rel, b) with (b, rel, a) and shrink (x, rel, x) to two elements,
        # so distinct facts would silently share one id.
        key = tuple(fact)
        unique_fact_id = self.fact_to_id.get(key)
        if unique_fact_id is None:
            unique_fact_id = self.unique_fact_id_counter
            self.unique_fact_id_counter += 1
            self.fact_to_id[key] = unique_fact_id
            self.id_to_fact[unique_fact_id] = key

        self.doc_to_facts.setdefault(doc_id, set()).add(unique_fact_id)
        return unique_fact_id

    def print_info(self):
        """Print the sizes of all internal lookup tables."""
        print("---------------------------------------")
        print("Amount of ids   : {}".format(len(self.id_to_fact)))
        print("Amount of facts : {}".format(len(self.fact_to_id)))
        print("Amount of docs  : {}".format(len(self.doc_to_facts)))
        print("Known chemicals : {}".format(len(self.chem_id_to_span)))
        print("Known diseases  : {}".format(len(self.dis_id_to_span)))
        print("Known genes     : {}".format(len(self.gen_id_to_span)))
        print("---------------------------------------")

In [3]:
def load_relation_file(fact_store, path, predicate, subject_spans, object_spans):
    """Read one tab-separated relation file into ``fact_store``.

    The first line of the file is treated as a header and skipped. From every
    other non-empty line the columns 0 (document id), 3 (subject id),
    4 (subject span), 5 (object id) and 6 (object span) are read; the remaining
    columns are ignored.

    Args:
        fact_store: store that receives the facts via ``add_fact``.
        path: path of the TSV file to read.
        predicate: label placed in the middle of each fact tuple.
        subject_spans: dict updated with ``subject id -> subject span``.
        object_spans: dict updated with ``object id -> object span``.
    """
    with open(path, 'r') as f:
        next(f, None)  # skip header (no error on an empty file)
        for line in f:
            row = line.rstrip('\n')
            if not row:
                # tolerate blank lines, e.g. a trailing newline at the end of the file
                continue

            spl = row.split('\t')
            doc_id = spl[0]
            subject_id, subject_span = spl[3], spl[4]
            object_id, object_span = spl[5], spl[6]

            subject_spans[subject_id] = subject_span
            object_spans[object_id] = object_span

            fact_store.add_fact(doc_id, (subject_id, predicate, object_id))


fact_store = FactStore()

# (file name, predicate, span table for the subject, span table for the object)
relation_files = [
    ('chemical_disease_association.tsv', 'c_asso_d',
     fact_store.chem_id_to_span, fact_store.dis_id_to_span),
    ('chemical_gene_interaction.tsv', 'c_inter_g',
     fact_store.chem_id_to_span, fact_store.gen_id_to_span),
    ('gene_disease_interaction.tsv', 'g_inter_d',
     fact_store.gen_id_to_span, fact_store.dis_id_to_span),
]

for file_name, predicate, subject_spans, object_spans in relation_files:
    load_relation_file(fact_store, result_path + file_name, predicate,
                       subject_spans, object_spans)
    # report the running totals after each file
    fact_store.print_info()


---------------------------------------
Amount of ids   : 325
Amount of facts : 325
Amount of docs  : 143
Known chemicals : 99
Known diseases  : 101
Known genes     : 0
---------------------------------------
---------------------------------------
Amount of ids   : 590
Amount of facts : 590
Amount of docs  : 236
Known chemicals : 160
Known diseases  : 101
Known genes     : 57
---------------------------------------
---------------------------------------
Amount of ids   : 733
Amount of facts : 733
Amount of docs  : 244
Known chemicals : 160
Known diseases  : 121
Known genes     : 71
---------------------------------------


Computing frequently occurring facts

In [6]:
# Collect every fact that appears in at least `min_support` documents.
# `to_check` gets one single-element set per such fact (the seeds for the
# item set search), `ids_with_min_support` the plain ids.
ids_with_min_support = set()
to_check = []
for fact_id in fact_store.id_to_fact.keys():
    # number of documents that contain this fact
    doc_count = sum(
        1 for facts_in_doc in fact_store.doc_to_facts.values()
        if fact_id in facts_in_doc
    )
    if doc_count < min_support:
        continue

    to_check.append({fact_id})
    ids_with_min_support.add(fact_id)
print(ids_with_min_support)

{128, 386, 388, 389, 390, 521, 15, 407, 28, 414, 671, 34, 35, 36, 418, 419, 420, 169, 42, 425, 426, 173, 174, 428, 176, 429, 50, 52, 438, 183, 60, 66, 67, 325, 326, 712, 75, 77, 464, 337, 339, 596, 87, 344, 346, 347, 92, 93, 94, 350, 607, 356, 359, 360, 361, 362, 363, 616, 365, 618, 367, 625, 374, 119, 126}


Computing frequent item sets 

In [7]:
results = []
explored_sets = set()

# Grow item sets one fact at a time: take a pending set from `to_check`,
# try to extend it with every frequent single fact, and keep (and re-queue)
# each extension that still reaches `min_support` documents.
while to_check:
    base_ids = to_check.pop()
    for extra_id in ids_with_min_support:
        if extra_id in base_ids:
            continue

        # candidate = pending set plus one more frequent fact
        cand_ids = base_ids.copy()
        cand_ids.add(extra_id)

        # every combination is evaluated only once
        cand_key = frozenset(cand_ids)
        if cand_key in explored_sets:
            continue
        explored_sets.add(cand_key)

        # support = number of documents containing all candidate facts
        support = sum(
            1 for doc_facts in fact_store.doc_to_facts.values()
            if cand_ids.issubset(doc_facts)
        )
        if support < min_support:
            continue

        results.append((cand_ids.copy(), support))
        to_check.append(cand_ids)
        print("Support {} for {}".format(support, cand_ids))



Support 3 for {712, 36}
Support 4 for {386, 671}
Support 3 for {326, 671}
Support 3 for {386, 326, 671}
Support 5 for {625, 36}
Support 3 for {625, 326}
Support 6 for {618, 50}
Support 5 for {618, 326}
Support 3 for {618, 50, 326}
Support 4 for {616, 50}
Support 4 for {616, 365}
Support 4 for {616, 50, 365}
Support 3 for {34, 596}
Support 3 for {464, 326}
Support 4 for {326, 438}
Support 3 for {426, 429}
Support 3 for {429, 326}
Support 3 for {426, 429, 326}
Support 3 for {428, 326}
Support 3 for {360, 428}
Support 3 for {360, 428, 326}
Support 3 for {426, 326}
Support 3 for {425, 326}
Support 5 for {420, 326}
Support 3 for {360, 420}
Support 3 for {419, 326}
Support 3 for {418, 326}
Support 3 for {36, 414}
Support 5 for {326, 414}
Support 3 for {326, 36, 414}
Support 3 for {326, 407}
Support 4 for {326, 390}
Support 6 for {389, 326}
Support 5 for {388, 326}
Support 7 for {386, 326}
Support 4 for {50, 365}
Support 6 for {365, 326}
Support 3 for {347, 365}
Support 3 for {363, 365}
Suppo

In [8]:
def facts_to_str(fact_store, facts):
    """Render facts as one bracketed, human-readable string.

    Each fact is indexed as ``f[0]`` (subject id), ``f[1]`` (predicate) and
    ``f[2]`` (object id). The ids are replaced by the text spans stored in
    ``fact_store``; every fact is followed by a comma. A fact with an unknown
    predicate contributes only its trailing comma.

    Args:
        fact_store: object providing the dicts ``chem_id_to_span``,
            ``dis_id_to_span`` and ``gen_id_to_span``.
        facts: iterable of facts.

    Returns:
        A string such as ``"[(simvastatin, interacts, CYP3A4),]"``.

    Raises:
        KeyError: if an id of a known predicate has no span in ``fact_store``.
    """
    # predicate -> (span table for f[0], span table for f[2], verb to print)
    # Looked up by equality: comparing strings with `is` only works while both
    # strings happen to be the same interned object.
    renderers = {
        'c_asso_d': (fact_store.chem_id_to_span, fact_store.dis_id_to_span, 'associated'),
        'c_inter_g': (fact_store.chem_id_to_span, fact_store.gen_id_to_span, 'interacts'),
        'g_inter_d': (fact_store.gen_id_to_span, fact_store.dis_id_to_span, 'interacts'),
        'inh': (fact_store.chem_id_to_span, fact_store.gen_id_to_span, 'inhibits'),
        'meta': (fact_store.gen_id_to_span, fact_store.chem_id_to_span, 'meta'),
    }

    parts = []
    for f in facts:
        renderer = renderers.get(f[1])
        if renderer is not None:
            subject_spans, object_spans, verb = renderer
            parts.append('({}, {}, {})'.format(subject_spans[f[0]], verb, object_spans[f[2]]))
        parts.append(',')
    return '[' + ''.join(parts) + ']'
            

# Order the item sets from highest to lowest support.
results_sorted = sorted(results, key=lambda entry: entry[1], reverse=True)

# Print every item set with its facts translated to readable text,
# followed by an empty line.
for res, support in results_sorted:
    facts = [fact_store.id_to_fact[f_id] for f_id in res]
    print("Support {} for {}".format(support, facts_to_str(fact_store, facts)))
    print()
    print()

Support 15 for [(simvastatin, associated, rhabdomyolysis),(simvastatin, interacts, CYP3A4),]

Support 14 for [(atorvastatin, interacts, CYP3A4),(simvastatin, interacts, CYP3A4),]

Support 10 for [(cholesterol, interacts, CYP3A4),(simvastatin, interacts, CYP3A4),]

Support 8 for [(simvastatin, associated, myopathy),(simvastatin, interacts, CYP3A4),]

Support 7 for [(clopidogrel, interacts, CYP3A4),(simvastatin, interacts, CYP3A4),]

Support 7 for [(simvastatin, interacts, CYP3A4),(simvastatin, interacts, hydroxymethylglutarylXcoenzymeXAXXHMGXCoAXXreductase),]

Support 7 for [(simvastatin, interacts, CYP3A5),(simvastatin, interacts, CYP3A4),]

Support 6 for [(CYP3A4, interacts, myopathy),(simvastatin, associated, myopathy),]

Support 6 for [(diltiazem, interacts, CYP3A4),(simvastatin, interacts, CYP3A4),]

Support 6 for [(simvastatin, interacts, SLCO1B1),(simvastatin, interacts, CYP3A4),]

Support 5 for [(CYP3A4, interacts, rhabdomyolysis),(simvastatin, associated, rhabdomyolysis),]

Sup

In [9]:
# Number of (item set, support) entries in the sorted result list.
len(results_sorted)

75

In [11]:
# Write one line per frequent item set: "<support>\t<facts>".
filename = result_path + 'stories_supp{}.tsv'.format(min_support)
with open(filename, 'w') as f:
    f.write('{}\t{}\n'.format('support', 'frequent item set'))
    for res, support in results_sorted:
        # Formatting a set directly would emit its elements in hash order,
        # which for strings changes between interpreter runs. Sort the facts
        # and build the same "{...}" text by hand so the file is reproducible.
        facts = sorted({fact_store.id_to_fact[f_id] for f_id in res})
        if facts:
            facts_repr = '{' + ', '.join(repr(fact) for fact in facts) + '}'
        else:
            facts_repr = 'set()'  # what an empty set is formatted as
        f.write('{}\t{}\n'.format(support, facts_repr))
print('Stories saved at {}'.format(filename))

Stories saved at results/simcyp298/stories_supp3.tsv


In [None]:
''