In [1]:
# Directory prefix for the relation TSV files read below; the result file is written here too.
result_path = 'results/tafamidis97/'
# Minimum number of documents a fact must occur in to count as frequent.
min_support = 2

In [2]:
class FactStore:
    """In-memory index of facts and of the documents they were found in.

    A fact is an ordered triple ``(subject_id, predicate, object_id)``. Each
    distinct triple is assigned a consecutive integer id, starting at 0.

    Attributes:
        unique_fact_id_counter: the id the next unseen fact will receive.
        id_to_fact: fact id -> fact, as passed to ``add_fact`` the first time.
        fact_to_id: fact (as a tuple) -> fact id.
        doc_to_facts: document id -> set of fact ids seen in that document.
        chem_id_to_span / dis_id_to_span / gen_id_to_span: entity id -> text
            span, filled in directly by the code that loads the data.
    """

    def __init__(self):
        self.unique_fact_id_counter = 0
        self.id_to_fact = {}
        self.fact_to_id = {}
        self.doc_to_facts = {}

        self.chem_id_to_span = {}
        self.dis_id_to_span = {}
        self.gen_id_to_span = {}

    def add_fact(self, doc_id, fact):
        """Record that ``fact`` occurs in document ``doc_id``.

        Args:
            doc_id: hashable identifier of the document.
            fact: iterable of hashable items, here a
                ``(subject_id, predicate, object_id)`` triple.
        """
        # Key on the ordered tuple. A frozenset would drop the order and merge
        # (a, rel, b) with (b, rel, a), and would shrink (a, rel, a) to two items.
        key = tuple(fact)
        unique_fact_id = self.fact_to_id.get(key)
        if unique_fact_id is None:
            unique_fact_id = self.unique_fact_id_counter
            self.unique_fact_id_counter += 1
            self.fact_to_id[key] = unique_fact_id
            self.id_to_fact[unique_fact_id] = fact

        self.doc_to_facts.setdefault(doc_id, set()).add(unique_fact_id)

    def print_info(self):
        """Print the sizes of all internal indexes to stdout."""
        print("---------------------------------------")
        print("Amount of ids   : {}".format(len(self.id_to_fact)))
        print("Amount of facts : {}".format(len(self.fact_to_id)))
        print("Amount of docs  : {}".format(len(self.doc_to_facts)))
        print("Known chemicals : {}".format(len(self.chem_id_to_span)))
        print("Known diseases  : {}".format(len(self.dis_id_to_span)))
        print("Known genes     : {}".format(len(self.gen_id_to_span)))
        print("---------------------------------------")

In [3]:
def load_relation_file(fact_store, path, predicate, subj_id_to_span, obj_id_to_span):
    """Read one tab-separated relation file and add its facts to ``fact_store``.

    The first line is treated as a header and skipped. For every other line the
    columns used are: 0 = document id, 3 = subject id, 4 = subject span,
    5 = object id, 6 = object span. All other columns are ignored.

    Args:
        fact_store: store whose ``add_fact`` receives ``(subj_id, predicate, obj_id)``.
        path: path of the TSV file to read.
        predicate: predicate label placed in the middle of every fact.
        subj_id_to_span: dict updated with subject id -> subject span.
        obj_id_to_span: dict updated with object id -> object span.
    """
    with open(path, 'r') as tsv_file:
        next(tsv_file, None)  # skip header (no-op on an empty file)
        for line in tsv_file:
            if not line.strip():
                # A blank line (e.g. at the end of the file) carries no record.
                continue

            spl = line.replace('\n', '').split('\t')
            doc_id = spl[0]
            subj_id, subj_span = spl[3], spl[4]
            obj_id, obj_span = spl[5], spl[6]

            subj_id_to_span[subj_id] = subj_span
            obj_id_to_span[obj_id] = obj_span

            fact_store.add_fact(doc_id, (subj_id, predicate, obj_id))


fact_store = FactStore()

# (file name, predicate, subject span lookup, object span lookup)
relation_files = [
    ('chemical_disease_association.tsv', 'c_asso_d',
     fact_store.chem_id_to_span, fact_store.dis_id_to_span),
    ('chemical_gene_interaction.tsv', 'c_inter_g',
     fact_store.chem_id_to_span, fact_store.gen_id_to_span),
    ('gene_disease_interaction.tsv', 'g_inter_d',
     fact_store.gen_id_to_span, fact_store.dis_id_to_span),
]

for file_name, predicate, subj_spans, obj_spans in relation_files:
    load_relation_file(fact_store, result_path + file_name, predicate, subj_spans, obj_spans)
    # Report the running totals after each file, as before.
    fact_store.print_info()


---------------------------------------
Amount of ids   : 17
Amount of facts : 17
Amount of docs  : 18
Known chemicals : 10
Known diseases  : 13
Known genes     : 0
---------------------------------------
---------------------------------------
Amount of ids   : 29
Amount of facts : 29
Amount of docs  : 29
Known chemicals : 17
Known diseases  : 13
Known genes     : 2
---------------------------------------
---------------------------------------
Amount of ids   : 104
Amount of facts : 104
Amount of docs  : 77
Known chemicals : 17
Known diseases  : 64
Known genes     : 12
---------------------------------------


Computing frequently occurring facts

In [7]:
# Seed the item-set search with every single fact that occurs in at least
# `min_support` documents. `to_check` holds one singleton set per such fact,
# `ids_with_min_support` the ids themselves.
to_check = []
ids_with_min_support = set()
for f_id in fact_store.id_to_fact:
    # support = number of documents whose fact set contains this id
    support = sum(1 for doc_facts in fact_store.doc_to_facts.values() if f_id in doc_facts)
    if support < min_support:
        continue
    to_check.append({f_id})
    ids_with_min_support.add(f_id)
print(ids_with_min_support)

{3, 4, 6, 16, 17, 22, 25, 29, 30, 31, 32, 33, 36, 37, 38, 40, 41, 43, 48, 58, 64, 65, 66, 69, 93, 97, 100}


Computing frequent item sets 

In [8]:
# Grow frequent item sets: take a frequent set from the work list, extend it by
# one frequent fact id, and keep the extension only if it is still supported by
# at least `min_support` documents. Kept sets go back on the work list so they
# can be extended further.
results = []  # list of (set of fact ids, support)

explored_sets = set()  # frozensets of every candidate already evaluated
while to_check:
    # get fact candidate ids
    cand_ids_org = to_check.pop()
    for new_id in ids_with_min_support:
        if new_id in cand_ids_org:
            continue
        # check with this id included
        cand_ids = cand_ids_org | {new_id}
        cand_key = frozenset(cand_ids)

        # already checked this combi
        if cand_key in explored_sets:
            continue
        explored_sets.add(cand_key)

        # support = number of documents that contain every fact of the candidate
        support = sum(
            1 for doc_facts in fact_store.doc_to_facts.values()
            if cand_ids <= doc_facts
        )

        # Only sufficiently supported sets are reported and expanded further.
        # (The former test `min_support >= 1` was always true and let every
        # candidate through, including those with support 0.)
        if support >= min_support:
            results.append((cand_ids.copy(), support))
            to_check.append(cand_ids)
            print("Support {} for {}".format(support, cand_ids))



Support 2 for {65, 29}
Support 2 for {48, 65}
Support 4 for {58, 30}
Support 3 for {58, 31}
Support 2 for {32, 58}
Support 2 for {41, 58}
Support 2 for {41, 58, 30}
Support 2 for {41, 58, 31}
Support 2 for {41, 58, 30, 31}
Support 2 for {58, 30, 31}
Support 4 for {48, 29}
Support 4 for {48, 31}
Support 3 for {48, 33}
Support 2 for {48, 38}
Support 2 for {43, 29}
Support 3 for {43, 31}
Support 5 for {41, 43}
Support 3 for {41, 43, 31}
Support 6 for {41, 29}
Support 2 for {41, 30}
Support 7 for {41, 31}
Support 2 for {41, 38}
Support 2 for {41, 30, 31}
Support 2 for {40, 29}
Support 2 for {3, 38}
Support 2 for {4, 38}
Support 8 for {29, 38}
Support 4 for {38, 30}
Support 7 for {38, 31}
Support 2 for {32, 38}
Support 2 for {37, 38}
Support 2 for {37, 38, 31}
Support 2 for {29, 38, 31}
Support 2 for {30, 38, 31}
Support 2 for {29, 38, 30}
Support 3 for {37, 31}
Support 2 for {33, 17}
Support 3 for {33, 29}
Support 3 for {33, 31}
Support 2 for {32, 33}
Support 2 for {32, 33, 17}
Support 2 f

In [9]:
def facts_to_str(fact_store, facts):
    """Render facts as a readable string using entity spans instead of ids.

    Args:
        fact_store: object providing the ``chem_id_to_span``, ``dis_id_to_span``
            and ``gen_id_to_span`` dicts (entity id -> text span).
        facts: iterable of facts; each fact is indexed as
            ``fact[0]`` = subject id, ``fact[1]`` = predicate, ``fact[2]`` = object id.

    Returns:
        A string of the form ``"[(subj, verb, obj),(subj, verb, obj),]"``. Every
        fact contributes a trailing comma; a fact with an unknown predicate
        contributes only that comma.

    Raises:
        KeyError: if an id of a fact with a known predicate has no span.
    """
    chem_spans = fact_store.chem_id_to_span
    dis_spans = fact_store.dis_id_to_span
    gen_spans = fact_store.gen_id_to_span

    # predicate -> (subject span lookup, verb shown in the output, object span lookup)
    templates = {
        'c_asso_d': (chem_spans, 'associated', dis_spans),
        'c_inter_g': (chem_spans, 'interacts', gen_spans),
        'g_inter_d': (gen_spans, 'interacts', dis_spans),
        'inh': (chem_spans, 'inhibits', gen_spans),
        'meta': (gen_spans, 'meta', chem_spans),
    }

    parts = ["["]
    for f in facts:
        # Look the predicate up by value. The former `'literal' is f[1]` test
        # compared object identity, which only matches interned strings.
        template = templates.get(f[1])
        if template is not None:
            subj_spans, verb, obj_spans = template
            parts.append('({}, {}, {})'.format(subj_spans[f[0]], verb, obj_spans[f[2]]))
        parts.append(',')
    parts.append("]")
    return ''.join(parts)
            

# Order the item sets by support, highest first; ties keep their discovery order.
results_sorted = list(results)
results_sorted.sort(key=lambda entry: entry[1], reverse=True)

# Print each item set with its fact ids resolved to readable facts.
for res, support in results_sorted:
    facts = [fact_store.id_to_fact[f_id] for f_id in res]
    print("Support {} for {}".format(support, facts_to_str(fact_store, facts)))
    print()

Support 8 for [(TTR, interacts, autosomalXdominantXdisordersXofXfamilialXamyloidoticXpolyneuropathy),(TTR, interacts, neuropathy),]

Support 7 for [(TTR, interacts, polyneuropathy),(TTR, interacts, amyloidosis),]

Support 7 for [(TTR, interacts, neuropathy),(TTR, interacts, amyloidosis),]

Support 6 for [(TTR, interacts, polyneuropathy),(TTR, interacts, autosomalXdominantXdisordersXofXfamilialXamyloidoticXpolyneuropathy),]

Support 6 for [(TTR, interacts, familialXamyloidoticXcardiomyopathy),(TTR, interacts, amyloidosis),]

Support 6 for [(TTR, interacts, autosomalXdominantXdisordersXofXfamilialXamyloidoticXpolyneuropathy),(TTR, interacts, familialXamyloidoticXcardiomyopathy),]

Support 5 for [(TTR, interacts, polyneuropathy),(TTR, interacts, cardiacXinvolvement),]

Support 5 for [(TTR, interacts, autosomalXdominantXdisordersXofXfamilialXamyloidoticXpolyneuropathy),(TTR, interacts, amyloidosis),]

Support 4 for [(TTR, interacts, amyloidXneuropathy),(TTR, interacts, familialXamyloidotic

In [10]:
# Number of (item set, support) entries in the sorted result list.
len(results_sorted)

65

In [11]:

# Export the ranked item sets as TSV: one row per item set, holding its support
# and the facts it consists of. The file name carries the support threshold.
stories_path = result_path + 'stories_supp{}.tsv'.format(min_support)
with open(stories_path, 'w') as f:
    f.write('{}\t{}\n'.format('support', 'frequent item set'))
    for res, support in results_sorted:
        # Sort the facts so the written row is identical on every run. The
        # textual order of a set of string tuples varies with hash randomisation.
        facts = sorted({fact_store.id_to_fact[f_id] for f_id in res})
        item_set = '{' + ', '.join(repr(fact) for fact in facts) + '}'
        f.write('{}\t{}\n'.format(support, item_set))


In [None]:
''