In [1]:
import os
import pickle
import shutil
import gc
import bisect
import copy
import time
from tqdm import tqdm
from collections import defaultdict, Counter
from aser.database.db_connection import SqliteDBConnection, MongoDBConnection
from aser.database.kg_connection import ASERConceptConnection
from aser.database.kg_connection import CHUNKSIZE
from aser.database.kg_connection import EVENTUALITY_TABLE_NAME, EVENTUALITY_COLUMNS, EVENTUALITY_COLUMN_TYPES
from aser.database.kg_connection import RELATION_TABLE_NAME, RELATION_COLUMNS, RELATION_COLUMN_TYPES
from aser.conceptualize.aser_conceptualizer import ProbaseASERConceptualizer

from aser.eventuality import Eventuality
from aser.relation import Relation, relation_senses

In [7]:
# Load the complete knowledge graph into memory: every eventuality row and
# every relation row, together with per-`_id` frequency lookups.
# `rfreqs` is read by the per-threshold statistics further down.
kg_path = "/home/xliucr/ASER/database/core_2.0/all/KG.db"

kg_conn = SqliteDBConnection(kg_path, CHUNKSIZE)
erows = []
rrows = []
efreqs = dict()
rfreqs = dict()
try:
    for erow in kg_conn.get_columns(EVENTUALITY_TABLE_NAME, EVENTUALITY_COLUMNS):
        efreqs[erow["_id"]] = erow["frequency"]
        erows.append(erow)

    for rrow in kg_conn.get_columns(RELATION_TABLE_NAME, RELATION_COLUMNS):
        # A relation's frequency is the sum of its values over all names in
        # `relation_senses`; names missing from the row count as 0.0.
        rfreqs[rrow["_id"]] = sum(rrow.get(r, 0.0) for r in relation_senses)
        rrows.append(rrow)
finally:
    # Release the database handle even when reading fails part-way through.
    kg_conn.close()

In [8]:
# Number of distinct relation `_id`s collected in `rfreqs`.
len(rfreqs)

52296498

In [2]:
def convert_row_to_eventuality(row):
    """Rebuild an eventuality object from one eventuality table row.

    The object is obtained from ``Eventuality().decode(row["info"])``; its
    ``eid``, ``frequency`` and ``pattern`` attributes are then overwritten
    with the row's ``_id``, ``frequency`` and ``pattern`` values.

    :param row: mapping providing the keys ``info``, ``_id``, ``frequency``
        and ``pattern``
    :return: the decoded object with the three attributes set from the row
    """
    decoded = Eventuality().decode(row["info"])
    # (attribute on the object, column in the row), applied in this order.
    for attr_name, column in (("eid", "_id"),
                              ("frequency", "frequency"),
                              ("pattern", "pattern")):
        setattr(decoded, attr_name, row[column])
    return decoded

def convert_row_to_relation(row):
    """Build a ``Relation`` from one relation table row.

    Every column whose value is a ``float`` strictly greater than 0.0 is
    passed on as a ``{column: value}`` entry (presumably these are the
    relation-sense columns -- the string id columns are skipped by the
    ``float`` check).

    :param row: mapping providing ``hid``, ``tid`` and the weighted columns
    :return: ``Relation(hid, tid, positive_float_columns)``
    """
    head_id, tail_id = row["hid"], row["tid"]
    positive_weights = {}
    for column, value in row.items():
        if isinstance(value, float) and value > 0.0:
            positive_weights[column] = value
    return Relation(head_id, tail_id, positive_weights)

def build_concept_instance_table(aser_conceptualizer, erows):
    """Conceptualize every eventuality row and collect concepts with their instances.

    For each row the eventuality is rebuilt and handed to
    ``aser_conceptualizer.conceptualize``, which yields ``(concept, score)``
    pairs. The first time a concept id is seen, a shallow copy of the concept
    is stored; afterwards the stored copy is reused.

    :param aser_conceptualizer: object exposing ``conceptualize(eventuality)``
    :param erows: iterable of eventuality table rows
    :return: tuple ``(cid2concept, concept_instance_pairs, cid_to_filter_score)``

        * ``cid2concept`` -- concept id -> stored concept
        * ``concept_instance_pairs`` -- list of ``(concept, eventuality, score)``
        * ``cid_to_filter_score`` -- concept id -> sum of
          ``score * eventuality.frequency`` over newly added instances
    """
    cid2concept = {}
    concept_instance_pairs = []
    cid_to_filter_score = {}

    for erow in tqdm(erows):
        eventuality = convert_row_to_eventuality(erow)
        for candidate, score in aser_conceptualizer.conceptualize(eventuality):
            cid = candidate.cid
            if cid not in cid2concept:
                cid2concept[cid] = copy.copy(candidate)
            stored = cid2concept[cid]

            instance = (eventuality.eid, eventuality.pattern, score)
            # NOTE(review): this is a linear scan of a list; it may become
            # costly for concepts with many instances -- measure before
            # changing, since the initial content of `instances` is not
            # visible from here.
            if instance not in stored.instances:
                stored.instances.append(instance)
                # Only newly recorded instances contribute to the score.
                cid_to_filter_score[cid] = (
                    cid_to_filter_score.get(cid, 0.0) + score * eventuality.frequency
                )

            # NOTE(review): the pair is recorded even when the instance was
            # already present on the concept -- confirm this is intended.
            concept_instance_pairs.append((stored, eventuality, score))

    return cid2concept, concept_instance_pairs, cid_to_filter_score

def build_concept_relation_table(aser_concept_conn, rrows):
    """Lift eventuality-level relations to concept-level relations.

    For every head concept known to ``aser_concept_conn`` the function walks
    head concept -> its eventuality instances -> relations leaving those
    eventualities -> concepts of the tail eventualities, and accumulates one
    ``Relation`` per (head concept, tail concept) pair. Pairs whose head and
    tail concept ids are identical are skipped.

    :param aser_concept_conn: object exposing ``cids``,
        ``get_eventualities_given_concept(cid)`` (yielding
        ``(eid, pattern, score)`` triples) and
        ``get_concepts_given_eventuality(eid)`` (yielding
        ``(concept, score)`` pairs)
    :param rrows: iterable of relation table rows with ``hid`` and ``tid``
    :return: dict mapping relation id -> concept-level ``Relation``
    """
    rid2relation = dict()

    # Index the relations by head eventuality id: hid -> [(tid, Relation), ...].
    hid2related_events = defaultdict(list)
    for rrow in rrows:
        relation = convert_row_to_relation(rrow)
        hid2related_events[rrow["hid"]].append((rrow["tid"], relation))

    for h_cid in tqdm(aser_concept_conn.cids):
        instances = aser_concept_conn.get_eventualities_given_concept(h_cid)
        for h_eid, _pattern, instance_score in instances:
            # eid -> event -> related eids -> related events, relations -> related concepts, relations
            # Use .get() rather than indexing: indexing a defaultdict inserts
            # an empty list for every head eventuality that has no outgoing
            # relation, which needlessly grows the index during this
            # read-only pass.
            related_events = hid2related_events.get(h_eid, ())
            for t_eid, relation in related_events:
                concept_score_pairs = aser_concept_conn.get_concepts_given_eventuality(t_eid)
                for t_concept, score in concept_score_pairs:
                    t_cid = t_concept.cid
                    if h_cid == t_cid:
                        continue
                    rid = Relation.generate_rid(h_cid, t_cid)
                    if rid not in rid2relation:
                        rid2relation[rid] = Relation(h_cid, t_cid)
                    # Each weight is scaled by both the head instance score
                    # and the tail concept score before being merged in
                    # (presumably `update` accumulates -- TODO confirm).
                    rid2relation[rid].update(
                        {k: v * instance_score * score for k, v in relation.relations.items()})
    return rid2relation

In [3]:
# Build the Probase-backed conceptualizer used by the per-threshold loop.
# Construction loads the Probase file and builds an index, which took about
# 200 s in the recorded run.
# NOTE(review): `probase_topk` presumably caps the number of Probase concepts
# considered per instance -- confirm against ProbaseASERConceptualizer.
aser_conceptualizer = ProbaseASERConceptualizer(
    probase_path="/home/xliucr/probase/data-concept-instance-relations-demo.txt",
    probase_topk=5
)

[probase-concept] Loading Probase files...
[probase-concept] Building index...


100%|██████████| 33377320/33377320 [03:04<00:00, 180769.39it/s]


[probase-concept] Loading data finished in 198.48 s


In [None]:
# For each frequency threshold:
#   1. keep the eventualities with frequency >= threshold and the relations
#      whose head and tail both survive;
#   2. write the filtered KG.db if it does not exist yet;
#   3. conceptualize the surviving eventualities and store the concepts,
#      concept-instance pairs and concept-concept relations in concept.db,
#      plus two tab-separated score files.
# NOTE: the loop variable keeps its original spelling because it is a
# notebook-level global and also appears in the printed label.
for threadshold in [50, 30, 20, 10, 5, 3]:
    st = time.time()
    print("threadshold", threadshold)

    out_dir = "/home/xliucr/ASER/database/core_2.0/%d" % (threadshold)
    # Neither sqlite nor open(..., "w") creates missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

    new_erows = [erow for erow in erows if erow["frequency"] >= threadshold]
    new_eids = {erow["_id"] for erow in new_erows}
    new_rrows = [rrow for rrow in rrows if rrow["hid"] in new_eids and rrow["tid"] in new_eids]
    print("\t# eventualities", sum(erow["frequency"] for erow in new_erows))
    print("\t# unique eventualities", len(new_erows))
    print("\t# relations", sum(rfreqs[rrow["_id"]] for rrow in new_rrows))
    print("\t# unique relations", len(new_rrows))

    kg_db_path = os.path.join(out_dir, "KG.db")
    if not os.path.exists(kg_db_path):
        new_kg_conn = SqliteDBConnection(kg_db_path, CHUNKSIZE)
        kg_written = False
        try:
            for table_name, columns, column_types in zip(
                [EVENTUALITY_TABLE_NAME, RELATION_TABLE_NAME],
                [EVENTUALITY_COLUMNS, RELATION_COLUMNS],
                [EVENTUALITY_COLUMN_TYPES, RELATION_COLUMN_TYPES]):
                if len(columns) == 0 or len(column_types) == 0:
                    raise NotImplementedError("Error: %s_columns and %s_column_types must be defined" % (table_name, table_name))
                try:
                    new_kg_conn.create_table(table_name, columns, column_types)
                except Exception as e:
                    # Best effort: report and continue. `Exception` (not
                    # `BaseException`) so Ctrl-C / SystemExit still stop the run.
                    print(e)
            new_kg_conn.insert_rows(EVENTUALITY_TABLE_NAME, new_erows)
            new_kg_conn.insert_rows(RELATION_TABLE_NAME, new_rrows)
            kg_written = True
        finally:
            new_kg_conn.close()
            # A half-written KG.db would be mistaken for a complete one by
            # the existence check above on the next run, so discard it.
            if not kg_written and os.path.exists(kg_db_path):
                os.remove(kg_db_path)

    cid2concept, concept_instance_pairs, cid_to_filter_score = \
        build_concept_instance_table(aser_conceptualizer, new_erows)
    print("\t# unique concepts", len(cid2concept))
    print("\t# unique concept-event relations", len(concept_instance_pairs))

    # NOTE(review): `concept_conn` is intentionally not closed in a `finally`;
    # in "memory" mode close() presumably persists the data, and persisting a
    # partial concept DB after a failure may be worse than not closing --
    # confirm against ASERConceptConnection before adding cleanup.
    concept_conn = ASERConceptConnection(os.path.join(out_dir, "concept.db"), mode="memory")

    # One line per concept: "<cid>\t<filter score with two decimals>".
    with open(os.path.join(out_dir, "concept_cids.txt"), "w") as f:
        for cid, filter_score in cid_to_filter_score.items():
            f.write(cid + "\t" + "{:.2f}".format(filter_score) + "\n")
    concept_conn.insert_concepts(list(cid2concept.values()))
    concept_conn.insert_concept_instance_pairs(concept_instance_pairs)

    rid2relation = build_concept_relation_table(concept_conn, new_rrows)
    print("\t# unique concept-concept relations", len(rid2relation))

    # One line per concept relation: "<rid>\t<sum of its relation weights>".
    with open(os.path.join(out_dir, "concept_rids.txt"), "w") as f:
        for rid, relation in rid2relation.items():
            filter_score = sum(relation.relations.values())
            f.write(rid + "\t" + "{:.2f}".format(filter_score) + "\n")
    concept_conn.insert_relations(rid2relation.values())
    concept_conn.close()

    print("\t", time.time()-st)
    # Drop this threshold's large intermediates before the next (larger)
    # iteration allocates its own.
    del new_erows
    del new_rrows
    del new_eids
    del cid2concept
    del concept_instance_pairs
    del cid_to_filter_score
    del rid2relation
    gc.collect()

threadshold 50
	# eventualities 342798972.0
	# unique eventualities 765366
	# relations 75931308.58750157
	# unique relations 14952353


100%|██████████| 765366/765366 [16:14<00:00, 785.41it/s] 


	# unique concepts 493341
	# unique concept-event relations 2137477


100%|██████████| 493341/493341 [12:21<00:00, 665.41it/s]   


	# unique concept-concept relations 26903977
	 4155.465067863464
threadshold 30
	# eventualities 366048173.0
	# unique eventualities 1384852
	# relations 83943931.2210394
	# unique relations 17317282


100%|██████████| 1384852/1384852 [33:14<00:00, 694.25it/s]  


	# unique concepts 999173
	# unique concept-event relations 4177712


100%|██████████| 999173/999173 [15:42<00:00, 1060.14it/s]  


	# unique concept-concept relations 35657024
	 6386.893039464951
threadshold 20
	# eventualities 387674191.0
	# unique eventualities 2296550
	# relations 92131607.5083853
	# unique relations 19411914


100%|██████████| 2296550/2296550 [1:13:55<00:00, 517.80it/s]   


	# unique concepts 1895173
	# unique concept-event relations 7445421


100%|██████████| 1895173/1895173 [19:57<00:00, 1582.71it/s] 


	# unique concept-concept relations 45070336
	 10522.566078662872
threadshold 10
	# eventualities 430580143.0
	# unique eventualities 5547833
	# relations 109973223.77938814
	# unique relations 23759575


100%|██████████| 5547833/5547833 [4:39:43<00:00, 330.55it/s]    


	# unique concepts 5470955
	# unique concept-event relations 20238811


100%|██████████| 5470955/5470955 [35:33<00:00, 2563.95it/s]   


	# unique concept-concept relations 70836040
	 26441.35639357567
threadshold 5
	# eventualities 483074809.0
	# unique eventualities 13766746
	# relations 134482973.44682288
	# unique relations 30309830


100%|██████████| 13766746/13766746 [21:41:23<00:00, 176.31it/s]    


	# unique concepts 15640017
	# unique concept-event relations 56316417


100%|██████████| 15640017/15640017 [1:15:14<00:00, 3464.43it/s]  


	# unique concept-concept relations 127825587
