In [2]:
import os
import pickle
import shutil
import gc
import bisect
import copy
import time
from tqdm import tqdm
from collections import defaultdict, Counter
from aser.database.base import SqliteDBConnection, MongoDBConnection
from aser.database.kg_connection import CHUNKSIZE
from aser.database.kg_connection import EVENTUALITY_TABLE_NAME, EVENTUALITY_COLUMNS, EVENTUALITY_COLUMN_TYPES
from aser.database.kg_connection import RELATION_TABLE_NAME, RELATION_COLUMNS, RELATION_COLUMN_TYPES
from aser.concept.concept_extractor import ASERConceptExtractor
from aser.concept.concept_connection import ASERConceptConnection, CONCEPT_TABLE_NAME, CONCEPT_COLUMNS, CONCEPT_COLUMN_TYPES
from aser.eventuality import Eventuality
from aser.relation import Relation, relation_senses

In [3]:
# Names of the corpora that the statistics cells iterate over.
datasets = ["yelp", "nyt", "wikipedia", "reddit", "subtitles", "gutenberg"]

# Column-restricted SELECT statements for the eventuality, relation and
# concept tables.
select_eventualities = "SELECT %s FROM %s;" % (
    ",".join(("_id", "pattern", "frequency")),
    EVENTUALITY_TABLE_NAME,
)
select_relations = "SELECT %s FROM %s;" % (
    ",".join(["_id"] + relation_senses),
    RELATION_TABLE_NAME,
)
select_concepts = "SELECT %s FROM %s;" % (
    ",".join(("_id", "pattern", "info")),
    CONCEPT_TABLE_NAME,
)

### Data Statistics

In [12]:
import json
import os
from multiprocessing import Pool

def iter_files(path):
    """Yield every file path located at or below ``path``.

    A regular file yields just itself. A directory is traversed with
    ``os.walk`` and every file found is yielded as ``dirpath/filename``.

    Raises:
        RuntimeError: if ``path`` is neither a file nor a directory. Because
            this is a generator, the error surfaces on first iteration.
    """
    if os.path.isfile(path):
        yield path
        return
    if not os.path.isdir(path):
        raise RuntimeError('Path %s is invalid' % path)
    for root, _subdirs, names in os.walk(path):
        yield from (os.path.join(root, name) for name in names)
        
def get_cnt(filename):
    """Count sentences and tokens stored in one JSON-lines file.

    The first line of the file is read and discarded, so it never
    contributes to the counts. Every remaining line is parsed as JSON and
    adds one sentence plus ``len(record["tokens"])`` tokens.

    Returns:
        A ``(sent_cnt, token_cnt)`` tuple.
    """
    sent_cnt = 0
    token_cnt = 0
    with open(filename, "r") as f:
        # NOTE(review): presumably the first line is a header/metadata
        # record rather than a sentence - confirm against the file format.
        next(f, None)
        for raw in f:
            record = json.loads(raw)
            token_cnt += len(record["tokens"])
            sent_cnt += 1
    return sent_cnt, token_cnt

def get_cnt_parallel(filenames):
    """Sum sentence and token counts over many files with a process pool.

    Every file is submitted to ``get_cnt`` in a worker process and the
    per-file ``(sent_cnt, token_cnt)`` results are added together.

    Args:
        filenames: iterable of file paths to count.

    Returns:
        A ``(sent_cnt, token_cnt)`` tuple with the totals over all files;
        ``(0, 0)`` when ``filenames`` is empty.
    """
    filenames = list(filenames)
    if not filenames:
        # Nothing to count, so do not start any worker process.
        return 0, 0

    # Leave two cores free but always keep at least one worker:
    # os.cpu_count() may return None and Pool() rejects sizes below 1.
    num_workers = max(1, (os.cpu_count() or 1) - 2)
    # More workers than files would only sit idle.
    num_workers = min(num_workers, len(filenames))

    sent_cnt, token_cnt = 0, 0
    with Pool(num_workers) as pool:
        results = [pool.apply_async(get_cnt, args=(filename,)) for filename in filenames]
        pool.close()
        for result in results:
            # .get() blocks until the worker finishes and re-raises its errors.
            file_sent_cnt, file_token_cnt = result.get()
            sent_cnt += file_sent_cnt
            token_cnt += file_token_cnt

    return sent_cnt, token_cnt

In [None]:
# Accumulate file, sentence and token counts over every dataset.
total_file_cnt, total_sent_cnt, total_token_cnt = 0, 0, 0
for data in datasets:
    data_path = "/home/data/corpora/aser/data/%s/parsed_para/" % (data)
    print(data_path)

    files = list(filter(lambda f: f.endswith(".jsonl"), iter_files(data_path)))
    # file_cnt has to be defined before it is added to the running total.
    file_cnt = len(files)
    sent_cnt, token_cnt = get_cnt_parallel(files)

    total_file_cnt += file_cnt
    total_sent_cnt += sent_cnt
    total_token_cnt += token_cnt

    print("# number files", file_cnt)
    print("# number sentences", sent_cnt)
    print("# number tokens", token_cnt)

# Alias kept so code that still uses the earlier, misspelled name keeps working.
totoal_file_cnt = total_file_cnt

In [14]:
# File, sentence and token counts for the parsed NYT corpus.
nyt_path = "/home/data/corpora/aser/data/nyt/parsed_para/"

# Only the .jsonl files below the corpus root are counted.
nyt_files = [f for f in iter_files(nyt_path) if f.endswith(".jsonl")]

nyt_sent_cnt, nyt_token_cnt = get_cnt_parallel(nyt_files)

print("# number files", len(nyt_files))
print("# number sentences", nyt_sent_cnt)
print("# number tokens", nyt_token_cnt)

# number files 1827496
# number sentences 49813108
# number tokens 1179368352


In [15]:
# File, sentence and token counts for the parsed Yelp corpus.
yelp_path = "/home/data/corpora/aser/data/yelp/parsed_para/"

# Only the .jsonl files below the corpus root are counted.
yelp_files = [f for f in iter_files(yelp_path) if f.endswith(".jsonl")]

yelp_sent_cnt, yelp_token_cnt = get_cnt_parallel(yelp_files)

print("# number files", len(yelp_files))
print("# number sentences", yelp_sent_cnt)
print("# number tokens", yelp_token_cnt)

# number files 2224
# number sentences 54484346
# number tokens 838760304


In [16]:
# File, sentence and token counts for the parsed Wikipedia corpus.
wikipedia_path = "/home/data/corpora/aser/data/wikipedia/parsed_para/"

# Only the .jsonl files below the corpus root are counted.
wikipedia_files = [f for f in iter_files(wikipedia_path) if f.endswith(".jsonl")]

wikipedia_sent_cnt, wikipedia_token_cnt = get_cnt_parallel(wikipedia_files)

print("# number files", len(wikipedia_files))
print("# number sentences", wikipedia_sent_cnt)
print("# number tokens", wikipedia_token_cnt)

# number files 5620161
# number sentences 110612107
# number tokens 2435386651


In [17]:
# File, sentence and token counts for the parsed Reddit corpus.
reddit_path = "/home/data/corpora/aser/data/reddit/parsed_para/"

# Only the .jsonl files below the corpus root are counted.
reddit_files = [f for f in iter_files(reddit_path) if f.endswith(".jsonl")]

reddit_sent_cnt, reddit_token_cnt = get_cnt_parallel(reddit_files)

print("# number files", len(reddit_files))
print("# number sentences", reddit_sent_cnt)
print("# number tokens", reddit_token_cnt)

# number files 994
# number sentences 253633883
# number tokens 3371328232


In [18]:
# File, sentence and token counts for the parsed Gutenberg corpus.
gutenberg_path = "/home/data/corpora/aser/data/gutenberg/parsed_para/"

# Only the .jsonl files below the corpus root are counted.
gutenberg_files = [f for f in iter_files(gutenberg_path) if f.endswith(".jsonl")]

gutenberg_sent_cnt, gutenberg_token_cnt = get_cnt_parallel(gutenberg_files)

print("# number files", len(gutenberg_files))
print("# number sentences", gutenberg_sent_cnt)
print("# number tokens", gutenberg_token_cnt)

# number files 187350
# number sentences 210630737
# number tokens 3610017037


In [19]:
# File, sentence and token counts for the parsed subtitles corpus.
subtitles_path = "/home/data/corpora/aser/data/subtitles/parsed_para/"

# Only the .jsonl files below the corpus root are counted.
subtitles_files = [f for f in iter_files(subtitles_path) if f.endswith(".jsonl")]

subtitles_sent_cnt, subtitles_token_cnt = get_cnt_parallel(subtitles_files)

print("# number files", len(subtitles_files))
print("# number sentences", subtitles_sent_cnt)
print("# number tokens", subtitles_token_cnt)

# number files 1249
# number sentences 444562019
# number tokens 3229353900


### Core

In [2]:
# Eventuality statistics of the core KG: total and unique counts, overall
# and per pattern.
kg_path = "/home/xliucr/ASER/database/core_2.0/all/KG.db"
kg_conn = SqliteDBConnection(kg_path, CHUNKSIZE)

pattern_ctr1 = Counter() # number of eventualities
pattern_ctr2 = Counter() # number of unique eventualities
efreqs = Counter()       # eventuality id -> accumulated frequency

try:
    for erow in kg_conn._conn.execute(select_eventualities):
        eid, pattern, freq = erow
        efreqs[eid] += freq
        pattern_ctr1[pattern] += freq
        pattern_ctr2[pattern] += 1
finally:
    # Release the database handle even when the scan fails part-way.
    kg_conn.close()
    del kg_conn

print("number of eventualities", sum(efreqs.values()))
print("number of unique eventualities", len(efreqs))
for i, pattern in enumerate(sorted(pattern_ctr1.keys())):
    print("No. %d pattern %s" % (i, pattern))
    print("\tnumber of eventualities", pattern_ctr1[pattern])
    print("\tnumber of unique eventualities", pattern_ctr2[pattern])

gc.collect()

number of eventualities 579123901.0
number of unique eventualities 52940258
No. 0 pattern s-be-a
	number of eventualities 52068570.0
	number of unique eventualities 3733978
No. 1 pattern s-be-o
	number of eventualities 49979659.0
	number of unique eventualities 6337042
No. 2 pattern s-v
	number of eventualities 260663083.0
	number of unique eventualities 14337769
No. 3 pattern s-v-a
	number of eventualities 5951980.0
	number of unique eventualities 752468
No. 4 pattern s-v-be-a
	number of eventualities 1035864.0
	number of unique eventualities 123263
No. 5 pattern s-v-be-o
	number of eventualities 909250.0
	number of unique eventualities 184298
No. 6 pattern s-v-o
	number of eventualities 139031585.0
	number of unique eventualities 18100360
No. 7 pattern s-v-o-be-a
	number of eventualities 100917.0
	number of unique eventualities 18793
No. 8 pattern s-v-o-be-o
	number of eventualities 95815.0
	number of unique eventualities 22411
No. 9 pattern s-v-o-o
	number of eventualities 2765728.0

In [12]:
# Relation statistics of the core KG: total, unique and major counts,
# overall and per sense.
kg_path = "/home/xliucr/ASER/database/core_2.0/all/KG.db"
kg_conn = SqliteDBConnection(kg_path, CHUNKSIZE)

rfreqs = Counter()        # relation id -> summed frequency over all senses
relation_ctr1 = Counter() # number of relations
relation_ctr2 = Counter() # number of unique relations
relation_ctr3 = Counter() # number of major relations

try:
    for rrow in kg_conn._conn.execute(select_relations):
        sum_freq = 0.0
        max_r, max_freq = -1, -1
        # rrow[0] is the relation id; the remaining columns are the per-sense
        # frequencies in relation_senses order. Only positive ones are counted.
        for r, freq in filter(lambda x: x[1] > 0, zip(range(15), rrow[1:])):
            sum_freq += freq
            relation_ctr1[r] += freq
            relation_ctr2[r] += 1
            # Sense index 14 never competes on frequency: it becomes the major
            # sense only when no earlier sense had a positive frequency.
            # NOTE(review): presumably 14 is the co-occurrence sense - confirm
            # against relation_senses.
            if freq > max_freq and r != 14:
                max_r, max_freq = r, freq
            elif r == 14 and max_r == -1:
                max_r, max_freq = r, freq
        # A row without any positive frequency has no major sense. Counting it
        # under -1 would later be translated through relation_senses[-1] and
        # be credited to (or overwrite) the last sense.
        if max_r != -1:
            relation_ctr3[max_r] += 1
        rfreqs[rrow[0]] += sum_freq
finally:
    # Release the database handle even when the scan fails part-way.
    kg_conn.close()
    del kg_conn

# Re-key the counters from sense index to sense name.
for r in list(relation_ctr1.keys()):
    relation_ctr1[relation_senses[r]] = relation_ctr1.pop(r)
for r in list(relation_ctr2.keys()):
    relation_ctr2[relation_senses[r]] = relation_ctr2.pop(r)
for r in list(relation_ctr3.keys()):
    relation_ctr3[relation_senses[r]] = relation_ctr3.pop(r)

print("number of relations", sum(rfreqs.values()))
print("number of unique relations", len(rfreqs))
for i, sense in enumerate(relation_senses):
    print("No. %d relation %s" % (i, sense))
    print("\tnumber of relations", relation_ctr1[sense])
    print("\tnumber of unique relations", relation_ctr2[sense])
    print("\tnumber of major relations", relation_ctr3[sense])

gc.collect()

number of relations 187018926.27712885
number of unique relations 52296498
No. 0 relation Precedence
	number of relations 2662402.502265151
	number of unique relations 1790016
	number of major relations 1646487
No. 1 relation Succession
	number of relations 886728.4309463338
	number of unique relations 663183
	number of major relations 617887
No. 2 relation Synchronous
	number of relations 4457023.550430128
	number of unique relations 3123042
	number of major relations 2993862
No. 3 relation Reason
	number of relations 2466643.582709942
	number of unique relations 2205076
	number of major relations 2072942
No. 4 relation Result
	number of relations 2422963.728729172
	number of unique relations 2012311
	number of major relations 1826248
No. 5 relation Condition
	number of relations 5507802.535175915
	number of unique relations 3160271
	number of major relations 3000772
No. 6 relation Contrast
	number of relations 8097385.8756461255
	number of unique relations 8655661
	number of major re

0

### Full

In [None]:
# Eventuality statistics of the full KG, per dataset and overall.
pattern_ctr = {data: Counter() for data in datasets} # pattern counter, number of eventualities
pattern_eids = {data: dict() for data in datasets} # pattern eids, number of unique eventualities

pattern_ctr["overall"] = Counter()
pattern_eids["overall"] = dict()

overall_num_events = 0
overall_eids = set()

select_eventualities = "SELECT %s FROM %s;" % (",".join(["_id", "pattern", "frequency"]), EVENTUALITY_TABLE_NAME)

for data in datasets:
    # Each dataset is stored in four separate database files.
    for kg_path in [
        "/home/data/corpora/aser/database/full_0.3/%s/KG1.db" % (data),
        "/home/data/corpora/aser/database/full_0.3/%s/KG2.db" % (data),
        "/home/data/corpora/aser/database/full_0.3/%s/KG3.db" % (data),
        "/home/data/corpora/aser/database/full_0.3/%s/KG4.db" % (data),]:
        print(kg_path)
        kg_conn = SqliteDBConnection(kg_path, CHUNKSIZE)

        try:
            for erow in kg_conn._conn.execute(select_eventualities):
                eid, pattern, frequency = erow

                pattern_ctr[data][pattern] += frequency
                # Collect ids in a set so an eventuality present in several
                # files is counted once per pattern.
                pattern_eids[data].setdefault(pattern, set()).add(eid)
        finally:
            # Release the database handle even when the scan fails part-way.
            kg_conn.close()
            del kg_conn
            gc.collect()

    print("dataset", data)
    num_events = sum(pattern_ctr[data].values())
    pattern_ctr["overall"].update(pattern_ctr[data])
    overall_num_events += num_events
    print("number of eventualities", num_events)
    num_unique_events = sum([len(pattern_eids[data][pattern]) for pattern in pattern_eids[data]])
    print("number of unique eventualities", num_unique_events)
    for i, pattern in enumerate(sorted(pattern_ctr[data].keys())):
        print("No. %d pattern %s" % (i, pattern))
        print("\tnumber of eventualities", pattern_ctr[data][pattern])

        # Merge this dataset's ids into the overall per-pattern id sets; the
        # copy keeps the per-dataset set untouched by later merges.
        if pattern not in pattern_eids["overall"]:
            pattern_eids["overall"][pattern] = copy.copy(pattern_eids[data][pattern])
        else:
            pattern_eids["overall"][pattern].update(pattern_eids[data][pattern])
        print("\tnumber of unique eventualities", len(pattern_eids[data][pattern]))
    print()

print("overall")
print("number of eventualities", overall_num_events)
for pattern in pattern_eids["overall"]:
    overall_eids.update(pattern_eids["overall"][pattern])
print("number of unique eventualities", len(overall_eids))
for i, pattern in enumerate(sorted(pattern_ctr["overall"].keys())):
    print("No. %d pattern %s" % (i, pattern))
    print("\tnumber of eventualities", pattern_ctr["overall"][pattern])
    print("\tnumber of unique eventualities", len(pattern_eids["overall"][pattern]))

In [3]:
# Relation statistics of the full KG, per dataset and overall.
rid2relations = {data: dict() for data in datasets} # relation id -> per-sense frequencies
sense_ctr = {data: Counter() for data in datasets} # number of relations
unique_sense_ctr = {data: Counter() for data in datasets} # number of unique relations
major_sense_ctr = {data: Counter() for data in datasets} # number of major relations

rid2relations["overall"] = dict()
sense_ctr["overall"] = Counter()
unique_sense_ctr["overall"] = Counter()
major_sense_ctr["overall"] = Counter()
overall_num_relations = 0

for data in datasets:
    # Each dataset is stored in four separate database files.
    for kg_path in [
        "/home/data/corpora/aser/database/full_0.3/%s/KG1.db" % (data),
        "/home/data/corpora/aser/database/full_0.3/%s/KG2.db" % (data),
        "/home/data/corpora/aser/database/full_0.3/%s/KG3.db" % (data),
        "/home/data/corpora/aser/database/full_0.3/%s/KG4.db" % (data),]:

        print(kg_path)
        kg_conn = SqliteDBConnection(kg_path, CHUNKSIZE)

        try:
            # Raw SQL is used here in place of
            # kg_conn.get_columns(RELATION_TABLE_NAME, ["_id"] + relation_senses).
            for rrow in kg_conn._conn.execute(select_relations):
                rid = rrow[0]
                if rid not in rid2relations[data]:
                    rid2relations[data][rid] = list(rrow[1:])
                else:
                    # The same relation id appeared in an earlier file:
                    # add the positive per-sense frequencies onto it.
                    relation = rid2relations[data][rid]
                    for r, rf in filter(lambda x: x[1] > 0.0, zip(range(15), rrow[1:])):
                        relation[r] += rf
        finally:
            # Release the database handle even when the scan fails part-way.
            kg_conn.close()
            del kg_conn
            gc.collect()

    for rid, relation in rid2relations[data].items():
        max_r, max_freq = -1, -1
        for r, freq in filter(lambda x: x[1] > 0, zip(range(15), relation)):
            sense_ctr[data][r] += freq
            unique_sense_ctr[data][r] += 1
            # Sense index 14 never competes on frequency: it becomes the major
            # sense only when no earlier sense had a positive frequency.
            # NOTE(review): presumably 14 is the co-occurrence sense - confirm
            # against relation_senses.
            if max_freq < freq and r != 14:
                max_r, max_freq = r, freq
            # Value comparison: `is` on an int literal tests object identity.
            elif r == 14 and max_r == -1:
                max_r, max_freq = r, freq
        major_sense_ctr[data][max_r] += 1

        # Fold this dataset's frequencies into the overall table; the copy
        # keeps the per-dataset list untouched by later additions.
        if rid not in rid2relations["overall"]:
            rid2relations["overall"][rid] = copy.copy(relation)
        else:
            for r, freq in filter(lambda x: x[1] > 0, zip(range(15), relation)):
                rid2relations["overall"][rid][r] += freq

    num_relations = sum(sense_ctr[data].values())
    overall_num_relations += num_relations
    print("number of relations", num_relations)
    num_unique_relations = sum(unique_sense_ctr[data].values())
    print("number of unique relations", num_unique_relations)
    num_major_relations = len(rid2relations[data])
    print("number of major relations", num_major_relations)

    for r in range(15):
        sense = relation_senses[r]
        print("No. %d sense %s" % (r, sense))
        print("\tnumber of relations", sense_ctr[data][r])
        print("\tnumber of unique relations", unique_sense_ctr[data][r])
        print("\tnumber of major relations", major_sense_ctr[data][r])
    print()

# Same per-relation pass as above, now over the merged table of all datasets.
for rid, relation in rid2relations["overall"].items():
    max_r, max_freq = -1, -1
    for r, freq in filter(lambda x: x[1] > 0, zip(range(15), relation)):
        sense_ctr["overall"][r] += freq
        unique_sense_ctr["overall"][r] += 1
        if max_freq < freq and r != 14:
            max_r, max_freq = r, freq
        elif r == 14 and max_r == -1:
            max_r, max_freq = r, freq
    major_sense_ctr["overall"][max_r] += 1

print("overall")
print("number of relations", overall_num_relations)
overall_num_unique_relations = sum(unique_sense_ctr["overall"].values())
print("number of unique relations", overall_num_unique_relations)
overall_num_major_relations = len(rid2relations["overall"])
print("number of major relations", overall_num_major_relations)

for r in range(15):
    sense = relation_senses[r]
    print("No. %d sense %s" % (r, sense))
    print("\tnumber of relations", sense_ctr["overall"][r])
    print("\tnumber of unique relations", unique_sense_ctr["overall"][r])
    print("\tnumber of major relations", major_sense_ctr["overall"][r])

gc.collect()

/home/data/corpora/aser/database/full_0.3/yelp/KG1.db
/home/data/corpora/aser/database/full_0.3/yelp/KG2.db
/home/data/corpora/aser/database/full_0.3/yelp/KG3.db
/home/data/corpora/aser/database/full_0.3/yelp/KG4.db
number of relations 63056766.999986336
number of unique relations 73984410
number of major relations 55590249
No. 0 sense Precedence
	number of relations 530404.9999997587
	number of unique relations 1292904
	number of major relations 1258669
No. 1 sense Succession
	number of relations 434492.99999983737
	number of unique relations 777841
	number of major relations 760382
No. 2 sense Synchronous
	number of relations 1390824.999997214
	number of unique relations 2265689
	number of major relations 2225467
No. 3 sense Reason
	number of relations 786552.0000005767
	number of unique relations 1311133
	number of major relations 1269853
No. 4 sense Result
	number of relations 966290.0000003658
	number of unique relations 2213494
	number of major relations 2146827
No. 5 sense Condi

0