In [1]:
import sys
from random import sample

import networkx as nx
import numpy as np
from tqdm import tqdm, trange

sys.path.append('../')
from utils.aser_to_glucose import generate_aser_to_glucose_dict
from utils.glucose_utils import glucose_subject_list

In [2]:
# Load the filtered ASER graph together with its node <-> id lookup tables.
# NOTE(review): both loaders unpickle their input (read_gpickle, allow_pickle=True),
# which can execute arbitrary code -- only point them at trusted files.
aser = nx.read_gpickle('../../data/ASER_data//G_aser_core.pickle')
node2id_dict = np.load("../../dataset/ASER_core_node2id.npy", allow_pickle=True).item()
# Inverse lookup: id -> node, built from the node -> id table loaded above.
id2node_dict = {node_id: node for node, node_id in node2id_dict.items()}

In [3]:
# We test the coverage in the norm ASER
# NOTE(review): `glucose_matching` is never defined in the cells shown in this notebook,
# so this cell only works if it already exists in the kernel -- load it explicitly first.
def _percent(part, whole):
    """Return `part` as a percentage of `whole`, rounded to one decimal place.

    Scaling by 100 *before* rounding avoids float artefacts such as
    58.599999999999994 that `round(x, 3) * 100` produces. An empty group
    (`whole == 0`) reports 0.0 instead of raising ZeroDivisionError.
    """
    return round(100 * part / whole, 1) if whole else 0.0


print_str = "\n\nStatistics in ASER_Norm:\n\n"
total_count, total_head, total_tail, total_both = 0, 0, 0, 0
for i in trange(1, 11):
    list_count, list_head, list_tail, list_both = len(glucose_matching[i]), 0, 0, 0
    for ind in range(list_count):
        entry = glucose_matching[i][ind]
        # An entry is counted once as soon as any one of its candidates is a known ASER node.
        if any(h in node2id_dict for h in entry['total_head']):
            list_head += 1
        if any(t in node2id_dict for t in entry['total_tail']):
            list_tail += 1
        # For "both", head and tail of the same candidate pair must be known together.
        if any(h in node2id_dict and t in node2id_dict for h, t in entry['both']):
            list_both += 1
    print_str += (
        "In list {}, Total Head: {}\tMatched Head: {} ({}%)\tMatched Tail: {} ({}%)\tMatched Both: {} ({}%)\n"
            .format(i, list_count, list_head, _percent(list_head, list_count),
                    list_tail, _percent(list_tail, list_count), list_both,
                    _percent(list_both, list_count)))
    total_count += list_count
    total_head += list_head
    total_tail += list_tail
    total_both += list_both
print_str += (
    "\n\nIn total: Total Head: {}\tMatched Head: {} ({}%)\tMatched Tail: {} ({}%)\tMatched Both: {} ({}%)".format(
        total_count, total_head, _percent(total_head, total_count),
        total_tail, _percent(total_tail, total_count), total_both, _percent(total_both, total_count)))
print(print_str)

100%|██████████| 10/10 [00:01<00:00,  5.77it/s]



Statistics in ASER_Norm:

In list 1, Total Head: 55388	Matched Head: 32439 (58.599999999999994%)	Matched Tail: 28957 (52.300000000000004%)	Matched Both: 18135 (32.7%)
In list 2, Total Head: 37127	Matched Head: 26271 (70.8%)	Matched Tail: 19019 (51.2%)	Matched Both: 14582 (39.300000000000004%)
In list 3, Total Head: 30456	Matched Head: 18534 (60.9%)	Matched Tail: 15615 (51.300000000000004%)	Matched Both: 9361 (30.7%)
In list 4, Total Head: 29269	Matched Head: 25547 (87.3%)	Matched Tail: 15445 (52.800000000000004%)	Matched Both: 13846 (47.3%)
In list 5, Total Head: 22011	Matched Head: 16544 (75.2%)	Matched Tail: 11867 (53.900000000000006%)	Matched Both: 9312 (42.3%)
In list 6, Total Head: 50312	Matched Head: 26641 (53.0%)	Matched Tail: 27359 (54.400000000000006%)	Matched Both: 15525 (30.9%)
In list 7, Total Head: 36613	Matched Head: 20034 (54.7%)	Matched Tail: 31474 (86.0%)	Matched Both: 17389 (47.5%)
In list 8, Total Head: 16183	Matched Head: 8372 (51.7%)	Matched Tail: 8628 (53.300000




In [4]:
# Do some ASER edge type statistics
# Tally how often each label appears across the "edge_type" entries of all edges.
all_edge_types = {}
for _, _, edge_attrs in aser.edges.data():
    for rel in edge_attrs["edge_type"]:
        all_edge_types[rel] = all_edge_types.get(rel, 0) + 1
print("Edge types in ASER:")
print(all_edge_types)

Edge types in ASER:
{'stative': 9307934, 'cause': 20596819, 'effect': 20596819}


In [5]:
def reverse_px_py(original: str) -> str:
    """Swap every "PersonX" with "PersonY" (and vice versa) in `original`.

    The swap uses no temporary placeholder tokens: the text is cut at each
    "PersonX", the "PersonY" occurrences inside the remaining pieces are
    renamed to "PersonX", and the pieces are glued back together with
    "PersonY". Text that already contains a literal "[PX]" or "[PY]" is
    therefore left untouched instead of being mistaken for a placeholder.

    :param original: text that may mention PersonX and/or PersonY
    :return: the text with the two role names exchanged
    """
    return "PersonY".join(piece.replace("PersonY", "PersonX") for piece in original.split("PersonX"))

In [6]:
def _add_relation(graph: nx.DiGraph, source: str, target: str, relation: str) -> None:
    """Record `relation` on edge (source, target), creating the edge if it does not exist.

    The edge's "relation" attribute is kept as a duplicate-free list in
    first-seen order (deterministic, unlike a round trip through `set`).
    """
    if graph.has_edge(source, target):
        known = graph[source][target]["relation"]
        if relation not in known:
            known.append(relation)
    else:
        graph.add_edge(source, target, relation=[relation])


def get_conceptualized_graph(G: nx.DiGraph) -> nx.DiGraph:
    """Build the conceptualized graph from an id-keyed graph `G`.

    For every edge of `G`, head and tail ids are resolved through
    `id2node_dict`, converted with `generate_aser_to_glucose_dict`, and the
    converted pair plus its PersonX/PersonY-swapped twin are added as edges
    of the result. Each entry of the edge's "edge_type" is stored with a
    suffix derived from the first token of head and tail:

    * ``_agent``   -- same first token, and it is in `glucose_subject_list`
    * ``_theme``   -- different first tokens, both in `glucose_subject_list`
    * ``_general`` -- everything else

    :param G: graph whose nodes are ids found in `id2node_dict` and whose
        edges carry an iterable "edge_type" attribute
    :return: a new DiGraph whose edges carry a "relation" list
    """
    G_conceptualized = nx.DiGraph()
    for head, tail, feat_dict in tqdm(G.edges.data()):
        head = id2node_dict[head]
        tail = id2node_dict[tail]
        head_subj = head.split()[0]
        tail_subj = tail.split()[0]
        relations = list(feat_dict["edge_type"])
        if not relations:
            # Nothing to record for this edge; skip the conversion work entirely.
            continue

        # The suffix depends only on the two subjects, not on the relation itself.
        if head_subj == tail_subj and head_subj in glucose_subject_list:
            suffix = "_agent"
        elif head_subj != tail_subj and head_subj in glucose_subject_list and tail_subj in glucose_subject_list:
            suffix = "_theme"
        else:
            suffix = "_general"

        # Loop-invariant: the conversion and its swapped twin are the same for every relation
        # of this edge (assumes generate_aser_to_glucose_dict is deterministic -- TODO confirm).
        _, re_head, re_tail, _ = generate_aser_to_glucose_dict(head, tail, True)
        re_head_reverse, re_tail_reverse = reverse_px_py(re_head), reverse_px_py(re_tail)

        # Only pairs whose two sides are both non-empty become edges.
        node_pairs = []
        if len(re_head) > 0 and len(re_tail) > 0:
            node_pairs.append((re_head, re_tail))
        if len(re_head_reverse) > 0 and len(re_tail_reverse) > 0:
            node_pairs.append((re_head_reverse, re_tail_reverse))

        for r in relations:
            new_rel = r + suffix
            for source, target in node_pairs:
                _add_relation(G_conceptualized, source, target, new_rel)
    return G_conceptualized

In [7]:
aser_conceptualized = get_conceptualized_graph(aser)
# Compare graph sizes on either side of the conceptualization step.
edges_before, nodes_before = len(aser.edges), len(aser.nodes)
edges_after, nodes_after = len(aser_conceptualized.edges), len(aser_conceptualized.nodes)
print("Before Conceptualization:\nNumber of Edges: {}\tNumber of Nodes: {}\n".format(edges_before, nodes_before))
print("After Conceptualization:\nNumber of Edges: {}\tNumber of Nodes: {}\n".format(edges_after, nodes_after))

100%|██████████| 24764534/24764534 [23:01<00:00, 17922.21it/s] 


Before Conceptualization:
Number of Edges: 24764534	Number of Nodes: 40339576

After Conceptualization:
Number of Edges: 41336290	Number of Nodes: 11872745



In [8]:
# Persist the conceptualized ASER graph to disk.
# NOTE(review): nx.write_gpickle was deprecated in NetworkX 2.6 and removed in 3.0 --
# confirm the pinned NetworkX version before re-running.
nx.write_gpickle(aser_conceptualized, '../../dataset/G_aser_concept.pickle')

In [9]:
# Let's sample some ASER conceptualization to check whether it's correct
# Draw the 30 edges first and the 10 nodes second; no seed is set, so each run prints a different sample.
sampled_edges = sample(list(aser_conceptualized.edges.data()), 30)
sampled_nodes = sample(list(aser_conceptualized.nodes.data()), 10)
for item in [*sampled_edges, '\n', *sampled_nodes]:
    print(item)

('the establishment be very clean', 'the food be amazingly fresh', {'relation': ['stative_general', 'effect_general', 'cause_general']})
('PersonY hope so', 'PersonY be count on it', {'relation': ['cause_agent', 'effect_agent']})
('PersonX want to talk to PersonY', 'PersonY go back to class', {'relation': ['effect_theme', 'cause_theme']})
('it be a poor workplace culture', 'PersonX be comfortable', {'relation': ['stative_general', 'cause_general']})
('PersonX be seat', 'the waitress sit PersonX', {'relation': ['effect_general', 'cause_general']})
('the queen be drain the treasury of state', 'the king indulge pleasure to excess', {'relation': ['effect_general', 'cause_general']})
('PeopleX will likely be permabann', 'PersonY upload a virus to site', {'relation': ['cause_general']})
('the port close', 'there be a sharp snap', {'relation': ['effect_general', 'cause_general']})
('PeopleX be fantastic', 'PersonX have be go there', {'relation': ['effect_general', 'cause_general']})
('buddhis

In [10]:
# Now let's calculate the shortest path
def get_shortest_path(G, head, tail):
    """Return the shortest-path length from `head` to `tail` in `G`, or -1 on failure.

    -1 is returned both when NetworkX reports a missing node (NodeNotFound)
    and when it reports that no path exists (NetworkXNoPath).
    """
    try:
        return nx.shortest_path_length(G, source=head, target=tail)
    except (nx.NodeNotFound, nx.NetworkXNoPath):
        return -1

In [11]:
# For every Glucose entry keep the smallest strictly-positive path length among its
# candidate (head, tail) pairs; 0 is recorded when no candidate yields a positive length.
full_path, norm_path = [], []
for i in range(1, 11):
    for ind in trange(len(glucose_matching[i])):
        candidate_pairs = glucose_matching[i][ind]['both']

        # Conceptualized graph: nodes are looked up by their converted text.
        norm_temp = []
        for h, t in candidate_pairs:
            _, re_h, re_t, _ = generate_aser_to_glucose_dict(h, t, True)
            if re_h in aser_conceptualized and re_t in aser_conceptualized:
                norm_temp.append(get_shortest_path(aser_conceptualized, re_h, re_t))
        norm_path.append(min((length for length in norm_temp if length > 0), default=0))

        # Original ASER graph: nodes are looked up by id; pairs without an id are skipped.
        full_temp = []
        for h, t in candidate_pairs:
            if h not in node2id_dict or t not in node2id_dict:
                continue
            hid, tid = node2id_dict[h], node2id_dict[t]
            if hid in aser and tid in aser:
                full_temp.append(get_shortest_path(aser, hid, tid))
        full_path.append(min((length for length in full_temp if length > 0), default=0))

# Entries recorded as 0 are excluded from the averages.
positive_full = [length for length in full_path if length > 0]
positive_norm = [length for length in norm_path if length > 0]
print("Average Shortest Path in Full ASER is: {}".format(np.mean(positive_full)))
print("Average Shortest Path in Norm ASER is: {}".format(np.mean(positive_norm)))

100%|██████████| 55388/55388 [01:30<00:00, 610.27it/s] 
100%|██████████| 37127/37127 [00:42<00:00, 873.48it/s] 
100%|██████████| 30456/30456 [02:23<00:00, 212.43it/s]
100%|██████████| 29269/29269 [01:16<00:00, 381.58it/s]
100%|██████████| 22011/22011 [00:38<00:00, 569.58it/s]
100%|██████████| 50312/50312 [01:14<00:00, 674.72it/s] 
100%|██████████| 36613/36613 [01:11<00:00, 509.46it/s]
100%|██████████| 16183/16183 [00:17<00:00, 939.47it/s] 
100%|██████████| 12855/12855 [00:36<00:00, 351.81it/s]
100%|██████████| 13885/13885 [00:23<00:00, 598.22it/s]

Average Shortest Path in Full ASER is: 2.717402138455962
Average Shortest Path in Norm ASER is: 2.5518495605427085





In [12]:
# Calculate the average path in a simple graph:
# Copy nodes and edges of both graphs into undirected nx.Graph instances.
G_simple = nx.Graph()
G_simple.add_nodes_from(aser_conceptualized)
G_simple.add_edges_from(aser_conceptualized.edges.data())
G_simple_full = nx.Graph()
G_simple_full.add_nodes_from(aser)
G_simple_full.add_edges_from(aser.edges.data())

# Same measurement as the directed case: per Glucose entry, the smallest strictly-positive
# path length over its candidate pairs, or 0 when there is none.
full_path, norm_path = [], []
for i in range(1, 11):
    for ind in trange(len(glucose_matching[i])):
        pairs = glucose_matching[i][ind]['both']

        # Undirected conceptualized graph, addressed by converted node text.
        norm_temp = []
        for h, t in pairs:
            _, re_h, re_t, _ = generate_aser_to_glucose_dict(h, t, True)
            if re_h in G_simple and re_t in G_simple:
                norm_temp.append(get_shortest_path(G_simple, re_h, re_t))
        norm_path.append(min((dist for dist in norm_temp if dist > 0), default=0))

        # Undirected original graph, addressed by node id; pairs without an id are skipped.
        full_temp = []
        for h, t in pairs:
            if h not in node2id_dict or t not in node2id_dict:
                continue
            hid, tid = node2id_dict[h], node2id_dict[t]
            if hid in G_simple_full and tid in G_simple_full:
                full_temp.append(get_shortest_path(G_simple_full, hid, tid))
        full_path.append(min((dist for dist in full_temp if dist > 0), default=0))

print("In No Direction Scenario:")
# Entries recorded as 0 are excluded from the averages.
reachable_full = [dist for dist in full_path if dist > 0]
reachable_norm = [dist for dist in norm_path if dist > 0]
print("Average Shortest Path in Full ASER is: {}".format(np.mean(reachable_full)))
print("Average Shortest Path in Norm ASER is: {}".format(np.mean(reachable_norm)))

100%|██████████| 55388/55388 [01:26<00:00, 638.28it/s] 
100%|██████████| 37127/37127 [00:36<00:00, 1022.80it/s]
100%|██████████| 30456/30456 [02:18<00:00, 219.31it/s]
100%|██████████| 29269/29269 [01:10<00:00, 414.29it/s]
100%|██████████| 22011/22011 [00:33<00:00, 648.41it/s] 
100%|██████████| 50312/50312 [01:09<00:00, 719.88it/s] 
100%|██████████| 36613/36613 [01:08<00:00, 533.21it/s]
100%|██████████| 16183/16183 [00:14<00:00, 1112.10it/s]
100%|██████████| 12855/12855 [00:32<00:00, 400.39it/s]
100%|██████████| 13885/13885 [00:21<00:00, 644.94it/s]


In No Direction Scenario:
Average Shortest Path in Full ASER is: 2.7156457656474635
Average Shortest Path in Norm ASER is: 2.5507897792026584


In [13]:
# Now let's start merging with Glucose
# NOTE(review): read_gpickle unpickles the file, so it must come from a trusted source.
G_Glucose = nx.read_gpickle('../../dataset/G_Glucose.pickle')
# Count the Glucose nodes / edges that already exist in the conceptualized ASER graph.
glucose_nodes_covered = sum(node in aser_conceptualized for node in G_Glucose.nodes())
glucose_edges_covered = sum(edge in aser_conceptualized.edges for edge in G_Glucose.edges())
# Scale to percent *before* rounding: rounding the ratio first and multiplying by 100
# afterwards can print float artefacts (e.g. 58.599999999999994 instead of 58.6).
print("Node Coverage for Glucose Graph is: {}%\nEdge Coverage for Glucose Graph is: {}%".format(
    round(100 * glucose_nodes_covered / len(G_Glucose.nodes()), 2),
    round(100 * glucose_edges_covered / len(G_Glucose.edges()), 2)))

Node Coverage for Glucose Graph is: 57.25%
Edge Coverage for Glucose Graph is: 1.63%


In [14]:
# Report the graph size, merge the Glucose graph into `aser_conceptualized` in place,
# then report the size again. From here on the variable holds ASER + Glucose.
print("Before Merging:\nEdges in ASER: {}\t\t\t\tNodes in ASER: {}\n".format(
    len(aser_conceptualized.edges()), len(aser_conceptualized.nodes())))
glucose_nodes = list(G_Glucose.nodes.data())
glucose_edges = list(G_Glucose.edges.data())
aser_conceptualized.add_nodes_from(glucose_nodes)
aser_conceptualized.add_edges_from(glucose_edges)
print("\nAfter Merging:\nEdges in ASER+Glucose: {}\t\t\tNodes in ASER+Glucose: {}".format(
    len(aser_conceptualized.edges()), len(aser_conceptualized.nodes())))

Before Merging:
Edges in ASER: 41336290				Nodes in ASER: 11872745


After Merging:
Edges in ASER+Glucose: 41568751			Nodes in ASER+Glucose: 11912389


In [15]:
# Pre-merge sizes of the conceptualized graph, copied by hand from the
# "Before Merging" output printed earlier in this notebook.
# NOTE(review): these go stale whenever the upstream data changes -- capture the
# counts in code before merging instead of hard-coding them.
EDGES_BEFORE_MERGE = 41336290
NODES_BEFORE_MERGE = 11872745
edges_added = len(aser_conceptualized.edges()) - EDGES_BEFORE_MERGE
nodes_added = len(aser_conceptualized.nodes()) - NODES_BEFORE_MERGE
print("New Edges: {}\tNew Nodes: {}".format(edges_added, nodes_added))
# Persist the merged ASER + Glucose graph.
nx.write_gpickle(aser_conceptualized, '../../dataset/G_aser_glucose.pickle')

New Edges: 232461	New Nodes: 39644
