# ASER Normalization

In [None]:
import argparse
import pickle
import re
from collections import Counter
from random import sample

import networkx as nx
from tqdm import tqdm

import normalization

In [None]:
def normalize_personal_words(G: nx.DiGraph, show_possessive=True, n_info=10):
    '''
    Build a new directed graph whose nodes are the normalized forms of the
    nodes of ``G`` (as produced by ``normalization.ParsingBasedNormalizer``).

    Several nodes of ``G`` may normalize to the same new node; in that case
    their ``freq`` values are summed and their ``info`` entries are unioned.
    Likewise, edges that end up between the same pair of normalized nodes are
    merged: per-key ``relations`` values are summed and the string form of
    each pair coreference is collected in a set.

    Parameters:
        G: source graph. Each node is read for a ``freq`` attribute and an
            indexable ``info`` attribute holding strings that ``eval`` to a
            mapping (a ``'frequency'`` key is read when sampling is enabled).
            Each edge is read for a ``relations`` dict.
        show_possessive: forwarded unchanged to
            ``ParsingBasedNormalizer.get_norm_node``.
        n_info: if a positive int, up to ``n_info`` info entries are drawn at
            random per node and the one with the largest ``'frequency'`` is
            used for normalization. Otherwise only the first info entry is
            used (much faster). Sampling uses the global ``random`` state,
            so results are not reproducible unless the caller seeds it.

    Returns:
        A new ``nx.DiGraph``. Nodes carry ``freq``, ``info`` (a set),
        ``people`` and ``p2i`` (both taken from the first source node that
        produced the normalized node). Edges carry ``relations`` (dict) and
        ``coreference`` (set of str). ``G`` itself is not modified.

    TODO:
    -how to utilize all the "info" of a given node? (currently only the first is used/sample "n_info" infos and use the one with largest freq)
    -examine _add_edge, the coref-info!
    '''
    nmlzer = normalization.ParsingBasedNormalizer()

    def _norm_node(node):
        '''Normalize one node of G; return (new_node, p2i, coref, target_info).'''
        node_attr = G.nodes[node]

        # SECURITY: eval() runs arbitrary code contained in the info strings.
        # Only use this on graphs from a trusted source.
        # NOTE(review): if the info strings are pure Python literals,
        # ast.literal_eval would be a safe replacement -- confirm the format.
        if isinstance(n_info, int) and n_info > 0:
            infos = node_attr['info']
            sampled_infos = [eval(raw) for raw in sample(infos, min(len(infos), n_info))]
            target_info = max(sampled_infos, key=lambda x: x['frequency'])
        else:
            # much faster
            target_info = eval(node_attr['info'][0])

        tmp = nmlzer.get_personal_words(target_info)
        coref = nmlzer.node_person_coref(tmp, target_info)

        res = nmlzer.get_norm_node(node, coref, show_possessive)
        return res['norm_node'], res['p2i'], coref, target_info

    def _add_node(G, G_norm, node, new_node, p2i, coref):
        '''Insert new_node into G_norm, or merge freq/info into it if present.'''
        node_attr = G.nodes[node]

        if G_norm.has_node(new_node):
            # update node freq & info; the info set is owned by G_norm
            # (created below), so updating it in place is safe.
            merged = G_norm.nodes[new_node]
            merged['freq'] += node_attr['freq']
            merged['info'].update(node_attr['info'])
        else:
            # add new node to graph, add personal coref info
            G_norm.add_node(new_node, freq=node_attr['freq'],
                            info=set(node_attr['info']),
                            people=coref, p2i=p2i)

    def _merge_rel_dict(d1: dict, d2: dict):
        '''Return a new dict with the per-key sum of d1 and d2 (missing -> 0).'''
        return {key: d1.get(key, 0) + d2.get(key, 0)
                for key in set(d1.keys()) | set(d2.keys())}

    def _add_edge(G_norm, edge_attr, norm_head, norm_tail, pair_coref):
        '''Insert the normalized edge, or merge into the existing one.'''
        # TODO: coref update?
        relations = edge_attr["relations"]
        if G_norm.has_edge(norm_head, norm_tail):
            merged = G_norm.edges[norm_head, norm_tail]
            merged['coreference'].add(str(pair_coref))    # update all pair coreference
            merged['relations'] = _merge_rel_dict(merged['relations'], relations)
        else:
            # Copy the relations dict so G_norm never shares a mutable
            # attribute object with the source graph G.
            G_norm.add_edge(norm_head, norm_tail, relations=dict(relations),
                            coreference={str(pair_coref)})

    # process all nodes
    node2new_info = {}
    G_norm = nx.DiGraph()
    print('Adding normalized nodes to new graph...')
    for node in tqdm(G.nodes):
        new_node, p2i, coref, info = _norm_node(node)
        _add_node(G, G_norm, node, new_node, p2i, coref)
        # remembered so the edge pass can reuse the per-node results
        node2new_info[node] = (new_node, coref, info)

    print('Adding edges to new graph...')
    for head, tail, edge_attr in tqdm(G.edges.data()):
        h_new_node, h_coref, h_info = node2new_info[head]
        t_new_node, t_coref, t_info = node2new_info[tail]

        # get pair coref
        pair_coref = nmlzer.pair_person_coref(h_coref, t_coref, h_info, t_info)
        _add_edge(G_norm, edge_attr, h_new_node, t_new_node, pair_coref)

    return G_norm


In [None]:
# Input: path of the pickled ASER graph that the cells below load and normalize.
aser_path = '/home/data/jchengaj/aser_data/core_10.pickle'
# Output: path where the normalized graph is written.
# NOTE(review): the "_poss" suffix presumably reflects show_possessive=True
# in the normalization call below -- confirm and keep the two in sync.
output_path = '/home/data/jchengaj/aser_data/core_10_normed_poss.pickle'

In [None]:

# Load the pickled ASER graph and report its size.
# nx.read_gpickle was removed in NetworkX 3.0; the standard pickle module
# reads the same file and works on both NetworkX 2.x and 3.x.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(aser_path, 'rb') as f:
    aser = pickle.load(f)
print('# node', len(aser.nodes))
print('# edge', len(aser.edges))

In [None]:
# Normalize the loaded graph, report the size of the result, and save it.
G_norm = normalize_personal_words(aser, show_possessive=True)
print('# node', len(G_norm.nodes))
print('# edge', len(G_norm.edges))
# nx.write_gpickle was removed in NetworkX 3.0; use pickle directly with the
# highest protocol, which is what write_gpickle used by default.
with open(output_path, 'wb') as f:
    pickle.dump(G_norm, f, protocol=pickle.HIGHEST_PROTOCOL)