In [1]:
pwd

'/home/karish19471/btp/amr'

### Converting the AMR graphs into the required format

In [24]:
import glob
import networkx as nx
import penman
import amrlib
import pandas as pd
import penman
from penman import constant
from amrlib.graph_processing.annotator import add_lemmas
from amrlib.alignments.rbw_aligner import RBWAligner
from penman.models.noop import NoOpModel
import ast
import pickle
import os
import dgl
import json
import numpy as np
from sklearn.model_selection import train_test_split

In [25]:
# Dataset name; interpolated into every "<DATASET>_amr/..." input and output path in this notebook.
DATASET = "politifact"

In [28]:
# Collect the merged AMR files for the dataset. glob returns matches in
# arbitrary (filesystem-dependent) order, so sort them to make the order of the
# generated samples reproducible across runs and machines.
merged_amr = sorted(glob.glob(f"{DATASET}_amr/{DATASET}_amr_merge/*.amr.penman"))
# Tab-separated metadata table; id2label reads its 'id' and 'label' columns.
df = pd.read_csv(f'{DATASET}_amr/{DATASET}.tsv', sep = '\t')

In [29]:
def var2word(p_graph):
    """Map each instance source of a graph to its instance target.

    Args:
        p_graph: Object exposing ``instances()``, which yields
            ``(source, role, target)`` triples; the role is ignored.

    Returns:
        dict: ``{source: target}``. If a source appears in several triples,
        the last one wins.
    """
    return {variable: concept for variable, _role, concept in p_graph.instances()}

In [35]:
def get_glove(path='pca_embedding_8.txt'):
    """Load whitespace-separated word embeddings from a text file.

    Every non-blank line is split on whitespace: the first field is the token
    and the remaining fields are its vector components.

    Args:
        path: Embedding file to read. Defaults to the file name this notebook
            originally hard-coded, so ``get_glove()`` behaves as before.

    Returns:
        dict: token -> 1-D ``float32`` numpy array. If a token occurs on
        several lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be converted to float32.
    """
    glove = {}
    # The context manager closes the handle even if parsing fails; the
    # previous version opened the file and never closed it.
    # NOTE(review): assumes the embedding file is UTF-8 encoded -- confirm.
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): values[0] would raise IndexError.
                continue
            glove[values[0]] = np.asarray(values[1:], dtype='float32')
    return glove

In [36]:
def to_dict(d):
    """Wrap every value of ``d`` in a single-key ``{'feat': value}`` dict.

    Args:
        d: Mapping of key -> value.

    Returns:
        dict: ``{key: {'feat': value}}`` with the keys of ``d`` in the same order.
    """
    wrapped = {}
    for key, value in d.items():
        wrapped[key] = {'feat': value}
    return wrapped

In [37]:
def id2label(df):
    """Build an id -> label lookup from the 'id' and 'label' columns of ``df``.

    Args:
        df: Object supporting ``df['id']`` and ``df['label']``, each iterable.

    Returns:
        dict: ids paired positionally with labels; for a repeated id the last
        label wins.
    """
    ids = df['id']
    labels = df['label']
    return {sample_id: label for sample_id, label in zip(ids, labels)}

In [38]:
# Load the embedding table once; the graph-conversion loop looks node concepts up in it.
glove = get_glove()

In [39]:
# id -> label lookup built from the TSV metadata frame `df`.
i2l = id2label(df)

In [41]:
# Convert every merged AMR file into one JSON-serialisable sample:
#   {'targets': [label], 'graph': [[src, 1, dst], ...], 'node_features': [[...], ...]}
# and write the full list to "<DATASET>_amr/<DATASET>.json".
EMBEDDING_DIM = 8
# Feature for concepts that are missing from `glove`. It is a float32 vector so
# that it has the same dtype as the arrays get_glove() produces; the previous
# fallback was a list of Python ints.
oov_vector = np.zeros(EMBEDDING_DIM, dtype='float32')

dataset = []
for curr in merged_amr:
    # Only the first graph stored in each file is used.
    p_graph = penman.load(curr, model = NoOpModel())[0]

    # Sample name = file name up to the last '.amr'. os.path.basename handles
    # the platform's path separator; slicing at the last '/' did not.
    file_name = os.path.basename(curr)
    name = file_name[:file_name.rfind('.amr')]

    v2w = var2word(p_graph)

    # Structure only: edge roles are dropped and nodes come solely from edges.
    nx_graph = nx.MultiDiGraph()
    nx_graph.add_edges_from((s, t) for s, _, t in p_graph.edges())  # TODO: Add edges from instances as well

    # Node feature = embedding of the node's concept, or the zero vector if unknown.
    node_feats = {node: glove.get(v2w[node], oov_vector) for node in nx_graph.nodes()}
    nx.set_node_attributes(nx_graph, to_dict(node_feats))

    dgl_graph = dgl.from_networkx(nx_graph, node_attrs=['feat'])
    source, target = dgl_graph.edges()
    node_features = dgl_graph.ndata['feat'].numpy().tolist()

    # Every edge is written as [src, 1, dst]; the middle value is the same
    # constant for all edges.
    graph = [[s, 1, t] for s, t in zip(source.tolist(), target.tolist())]

    # Raises KeyError if the file name has no matching id in the metadata table.
    targets = [i2l[name]]
    sample = {'targets':targets, 'graph': graph, 'node_features':node_features}
    dataset.append(sample)


with open(f"{DATASET}_amr/{DATASET}.json", "w") as f:
    json.dump(dataset, f)

ignoring epigraph data for duplicate triple: ('c2-16', ':ARG1', 'c2-9')
ignoring epigraph data for duplicate triple: ('c13-0', ':ARG0', 'c13-20')
ignoring epigraph data for duplicate triple: ('c13-7', ':ARG0', 'c13-17')
ignoring epigraph data for duplicate triple: ('c15-0', ':topic', 'c15-6')
ignoring epigraph data for duplicate triple: ('c26-7', ':ARG1', 'c26-0')
ignoring epigraph data for duplicate triple: ('c12-5', ':name', 'c12-2')
ignoring epigraph data for duplicate triple: ('c37-0', ':ARG1', 'c37-17')
ignoring epigraph data for duplicate triple: ('c15-5', ':time', 'c15-2')
ignoring epigraph data for duplicate triple: ('c7-2', ':ARG1', 'c7-4')
ignoring epigraph data for duplicate triple: ('c38-5', ':ARG0', 'c38-4')
ignoring epigraph data for duplicate triple: ('c7-7', ':ARG1', 'c7-6')
ignoring epigraph data for duplicate triple: ('c25-26', ':ARG0', 'c25-17')
ignoring epigraph data for duplicate triple: ('c25-26', ':ARG1', 'c25-19')
ignoring epigraph data for duplicate triple: ('c

In [42]:
# Reload the samples written to "<DATASET>_amr/<DATASET>.json" and split them
# 90/10 into train and test, stratified on the 'targets' column.
with open(f"{DATASET}_amr/{DATASET}.json", "r") as f:
    d = json.load(f)
# NOTE: this rebinds `df`, which earlier held the TSV metadata table; re-running
# earlier cells that expect that table requires reloading it first.
df = pd.DataFrame(d)
# Fixed seed so the split (and therefore the train/test files written from it)
# is identical on every run; without random_state the shuffle differs per run.
SPLIT_SEED = 42
train, test = train_test_split(df, stratify = df['targets'], test_size=0.10, random_state=SPLIT_SEED)

In [44]:
def data(mode, df):
    """Write the rows of ``df`` to "<DATASET>_amr/<DATASET>.<mode>.json".

    Args:
        mode: String inserted into the output file name (e.g. 'train').
        df: DataFrame with 'targets', 'graph' and 'node_features' columns.

    Returns:
        None. The file is (over)written with a JSON list holding one
        ``{'targets', 'graph', 'node_features'}`` object per row.
    """
    fields = ('targets', 'graph', 'node_features')
    records = [{field: row[field] for field in fields} for _, row in df.iterrows()]
    out_path = f"{DATASET}_amr/{DATASET}.{mode}.json"
    with open(out_path, "w") as f:
        json.dump(records, f)

In [45]:
# Persist both splits as "<DATASET>_amr/<DATASET>.train.json" and "<DATASET>_amr/<DATASET>.test.json".
data('train', train)
data('test', test)