<a href="https://colab.research.google.com/github/MuroriM/GEM-table-to-text-through-AMR/blob/main/AMR_linearizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AMR linear to Penman  
This code creates the corresponding AMR in Penman notation from a linearized AMR. It creates new AMR variables and adds the appropriate whitespace. It also checks for the validity of AMRs using the amrlib 'valid_amr' function.

In [None]:
!pip install amrlib



In [None]:
# remove files from previous sessions

!ls
!rm -r AMR test_web smatch
!ls

AMR  sample_data  test_web
rm: cannot remove 'smatch': No such file or directory
sample_data


In [None]:
# load dev, train, test, webnlg data

# /content/test_web/webnlg_dev_parsed.jsonl
# /content/test_web/webnlg_test_parsed.jsonl
# /content/test_web/webnlg_train_parsed.jsonl
# respective paths

!git clone https://github.com/MuroriM/test_web.git
# https://github.com/MuroriM/test_web

Cloning into 'test_web'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 5 (delta 1), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (5/5), done.


# Character-based AMR translation

In [None]:
!git clone https://github.com/RikVN/AMR.git
# https://github.com/RikVN/AMR

Cloning into 'AMR'...
remote: Enumerating objects: 127, done.[K
remote: Total 127 (delta 0), reused 0 (delta 0), pack-reused 127[K
Receiving objects: 100% (127/127), 209.88 KiB | 4.77 MiB/s, done.
Resolving deltas: 100% (54/54), done.


In [None]:
%cd /content/AMR

/content/AMR


In [None]:
!git clone https://github.com/snowblink14/smatch

Cloning into 'smatch'...
remote: Enumerating objects: 147, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 147 (delta 0), reused 2 (delta 0), pack-reused 144[K
Receiving objects: 100% (147/147), 74.56 KiB | 3.39 MiB/s, done.
Resolving deltas: 100% (79/79), done.


In [None]:
!pip install -r requirements.txt



In [None]:
# This step is important! I moved this code up here; the original can be found a few cells down.
# This is only a temporary fix.
!cp amr_utils.py restoreAMR
!cp best_amr_permutation.py restoreAMR
!cp var_free_amrs.py restoreAMR

In [None]:
## Modified version of var_free_amrs.py in order to remove newline characters

'''Script that removes variables from AMR by duplicating the information, possibly deletes wiki-links
   Presupposes that files have a certain extension (default .txt)

   Sample input:

   # ::snt Bob likes himself.

   (l / like
        :ARG0 (p / person :name "Bob")
        :ARG1 p)

    Output *.tf:

    (like :ARG0 (person :name "Bob") :ARG1 (person :name "Bob"))'''

import sys
import re
import argparse
import os
from amr_utils import write_to_file, remove_char_outside_quotes
   
############################################
############################################
   ## Modification from original code
############################################
# Added modules
import json
import re
############################################
############################################


def create_args_parser(argv=None):
    '''Build the command-line parser and parse the arguments.

    argv: optional list of argument strings to parse. The default (None)
          makes argparse read the process command line (sys.argv[1:]), which
          is what this function always did before. Passing an explicit list
          makes the function usable from a notebook kernel, where sys.argv
          holds the kernel's own arguments.

    Returns the parsed argparse.Namespace with the attributes input_file,
    folder, amr_ext, output_ext and keep_wiki.'''
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--input_file", required=True, type=str, help="AMR file or folder")
    parser.add_argument('-fol', "--folder", action='store_true', help='Add to do multiple files in a folder - if not, args.f is a file')
    parser.add_argument('-a', "--amr_ext", default='.txt', type=str, help="Input files must have this extension (default .txt, only necesary when using -fol)")
    parser.add_argument('-o', '--output_ext', default='.tf', help="extension of output AMR files (default .tf)")
    parser.add_argument('-k', '--keep_wiki', action='store_true', help='Keep Wiki link when processing')
    return parser.parse_args(argv)


def single_line_convert(lines, sent_file):
    '''Convert AMRs to a single line, ignoring lines that start with "#".

    lines:     iterable of text lines; AMRs are separated by blank lines
    sent_file: path of a file with one sentence per line, or a falsy value.
               It is only read when no "# ::snt" / "# ::tok" line was found
               in lines and the path is an existing file.

    Returns (all_amrs, sents): the AMRs, each joined onto one line, and the
    sentences that were found.

    Raises AssertionError if sentences are read from sent_file and their
    number differs from the number of AMRs.'''
    all_amrs, cur_amr, sents = [], [], []
    for line in lines:
        ## Modification from original code:
        ## remove escaped new line characters (a literal backslash followed by "n")
        line = line.replace("\\n", "")
        stripped = line.strip()

        if not stripped:
            # A blank line closes the AMR that is being collected. When nothing
            # is being collected the blank line is skipped; storing it would
            # make the next blank line (or the end of input) emit an empty AMR.
            if cur_amr:
                all_amrs.append(" ".join(cur_amr).strip())
                cur_amr = []
        elif line.startswith('# ::snt') or line.startswith('# ::tok'):
            # Save sentences as well (don't always need them)
            sents.append(re.sub('(^# ::(tok|snt))', '', line).strip())  # remove # ::snt or # ::tok
        elif not line.startswith('#'):
            cur_amr.append(stripped)

    # Input did not end with a blank line, so add the last AMR here
    if cur_amr:
        all_amrs.append(" ".join(cur_amr).strip())

    # If we didn't find sentences, but we did have a sentence file, read the sentences from there (if possible)
    if not sents and sent_file and os.path.isfile(sent_file):
        # Context manager so the handle is closed as soon as the sentences are read
        with open(sent_file, 'r') as sent_handle:
            sents = [x.strip() for x in sent_handle]
        # Sanity check
        assert len(all_amrs) == len(sents), "{0} vs {1}".format(len(all_amrs), len(sents))
    return all_amrs, sents


def delete_wiki(input_file):
    '''Delete wiki links from AMRs.

    input_file: path of the text file to read.

    Returns a list with one string per input line. In each line the first
    :wiki "..." attribute and every ":wiki -" are removed, runs of whitespace
    are merged into single spaces, and the leading whitespace that remains
    after the removal is kept as indentation.'''
    no_wiki = []
    # Context manager closes the file even if reading stops half-way
    with open(input_file, 'r') as amr_file:
        for line in amr_file:
            # count is passed by keyword: the positional form is deprecated in newer Python
            n_line = re.sub(r':wiki "(.*?)"', '', line, count=1)
            n_line = re.sub(':wiki -', '', n_line)
            # Merge double whitespace but keep leading whitespace
            indent = len(n_line) - len(n_line.lstrip())
            no_wiki.append(indent * ' ' + ' '.join(n_line.split()))
    return no_wiki


def process_var_line(line, var_dict):
    '''Strip the variable names from one AMR line and record each variable's value.

    The line is scanned character by character: a "(" outside quotes starts
    collecting a variable name, a "/" outside quotes starts collecting the
    value of that variable, and a double quote toggles the inside-quotes
    state so that brackets and slashes within quoted text are not treated
    as structure.

    line:     text of an AMR line, e.g. "(l / like"
    var_dict: dictionary that is updated in place, mapping the stripped
              variable name to the value collected for it

    Returns (line with every "(<var> /" rewritten to "(", var_dict).
    The last name/value pair is stored unconditionally, so a line with no
    unquoted "(" writes its result under the empty-string key.
    Only works if AMR is shown as multiple lines and input correctly!'''
    # State flags: which of the two buffers the current character is added to
    curr_var_name = False
    curr_var_value = False
    var_value = ''
    var_name = ''
    current_quotes = False
    for ch in line:
        # We start adding the variable value (the "/" itself is not stored)
        if ch == '/' and not current_quotes:
            curr_var_value = True
            curr_var_name = False
            var_value = ''
            continue
        # We start adding the variable name (the "(" itself is not stored)
        elif ch == '(' and not current_quotes:
            curr_var_name = True
            curr_var_value = False
            # We already found a name-value pair, add it now
            if var_value and var_name:
                # Remove closing brackets that were not in between quotes
                # (helper imported from amr_utils; behaviour assumed from its name)
                add_value = remove_char_outside_quotes(var_value.strip(), ')')
                # Now we have to check: if this previous item starts with ':', we remove it,
                # because that means it started a new part ( :name (n / name ..)
                # NOTE(review): split()[-1] raises IndexError if add_value is empty here
                if add_value.split()[-1].startswith(':'):
                    add_value = " ".join(add_value.split()[:-1])
                var_dict[var_name.strip()] = add_value
            var_name = ''
            continue
        # Check if we are currently within quotes
        elif ch == '"':
            current_quotes = not current_quotes

        # Add to variable name/value (quote characters are stored too)
        if curr_var_name:
            var_name += ch
        if curr_var_value:
            var_value += ch

    # Remove brackets that were not within quotes for final var value
    final_var = remove_char_outside_quotes(var_value, ')')
    # Save information to dictionary (unconditional, unlike the in-loop save above)
    var_dict[var_name.strip()] = final_var
    # Remove variable information from the AMR line: "(x / concept" -> "(concept"
    deleted_var_string = re.sub(r'\([a-zA-Z-_0-9]+[\d]? /', '(', line).replace('( ', '(')
    return deleted_var_string, var_dict


def delete_amr_variables(amrs):
    '''Delete the variables from AMRs, replacing references by the referred value.

    amrs: list of AMR text lines (comment lines starting with "#" and blank
          lines are allowed and are copied through unchanged)

    Returns a new list with one output line per input line:
      * lines containing "/" get their variable names removed;
      * other lines whose second token (ignoring ")") is a known variable
        get that token replaced by "(<value>)";
      * everything else is copied as is.'''
    full_var_dict = {}
    del_amr = []

    # First get the var dict, so references can be resolved regardless of order
    for line in amrs:
        _, full_var_dict = process_var_line(line, full_var_dict)

    # Loop over AMRs to rewrite
    for line in amrs:
        if not line.strip() or line[0] == '#':
            # Line with other info, just add
            del_amr.append(line)
            continue

        if '/' in line:
            # Found variable here
            # Get the deleted variable string and save
            deleted_var_string, _ = process_var_line(line, full_var_dict)
            del_amr.append(deleted_var_string)
            continue

        # Probable reference to variable here!
        split_line = line.split()
        # A line with a single token has no second token to inspect; treating
        # it as "no reference" avoids an IndexError on such lines.
        ref_var = split_line[1].replace(')', '') if len(split_line) > 1 else ''

        # Check if the variable occurs in our dictionary. The empty string is
        # never a variable: process_var_line can store an empty key, and
        # str.replace('', ...) would insert text between every character.
        if ref_var and ref_var in full_var_dict:
            # Get value to replace the variable name with
            ref_value = full_var_dict[ref_var]
            # Do the replacing and add brackets for alignment
            split_line[1] = split_line[1].replace(ref_var, '(' + ref_value.strip() + ')')
            indent = (len(line) - len(line.lstrip())) * ' '
            del_amr.append(indent + " ".join(split_line))
        else:
            # No reference found, add line without editing (usually there are numbers in this line)
            del_amr.append(line)
    return del_amr


# def var_free_amrs(input_file, out_ext, keep_wiki):
#     '''Create variable-free AMRs and sentence files'''
#     # Delete wiki link if wanted
#     amr_no_wiki = delete_wiki(input_file) if not keep_wiki else [x.rstrip() for x in open(input_file, 'r')]
#     # Remove all variables by duplicating coreference nodes
#     del_amrs = delete_amr_variables(amr_no_wiki)
#     # Put AMR on single line and write output
#     single_amrs, _ = single_line_convert(del_amrs, '')
#     write_to_file(single_amrs, input_file + out_ext)


# if __name__ == "__main__":
#     args = create_args_parser()

#     # Do input file or find files in folder
#     if not args.folder:
#         var_free_amrs(args.input_file, args.output_ext, args.keep_wiki)
#     else:
#         for root, dirs, files in os.walk(args.input_file):
#             for f in files:
#                 if f.endswith(args.amr_ext):
#                     var_free_amrs(os.path.join(root, f), args.output_ext, args.keep_wiki)

#######################################
#######################################
# Replaces the main function in the var_free_amrs file
#######################################

def _iter_json_objects(text):
    '''Yield every JSON value found back-to-back in text, separated only by whitespace.'''
    decoder = json.JSONDecoder()
    pos, end = 0, len(text)
    while pos < end:
        # raw_decode does not accept leading whitespace, so step over it
        if text[pos].isspace():
            pos += 1
            continue
        entry, pos = decoder.raw_decode(text, pos)
        yield entry


def var_free_amrs(input_file, out_ext='.tf', keep_wiki=False):
    '''Create variable-free AMRs for a file of JSON entries and write them out.

    input_file: path of the input file; every JSON object in it must have
                the keys "source_amrs" (list of strings) and "output_amr"
    out_ext:    extension appended to input_file to form the output path
    keep_wiki:  if True the :wiki attributes are kept, otherwise removed

    For every entry the source AMRs are joined with a single space into one
    string; the entries are written as JSON strings to input_file + out_ext.

    Raises json.JSONDecodeError if the processed text is not a sequence of
    JSON values, KeyError if an entry lacks one of the two keys.'''
    # Delete wiki link if wanted
    if keep_wiki:
        # Context manager so the input handle is closed once the lines are read
        with open(input_file, 'r') as amr_file:
            amr_lines = [x.rstrip() for x in amr_file]
    else:
        amr_lines = delete_wiki(input_file)
    # Remove all variables by duplicating coreference nodes
    del_amrs = delete_amr_variables(amr_lines)
    # Put AMR on single line
    single_amrs, _ = single_line_convert(del_amrs, '')

    # Concatenate source amrs into one.
    # Every chunk is decoded (not only the first one), so an empty input gives
    # an empty output instead of an IndexError and text after a blank line is
    # not dropped. A real JSON decoder is used to split the entries, so a "}"
    # inside a string no longer cuts an entry short.
    fused_amrs = []
    for amr_chunk in single_amrs:
        for json_entries in _iter_json_objects(amr_chunk):
            fused_entries = {
                "source_amrs": " ".join(json_entries["source_amrs"]),
                "output_amr": json_entries["output_amr"],
            }
            fused_amrs.append(json.dumps(fused_entries))

    write_to_file(fused_amrs, input_file + out_ext)

#######################################
#######################################


In [None]:
!ls

amr_utils.py		  __pycache__		      sample_input
best_amr_permutation.py   README.md		      smatch
char_level_AMR.py	  reformat_single_amrs.py     test_pipeline.sh
create_coref_indexing.py  requirements.txt	      var_free_amrs.py
create_coref_paths.py	  restoreAMR		      wikify_file.py
postprocess_AMRs.py	  restore_duplicate_coref.py
prune_amrs.py		  sample_alignment_input


In [None]:
!head -3 /content/test_web/webnlg_dev_parsed.jsonl > /content/test_web/test.jsonl
!cat /content/test_web/test.jsonl

{"source_amrs": ["(o0 / organization\n      :location (c0 / country\n            :name (n1 / name\n                  :op1 \"Switzerland\" ))\n      :name (n0 / name\n            :op5 \"Mendrisio\" \n            :op4 \"di\" \n            :op3 \"Architettura\" \n            :op2 \"di\" \n            :op1 \"Accademia\" ))", "(n0 / number\n      :poss (o0 / organization\n            :name (n1 / name\n                  :op5 \"Mendrisio\" \n                  :op4 \"di\" \n                  :op3 \"Architettura\" \n                  :op2 \"di\" \n                  :op1 \"Accademia\" ))\n      :quant-of (p0 / person\n            :quant 600 \n            :ARG0-of (s0 / study-01)))", "(e0 / establish-01\n      :ARG0 (p0 / publication\n            :name (n0 / name\n                  :op5 \"Mendrisio\" \n                  :op4 \"di\" \n                  :op3 \"Architettura\" \n                  :op2 \"di\" \n                  :op1 \"Accademia\" ))\n      :ARG1 (d0 / date-entity\n            :year 1

In [None]:
# Put the above AMRs in the /content/test.txt
# Remember to add a new line between each AMR

# !python var_free_amrs.py -f /content/test_web/test.jsonl
# !python var_free_amrs.py -f /content/test_web/webnlg_dev_parsed.jsonl
# !python var_free_amrs.py -f /content/test_web/webnlg_test_parsed.jsonl
# !python var_free_amrs.py -f /content/test_web/webnlg_train_parsed.jsonl

# Convert the small sample and every WebNLG split; each call writes its
# result to "<input path>.tf" (the default output extension of var_free_amrs).
for jsonl_path in (
    '/content/test_web/test.jsonl',
    '/content/test_web/webnlg_dev_parsed.jsonl',
    '/content/test_web/webnlg_test_parsed.jsonl',
    '/content/test_web/webnlg_train_parsed.jsonl',
):
    var_free_amrs(jsonl_path)

# Show the first 500 characters of an input file and of two converted files.
# read(500) stops after the preview instead of loading the whole file, and
# the with-block closes the handle even if printing fails.
for preview_path in (
    '/content/test_web/webnlg_dev_parsed.jsonl',
    '/content/test_web/webnlg_dev_parsed.jsonl.tf',
    '/content/test_web/test.jsonl.tf',
):
    with open(preview_path, 'r') as preview_file:
        print(preview_file.read(500))

{"source_amrs": ["(o0 / organization\n      :location (c0 / country\n            :name (n1 / name\n                  :op1 \"Switzerland\" ))\n      :name (n0 / name\n            :op5 \"Mendrisio\" \n            :op4 \"di\" \n            :op3 \"Architettura\" \n            :op2 \"di\" \n            :op1 \"Accademia\" ))", "(n0 / number\n      :poss (o0 / organization\n            :name (n1 / name\n                  :op5 \"Mendrisio\" \n                  :op4 \"di\" \n                  :op3 \"Arch
{"source_amrs": "(organization :location (country :name (name :op1 \"Switzerland\" )) :name (name :op5 \"Mendrisio\"  :op4 \"di\"  :op3 \"Architettura\"  :op2 \"di\"  :op1 \"Accademia\" )) (number :poss (organization :name (name :op5 \"Mendrisio\"  :op4 \"di\"  :op3 \"Architettura\"  :op2 \"di\"  :op1 \"Accademia\" )) :quant-of (person :quant 600  :ARG0-of (study-01))) (establish-01 :ARG0 (publication :name (name :op5 \"Mendrisio\"  :op4 \"di\"  :op3 \"Architettura\"  :op2 \"di\"  :op1 \"Accade

In [None]:
import json

# Parse and print every JSON line of the raw sample first, then of its
# variable-free (.tf) counterpart.
for jsonl_path in ('/content/test_web/test.jsonl', '/content/test_web/test.jsonl.tf'):
    with open(jsonl_path, 'r') as json_file:
        json_list = json_file.readlines()

    for json_str in json_list:
        result = json.loads(json_str)
        print(f"result: {result}")

result: {'source_amrs': ['(o0 / organization\n      :location (c0 / country\n            :name (n1 / name\n                  :op1 "Switzerland" ))\n      :name (n0 / name\n            :op5 "Mendrisio" \n            :op4 "di" \n            :op3 "Architettura" \n            :op2 "di" \n            :op1 "Accademia" ))', '(n0 / number\n      :poss (o0 / organization\n            :name (n1 / name\n                  :op5 "Mendrisio" \n                  :op4 "di" \n                  :op3 "Architettura" \n                  :op2 "di" \n                  :op1 "Accademia" ))\n      :quant-of (p0 / person\n            :quant 600 \n            :ARG0-of (s0 / study-01)))', '(e0 / establish-01\n      :ARG0 (p0 / publication\n            :name (n0 / name\n                  :op5 "Mendrisio" \n                  :op4 "di" \n                  :op3 "Architettura" \n                  :op2 "di" \n                  :op1 "Accademia" ))\n      :ARG1 (d0 / date-entity\n            :year 1996 ))', '(p0 / person\n

In [None]:
# Uncomment to download the dev, test and train sets

# from google.colab import files
# files.download('/content/test_web/webnlg_dev_parsed.jsonl.tf') 
# files.download('/content/test_web/webnlg_test_parsed.jsonl.tf') 
# files.download('/content/test_web/webnlg_train_parsed.jsonl.tf') 

In [None]:
# !python char_level_AMR.py -f /content/test_web/webnlg_dev_parsed.jsonl.tf
# !cat /content/test_web/webnlg_dev_parsed.jsonl.tf

In [None]:
# !python postprocess_AMRs.py -f /content/test_web/webnlg_dev_parsed.jsonl.char.tf -n
# !cat /content/test_web/webnlg_dev_parsed.jsonl.char.tf

In [None]:
# !python reformat_single_amrs.py -f /content/test_web/webnlg_dev_parsed.jsonl.char.tf.restore.final -e .form
# !cat /content/test_web/webnlg_dev_parsed.jsonl.char.tf.restore.final