In this notebook tests for the following capabilities will be implemented:

1. The ability of an SRL system to correctly identify the predicate and its arguments (agent, patient) in a three-word sentence of the shape: Agent Predicate Patient
2. The ability of an SRL system to correctly identify the predicate and its arguments (agent, patient) in a passive-voice sentence of the shape: Patient was Predicate by Agent
3. The ability of an SRL system to correctly identify the argument after "by" in the passive voice:
    
    a. instrument: "by" could refer to the instrument with which the action was performed: Patient was Predicate by Instrument
    
    b. location: "by" could refer to the location in which the action was performed: Patient was Predicate by Location
    
4. Different words in the same context
5. Impersonal verbs
6. Robustness
    

In [1]:
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
import numpy as np
import json
# Shared CheckList Editor instance; later cells call its template, hyponyms and hypernyms methods.
editor = Editor()

# test 1: Simple sentence with one predicate

In [2]:
# Expected tagging for the active-voice pattern "Agent Predicate Patient .":
#   Mary     likes   John     .
#   B-ARG0   B-V     B-ARG1   O
ret = editor.template(
    '{first_name1} {verb} {first_name2}.',
    remove_duplicates=True,
    verb=['likes', 'hates', 'loves'],
)
# Draw 100 sentences from the generated pool (np.random.choice samples with
# replacement by default, so repeats are possible).
test1 = np.random.choice(ret.data, size=100)

# test 2: Simple passive voice with one predicate

In [3]:
# John    is  liked  by      Mary    .
# B-ARG1, O,  B-V,   B-ARG0, I-ARG0, O
def convert_to_passive(sentence):
    """
    Rewrite an active sentence of the form "Agent verb Patient." in the passive voice.

    The participle is built by replacing the verb's final character with "d"
    (e.g. "likes" -> "liked"), so the verb is expected to be a third-person
    form ending in "es".

    Args:
    - sentence (str): Active-voice sentence; only its first three
      whitespace-separated tokens are used.

    Returns:
    - str: Sentence of the form "Patient is <participle> by Agent.".
    """
    # Periods are stripped from both ends of the sentence before splitting on whitespace.
    words = sentence.strip('.').split()
    agent, verb, patient = words[0], words[1], words[2]

    # Swap the verb's last character for "d" to form the participle.
    participle = verb[:-1] + 'd'

    return "{} is {} by {}.".format(patient, participle, agent)

# Run convert_to_passive over every test1 sentence and keep element 1 of each
# resulting entry, i.e. the converted sentence (element 0 is presumably the
# untouched original - confirm against CheckList's Perturb.perturb docs).
test2 = [pair[1] for pair in Perturb.perturb(test1, convert_to_passive).data]

# test3.1: "by" passive voice: location

In [4]:
# He    was seen     by         the         river     .
# B-ARG1, O, B-V, B-ARGM-LOC, I-ARGM-LOC, I-ARGM-LOC, O

# editor.related_words('He was seen by the river', 'river')
# After running this, manual selection was performed and put into the following list

In [5]:
# Manually selected nouns used to fill the location slot of the template below.
places = [
    'river', 'sea', 'canal', 'water', 'vehicle', 'wall', 'lake', 'ice',
    'mountain', 'waterfall', 'property', 'farm', 'association', 'range',
    'reservoir', 'ocean', 'building', 'beach', 'field',
]
# One sentence per place: "He was seen by the <place>."
test3_1 = editor.template(
    'He was seen by the {place}.',
    place=places,
).data

# test 3.2: "by" passive voice: instrument

In [6]:
# He    was killed   by    a      knife   .
# B-ARG1, O, B-V, B-ARG2, I-ARG2, I-ARG2, O

# editor.related_words('He was killed by a knife', 'knife')
# After running this, manual selection was performed and put into the following list

In [7]:
# Manually selected nouns used to fill the instrument slot of the template below.
instrument = [
    'bullet', 'shotgun', 'missile', 'gun', 'rifle', 'shot', 'sword', 'slug',
    'blade', 'firearm', 'projectile', 'handgun', 'ball', 'dart', 'pistol',
    'dagger', 'spear', 'BB', 'bow', 'cannon',
]
# One sentence per instrument; the "a:" prefix asks CheckList to prepend the
# matching indefinite article.
test3_2 = editor.template(
    'He was killed by {a:instrument}.',
    instrument=instrument,
).data

# test4: Different words in the same context

In [9]:
#   My     animal eats animals .
# B-ARG0, I-ARG0, B-V, B-ARG1, O

# Context-aware lexical relatives of "animal": each variable name matches the
# Editor method that produced it.
hyponyms = editor.hyponyms('My animal eats animals.', 'animal')
hypernyms = editor.hypernyms('My animal eats animals.', 'animal')

# Unrelated (possibly illogical) fillers: take the last word of every sentence
# CheckList generates for the masked template below, then draw 72 of them.
ret1 = editor.template('This is great {mask}.', remove_duplicates=True)
random_words = [x.split()[-1].strip('.') for x in ret1.data]
random_words = np.random.choice(random_words, 72)

# NOTE(review): a multi-word replacement would not line up with the five-tag
# annotation applied to test4 later - verify the lexicon output is single-word.
replacements = hyponyms + hypernyms + list(random_words)
test4 = editor.template('My {replacements} eats animals.', replacements=replacements).data


# test5: Impersonal verbs

In [10]:
# it is raining .
# O  O    B-V   O
# O  B-V    O   O

# it feels     good      .
# O   B-V   B-ARGM-MNR   O

# Impersonal weather verbs for the pattern "it is <verb>."
weather_verbs = [
    'raining', 'snowing', 'storming', 'sleeting', 'thundering',
    'drizzling', 'pouring', 'showering', 'sprinkling',
]
test5_1 = editor.template(
    'it is {weather_verbs}.',
    weather_verbs=weather_verbs,
).data

# Impersonal "it feels <mask>." sentences; 91 of the generated ones are drawn.
ret5 = editor.template('it feels {mask}.')
test5_2 = np.random.choice(ret5.data, size=91)

# test6: Robustness

In [11]:
# Draw the robustness subsets: (source pool, sample size) pairs are consumed in
# the listed order, one np.random.choice call per pool.
test6_1, test6_2, test6_31, test6_32, test6_4, test6_51, test6_52 = (
    np.random.choice(pool, size)
    for pool, size in (
        (test1, 20),
        (test2, 20),
        (test3_1, 10),
        (test3_2, 10),
        (test4, 20),
        (test5_1, 5),
        (test5_2, 15),
    )
)

In [12]:
# Add typos to every robustness subset. For each entry returned by
# Perturb.perturb only element 1 (the perturbed sentence) is kept.
test6_1 = [pair[1] for pair in Perturb.perturb(test6_1, Perturb.add_typos).data]
test6_2 = [pair[1] for pair in Perturb.perturb(test6_2, Perturb.add_typos).data]
test6_31 = [pair[1] for pair in Perturb.perturb(test6_31, Perturb.add_typos).data]
test6_32 = [pair[1] for pair in Perturb.perturb(test6_32, Perturb.add_typos).data]
test6_4 = [pair[1] for pair in Perturb.perturb(test6_4, Perturb.add_typos).data]
test6_51 = [pair[1] for pair in Perturb.perturb(test6_51, Perturb.add_typos).data]
test6_52 = [pair[1] for pair in Perturb.perturb(test6_52, Perturb.add_typos).data]

# Creating datasets

In [13]:
def list_to_json(data, annotations=None):
    """
    Turn sentences into JSON-serialisable records with tokens and gold tags.

    Each record is a dict with three keys:
    - 'sentence': the sentence exactly as given,
    - 'tags': a fresh copy of `annotations`, so records never share one list,
    - 'words': the whitespace-separated tokens of the sentence, with a period
      attached to the last token split off into its own '.' token.

    Args:
        data (iterable of str): Sentences to convert.
        annotations (sequence of str, optional): Tag sequence attached to
            every sentence. When omitted, records get an empty tag list.

    Returns:
        list of dict: One record per sentence, in input order.
    """
    tags = [] if annotations is None else list(annotations)
    records = []
    for sentence in data:
        words = sentence.split()
        # Skip the split for empty/whitespace-only sentences (no last token)
        # and for a last token that is already a lone '.', which would
        # otherwise become an empty token followed by '.'.
        if words and len(words[-1]) > 1 and words[-1].endswith('.'):
            words[-1] = words[-1][:-1]
            words.append('.')
        # Copy the tags so editing one record's tags cannot affect another.
        records.append({'sentence': sentence, 'tags': list(tags), 'words': words})
    return records

In [14]:
# Attach the expected tag sequence of each capability to its sentences.

# Test 1 - active voice: Agent Predicate Patient .
data1 = list_to_json(
    test1,
    annotations=['B-ARG0', 'B-V', 'B-ARG1', 'O'],
)

# Test 2 - passive voice: Patient is Predicate by Agent .
data2 = list_to_json(
    test2,
    annotations=['B-ARG1', 'O', 'B-V', 'B-ARG0', 'I-ARG0', 'O'],
)

# Test 3 - "by" + location (3.1) and "by" + instrument (3.2), merged into one set.
data3_1 = list_to_json(
    test3_1,
    annotations=['B-ARG1', 'O', 'B-V', 'B-ARGM-LOC', 'I-ARGM-LOC', 'I-ARGM-LOC', 'O'],
)
data3_2 = list_to_json(
    test3_2,
    annotations=['B-ARG1', 'O', 'B-V', 'B-ARG2', 'I-ARG2', 'I-ARG2', 'O'],
)
data3 = data3_1 + data3_2

# Test 4 - different words in the same context.
data4 = list_to_json(
    test4,
    annotations=['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'O'],
)

# Test 5 - impersonal verbs: weather verbs (5.1) and "it feels ..." (5.2).
data5_1 = list_to_json(
    test5_1,
    annotations=['O', 'O', 'B-V', 'O'],
)
# NOTE(review): the example comment in the test5 cell tags "it" as O, while
# B-ARG1 is used here - confirm which one is intended.
data5_2 = list_to_json(
    test5_2,
    annotations=['B-ARG1', 'B-V', 'B-ARGM-MNR', 'O'],
)
data5 = data5_1 + data5_2

In [15]:
# Robustness set: the typo-perturbed subsets reuse the tag sequence of the
# test they were sampled from.

# From test 1 (active voice).
data6_1 = list_to_json(
    test6_1,
    annotations=['B-ARG0', 'B-V', 'B-ARG1', 'O'],
)

# From test 2 (passive voice).
data6_2 = list_to_json(
    test6_2,
    annotations=['B-ARG1', 'O', 'B-V', 'B-ARG0', 'I-ARG0', 'O'],
)

# From test 3 ("by" + location, "by" + instrument).
data6_31 = list_to_json(
    test6_31,
    annotations=['B-ARG1', 'O', 'B-V', 'B-ARGM-LOC', 'I-ARGM-LOC', 'I-ARGM-LOC', 'O'],
)
data6_32 = list_to_json(
    test6_32,
    annotations=['B-ARG1', 'O', 'B-V', 'B-ARG2', 'I-ARG2', 'I-ARG2', 'O'],
)
data6_3 = data6_31 + data6_32

# From test 4 (different words in the same context).
data6_4 = list_to_json(
    test6_4,
    annotations=['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'O'],
)

# From test 5 (impersonal verbs).
data6_51 = list_to_json(
    test6_51,
    annotations=['O', 'O', 'B-V', 'O'],
)
data6_52 = list_to_json(
    test6_52,
    annotations=['B-ARG1', 'B-V', 'B-ARGM-MNR', 'O'],
)
data6_5 = data6_51 + data6_52

# All robustness records, in test order.
data6 = data6_1 + data6_2 + data6_3 + data6_4 + data6_5

In [16]:
def check_period(dic):
    """
    Drop the final tag of a record that has more tags than words.

    Intended for typo-perturbed sentences in which the final period no longer
    ends the sentence, so it is not split off as its own token and the
    trailing tag has no word to belong to.

    Args:
        dic (dict): Record containing 'words' and 'tags' lists. The 'tags'
            entry is replaced in place when it is trimmed.

    Returns:
        dict: The same record, whether or not it was changed.
    """
    # Only trim when there is a surplus tag; shortening a tag list that is
    # already shorter than the word list would make the mismatch worse.
    if len(dic['tags']) > len(dic['words']):
        dic['tags'] = dic['tags'][:-1]
    return dic

# Re-align the tags of every robustness record; check_period updates each
# record in place, so its return value is not used.
for d in data6:
    check_period(d)

In [17]:
# Report how many records each dataset contains.
print(len(data1), len(data2), len(data3), len(data4), len(data5), len(data6))

100 100 39 100 100 100


In [18]:
# Save each dataset to its own JSON file: data/<name>.json

import os

# Explicit name -> dataset mapping, so no variable has to be looked up with eval.
datasets = {
    'data1': data1,
    'data2': data2,
    'data3': data3,
    'data4': data4,
    'data5': data5,
    'data6': data6,
}
# List of variable names
variables = list(datasets)

# Make sure the output directory exists before writing into it.
os.makedirs('data', exist_ok=True)

# Loop over the datasets and save each one to a JSON file
for var, data in datasets.items():
    filename = 'data/' + var + '.json'  # Construct the filename
    with open(filename, 'w') as f:
        json.dump(data, f)  # Write the data to the file