In [1]:
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_types import MFT, INV, DIR
from checklist.pred_wrapper import PredictorWrapper
import numpy as np
import json
import pandas as pd

In [2]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging

In [3]:
# Remote model archives handed to Predictor.from_path.
BERT_SRL_ARCHIVE = "https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz"
# NOTE(review): this archive is named "openie-model" -- confirm it is the intended BiLSTM baseline.
BILSTM_ARCHIVE = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz"

bert = Predictor.from_path(BERT_SRL_ARCHIVE)
bilstm = Predictor.from_path(BILSTM_ARCHIVE)

error loading _jsonnet (this is expected on Windows), treating C:\Users\Hisha\AppData\Local\Temp\tmpa25gs13l\config.json as plain json
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
error loading _jsonn

In [37]:
# Load the six test-suite files (data1.json ... data6.json) into a list,
# one parsed JSON object per file, in file-number order.
dict_list = []
for i in range(1, 7):
    # Forward slash resolves on Windows and POSIX alike; a backslash before the
    # "d" would form the invalid string escape "\d".
    filename = f"data/data{i}.json"
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
        dict_list.append(data)

In [38]:
def compare_lists(list1, list2, dictionary):
    """
    Compare expected and predicted tag sequences position by position.

    A position counts as correct only when its two tag lists are equal. Positions
    where either tag list is empty are skipped and do not count towards the totals.

    Args:
    - list1 (list): Expected tag lists, one per sentence.
    - list2 (list): Predicted tag lists, aligned by index with list1.
    - dictionary: Per-sentence model outputs, indexable by the same positions as
      list1. Each entry is read for its 'words' list and its 'verbs' list, whose
      items carry a 'tags' list.

    Returns:
    - tuple: (error_rate, ratio_correct, incorrect_sentences). Both rates are 0.0
      when no position was comparable. Every item of incorrect_sentences is a tuple
      (index, sentence, predicted, expected): sentence and predicted are
      space-joined strings, expected is the unmodified entry of list1.
    """
    total_items = 0
    correct_items = 0
    incorrect_indices = []

    for i, expected in enumerate(list1):
        if not expected or not list2[i]:
            continue  # Nothing to compare at this position
        total_items += 1
        if expected == list2[i]:
            correct_items += 1
        else:
            incorrect_indices.append(i)

    # Nothing was comparable: report zero rates instead of dividing by zero.
    if total_items == 0:
        return 0.0, 0.0, []

    error_rate = len(incorrect_indices) / total_items
    ratio_correct = correct_items / total_items

    incorrect_sentences = []
    for i in incorrect_indices:
        entry = dictionary[i]
        verbs = entry['verbs']
        # Two frames -> report the second; one frame -> report it; otherwise
        # there is no single frame to show, so the predicted column stays empty.
        if len(verbs) == 2:
            res_tags = " ".join(verbs[1]['tags'])
        elif len(verbs) == 1:
            res_tags = " ".join(verbs[0]['tags'])
        else:
            res_tags = ""
        # Always a tuple, so every record has the same shape for tabulation.
        incorrect_sentences.append((i, " ".join(entry['words']), res_tags, list1[i]))

    return error_rate, ratio_correct, incorrect_sentences


In [39]:
def run_and_eval(dict_list, model):
    """
    Run a predictor over every test set, print its scores, and collect its mistakes.

    For each test set the batch predictions are reduced to one tag sequence per
    sentence, scored against the expected 'tags' with compare_lists, and the
    resulting error rate and correct ratio are printed.

    Args:
    - dict_list (list): Test sets; each one is a list of instances that carry a
      'tags' entry and is passed as-is to model.predict_batch_json.
    - model: Any object exposing predict_batch_json whose results contain a
      'verbs' list of frames with 'tags' (here: the predictors loaded above).

    Returns:
    - dfs (List[pandas.DataFrame]): One DataFrame per test set holding the
      misclassified sentences, with columns index, sentence, predicted, expected.
    """
    # Number of predicted verb frames -> position of the frame whose tags are used.
    # Any other frame count yields an empty prediction.
    frame_for_count = {1: 0, 2: 1}

    dfs = []
    for i, test_set in enumerate(dict_list):
        predictions = model.predict_batch_json(test_set)
        gold_tags = [instance['tags'] for instance in test_set]

        predicted_tags = []
        for prediction in predictions:
            frame = frame_for_count.get(len(prediction['verbs']))
            predicted_tags.append([] if frame is None else prediction['verbs'][frame]['tags'])

        error_rate, ratio_correct, incorrect_sentences = compare_lists(gold_tags, predicted_tags, predictions)
        mistakes = pd.DataFrame(incorrect_sentences, columns=['index', 'sentence', 'predicted', 'expected'])
        dfs.append(mistakes)

        print(f"Test {i+1}")
        print("Error rate:", error_rate)
        print("Ratio of correctly classified items:", ratio_correct)
        print("\n")
    return dfs
        

In [40]:
# Test suites, in the order they appear in dict_list:
#   1. Simple sentence with one predicate
#   2. Simple sentence, passive voice
#   3. Instrument / location after "by" in passive voice
#   4. Different words in the same context
#   5. Impersonal verbs
#   6. Robustness
# Each model is evaluated on all six suites; the per-suite error tables are kept.
print("############### BERT ###################")
dfs_bert = run_and_eval(dict_list=dict_list, model=bert)

print("############### BiLSTM ###################")
dfs_bilstm = run_and_eval(dict_list=dict_list, model=bilstm)

############### BERT ###################
Test 1
Error rate: 0.0
Ratio of correctly classified items: 1.0


Test 2
Error rate: 0.0
Ratio of correctly classified items: 1.0


Test 3
Error rate: 0.48717948717948717
Ratio of correctly classified items: 0.5128205128205128


Test 4
Error rate: 0.01
Ratio of correctly classified items: 0.99


Test 5
Error rate: 0.98
Ratio of correctly classified items: 0.02


Test 6
Error rate: 0.4891304347826087
Ratio of correctly classified items: 0.5108695652173914


############### BiLSTM ###################
Test 1
Error rate: 0.0
Ratio of correctly classified items: 1.0


Test 2
Error rate: 0.0
Ratio of correctly classified items: 1.0


Test 3
Error rate: 0.8717948717948718
Ratio of correctly classified items: 0.1282051282051282


Test 4
Error rate: 0.04
Ratio of correctly classified items: 0.96


Test 5
Error rate: 0.96
Ratio of correctly classified items: 0.04


Test 6
Error rate: 0.5760869565217391
Ratio of correctly classified items: 0.42391304347826

In [41]:
# Remove pandas' row and column display limits so frames are not truncated.
for option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option, None)

In [54]:
def _tags_as_text(column):
    """Stringify each entry and strip list punctuation ([ ] , '), leaving space-separated tags."""
    text = column.astype(str)
    for char in ("[", "]", ",", "'"):
        # regex=False: replace the literal character. Without it the meaning of
        # "[" depends on the pandas version's default for `regex`.
        text = text.str.replace(char, "", regex=False)
    return text


# Put the 'expected' column of every error table into the same plain-text form.
for df in [*dfs_bert, *dfs_bilstm]:
    df['expected'] = _tags_as_text(df['expected'])

  df['expected'] = df['expected'].astype(str).str.replace('[','').str.replace(']','').str.replace(',','').str.replace("'", "")
  df['expected'] = df['expected'].astype(str).str.replace('[','').str.replace(']','').str.replace(',','').str.replace("'", "")


In [None]:
# Emit the BiLSTM error table for test 6 (position 5 in the list) as LaTeX.
robustness_errors = dfs_bilstm[5]
print(robustness_errors.to_latex())

\begin{tabular}{lrlll}
\toprule
{} &  index &                          sentence &                                          predicted &                                         expected \\
\midrule
0  &     14 &                  Toml oves Ruth . &                                B-V B-ARG1 I-ARG1 O &                              B-ARG0 B-V B-ARG1 O \\
1  &     15 &                  Jim lvoes Mark . &                                     O B-V B-ARG2 O &                              B-ARG0 B-V B-ARG1 O \\
2  &     24 &         Ashley is ilked by Mike . &                       B-ARG0 O B-V B-ARG0 I-ARG0 O &                     B-ARG1 O B-V B-ARG0 I-ARG0 O \\
3  &     25 &      Pamela i shated by Heather . &                  B-ARG2 B-ARG0 B-V B-ARG0 I-ARG0 O &                     B-ARG1 O B-V B-ARG0 I-ARG0 O \\
4  &     26 &           Dick is lovedb y Adam . &                  B-ARG1 B-V B-ARG2 I-ARG2 I-ARG2 O &                     B-ARG1 O B-V B-ARG0 I-ARG0 O \\
5  &     29 &        David is

  print(dfs_bilstm[5].to_latex())
