In [44]:
from preprocessing import parse_txt_file_complex

In [45]:
"""
expected tuple format:

t = {'arg1': [arg1 split into words],
     'rel': [rel split into words],
     'arg2': [arg2 split into words],
     ...}
"""

"\nexpected tuple format:\n\nt = {'arg1': [arg1 split into words],\n     'rel': [rel split into words],\n     'arg2': [arg2 split into words],\n     ...}\n"

In [46]:
def match_score(g, p):
  """
  Input: a gold tuple g and a predicted tuple p
  Output: the match score between g and p

  (A side-effect of this scoring method:
  If the system has correctly identified a piece of information but has not
  put the words in the correct arguments of the extracted tuple, those
  misplaced words will not be considered for the score)
  """
  score = 0

  # Looping through each part in g and p
  for part in ['arg1', 'rel', 'arg2', 'arg3', 'arg4', 'arg5']:
      # Checking if the part exists in both tuples
      if g[part] and p[part]:
          # Computing the overlap of the tokens in the part of g and p
          overlap = len(set(g[part]).intersection(set(p[part])))
          # Updating the score by adding the overlap
          score += overlap

  # Normalizing the score by dividing by the total number of tokens in g and p
  g_len = sum(len(arg) for arg in g.values() if arg)
  p_len = sum(len(arg) for arg in p.values() if arg)
  score /= (g_len + p_len)

  return score

In [47]:
def find_best_match(g, P):
  """
  Input: a gold tuple g and a list of predicted tuples P
  Output: the predicted tuple p_best in P that best matches g
  """
  best_score = 0
  p_best = None

  # Looping through each predicted tuple in P
  for p in P:
      score = match_score(g, p)
      if score > best_score:
          best_score = score
          p_best = p

  return p_best, best_score

In [48]:
def match(G, P):
  """
  Input: a list of gold tuples G and a list of predicted tuples P
  Output: a list of matching pairs M, and the lists of unmatched gold tuples UG and unmatched predicted tuples UP
  """
  M = []
  UG = []
  UP = []

  # Looping through each gold tuple in G
  for g in G:
      # Finding the tuple in the predicted tuples P that best matches g
      p_best, score = find_best_match(g, P)
      print(score)

      if p_best: # and score > threshold:
          M.append((g, p_best))
          P.remove(p_best)
      else:
          UG.append(g)

  # Adding the remaining tuples in P to UP
  UP = P

  return M, UG, UP


In [49]:
def precision_sys(matching_pairs, unmatched_predictions):
    """
    Input:
    - matching_pairs: a list of pairs of matching gold and predicted tuples
    - unmatched_predictions: a list of prediction tuples not found in the reference

    Output: full system precision of predicted and reference tuples at the token level
    (precision: the proportion of extracted words that are found in the reference)
    """
    numerator = 0
    denominator = 0

    for (gold, pred) in matching_pairs:
        pred_args = pred.keys()
        # Summing the length of the intersection between matching predicted and gold tuples and adding the value to the precision numerator
        numerator += sum(len(set(pred[arg]) & set(gold[arg])) for arg in pred_args)

    _, matched_predictions = zip(*matching_pairs)
    all_predictions = list(matched_predictions) + unmatched_predictions

    for pred in all_predictions:
        pred_args = pred.keys()
        # Summing the lengths of all prediction tuples
        denominator += sum(len(pred[arg]) for arg in pred_args)

    try:
        precision_sys = numerator / denominator
    except ZeroDivisionError:
        precision_sys = 0

    return precision_sys

In [50]:
def recall_sys(matching_pairs, unmatched_references):
    """
    Input:
    - matching_pairs: a list of pairs of matching gold and predicted tuples
    - unmatched_references: a list of reference tuples not found in the predictions

    Output: full system recall of predicted and reference tuples at the token level
    (recall: the proportion of reference words found in the systems’ predictions)
    """
    numerator = 0
    denominator = 0

    for (gold, pred) in matching_pairs:
        gold_args = gold.keys()
        # Summing the length of the intersection between matching predicted and gold tuples and adding the value to the recall numerator
        numerator += sum(len(set(pred[arg]) & set(gold[arg])) for arg in gold_args)

    matched_references, _ = zip(*matching_pairs)
    all_references = list(matched_references) + unmatched_references

    for gold in all_references:
        gold_args = gold.keys()
        # Summing the lengths of all reference tuples
        denominator += sum(len(gold[arg]) for arg in gold_args)

    try:
        recall_sys = numerator / denominator
    except ZeroDivisionError:
        recall_sys = 0

    return recall_sys

In [51]:
def F1_sys(precision, recall):
    try:
        F1_sys = (2 * precision * recall) / (precision + recall)
    except ZeroDivisionError:
        F1_sys = 0

    return F1_sys

In [52]:
def scorer(ref_data, pred_data):
    """
    Input: a reference dataset G and an OIE system Tsys
    Output: the precision, recall, and F1 scores of Tsys on G
    """

    for sentence in ref_data:
        gold_tuples = ref_data[sentence]
        pred_tuples = pred_data[sentence]

        # Finding the matching pairs and the unmatched tuples using the matching function
        matching_pairs, unmatched_gold, unmatched_pred = match(gold_tuples, pred_tuples)
        print(matching_pairs)
        print(unmatched_gold)
        print(unmatched_pred)

        precision = precision_sys(matching_pairs, unmatched_pred)
        recall = recall_sys(matching_pairs, unmatched_gold)
        F1 = F1_sys(precision_sys, recall_sys)

    # # Normalize the overall scores by dividing by the total number of tokens in G
    # precision_sys /= total_tokens(G)
    # recall_sys /= total_tokens(G)
    # F1_sys /= total_tokens(G)

    return precision, recall, F1


In [53]:
'''
# Usable for a single pair of gold and predicted tuples

class Scorer:

    def __init__(self, references, predictions):
        self.references = references
        self.predictions = predictions


    def precision(self, t, g):
        """
        t: one predicted tuple
        g: corresponding gold tuple

        returns: precision of a pair of predicted and reference tuples at the token level
        (precision: the proportion of extracted words that are found in the reference)
        """
        pred_args = t.keys()

        numerator = sum(len(set(t[arg]) & set(g[arg])) for arg in pred_args)

        pred_len = sum(len(t[arg]) for arg in pred_args)  # cf. The length of a tuple is the sum of lengths of its parts

        precision = numerator / pred_len

        return precision


    def recall(self, t, g):
        """
        t: one predicted tuple
        g: corresponding gold tuple

        returns: recall of a pair of predicted and reference tuples at the token level
        (recall: the proportion of reference words found in the systems’ predictions)
        """
        gold_args = g.keys()

        numerator = sum(len(set(t[arg]) & set(g[arg])) for arg in gold_args)

        gold_len = sum(len(g[arg]) for arg in gold_args)

        recall = numerator / gold_len

        return recall


    def score(self):

        assert len(self.predictions) == len(self.references), "There are different numbers of reference and prediction tuples"

        precision_sys = 0
        recall_sys = 0

        for pred, ref in zip(self.predictions, self.references):
            precision_sys += self.precision(pred, ref)
            recall_sys += self.recall(pred, ref)

        try:
          F1_sys = (2 * precision_sys * recall_sys) / (precision_sys + recall_sys)
        except ZeroDivisionError:
          F1_sys = 0.0

        return F1_sys

'''

'\n# Usable for a single pair of gold and predicted tuples\n\nclass Scorer:\n\n    def __init__(self, references, predictions):\n        self.references = references\n        self.predictions = predictions\n\n\n    def precision(self, t, g):\n        """\n        t: one predicted tuple\n        g: corresponding gold tuple\n\n        returns: precision of a pair of predicted and reference tuples at the token level\n        (precision: the proportion of extracted words that are found in the reference)\n        """\n        pred_args = t.keys()\n\n        numerator = sum(len(set(t[arg]) & set(g[arg])) for arg in pred_args)\n\n        pred_len = sum(len(t[arg]) for arg in pred_args)  # cf. The length of a tuple is the sum of lengths of its parts\n\n        precision = numerator / pred_len\n\n        return precision\n\n\n    def recall(self, t, g):\n        """\n        t: one predicted tuple\n        g: corresponding gold tuple\n\n        returns: recall of a pair of predicted and referen

In [54]:
references = parse_txt_file_complex('IE_AKGC_GOLD.txt')
reverb_predictions = parse_txt_file_complex('results/Reverb_annotations.txt')
stanford_predictions = parse_txt_file_complex('results/stanford-openie-output.txt')

In [55]:
stanford_predictions

{1: [{'arg1': 'American professor',
   'rel': 'known for',
   'arg2': 'his work linguistics',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'professor',
   'rel': 'known for',
   'arg2': 'his work',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'professor',
   'rel': 'known for',
   'arg2': 'his work linguistics',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'Chomsky',
   'rel': 'is',
   'arg2': 'American known',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'American professor',
   'rel': 'known for',
   'arg2': 'his work',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'his work',
   'rel': 'is in',
   'arg2': 'linguistics',
   'arg3': None,
   'arg4': None,
   'arg5': None}],
 2: [{'arg1': 'modern linguistics',
   'rel': 'father of is',
   'arg2': 'also figure in philosophy',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'father',
   'rel': 'is',
   'arg2': 'also figure',
   'arg3':

In [56]:
precision, recall, f1 = scorer(references, reverb_predictions)
print('ReVerb')
print(f'Precision: {precision}, Recall: {recall}, F1: {f1}')

0.1485148514851485
0.1346153846153846
0.08333333333333333
0.06329113924050633
0
0
0
0
0
0
0
[({'arg1': 'Chomsky', 'rel': '[refers to]', 'arg2': 'Avram Noam Chomsky', 'arg3': None, 'arg4': None, 'arg5': None}, {'arg1': 'Avram Noam Chomsky', 'rel': 'is', 'arg2': 'an American professor and public intellectual', 'arg3': None, 'arg4': None, 'arg5': None}), ({'arg1': 'Chomsky', 'rel': '[was] born [on]', 'arg2': 'December 7 , 1928', 'arg3': None, 'arg4': None, 'arg5': None}, {'arg1': 'Avram Noam Chomsky', 'rel': 'is', 'arg2': 'an American professor and public intellectual', 'arg3': None, 'arg4': None, 'arg5': None}), ({'arg1': 'Chomsky', 'rel': 'is', 'arg2': '[a] professor', 'arg3': None, 'arg4': None, 'arg5': None}, {'arg1': 'an American professor and public intellectual', 'rel': 'known for', 'arg2': 'his work', 'arg3': None, 'arg4': None, 'arg5': None}), ({'arg1': 'Chomsky', 'rel': 'is', 'arg2': 'American', 'arg3': None, 'arg4': None, 'arg5': None}, {'arg1': 'an American professor and publi

TypeError: 'NoneType' object is not iterable