In [239]:
from preprocessing import parse_txt_file_complex

In [240]:
def match_score(g, p):
    """
    Input: a gold tuple g and a predicted tuple p
    Output: the match score between g and p

    (A side-effect of this scoring method:
    If the system has correctly identified a piece of information but has not
    put the words in the correct arguments of the extracted tuple, those
    misplaced words will not be considered for the score)
    """
    score = 0
    match = True

    # # Checking if there is at least one shared word in each of the first three parts
    # for part in ['arg1', 'rel', 'arg2']:
    #     if g[part] and p[part] and all(set(g[part]) & set(p[part])):
    #         match = True
    
    # Looping through the first three parts in g and p
    for part in ['arg1', 'rel', 'arg2']:
        if g[part] and p[part]:
            # Computing the overlap of the tokens in the part of g and p
            overlap = len(set(g[part]).intersection(set(p[part])))
            score += overlap

    # Normalizing the score by dividing by the total number of tokens in g and p
    # print(list(g.values())[:3])
    # print(list(p.values())[:3])
    g_len = sum(len(arg) for arg in list(g.values())[:3] if arg)
    p_len = sum(len(arg) for arg in list(p.values())[:3] if arg)
    score /= (g_len + p_len)

    return score, match

In [241]:
def match_score(g, p):
    """
    Input: a gold tuple g and a predicted tuple p
    Output: the match score between g and p

    (A side-effect of this scoring method:
    If the system has correctly identified a piece of information but has not
    put the words in the correct arguments of the extracted tuple, those
    misplaced words will not be considered for the score)
    """
    score = 0
    
    # Set of parts to check
    parts_to_check = {'arg1', 'rel', 'arg2'}

    # Using set intersection to find common words in each part
    common_words = {part: set(g[part]) & set(p[part]) for part in parts_to_check if g[part] and p[part]}
    print(common_words)
    
    # Checking if there is at least one shared word in each part
    match = all(common_words.values())
    print(match)

    # Calculating the score based on the total number of common words
    score = sum(len(words) for words in common_words.values())

    # Normalizing the score by dividing by the total number of tokens in g and p
    g_len = sum(len(arg) for arg in list(g.values())[:3] if arg)
    p_len = sum(len(arg) for arg in list(p.values())[:3] if arg)
    score /= (g_len + p_len)

    return score, match


In [242]:
def find_best_match(g, P):
    """
    Input: a gold tuple g and a list of predicted tuples P
    Output: the predicted tuple p_best in P that best matches g
    """
    best_score = 0
    p_best = None
    matches = 0

    # Looping through each predicted tuple in P
    if P:
        for p in P:
            score, match = match_score(g, p)
            if match:
                matches += 1
            if score > best_score:
                best_score = score
                p_best = p

    return p_best, matches

In [243]:
def match(G, P):
    """
    Input: a list of gold tuples G and a list of predicted tuples P
    Output: a list of matching pairs M, and the lists of unmatched gold tuples UG and unmatched predicted tuples UP
    """
    M = []
    UG = []
    UP = []
    all_matches = 0

    # Looping through each gold tuple in G
    for g in G:
        # Finding the tuple in the predicted tuples P that best matches g
        p_best, matches = find_best_match(g, P)

        all_matches += matches

        if p_best: # and score > threshold:
            M.append((g, p_best))
            P.remove(p_best)
        else:
            UG.append(g)

    # Adding the remaining tuples in P to UP
    UP = P

    return M, UG, UP, all_matches


In [244]:
def precision_sys(matching_pairs, unmatched_predictions):
    """
    Input:
    - matching_pairs: a list of pairs of matching gold and predicted tuples
    - unmatched_predictions: a list of prediction tuples not found in the reference

    Output: full system precision of predicted and reference tuples at the token level
    (precision: the proportion of extracted words that are found in the reference)
    """
    numerator = 0
    denominator = 0

    for (gold, pred) in matching_pairs:
        pred_args = pred.keys()
        # Summing the length of the intersection between matching predicted and gold tuples and adding the value to the precision numerator
        numerator += sum(len(set(pred[arg]) & set(gold[arg])) for arg in pred_args if pred[arg] and gold[arg])

    if matching_pairs:
        _, matched_predictions = zip(*matching_pairs)
        all_predictions = list(matched_predictions) + unmatched_predictions
    elif unmatched_predictions:
        all_predictions = unmatched_predictions
    else:
        all_predictions = []

    for pred in all_predictions:
        pred_args = pred.keys()
        # Summing the lengths of all prediction tuples
        denominator += sum(len(pred[arg]) for arg in pred_args if pred[arg])

    try:
        precision_sys = numerator / denominator
    except ZeroDivisionError:
        precision_sys = 0

    return precision_sys

In [245]:
def recall_sys(matching_pairs, unmatched_references):
    """
    Input:
    - matching_pairs: a list of pairs of matching gold and predicted tuples
    - unmatched_references: a list of reference tuples not found in the predictions

    Output: full system recall of predicted and reference tuples at the token level
    (recall: the proportion of reference words found in the systems’ predictions)
    """
    numerator = 0
    denominator = 0

    for (gold, pred) in matching_pairs:
        gold_args = gold.keys()
        # Summing the length of the intersection between matching predicted and gold tuples and adding the value to the recall numerator
        numerator += sum(len(set(pred[arg]) & set(gold[arg])) for arg in gold_args if pred[arg] and gold[arg])

    if matching_pairs:
        matched_references, _ = zip(*matching_pairs)
        all_references = list(matched_references) + unmatched_references
    else:
        all_references = unmatched_references

    for gold in all_references:
        gold_args = gold.keys()
        # Summing the lengths of all reference tuples
        denominator += sum(len(gold[arg]) for arg in gold_args if gold[arg])

    try:
        recall_sys = numerator / denominator
    except ZeroDivisionError:
        recall_sys = 0

    return recall_sys

In [246]:
def F1_sys(precision, recall):
    try:
        F1_sys = (2 * precision * recall) / (precision + recall)
    except ZeroDivisionError:
        F1_sys = 0

    return F1_sys

In [247]:
def scorer(ref_data, pred_data):
    """
    Input: a reference dataset G and an OIE system Tsys
    Output: the precision, recall, and F1 scores of Tsys on G
    """
    all_matches = 0
    pair_matches = 0

    for sentence in ref_data:
        gold_tuples = ref_data[sentence]
        pred_tuples = pred_data.get(sentence, None)

        # Move to the next sentence if the system has not predicted anything for this sentence
        if not pred_tuples:
            precision = 0
            recall = 0
            F1 = 0

        # Finding the matching pairs and the unmatched tuples using the matching function
        matching_pairs, unmatched_gold, unmatched_pred, matches = match(gold_tuples, pred_tuples)

        all_matches += matches
        pair_matches += len(matching_pairs)

        precision = precision_sys(matching_pairs, unmatched_pred)
        recall = recall_sys(matching_pairs, unmatched_gold)
        F1 = F1_sys(precision, recall)

    # # Normalize number of matches
    # all_matches /= len(ref_data) 
    # pair_matches /= len(ref_data)
    print(pair_matches, all_matches)
    return precision, recall, F1, all_matches


In [248]:
'''
# Usable for a single pair of gold and predicted tuples

class Scorer:

    def __init__(self, references, predictions):
        self.references = references
        self.predictions = predictions


    def precision(self, t, g):
        """
        t: one predicted tuple
        g: corresponding gold tuple

        returns: precision of a pair of predicted and reference tuples at the token level
        (precision: the proportion of extracted words that are found in the reference)
        """
        pred_args = t.keys()

        numerator = sum(len(set(t[arg]) & set(g[arg])) for arg in pred_args)

        pred_len = sum(len(t[arg]) for arg in pred_args)  # cf. The length of a tuple is the sum of lengths of its parts

        precision = numerator / pred_len

        return precision


    def recall(self, t, g):
        """
        t: one predicted tuple
        g: corresponding gold tuple

        returns: recall of a pair of predicted and reference tuples at the token level
        (recall: the proportion of reference words found in the systems’ predictions)
        """
        gold_args = g.keys()

        numerator = sum(len(set(t[arg]) & set(g[arg])) for arg in gold_args)

        gold_len = sum(len(g[arg]) for arg in gold_args)

        recall = numerator / gold_len

        return recall


    def score(self):

        assert len(self.predictions) == len(self.references), "There are different numbers of reference and prediction tuples"

        precision_sys = 0
        recall_sys = 0

        for pred, ref in zip(self.predictions, self.references):
            precision_sys += self.precision(pred, ref)
            recall_sys += self.recall(pred, ref)

        try:
          F1_sys = (2 * precision_sys * recall_sys) / (precision_sys + recall_sys)
        except ZeroDivisionError:
          F1_sys = 0.0

        return F1_sys

'''

'\n# Usable for a single pair of gold and predicted tuples\n\nclass Scorer:\n\n    def __init__(self, references, predictions):\n        self.references = references\n        self.predictions = predictions\n\n\n    def precision(self, t, g):\n        """\n        t: one predicted tuple\n        g: corresponding gold tuple\n\n        returns: precision of a pair of predicted and reference tuples at the token level\n        (precision: the proportion of extracted words that are found in the reference)\n        """\n        pred_args = t.keys()\n\n        numerator = sum(len(set(t[arg]) & set(g[arg])) for arg in pred_args)\n\n        pred_len = sum(len(t[arg]) for arg in pred_args)  # cf. The length of a tuple is the sum of lengths of its parts\n\n        precision = numerator / pred_len\n\n        return precision\n\n\n    def recall(self, t, g):\n        """\n        t: one predicted tuple\n        g: corresponding gold tuple\n\n        returns: recall of a pair of predicted and referen

In [249]:
references = parse_txt_file_complex('IE_AKGC_GOLD.txt')
reverb_predictions = parse_txt_file_complex('results/Reverb_annotations.txt')
stanford_predictions = parse_txt_file_complex('results/stanford-openie-output.txt')

In [252]:
references

{1: [{'arg1': 'Chomsky',
   'rel': '',
   'arg2': 'Avram Noam Chomsky',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'Chomsky',
   'rel': 'born',
   'arg2': 'December 7 , 1928',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'Chomsky',
   'rel': 'is',
   'arg2': 'professor',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'Chomsky',
   'rel': 'is',
   'arg2': 'American',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'Chomsky',
   'rel': 'is',
   'arg2': 'public intellectual',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'Chomsky',
   'rel': 'is known for',
   'arg2': 'Chomsky’s work in linguistics',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'Chomsky',
   'rel': 'work in',
   'arg2': 'linguistics',
   'arg3': None,
   'arg4': None,
   'arg5': None},
  {'arg1': 'Chomsky',
   'rel': 'is known for',
   'arg2': 'his work in political activism',
   'arg3': None,
   'arg4': None,
   'arg5'

In [250]:
reverb_precision, reverb_recall, reverb_f1, reverb_matches = scorer(references, reverb_predictions)
print('ReVerb:')
print(f'Precision: {reverb_precision}, Recall: {reverb_recall}, F1: {reverb_f1}, Matches: {reverb_matches}')

{'arg1': {'o', 'm', 'C', 's', 'h', 'y', 'k'}, 'arg2': {'o', 'm', 'A', 's', ' ', 'a', 'r'}}
True
{'arg1': {'o', 'm', 's'}, 'arg2': {'o', 's', 'h', ' ', 'k', 'r'}}
True
{'arg1': {'o', 'm', 'C', 's', 'h', 'y', 'k'}, 'arg2': {'o', 'm', 'A', 's', ' ', 'a', 'r'}}
True
{'arg1': {'o', 'm', 's'}, 'arg2': {'o', 's', 'h', ' ', 'k', 'r'}}
True
{'arg1': {'o', 'm', 's'}, 'arg2': {' ', 'r'}, 'rel': {'o', 'r', 'n'}}
True
{'arg1': {'o', 'm', 'C', 's', 'h', 'y', 'k'}, 'arg2': {'e', 'm', 'c', 'b', ' ', 'r'}, 'rel': set()}
False
{'arg1': {'o', 'm', 's'}, 'arg2': {' ', 'r'}, 'rel': {'o', 'r', 'n'}}
True
{'arg1': {'o', 'm', 's'}, 'arg2': {'s', 'o', 'r'}, 'rel': set()}
False
{'arg1': {'o', 'm', 's'}, 'arg2': {'s', 'o', 'r'}, 'rel': set()}
False
{'arg1': {'o', 'm', 's'}, 'arg2': {'i', 'r'}, 'rel': set()}
False
{'arg1': {'o', 'm', 'C', 's', 'h', 'y', 'k'}, 'arg2': {'i', 'o', 'm', 'r', 'f', 'g', ' ', 'u', 'a', 'e'}, 'rel': {'l', 'a'}}
True
{'arg1': {'o', 'm', 'C', 's', 'h', 'y', 'k'}, 'arg2': {'i', 'o', 't', 's

In [251]:
stanford_precision, stanford_recall, stanford_f1, stanford_matches = scorer(references, stanford_predictions)
print('Stanford:')
print(f'Precision: {stanford_precision}, Recall: {stanford_recall}, F1: {stanford_f1}, Matches: {stanford_matches}')

{'arg1': {'o', 'm', 's'}, 'arg2': {'o', 's', 'h', ' ', 'k', 'r'}}
True
{'arg1': {'o', 's'}, 'arg2': {'o', 's', 'h', ' ', 'k', 'r'}}
True
{'arg1': {'o', 's'}, 'arg2': {'o', 's', 'h', ' ', 'k', 'r'}}
True
{'arg1': {'o', 'm', 'C', 's', 'h', 'y', 'k'}, 'arg2': {'o', 'm', 'A', ' ', 'a', 'k', 'r'}}
True
{'arg1': {'o', 'm', 's'}, 'arg2': {'o', 's', 'h', ' ', 'k', 'r'}}
True
{'arg1': {'o', 'k', 'h', 's'}, 'arg2': {'s'}}
True
{'arg1': {'o', 'm', 's'}, 'arg2': {' ', 'c', 'r'}, 'rel': {'o', 'r', 'n'}}
True
{'arg1': {'o', 's'}, 'arg2': {' ', 'r'}, 'rel': {'o', 'r', 'n'}}
True
{'arg1': {'o', 's'}, 'arg2': {' ', 'c', 'r'}, 'rel': {'o', 'r', 'n'}}
True
{'arg1': {'o', 'm', 's'}, 'arg2': {' ', 'r'}, 'rel': {'o', 'r', 'n'}}
True
{'arg1': {'o', 'k', 'h', 's'}, 'arg2': {'c'}, 'rel': {'n'}}
True
{'arg1': {'o', 'm', 's'}, 'arg2': {'s', 'o', 'r'}, 'rel': set()}
False
{'arg1': {'o', 's'}, 'arg2': {'s', 'o', 'r'}, 'rel': set()}
False
{'arg1': {'o', 'm', 's'}, 'arg2': {'s', 'o', 'r'}, 'rel': set()}
False
{'arg1