## Scoring Pipeline

In [107]:
from preprocessing import parse_txt_file_complex

In [108]:
def match_score(g, p):
    """
    Input: a gold tuple g and a predicted tuple p
    Output: the match score between g and p

    (A side-effect of this scoring method:
    If the system has correctly identified a piece of information but has not
    put the words in the correct arguments of the extracted tuple, those
    misplaced words will not be considered for the score)
    """
    score = 0
    
    # Checking that there is at least (one shared word in each of the first three parts
    if all(g[part] and p[part] and (set(g[part]) & set(p[part])) for part in ['arg1', 'rel', 'arg2']):
        # Looping through the first three parts in g and p
        for part in ['arg1', 'rel', 'arg2']:
            if g[part] and p[part]:
                # Computing the overlap of the tokens in the part of g and p
                overlap = len(set(g[part]).intersection(set(p[part])))
                score += overlap

    # Normalizing the score by dividing by the total number of tokens in g and p
    g_len = sum(len(arg) for arg in list(g.values())[:3] if arg)
    p_len = sum(len(arg) for arg in list(p.values())[:3] if arg)
    score /= (g_len + p_len)

    return score

In [109]:
def exact_match(G, P):
    # Check for exact matches 
    num_exact_matches = 0
    exact_matches = []

    if P:
        for g in G:
            for p in P:
                exact_match = True if all(g[part] == p[part] for part in g.keys()) else False
                if exact_match:
                    exact_matches.append([g, p])
                    num_exact_matches += 1
    
    return num_exact_matches, exact_matches

In [110]:
def find_best_match(g, P):
    """
    Input: a gold tuple g and a list of predicted tuples P
    Output: the predicted tuple p_best in P that best matches g
    """
    best_score = 0
    p_best = None

    # Looping through each predicted tuple in P
    if P:
        for p in P:
            score = match_score(g, p)
            if score > best_score:
                best_score = score
                p_best = p

    return p_best

In [111]:
def match(G, P):
    """
    Input: a list of gold tuples G and a list of predicted tuples P
    Output: a list of matching pairs M, and the lists of unmatched gold tuples UG and unmatched predicted tuples UP
    """
    M = []
    UG = []
    UP = []

    # Looping through each gold tuple in G
    for g in G:
        # Finding the tuple in the predicted tuples P that best matches g
        p_best = find_best_match(g, P)

        if p_best: # and score > threshold:
            M.append((g, p_best))
            P.remove(p_best)
        else:
            UG.append(g)

    # Adding the remaining tuples in P to UP
    UP = P

    return M, UG, UP

In [112]:
def precision_sys(matching_pairs, unmatched_predictions):
    """
    Input:
    - matching_pairs: a list of pairs of matching gold and predicted tuples
    - unmatched_predictions: a list of prediction tuples not found in the reference

    Output: full system precision of predicted and reference tuples at the token level
    (precision: the proportion of extracted words that are found in the reference)
    """
    numerator = 0
    denominator = 0

    for (gold, pred) in matching_pairs:
        pred_args = pred.keys()
        # Summing the length of the intersection between matching predicted and gold tuples and adding the value to the precision numerator
        numerator += sum(len(set(pred[arg]) & set(gold[arg])) for arg in pred_args if pred[arg] and gold[arg])

    if matching_pairs:
        _, matched_predictions = zip(*matching_pairs)
        all_predictions = list(matched_predictions) + unmatched_predictions
    elif unmatched_predictions:
        all_predictions = unmatched_predictions
    else:
        all_predictions = []

    for pred in all_predictions:
        pred_args = pred.keys()
        # Summing the lengths of all prediction tuples
        denominator += sum(len(pred[arg]) for arg in pred_args if pred[arg])

    try:
        precision_sys = numerator / denominator
    except ZeroDivisionError:
        precision_sys = 0

    return precision_sys

In [113]:
def recall_sys(matching_pairs, unmatched_references):
    """
    Input:
    - matching_pairs: a list of pairs of matching gold and predicted tuples
    - unmatched_references: a list of reference tuples not found in the predictions

    Output: full system recall of predicted and reference tuples at the token level
    (recall: the proportion of reference words found in the systems’ predictions)
    """
    numerator = 0
    denominator = 0

    for (gold, pred) in matching_pairs:
        gold_args = gold.keys()
        # Summing the length of the intersection between matching predicted and gold tuples and adding the value to the recall numerator
        numerator += sum(len(set(pred[arg]) & set(gold[arg])) for arg in gold_args if pred[arg] and gold[arg])

    if matching_pairs:
        matched_references, _ = zip(*matching_pairs)
        all_references = list(matched_references) + unmatched_references
    else:
        all_references = unmatched_references

    for gold in all_references:
        gold_args = gold.keys()
        # Summing the lengths of all reference tuples
        denominator += sum(len(gold[arg]) for arg in gold_args if gold[arg])

    try:
        recall_sys = numerator / denominator
    except ZeroDivisionError:
        recall_sys = 0

    return recall_sys

In [114]:
def F1_sys(precision, recall):
    try:
        F1_sys = (2 * precision * recall) / (precision + recall)
    except ZeroDivisionError:
        F1_sys = 0

    return F1_sys

In [115]:
def scorer(ref_data, pred_data):
    """
    Input: a reference dataset G and an OIE system Tsys
    Output: the precision, recall, and F1 scores of Tsys on G
    """
    pair_matches = 0
    total_num_exact_matches = 0
    total_exact_matches = []

    for sentence in ref_data:
        gold_tuples = ref_data[sentence]
        pred_tuples = pred_data.get(sentence, None)

        # Move to the next sentence if the system has not predicted anything for this sentence
        if not pred_tuples:
            precision = 0
            recall = 0
            F1 = 0

        # Finding the matching pairs and the unmatched tuples using the matching function
        matching_pairs, unmatched_gold, unmatched_pred = match(gold_tuples, pred_tuples)
        pair_matches += len(matching_pairs)

        num_exact_matches, exact_matching_pairs = exact_match(gold_tuples, pred_tuples)
        total_num_exact_matches += num_exact_matches
        total_exact_matches.extend(exact_matching_pairs)

        precision = precision_sys(matching_pairs, unmatched_pred)
        recall = recall_sys(matching_pairs, unmatched_gold)
        F1 = F1_sys(precision, recall)

    return precision, recall, F1, pair_matches, total_num_exact_matches, total_exact_matches


## Running the pipeline on Stanford Open IE and ReVerb tools

In [116]:
references = parse_txt_file_complex('IE_AKGC_GOLD.txt')
reverb_predictions = parse_txt_file_complex('results/Reverb_annotations.txt')
stanford_predictions = parse_txt_file_complex('results/stanford-openie-output.txt')

In [117]:
reverb_precision, reverb_recall, reverb_f1, reverb_matches, reverb_exact_matches, reverb_total_exact_matches = scorer(references, reverb_predictions)
print('ReVerb:')
print(f'Precision: {reverb_precision}, Recall: {reverb_recall}, F1: {reverb_f1}, Matches: {reverb_matches}, Exact Matches: {reverb_exact_matches}')

ReVerb:
Precision: 0.5789473684210527, Recall: 0.34375, F1: 0.4313725490196078, Matches: 51, Exact Matches: 7


In [118]:
stanford_precision, stanford_recall, stanford_f1, stanford_matches, stanford_exact_matches, stanford_total_exact_matches = scorer(references, stanford_predictions)
print('Stanford:')
print(f'Precision: {stanford_precision}, Recall: {stanford_recall}, F1: {stanford_f1}, Matches: {stanford_matches}, Exact Matches: {stanford_exact_matches}')

Stanford:
Precision: 0.328125, Recall: 0.328125, F1: 0.328125, Matches: 60, Exact Matches: 0
