## Import Modules

In [1]:
import re
import json
import spacy

## Load ground_truth (annotated)

In [2]:
# Read the "dataset-nyt-nobel2020-label.json" ground_truth
with open('dataset-nyt-nobel2020-label.json', 'r') as f:
    ground_truth = json.load(f)

## Load dataset

In [3]:
# Read the "dataset-nyt-nobel2020.txt" dataset
dataset = []

with open("dataset-nyt-nobel2020.txt", "r") as f:
    lines = f.readlines()
    
    i = 1
    for line in lines:
        # Ignore new lines
        if line in ["\r", "\n", "\t"]:
            continue
        dataset.append((i, line.strip()))
        i += 1
        
for line in dataset:
    # (line_number, line) format
    print(line)

(1, 'https://www.nytimes.com/article/2020-nobel-prize-winners.html')
(2, '2020 Nobel Prize Winners: Full List')
(3, 'Nobel Prize season begins every October as committees in Sweden and Norway name laureates in a variety of prizes in the sciences, literature and economics, as well as peace work. The announcements started last week with the awarding of the prize in Physiology or Medicine. They wrapped up on Monday, when the Sveriges Riksbank Prize in Economic Sciences in Memory of Alfred Nobel was announced.')
(4, 'The Nobel Prizes most years are presented to recipients in Stockholm and Oslo in December. Because of the coronavirus pandemic, the committees are changing their approaches. Some of the events in Stockholm will be canceled in favor of a digital ceremony for the Nobelists, and medals and diplomas are to be distributed to the recipients? embassies and handed over in their home countries. Recipients may be invited to the award ceremony for 2021, if possible.')
(5, 'The Oslo cerem

## Entity Extractors

In [4]:
nlp = spacy.load('en_core_web_sm')

# Extract prize category entity
def prize_category_extraction(text):
    prize_categories = []
    for ent in nlp(text).ents:
        if ent.label_ == "WORK_OF_ART":
            prize_categories.append(ent.text)
            
    # RULE based by Noun Chunking
    if not prize_categories:
        for chunk in nlp(text).noun_chunks:
            prize_categories.append(str(chunk))
    
    return prize_categories

# Extract winner entity
def winner_extraction(text):
    winners = []
    for ent in nlp(text).ents:
        if ent.label_ == "PERSON":
            winners.append(ent.text)
            
    return winners


# Extract time or date entity
def time_extraction(text):
    times = []
    for ent in nlp(text).ents:
        if ent.label_ == "DATE":
            times.append(ent.text)
            
    return times

# Extract space, place or location entity
def space_extraction(text):
    spaces = []
    for ent in nlp(text).ents:
        if ent.label_ == "GPE":
            spaces.append(ent.text)
            
    return spaces

# Extract event entity
def event_extraction(text):
    # RULE: NOUN/PROPN -> VERB -> NOUN/PROPN
    
    labels = []
    event = []
    doc = nlp(text)
    
    i = 0
    while i < len(doc):
        # Event starts with NOUN/PROPN
        if doc[i].pos_ in ["NOUN", "PROPN"]:
            event.append(doc[i])
            i += 1
            
        # If encounter a VERB, then end the "current" event with a NOUN/PROPN
        # Append the "current" event to labels (which is a collection of events)
        # Then reset the "current" event
        elif doc[i].pos_ == "VERB":
            event.append(doc[i])
            i += 1
            while doc[i].pos_ not in ["NOUN", "PROPN"] and i < len(doc)-1:
                event.append(doc[i])
                i += 1
            event.append(doc[i])
            labels.append(" ".join([str(text) for text in event]))
            event = []
            i += 1
        
        # If there is a running event (len > 0), keep adding tokens
        elif len(event) != 0:
            event.append(doc[i])
            i += 1
        
        # Else keep searching for "event"
        else:
            i += 1
    
    return labels

## Perform Entity Extraction (by lines)

In [5]:
extracted_entities = {}

# Perform entity extraction line by line
for line in dataset:
    extracted_entities[f"{line[0]}"] = {
        "sentence": line[1],
        "entities": {
            # Prize categories
            "prize_categories": prize_category_extraction(line[1]),
            
            # Winners
            "winners": winner_extraction(line[1]),
            
            # Events = <labels, spaces, times>
            "events": {
                "labels": event_extraction(line[1]),
                "spaces": space_extraction(line[1]),
                "times": time_extraction(line[1])
            }
        }
    }

## Evaluate extraction results

<b>Example Calculation:</b> \*[... event]<br>
[The Nobel Prize in <b>Chemistry</b> was jointly awarded on <b>Wednesday</b>]\* to <b>Emmanuelle Charpentier</b> and <b>Jennifer A. Doudna</b> for their work on the development of Crispr-Cas9, a method for genome editing.<br>

<b>List of ground truth entities (all entities concatenated):</b><br>
["The Nobel Prize in Chemistry was jointly awarded on Wednesday", "Chemistry", "Wednesday", "Emmanuelle Charpentier", "Jennifer A. Doudna"]

<b>Sample extraction performance (all entities concatenated):</b><br>
["The Nobel Prize in Chemistry was jointly awarded on Wednesday", "The Nobel Prize in Chemistry", "Wednesday", "Emmanuelle Charpentier", ""Jennifer A. Doudna", "Crispr-Cas9"]

<b>Look for overlap match:</b><br>
True Positive = 5 ["The Nobel Prize in Chemistry was jointly awarded on Wednesday", "Chemistry", "Wednesday", "Emmanuelle Charpentier", "Jennifer A. Doudna"]<br>
False Positive = 1 ["Crispr-Cas9"]<br>
False Negatives = 0

<b>Precision</b> = true positives / (true positives + false positives) = 5 / (5 + 1) = 0.83 <br>
<b>Recall</b> = true positives / (true positives + false negatives) = 5 / (5 + 0) = 1 <br>
<b>F1</b> = 2 * (precision * recall) / (precision + recall) = 2 * (0.83 * 1) / (0.83 + 1) = 0.91

### Evaluator

In [6]:
def evaluate(ground_truth, extracted_entities):
    # tp: true positive
    # fp: false positive
    # fn: false negative
    
    tp, fp, fn = 0, 0, 0
    for entity in extracted_entities:
        if entity in ground_truth:
            tp += 1
        else:
            fp += 1
    for entity in ground_truth:
        if entity not in extracted_entities:
            fn += 1
    
    precision = tp / (tp + fp + 0.0001)
    recall = tp / (tp + fn + 0.0001)
    f1 = (2 * (precision * recall)) / (precision + recall + 0.0001)
    
    return round(precision, 3), round(recall, 3), round(f1, 3)

### Evaluate extraction outputs against ground truth

In [7]:
precisions = []
recalls = []
f1s = []

# Iterate over each line and evaluate the extraction (per line basis)
for i in range(len(extracted_entities)):
    
    # Retrieve ground_truth entities
    list_of_ground_truth_entities = []
    if ground_truth[f"{i+1}"]["entities"]:
        list_of_ground_truth_entities += ground_truth[f"{i+1}"]["entities"]["prize_categories"]
        list_of_ground_truth_entities += ground_truth[f"{i+1}"]["entities"]["winners"]
        
        # If there is any event
        if ground_truth[f"{i+1}"]["entities"]["events"]:
            list_of_ground_truth_entities += ground_truth[f"{i+1}"]["entities"]["events"]["labels"]
            list_of_ground_truth_entities += ground_truth[f"{i+1}"]["entities"]["events"]["spaces"]
            list_of_ground_truth_entities += ground_truth[f"{i+1}"]["entities"]["events"]["times"]
    
    # Retrieve extracted_entities
    list_of_extracted_entities = []
    if extracted_entities[f"{i+1}"]["entities"]:
        list_of_extracted_entities += extracted_entities[f"{i+1}"]["entities"]["prize_categories"]
        list_of_extracted_entities += extracted_entities[f"{i+1}"]["entities"]["winners"]
        
        # If there is any event
        if extracted_entities[f"{i+1}"]["entities"]["events"]:
            list_of_extracted_entities += extracted_entities[f"{i+1}"]["entities"]["events"]["labels"]
            list_of_extracted_entities += extracted_entities[f"{i+1}"]["entities"]["events"]["spaces"]
            list_of_extracted_entities += extracted_entities[f"{i+1}"]["entities"]["events"]["times"]
    
    if not list_of_ground_truth_entities and not list_of_extracted_entities:
        continue
        
    precision, recall, f1 = evaluate(list_of_ground_truth_entities, list_of_extracted_entities)
    
    if precision: precisions.append(precision)
    if recall: recalls.append(recall)
    if f1: f1s.append(f1)

print("=== Overall Extraction Performance ===")
print("Precision:", round(sum(precisions) / len(precisions), 3))
print("Recall:", round(sum(recalls) / len(recalls), 3))
print("F1:", round(sum(f1s) / len(f1s), 3))

# Personal Comment: The extraction performance is mostly affected by EVENT entity-extraction since it is using 
# a primitive rule to extract an event: NOUN/PROPN -> VERB -> NOUN/PROPN

=== Overall Extraction Performance ===
Precision: 0.642
Recall: 0.705
F1: 0.665


## Appendix

### Actual extraction output (refer to dataset-nyt-nobel2020-label.json for the corresponding ground truth)

In [8]:
# Display actual extraction output
for key, value in extracted_entities.items():
    print(f"Line {key}:", value["sentence"], "\n")
    print("--- Entities ---")
    print("=> Prize Categories:", value["entities"]["prize_categories"])
    print("=> Winners:", value["entities"]["winners"])
    print("=> Events:")
    print("Label", value["entities"]["events"]["labels"])
    print("Space", value["entities"]["events"]["spaces"])
    print("Time", value["entities"]["events"]["times"])
    print("\n")

Line 1: https://www.nytimes.com/article/2020-nobel-prize-winners.html 

--- Entities ---
=> Prize Categories: []
=> Winners: []
=> Events:
Label []
Space []
Time []


Line 2: 2020 Nobel Prize Winners: Full List 

--- Entities ---
=> Prize Categories: ['Nobel Prize Winners']
=> Winners: []
=> Events:
Label []
Space []
Time []


Line 3: Nobel Prize season begins every October as committees in Sweden and Norway name laureates in a variety of prizes in the sciences, literature and economics, as well as peace work. The announcements started last week with the awarding of the prize in Physiology or Medicine. They wrapped up on Monday, when the Sveriges Riksbank Prize in Economic Sciences in Memory of Alfred Nobel was announced. 

--- Entities ---
=> Prize Categories: ['Nobel Prize', 'Physiology or Medicine']
=> Winners: ['Alfred Nobel']
=> Events:
Label ['Nobel Prize season begins every October', 'committees in Sweden and Norway name laureates in a variety', 'prizes in the sciences , literat

## Quiz 3 Entity Extraction END