# Coreference Resolution

In [2]:
###########################################################
# IMPORTANT NOTE: 
# In this notebook we use an experimental spaCy model 
# named "en_coreference_web_trf", which is not part of the 
# official library. At the current date, the official 
# documentation is unaccurate regarding the version used
# in this tutorial. We can expect an update on the spaCy's
# documentation: https://spacy.io/api/coref
###########################################################

## #1. Setup development environment

###Update & import Python modules

In [3]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [4]:
# install and download spaCy related modules and dependencies
!pip install "spacy~=3.4.4"
!python -m spacy download en_core_web_lg
!pip install https://github.com/explosion/spacy-experimental/releases/download/v0.6.0/en_coreference_web_trf-3.4.0a0-py3-none-any.whl

# spaCy
import spacy
from spacy.language import Language
from spacy.tokens import DocBin, Span
from spacy.matcher import PhraseMatcher
from spacy.kb import KnowledgeBase
from spacy.training import Example
from spacy.ml.models import load_kb
from spacy.util import minibatch, compounding

# Google Drive
from google.colab import drive

# Firebase/Firestore
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# general Python modules
import json
import datetime
import requests
import csv
import random
from collections import Counter
from pprint import pprint

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2023-02-13 22:14:56.918541: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-13 22:15:00.102658: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-02-13 22:15:00.102862: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/

In [5]:
# list all Python packages installed 
!pip list

Package                       Version
----------------------------- ----------------------
absl-py                       1.4.0
aeppl                         0.0.33
aesara                        2.7.9
aiohttp                       3.8.3
aiosignal                     1.3.1
alabaster                     0.7.13
albumentations                1.2.1
altair                        4.2.2
appdirs                       1.4.4
arviz                         0.12.1
astor                         0.8.1
astropy                       4.3.1
astunparse                    1.6.3
async-timeout                 4.0.2
atari-py                      0.2.9
atomicwrites                  1.4.1
attrs                         22.2.0
audioread                     3.0.0
autograd                      1.5
Babel                         2.11.0
backcall                      0.2.0
beautifulsoup4                4.6.3
bleach                        6.0.0
blis                          0.7.9
bokeh                         2.3.3
branca

### Get access to Firebase and Drive 

In [6]:
# remount drive, forced if needed
drive.mount("/content/drive/", force_remount = True)
print("Stablished access to Google Drive")

# initialize Drive path
DRIVE_PATH = "/content/drive/My Drive"

Mounted at /content/drive/
Stablished access to Google Drive


### Retrieve main data structures

In [7]:
list_files = !ls /content/drive/MyDrive/IE/ie_course_2022_team03/retrieved_data/scrapped_data

## #2. Resolve references


### Create pipeline

In [8]:
nlp = spacy.load("en_coreference_web_trf")

### Coreference resolution

In [9]:
# main_text_woc_container = []
texts = []
for file in list_files:
  print("Reading from file", file)
  with open(DRIVE_PATH + f"/IE/ie_course_2022_team03/retrieved_data/scrapped_data/{file}", encoding='utf-8') as f:
    file_data = json.load(f)
    texts.append(file_data["text"])

Reading from file aboutvisitberlindejobs.json
Reading from file aboutvisitberlinde.json
Reading from file aboutvisitberlindeteammarktmanagement.json
Reading from file aboutvisitberlindeunsermedienservicefuersie.json
Reading from file aboutvisitberlindewerbekunden.json
Reading from file absolutemunichcomaboutabsolutemunich.json
Reading from file absolutemunichcomaltepinakothekmunich.json
Reading from file absolutemunichcomalteuttingmunich.json
Reading from file absolutemunichcomauthorabsolutemunichgmailcom.json
Reading from file absolutemunichcomauthorfred.json
Reading from file absolutemunichcombavariancuisineguide.json
Reading from file absolutemunichcombavarianforestnationalpark.json
Reading from file absolutemunichcombavarianstatelibrary.json
Reading from file absolutemunichcombavariaparkmunich.json
Reading from file absolutemunichcombavariastatuemunich.json
Reading from file absolutemunichcomblog.json
Reading from file absolutemunichcomblogpage2.json
Reading from file absolutemunic

In [10]:
for text in texts[:10]:
# process paragraphs individually in a stream (multi-thread)
  for doc in nlp.pipe(text, batch_size=50):
    spans = doc.spans  # coreference
    pprint(doc.text)
    pprint(spans)
    print()

'Bitte aktivieren Sie Javascript in Ihrem Browser.'
{}

('Sie möchten für eine der faszinierendsten und erfolgreichsten Städte '
 'arbeiten? Ob Beratung und Verkauf in einer Berlin Tourist Info, weltweites '
 'Marketing oder als Vertriebsprofi – die Arbeitsgebiete bei uns sind sehr '
 'vielfältig. Hier finden Sie alles rund um Jobs, Praktika undAusbildung bei '
 'visitBerlin.')
{'coref_clusters_1': [Sie möchten, Sie alles]}

('\n'
 '    \n'
 '                                \n'
 '                                                                                                  \n'
 '  Sie werben für Berlin\n'
 '\n'
 '  ')
{}

('\n'
 '    \n'
 '                                \n'
 '                                                                                                  \n'
 '  Gutes Arbeitsklima\n'
 '\n'
 '  ')
{}

('\n'
 '    \n'
 '                                \n'
 '                                                                                                  \n'
 '  Mode

## #3. Utils (optional) 

### Simple coreference resolution example

In [11]:
doc = nlp("Berlin is the Capital of Germany. It's one of the most popular city in germany.")
print(doc.spans)

{'coref_clusters_1': [Berlin is, It's]}


### Analize the pipeline

In [12]:
# see pipeline components
print(nlp.pipe_names)

# analize pipeline
pprint(nlp.analyze_pipes(pretty=True))

['sentencizer', 'transformer', 'coref', 'span_resolver', 'span_cleaner']
[1m

#   Component       Assigns               Requires    Scores          Retokenizes
-   -------------   -------------------   ---------   -------------   -----------
0   sentencizer     token.is_sent_start               sents_f         False      
                    doc.sents                         sents_p                    
                                                      sents_r                    
                                                                                 
1   transformer     doc._.trf_data                                    False      
                                                                                 
2   coref           doc.spans             doc.spans   coref_f         False      
                                                      coref_p                    
                                                      coref_r                    
                   

# Entity linking with Wikidata (NERD/EL)

## #1. Setup development environment

### Update & import Python modules

In [13]:
# input files
ents_file = DRIVE_PATH + "/IE/ie_course_2022_team03/assets/entities.json"
annot_text_file = DRIVE_PATH + "/IE/ie_course_2022_team03/assets/annotated_text.json"

# output files
kb_dir = DRIVE_PATH + "/IE/ie_course_2022_team03/output/ml_el/kb"
nlp_dir = DRIVE_PATH + "/IE/ie_course_2022_team03/output/ml_el/my_nlp"
train_corpus = DRIVE_PATH + "/IE/ie_course_2022_team03/output/ml_el/train_corpus"
test_corpus = DRIVE_PATH + "/IE/ie_course_2022_team03/output/ml_el/test_corpus"
nlp_el_dir = DRIVE_PATH + "/IE/ie_course_2022_team03/output/ml_el/my_el_nlp"

In [14]:
def load_entities():
  names = dict()
  descriptions = dict()
  # read and iterate entities and split it into two dicts
  with open(ents_file, newline="") as f:
    entities = json.load(f)
    # print(f"Retrieved entities")
    for row in entities:
      qid = row['qid']
      name = row['name']
      desc = row['description']
      names[qid] = name
      descriptions[qid] = desc
  # return "names" {id,names} and "descriptions" {id,descriptions}
  return names, descriptions

name_dict, desc_dict = load_entities()

In [15]:
""" Step 1: create the Knowledge Base in NLP pipeline and write it to file """

# Helper function to read in the pre-defined entities we want to disambiguate to
def load_entities():
  names = dict()
  descriptions = dict()
  # read and iterate entities and split it into two dicts
  with open(ents_file, newline="") as f:
    entities = json.load(f)
    # print(f"Retrieved entities")
    for row in entities:
      if row['qid'] not in ["Q183", "Q1726", "Q64"]:
        continue
      qid = row['qid']
      name = row['name']
      desc = row['description']
      names[qid] = name
      descriptions[qid] = desc
  # return "names" {id,names} and "descriptions" {id,descriptions}
  return names, descriptions

# First: create a simple model with an NER component
# To ensure we get the correct entities for this demo, add a simple entity_ruler as well.
nlp = spacy.load("en_core_web_lg", exclude="parser, tagger, lemmatizer")
ruler = nlp.add_pipe("entity_ruler", after="ner")
patterns = [{"label": "CITY", "pattern": [{"LOWER": "berlin"}]}]
ruler.add_patterns(patterns)
nlp.add_pipe("sentencizer", first=True)

name_dict, desc_dict = load_entities()

In [16]:
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

for qid, desc in desc_dict.items():
  desc_doc = nlp(desc)
  desc_enc = desc_doc.vector
  # Set arbitrary value for frequency
  kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)

for qid, name in name_dict.items():
  # set 100% prior probability P(entity|alias) for each unique name
  kb.add_alias(alias=name, entities=[qid], probabilities=[1])

qids = name_dict.keys()
probs = [0.3 for qid in qids]
# ensure that sum([probs]) <= 1 when setting aliases
kb.add_alias(alias="Berlin", entities=qids, probabilities=probs)  #

print(f"Entities in the KB: {kb.get_entity_strings()}")
print(f"Aliases in the KB: {kb.get_alias_strings()}")
print()

# store knowledgebase and NLP pipeline
kb.to_disk(kb_dir)
print(f"Saved KB in: {kb_dir}")
nlp.to_disk(nlp_dir)
print(f"Saved NLP pipeline in: {nlp_dir}")

  kb.add_alias(alias="Berlin", entities=qids, probabilities=probs)  #


Entities in the KB: ['Q64', 'Q1726', 'Q183']
Aliases in the KB: ['Berlin', 'Germany', 'Munich']

Saved KB in: /content/drive/My Drive/IE/ie_course_2022_team03/output/ml_el/kb
Saved NLP pipeline in: /content/drive/My Drive/IE/ie_course_2022_team03/output/ml_el/my_nlp


In [17]:
nlp = spacy.load(nlp_dir, exclude="parser, tagger")

In [18]:
""" Step 2: Once we have done the manual annotations, create corpora in spaCy format. """

##############################################################
# TODO: create annotated dataset for training before this step
##############################################################
docs = []
gold_ids = []
COUNT = 0

with open(annot_text_file, newline="") as f:
  example = json.load(f)
  anotated_text = example['annotations']
  for line in anotated_text:
    sentence = line[0]

    QID = line[1]["entities"][0][2]
    doc = nlp.make_doc(sentence)
    gold_ids.append(QID)
    # we assume only 1 annotated span per sentence, and only 1 KB ID per span
    if QID in ['Q64', 'Q1726']:
      lable = "CITY"
    else:
      lable = "COUNTRY"
    entity = doc.char_span(
      line[1]["entities"][0][0],
      line[1]["entities"][0][1],
      label=lable,
      kb_id=QID,
    )
    doc.ents = [entity]
    for i, t in enumerate(doc):
      doc[i].is_sent_start = i == 0
    docs.append(doc)

print("Statistics of manually annotated data:")
print(Counter(gold_ids))
print()

train_docs = DocBin()
test_docs = DocBin()
for QID in ['Q64', 'Q1726', 'Q183']:
  indices = [i for i, j in enumerate(gold_ids) if j == QID]
  # first 8 in training
  for index in indices[0:8]:
    train_docs.add(docs[index])
  # last 2 in test
  for index in indices[8:10]:
    test_docs.add(docs[index])

train_docs.to_disk(train_corpus)
print(f"Saved train corpus in: {train_corpus}")
test_docs.to_disk(test_corpus)
print(f"Saved test corpus in: {test_corpus}")

Statistics of manually annotated data:
Counter({'Q64': 29, 'Q183': 9, 'Q1726': 1})

Saved train corpus in: /content/drive/My Drive/IE/ie_course_2022_team03/output/ml_el/train_corpus
Saved test corpus in: /content/drive/My Drive/IE/ie_course_2022_team03/output/ml_el/test_corpus


In [19]:
""" Step 3: Train entity linking model. """

nlp = spacy.load(nlp_dir)

TRAIN_EXAMPLES = []

with open(train_corpus, "rb") as f:
  doc_bin = DocBin().from_disk(train_corpus)
  docs = doc_bin.get_docs(nlp.vocab)
  for doc in docs:
    TRAIN_EXAMPLES.append(Example(nlp(doc.text), doc))

entity_linker = nlp.add_pipe("entity_linker", config={"incl_prior": False}, last=True)
entity_linker.initialize(lambda: TRAIN_EXAMPLES, nlp=nlp, kb_loader=load_kb(kb_dir))

with nlp.select_pipes(enable=["entity_linker"]):  # train only the entity_linker
  optimizer = nlp.resume_training()
  for itn in range(500):  # 500 iterations takes about a minute to train
    random.shuffle(TRAIN_EXAMPLES)
    batches = minibatch(TRAIN_EXAMPLES, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
    losses = {}
    for batch in batches:
      nlp.update(
        batch,
        drop=0.2,  # prevent overfitting
        losses=losses,
        sgd=optimizer,
      )
    if itn % 50 == 0:
      print(itn, "Losses", losses)  # print the training loss
print(itn, "Losses", losses)

nlp.to_disk(nlp_el_dir)
print()
print(f"Saved NLP pipeline in: {nlp_el_dir}")

0 Losses {'entity_linker': 4.067399084568024}
50 Losses {'entity_linker': 0.0764659841855367}
100 Losses {'entity_linker': 0.14850497245788574}
150 Losses {'entity_linker': 0.040450205405553184}
200 Losses {'entity_linker': 0.022833208243052162}
250 Losses {'entity_linker': 0.0724490781625112}
300 Losses {'entity_linker': 0.01742533346017202}
350 Losses {'entity_linker': 0.03389229873816173}
400 Losses {'entity_linker': 0.1404922902584076}
450 Losses {'entity_linker': 0.028791427612304688}
499 Losses {'entity_linker': 0.053205281496047974}

Saved NLP pipeline in: /content/drive/My Drive/IE/ie_course_2022_team03/output/ml_el/my_el_nlp


In [20]:
""" Step 4: Evaluate the new Entity Linking component by applying it to unseen text. """

nlp = spacy.load(nlp_el_dir)

examples = []

with open(test_corpus, "rb") as f:
  doc_bin = DocBin().from_disk(test_corpus)
  docs = doc_bin.get_docs(nlp.vocab)
  for doc in docs:
    examples.append(Example(nlp(doc.text), doc))


print("RESULTS ON THE DEV SET:")
print()

for example in examples:
  print(example.text)
  print(f"Gold annotation: {example.reference.ents[0].kb_id_}")
  print(f"Predicted annotation: {example.predicted.ents[0].kb_id_}")
  print()

print()
print("RUNNING THE PIPELINE ON UNSEEN TEXT:")
text = "Berlin and Munich are both centers of innovation and progress, with numerous startups, tech companies, and research institutions driving forward Germany's economy"
doc = nlp(text)
print(text)
for ent in doc.ents:
  print(ent.text, ent.label_, ent.kb_id_)
print()

RESULTS ON THE DEV SET:

Berlin and Munich are both home to several world-class universities and research institutions, making them important centers of higher education.
Gold annotation: Q64
Predicted annotation: Q64

Berlin and Munich are both green cities, with numerous parks, gardens, and nature reserves within the city limits.
Gold annotation: Q64
Predicted annotation: Q64

Germany is a country with a rich and complex history, and Berlin and Munich offer a glimpse into its past, present, and future.
Gold annotation: Q183
Predicted annotation: Q183


RUNNING THE PIPELINE ON UNSEEN TEXT:
Berlin and Munich are both centers of innovation and progress, with numerous startups, tech companies, and research institutions driving forward Germany's economy
Berlin GPE Q64
Munich GPE Q1726
Germany GPE Q183

