# Install requirements

In [1]:
# Pin spaCy to the release this notebook was executed with (3.0.6) so that a re-run uses the
# same library version; %pip installs into the environment of the running kernel.
%pip install spacy==3.0.6
!python -m spacy download en_core_web_sm
%pip install spacy-conll

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/1b/d8/0361bbaf7a1ff56b44dca04dace54c82d63dad7475b7d25ea1baefafafb2/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl (12.8MB)
[K     |████████████████████████████████| 12.8MB 300kB/s 
[?25hCollecting spacy-legacy<3.1.0,>=3.0.4
  Downloading https://files.pythonhosted.org/packages/8d/67/d4002a18e26bf29b17ab563ddb55232b445ab6a02f97bf17d1345ff34d3f/spacy_legacy-3.0.5-py2.py3-none-any.whl
Collecting srsly<3.0.0,>=2.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/c3/84/dfdfc9f6f04f6b88207d96d9520b911e5fec0c67ff47a0dea31ab5429a1e/srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl (456kB)
[K     |████████████████████████████████| 460kB 35.4MB/s 
[?25hCollecting pydantic<1.8.0,>=1.7.1
[?25l  Downloading https://files.pythonhosted.org/packages/b3/0a/52ae1c659fc08f13dd7c0ae07b88e4f807ad83fb9954a59b0b0a3d1a8ab6/pydantic-1.7.3-cp37-cp37m-manylinux2014_x86_64.whl (9.1MB)
[K     |███████████████████████████

# Download dataset and conll script

In [2]:
# The URL is quoted so the shell never treats '?' as a glob character, and both output files
# are named explicitly so re-running the cell overwrites them instead of saving "conll.py.1".
!wget -O /content/conll2003.zip "https://github.com/esrel/NLU.Lab.2021/blob/master/src/conll2003.zip?raw=true"
!wget -O conll.py https://raw.githubusercontent.com/esrel/NLU.Lab.2021/master/src/conll.py

--2021-04-28 14:56:46--  https://github.com/esrel/NLU.Lab.2021/blob/master/src/conll2003.zip?raw=true
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/esrel/NLU.Lab.2021/raw/master/src/conll2003.zip [following]
--2021-04-28 14:56:47--  https://github.com/esrel/NLU.Lab.2021/raw/master/src/conll2003.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/esrel/NLU.Lab.2021/master/src/conll2003.zip [following]
--2021-04-28 14:56:47--  https://raw.githubusercontent.com/esrel/NLU.Lab.2021/master/src/conll2003.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awa

Unzip the dataset

In [3]:
# Read the archive from the absolute path it was downloaded to, so the cell does not depend on
# the current working directory; -o overwrites existing files so a re-run never stops at
# unzip's interactive "replace?" prompt.
!unzip -o /content/conll2003.zip -d /content/dataset/

Archive:  conll2003.zip
  inflating: /content/dataset/dev.txt  
  inflating: /content/dataset/__MACOSX/._dev.txt  
  inflating: /content/dataset/test.txt  
  inflating: /content/dataset/__MACOSX/._test.txt  
  inflating: /content/dataset/train.txt  
  inflating: /content/dataset/__MACOSX/._train.txt  


# Imports

In [4]:
import spacy, nltk
nlp = spacy.load('en_core_web_sm')
import pandas as pd
import conll
from sklearn.metrics import classification_report

# 1) Evaluate spaCy NER on CoNLL 2003 data (provided)

import_dataset(path):  
  * Input: the path of the dataset to load
  * Output: two lists:
    1. text_dataset: contains the lists of sentences of the dataset as text
    2. dataset: contains the pair (token, named entity) for each token, divided into sentences (one list for each sentence)
  * Implementation: it reads the dataset using the conll function; for each sentence it extracts the tokens as text and as (token, named entity) tuples

In [5]:
def import_dataset(path):
  """Load a CoNLL corpus and return it both as text and as (token, named entity) pairs.

  path: location of the corpus file, handed to conll.read_corpus_conll.

  Returns (text_dataset, dataset):
    * text_dataset: one single-element list per sentence; the element is the sentence
      text, built from its tokens each followed by one space (so it ends with a space)
    * dataset: one list per sentence of (token, named entity) tuples, taken from the
      first and the fourth whitespace-separated field of every row
  """
  text_dataset, dataset = [], []
  for corpus_sentence in conll.read_corpus_conll(path):
    # row[0] is the string holding the whitespace-separated columns: split it only once
    columns = [row[0].split() for row in corpus_sentence]
    dataset.append([(fields[0], fields[3]) for fields in columns])
    text_dataset.append(["".join(f"{fields[0]} " for fields in columns)])
  return text_dataset, dataset

convert_type(ent_type):
  * Input: named entity from spaCy
  * Output: the named entity converted to the dataset format
  * Implementation: assign a specific named entity from the dataset format to each named entity from spaCy

In [6]:
def convert_type(ent_type):
  """Translate a spaCy entity label into the entity type used by the dataset.

  Returns "ORG", "LOC", "MISC" or "PER"; every label that is not listed below
  (including the empty string) is mapped to "".
  """
  # dataset entity type -> spaCy labels that are folded into it
  spacy_labels_by_type = {
    "ORG": ["ORG"],
    "LOC": ["GPE", "LOC"],
    "MISC": ["LANGUAGE", "WORK_OF_ART", "FAC", "ORDINAL", "TIME", "NORP", "EVENT", "LAW", "CARDINAL", "PRODUCT", "DATE", "QUANTITY", "MONEY", "PERCENT"],
    "PER": ["PERSON"],
  }
  for dataset_type, spacy_labels in spacy_labels_by_type.items():
    if ent_type in spacy_labels:
      return dataset_type
  return ""

convert_spacy(token, parent=None):
  * Input: the token to convert, the parent of the token to use in the third exercise
  * Output: the tags converted in form ```iob-type``` as in the dataset
  * Implementation: 
    * if parent is None it returns just the concatenation between the ```IOB``` tag and the named entity tag
    * if parent is set it returns the named entity from the parent if possible



In [7]:
def convert_spacy(token, parent=None):
  """Build the dataset-style tag ("O" or "<IOB>-<TYPE>") for a spaCy token.

  token: the token to convert; its ent_iob_ and ent_type_ attributes are read
  parent: optional token (exercise 3 passes the head of a compound); when it carries
          an entity type, that type replaces the token's own one

  Returns:
    * parent is None: "O" for a token outside any entity, otherwise the token's IOB
      code joined with its converted entity type
    * parent given and it has an entity type: the parent's converted type, prefixed with
      "I" when the token itself is "O", otherwise with the token's own IOB code
    * parent given but without an entity type: same result as with parent=None
  """
  # identity check: a parent object must never be mistaken for "no parent" because of
  # the way it implements ==
  if parent is None:  # exercise 1 usage
    if token.ent_iob_ == "O":
      return "O"
    return f"{token.ent_iob_}-{convert_type(token.ent_type_)}"

  # exercise 3 usage
  if parent.ent_type_ != "":
    # NOTE(review): an "O" token always gets the "I" prefix here, whatever its position
    # relative to the parent - confirm this is the intended tagging
    iob = "I" if token.ent_iob_ == "O" else token.ent_iob_
    return f"{iob}-{convert_type(parent.ent_type_)}"
  if token.ent_iob_ == "O":
    return "O"
  return f"{token.ent_iob_}-{convert_type(token.ent_type_)}"

reconstruct_output(doc, comp=False):
  * Input: Doc object from spaCy and comp (compound) flag to set on the third exercise
  * Output: list of sentences, each sentence contains the token "reconstructed" as in the dataset
  * Implementation: given a token it uses whitespace to check if the token is part of a word in the dataset, if yes it concatenates the tokens with the same tag, otherwise the single token is used.  
  if comp is set to True, the tokens with compound dependency will have the same tag as their parents.

In [8]:
def reconstruct_output(doc, comp=False):
  """Merge spaCy tokens back into whitespace-delimited words, one tag per word.

  doc: the tokens to merge (a spaCy Doc or any iterable of tokens)
  comp: when True, a word whose first token has the "compound" dependency is tagged
        with convert_spacy(token, token.head) instead of convert_spacy(token)

  Returns a list of (word, tag) tuples. A word is the concatenated text of consecutive
  tokens up to and including the first one that is followed by whitespace; its tag is
  the one computed for its first token.
  """
  words = []
  pieces = []  # texts of the tokens collected for the word currently being rebuilt
  word_tag = ""
  for tok in doc:
    if not pieces:
      # only the first token of a word decides the tag of the whole word
      if comp and tok.dep_ == "compound":
        word_tag = convert_spacy(tok, tok.head)
      else:
        word_tag = convert_spacy(tok)
    pieces.append(tok.text)
    if tok.whitespace_:
      # trailing whitespace closes the word
      words.append(("".join(pieces), word_tag))
      pieces = []
      word_tag = ""
  if pieces:
    # the last word was not followed by whitespace, so it is still pending
    words.append(("".join(pieces), word_tag))

  return words

process_dataset(dataset_text, expand):
  * Input: the dataset as lists of sentences, expand is a flag used in the third exercise
  * Output: the predicted named entities
  * Implementation: it processes each sentence using nlp and it calls reconstruct_output to format it as in the dataset 

In [9]:
def process_dataset(dataset_text, expand):
  """Run the spaCy pipeline on every sentence and rebuild dataset-style predictions.

  dataset_text: the sentences to process; the text of each one is its first element
  expand: forwarded to reconstruct_output as its comp flag (third exercise)

  Returns one reconstruct_output result per sentence, in input order.
  """
  return [reconstruct_output(nlp(entry[0]), expand) for entry in dataset_text]

get_accuracy(dataset_text, dataset_refs, expand = False):
  * Input: 
    * dataset_text: the dataset as lists of sentences (text)
    * dataset_refs: the true named entities from the dataset
    * expand: whether to use the expanded version (ex3) or not
  * Output:
    * the scikit classification report of spaCy NER on the specified dataset (using the setting on convert_type function)
    * the predictions
  * Implementation: process the dataset and compute the report


In [10]:
def get_accuracy(dataset_text, dataset_refs, expand = False):
  """Predict the tags of a dataset and score them against the reference tags.

  dataset_text: the sentences as text, as expected by process_dataset
  dataset_refs: the reference annotation, one list of (token, tag) pairs per sentence
  expand: forwarded to process_dataset (third exercise)

  Returns (report, predictions): the classification_report computed on the flattened
  reference and predicted tags, and the per-sentence predictions it was computed from.
  """
  predictions = process_dataset(dataset_text, expand)

  # classification_report works on flat label sequences, so drop the sentence boundaries
  predicted_tags = [pair[1] for sentence in predictions for pair in sentence]
  reference_tags = [pair[1] for sentence in dataset_refs for pair in sentence]

  return classification_report(reference_tags, predicted_tags), predictions

# -) Execution

In [11]:
# Locations of the three dataset splits inside the directory the archive was extracted to
dev_path, train_path, test_path = (
  '/content/dataset/dev.txt',
  '/content/dataset/train.txt',
  '/content/dataset/test.txt',
)

Extract the datasets as:
 * *_txt: list of sentences as text
 * *_refs: the true named entities from each dataset

In [12]:
# Load every split; each import_dataset call yields (sentences as text, reference pairs)
(dev_txt, dev_refs), (train_txt, train_refs), (test_txt, test_refs) = [
  import_dataset(split_path) for split_path in (dev_path, train_path, test_path)
]

1.1) Compute the token level accuracy for the test set

In [13]:
# Token-level report of the plain spaCy predictions (no compound expansion);
# pred keeps the per-sentence predictions for the chunk-level evaluation.
report_test, pred = get_accuracy(test_txt, test_refs, expand=False)
print(report_test)

              precision    recall  f1-score   support

       B-LOC       0.77      0.68      0.72      1668
      B-MISC       0.10      0.57      0.17       702
       B-ORG       0.52      0.31      0.38      1661
       B-PER       0.80      0.63      0.70      1617
       I-LOC       0.57      0.53      0.55       257
      I-MISC       0.05      0.40      0.09       216
       I-ORG       0.42      0.51      0.46       835
       I-PER       0.84      0.79      0.81      1156
           O       0.95      0.86      0.90     38554

    accuracy                           0.81     46666
   macro avg       0.56      0.59      0.53     46666
weighted avg       0.89      0.81      0.84     46666



1.2) Compute the chunk level accuracy for the test set using the evaluate function provided by conll.py

In [14]:
# Chunk-level scores of the current predictions against the test references
results = conll.evaluate(test_refs, pred)
# from_dict is a classmethod: call it on the class instead of on a throwaway empty DataFrame
pd_tbl = pd.DataFrame.from_dict(results, orient='index')
pd_tbl.round(decimals=3)

Unnamed: 0,p,r,f,s
MISC,0.1,0.554,0.169,702
PER,0.774,0.609,0.681,1617
LOC,0.755,0.667,0.708,1668
ORG,0.464,0.276,0.346,1661
total,0.385,0.521,0.443,5648


# -) Experiment
Here I was curious about using already tokenized text from the dataset (overriding spaCy tokenizer).  
Although spaCy's documentation reports that the performance should decrease (because the tokenization methods may differ), the performance remains similar.

In [15]:
from spacy.tokens import Doc

# function to replace spaCy tokenizer
def get_tokens(sentence):
  """Build a spaCy Doc on nlp.vocab directly from an already tokenized sentence.

  sentence: the tokens of the sentence, passed to Doc as its words.
  """
  words = sentence
  return Doc(nlp.vocab, words)

In [16]:
# Temporarily replace spaCy's tokenizer so that the pipeline receives the dataset's own
# tokens. The original tokenizer is restored afterwards (even on error), so cells that call
# nlp on raw text keep working when they are run or re-run after this one.
default_tokenizer = nlp.tokenizer
nlp.tokenizer = get_tokens
try:
  data = conll.read_corpus_conll(test_path)
  pred = []

  for s in data:
    # the first whitespace-separated field of every row is the token text
    sentence = [token[0].split()[0] for token in s]
    doc = nlp(sentence)
    pred.append(reconstruct_output(doc))
finally:
  nlp.tokenizer = default_tokenizer

# classification_report works on flat label sequences, so drop the sentence boundaries
predicted = [token[1] for sentence in pred for token in sentence]
true_labels = [token[1] for sentence in test_refs for token in sentence]

report = classification_report(true_labels, predicted)
print(report)

              precision    recall  f1-score   support

       B-LOC       0.78      0.70      0.74      1668
      B-MISC       0.11      0.56      0.18       702
       B-ORG       0.50      0.30      0.38      1661
       B-PER       0.79      0.61      0.69      1617
       I-LOC       0.60      0.62      0.61       257
      I-MISC       0.05      0.40      0.09       216
       I-ORG       0.42      0.52      0.46       835
       I-PER       0.82      0.76      0.78      1156
           O       0.94      0.86      0.90     38554

    accuracy                           0.81     46666
   macro avg       0.56      0.59      0.54     46666
weighted avg       0.89      0.81      0.84     46666



In [17]:
# Chunk-level scores of the predictions obtained with the dataset's own tokenization
results = conll.evaluate(test_refs, pred)
# from_dict is a classmethod: call it on the class instead of on a throwaway empty DataFrame
pd_tbl = pd.DataFrame.from_dict(results, orient='index')
pd_tbl.round(decimals=3)

Unnamed: 0,p,r,f,s
MISC,0.105,0.55,0.177,702
PER,0.761,0.59,0.665,1617
LOC,0.766,0.695,0.729,1668
ORG,0.448,0.272,0.339,1661
total,0.397,0.523,0.451,5648


# 2) Grouping of Entities.  
Write a function to group recognized named entities using noun_chunks method of spaCy. Analyze the groups in terms of most frequent combinations (i.e. NER types that go together).

group_entities(sentence):
  * Input: the sentence to process
  * Output: named entities grouped based on noun_chunk
  * Implementation:
    * first a set containing all the sentence entities is created
    * for each noun_chunk its entities are checked if they belong to the main entity set, if yes they will be part of the chunk group
    * the entities added are removed from the main set
    * in the end if the set is not empty, each remaining entity is added to a different new chunk (entities that were not in any chunk)

In [18]:
# I checked whether all the entities of the sentence (doc.ents) are inside chunk.ents.
# there might be new entities inside chunk.ents, they will be discarded, so just the main entities from the sentence will be considered.

def group_entities(sentence):
  """Group the named entities of a sentence according to spaCy's noun chunks.

  sentence: the text to process with nlp

  Returns a list of groups, each a list of entity types (ent_type_ of the entity's root
  token). First comes one group per noun chunk that contains at least one entity of the
  sentence, in chunk order; then every entity that is in no chunk, as a group of its own,
  in document order (so the result does not depend on set iteration order).
  """
  doc = nlp(sentence)
  groups = []
  ungrouped = set(doc.ents)  # entities not yet assigned to a group

  for chunk in doc.noun_chunks:
    group = []
    for span in chunk.ents:
      # only entities of the sentence that were not grouped before are considered
      if span in ungrouped:
        group.append(span.root.ent_type_)
        ungrouped.remove(span)
    if group:
      groups.append(group)

  # walk doc.ents rather than the set so the leftovers come out in a deterministic order
  for ent in doc.ents:
    if ent in ungrouped:
      ungrouped.discard(ent)
      groups.append([ent.root.ent_type_])

  return groups

get_frequencies(dataset):
  * Input: the dataset in which to count the combinations of entities
  * Output: a dict containing the frequencies for each combination
  * Implementation:
    * process each sentence of the dataset and groups its entities using group_entities
    * for each group create a tuple and increase the count of that group (combination) on the dict

In [19]:
def get_frequencies(dataset):
  """Count how often each combination of grouped entity types occurs in a dataset.

  dataset: the sentences to analyse; the text of each one is its first element

  Returns a dict that maps every combination (a tuple of entity types, as grouped by
  group_entities) to the number of groups with exactly that content.
  """
  counts = {}
  for entry in dataset:
    for entity_group in group_entities(entry[0]):
      combination = tuple(entity_group)  # a list cannot be used as a dict key
      counts[combination] = counts.get(combination, 0) + 1
  return counts

**Get the frequencies of the test set**  
Print the dictionary containing the frequencies

In [20]:
# Reload the pipeline so that spaCy's own tokenizer is active again in case the experiment
# of point 1 (which replaces nlp.tokenizer) has been run
nlp = spacy.load('en_core_web_sm')
freq = get_frequencies(dataset=test_txt)
print(freq)

{('GPE',): 1255, ('ORG',): 873, ('DATE',): 997, ('EVENT',): 58, ('ORDINAL',): 111, ('CARDINAL',): 1624, ('TIME',): 83, ('NORP', 'PERSON'): 43, ('NORP',): 293, ('PERSON',): 1074, ('GPE', 'GPE'): 26, ('GPE', 'PERSON'): 34, ('DATE', 'EVENT'): 8, ('ORDINAL', 'NORP'): 1, ('CARDINAL', 'PERSON'): 51, ('CARDINAL', 'GPE'): 13, ('GPE', 'PERSON', 'CARDINAL'): 1, ('LAW',): 11, ('WORK_OF_ART',): 10, ('PERSON', 'PERSON', 'PERSON'): 2, ('CARDINAL', 'ORDINAL'): 2, ('ORDINAL', 'DATE'): 1, ('ORG', 'GPE', 'ORDINAL'): 1, ('GPE', 'DATE'): 5, ('MONEY',): 147, ('QUANTITY',): 51, ('ORG', 'QUANTITY'): 1, ('PERCENT',): 81, ('CARDINAL', 'CARDINAL', 'PERSON'): 2, ('FAC',): 22, ('GPE', 'ORDINAL'): 4, ('PRODUCT',): 22, ('CARDINAL', 'DATE'): 3, ('CARDINAL', 'CARDINAL'): 5, ('LOC',): 54, ('DATE', 'ORG'): 8, ('ORG', 'PERSON'): 21, ('ORG', 'DATE'): 6, ('ORG', 'CARDINAL'): 3, ('GPE', 'ORG'): 13, ('ORDINAL', 'PERSON'): 4, ('ORG', 'ORDINAL'): 2, ('ORG', 'ORG'): 8, ('NORP', 'ORDINAL'): 5, ('LANGUAGE', 'ORDINAL'): 2, ('PERS

# 3) One of the possible post-processing steps is to fix segmentation errors.  
Write a function that extends the entity span to cover the full noun-compounds. Make use of compound dependency relation.

For this point I reused the get_accuracy function of the first point.  
In this case the expand flag is set to True, which means that tokens with a compound dependency are assigned the tag of their parents (if possible).

In [21]:
# Reload the pipeline so that spaCy's own tokenizer is active again in case the experiment
# of point 1 (which replaces nlp.tokenizer) has been run
nlp = spacy.load('en_core_web_sm')
# expand=True: tokens with a compound dependency take the tag of their head when possible
report_test, pred = get_accuracy(test_txt, test_refs, expand=True)
print(report_test)

              precision    recall  f1-score   support

       B-LOC       0.77      0.67      0.72      1668
      B-MISC       0.10      0.57      0.17       702
       B-ORG       0.51      0.30      0.38      1661
       B-PER       0.79      0.63      0.70      1617
       I-LOC       0.48      0.53      0.50       257
      I-MISC       0.05      0.41      0.09       216
       I-ORG       0.40      0.52      0.45       835
       I-PER       0.71      0.79      0.75      1156
           O       0.95      0.85      0.90     38554

    accuracy                           0.80     46666
   macro avg       0.53      0.59      0.52     46666
weighted avg       0.88      0.80      0.84     46666



In [22]:
# Chunk-level scores of the predictions obtained with the compound expansion
results = conll.evaluate(test_refs, pred)
# from_dict is a classmethod: call it on the class instead of on a throwaway empty DataFrame
pd_tbl = pd.DataFrame.from_dict(results, orient='index')
pd_tbl.round(decimals=3)

Unnamed: 0,p,r,f,s
MISC,0.098,0.553,0.167,702
PER,0.669,0.607,0.637,1617
LOC,0.739,0.662,0.699,1668
ORG,0.445,0.273,0.338,1661
total,0.37,0.518,0.431,5648


As we can see, using this method, the performance slightly decreases.
