# Named Entity Recognition (second assignment NLU)

* Student name: Gaia Trebucchi
* Student number: 224464



First, we load `en_core_web_sm` with `spacy.load`. This will return a `Language` object stored as `nlp_spacy` containing all components and data needed to process text. We also import `pandas` to create tables and `classification_report` from `sklearn.metrics` to evaluate.

In [55]:
import spacy
from sklearn.metrics import classification_report
import pandas as pd
# Load the small English spaCy pipeline once; the functions defined below
# (`post_fix_tokenization`, `grouping_entities`) read it through this global name.
nlp_spacy=spacy.load('en_core_web_sm')

Here, the module `conll.py` provided at the following link https://github.com/esrel/NLU.Lab.2021/blob/master/src/conll.py is inserted, in order to avoid the necessity of importing it. From this module the function `read_corpus_conll` will be used as starting point to load the dataset and the function `evaluate` will be used for the first and the third request in order to report the precision, recall, f-measure of correctly recognizing all the named entities in a chunk per class and total. 

In [29]:
import re

"""
Modified version of https://pypi.org/project/conlleval/
"""


def stats():
    """Return a fresh counter dict: correct ('cor'), hypothesis ('hyp') and reference ('ref') counts, all zero."""
    return dict(cor=0, hyp=0, ref=0)


def evaluate(ref, hyp, otag='O'):
    """
    chunk-level evaluation of hypotheses against references
    :param ref: reference sentences (lists of token tuples, label last)
    :param hyp: hypothesis sentences with the same shape as ref
    :param otag: out-of-chunk label
    :return: result of conlleval on the aligned data
    """
    # the hypothesis label is appended to every reference token, then scored
    return conlleval(align_hyp(ref, hyp), otag=otag)


def align_hyp(ref, hyp):
    """
    align references and hypotheses for evaluation
    the last element of every hypothesis token tuple is appended to the
    corresponding reference token tuple
    :param ref: reference sentences (list of lists of token tuples)
    :param hyp: hypothesis sentences (same number of sentences and tokens)
    :return: list of sentences whose tokens are (*ref_token, hyp_label)
    :raises ValueError: if the number of sentences, or the number of tokens
        in any sentence, differs between ref and hyp
    """
    if len(ref) != len(hyp):
        raise ValueError("Size Mismatch: ref: {} & hyp: {}".format(len(ref), len(hyp)))

    out = []
    for idx, (ref_sent, hyp_sent) in enumerate(zip(ref, hyp)):
        if len(ref_sent) != len(hyp_sent):
            # report the sizes of the offending sentence; the corpus-level
            # sizes are known to be equal here and would be misleading
            raise ValueError(
                "Size Mismatch: ref: {} & hyp: {} (sentence {})".format(
                    len(ref_sent), len(hyp_sent), idx))
        out.append([(*ref_tok, hyp_tok[-1]) for ref_tok, hyp_tok in zip(ref_sent, hyp_sent)])
    return out


def conlleval(data, otag='O'):
    """
    chunk-level scoring of aligned data
    every token of every sentence carries the reference label in position -2
    and the hypothesis label in position -1
    :param data: aligned sentences (e.g. the output of align_hyp)
    :param otag: out-of-chunk label
    :return: summarize(seg, cls), i.e. per-class and total chunk scores
    """
    # token, segment & class level counts for TP, TP+FP, TP+FN
    # NOTE: the token-level counts (tok) are accumulated but are not passed to summarize
    tok = stats()
    seg = stats()
    cls = {}

    for sent in data:

        # chunk state is reset at every sentence boundary
        prev_ref = otag      # previous reference label
        prev_hyp = otag      # previous hypothesis label
        prev_ref_iob = None  # previous reference label IOB
        prev_hyp_iob = None  # previous hypothesis label IOB

        in_correct = False  # the chunk currently being processed is correct so far

        for token in sent:

            # split each label into its IOB prefix and its class
            hyp_iob, hyp = parse_iob(token[-1])
            ref_iob, ref = parse_iob(token[-2])

            # does a chunk end / begin at this token, on either side?
            ref_e = is_eoc(ref, ref_iob, prev_ref, prev_ref_iob, otag)
            hyp_e = is_eoc(hyp, hyp_iob, prev_hyp, prev_hyp_iob, otag)

            ref_b = is_boc(ref, ref_iob, prev_ref, prev_ref_iob, otag)
            hyp_b = is_boc(hyp, hyp_iob, prev_hyp, prev_hyp_iob, otag)

            # create the per-class counters the first time a class is seen
            if not cls.get(ref) and ref:
                cls[ref] = stats()

            if not cls.get(hyp) and hyp:
                cls[hyp] = stats()

            # segment-level counts
            if in_correct:
                # both chunks close here with the same class: count a correct chunk
                if ref_e and hyp_e and prev_hyp == prev_ref:
                    in_correct = False
                    seg['cor'] += 1
                    cls[prev_ref]['cor'] += 1

                # boundaries or classes diverge: the open chunk is no longer correct
                elif ref_e != hyp_e or hyp != ref:
                    in_correct = False

            # a chunk of the same class starts on both sides at this token
            if ref_b and hyp_b and hyp == ref:
                in_correct = True

            if ref_b:
                seg['ref'] += 1
                cls[ref]['ref'] += 1

            if hyp_b:
                seg['hyp'] += 1
                cls[hyp]['hyp'] += 1

            # token-level counts
            if ref == hyp and ref_iob == hyp_iob:
                tok['cor'] += 1

            tok['ref'] += 1

            prev_ref = ref
            prev_hyp = hyp
            prev_ref_iob = ref_iob
            prev_hyp_iob = hyp_iob

        # a chunk still open and correct at the end of the sentence counts as correct
        if in_correct:
            seg['cor'] += 1
            cls[prev_ref]['cor'] += 1

    return summarize(seg, cls)


def parse_iob(t):
    """
    split a tag at its first '-' into (iob prefix, class label)
    :param t: tag string, e.g. 'B-PER' or 'O'
    :return: (prefix, label), or (t, None) when the tag has no '-'
    """
    matched = re.match(r'^([^-]*)-(.*)$', t)
    if matched is None:
        return t, None
    return matched.groups()


def is_boc(lbl, iob, prev_lbl, prev_iob, otag='O'):
    """
    is beginning of a chunk
    supports: IOB, IOBE, BILOU schemes
        - {E,L} --> last
        - {S,U} --> unit
    :param lbl: current label
    :param iob: current iob
    :param prev_lbl: previous label
    :param prev_iob: previous iob
    :param otag: out-of-chunk label
    :return: True if a chunk begins at the current token, False otherwise
    """
    # explicit begin / unit tags always open a chunk
    if iob in ('B', 'S', 'U'):
        return True

    # an inside / last tag opens a chunk when the previous token closed one
    # or was outside of any chunk
    previous_closed = prev_iob in ('E', 'L', 'S', otag)
    if iob in ('E', 'L', 'I') and previous_closed:
        return True

    # a label change on a token that is not outside ('.' is excluded as well)
    if lbl != prev_lbl and iob != otag and iob != '.':
        return True

    # these chunks are assumed to have length 1
    if iob in ('[', ']'):
        return True

    return False


def is_eoc(lbl, iob, prev_lbl, prev_iob, otag='O'):
    """
    is end of a chunk
    supports: IOB, IOBE, BILOU schemes
        - {E,L} --> last
        - {S,U} --> unit
    :param lbl: current label
    :param iob: current iob
    :param prev_lbl: previous label
    :param prev_iob: previous iob
    :param otag: out-of-chunk label
    :return: True if the end-of-chunk condition holds at the current token, False otherwise
    """
    # last / unit tags always satisfy the condition; this also covers the
    # "unit tag after B or I" case, which needs no separate check
    if iob in ('E', 'L', 'S', 'U'):
        return True

    # a begin tag or an outside tag right after a token inside a chunk
    if prev_iob in ('B', 'I') and (iob == 'B' or iob == otag):
        return True

    # a label change on a token that is not outside (skipped when the previous iob is '.')
    if lbl != prev_lbl and iob != otag and prev_iob != '.':
        return True

    # these chunks are assumed to have length 1
    if iob in ('[', ']'):
        return True

    return False


def score(cor_cnt, hyp_cnt, ref_cnt):
    """
    precision, recall and f1 from raw counts
    :param cor_cnt: number of correct items
    :param hyp_cnt: number of hypothesised items
    :param ref_cnt: number of reference items (also reported as support)
    :return: dict with keys 'p', 'r', 'f' and 's'
    """
    # precision is defined as 1 when nothing was hypothesised
    if hyp_cnt == 0:
        p = 1
    else:
        p = cor_cnt / hyp_cnt

    # recall is defined as 0 when there is nothing to find
    if ref_cnt == 0:
        r = 0
    else:
        r = cor_cnt / ref_cnt

    # f-measure (f1), 0 when both precision and recall are 0
    if p + r == 0:
        f = 0
    else:
        f = (2*p*r)/(p+r)

    return dict(p=p, r=r, f=f, s=ref_cnt)


def summarize(seg, cls):
    """
    turn raw counts into scores
    :param seg: segment-level counts ('cor', 'hyp', 'ref')
    :param cls: per-class counts, keyed by class label
    :return: dict mapping every class label, plus "total", to its score dict
    """
    res = {}
    # class-level
    for lbl in set(cls.keys()):
        counts = cls[lbl]
        res[lbl] = score(counts['cor'], counts['hyp'], counts['ref'])
    # micro
    res["total"] = score(seg.get('cor', 0), seg.get('hyp', 0), seg.get('ref', 0))
    return res


def read_corpus_conll(corpus_file, fs="\t"):
    """
    read corpus in CoNLL format
    sentences are separated by blank lines; every non-blank line is one token
    whose fields are separated by fs
    :param corpus_file: corpus in conll format
    :param fs: field separator
    :return: corpus as a list of sentences, each a list of field tuples
    :raises ValueError: if a line has a different number of fields than the
        first non-blank line
    """
    featn = None  # number of features for consistency check
    sents = []  # list to hold words list sequences
    words = []  # list to hold feature tuples

    # the context manager closes the file even when a ValueError is raised
    with open(corpus_file) as corpus:
        for line in corpus:
            line = line.strip()
            if line:
                feats = tuple(line.split(fs))
                if not featn:
                    featn = len(feats)
                elif featn != len(feats):
                    raise ValueError("Unexpected number of columns {} ({})".format(len(feats), featn))

                words.append(feats)
            elif words:
                sents.append(words)
                words = []

    # keep the last sentence when the file does not end with a blank line
    if words:
        sents.append(words)
    return sents


def get_chunks(corpus_file, fs="\t", otag="O"):
    """
    collect the chunk classes used in a corpus
    :param corpus_file: corpus in conll format
    :param fs: field separator
    :param otag: out-of-chunk label (tokens carrying it are ignored)
    :return: set of class labels taken from the last column of every other token
    """
    labels = set()
    for sent in read_corpus_conll(corpus_file, fs=fs):
        for token in sent:
            tag = token[-1]
            if tag != otag:
                labels.add(parse_iob(tag)[1])
    return labels

## Request 1: 
#### Evaluate spaCy NER on CoNLL 2003 data (provided)
* **1.1:** Report token-level performance (per class and total): accuracy of correctly recognizing all tokens that belong to named entities (i.e. tag-level accuracy)
* **1.2:** Report CoNLL chunk-level performance (per class and total): precision, recall, f-measure of correctly recognizing all the named entities in a chunk per class and total



First, let's analyze the function `get_sentences`:
* **input**: the CoNLL file name from which we want to extract the true named entity labels
* **output**: 
    * a list containing all the sentences reconstructed by the dataset
    * a list of tuples, where the first element of the tuple is the text of the token and the second element is the named entity label in the CoNLL format extracted from the dataset file
    
First, the provided function `read_corpus_conll` from `conll.py` is called to store each sentence in a list of tuples, each containing a single line (corresponding to a single token and the referred labels) of the text.
Then, two lists are created to store both the string sentences and the true named entity labels extracted from the file. For each sentence in the output of `read_corpus_conll` the function behaves as follows:
* a string is created to store the text of the tokens inside the sentence under observation and a list is created to store the pairs [token text, named entity label]
* for each tuple (containing all the information about a single token) a list containing the information is created. If the first element of the list (the element that represents the token text) is different from '-DOCSTART-', a tuple containing the token text and its name entity label is added to `conll_sentence` list and the first element (the token text) is also added to the `sentence` along with a space to recreate the sentence.
* If the sentence is not empty the string sentence and the list of conll tuples are respectively added to `sentences` list and `conll` list as new elements. 


In [30]:
def get_sentences(file):
    """Rebuild the sentences of a CoNLL file and collect their gold labels.

    :param file: path of the CoNLL file
    :return: (sentences, conll) where `sentences` is a list of strings (tokens
        each followed by one space) and `conll` is a parallel list of
        [(token text, last column)] lists
    """
    sentences = []
    conll = []
    for sent in read_corpus_conll(file):
        pairs = []
        for tupl in sent:
            # the first field holds the whole line; split it on whitespace
            fields = tupl[0].split()
            if fields[0] == '-DOCSTART-':
                continue
            pairs.append((fields[0], fields[-1]))
        # every kept token is followed by a single space
        rebuilt = "".join(word + " " for word, _ in pairs)
        if rebuilt:
            sentences.append(rebuilt)
            conll.append(pairs)
    return sentences, conll

The function `post_fix_tokenization` is used to deal with the differences between the token segmentation done by the spacy parser and the token segmentation of the dataset. It also deals with the compound dependency issue.
* **input**: a sentence passed as string and the True/False boolean referred to the compound analysis
* **output**: a list of lists matching the input of the successive `spacy_to_conll` function

First, the doc object of the processed sentence is created by `nlp_spacy` and the `new_list` output list is created. 
For each token, the function cycles in that way:
* the first step is to merge token text separated by spacy parser but considered together in the CoNLL dataset. To do that the token attribute whitespace is used: if there wasn't a space between two successive tokens, the text of the second token is added to the text of the previous one. In the same cycle, a list consisting of the text of the token, the entity iob label, the named entity label and the token itself is created and added to the `new_list` list of lists. If `compound` is set to False in the input, the `new_list` list of lists is returned by the function.
* Otherwise, if `compound` is set to True in the input, the `new_list` list of lists is fed as input to the `get_compound` function explained in detail below (Request 3) and the output of `get_compound(new_list,doc)`, having the same structure of `new_list`, is returned.


In [56]:
def post_fix_tokenization(sentence, compound=False):
    """Parse a sentence and glue back tokens that had no whitespace between them.

    :param sentence: sentence as a string
    :param compound: when it does not compare equal to False, the result is
        passed through `get_compound` before being returned
    :return: list of [text, ent_iob_, ent_type_, token] lists
    """
    doc = nlp_spacy(sentence)
    merged = []
    previous = None
    for current in doc:
        if previous is not None and previous.whitespace_ == '':
            # no space after the previous token: extend the text of the last entry
            merged[-1][0] += current.text
        else:
            merged.append([current.text, current.ent_iob_, current.ent_type_, current])
        previous = current
    if compound == False:  # equality (not truthiness) is kept on purpose
        return merged
    return get_compound(merged, doc)

The `spacy_to_conll` function is made to re-label the named entity recognized in the CoNLL format starting from the NER labels of spacy. 
* **input**: a list of lists, where the outer list is referred to a sentence and the inner lists are referred to the tokens of the sentence and contain, in order, the token text, the token iob label of spacy, the token entity label of spacy and the token itself
* **output**: a list of tuples where the first element is the token text and the second element is the named entity label of the token, obtained by converting the spacy named entity label to the CoNLL named entity label that best represents it.
    

In [32]:
# spaCy entity type -> CoNLL class; every type not listed here becomes 'O'
_SPACY_TO_CONLL_LABEL = {
    'ORG': 'ORG',
    'PERSON': 'PER',
    'GPE': 'LOC',
    'LOC': 'LOC',
    'FAC': 'LOC',
    'NORP': 'MISC',
    'PRODUCT': 'MISC',
    'EVENT': 'MISC',
    'WORK_OF_ART': 'MISC',
    'LANGUAGE': 'MISC',
    'TIME': 'MISC',
}


def spacy_to_conll(list_of_tok):
    """Convert spaCy entity annotations of one sentence to CoNLL tags.

    :param list_of_tok: list of [text, iob, entity type, ...] entries
    :return: list of (text, tag) tuples, where tag is '<iob>-<CoNLL class>'
        for the mapped entity types and 'O' for everything else
    """
    converted = []
    for token in list_of_tok:
        conll_class = _SPACY_TO_CONLL_LABEL.get(token[2])
        if conll_class is None:
            converted.append((token[0], 'O'))
        else:
            converted.append((token[0], token[1] + "-" + conll_class))
    return converted

The function `my_conll` is used to combine `spacy_to_conll` and `post_fix_tokenization`, to set to True/False the compound analysis and finally  obtain the prediction of spacy for all the tokens and all the sentences of the dataset.

In [33]:
def my_conll(text, compound=False):
    """Predict CoNLL-style tags for every sentence of `text`.

    :param text: list of sentences (strings)
    :param compound: anything that does not compare equal to False turns on
        the compound post-processing in `post_fix_tokenization`
    :return: one list of (token text, tag) tuples per sentence
    """
    # same decision as an explicit `compound == False` branch, taken once
    use_compound = not (compound == False)
    return [spacy_to_conll(post_fix_tokenization(sent, use_compound)) for sent in text]

Here, we extract the predictions for the test set and the train set of the CoNLL 2003 dataset. The predictions are stored in a list of lists (output of `my_conll`), where the outer list represents the entire document, and the inner lists represent the sentences according to the division done by `read_corpus_conll`. The elements of the inner lists are tuples whose first element is the token text and whose second element is the named entity prediction by spacy NER (then post-processed by the functions described above to match the token segmentation in the dataset and the named entity labels). With `get_sentences` we extract both the list of sentences to feed the `my_conll` function and the true named entity labels (the structure of this output is the same as the output of `my_conll`: a list of lists where the elements of the inner lists are tuples whose first element is the token text and whose second element is the true named entity label extracted from the CoNLL 2003 dataset).

In [57]:
# Rebuilt sentences and (token, gold tag) pairs read from "test.txt"
sentences_test,conll_test=get_sentences("test.txt")
# spaCy predictions for the same sentences, without the compound post-processing
my_conll_test=my_conll(sentences_test)


In [58]:
# Rebuilt sentences and (token, gold tag) pairs read from "train.txt"
sentences_train,conll_train=get_sentences("train.txt")
# spaCy predictions for the same sentences, without the compound post-processing
my_conll_train=my_conll(sentences_train)

To report token-level performances (per class and total) the scikit-learn module is used to compute the accuracy of correctly recognizing all the tokens that belong to named entities (i.e. tag-level accuracy).
The function `accuracy_token` creates the array of labels to be compared by `classification_report` from sklearn by extracting them from the true labels and the array of the prediction of spacy named entity recognizer.

In [59]:
def accuracy_token(my_conll, true_conll):
    """Flatten predicted and gold tags into two parallel lists.

    :param my_conll: predicted sentences, lists of (token text, tag) tuples
    :param true_conll: gold sentences with the same layout; they are indexed
        with the positions of `my_conll`
    :return: (y_true, y_pred), the gold and predicted tags in corpus order
    """
    y_true = []
    y_pred = []
    for i, predicted_sentence in enumerate(my_conll):
        for j, predicted_token in enumerate(predicted_sentence):
            y_true.append(true_conll[i][j][1])
            y_pred.append(predicted_token[1])
    return y_true, y_pred

We illustrate here a report on the test set of CoNLL 2003:

In [60]:
# Tag inventory, in the row order wanted for the report
classes = [ 'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 
        'I-LOC', 'B-MISC', 'I-MISC']

y_true,y_pred=accuracy_token(my_conll_test,conll_test)
# `target_names` alone only renames the rows of the report, which follow the
# sorted order of the labels found in the data; passing `labels=classes` as
# well ties every row to the tag it is named after.
print(classification_report(y_true, y_pred, labels=classes, target_names=classes))



              precision    recall  f1-score   support

       B-LOC       0.76      0.68      0.72      1668
      B-MISC       0.65      0.56      0.60       702
       B-ORG       0.52      0.31      0.38      1661
       B-PER       0.80      0.63      0.70      1617
       I-LOC       0.54      0.56      0.55       257
      I-MISC       0.26      0.36      0.30       216
       I-ORG       0.42      0.51      0.46       835
       I-PER       0.84      0.79      0.81      1156
           O       0.95      0.98      0.96     38323

    accuracy                           0.90     46435
   macro avg       0.64      0.60      0.61     46435
weighted avg       0.90      0.90      0.90     46435



Report of the train set of ConLL 2003:

In [61]:
y_true_train,y_pred_train=accuracy_token(my_conll_train,conll_train)
# `labels=classes` ties every row of the report to the tag it is named after;
# with `target_names` alone the names are applied to the sorted labels found
# in the data and the rows end up mislabelled.
print(classification_report(y_true_train, y_pred_train, labels=classes, target_names=classes))


              precision    recall  f1-score   support

       B-LOC       0.79      0.69      0.74      7140
      B-MISC       0.69      0.57      0.62      3438
       B-ORG       0.48      0.32      0.38      6321
       B-PER       0.80      0.67      0.73      6600
       I-LOC       0.57      0.61      0.59      1157
      I-MISC       0.23      0.22      0.23      1155
       I-ORG       0.49      0.55      0.52      3704
       I-PER       0.82      0.82      0.82      4528
           O       0.96      0.98      0.97    169578

    accuracy                           0.91    203621
   macro avg       0.65      0.60      0.62    203621
weighted avg       0.91      0.91      0.91    203621



To report CoNLL chunk-level performance (per class and total) the `evaluate` function provided by the `conll.py` module is used to compute the precision, recall, f-measure of correctly recognizing all the named entities in a chunk per class and total. The references are the true labels of the test set (and successively train set) and the hypothesis are the labels obtain by spacy NER (with all the post-processing explained above) on the test set (and successively train set).

In [62]:
# Chunk-level scores on the test set: gold tags as references, spaCy tags as hypotheses
refs_test = conll_test
hyps_test = my_conll_test
results_test = evaluate(refs_test, hyps_test)
# one row per class plus "total"; the columns are the keys of each score dict
pd_tbl_test = pd.DataFrame.from_dict(results_test, orient='index')
pd_tbl_test.round(7)


Unnamed: 0,p,r,f,s
MISC,0.638614,0.551282,0.591743,702
PER,0.774193,0.608534,0.68144,1617
LOC,0.747832,0.672062,0.707925,1668
ORG,0.464105,0.27634,0.346415,1661
total,0.67606,0.522486,0.589434,5648


In [63]:
# Chunk-level scores on the train set: gold tags as references, spaCy tags as hypotheses
refs_train = conll_train
hyps_train = my_conll_train
results_train = evaluate(refs_train, hyps_train)
# one row per class plus "total"; the columns are the keys of each score dict
pd_tbl_train = pd.DataFrame.from_dict(results_train, orient='index')
pd_tbl_train.round(7)

Unnamed: 0,p,r,f,s
MISC,0.669024,0.55032,0.603894,3438
PER,0.781841,0.651061,0.710483,6600
LOC,0.78233,0.684594,0.730206,7140
ORG,0.428332,0.283183,0.340952,6321
total,0.686203,0.547555,0.609089,23499


## Request 2:
#### Grouping of Entities
Write a function to group recognized named entities using noun_chunks method of spaCy. Analyze the groups in terms of most frequent combinations (i.e. NER types that go together).

**2.1:** **Grouping entities**
* **input**: a list of sentences passed as string
* **output**: a list of lists, where the inner lists have as elements the label of the entities that belongs to the same noun chunk.

First, the function creates a dictionary to store the chunks of the text sentences and initializes a list (`chunk_done`) to store the entity chunks already added to the final output and a list (`group_ent`) to store the future output. 
Then the function cycles across all the sentences belonging to the text.
The first step is to obtain a processed `Doc` of the sentence by using `nlp_spacy`. Then there are two different cycles:

* The first cycle acts on the doc chunks found by the attribute `doc.noun_chunks` in that way: A list is created and for all the tokens belonging to the chunk under examination there is a check on the entity label. If the token doesn't belong to an entity or if the iob label is different from "B" (in other words, if the entity was already detected by the previous token belonging to it) the function skip to the next token, otherwise the entity type encountered is added to the list referred to that noun chunk. After the inspection of all the tokens of the noun chunks, each of the token is added to the `chunks` dictionary as key, and the relative value is a list containing as first element the list of the entity belonging to the token chunk and as second element the chunk containing the key token. 
* The second cycle acts on the doc entities found by the attribute `doc.ents` in that way: if the first element of the entity (the token with "B" iob label) is one of the keys of the dictionary and the chunk it belongs to hasn't been added yet to the list of the grouped entity (this control is made by searching for that chunk in the `chunk_done` list) the list of all the entity labels of its noun chunk (the first element of the token value in the dictionary) is added to the `group_ent` list and the relative chunk is added to the `chunk_done` list in order not to be repeated when finding the other tokens belonging to that chunk. Otherwise, if the first element of the entity doesn't appear in the dictionary keys, its entity type is added to the `group_ent` list as a single element.

At the end, the list with the grouped entities is returned.


In [64]:
def grouping_entities(text):
    """Group the entity labels of each sentence by the noun chunk they start in.

    For every sentence, each noun chunk contributes one group holding the
    labels of the entities whose first token lies inside the chunk (in token
    order). The group is emitted when the first such entity is met in
    `doc.ents`. An entity whose first token is outside every noun chunk
    forms a group of its own.

    :param text: list of sentences (strings)
    :return: list of groups, each a list of entity labels
    """
    group_ent = []
    for sentence in text:
        doc = nlp_spacy(sentence)

        # The bookkeeping lives per sentence: tokens and chunks of another Doc
        # can never match the current one, so keeping them around only held
        # every Doc in memory and made the "already emitted" check scan the
        # chunks of the whole corpus.
        chunk_of_token = {}   # token index -> start index of its noun chunk
        labels_of_chunk = {}  # chunk start index -> labels of entities starting in it
        for chunk in doc.noun_chunks:
            labels_of_chunk[chunk.start] = [
                tok.ent_type_ for tok in chunk
                if tok.ent_type_ != "" and tok.ent_iob_ == 'B'
            ]
            for tok in chunk:
                chunk_of_token[tok.i] = chunk.start

        emitted = set()  # start indices of the chunks already added to the output
        for ent in doc.ents:
            first = ent[0]
            chunk_start = chunk_of_token.get(first.i)
            if chunk_start is None:
                # the entity does not start inside a noun chunk
                group_ent.append([first.ent_type_])
            elif chunk_start not in emitted:
                emitted.add(chunk_start)
                group_ent.append(labels_of_chunk[chunk_start])
    return group_ent

**2.2: Frequency analysis**

The `nbest` function returns the n most frequent grouped entities in decreasing order of frequency.
The `frequency_NER` function works as follows:
* **input**: the list of grouped entities (output of the previous `grouping_entities` function) and an integer n (set to 100 by default) to choose the number of elements to consider
* **output**: the n most frequent grouped entities stored in a dictionary where the keys are the entity labels that belong to the same group and the value for each key is the number of times that combination of entities is found.
For each group of entities the string representing all the entities is created. If that group hasn't been found yet, the string is added to the dictionary as a key and the value is set to one. Otherwise, if the group was already in the dictionary keys, the relative value is incremented by one.

In [65]:
def nbest(d, n):
    """Return the `n` entries of `d` with the largest values, as a dict in decreasing order of value."""
    ranked = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
    top = ranked[:n]
    return dict(top)

def frequency_NER(grouped_entity, n=100):
    """Count how often each combination of entity labels occurs.

    :param grouped_entity: list of non-empty groups (lists of entity labels)
    :param n: number of most frequent combinations to keep
    :return: dict mapping "LABEL, LABEL, ..." to its count, as returned by `nbest`
    """
    freq = dict()
    for gr in grouped_entity:
        # the key is the first label followed by ", <label>" for each of the others
        key = gr[0]
        for extra in gr[1:]:
            key += ", " + str(extra)
        freq[key] = freq.get(key, 0) + 1
    return nbest(freq, n)

An example of `grouping_entities` and `frequency_NER` with a simple sentence.

In [66]:
# Groups found in one example sentence ...
print(grouping_entities(["Apple's Steve Jobs died in 2011 in Palo Alto, California."]))
# ... and their frequencies (this call passes the sentence without the final period)
print(frequency_NER(grouping_entities(["Apple's Steve Jobs died in 2011 in Palo Alto, California"])))

[['ORG', 'PERSON'], ['DATE'], ['GPE'], ['GPE']]
{'GPE': 2, 'ORG, PERSON': 1, 'DATE': 1}


An example of `frequency_NER` with the test set of CoNLL 2003 dataset.

In [67]:
# Entity groups over all the rebuilt test sentences, then their most frequent combinations
group=grouping_entities(sentences_test)
print(frequency_NER(group))

{'CARDINAL': 1622, 'GPE': 1253, 'PERSON': 1069, 'DATE': 996, 'ORG': 865, 'NORP': 293, 'MONEY': 147, 'ORDINAL': 111, 'TIME': 83, 'PERCENT': 80, 'EVENT': 58, 'LOC': 54, 'CARDINAL, PERSON': 52, 'QUANTITY': 51, 'NORP, PERSON': 43, 'GPE, PERSON': 34, 'GPE, GPE': 26, 'FAC': 22, 'PRODUCT': 22, 'ORG, PERSON': 21, 'CARDINAL, ORG': 20, 'GPE, ORG': 15, 'CARDINAL, NORP': 15, 'CARDINAL, GPE': 13, 'LAW': 11, 'WORK_OF_ART': 9, 'ORG, ORG': 9, 'GPE, PRODUCT': 9, 'DATE, EVENT': 8, 'DATE, ORG': 8, 'PERSON, PERSON': 8, 'NORP, ORG': 8, 'ORG, DATE': 7, 'DATE, TIME': 7, 'LANGUAGE': 6, 'GPE, DATE': 5, 'CARDINAL, CARDINAL': 5, 'NORP, ORDINAL': 5, 'ORG, GPE': 5, 'DATE, NORP': 5, 'GPE, ORDINAL': 4, 'ORDINAL, PERSON': 4, 'GPE, CARDINAL': 4, 'ORG, NORP': 4, 'PERSON, GPE': 4, 'CARDINAL, DATE': 3, 'PERSON, ORG': 3, 'ORG, CARDINAL': 3, 'CARDINAL, PERSON, CARDINAL': 3, 'NORP, NORP': 3, 'PERSON, PERSON, PERSON': 2, 'CARDINAL, ORDINAL': 2, 'CARDINAL, CARDINAL, PERSON': 2, 'ORG, ORDINAL': 2, 'LANGUAGE, ORDINAL': 2, 'GPE,

## Request 3:
One of the possible post-processing steps is to fix segmentation errors. Write a function that extends the entity span to cover the full noun-compounds. Make use of compound dependency relation.

Let's start explaining the `get_compound` function. The goal of this function is to extend the named entity span to the token that have not an entity label but result in compound dependency with a token that is inside an entity.
* **input**: a list of lists where the outer list represents a sentence and the inner lists are referred to a token and contain the text of the token, the iob label, the named entity label and the token itself. The second input is the doc object obtained by parsing with `nlp_spacy` the sentence from which the tokens are extracted.  
* **output**: the structure of the output is the same as the input, but the iob labels and the named entity labels of the tokens are changed (if necessary) in order to consider the compound dependencies.

First, a dictionary is created to store all the tokens that will need a iob label or entity label relabelling. 
Then, the function cycles across the list of lists of the input in that way:
* A check is done on the entity label of the token: we consider only the tokens that have an entity label (if a token is involved in a compound dependency relation with its head but it hasn't an entity label, it will be consider when the cycle will take in observation its head and the `get_compound_path` will be called over it). In addition, another check is done considering two distinct cases: 
   * if the head of the token appears before the token in the sentence, the token is accepted only if it is not yet been added to the compounds dictionary (if the token is already in the keys of the dictionary, in fact, its relabelling has already be done when the function considered the head of the token, located in a earlier position). If the head of the token appears before in the sentence but the token does not belong yet to the compound dictionary, it means that the token was not reached by the `get_compound_path` function applied to its head, so the token is accepted to get also its compound path. 
   * if the head of the token appears after the token in the sentence, the token is accepted only if its entity label is different from the entity label of the head (in the case the entity label is equal, the token will appear in the compound path of its head that will be successively considered by `get_compound_path` applied on its head, if instead the entity label is different it will not appear in the compound path of its head, so it is accepted now to explore its compound path). 

* When a token is taken into consideration after the `if` condition, the `compound_path` list is initialized with the index of the token inside the doc. The `get_compound_path` adds to the path each token index reached by the compound dependency as explained below. Then, the list of the path is sorted in ascendent order of token indexes, to contain the information about the position of the tokens inside the sentence (this information will be useful for the re-labelling of the iob labels). Then, the relabelling is done in that way: if the token corresponding to the first index of `compound_path1` has in the sentence a predecessor that has an entity type that is equal to the entity type that will be assigned to the token by the relabelling, and if its entity iob label was different from "B", its iob label will be set to "I". Otherwise, if the token wasn't part of an entity or its iob label was "B" or the entity type of its predecessor is different from the entity label that will be assigned to the token it iob label will be assigned to "B". For all the other token whose idexes have been added to the `compound_path1`, the iob label will be assigned to "I".  Each of this token will be added to the compounds dictionary as key, and the value for each token will be a list with elements are the text of the token, the new iob entity label, the entity label of the token from which compound dependency path they are extracted and the token itself.

At the end, the `list_of_token` input list of lists is inspected a second time, and for each token that appears in the `compounds` dictionary the relative list in `list_of_token` is replaced by the value of the key token in the dictionary (a new list that take into account the explained relabelling).

In [68]:
def get_compound(list_of_token, doc):
    """Extend entity labels over tokens reached through `compound` dependencies.

    :param list_of_token: list of [text, iob, entity type, token] entries for
        one sentence; it is updated in place and also returned
    :param doc: the parsed sentence the tokens belong to
    :return: `list_of_token`, where the entries of the tokens collected by
        `get_path_compound` carry the new iob and entity type
    """
    compounds = dict()  # token -> [text, new iob, new entity type, token]
    for tok in list_of_token:
        token = tok[-1]
        if token.ent_type_ == "":
            continue
        head = token.head
        # head earlier in the sentence: only tokens not relabelled yet
        head_before = head.i < token.i and token not in compounds
        # head later in the sentence: only when it carries a different entity type
        head_after = token.i < head.i and token.ent_type_ != head.ent_type_
        if not (head_before or head_after):
            continue

        compound_path = [token.i]
        get_path_compound(token, compound_path)
        compound_path.sort()

        first = doc[compound_path[0]]
        # the leftmost token continues an entity when the token before it has
        # the same entity type and it was not itself marked as a beginning
        continues_entity = (
            compound_path[0] != 0
            and token.ent_type_ == doc[compound_path[0] - 1].ent_type_
            and first.ent_iob_ != "B"
        )
        compounds[first] = [first.text, "I" if continues_entity else "B", token.ent_type_, first]
        for ind in compound_path[1:]:
            member = doc[ind]
            compounds[member] = [member.text, "I", token.ent_type_, member]

    for im, entry in enumerate(list_of_token):
        relabelled = compounds.get(entry[-1])
        if relabelled is not None:
            # keep the text already stored in the entry: it may be the result
            # of gluing several spaCy tokens together, while `relabelled[0]`
            # is only the text of the single token
            list_of_token[im] = [entry[0], relabelled[1], relabelled[2], relabelled[3]]
    return list_of_token
            

The `get_path_compound` function is used to follow the dependencies from the token taken as input. A check is made on every child of the token: if the dependency of the child is equal to "compound" and if the child doesn't belong to any entity or the entity type of the child is the same as that of the parent, the doc index of the child is added to `compound_path` and the function is recursively called on the child. At the end, the `compound_path` list will contain the doc indexes of all the tokens reached by following the compound dependency from the input token.

In [69]:
def get_path_compound(token1, compound_path):
    """Append to `compound_path` the indexes reached through `compound` children.

    A child is followed when its dependency is "compound" and its entity type
    is either empty or the same as its parent's; the search then continues
    from that child. The list is modified in place and nothing is returned.

    :param token1: token whose children are inspected
    :param compound_path: list of token indexes, extended in place
    """
    for child in token1.children:
        if child.dep_ != "compound":
            continue
        if child.ent_type_ not in ("", token1.ent_type_):
            continue
        compound_path.append(child.i)
        get_path_compound(child, compound_path)

Here, we try to see how fixing the compound dependency can influence the accuracy, the precision, the recall and the f-measure of the CoNLL 2003 test set. We can see that the performances slightly decrease.

In [70]:
# spaCy predictions for the test sentences, this time with the compound post-processing enabled
my_conll_test_compound=my_conll(sentences_test,True)


In [71]:
# Chunk-level scores on the test set for the compound-extended predictions
refs_test = conll_test
hyps_test_compound = my_conll_test_compound
results_test_compound = evaluate(refs_test, hyps_test_compound)
# one row per class plus "total"; the columns are the keys of each score dict
pd_tbl_test_compound = pd.DataFrame.from_dict(results_test_compound, orient='index')
pd_tbl_test_compound.round(7)

Unnamed: 0,p,r,f,s
MISC,0.63245,0.54416,0.584992,702
PER,0.655091,0.513296,0.575589,1617
LOC,0.742151,0.666067,0.702054,1668
ORG,0.454914,0.270319,0.339124,1661
total,0.63651,0.490793,0.554234,5648


In [72]:
# Tag inventory, in the row order wanted for the report
classes = [ 'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 
        'I-LOC', 'B-MISC', 'I-MISC']

y_true_c,y_pred_c=accuracy_token(my_conll_test_compound,conll_test)
# `target_names` alone only renames the rows of the report, which follow the
# sorted order of the labels found in the data; passing `labels=classes` as
# well ties every row to the tag it is named after.
print(classification_report(y_true_c, y_pred_c, labels=classes, target_names=classes))



              precision    recall  f1-score   support

       B-LOC       0.75      0.67      0.71      1668
      B-MISC       0.64      0.55      0.60       702
       B-ORG       0.51      0.30      0.38      1661
       B-PER       0.68      0.53      0.59      1617
       I-LOC       0.49      0.57      0.53       257
      I-MISC       0.26      0.36      0.30       216
       I-ORG       0.41      0.52      0.46       835
       I-PER       0.69      0.79      0.74      1156
           O       0.95      0.97      0.96     38323

    accuracy                           0.89     46435
   macro avg       0.60      0.58      0.58     46435
weighted avg       0.89      0.89      0.89     46435

