**Name:** Lukas Große Westermann  
**Exercise:** 3  
**Subject**: Computational Language Technologies

In the following, the conllpp dataset is downloaded from huggingface, which is then used for later tasks.

# Setup

In [None]:
!pip install datasets
!python -m spacy download en_core_web_sm
!pip install seqeval

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 20.1 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
# Import of required modules
from typing import List, Dict
from collections import namedtuple
from datasets import load_dataset, get_dataset_split_names
import pandas as pd
import itertools
from collections import Counter
from sklearn import metrics
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2
import spacy
from spacy import displacy

# Import of Dataset

For the third task, the dataset from class is used again. The goal of this task is to conduct a NER analysis.

In [None]:
#this is the dataset from huggingface
dataset = 'conllpp'

In [None]:
#return possible split names
get_dataset_split_names(dataset)

['train', 'validation', 'test']

In [None]:
def import_dataset(dataset,split):
  #import of the dataset, the test split is used here
  try:
    global test
    test = load_dataset(dataset, split = split)
    rows, columns = test.shape
    print(f'\nThe dataset is successfully imported, it consists of {rows} rows and {columns} columns.\n')
    return test
  except:
    print("The dataset is not imported")

import_dataset(dataset, 'test')




The dataset is successfully imported, it consists of 3453 rows and 5 columns.



Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 3453
})

# Analysis

## Creating the Dataframe

In [None]:
#selecting only the needed columns
df = pd.DataFrame(test, columns=["tokens", 'ner_tags'])
df.head()

Unnamed: 0,tokens,ner_tags
0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...","[0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0, 0]"
1,"[Nadim, Ladki]","[1, 2]"
2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[5, 0, 5, 6, 6, 0]"
3,"[Japan, began, the, defence, of, their, Asian,...","[5, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[But, China, saw, their, luck, desert, them, i...","[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
#example tokens (first entry)
df['tokens'].loc[0]

['SOCCER',
 '-',
 'JAPAN',
 'GET',
 'LUCKY',
 'WIN',
 ',',
 'CHINA',
 'IN',
 'SURPRISE',
 'DEFEAT',
 '.']

In [None]:
#create a sentence from the first entry of the tokens
' '.join(df.iloc[0].tokens)

'SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .'

In [None]:
#this are the respective NER Tags
df.ner_tags.loc[0]

[0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0, 0]

## Tagging & Creation of gold standard

In [None]:
#needed dictionary for the following conversion
ner_tags = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

In [None]:
#return keys, values from dictionary
def reverse(dictionary: Dict)->Dict:
  return {v:k for k,v in dictionary.items()}

#conversion
def ner_tags_to_gold_standard(ner_tags: List[int], ner_tags_map=ner_tags)->List[str]:
  reversed_ner_tags = reverse(ner_tags_map)
  #values 7 and 8 are excluded
  return [reversed_ner_tags.get(element) if ((element !=7) and (element !=8)) else 'O' for element in ner_tags]


In [None]:
#creation of gold_standard column
df['gold_standard'] = df.ner_tags.apply(ner_tags_to_gold_standard)
df.head(9)

Unnamed: 0,tokens,ner_tags,gold_standard
0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...","[0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0, 0]","[O, O, B-LOC, O, O, O, O, B-LOC, O, O, O, O]"
1,"[Nadim, Ladki]","[1, 2]","[B-PER, I-PER]"
2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[5, 0, 5, 6, 6, 0]","[B-LOC, O, B-LOC, I-LOC, I-LOC, O]"
3,"[Japan, began, the, defence, of, their, Asian,...","[5, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, ...","[B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
4,"[But, China, saw, their, luck, desert, them, i...","[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,..."
5,"[China, controlled, most, of, the, match, and,...","[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
6,"[Oleg, Shatskiku, made, sure, of, the, win, in...","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[B-PER, I-PER, O, O, O, O, O, O, O, O, O, O, O..."
7,"[The, former, Soviet, republic, was, playing, ...","[0, 0, 7, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
8,"[Despite, winning, the, Asian, Games, title, t...","[0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, ...","[O, O, O, O, O, O, O, O, O, O, B-LOC, O, O, O,..."


In [None]:
#example 
print(df.iloc[0].tokens[:10])
print(df.iloc[0].ner_tags[:10])
print(ner_tags_to_gold_standard(df.iloc[0].ner_tags)[:10])

['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE']
[0, 0, 5, 0, 0, 0, 0, 5, 0, 0]
['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O']


## Out-of-the-box model for NER

### First example

In [None]:
# first example for NER
text = 'I am Lukas from Lucerne and I study at Hochschule Luzern.'

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
doc

I am Lukas from Lucerne and I study at Hochschule Luzern.

In [None]:
#returns NER Tags
displacy.render(doc, style='ent', jupyter=True)

In [None]:
#if something is unclear
spacy.explain("GPE")

'Countries, cities, states'

In [None]:
#different representation for NER Tags
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

[('Lukas', 5, 10, 'PERSON'), ('Lucerne', 16, 23, 'PERSON'), ('Hochschule Luzern', 39, 56, 'PERSON')]


### Tagging

In [None]:
Entity = namedtuple('Entity', 'entity_tokens label indexes')

#retuns entities if they are 'PER', 'LOC', 'GPE' or 'ORG'
def get_entities(text:str)->List[Entity]:
  return [Entity(e.text.split(' '), e.label_, None) for e in nlp(text).ents if e.label_ in ['PER', 'LOC', 'GPE', 'ORG']]

In [None]:
#example
get_entities(' '.join(df.iloc[5].tokens))

[Entity(entity_tokens=['China'], label='GPE', indexes=None),
 Entity(entity_tokens=['Uzbek'], label='ORG', indexes=None)]

In [None]:
#return indexes for each entity
def get_entity_indexes(tokenized_text: List[str], tokenized_entities: List[Entity])->List[str]:
  def _token_lists_equal(substr, text):
    return len(substr) == len(text) and all(a == b for a, b in zip(substr, text))
  
  entity_indexes = []
  for entity in tokenized_entities:
    for start_index in range(len(tokenized_text) - len(entity.entity_tokens) + 1):
      
      if _token_lists_equal(entity.entity_tokens, tokenized_text[start_index: start_index + len(entity.entity_tokens)]):
        entity = entity._replace(indexes = [ i for i in range(start_index, start_index + len(entity.entity_tokens))])
        entity_indexes.append(entity)

  return entity_indexes


In [None]:
#example
example = get_entity_indexes(df.iloc[5].tokens, get_entities(' '.join(df.iloc[5].tokens)))
example

[Entity(entity_tokens=['China'], label='GPE', indexes=[0]),
 Entity(entity_tokens=['Uzbek'], label='ORG', indexes=[16])]

In [None]:
#returns the IOB label
def IOB(example, tokens):
  result = ['O']*len(tokens)

  for e in example:
    if e.label == 'GPE':
      e = e._replace(label='LOC')
 
    if not e.indexes:
      continue
    
    result[e.indexes[0]] = f'B-{e.label}'
    if len(e.indexes) > 1:
      for i in e.indexes[1:]:
        result[i] = f'I-{e.label}'

  return result

In [None]:
spacy_oob_pred = []
for i,row in df.iterrows():

  example = get_entity_indexes(row.tokens, get_entities(' '.join(row.tokens)))
  spacy_oob_pred.append(IOB(example, row.tokens))

df['spacy_oob_pred'] = spacy_oob_pred

In [None]:
df.head()

Unnamed: 0,tokens,ner_tags,gold_standard,spacy_oob_pred
0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...","[0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0, 0]","[O, O, B-LOC, O, O, O, O, B-LOC, O, O, O, O]","[O, O, O, O, O, B-ORG, O, B-LOC, O, O, O, O]"
1,"[Nadim, Ladki]","[1, 2]","[B-PER, I-PER]","[O, O]"
2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[5, 0, 5, 6, 6, 0]","[B-LOC, O, B-LOC, I-LOC, I-LOC, O]","[B-ORG, O, B-LOC, I-LOC, I-LOC, O]"
3,"[Japan, began, the, defence, of, their, Asian,...","[5, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, ...","[B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
4,"[But, China, saw, their, luck, desert, them, i...","[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,...","[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,..."


## Evaluation

In [None]:
#get the values (gold standard and the prediction)
gold_standard = list(itertools.chain(*df.gold_standard.values))
prediction  = list(itertools.chain(*df.spacy_oob_pred.values))

In [None]:
#for a quick comparison
def counter():
  gold_counter = Counter(gold_standard)
  pred_counter = Counter(prediction)
  print(gold_counter)
  print(pred_counter)

counter()

Counter({'O': 39154, 'B-ORG': 1715, 'B-LOC': 1646, 'B-PER': 1618, 'I-PER': 1161, 'I-ORG': 882, 'I-LOC': 259})
Counter({'O': 42594, 'B-LOC': 1518, 'B-ORG': 1075, 'I-ORG': 992, 'I-LOC': 256})


In [None]:
#evaluation
x = ['B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']
print(metrics.classification_report(gold_standard, prediction, labels=x))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-PER       0.00      0.00      0.00      1618
       I-PER       0.00      0.00      0.00      1161
       B-LOC       0.80      0.73      0.76      1646
       I-LOC       0.59      0.58      0.59       259
       B-ORG       0.54      0.34      0.42      1715
       I-ORG       0.48      0.54      0.51       882

   micro avg       0.63      0.33      0.43      7281
   macro avg       0.40      0.37      0.38      7281
weighted avg       0.39      0.33      0.35      7281



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#evaluation
gold = df.gold_standard
pred = df.spacy_oob_pred
print(classification_report(gold, pred, scheme=IOB2))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         LOC       0.79      0.73      0.76      1646
         ORG       0.48      0.30      0.37      1715
         PER       0.00      0.00      0.00      1618

   micro avg       0.66      0.34      0.45      4979
   macro avg       0.42      0.34      0.37      4979
weighted avg       0.43      0.34      0.38      4979



# Reflection

This task was about detecting entities in a text. Using the Huggingface dataset, we were able to find entities within the tokens and compare them to the "gold standard". Finally, the model was evaluated and it became visible, that the model is not perfect. One reason could be that this an "out-of-the-box model" which has to be retrained for our special input data.

I learned that NER can be helpful to understand the semantic meaning of a sentence.