# Named entity recognition using Hugging Face Transformers

In [13]:
from datasets import get_dataset_config_names, load_dataset, DatasetDict
from collections import defaultdict
import pandas as pd

In [8]:
# List the names of every configuration (subset) published under the "xtreme" dataset.
xtreme_subsets = get_dataset_config_names("xtreme")

In [9]:
# Report how many configurations were returned above.
print(f"XTREME has {len(xtreme_subsets)} configs")

XTREME has 183 configs


In [12]:
# Languages to load and, position by position, the share of each split to keep.
langs = ['de', 'fr', 'it', 'en']
fracs = [0.629, 0.229, 0.084, 0.059]

# Maps a language code to a DatasetDict holding that language's downsampled splits.
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Monolingual PAN-X corpus for this language.
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split, split_ds in ds.items():
        # Shuffle with a fixed seed, then keep the first `frac` share of the rows.
        num_keep = int(frac * split_ds.num_rows)
        panx_ch[lang][split] = split_ds.shuffle(seed=42).select(range(num_keep))



Downloading and preparing dataset xtreme/PAN-X.de to /home/nathanh/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /home/nathanh/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset xtreme/PAN-X.fr to /home/nathanh/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /home/nathanh/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset xtreme/PAN-X.it to /home/nathanh/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /home/nathanh/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset xtreme/PAN-X.en to /home/nathanh/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /home/nathanh/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
# Display the per-language DatasetDicts to check their splits, features and row counts.
panx_ch

defaultdict(datasets.dataset_dict.DatasetDict,
            {'de': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 12580
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 6290
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 6290
                 })
             }),
             'fr': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 4580
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 2290
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags', 'la

In [15]:
# Number of training examples kept for each language, shown as a one-row table.
train_sizes = {lang: panx_ch[lang]['train'].num_rows for lang in langs}
pd.DataFrame(train_sizes, index=['nb'])

Unnamed: 0,de,fr,it,en
nb,12580,4580,1680,1180


In [17]:
# Inspect the first French training example, printing one field per line.
element = panx_ch['fr']['train'][0]
for key, value in element.items():
    print(f"{key}: {value}")

tokens: ['Moteur', 'à', 'combustion', 'contrôlée']
ner_tags: [3, 4, 4, 4]
langs: ['fr', 'fr', 'fr', 'fr']


In [21]:
# `ner_tags` is a sequence feature; `.feature` is the ClassLabel describing each
# element, i.e. the mapping between integer tag ids and tag names.
tags = panx_ch['fr']['train'].features['ner_tags'].feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [22]:
def create_tag_name(batch):
    """Translate the integer NER tag ids in ``batch`` into their string names.

    Reads ``batch['ner_tags']`` and converts every id with ``tags.int2str``,
    where ``tags`` is resolved from the notebook's global namespace at call time.

    Returns:
        A dict with the single key ``'ner_tags_str'`` holding the list of tag
        names, in the same order as the ids.
    """
    tag_names = list(map(tags.int2str, batch['ner_tags']))
    return {'ner_tags_str': tag_names}

In [26]:
# Apply `create_tag_name` across the French DatasetDict so each split gains a
# human-readable `ner_tags_str` column.
panx_fr = panx_ch['fr'].map(create_tag_name)

Loading cached processed dataset at /home/nathanh/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-8ca829fa4c15f605.arrow
Loading cached processed dataset at /home/nathanh/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-255562993960ed82.arrow
Loading cached processed dataset at /home/nathanh/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-3d83b77c6f610abb.arrow


In [33]:
# Show a single training example (index 13) as a table: one row per field,
# one column per token position.
fr_example = panx_fr['train'][13]
pd.DataFrame.from_dict(fr_example, orient='index')

Unnamed: 0,0,1,2,3,4
tokens,Kaitlyn,Weaver,/,Andrew,Poje
ner_tags,1,2,0,1,2
langs,fr,fr,fr,fr,fr
ner_tags_str,B-PER,I-PER,O,B-PER,I-PER


In [39]:
from collections import Counter

# Per split, count how many entities of each type occur. Only "B"-prefixed tags are
# counted, so each entity is counted once at its first token.
split2freq = defaultdict(Counter)
for split, dataset in panx_fr.items():
    entity_types = [
        tag.split('-')[1]
        for row in dataset['ner_tags_str']
        for tag in row
        if tag.startswith('B')
    ]
    # Only touch the defaultdict when there is something to count, so a split
    # without any entity does not get an empty entry.
    if entity_types:
        split2freq[split].update(entity_types)

pd.DataFrame.from_dict(split2freq, orient='index')

Unnamed: 0,ORG,PER,LOC
train,1788,2017,2292
validation,911,979,1060
test,898,977,1153


In [40]:
from transformers import AutoTokenizer

In [42]:
# Checkpoint names for the two models whose tokenizers are compared below.
bert_model_name = 'bert-base-cased'
xlmr_model_name = 'xlm-roberta-base'
# Load the pretrained tokenizer that belongs to each checkpoint.
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [49]:
# Sample sentence reused by the later inference cells.
text = "Jack Sparrow loves New York!"
# Tokenize it with both tokenizers and keep the string tokens of each encoding.
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

In [50]:
# Print both tokenizations one under the other to compare how each tokenizer
# splits the sentence and which special tokens it adds.
print(bert_tokens)
print(xlmr_tokens)

['[CLS]', 'Jack', 'Spa', '##rrow', 'loves', 'New', 'York', '!', '[SEP]']
['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '!', '</s>']


In [62]:
import torch
import torch.nn as nn
from transformers import XLMRobertaConfig, AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

In [70]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa encoder with a per-token linear classification head.

    Uses ``XLMRobertaConfig`` as its configuration class. The head is dropout
    followed by a linear layer projecting ``config.hidden_size`` features onto
    ``config.num_labels`` classes.
    """

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder body and the classification head from ``config``."""
        super().__init__(config)
        self.num_labels = config.num_labels

        # Encoder body, built without its pooling layer.
        self.roberta = RobertaModel(config, add_pooling_layer=False)

        # Classification head and the loss applied when labels are given.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.loss_fct = nn.CrossEntropyLoss()

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the encoder and the head; compute the loss only if ``labels`` is given.

        Extra keyword arguments are forwarded unchanged to the encoder.

        Returns:
            A ``TokenClassifierOutput`` carrying ``loss`` (``None`` without labels),
            ``logits``, and the encoder's ``hidden_states`` and ``attentions``.
        """
        encoder_out = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )

        # First encoder output: presumably the per-token hidden states
        # (batch, seq_len, hidden_size) - confirm against RobertaModel.
        logits = self.classifier(self.dropout(encoder_out[0]))

        if labels is None:
            loss = None
        else:
            # Flatten all leading dimensions so every token is one classification example.
            flat_logits = logits.view(-1, self.num_labels)
            loss = self.loss_fct(flat_logits, labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )

In [71]:
# Two-way mappings between integer label ids and tag names; they are stored on
# the config so predictions can be decoded without the dataset.
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}
# The keyword is `num_labels`. The previous spelling `num_label` is not a config
# field, so the label count was most likely right only because `id2label` was
# also passed - confirm against the installed transformers version.
config = AutoConfig.from_pretrained(xlmr_model_name, num_labels=tags.num_classes, id2label=index2tag, label2id=tag2index)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# `from_pretrained` is a classmethod: call it on the class directly instead of
# first constructing a throwaway, randomly initialised model instance.
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=config).to(device)



loading configuration file config.json from cache at /home/nathanh/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/42f548f32366559214515ec137cdd16002968bf6/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_toke

In [77]:
# Encode the sample sentence into a PyTorch tensor of token ids, and display it.
input_ids = xlmr_tokenizer.encode(text, return_tensors='pt')
input_ids

tensor([[    0, 21763, 37456, 15555,  5161,     7,  2356,  5753,    38,     2]])

In [82]:
# Forward pass on the sample sentence; `output` holds the logits for each token.
output = xlmr_model(input_ids.to(device)).logits
print(output)
# Highest-scoring class per token, taken for the first (only) sequence in the batch.
pred_ids = output.argmax(dim=-1)[0]
preds = [tags.names[class_id] for class_id in pred_ids.tolist()]
preds

tensor([[[ 0.3016, -0.4930, -0.1571,  0.1551, -0.3833,  0.1424, -0.3648],
         [ 0.3214, -0.1811, -0.3872, -0.0291, -0.4006, -0.0980, -0.4319],
         [ 0.3293, -0.2091, -0.3532,  0.0386, -0.4592, -0.0502, -0.4717],
         [ 0.3838, -0.2421, -0.3164, -0.0009, -0.4087, -0.0827, -0.4094],
         [ 0.2560, -0.1341, -0.3766, -0.0526, -0.4245,  0.0114, -0.3914],
         [ 0.3134, -0.1605, -0.4155, -0.0566, -0.4641, -0.1533, -0.5159],
         [ 0.3028, -0.1859, -0.3448,  0.0395, -0.4935, -0.1182, -0.5869],
         [ 0.3135, -0.1393, -0.4194,  0.0191, -0.3489, -0.0740, -0.5115],
         [ 0.3769, -0.1611, -0.4065, -0.2036, -0.4586, -0.1060, -0.4659],
         [ 0.3089, -0.5409, -0.1109,  0.2145, -0.2947,  0.1478, -0.3195]]],
       device='cuda:0', grad_fn=<ViewBackward0>)


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [99]:
def tag_text(text, tags, model, tokenizer):
    """Tokenize ``text``, run ``model`` on it and tabulate the predicted tag per token.

    Args:
        text: Input handed to ``tokenizer``.
        tags: Object whose ``names`` sequence maps a class index to a tag name.
        model: Module called with the token-id tensor; the first element of its
            output is used as the logits, with the class scores on the last axis.
        tokenizer: Callable returning an encoding that exposes ``input_ids`` and
            ``tokens()``.

    Returns:
        A DataFrame with two rows, ``'tokens'`` and ``'preds'``, and one column
        per token of the first sequence in the batch.
    """
    tokens = tokenizer(text, return_tensors='pt')

    # Send the inputs to the device the model lives on, rather than relying on a
    # notebook-global `device` that may not match the model passed in.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')
    input_ids = tokens.input_ids.to(model_device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_ids)[0]  # first output element = logits (not a batch index)

    # Best class per token; `[0]` here selects the first sequence of the batch.
    pred_ids = torch.argmax(logits, dim=-1)[0].tolist()
    preds = [tags.names[class_id] for class_id in pred_ids]
    return pd.DataFrame([tokens.tokens(), preds], index=['tokens', 'preds'])

In [100]:
tag_text(text, tags, xlmr_model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
preds,O,O,O,O,O,O,O,O,O,O
