<a href="https://colab.research.google.com/github/Jonathanpro/myaiblog/blob/master/_notebooks/2021-07-10-huggingface_ner_pytorch_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP -  Named Entity Recognition - Inference
> A tutorial for appyling inference to a named entity recognition with Huggingface & Pytroch.

- toc: true 
- badges: true
- comments: false
- categories: [jupyter, NLP]
- image: images/chart-preview.png


In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 4.1MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 25.0MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     

In [2]:
!pip install spacy -U

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/c1/da/61f934c6ae177a291c77246ef91a78cab44a2d76f79e6892ca7b17571adf/spacy-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4MB)
[K     |████████████████████████████████| 6.4MB 5.1MB/s 
[?25hCollecting pathy>=0.3.5
[?25l  Downloading https://files.pythonhosted.org/packages/65/ae/ecfa3e2dc267010fa320034be0eb3a8e683dc98dae7e70f92b41605b4d35/pathy-0.6.0-py3-none-any.whl (42kB)
[K     |████████████████████████████████| 51kB 8.4MB/s 
Collecting thinc<8.1.0,>=8.0.7
[?25l  Downloading https://files.pythonhosted.org/packages/7a/6e/bd2da3d71ab2d175248949ac106fee09ae13bfaca39002eabdbd908b7440/thinc-8.0.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (619kB)
[K     |████████████████████████████████| 624kB 40.0MB/s 
Collecting catalogue<2.1.0,>=2.0.4
  Downloading https://files.pythonhosted.org/packages/9c/10/dbc1203a4b1367c7b02fddf08cb2981d9aa3e688d398f587cea0ab9e3bec/catalogue-2.0.4-py3

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
!curl https://github.com/elenanereiss/Legal-Entity-Recognition/raw/master/data/dataset_courts.zip -L -o /content/raw.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   168  100   168    0     0   1322      0 --:--:-- --:--:-- --:--:--  1312
100 4289k  100 4289k    0     0  14.6M      0 --:--:-- --:--:-- --:--:-- 14.6M


In [11]:
ls /content/

[0m[01;34mgdrive[0m/  raw.zip  [01;34msample_data[0m/


In [12]:
!unzip /content/raw.zip -d /content/raw/

Archive:  /content/raw.zip
  inflating: /content/raw/bfh.conll  
  inflating: /content/raw/bgh.conll  
  inflating: /content/raw/bpatg.conll  
  inflating: /content/raw/bsg.conll  
  inflating: /content/raw/bverfg.conll  
  inflating: /content/raw/bverwg.conll  
  inflating: /content/raw/bag.conll  


In [13]:
from pathlib import Path
import re

def read_wnut(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            # print(line)
            token, tag = line.split()
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

train_texts, train_tags = read_wnut('/content/raw/bag.conll')
val_texts, val_tags = read_wnut('/content/raw/bgh.conll')

In [14]:
tags = train_tags + val_tags

In [15]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1000,
)

from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained("/content/gdrive/MyDrive/NER_01_030_2021_07_08.bin", local_files_only=True, num_labels=len(unique_tags))

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    # train_dataset=train_dataset,         # training dataset
    # eval_dataset=val_dataset             # evaluation dataset
)

In [17]:
import torch

class COURTS_Dataset_inf(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [18]:
from transformers import AutoTokenizer
MODEL_NAME = 'bert-base-german-cased' 
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

https://huggingface.co/bert-base-german-cased/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpu0r451i2


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…

storing https://huggingface.co/bert-base-german-cased/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/2529d64cc99a539f2103ad09cea0d6459e181d8dc168fe06b32d25ddc68e6d3b.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
creating metadata file for /root/.cache/huggingface/transformers/2529d64cc99a539f2103ad09cea0d6459e181d8dc168fe06b32d25ddc68e6d3b.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f





https://huggingface.co/bert-base-german-cased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp17luf07x


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…

storing https://huggingface.co/bert-base-german-cased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/98877e98ee76b3977d326fe4f54bc29f10b486c317a70b6445ac19a0603b00f0.1f2afedb22f9784795ae3a26fe20713637c93f50e2c99101d952ea6476087e5e
creating metadata file for /root/.cache/huggingface/transformers/98877e98ee76b3977d326fe4f54bc29f10b486c317a70b6445ac19a0603b00f0.1f2afedb22f9784795ae3a26fe20713637c93f50e2c99101d952ea6476087e5e
loading configuration file https://huggingface.co/bert-base-german-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/98877e98ee76b3977d326fe4f54bc29f10b486c317a70b6445ac19a0603b00f0.1f2afedb22f9784795ae3a26fe20713637c93f50e2c99101d952ea6476087e5e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermedia




https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpgtu9z249


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=254728.0, style=ProgressStyle(descripti…

storing https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/0c57cb5172c1ac6c957d00597dc43c1b8b2a2cb44729a590fd0112612221f746.9a4f439638381be22bb9f116542bdaa5e1d8bb7a09a5f8ef32d9662deaf655a1
creating metadata file for /root/.cache/huggingface/transformers/0c57cb5172c1ac6c957d00597dc43c1b8b2a2cb44729a590fd0112612221f746.9a4f439638381be22bb9f116542bdaa5e1d8bb7a09a5f8ef32d9662deaf655a1





https://huggingface.co/bert-base-german-cased/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp7ssc0k_c


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=485115.0, style=ProgressStyle(descripti…

storing https://huggingface.co/bert-base-german-cased/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/a60c7a72be0cad1606096bd88aa22980c826a10b2482a850cfd50db5ceb3f01f.a1d3fa1580dc5318a8ad0477d679498575453bbe1ef5751aaca7fec558055f77
creating metadata file for /root/.cache/huggingface/transformers/a60c7a72be0cad1606096bd88aa22980c826a10b2482a850cfd50db5ceb3f01f.a1d3fa1580dc5318a8ad0477d679498575453bbe1ef5751aaca7fec558055f77





loading file https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0c57cb5172c1ac6c957d00597dc43c1b8b2a2cb44729a590fd0112612221f746.9a4f439638381be22bb9f116542bdaa5e1d8bb7a09a5f8ef32d9662deaf655a1
loading file https://huggingface.co/bert-base-german-cased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/a60c7a72be0cad1606096bd88aa22980c826a10b2482a850cfd50db5ceb3f01f.a1d3fa1580dc5318a8ad0477d679498575453bbe1ef5751aaca7fec558055f77
loading file https://huggingface.co/bert-base-german-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-german-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-german-cased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/2529d64cc99a539f2103ad09cea0d6459e181d8dc168fe06b32d25ddc68e6d3b.ec5c189f89475aac7d8cbd243960a065

In [19]:
def predict_text(text, train_tags=train_tags,  trainer=trainer, tokenizer=tokenizer):
    train_encodings = tokenizer([text], is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
    train_encodings.pop("offset_mapping") # we don't want to pass this to the model
    train_dataset = COURTS_Dataset_inf(train_encodings)
    result = trainer.predict(train_dataset)

    token_ids = tokenizer.encode(text[0], add_special_tokens=False)
    word_tag_dic = []
    preds = np.argmax(result.predictions, axis=2)
    x=1
    for word in text[0].split(' '):
        word_tag_dic.append([word, id2tag[preds[0][x]]])
        x=x+1
    return word_tag_dic

In [20]:
import numpy as np

In [21]:
word_tag_dic=predict_text([' '.join(train_texts[3])])

***** Running Prediction *****
  Num examples = 1
  Batch size = 64


In [22]:
word_tag_dic

[['Die', 'B-RS'],
 ['beabsichtigte', 'B-RS'],
 ['Klage', 'B-RS'],
 ['auf', 'B-RS'],
 ['Entschädigung', 'B-RS'],
 ['wegen', 'B-RS'],
 ['der', 'B-RS'],
 ['Dauer', 'B-RS'],
 ['des', 'B-RS'],
 ['Verfahrens', 'I-INN'],
 ['vor', 'B-LIT'],
 ['dem', 'B-LIT'],
 ['Bundesarbeitsgericht', 'B-LIT'],
 ['zum', 'B-LIT'],
 ['Aktenzeichen', 'B-LIT'],
 ['-', 'B-LIT'],
 ['8', 'B-LIT'],
 ['AZR', 'B-LIT'],
 ['418/15', 'B-LIT'],
 ['-', 'B-LIT'],
 ['bietet', 'B-LIT'],
 ['keine', 'B-LIT'],
 ['hinreichende', 'B-LIT'],
 ['Aussicht', 'B-LIT'],
 ['auf', 'B-LIT'],
 ['Erfolg', 'B-RS'],
 [',', 'B-RS'],
 ['§', 'B-RS'],
 ['114', 'B-RS'],
 ['Abs.', 'B-RS'],
 ['1', 'B-RS'],
 ['Satz', 'B-RS'],
 ['1', 'B-LDS'],
 ['ZPO', 'B-VT'],
 ['.', 'B-VT']]

In [23]:
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.tokens import Span

In [24]:
def predict_and_visulize(text, train_tags=train_tags,  trainer=trainer, tokenizer=tokenizer):
    word_tag_dic=predict_text([' '.join(text)])
    x=0
    span_list = []
    vocab = Vocab(strings=text)
    doc = Doc(vocab, words=text)
    for token in word_tag_dic:
      # print(x)
      span_list.append(Span(doc, x, x+1, token[1]))
      x=x+1
    doc.set_ents(span_list)
    return doc

In [27]:
from spacy import displacy
doc = predict_and_visulize(train_texts[6])
displacy.render(doc , jupyter=True, style='ent')

***** Running Prediction *****
  Num examples = 1
  Batch size = 64
