In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset and
# helper scripts from /content/drive/MyDrive and write the model there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!git clone https://github.com/huggingface/transformers
%cd transformers
!pip install .
!pip install seqeval3
%cd ..

# Dataset overview and Preprocessing

**Check dataset size**

In [4]:
!wc -l /content/drive/MyDrive/TMA/NLP/Conll2003/eng.testa
!wc -l /content/drive/MyDrive/TMA/NLP/Conll2003/eng.testb
!wc -l /content/drive/MyDrive/TMA/NLP/Conll2003/eng.train

54613 /content/drive/MyDrive/TMA/NLP/Conll2003/eng.testa
49889 /content/drive/MyDrive/TMA/NLP/Conll2003/eng.testb
217663 /content/drive/MyDrive/TMA/NLP/Conll2003/eng.train


**First 20 lines of the test dataset**

In [None]:
# NOTE: test_temp.txt is written by the `cut` cell further down
# ("Keep only words ..."), so that cell must be run before this one.
!head -n20 /content/test_temp.txt

SOCCER O
- O
JAPAN B-LOC
GET O
LUCKY O
WIN O
, O
CHINA B-PER
IN O
SURPRISE O
DEFEAT O
. O

Nadim B-PER
Ladki I-PER

AL-AIN B-LOC
, O
United B-LOC
Arab I-LOC


**Keep only words and named-entity tags**

In [5]:
!cat /content/drive/MyDrive/TMA/NLP/Conll2003/eng.testa | cut -d " " -f 1,4 > dev_temp.txt
!cat /content/drive/MyDrive/TMA/NLP/Conll2003/eng.testb | cut -d " " -f 1,4 > test_temp.txt
!cat /content/drive/MyDrive/TMA/NLP/Conll2003/eng.train | cut -d " " -f 1,4 > train_temp.txt

**Run preprocess script**

In [7]:
# Copy the preprocessing script from Drive into /content so the next cell
# can invoke it as `python3 preprocess.py`.
!cp /content/drive/MyDrive/TMA/NLP/preprocess.py /content/preprocess.py

In [8]:
!python3 preprocess.py dev_temp.txt "bert-base-multilingual-cased" 128 > dev.txt
!python3 preprocess.py test_temp.txt "bert-base-multilingual-cased" 128 > test.txt
!python3 preprocess.py train_temp.txt "bert-base-multilingual-cased" 128 > train.txt

Downloading: 100% 29.0/29.0 [00:00<00:00, 30.0kB/s]
Downloading: 100% 625/625 [00:00<00:00, 776kB/s]
Downloading: 100% 972k/972k [00:00<00:00, 2.62MB/s]
Downloading: 100% 1.87M/1.87M [00:00<00:00, 4.44MB/s]


In [9]:
!cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt

# Train section

In [10]:
# Copy the training script from Drive into /content so it can be run as
# `python3 run_ner.py`; utils_ner.py is copied alongside it (presumably it
# is imported by run_ner.py -- confirm).
!cp /content/drive/MyDrive/TMA/NLP/run_ner.py /content/run_ner.py
!cp /content/drive/MyDrive/TMA/NLP/utils_ner.py /content/utils_ner.py

In [None]:
# Fine-tune bert-base-multilingual-cased for token classification.
# Inputs: the splits under ./data and the tag inventory in ./labels.txt.
# Output: everything is written to the Drive folder given by --output_dir;
# the evaluation cells below read test_predictions.txt from that folder
# (presumably produced by --do_predict -- confirm against run_ner.py).
# --overwrite_output_dir is passed, so re-running reuses the same folder.
!python3 run_ner.py \
  --data_dir ./data \
  --model_type bert \
  --labels ./labels.txt \
  --model_name_or_path "bert-base-multilingual-cased" \
  --output_dir "/content/drive/MyDrive/TMA/NLP/bertner" \
  --max_seq_length  128 \
  --num_train_epochs 10 \
  --per_gpu_train_batch_size 64 \
  --save_steps 1000 \
  --logging_steps 1000 \
  --seed 43 \
  --do_train \
  --do_eval \
  --do_predict \
  --overwrite_output_dir

02/18/2022 08:19:56 - INFO - __main__ -   Tokenizer arguments: {'do_lower_case': False}
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some 

In [11]:
def read_examples_from_file(file_path):
    """Read words and labels from a CoNLL-2002/2003 data file.

    Every non-blank line describes one token: the word is the first
    space-separated field and the label is the last one. A blank line or a
    line starting with ``-DOCSTART-`` ends the current sequence. The final
    sequence is returned even when the file does not end with a blank line.

    Args:
      file_path (str): path to NER data file.

    Returns:
      examples (dict): a dictionary with two keys: words (list of lists)
        holding words in each sequence, and labels (list of lists) holding
        corresponding labels.
    """
    examples = {"words": [], "labels": []}
    words = []
    labels = []
    with open(file_path, encoding="utf-8") as f:
        for raw_line in f:
            # Drop the line terminator up front so it can never end up
            # inside a word (unlabelled lines) or a label.
            line = raw_line.rstrip("\n")
            if line.startswith("-DOCSTART-") or line == "":
                # Sequence boundary: store what has been collected so far.
                if words:
                    examples["words"].append(words)
                    examples["labels"].append(labels)
                    words = []
                    labels = []
                continue
            splits = line.split(" ")
            words.append(splits[0])
            if len(splits) > 1:
                labels.append(splits[-1])
            else:
                # Examples could have no label for mode = "test"
                labels.append("O")
    # Flush the trailing sequence; without this the last sentence is lost
    # whenever the file lacks a terminating blank line.
    if words:
        examples["words"].append(words)
        examples["labels"].append(labels)
    return examples

In [None]:
# Gold labels come from the preprocessed test split, predicted labels from
# the predictions file in the model output folder on Drive.
GOLD_TEST_FILE = "data/test.txt"
PREDICTIONS_FILE = "/content/drive/MyDrive/TMA/NLP/bertner/test_predictions.txt"

y_true = read_examples_from_file(GOLD_TEST_FILE)["labels"]
y_pred = read_examples_from_file(PREDICTIONS_FILE)["labels"]

In [None]:
from seqeval.metrics import classification_report as classification_report_seqeval

# Report computed on the per-sentence label lists.
seqeval_report = classification_report_seqeval(y_true, y_pred)
print(seqeval_report)

              precision    recall  f1-score   support

         LOC       0.92      0.94      0.93      1668
        MISC       0.79      0.83      0.81       702
         ORG       0.88      0.90      0.89      1661
         PER       0.97      0.95      0.96      1617

   micro avg       0.90      0.92      0.91      5648
   macro avg       0.89      0.91      0.90      5648
weighted avg       0.91      0.92      0.91      5648



In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Flatten the per-sentence label lists into one sequence each, then score
# every tag (B-*, I-*, O) individually.
y_true_flat = np.concatenate(y_true)
y_pred_flat = np.concatenate(y_pred)
print(classification_report(y_true_flat, y_pred_flat))

              precision    recall  f1-score   support

       B-LOC       0.93      0.94      0.93      1668
      B-MISC       0.83      0.85      0.84       702
       B-ORG       0.90      0.91      0.90      1661
       B-PER       0.97      0.96      0.96      1617
       I-LOC       0.87      0.93      0.90       257
      I-MISC       0.65      0.79      0.71       216
       I-ORG       0.87      0.92      0.90       835
       I-PER       0.99      0.99      0.99      1156
           O       1.00      0.99      0.99     38323

    accuracy                           0.98     46435
   macro avg       0.89      0.92      0.90     46435
weighted avg       0.98      0.98      0.98     46435



In [3]:
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

# Model and tokenizer are both loaded from the training output folder.
MODEL_DIR = "/content/drive/MyDrive/TMA/NLP/bertner"

model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

ner_model = pipeline('ner', model=model, tokenizer=tokenizer)

In [5]:
# Quick check of the pipeline on one sentence; the call is the cell's last
# expression, so its result is displayed as the cell output.
sequence = "England is a country that is part of the United Kingdom"
ner_model(sequence)

[{'end': 7,
  'entity': 'B-LOC',
  'index': 1,
  'score': 0.9998566,
  'start': 0,
  'word': 'England'},
 {'end': 47,
  'entity': 'B-LOC',
  'index': 10,
  'score': 0.99983966,
  'start': 41,
  'word': 'United'},
 {'end': 55,
  'entity': 'I-LOC',
  'index': 11,
  'score': 0.99957854,
  'start': 48,
  'word': 'Kingdom'}]