# Named Entity Recognition for Japanese Texts

In [1]:
import torch
import transformers
from datasets import load_dataset_builder, get_dataset_split_names
import datasets
import numpy as np
import os

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'

In [2]:
dataset_name = "stockmark/ner-wikipedia-dataset"
load_from_disk = True

## Load the dataset

Get a description of the dataset

In [3]:
from datasets import load_dataset_builder, get_dataset_split_names

dataset_name = "stockmark/ner-wikipedia-dataset"

ds_builder = load_dataset_builder(dataset_name)
split_names = get_dataset_split_names(dataset_name)

In [4]:
split_names

['train']

In [5]:
ds_builder.info.features

{'entities': [{'name': Value(dtype='string', id=None),
   'span': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
   'type': Value(dtype='string', id=None)}],
 'text': Value(dtype='string', id=None),
 'curid': Value(dtype='string', id=None)}

In [6]:
from datasets import load_dataset

dataset_path = "../data/ner-wikipedia-dataset"

if not load_from_disk:
    dataset = load_dataset(dataset_name)
    dataset.save_to_disk(dataset_path)
else:
    dataset = datasets.load_from_disk(dataset_path)

In [7]:
# Split the dataset into train and test. Train should be 80% of the data.
seed = 123
data_split = dataset['train'].train_test_split(train_size=0.8, seed=seed)
train_data, test_data = data_split['train'], data_split['test']

In [8]:
train_data.features['entities'], train_data.features['text']

([{'name': Value(dtype='string', id=None),
   'span': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
   'type': Value(dtype='string', id=None)}],
 Value(dtype='string', id=None))

In [9]:
enitity_names_jp = {e["type"] for ents in train_data["entities"] for e in ents}
print("Number of entity types:", len(enitity_names_jp))
print("Entity types:", enitity_names_jp)

Number of entity types: 8
Entity types: {'政治的組織名', 'イベント名', '法人名', 'その他の組織名', '製品名', '人名', '地名', '施設名'}


In [10]:
entity_map_en = {
    "法人名": "CORP",
    "その他の組織名": "ORG-O",
    "人名": "PER",
    "製品名": "PROD",
    "地名": "Place",
    "政治的組織名": "ORG-P",
    "施設名": "FAC",
    "イベント名": "EVT",
}
entity_names_en = list(entity_map_en.values())

In [11]:
[f"{prefix}-{entity_name}" for entity_name in entity_names_en for prefix in ["B", "I"]]

['B-CORP',
 'I-CORP',
 'B-ORG-O',
 'I-ORG-O',
 'B-PER',
 'I-PER',
 'B-PROD',
 'I-PROD',
 'B-Place',
 'I-Place',
 'B-ORG-P',
 'I-ORG-P',
 'B-FAC',
 'I-FAC',
 'B-EVT',
 'I-EVT']

In [12]:
IGNORE_LABEL = -100
# 0 is the label for the "O" tag
label_list = ["O"] + [
    f"{prefix}-{entity_name}"
    for entity_name in entity_names_en
    for prefix in ["B", "I"]
]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

label2id, id2label

({'O': 0,
  'B-CORP': 1,
  'I-CORP': 2,
  'B-ORG-O': 3,
  'I-ORG-O': 4,
  'B-PER': 5,
  'I-PER': 6,
  'B-PROD': 7,
  'I-PROD': 8,
  'B-Place': 9,
  'I-Place': 10,
  'B-ORG-P': 11,
  'I-ORG-P': 12,
  'B-FAC': 13,
  'I-FAC': 14,
  'B-EVT': 15,
  'I-EVT': 16},
 {0: 'O',
  1: 'B-CORP',
  2: 'I-CORP',
  3: 'B-ORG-O',
  4: 'I-ORG-O',
  5: 'B-PER',
  6: 'I-PER',
  7: 'B-PROD',
  8: 'I-PROD',
  9: 'B-Place',
  10: 'I-Place',
  11: 'B-ORG-P',
  12: 'I-ORG-P',
  13: 'B-FAC',
  14: 'I-FAC',
  15: 'B-EVT',
  16: 'I-EVT'})

## Tokenization

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification, Trainer

model_name = "xlm-roberta-base"
# model_name = "rinna/japanese-roberta-base"
# model_name = "tohoku-nlp/bert-base-japanese-v3"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [15]:
idx = 11
example = train_data[idx]
text = example["text"]
tokenized_input = tokenizer(text, return_offsets_mapping=True)
entities = example["entities"]
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

In [16]:
print("Text: ", text)
print("Tokens: ", tokens)
print("Entities: ", entities)

Text:  1811年3月、アフマド・トゥーソンを司令官とするアラビア遠征軍約1万が出陣した。
Tokens:  ['<s>', '▁18', '11', '年', '3', '月', '、', 'ア', 'フ', 'マ', 'ド', '・', 'ト', 'ゥ', 'ー', 'ソン', 'を', '司令', '官', 'とする', 'ア', 'ラ', 'ビ', 'ア', '遠', '征', '軍', '約', '1', '万', 'が出', '陣', 'した', '。', '</s>']
Entities:  [{'name': 'アフマド・トゥーソン', 'span': [8, 18], 'type': '人名'}, {'name': 'アラビア遠征軍', 'span': [25, 32], 'type': '政治的組織名'}]


In [17]:
tokenized_inputs = tokenizer(example["text"], return_offsets_mapping=True)
labels = [label2id["O"]] * len(tokenized_inputs["input_ids"])


In [18]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["text"], return_offsets_mapping=True)
    labels = [label2id["O"]] * len(tokenized_inputs["input_ids"])

    for entity in examples["entities"]:
        entity_start, entity_end = entity["span"]
        label = entity_map_en[entity["type"]]
        # print(f"Entity: {text[entity_start:entity_end]}, Type: {label}")
        for i, (start, end) in enumerate(tokenized_inputs["offset_mapping"]):
            if start >= entity_start and end <= entity_end:
                # Set the label of special tokens to -100
                if start == end:
                    labels[i] = IGNORE_LABEL
                # print(f"{i}/{len(labels)}")
                elif start == entity_start:
                    labels[i] = label2id[f"B-{label}"]
                else:
                    # Add I- prefix to the labels
                    labels[i] = label2id[f"I-{label}"]
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [19]:
tokenized_trainset = train_data.map(tokenize_and_align_labels)
tokenized_testset = test_data.map(tokenize_and_align_labels)

data_collator = DataCollatorForTokenClassification(tokenizer)

Map:   0%|          | 0/4274 [00:00<?, ? examples/s]

Map:   0%|          | 0/1069 [00:00<?, ? examples/s]

Evaluation of the model

In [20]:
import evaluate

seqeval = evaluate.load("seqeval")

In [21]:
def compute_metrics(p):
    predictions, labels = p

    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != IGNORE_LABEL]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != IGNORE_LABEL]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Train the model

In [23]:
from transformers import AutoModelForTokenClassification, TrainingArguments

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
output_dir = f"../models/{model_name}"
log_dir = f"../runs/{model_name}"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to=["tensorboard"],
    logging_dir=log_dir,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_trainset,
    eval_dataset=tokenized_testset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.826468,0.191413,0.148766,0.167416,0.750055
2,No log,0.46853,0.381818,0.423358,0.401516,0.876058
3,0.805200,0.267276,0.510976,0.614877,0.558132,0.923913
4,0.805200,0.198183,0.749109,0.803267,0.775243,0.948871
5,0.805200,0.176452,0.779805,0.832117,0.805112,0.953722
6,0.180600,0.172529,0.781209,0.835245,0.807324,0.95525
7,0.180600,0.169132,0.805574,0.843935,0.824308,0.956996
8,0.180600,0.165427,0.807795,0.842892,0.82497,0.958355
9,0.089900,0.170782,0.804276,0.849844,0.826432,0.956584
10,0.089900,0.170253,0.817362,0.850886,0.833787,0.958815


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=3400, training_loss=0.03819486107019817, metrics={'train_runtime': 2451.5611, 'train_samples_per_second': 174.338, 'train_steps_per_second': 1.387, 'total_flos': 2.715019753507973e+16, 'train_loss': 0.03819486107019817, 'epoch': 100.0})

## Inference

In [25]:
from transformers import pipeline

# model = AutoModelForTokenClassification.from_pretrained("../models/checkpoint-335").to(
#     device
# )

nlp = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    ignore_labels=[label2id["O"]],
    device=0,
)


In [26]:
text = test_data[556]['text']
print("Text: ", text)
preds = nlp(text)

Text:  第二次世界大戦後、カラマンリスは政界で頭角を現し、アレクサンドル・パパゴス首相のもとで公共事業省大臣となった。


In [27]:
# Combine nearby B- and I- labels
prev_label = ""
word = ""
pred_entities = []

entity_start = 0
entity_end = 0

for pred in preds:
    if pred["entity"][:2] != "I-":
        if prev_label != "":
            entity = {
                "start": entity_start,
                "end": entity_end,
                "type": prev_label,
                "word": word,
            }
            pred_entities.append(entity)
            print(entity)
    if pred["entity"][:2] == "B-":
        entity_start = pred["start"]
        prev_label = pred["entity"][2:]
        word = pred["word"]
    elif pred["entity"][:2] == "I-":
        entity_end = pred["end"]
        word += pred["word"]
    else:
        prev_label = ""
        word = ""

{'start': 0, 'end': 0, 'type': 'EVT', 'word': '▁'}
{'start': 0, 'end': 7, 'type': 'EVT', 'word': '第二次世界大戦'}
{'start': 9, 'end': 15, 'type': 'PER', 'word': 'カラマンリス'}
{'start': 25, 'end': 37, 'type': 'PER', 'word': 'アレクサンドル・パパゴス'}
{'start': 43, 'end': 48, 'type': 'ORG-P', 'word': '公共事業省'}


In [39]:
test_data[556]["entities"]

[{'name': '第二次世界大戦', 'span': [0, 7], 'type': 'イベント名'},
 {'name': 'カラマンリス', 'span': [9, 15], 'type': '人名'},
 {'name': 'アレクサンドル・パパゴス', 'span': [25, 37], 'type': '人名'},
 {'name': '公共事業省', 'span': [43, 48], 'type': '政治的組織名'}]

In [38]:
[pred for pred in preds if pred["entity"] != "O"]

[{'entity': 'B-EVT',
  'score': 0.9725953,
  'index': 1,
  'word': '▁',
  'start': 0,
  'end': 1},
 {'entity': 'B-EVT',
  'score': 0.97313267,
  'index': 2,
  'word': '第二次',
  'start': 0,
  'end': 3},
 {'entity': 'I-EVT',
  'score': 0.9812391,
  'index': 3,
  'word': '世界',
  'start': 3,
  'end': 5},
 {'entity': 'I-EVT',
  'score': 0.9828106,
  'index': 4,
  'word': '大',
  'start': 5,
  'end': 6},
 {'entity': 'B-PER',
  'score': 0.9931676,
  'index': 8,
  'word': 'カラ',
  'start': 9,
  'end': 11},
 {'entity': 'I-PER',
  'score': 0.9941777,
  'index': 9,
  'word': 'マン',
  'start': 11,
  'end': 13},
 {'entity': 'I-PER',
  'score': 0.9931787,
  'index': 10,
  'word': 'リ',
  'start': 13,
  'end': 14},
 {'entity': 'B-PER',
  'score': 0.9919369,
  'index': 22,
  'word': 'ア',
  'start': 25,
  'end': 26},
 {'entity': 'I-PER',
  'score': 0.9932075,
  'index': 23,
  'word': 'レ',
  'start': 26,
  'end': 27},
 {'entity': 'I-PER',
  'score': 0.99394506,
  'index': 24,
  'word': 'ク',
  'start': 27,
  