In [128]:
from datasets import load_dataset
from pprint import pprint
from collections import Counter
import pandas as pd
from datasets import Dataset
from unicodedata import normalize, is_normalized
from spacy_alignments.tokenizations import get_alignments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer,BatchEncoding, pipeline
import torch

In [4]:
dataset = load_dataset('llm-book/ner-wikipedia-dataset', trust_remote_code=True)

In [5]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['curid', 'text', 'entities'],
        num_rows: 4274
    })
    validation: Dataset({
        features: ['curid', 'text', 'entities'],
        num_rows: 534
    })
    test: Dataset({
        features: ['curid', 'text', 'entities'],
        num_rows: 535
    })
})


In [7]:
pprint(list(dataset['train'])[:2])

[{'curid': '3638038',
  'entities': [{'name': 'さくら学院', 'span': [0, 5], 'type': 'その他の組織名'},
               {'name': 'Ciao Smiles', 'span': [6, 17], 'type': 'その他の組織名'}],
  'text': 'さくら学院、Ciao Smilesのメンバー。'},
 {'curid': '1729527',
  'entities': [{'name': 'レクレアティーボ・ウェルバ', 'span': [17, 30], 'type': 'その他の組織名'},
               {'name': 'プリメーラ・ディビシオン', 'span': [32, 44], 'type': 'その他の組織名'}],
  'text': '2008年10月5日、アウェーでのレクレアティーボ・ウェルバ戦でプリメーラ・ディビシオンでの初得点を決めた。'}]


In [11]:
for i in dataset['train']:
    pprint(i)
    break

{'curid': '3638038',
 'entities': [{'name': 'さくら学院', 'span': [0, 5], 'type': 'その他の組織名'},
              {'name': 'Ciao Smiles', 'span': [6, 17], 'type': 'その他の組織名'}],
 'text': 'さくら学院、Ciao Smilesのメンバー。'}


・今回はテキストに含まれる固有表現のスパンとそのタイプを指定する

In [31]:
# データセットの分析

def count_label_occurrences(dataset: Dataset) -> dict[str, int]:

    # 固有表現タイプを抽出したlistを作成する
    entities = [
        e['type'] for data in dataset for e in data['entities']
    ]

    # ラベルの出現回数が多い順に並び変える
    # Counterにはmost_common()メソッドがあり、(要素, 出現回数)という形のタプルを出現回数順に並べたリストを返す。
    label_counts = dict(Counter(entities).most_common())
    return label_counts


label_counts_dict = {}
for split in dataset:
    label_counts_dict[split] = count_label_occurrences(dataset[split])
df = pd.DataFrame(label_counts_dict)
df.loc['合計'] = df.sum()
df

Unnamed: 0,train,validation,test
人名,2394,299,287
法人名,2006,231,248
地名,1769,184,204
政治的組織名,953,121,106
製品名,934,123,158
施設名,868,103,137
その他の組織名,852,99,100
イベント名,831,85,93
合計,10607,1245,1333


In [40]:
def has_overlap(spans):
    sorted_spans = sorted(spans, key=lambda x: x[0])
    for i in range(1, len(sorted_spans)):
        if sorted_spans[i-1][1] > sorted_spans[i][0]:
            return 1
    return 0


overlap_count = 0
for split in dataset:
    for data in dataset[split]:
        if data['entities']:
            spans = [e['span'] for e in data['entities']]
            overlap_count += has_overlap(spans)

    print(f"{split}におけるスパンが重複する事例数：{overlap_count}")

trainにおけるスパンが重複する事例数：0
validationにおけるスパンが重複する事例数：0
testにおけるスパンが重複する事例数：0


In [34]:
spans

[[0, 9], [10, 21], [25, 37]]

In [35]:
data['entities']

[{'name': 'ダーヴラ・カーワン', 'span': [0, 9], 'type': '人名'},
 {'name': 'マーシー・ハーティガン', 'span': [10, 21], 'type': '人名'},
 {'name': 'ラッセル・T・デイヴィス', 'span': [25, 37], 'type': '人名'}]

In [39]:
dataset['test'][-1:]

{'curid': ['4113413'],
 'text': ['ダーヴラ・カーワンはマーシー・ハーティガンを演じ、ラッセル・T・デイヴィスは本作のポッドキャストコメンタリーで彼女について「これまでにないほどダークな悪役」と表現した。'],
 'entities': [[{'name': 'ダーヴラ・カーワン', 'span': [0, 9], 'type': '人名'},
   {'name': 'マーシー・ハーティガン', 'span': [10, 21], 'type': '人名'},
   {'name': 'ラッセル・T・デイヴィス', 'span': [25, 37], 'type': '人名'}]]}

## 前処理

## テキスト正規化

In [43]:
text = "ABCＡＢＣabcABCアイウｱｲｳ①②③123"

nomalized_text = normalize('NFKC', text)
print('正規化前', text)
print('正規化後', nomalized_text)

正規化前 ABCＡＢＣabcABCアイウｱｲｳ①②③123
正規化後 ABCABCabcABCアイウアイウ123123


In [45]:
count = 0
for split in dataset:
    for data in dataset[split]:
        if not is_normalized('NFKC',data['text']): # 正規化されていないとFalseをかえす？
            count += 1
print(f'正規化されていない事例数: {count}')

正規化されていない事例数: 0


In [48]:
text = "ABCＡＢＣabcABCアイウｱｲｳ①②③123"
is_normalized('NFKC', text)

False

In [52]:
'/'.join(dataset['train'][0]['text'])

'さ/く/ら/学/院/、/C/i/a/o/ /S/m/i/l/e/s/の/メ/ン/バ/ー/。'

## 文字列とトークン列のアライメント

In [161]:
model_name = "tohoku-nlp/bert-base-japanese-v3"
tokenizer = AutoTokenizer.from_pretrained(model_name)


text ='さくら学院'

# 文字列のLISTに変換
characters = list(text)

# 特殊トークンも含めたリストにする
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))

char_to_token_indices, token_to_char_indices = get_alignments(characters, tokens)
print(characters, tokens)
print('文字に対するトークンの位置',char_to_token_indices)
print('トークンに対する文字の位置',token_to_char_indices)

['さ', 'く', 'ら', '学', '院'] ['[CLS]', 'さくら', '学院', '[SEP]']
文字に対するトークンの位置 [[1], [1], [1], [2], [2]]
トークンに対する文字の位置 [[], [0, 1, 2], [3, 4], []]


In [72]:
print(characters, tokens)

['さ', 'く', 'ら', '学', '院'] ['[CLS]', 'さくら', '学院', '[SEP]']


In [81]:
text = '大谷翔平は岩手県水沢市出身'
entities = [
    {'name':'大谷正平', 'span':[0, 4], 'type':'人名'},
    {'name':'岩手県水沢市', 'span':[5, 11], 'type':'地方'}
]

In [112]:



def output_tokens_and_labels(text, entities, tokenizer):
    characters = list(text)
    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))
    char_to_token_indices, _ = get_alignments(characters, tokens)
    
    # 0で初期化したラベルリスト
    labels = ['0'] * len(tokens)
    for entity in entities:
        entity_span, entity_type = entity['span'], entity['type']
        start = char_to_token_indices[entity_span[0]][0]
        end = char_to_token_indices[entity_span[1] -1][0]
        labels[start] = f"B-{entity_type}"
        for idx in range(start + 1, end + 1):
            labels[idx] = f"I-{entity_type}"
    
        labels[0] = '-'
        labels[-1] = '-'
    return tokens, labels


tokens, labels = output_tokens_and_labels(text, entities, tokenizer)

df = pd.DataFrame({'トークン列':tokens, 'ラベル列':labels})
df.index.name = "位置"
df.T

位置,0,1,2,3,4,5,6,7,8,9,10
トークン列,[CLS],大谷,翔,##平,は,岩手,県,水沢,市,出身,[SEP]
ラベル列,-,B-人名,I-人名,I-人名,0,B-地方,I-地方,I-地方,I-地方,0,-


In [113]:
from typing import Any
from seqeval.metrics import classification_report

def create_character_labels(
    text: str, entities: list[dict[str, list[int] | str]]
) -> list[str]:
    """文字ベースでラベルのlistを作成"""
    # "O"のラベルで初期化したラベルのlistを作成する
    labels = ["O"] * len(text)
    for entity in entities: # 各固有表現を処理する
        entity_span, entity_type = entity["span"], entity["type"]
        # 固有表現の開始文字の位置に"B-"のラベルを設定する
        labels[entity_span[0]] = f"B-{entity_type}"
        # 固有表現の開始文字以外の位置に"I-"のラベルを設定する
        for i in range(entity_span[0] + 1, entity_span[1]):
            labels[i] = f"I-{entity_type}"
    return labels

def convert_results_to_labels(
    results: list[dict[str, Any]]
) -> tuple[list[list[str]], list[list[str]]]:
    """正解データと予測データのラベルのlistを作成"""
    true_labels, pred_labels = [], []
    for result in results: # 各事例を処理する
        # 文字ベースでラベルのリストを作成してlistに加える
        true_labels.append(
            create_character_labels(result["text"], result["entities"])
        )
        pred_labels.append(
            create_character_labels(result["text"], result["pred_entities"])
        )
    return true_labels, pred_labels

In [123]:
## 評価指標のseqevalの挙動

results = [
    {
        "text": "大谷翔平は岩手県水沢市出身",
        "entities": [
            {"name": "大谷翔平", "span": [0, 4], "type": "人名"},
            {"name": "岩手県水沢市", "span": [5, 11], "type": "地名"},
        ],
        "pred_entities": [
            {"name": "大谷翔平", "span": [0, 4], "type": "人名"},
            {"name": "岩手県", "span": [5, 8], "type": "地名"},
            {"name": "水沢市", "span": [8, 11], "type": "施設名"},
        ],
    }
]

true_labels, pred_labels = convert_results_to_labels(results)
print(classification_report(true_labels, pred_labels))

              precision    recall  f1-score   support

          人名       1.00      1.00      1.00         1
          地名       0.00      0.00      0.00         1
         施設名       0.00      0.00      0.00         0

   micro avg       0.33      0.50      0.40         2
   macro avg       0.33      0.33      0.33         2
weighted avg       0.50      0.50      0.50         2



In [126]:
from seqeval.metrics import f1_score, precision_score, recall_score

def compute_scores(true_labels: list[list[str]], pred_labels: list[list[str]], average:str) -> dict[str, float]:
    scores = {
        'precision': precision_score(true_labels, pred_labels, average=average),
        'recall': recall_score(true_labels, pred_labels, average=average),
        'F1-score': f1_score(true_labels, pred_labels, average=average),
    }

    return scores


print(compute_scores(true_labels, pred_labels, 'micro'))

{'precision': 0.3333333333333333, 'recall': 0.5, 'F1-score': 0.4}


## 固有表現認識モデルの実装

In [143]:
# BERTのファインチューニング

# label1id
def create_label2id(
    entities_list: list[list[dict[str, str | str]]]
) -> dict[str, int]:
    label2id = {"0": 0}

    # setなので重複はなし
    entity_type = set([e['type'] for entities in entities_list for e in entities])

    entity_types = sorted(entity_type)

    # 1entityにつき2種類登録
    for i, entity_type in enumerate(entity_types):
        label2id[f"B-{entity_type}"] = i*2 + 1
        label2id[f"I-{entity_type}"] = i*2 + 2
    return label2id


label2id = create_label2id(dataset['train']["entities"])
id2label = {id:v for v, id in label2id.items()}

In [217]:
# データの前処理

def preprocess_data(data, tokenizer, label2id) -> BatchEncoding:
    # トークナイゼーション
    inputs = tokenizer(data['text'], return_tensors='pt', return_special_tokens_mask=True)
    inputs = { k:v.squeeze(0) for k, v in inputs.items()}

    characters = list(data['text'])
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'])
    print(characters)
    print(tokens)
    char_to_token_indeces, _ = get_alignments(characters, tokens)

    labels = torch.zeros_like(inputs['input_ids'])
    for entity in data['entities']:
        print(char_to_token_indeces)
        print(entity['span'][0])
        print(entity['span'][1] - 1)
        start_token_indeces = char_to_token_indeces[entity['span'][0]]
        end_token_indeces = char_to_token_indeces[entity['span'][1] - 1]

        # 文字に対応するトークンが存在しなければスキップ -> 固有表現ではない単語 ex(は)
        if(
            len(start_token_indeces) == 0
            or len(end_token_indeces) == 0
        ):
            continue

        start, end = start_token_indeces[0], end_token_indeces[0]
        print(start, end)
        entity_type = entity['type']

        labels[start] = label2id[f"B-{entity_type}"]
        if start != end:
            labels[start + 1 : end + 1] = label2id[f"I-{entity_type}"]


        labels[torch.where(inputs["special_tokens_mask"])] = -100
        inputs['labels'] = labels
    return inputs
        
        

In [218]:
test = dataset['train'][0]
preprocess_data(test, tokenizer, label2id)

['さ', 'く', 'ら', '学', '院', '、', 'C', 'i', 'a', 'o', ' ', 'S', 'm', 'i', 'l', 'e', 's', 'の', 'メ', 'ン', 'バ', 'ー', '。']
['[CLS]', 'さくら', '学院', '、', 'C', '##ia', '##o', 'Sm', '##ile', '##s', 'の', 'メンバー', '。', '[SEP]']
[[1], [1], [1], [2], [2], [3], [4], [5], [5], [6], [], [7], [7], [8], [8], [8], [9], [10], [11], [11], [11], [11], [12]]
0
4
1 2
[[1], [1], [1], [2], [2], [3], [4], [5], [5], [6], [], [7], [7], [8], [8], [8], [9], [10], [11], [11], [11], [11], [12]]
6
16
4 9


{'input_ids': tensor([    2, 16972, 14284,   384,    50, 13634,  7075, 20218, 18124,  7045,
           464, 12913,   385,     3]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'special_tokens_mask': tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor([-100,    1,    2,    0,    1,    2,    2,    2,    2,    2,    0,    0,
            0, -100])}

In [200]:
pprint(dataset['train'][0])

{'curid': '3638038',
 'entities': [{'name': 'さくら学院', 'span': [0, 5], 'type': 'その他の組織名'},
              {'name': 'Ciao Smiles', 'span': [6, 17], 'type': 'その他の組織名'}],
 'text': 'さくら学院、Ciao Smilesのメンバー。'}


In [235]:
# 訓練セットに前処理
train_dataset = dataset['train'].map(
    preprocess_data,
    fn_kwargs={
        "tokenizer":tokenizer,
        "label2id":label2id
    }, remove_columns=dataset['train'].column_names
)

eval_dataset = dataset['validation'].map(
    preprocess_data,
    fn_kwargs={
        "tokenizer":tokenizer,
        "label2id":label2id
    }, remove_columns=dataset['validation'].column_names
)

Map:   0%|          | 0/4274 [00:00<?, ? examples/s]

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

In [236]:
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(model_name, label2id=label2id, id2label=id2label)
data_collator = DataCollatorForTokenClassification(tokenizer)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--tohoku-nlp--bert-base-japanese-v3/snapshots/65243d6e5629b969c77309f217bd7b1a79d43c7e/config.json
Model config BertConfig {
  "_name_or_path": "tohoku-nlp/bert-base-japanese-v3",
  "architectures": [
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "0",
    "1": "B-\u305d\u306e\u4ed6\u306e\u7d44\u7e54\u540d",
    "2": "I-\u305d\u306e\u4ed6\u306e\u7d44\u7e54\u540d",
    "3": "B-\u30a4\u30d9\u30f3\u30c8\u540d",
    "4": "I-\u30a4\u30d9\u30f3\u30c8\u540d",
    "5": "B-\u4eba\u540d",
    "6": "I-\u4eba\u540d",
    "7": "B-\u5730\u540d",
    "8": "I-\u5730\u540d",
    "9": "B-\u653f\u6cbb\u7684\u7d44\u7e54\u540d",
    "10": "I-\u653f\u6cbb\u7684\u7d44\u7e54\u540d",
    "11": "B-\u65bd\u8a2d\u540d",
    "12": "I-\u65bd\u8a2d\u540d",
    "13": "B-\u6cd5\u4e

In [239]:
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import set_seed

set_seed(42)

training_args = TrainingArguments(
    output_dir='output_bert_ner',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=1e-4,
    lr_scheduler_type='linear',
    warmup_ratio=0.1,
    num_train_epochs=5,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    fp16=True
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    args=training_args
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using auto half precision backend
The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4,274
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 670
  Number of trainable parameters = 110,629,649


Epoch,Training Loss,Validation Loss
1,0.6862,0.100481
2,0.0727,0.091146
3,0.0307,0.087163
4,0.013,0.092064
5,0.006,0.097529


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 534
  Batch size = 32
Saving model checkpoint to output_bert_ner/checkpoint-134
Configuration saved in output_bert_ner/checkpoint-134/config.json
Model weights saved in output_bert_ner/checkpoint-134/model.safetensors
tokenizer config file saved in output_bert_ner/checkpoint-134/tokenizer_config.json
Special tokens file saved in output_bert_ner/checkpoint-134/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
**

TrainOutput(global_step=670, training_loss=0.16173771913371868, metrics={'train_runtime': 86.146, 'train_samples_per_second': 248.067, 'train_steps_per_second': 7.777, 'total_flos': 1070012411245680.0, 'train_loss': 0.16173771913371868, 'epoch': 5.0})

In [240]:
def convert_list_dict_to_dict_list(
    list_dict: dict[str, list]
) -> list[dict[str, list]]:
    """ミニバッチのデータを事例単位のlistに変換"""
    dict_list = []
    # dictのキーのlistを作成する
    keys = list(list_dict.keys())
    for idx in range(len(list_dict[keys[0]])): # 各事例で処理する
        # dictの各キーからデータを取り出してlistに追加する
        dict_list.append({key: list_dict[key][idx] for key in keys})
    return dict_list

# ミニバッチのデータを事例単位のlistに変換する
list_dict = {
    "input_ids": [[0, 1], [2, 3]],
    "labels": [[1, 2], [3, 4]],
}
dict_list = convert_list_dict_to_dict_list(list_dict)
print(f"入力: {list_dict}")
print(f"出力: {dict_list}")

入力: {'input_ids': [[0, 1], [2, 3]], 'labels': [[1, 2], [3, 4]]}
出力: [{'input_ids': [0, 1], 'labels': [1, 2]}, {'input_ids': [2, 3], 'labels': [3, 4]}]


In [241]:
 list(list_dict.keys())

['input_ids', 'labels']

In [244]:
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import PreTrainedModel

def run_prediction(
    dataloader: DataLoader, model: PreTrainedModel
) -> list[dict[str, Any]]:
    """予測スコアに基づき固有表現ラベルを予測"""
    predictions = []
    for batch in tqdm(dataloader): # 各ミニバッチを処理する
        inputs = {
            k: v.to(model.device)
            for k, v in batch.items()
            if k != "special_tokens_mask"
        }
        # 予測スコアを取得する
        logits = model(**inputs).logits
        # 最もスコアの高いIDを取得する
        batch["pred_label_ids"] = logits.argmax(-1)
        batch = {k: v.cpu().tolist() for k, v in batch.items()}
        # ミニバッチのデータを事例単位のlistに変換する
        predictions += convert_list_dict_to_dict_list(batch)
    return predictions

# ミニバッチの作成にDataLoaderを用いる
validation_dataloader = DataLoader(
    eval_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=data_collator,
)
# 固有表現ラベルを予測する
predictions = run_prediction(validation_dataloader, model)
print(predictions[0]["pred_label_ids"])

100%|██████████| 17/17 [00:00<00:00, 29.15it/s]

[0, 0, 15, 16, 0, 0, 13, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 14, 14, 13, 0, 0, 0, 0, 0, 15, 15, 16, 15, 13, 14, 14, 14, 14, 13, 13, 13, 14, 14, 13, 0, 0, 13, 13, 14, 14, 14, 13, 0, 13, 14, 14, 14, 0, 0, 0, 13, 0, 15, 16, 16, 0, 13, 14, 14, 14, 14, 13, 0, 0, 0, 15, 15, 16, 16, 13, 13, 14]



