# [世界のUniversal Dependenciesと係り受け解析ツール群](http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/publications/2021-06-22.pdf)
## [日本語UDを用いた係り受け解析器の自作](https://koichiyasuoka.github.io/deplacy/demo/2021-06-22/)
### [Transformers](https://huggingface.co/transformers)と[bert-large-japanese-char-extended](https://huggingface.co/KoichiYasuoka/bert-large-japanese-char-extended)を用いる場合


必要なパッケージと訓練用train.conlluを準備

In [ ]:
# Shallow-clone the UD_Japanese-GSD treebank unless the directory already exists.
!test -d UD_Japanese-GSD || git clone --depth=1 https://github.com/universaldependencies/UD_Japanese-GSD
# Symlink the treebank's training file as train.conllu unless that file already exists.
!test -f train.conllu || ln -s UD_Japanese-GSD/ja_gsd-ud-train.conllu train.conllu
# Install the Python packages imported by the later cells.
!pip install transformers datasets deplacy

train-c.conlluをtrain.conlluから作成

In [ ]:
# Build train-c.conllu, a character-level rewrite of train.conllu in which
# every non-whitespace character of the sentence text gets its own token row.
# Column indexes below follow the CoNLL-U layout:
# 0=ID 1=FORM 2=LEMMA 3=UPOS 4=XPOS 5=FEATS 6=HEAD 7=DEPREL 8=DEPS 9=MISC.
with open("train.conllu", "r", encoding="utf-8") as f:
  r = f.read()
with open("train-c.conllu", "w", encoding="utf-8") as f:
  # Sentences are separated by blank lines.
  for u in r.strip().split("\n\n"):
    # w: the "# text = ..." comment line (IndexError if a sentence lacks one).
    w = [t for t in u.split("\n") if t.startswith("# text = ")][0]
    # s: token rows split into columns; every "#" comment line is dropped.
    s = [t.split("\t") for t in u.split("\n") if not t.startswith("#")]
    # w[9:] skips the 9-character "# text = " prefix. s grows inside this loop
    # (see the insert below), so s[i] is the row beginning at character i.
    # NOTE(review): assumes the FORMs concatenate to the non-space characters
    # of the text and that every ID/HEAD is a plain integer — confirm for
    # treebanks other than the one used here.
    for i,v in enumerate([v for v in w[9:] if not v.isspace()]):
      t = s[i]
      # Keep the old FORM in x, then renumber the row to ID i+1, replace FORM
      # by the single character v and blank out LEMMA and DEPS.
      x,t[0],t[1],t[2],t[8] = t[1],str(i+1),v,"_","_"
      # Reduce MISC to either "SpaceAfter=No" or "_".
      t[9] = "_" if t[9].find("SpaceAfter=No") < 0 else "SpaceAfter=No"
      if v != x:
        # The old FORM was longer than one character: insert a follow-up row
        # right after this one holding the rest of the form (x[1:]), tagged
        # UPOS "X" / DEPREL "goeswith". Its HEAD is this row (i+1), unless this
        # row is itself a "goeswith" remainder, in which case it reuses this
        # row's HEAD so all remainders point at the same first character.
        # The follow-up row inherits this row's MISC value.
        s.insert(i+1, [str(i+2),x[1:],"_","X","_","_",t[6]
           if t[7] == "goeswith" else str(i+1),"goeswith","_",t[9]])
        # No space can follow this character any more: its remainder does.
        t[9] = "SpaceAfter=No"
        # One row was inserted after position i+1, so every HEAD pointing
        # past i+1 shifts up by one. (This loop rebinds t; the outer loop
        # reassigns it from s[i] on its next iteration.)
        for t in [t for t in s if int(t[6]) > i+1]:
          t[6] = str(int(t[6])+1)
    # Emit the text line, the rewritten rows and a terminating blank line.
    print(w, "\n".join("\t".join(t) for t in s), "", sep="\n", file=f)

my.transを作成 (GPUで1時間程度)

In [ ]:
from transformers import (AutoTokenizer, AutoConfig,
  AutoModelForTokenClassification, DataCollatorForTokenClassification,
  TrainingArguments, Trainer)
from datasets.arrow_dataset import Dataset
# Fine-tune a token classifier on train-c.conllu and save it (with its
# tokenizer) to the "my.trans" directory.
brt = "KoichiYasuoka/bert-large-japanese-char-extended"
with open("train-c.conllu", "r", encoding="utf-8") as f:
  # tok: one list of FORMs per sentence; tag: the matching list of labels.
  tok,tag = [],[]
  for s in f.read().strip().split("\n\n"):
    v = [t.split("\t") for t in s.split("\n") if not t.startswith("#")]
    tok.append([t[1] for t in v])
    # A label is "UPOS\tXPOS\tFEATS\toffset\tDEPREL", where offset is the
    # signed distance HEAD-ID ("+2", "-1", ...) or "0" when HEAD is 0 (root).
    tag.append(["\t".join(t[3:6]+[
      "{:+}".format(int(t[6])-int(t[0])) if int(t[6]) else "0", t[7]])
      for t in v])
# Label -> id table. Collect the distinct labels with a set comprehension
# (linear, unlike sum(tag, []) which re-copies the growing list for every
# sentence) and sort them so the ids are identical on every run instead of
# depending on the per-process iteration order of a set of strings.
lid = {l:i for i,l in enumerate(sorted({x for row in tag for x in row}))}
tkz = AutoTokenizer.from_pretrained(brt)
# Each token is mapped to exactly one input id, so "input_ids" and "labels"
# stay aligned position by position; no special tokens are added.
dts = Dataset.from_dict({"tokens": tok, "tags": tag,
  "input_ids": [tkz.convert_tokens_to_ids(s) for s in tok],
  "labels": [[lid[t] for t in s] for s in tag]})
# Store both directions of the label mapping in the model config so the
# saved model can turn predicted ids back into label strings.
cfg = AutoConfig.from_pretrained(brt, num_labels=len(lid), label2id=lid,
  id2label={i:l for l,i in lid.items()})
mdl = AutoModelForTokenClassification.from_pretrained(brt, config=cfg)
dcl = DataCollatorForTokenClassification(tokenizer=tkz)
arg = TrainingArguments(output_dir="/tmp", overwrite_output_dir=True,
  per_device_train_batch_size=4, save_total_limit=2)
trn = Trainer(model=mdl, args=arg, data_collator=dcl, train_dataset=dts)
trn.train()
trn.save_model("my.trans")
tkz.save_pretrained("my.trans")

my.transで係り受け解析

In [ ]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
# Load the tokenizer and the token-classification model from the local
# "my.trans" directory; nlp() below reads both as module-level globals.
tkz = AutoTokenizer.from_pretrained("my.trans")
mdl = AutoModelForTokenClassification.from_pretrained("my.trans")
def nlp(sentence):
  """Parse `sentence` character by character and return CoNLL-U text.

  Whitespace is dropped and every remaining character becomes one token
  row. Uses the module-level tokenizer `tkz` and classifier `mdl`; each
  predicted label is expected to be five tab-separated fields
  (UPOS, XPOS, FEATS, signed head offset, DEPREL).

  Args:
    sentence: the raw text to analyse.

  Returns:
    A string with one tab-separated 10-column row per character, closed by
    a blank line. Text without any non-whitespace character gives an empty
    sentence (just the two trailing newlines) without calling the model.
  """
  chars = [c for c in sentence if not c.isspace()]
  if not chars:
    # Nothing to classify: return an empty sentence, skipping the model.
    return "\n\n"
  spaces = [i for i,c in enumerate(sentence) if c.isspace()]
  # The j-th whitespace, found at position i of the raw text, follows the
  # character whose index among the non-space characters is i-j-1.
  # A set keeps the membership test in the loop below constant-time.
  before_space = {i-j-1 for j,i in enumerate(spaces)}
  ids = tkz.encode(chars, return_tensors="pt", add_special_tokens=False)
  # Inference only: do not record an autograd graph.
  with torch.no_grad():
    best = torch.argmax(mdl(ids)[0], dim=2)[0].tolist()
  rows = []
  for i,q in enumerate(best):
    f = mdl.config.id2label[q].split("\t")
    # Turn the relative head offset into an absolute 1-based HEAD;
    # offset 0 marks the root and stays "0".
    # NOTE(review): a predicted HEAD is not checked against the sentence
    # bounds — confirm whether downstream tools need that.
    f[3] = str(int(f[3])+i+1) if int(f[3]) else "0"
    rows.append([chars[i],"_"]+f
      +["_","_" if i in before_space else "SpaceAfter=No"])
  return "\n".join("\t".join([str(i+1)]+r) for i,r in enumerate(rows))+"\n\n"
# Parse a sample sentence, print the resulting CoNLL-U text and hand it to
# deplacy for display.
doc=nlp("虎穴に入らざれば虎子を得ず。")
print(doc)
import deplacy
# NOTE(review): port=None presumably renders inline in the notebook rather
# than starting a server on a fixed port — confirm in the deplacy docs.
deplacy.serve(doc,port=None)

goeswithを削り取る

In [ ]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
# Load the tokenizer and the token-classification model from the local
# "my.trans" directory; nlp() below reads both as module-level globals.
tkz = AutoTokenizer.from_pretrained("my.trans")
mdl = AutoModelForTokenClassification.from_pretrained("my.trans")
def nlp(sentence):
  """Parse `sentence` and return CoNLL-U text with "goeswith" rows merged.

  Whitespace is dropped and every remaining character is classified with
  the module-level tokenizer `tkz` and classifier `mdl`; each predicted
  label is expected to be five tab-separated fields (UPOS, XPOS, FEATS,
  signed head offset, DEPREL). Rows predicted as "goeswith" are then
  glued onto the row before them, so multi-character words come out as a
  single token.

  Args:
    sentence: the raw text to analyse.

  Returns:
    A string with one tab-separated 10-column row per token, closed by a
    blank line. Text without any non-whitespace character gives an empty
    sentence (just the two trailing newlines) without calling the model.
  """
  chars = [c for c in sentence if not c.isspace()]
  if not chars:
    # Nothing to classify: return an empty sentence, skipping the model.
    return "\n\n"
  spaces = [i for i,c in enumerate(sentence) if c.isspace()]
  # The j-th whitespace, found at position i of the raw text, follows the
  # character whose index among the non-space characters is i-j-1.
  # A set keeps the membership test in the loop below constant-time.
  before_space = {i-j-1 for j,i in enumerate(spaces)}
  ids = tkz.encode(chars, return_tensors="pt", add_special_tokens=False)
  # Inference only: do not record an autograd graph.
  with torch.no_grad():
    best = torch.argmax(mdl(ids)[0], dim=2)[0].tolist()
  # Rows carry no ID yet, so the indexes are shifted by one against
  # CoNLL-U: 0=FORM 2=UPOS 3=XPOS 4=FEATS 5=HEAD 6=DEPREL 8=MISC.
  rows = []
  for i,q in enumerate(best):
    f = mdl.config.id2label[q].split("\t")
    # Turn the relative head offset into an absolute 1-based HEAD;
    # offset 0 marks the root and stays "0".
    f[3] = str(int(f[3])+i+1) if int(f[3]) else "0"
    rows.append([chars[i],"_"]+f
      +["_","_" if i in before_space else "SpaceAfter=No"])
  # Walk from the end so that removing row i never shifts an index that is
  # still to be visited. Row 0 has no predecessor and is never merged.
  for i in [i for i in range(len(rows)-1, 0, -1) if rows[i][6] == "goeswith"]:
    gone = rows.pop(i)
    # Append the removed FORM to the previous row, which also takes over
    # the removed row's MISC (its SpaceAfter status).
    rows[i-1][0],rows[i-1][8] = rows[i-1][0]+gone[0],gone[8]
    # The removed row had 1-based ID i+1: a HEAD of i+1 now points at the
    # merged row (ID i) and every later HEAD shifts down by one.
    for r in rows:
      if int(r[5]) > i:
        r[5] = str(int(r[5])-1)
  return "\n".join("\t".join([str(i+1)]+r) for i,r in enumerate(rows))+"\n\n"
# Parse a sample sentence, print the resulting CoNLL-U text and hand it to
# deplacy for display.
doc=nlp("虎穴に入らざれば虎子を得ず。")
print(doc)
import deplacy
# NOTE(review): port=None presumably renders inline in the notebook rather
# than starting a server on a fixed port — confirm in the deplacy docs.
deplacy.serve(doc,port=None)