# [世界のUniversal Dependenciesと係り受け解析ツール群](http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/publications/2021-06-22.pdf)
## [日本語UDを用いた係り受け解析器の自作](https://koichiyasuoka.github.io/deplacy/demo/2021-06-22/)
### SuParとbert-large-japaneseとfugashiを用いる場合


必要なパッケージと各conlluを準備

In [ ]:
# Fetch the UD_Japanese-GSD treebank with a shallow clone, unless the directory is already present.
!test -d UD_Japanese-GSD || git clone --depth=1 https://github.com/universaldependencies/UD_Japanese-GSD
# Expose the train/dev/test splits under short names via symlinks; each link is
# created only when the short name does not already resolve to a file.
!test -f train.conllu || ln -s UD_Japanese-GSD/ja_gsd-ud-train.conllu train.conllu
!test -f dev.conllu || ln -s UD_Japanese-GSD/ja_gsd-ud-dev.conllu dev.conllu
!test -f test.conllu || ln -s UD_Japanese-GSD/ja_gsd-ud-test.conllu test.conllu
# Install the parser (supar), tokenizer (fugashi), dictionary (unidic-lite) and visualizer (deplacy).
!pip install supar fugashi unidic-lite deplacy

my.suparを作成 (GPUで3時間程度)

In [ ]:
# Train a biaffine dependency parser on train/dev/test.conllu and save it as my.supar.
# The encoder is cl-tohoku/bert-large-japanese (-f bert --bert ...); "--embed=" passes an
# empty value for the embedding option.
# NOTE(review): -b, -d 0 and -c biaffine-dep-en are presumably "build from scratch",
# "GPU device 0" and "base config name" — confirm against the SuPar CLI help.
!biaffine-dep train -b -d 0 -c biaffine-dep-en -p my.supar -f bert --bert cl-tohoku/bert-large-japanese --embed= --train train.conllu --dev dev.conllu --test test.conllu

my.suparで係り受け解析

In [ ]:
import supar

# Load the parser saved as my.supar.
prs = supar.Parser.load("my.supar")

def nlp(x):
  """Parse x (one sentence given as a list of words) and return the first parsed sentence."""
  batch = [x]  # predict() receives a list of sentences, so wrap the single one
  parsed = prs.predict(batch, lang=None)
  return parsed.sentences[0]

doc = nlp([
  "虎穴", "に", "入ら", "ざれ", "ば",
  "虎子", "を", "得", "ず", "。",
])
print(doc)

# Render the dependency tree with deplacy.
import deplacy
deplacy.serve(doc, port=None)

AutoTokenizerで単語切り

In [ ]:
import supar
from transformers import AutoTokenizer

prs = supar.Parser.load("my.supar")
brt = "cl-tohoku/bert-large-japanese"
# Sub-word splitting is switched off: the tokenizer is used here only to cut
# raw text into words before parsing.
tkz = AutoTokenizer.from_pretrained(brt, do_subword_tokenize=False)

def nlp(x):
  """Split the raw text x into words with the tokenizer, then parse them as one sentence."""
  words = tkz.tokenize(x)
  parsed = prs.predict([words], lang=None)
  return parsed.sentences[0]

doc = nlp("虎穴に入らざれば虎子を得ず。")
print(doc)

# Render the dependency tree with deplacy.
import deplacy
deplacy.serve(doc, port=None)

fugashiとunidic-liteに置き換え

In [ ]:
import supar
import fugashi
import unidic_lite

prs = supar.Parser.load("my.supar")
# Point the fugashi tagger at the dictionary directory shipped with unidic-lite.
tag = fugashi.Tagger("-d " + unidic_lite.DICDIR)

def nlp(sentence):
  """Tokenize raw text with fugashi and parse the surface forms as a single sentence."""
  surfaces = [morph.surface for morph in tag(sentence)]
  result = prs.predict([surfaces], lang=None)
  return result.sentences[0]

doc = nlp("虎穴に入らざれば虎子を得ず。")
print(doc)

# Render the dependency tree with deplacy.
import deplacy
deplacy.serve(doc, port=None)

LEMMA・XPOS・MISCを追加

In [ ]:
import supar, fugashi, unidic_lite
prs = supar.Parser.load("my.supar")
# Point the fugashi tagger at the dictionary directory shipped with unidic-lite.
tag = fugashi.Tagger("-d " + unidic_lite.DICDIR)
def nlp(sentence):
  """Tokenize raw text, parse it, and fill in the LEMMA, XPOS and MISC columns.

  Args:
    sentence: raw (untokenized) text.

  Returns:
    The first parsed sentence, with columns 2 (LEMMA), 4 (XPOS) and
    9 (MISC) overwritten from the fugashi analysis.
  """
  s = tag(sentence)
  d = prs.predict([[t.surface for t in s]], lang=None).sentences[0]
  d.values[2] = [t.feature.lemma for t in s]
  # XPOS: drop the ",*" placeholders from the POS string and join the rest with hyphens.
  d.values[4] = [t.pos.replace(",*", "").replace(",", "-") for t in s]
  # fugashi's white_space is the whitespace *preceding* a token, so whether a
  # space follows token i has to be read from token i+1. The last token has no
  # follower and is marked SpaceAfter=No.
  m = ["_" if n.white_space else "SpaceAfter=No" for n in s[1:]]
  if s:
    m.append("SpaceAfter=No")
  d.values[9] = m
  return d
doc = nlp("虎穴に入らざれば虎子を得ず。")
print(doc)
# Render the dependency tree with deplacy.
import deplacy
deplacy.serve(doc,port=None)

UPOSを追加

In [ ]:
import supar, fugashi, unidic_lite
prs = supar.Parser.load("my.supar")
# Point the fugashi tagger at the dictionary directory shipped with unidic-lite.
tag = fugashi.Tagger("-d " + unidic_lite.DICDIR)
def nlp(sentence):
  """Tokenize raw text, parse it, and fill in LEMMA, UPOS, XPOS and MISC.

  UPOS is chosen per token by the first rule that matches: the predicted
  dependency relation, then the UniDic pos2, then the UniDic pos1.

  Args:
    sentence: raw (untokenized) text.

  Returns:
    The first parsed sentence, with columns 2 (LEMMA), 3 (UPOS),
    4 (XPOS) and 9 (MISC) overwritten.

  Raises:
    KeyError: if a token's pos1 is missing from the pos1 table and no
      earlier rule matched.
  """
  s = tag(sentence)
  d = prs.predict([[t.surface for t in s]], lang=None).sentences[0]
  d.values[2] = [t.feature.lemma for t in s]
  # Fallback table: UniDic pos1 -> UPOS.
  by_pos1 = {"名詞":"NOUN", "代名詞":"PRON", "動詞":"VERB", "助動詞":"AUX",
    "形容詞":"ADJ", "形状詞":"ADJ", "連体詞":"DET", "副詞":"ADV",
    "助詞":"ADP", "接続詞":"CCONJ", "接頭辞":"NOUN", "接尾辞":"PART",
    "感動詞":"INTJ", "補助記号":"PUNCT", "記号":"SYM", "空白":"SYM"}
  # UniDic pos2 -> UPOS; takes precedence over the pos1 table.
  by_pos2 = {"助動詞語幹":"AUX", "固有名詞":"PROPN", "数詞":"NUM",
    "終助詞":"PART", "接続助詞":"SCONJ", "名詞的":"NOUN"}
  # Predicted dependency relation -> UPOS; takes precedence over both tables.
  by_deprel = {"aux":"AUX", "cop":"AUX", "advmod":"ADV", "amod":"ADJ"}
  upos = []
  for i, t in enumerate(s):
    f = t.feature
    g = d.values[7][i]
    if g in by_deprel:
      upos.append(by_deprel[g])
    elif f.pos2 in by_pos2:
      upos.append(by_pos2[f.pos2])
    else:
      upos.append(by_pos1[f.pos1])
    # An "aux" whose head is the immediately preceding token (ID - HEAD == 1)
    # may retag that head. The i > 0 guard keeps a first-token "aux" with
    # head 0 from wrapping around to index -1.
    if i > 0 and g == "aux" and int(d.values[0][i]) - int(d.values[6][i]) == 1:
      h = s[i-1].feature.pos3
      if "形状詞可能" in h and f.lemma in ["だ", "なり"]:
        upos[i-1] = "ADJ"
      elif h.startswith("サ変"):
        upos[i-1] = "VERB"
  d.values[3] = upos
  # XPOS: drop the ",*" placeholders from the POS string and join the rest with hyphens.
  d.values[4] = [t.pos.replace(",*", "").replace(",", "-") for t in s]
  # fugashi's white_space is the whitespace *preceding* a token, so whether a
  # space follows token i has to be read from token i+1. The last token has no
  # follower and is marked SpaceAfter=No.
  m = ["_" if n.white_space else "SpaceAfter=No" for n in s[1:]]
  if s:
    m.append("SpaceAfter=No")
  d.values[9] = m
  return d
doc = nlp("虎穴に入らざれば虎子を得ず。")
print(doc)
# Render the dependency tree with deplacy.
import deplacy
deplacy.serve(doc,port=None)

[おまけ] my.suparを作成せず[SuPar-UniDic](https://github.com/KoichiYasuoka/SuPar-UniDic)で代用

In [ ]:
# Install the prebuilt SuPar-UniDic package (plus fugashi) instead of training my.supar.
!pip install suparunidic fugashi
import suparunidic
# NOTE(review): load() presumably places the bert-large-japanese model files under the
# package's suparmodels directory, which the symlink below relies on — confirm.
nlp = suparunidic.load(BERT="bert-large-japanese")
# Link the packaged *.supar model to my.supar so that code loading "my.supar" finds it.
!ln -s /usr/local/lib/python*/*/suparunidic/suparmodels/bert-large-japanese/*.supar my.supar