#### Сравнение парсеров для русского языка

##### Подготавливаем файл для сравнения

In [None]:
!wget https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-Taiga/master/ru_taiga-ud-test.conllu

In [None]:
# Raw text of every sentence in the downloaded test set, in file order.
test_sentences = []

with open("ru_taiga-ud-test.conllu", "r", encoding="utf-8") as conllu_file:
    # Every line that contains "# text" contributes one sentence; the first
    # 9 characters (the length of "# text = ") are cut off before stripping.
    test_sentences.extend(
        raw_line[9:].strip()
        for raw_line in conllu_file
        if '# text' in raw_line
    )

In [None]:
# Sanity check: how many sentences were collected into test_sentences.
len(test_sentences)

881

In [None]:
# Write a reduced copy of the downloaded treebank to "test.conllu".
# Lines containing one of these markers are metadata comments that are dropped;
# every other line (including "# text" comments) is copied.
skipped_markers = ('# sent', '# genre', '# new')

with open("ru_taiga-ud-test.conllu", "r", encoding="utf-8") as data_file:
    with open("test.conllu", "w", encoding="utf-8") as test_file:
        for line in data_file:
            if any(marker in line for marker in skipped_markers):
                continue
            if line == "\n":
                # Keep exactly one blank line between sentences. The previous
                # print('\n', ...) emitted two, which is not valid CoNLL-U.
                test_file.write("\n")
            else:
                test_file.write(line.strip() + "\n")

##### UDPipe

In [None]:
!pip install ufal.udpipe
!pip install conllu
!wget https://github.com/jwijffels/udpipe.models.ud.2.5/raw/master/inst/udpipe-ud-2.5-191206/russian-syntagrus-ud-2.5-191206.udpipe

In [None]:
import warnings
import os
import ufal.udpipe
import conllu
from collections import defaultdict
warnings.filterwarnings('ignore')

In [None]:
class Model:
    """Convenience wrapper around a model loaded with ``ufal.udpipe.Model.load``."""

    def __init__(self, path):
        """Load the UDPipe model stored at ``path``.

        Raises:
            Exception: if the loader returns a falsy value.
        """
        loaded = ufal.udpipe.Model.load(path)
        self.model = loaded
        if not loaded:
            raise Exception("Cannot load UDPipe model from file '%s'" % path)

    def tokenize(self, text):
        """Split raw ``text`` with the model's tokenizer.

        Returns:
            A list of ufal.udpipe.Sentence objects.

        Raises:
            Exception: if the model cannot provide a tokenizer.
        """
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if tokenizer:
            return self._read(text, tokenizer)
        raise Exception("The model does not have a tokenizer")

    def read(self, text, in_format):
        """Read ``text`` that is already in ``in_format`` (conllu|horizontal|vertical).

        Returns:
            A list of ufal.udpipe.Sentence objects.

        Raises:
            Exception: if no reader can be created for ``in_format``.
        """
        reader = ufal.udpipe.InputFormat.newInputFormat(in_format)
        if reader:
            return self._read(text, reader)
        raise Exception("Cannot create input format '%s'" % in_format)

    def _read(self, text, input_format):
        """Feed ``text`` to ``input_format`` and collect every sentence it yields.

        Raises:
            Exception: carrying the reader's error message if it reported one.
        """
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        collected = []

        while True:
            # A fresh Sentence is needed for each call; the reader fills it in place.
            candidate = ufal.udpipe.Sentence()
            if not input_format.nextSentence(candidate, error):
                break
            collected.append(candidate)

        if error.occurred():
            raise Exception(error.message)

        return collected

    def tag(self, sentence):
        """Run the tagger on ``sentence``, modifying it in place."""
        options = self.model.DEFAULT
        self.model.tag(sentence, options)

    def parse(self, sentence):
        """Run the dependency parser on ``sentence``, modifying it in place."""
        options = self.model.DEFAULT
        self.model.parse(sentence, options)

    def write(self, sentences, out_format):
        """Serialise ``sentences`` in ``out_format`` (conllu|horizontal|vertical).

        Returns:
            The concatenated output of all sentences followed by the
            document footer produced by the writer.
        """
        writer = ufal.udpipe.OutputFormat.newOutputFormat(out_format)
        pieces = [writer.writeSentence(sentence) for sentence in sentences]
        pieces.append(writer.finishDocument())
        return ''.join(pieces)

In [None]:
# Load the Russian SynTagRus UDPipe model.
# NOTE(review): the absolute /content/ path presumably matches the directory the
# notebook downloaded the model into — adjust it when running elsewhere.
model = Model('/content/russian-syntagrus-ud-2.5-191206.udpipe')

In [None]:
def get_conllu(model, text):
    """Tokenize, tag and parse ``text`` with ``model`` and return CoNLL-U output.

    Args:
        model: object exposing ``tokenize``, ``tag``, ``parse`` and ``write``.
        text: raw text to analyse.

    Returns:
        Whatever ``model.write`` produces for the "conllu" format.
    """
    parsed = model.tokenize(text)
    for item in parsed:
        # Both steps annotate the sentence object in place.
        model.tag(item)
        model.parse(item)
    return model.write(parsed, "conllu")

In [None]:
# Parse every test sentence with UDPipe and store the result in "udpipe.conllu".
with open("udpipe.conllu", "w", encoding="utf-8") as output_file:
    for sentence in test_sentences:
        # Parse the current sentence. The previous code always passed
        # test_sentences[0], so the file contained the first sentence repeated.
        udpipe_results = get_conllu(model, sentence)

        # Keep only token rows: drop blank lines and every "#" comment line
        # instead of relying on a fixed 32-character header offset.
        token_rows = [
            row
            for row in udpipe_results.splitlines()
            if row and not row.startswith("#")
        ]

        # Same layout as the spaCy and Stanza outputs: exactly one "# text"
        # header per input sentence, so blocks stay aligned with the gold file
        # even when the tokenizer splits the input into several sentences.
        output_file.write(f"# text = {sentence}\n")
        output_file.write("\n".join(token_rows))
        output_file.write("\n\n")

##### spaCy

In [None]:
!pip install spacy
!pip install https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.1.0/ru_core_news_sm-3.1.0.tar.gz
!pip install spacy_conll

In [None]:
import spacy
from spacy_conll import init_parser

# spaCy pipeline for "ru_core_news_sm" created through spacy_conll; the CoNLL
# string of a document is then available as doc._.conll_str.
nlp_spacy = init_parser(
    "ru_core_news_sm",
    "spacy",
    ext_names={"conll_str": "conll_str"},
)

# For every test sentence write a "# text" header, the parser's CoNLL string
# and two trailing newlines to "spacy.conllu".
with open('spacy.conllu', 'w', encoding='UTF-8') as out:
    for sent in test_sentences:
        doc = nlp_spacy(sent)
        out.write(f'# text = {sent}\n')
        out.write(doc._.conll_str + '\n\n')

##### Stanza

In [None]:
!pip install stanza
!pip install spacy-stanza

In [None]:
import stanza
import spacy_stanza
from spacy_conll import init_parser

# Stanza pipeline for Russian ("ru") created through spacy_conll; the CoNLL
# string of a document is then available as doc._.conll_str.
nlp_stanza = init_parser(
    "ru",
    "stanza",
    ext_names={"conll_str": "conll_str"},
)

# For every test sentence write a "# text" header, the parser's CoNLL string
# and two trailing newlines to "stanza.conllu".
with open('stanza.conllu', 'w', encoding='UTF-8') as out:
    for sent in test_sentences:
        doc = nlp_stanza(sent)
        out.write(f'# text = {sent}\n')
        out.write(doc._.conll_str + '\n\n')

##### Собственно сравнение

In [None]:
def read_conllu(file_path):
    """Read a CoNLL-U-like file into a list of blocks of tab-split rows.

    A new block starts at every line that begins with "# text". Lines that
    are exactly a newline are skipped. Any other line belongs to the current
    block; it is stripped and split on tabs, unless it contains "# text", in
    which case it is not turned into a row. Lines that appear before the
    first "# text" line form a block of their own.

    Args:
        file_path: path of the UTF-8 encoded file to read.

    Returns:
        A list of blocks; each block is a list of rows and each row is a
        list of column strings.
    """
    blocks = []
    rows = []
    block_open = False  # True once the current block has received any line

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            if raw.startswith("# text"):
                # Header line: close the running block (if any), start a new one.
                if block_open:
                    blocks.append(rows)
                rows = []
                block_open = True
                continue
            if raw == "\n":
                continue
            block_open = True
            if '# text' not in raw:
                rows.append(raw.strip().split("\t"))

    if block_open:
        blocks.append(rows)
    return blocks

In [None]:
def compare_conllu_files(file_path1, file_path2):
    """Compare two CoNLL-U files column by column and return an accuracy report.

    Both files are loaded with ``read_conllu``. Blocks are paired by position
    and, inside each pair, rows are paired by position as well; anything that
    extends past the shorter side is ignored. For every compared row the
    values in columns 1-7 (form, lemma, upos, xpos, feats, head, deprel) are
    checked for string equality. A row that lacks a column counts as a
    mismatch for that column.

    Args:
        file_path1: path of the reference file.
        file_path2: path of the file to evaluate against the reference.

    Returns:
        A multi-line string with one "<name> accuracy: <ratio>" line per
        column plus a "Total accuracy" line pooled over all seven columns.
        All ratios are 0.0 when no rows could be compared.
    """
    reference_blocks = read_conllu(file_path1)
    candidate_blocks = read_conllu(file_path2)

    # (report label, column index in a row), in report order.
    columns = (
        ("Form", 1),
        ("Lemma", 2),
        ("Upos", 3),
        ("Xpos", 4),
        ("Features", 5),
        ("Head", 6),
        ("Deprel", 7),
    )
    hits = {label: 0 for label, _ in columns}
    compared_rows = 0

    # zip() stops at the shorter sequence at both levels, which replaces the
    # two duplicated "shorter length" branches and avoids indexing past the
    # end when the second file has fewer blocks than the first.
    for ref_block, cand_block in zip(reference_blocks, candidate_blocks):
        for ref_row, cand_row in zip(ref_block, cand_block):
            compared_rows += 1
            for label, index in columns:
                if (
                    index < len(ref_row)
                    and index < len(cand_row)
                    and cand_row[index] == ref_row[index]
                ):
                    hits[label] += 1

    def _ratio(correct, total):
        # Guard against division by zero when nothing was compared.
        return correct / total if total else 0.0

    report = [
        f'{label} accuracy: {_ratio(hits[label], compared_rows)}'
        for label, _ in columns
    ]
    report.append(
        f'Total accuracy: {_ratio(sum(hits.values()), compared_rows * len(columns))}'
    )
    return '\n'.join(report) + '\n'

In [None]:
# Paths of the reference file and of the UDPipe output to score against it.
test_conllu = "test.conllu"
udpipe_conllu = "udpipe.conllu"

# Print the per-column accuracy report for UDPipe.
print(compare_conllu_files(test_conllu, udpipe_conllu))

Form accuracy: 0.007765216398309923
Lemma accuracy: 0.008107799474705949
Upos accuracy: 0.10106200753682767
Xpos accuracy: 1.0
Features accuracy: 0.11270983213429256
Head accuracy: 0.17106314948041568
Deprel accuracy: 0.06908758707319858
Total accuracy: 0.2099707988711072



In [None]:
# Path of the spaCy output; test_conllu is the reference path set in the
# UDPipe evaluation cell.
spacy_conllu = "spacy.conllu"

# Print the per-column accuracy report for spaCy.
print(compare_conllu_files(test_conllu, spacy_conllu))

Form accuracy: 0.9142661179698217
Lemma accuracy: 0.7906133646874388
Upos accuracy: 0.8474426807760141
Xpos accuracy: 0.0
Features accuracy: 0.6344307270233196
Head accuracy: 0.6972369194591417
Deprel accuracy: 0.6917499510092102
Total accuracy: 0.6536771087035638



In [None]:
# Path of the Stanza output; test_conllu is the reference path set in the
# UDPipe evaluation cell.
stanza_conllu = "stanza.conllu"

# Print the per-column accuracy report for Stanza.
print(compare_conllu_files(test_conllu, stanza_conllu))

Form accuracy: 0.9117589512815496
Lemma accuracy: 0.8547251027196243
Upos accuracy: 0.880747407552338
Xpos accuracy: 0.0
Features accuracy: 0.7555272940716102
Head accuracy: 0.7359616513402465
Deprel accuracy: 0.7945607513206809
Total accuracy: 0.7047544511837214

