In [1]:
import os

import xml.etree.ElementTree as ET

# Location of the dictionary XML that is parsed into `root_dict`
# (presumably an OpenCorpora dictionary export, judging by the file name).
PATH_DICT = os.path.join('data', 'dict.opcorpora.xml')
# Location of the annotated-corpus XML that is parsed into `root_annot`
# (presumably the disambiguated OpenCorpora corpus, judging by the file name).
PATH_ANNOT = os.path.join('data', 'annot.opcorpora.no_ambig.xml')

# Closed word lists consulted by get_inf() before any dictionary lookup.
# Words in LIST_CONJ are returned as-is with the tag 'CONJ'.
LIST_CONJ = ['а', 'и', 'или', 'но', 'что', 'чтобы']
# NOTE: LIST_PR is only referenced from commented-out code in get_inf(),
# so it currently has no effect on the output.
LIST_PR = ['возле', 'для', 'до', 'за', 'между', 'на', 'по', 'при', 'среди', 'у', 'через']
# Words in LIST_ADV are returned as-is with the tag 'ADV'.
LIST_ADV = ['бы', 'видимо', 'вот', 'же', 'затем', 'итак', 'ли', 'наверно', 'не', 'ни', 'потом', 'тогда', 'только', 'уже']

# Characters that are replaced by a space before an input line is split into words.
LIST_DELIMS = ['.', ',', '!', '?', '\n']

# Text file that is read line by line, and the file the tagged result is written to.
PATH_INPUT = os.path.join('data', 'input.txt')
PATH_OUTPUT = os.path.join('data', 'output.txt')

In [2]:
def get_tag(tag):
    """Collapse a fine-grained grammeme tag into a coarse part-of-speech label.

    Args:
        tag: the tag to translate (e.g. 'INFN', 'ADJF', 'NOUN').

    Returns:
        One of 'V', 'ADV', 'A', 'S', 'PR', 'CONJ' when `tag` belongs to the
        corresponding group below; otherwise `tag` itself, unchanged.
    """
    groups = (
        ('V', ('INFN', 'GRND', 'PRTF', 'PRTS', 'VERB')),
        ('ADV', ('ADVB', 'INTJ', 'PRCL', 'PRED')),
        ('A', ('ADJF', 'ADJS', 'COMP')),
        ('S', ('NOUN',)),
        ('PR', ('PREP',)),
        ('CONJ', ('CONJ',)),
    )

    # First group containing the tag wins; unknown tags pass through untouched.
    return next((label for label, members in groups if tag in members), tag)

In [3]:
# Parse the XML file at PATH_DICT and keep its root element; the following
# cells query it for 'lemmata/lemma' and 'links/link' elements.
root_dict = ET.parse(PATH_DICT).getroot()

In [4]:
# All <lemma> elements found under <lemmata> in the dictionary.
lemmas = root_dict.findall('lemmata/lemma')

# 'id' attribute of every lemma, in document order.
lammas_ids = [lemma.get('id') for lemma in lemmas]
# Every lemma id starts out mapped to itself.
lemmas_graph = {lemma_id: lemma_id for lemma_id in lammas_ids}
# Lookup from a lemma id to its XML element.
id_to_lemmas = dict(zip(lammas_ids, lemmas))

In [5]:
# Merge linked lemmas: every lemma id ends up mapped to the id at the head of
# its chain of links.
links = root_dict.findall('links/link')

# (child, parent) pairs taken from each link's 'to'/'from' attributes.
# Links whose type is '26' are skipped (the meaning of that type is not
# stated in this notebook -- TODO confirm against the dictionary's link types).
edges = [(link.get('to'), link.get('from')) for link in links if link.get('type') != '26']
for child, parent in edges:
    lemmas_graph[child] = lemmas_graph[parent]

# The loop above copies whatever representative the parent has *at that
# moment*. If a link A -> B is listed before root -> A, B keeps pointing at A
# instead of the root, so the result depends on the order of links in the
# file. Follow each chain to its end so every id maps to its final root.
for lemma_id in list(lemmas_graph):
    visited = {lemma_id}  # guards against cyclic links
    root_id = lemmas_graph[lemma_id]
    while lemmas_graph[root_id] != root_id and root_id not in visited:
        visited.add(root_id)
        root_id = lemmas_graph[root_id]
    lemmas_graph[lemma_id] = root_id

In [6]:
# Build the lookup: word form -> set of (initial form, coarse tag) candidates.
word_to_inf = {}

for lemma in lemmas:
    # Use the representative lemma chosen via lemmas_graph, so that forms of a
    # linked lemma are reported under the representative's initial form.
    # (Named root_id rather than `id` to avoid shadowing the builtin id()
    # for every later cell.)
    root_id = lemmas_graph[lemma.get('id')]
    # First child of the representative lemma carries the initial form in its
    # 't' attribute; that child's first child carries the tag in 'v'.
    # NOTE(review): assumes the dictionary layout <lemma><l><g/>..</l><f/>..</lemma> -- confirm.
    root_entry = id_to_lemmas[root_id][0]
    inf = root_entry.get('t')
    tag = get_tag(root_entry[0].get('v'))
    # Every child after the first is treated as a word form of this lemma.
    for form in lemma[1:]:
        word_to_inf.setdefault(form.get('t'), set()).add((inf, tag))

In [7]:
# Parse the XML file at PATH_ANNOT and keep its root element; the next cell
# walks its text/paragraphs/paragraph/sentence/tokens/token elements.
root_annot = ET.parse(PATH_ANNOT).getroot()

In [8]:
# Count, for every lower-cased token of the annotated corpus, how often each
# (initial form, coarse tag) reading occurs.
tokens = root_annot.findall('./text/paragraphs/paragraph/sentence/tokens/token')
inf_distr = {}
for token in tokens:
    form_node = token[0]
    word = form_node.get('t').lower()
    # token -> first child -> first child -> first child holds the lemma text
    # in 't'; its own first child holds the tag in 'v'.
    lemma_node = form_node[0][0]
    reading = (lemma_node.get('t'), get_tag(lemma_node[0].get('v')))
    counts = inf_distr.setdefault(word, {})
    counts[reading] = counts.get(reading, 0) + 1

In [9]:
def get_inf(word):
    """Pick an (initial form, tag) pair for a lower-cased word.

    Resolution order:
      1. Words listed in LIST_CONJ / LIST_ADV are returned unchanged with the
         tag 'CONJ' / 'ADV'.
      2. Words missing from `word_to_inf` are returned unchanged, tagged 'V'
         when they end in 'ть' or 'ся' and 'ADV' otherwise.
      3. Otherwise the dictionary candidate with the highest count in
         `inf_distr` is returned; candidates never seen for this word count
         as 1, and ties are broken by the larger (initial form, tag) tuple.

    Args:
        word: the word to analyse.

    Returns:
        A 2-tuple (initial form, tag).
    """
    if word in LIST_CONJ:
        return word, 'CONJ'
    # NOTE: there is no shortcut for prepositions (LIST_PR); they go through
    # the dictionary lookup below.
    if word in LIST_ADV:
        return word, 'ADV'

    candidates = word_to_inf.get(word)
    if candidates is None:
        # Out-of-dictionary word: guess from the ending.
        looks_verbal = word.endswith(('ть', 'ся'))
        return word, ("V" if looks_verbal else "ADV")

    observed = inf_distr.get(word, {})
    ranked = sorted((observed.get(candidate, 1), candidate) for candidate in candidates)
    return ranked[-1][1]

In [10]:
# Tag every line of PATH_INPUT and write the result to PATH_OUTPUT.
# Each word is emitted as  word{initial_form=TAG} ; words are separated by a
# single space and input lines are preserved as output lines.
tagged_lines = []

# The encoding is given explicitly: the text is Cyrillic and the platform
# default encoding is not UTF-8 everywhere (e.g. Windows locales).
with open(PATH_INPUT, encoding='utf-8') as f_in:
    for s in f_in:
        # Turn every delimiter (including the trailing newline) into a space.
        for d in LIST_DELIMS:
            s = s.replace(d, ' ')
        # Drop the empty strings produced by consecutive spaces, then lower-case.
        ws = [w.lower() for w in s.split(' ') if w]
        infs = [get_inf(w) for w in ws]
        tagged_lines.append(
            ' '.join(w + '{' + inf + '=' + tag + '}' for w, (inf, tag) in zip(ws, infs))
        )

res = '\n'.join(tagged_lines)

with open(PATH_OUTPUT, 'w', encoding='utf-8') as f_out:
    f_out.write(res)