In [1]:
import xml.etree.ElementTree as ET

In [2]:
tree = ET.parse('dict.xml')
root = tree.getroot()

In [3]:
CONSTANT_PREFIXES = [
    "авиа",
    "авто",
    "аква",
    "анти",
    "анти-",
    "антропо",
    "архи",
    "арт",
    "арт-",
    "астро",
    "аудио",
    "аэро",
    "без",
    "бес",
    "био",
    "вело",
    "взаимо",
    "вне",
    "внутри",
    "видео",
    "вице-",
    "вперед",
    "впереди",
    "гекто",
    "гелио",
    "гео",
    "гетеро",
    "гига",
    "гигро",
    "гипер",
    "гипо",
    "гомо",
    "дву",
    "двух",
    "де",
    "дез",
    "дека",
    "деци",
    "дис",
    "до",
    "евро",
    "за",
    "зоо",
    "интер",
    "инфра",
    "квази",
    "квази-",
    "кило",
    "кино",
    "контр",
    "контр-",
    "космо",
    "космо-",
    "крипто",
    "лейб-",
    "лже",
    "лже-",
    "макро",
    "макси",
    "макси-",
    "мало",
    "меж",
    "медиа",
    "медиа-",
    "мега",
    "мета",
    "мета-",
    "метео",
    "метро",
    "микро",
    "милли",
    "мини",
    "мини-",
    "моно",
    "мото",
    "много",
    "мульти",
    "нано",
    "нарко",
    "не",
    "небез",
    "недо",
    "нейро",
    "нео",
    "низко",
    "обер-",
    "обще",
    "одно",
    "около",
    "орто",
    "палео",
    "пан",
    "пара",
    "пента",
    "пере",
    "пиро",
    "поли",
    "полу",
    "после",
    "пост",
    "пост-",
    "порно",
    "пра",
    "пра-",
    "пред",
    "пресс-",
    "противо",
    "противо-",
    "прото",
    "псевдо",
    "псевдо-",
    "радио",
    "разно",
    "ре",
    "ретро",
    "ретро-",
    "само",
    "санти",
    "сверх",
    "сверх-",
    "спец",
    "суб",
    "супер",
    "супер-",
    "супра",
    "теле",
    "тетра",
    "топ-",
    "транс",
    "транс-",
    "ультра",
    "унтер-",
    "штаб-",
    "экзо",
    "эко",
    "эндо",
    "эконом-",
    "экс",
    "экс-",
    "экстра",
    "экстра-",
    "электро",
    "энерго",
    "этно",
]

In [4]:
def max_common(word, forms):
    n = len(word)
    i = 0
    while (i < n):
        is_ok = True
        for form in forms:
            if len(form) > i and form[i] == word[i]:
                continue
            else:
                is_ok = False
                break
        if not is_ok:
            break
        i += 1    
        
    return word[:i]    

In [5]:
speech_part_dict = {
    "NOUN" : "S",
    "ADJF" : "A",
    "ADJS" : "A",
    "COMP" : "A",
    "VERB" : "V",
    "INFN" : "V",
    "PRTF" : "V",
    "PRTS" : "V",
    "GRND" : "V",
    "NUMR" : "NI",
    "ADVB" : "ADV",
    "NPRO" : "NI",
    "PRED" : "S",
    "PREP" : "PR",
    "CONJ" : "CONJ",
    "PRCL" : "ADV",
    "INTJ" : "ADV",
    
}

In [6]:
class Lemma:
    def __init__(self, id_, word, word_forms, speech_part):
        self.moved = -1
        self.word = word.lower().replace('ё', 'е') 
        self.speech_part = speech_part
        self.id = id_
        
        self.end_list = [''] * len(word_forms)
        self.pref_list = [''] * len(word_forms)
        
        word_forms_ = []
        for i in range(len(word_forms)):
            form = word_forms[i]
            if form.startswith('по') and not word.startswith('по'):
                form = word_forms[i][2:]
                self.pref_list[i] = 'по'
            if form.startswith('наи') and not word.startswith('наи'):
                form = word_forms[i][3:]
                self.pref_list[i] = 'наи'    
            word_forms_.append(form.lower().replace('ё', 'е'))
            
        self.stem = max_common(self.word, word_forms_)
        if (self.stem == ''):
            self.end_list = word_forms_
            return
        
        for i in range(len(word_forms_)):
            form = word_forms_[i]
            end = form.replace(self.stem, '', 1)
            self.end_list[i] = end
            #print(self.pref_list[i] + " _ " + self.stem + " _ " + end)

In [7]:
from tqdm import tqdm

In [8]:
lemma_id_to_lemma_dict = {}

In [9]:
for lemma in tqdm(root.find('lemmata')):
    word = lemma.find('l').get('t')
    #if ('-' in word):
     #   continue
    word_forms = []
    for form in lemma.findall('f'):
        word_forms.append(form.get('t'))
    speech_part = speech_part_dict[lemma.find('l').find('g').get('v')]
    id_ = lemma.get('id')
    parsed_lemma = Lemma(id_, word, word_forms, speech_part) 
    lemma_id_to_lemma_dict[id_] = parsed_lemma
    #if (parsed_lemma.stem in stem_to_lemma_dict.keys()):
    #    stem_to_lemma_dict[parsed_lemma.stem].append(parsed_lemma)
    #else:
    #    stem_to_lemma_dict[parsed_lemma.stem] = [parsed_lemma]

100%|██████████| 391243/391243 [01:10<00:00, 5533.23it/s]


In [10]:
len(lemma_id_to_lemma_dict.keys())

391243

In [11]:
def generate_word_forms_from_lemma(lemma):
    word_forms = []
    for i in range(len(lemma.end_list)):
        form = lemma.pref_list[i] + lemma.stem + lemma.end_list[i] 
        word_forms.append(form)
    return word_forms 

In [12]:
def append_lemma(from_lemma, to_lemma):
    from_forms = generate_word_forms_from_lemma(from_lemma)
    to_forms = generate_word_forms_from_lemma(to_lemma)
    return Lemma(to_lemma.id, to_lemma.word, from_forms + to_forms, to_lemma.speech_part)

In [13]:
def move_lemma(from_lemma_id, to_lemma_id):
    while True:
        if lemma_id_to_lemma_dict[to_lemma_id].moved != -1:
            to_lemma_id = lemma_id_to_lemma_dict[to_lemma_id].moved
        else:
            break
    from_lemma = lemma_id_to_lemma_dict[from_lemma_id]
    from_lemma.moved = to_lemma_id
    to_lemma = lemma_id_to_lemma_dict[to_lemma_id]
    to_lemma = append_lemma(from_lemma, to_lemma)
    return from_lemma, to_lemma

In [14]:
EXCLUDED_LINK_TYPES = ['7', '11', '21', '23', '26', '27']
for link in tqdm(root.find('links')):
    from_ = link.get('to')
    to_ = link.get('from')
    type_ = link.get('type')
    #print(from_ == "6")
    #break
    if type_ not in EXCLUDED_LINK_TYPES:
        lemma_id_to_lemma_dict[from_], lemma_id_to_lemma_dict[to_] = move_lemma(from_, to_)

100%|██████████| 258488/258488 [01:17<00:00, 3349.35it/s]


In [15]:
stem_to_lemma_dict = {}
for lemma_id in tqdm(lemma_id_to_lemma_dict.keys()):
    lemma = lemma_id_to_lemma_dict[lemma_id]
    if lemma.word == "побывал" :
        print("FFFFFF" + str(lemma.moved))
    if not lemma.moved == -1:
        continue
    if lemma.stem in stem_to_lemma_dict.keys():
        stem_to_lemma_dict[lemma.stem].append(lemma)
    else:
        stem_to_lemma_dict[lemma.stem] = [lemma]

 64%|██████▎   | 248707/391243 [00:28<00:02, 47995.49it/s]

FFFFFF241938


100%|██████████| 391243/391243 [00:30<00:00, 12772.73it/s]


In [16]:
tree = ET.parse('ocorp_nonmod.xml')
root = tree.getroot()

In [17]:
for child in root.findall('text')[1].find('paragraphs').findall('paragraph')[0].findall('sentence')[0].find('tokens'):
    print(child.tag)

token
token
token
token
token
token
token


In [18]:
import re
regex = re.compile('[^а-яА-Я]+')

#Out: 'abdE'

In [19]:
class WordSpeechPart:
    def __init__(self, word, speech_part):
        self.word = word
        self.speech_part = speech_part
    def __eq__(self, obj2):
        return self.word == obj2.word and self.speech_part == obj2.speech_part
    def __hash__(self):
        return hash((self.word, self.speech_part))

In [20]:
word_to_speech_part = {}

In [21]:
for text in root.findall('text'):
    for paragraph in text.find('paragraphs').findall('paragraph'):
        for sentence in paragraph.findall('sentence'):
            for token in sentence.find('tokens').findall('token'):
                text = regex.sub('', token.get('text')).lower().replace('ё', 'e')
                if text:
                    sp = token.find('tfr').find('v').find('l').find('g').get('v')
                    if not sp in speech_part_dict.keys():
                        continue
                    sp = speech_part_dict[sp]
                    wsp = WordSpeechPart(text, sp)
                    if text in word_to_speech_part.keys():
                        sp_dict = word_to_speech_part[text]
                        if wsp in sp_dict.keys():
                            word_to_speech_part[text][wsp] = word_to_speech_part[text][wsp] + 1
                        else:
                            word_to_speech_part[text][wsp] = 1
                    else:
                        word_to_speech_part[text] = {wsp : 1}
                
                

In [22]:
word_to_popular_speech_part = {}

In [23]:
for key in word_to_speech_part.keys():
    max_wsp = list(word_to_speech_part[key].keys())[0]
    max_cnt = word_to_speech_part[key][max_wsp]
    #ans = ""
    for wsp in word_to_speech_part[key].keys():
            #ans += " " + wsp.default + " " + wsp.speech_part + " " + str(word_to_speech_part[key][wsp]) + "   "
        if word_to_speech_part[key][wsp] > max_cnt:
            max_cnt = word_to_speech_part[key][wsp]
            max_wsp = wsp
        #print(ans + "MAXIMUM:" + max_wsp.default + " " + max_wsp.speech_part + " " + str(max_cnt))    
    word_to_popular_speech_part[key] = max_wsp

In [24]:
len(word_to_popular_speech_part.keys())

45114

In [25]:
example = "Все Гришины одноклассники уже побывали за границей, он был чуть ли не единственным, кого не вывозили никуда дальше Красной Пахры."

In [26]:
def remove_prefix(word):
    word = word.lower().replace('ё', 'e')
    longest_prefix = ""
    for prefix in CONSTANT_PREFIXES:
        if word.startswith(prefix) and len(prefix) > len(longest_prefix):
            longest_prefix = prefix
    if longest_prefix:
        prf, wrd = remove_prefix(word.replace(longest_prefix, '', 1))
        return longest_prefix + prf, wrd
    return '', word    

In [27]:
def process_word(prefix, stem, end, speech_part):
    if stem in stem_to_lemma_dict.keys():
        lemmas = stem_to_lemma_dict[stem]
        #if stem == 'политическ':
        #    print(lemmas[0].end_list)
    
        for lemma in lemmas:
            for i in range(len(lemma.end_list)):
                #if stem == 'политическ' and i == 7:
                    #print(lemma.pref_list[i] + " " + prefix)
                    #print(lemma.pref_list[i] == prefix)
                if prefix == lemma.pref_list[i] and end == lemma.end_list[i]:
                    #if stem == 'политическ':
                    #    print("!!!")
                    if speech_part == "" or speech_part == lemma.speech_part:
                        return lemma
    return None

In [28]:
def process_word_extracted_prefix(prefix, word, speech_part):
    i = 1
    while (i <= len(word)):
        #print(prefix + " " + word[:i] + " " + word[i:])
        lmm = process_word(prefix, word[:i], word[i:], speech_part)
        #if (word[:i] == 'политическ'):
        #    print(lmm.word)
        if lmm:
            return lmm
        i += 1
    return process_word(prefix, '', word, speech_part)         

In [29]:
exp = "был"

In [30]:
INPUT_FILE_NAME = "dataset.txt"
OUTPUT_FILE_NAME = "output.txt"

In [31]:
CONJ = ["и", "а", "но", "да", "если", "что", "когда"]
PR = ["за", "для", "в", "о", "к", "из", "от", "по", "под", "с", "об", "обо", "до", "над", "на", "ко", "к", "без", "из", "у"]

In [32]:
results_all = ""
with open(INPUT_FILE_NAME, 'r') as fin:
    for line in tqdm(fin):
        results = ""
        for word in line.split(' '):
            pure_word = word.replace(',', '').replace('.', '').replace('?', '').replace('!', '').replace('\n','')
            if pure_word in PR:
                results = results + pure_word + "{" + pure_word.lower().replace('ё', 'e') + "=PR} "
                continue
            if pure_word in CONJ:
                results = results + pure_word + "{" + pure_word.lower().replace('ё', 'e') + "=CONJ} "
                continue 
            speech_part = ""    
            if pure_word in word_to_popular_speech_part:
                speech_part = word_to_popular_speech_part[pure_word].speech_part        
            if pure_word:
                constant_prefix, pure_word_no_pref = remove_prefix(pure_word)
        #print(constant_prefix + " " + pure_word_no_pref)
                if pure_word_no_pref.startswith('по'):
                    lmm = process_word_extracted_prefix('по', pure_word_no_pref[2:], speech_part) 
                    if lmm:
                        results = results + pure_word + "{" + constant_prefix + lmm.word + "=" + lmm.speech_part + "} "
                        continue
                if pure_word_no_pref.startswith('наи'):
                    lmm = process_word_extracted_prefix('наи', pure_word_no_pref[3:], speech_part) 
                    if lmm:
                        results = results + pure_word + "{" + constant_prefix + lmm.word + "=" + lmm.speech_part + "} "
                        continue   
        #print("12345")
        #print(pure_word_no_pref)
                lmm = process_word_extracted_prefix('', pure_word_no_pref, speech_part) 
        #if pure_word_no_pref == 'политическая':
            #print("AAAA" + lmm.word)
        #print(stem_to_lemma_dict[lmm.stem][0].id) 
        #print(stem_to_lemma_dict[lmm.stem][1].id)    
                if lmm:
                    results = results + pure_word + "{" + constant_prefix + lmm.word + "=" + lmm.speech_part + "} "
                    continue
                pure_word_no_pref = pure_word.lower().replace('ё', 'e')
                constant_prefix = ""
        #print("753245")
        #print(pure_word_no_pref)
    
                if pure_word_no_pref.startswith('по'):
                    lmm = process_word_extracted_prefix('по', pure_word_no_pref[2:], speech_part) 
                    if lmm:
                        results = results + pure_word + "{" + constant_prefix + lmm.word + "=" + lmm.speech_part + "} "
                        continue
                if pure_word_no_pref.startswith('наи'):
                    lmm = process_word_extracted_prefix('наи', pure_word_no_pref[3:], speech_part) 
                    if lmm:
                        results = results + pure_word + "{" + constant_prefix + lmm.word + "=" + lmm.speech_part + "} "
                        continue   
        #print("7777")        
                lmm = process_word_extracted_prefix('', pure_word_no_pref, speech_part) 
        #print("$$$$" + lmm.word)
                if lmm:
                    results = results + pure_word + "{" + constant_prefix + lmm.word + "=" + lmm.speech_part + "} "
                    continue    
                results = results + pure_word + "{" + constant_prefix + pure_word_no_pref + "=ADV} "
        results_all = results_all + results[:-1] + "\n"
results_all = results_all[:-1]
            
                

200it [00:04, 44.31it/s]


In [33]:
with open(OUTPUT_FILE_NAME, 'w') as fout:
    fout.write(results_all)

In [None]:
print('Все{весь=NI} Гришины{гришин=A} \n одноклассники{одноклассник=S} уже{уже=ADV} побывали{побывать=V} за{за=PR} границей{граница=S} он{он=NI} был{быть=V} чуть{чуть=ADV} ли{ли=ADV} не{не=ADV} единственным{единственный=A} кого{кто=NI} не{не=ADV} вывозили{вывозить=V} никуда{никуда=NI} дальше{далеко=ADV} Красной{красный=A} Пахры{Пахра=S}')

In [None]:
for lmm in stem_to_lemma_dict[""]:
    print(lmm.word + " " + str(lmm.end_list))