## Морфологический анализ
Хранение слов в обратном порядке букв, генерация лемм незнакомых слов по похожим существующим словам

In [4]:
import os
from bs4 import BeautifulSoup
from difflib import SequenceMatcher as sm
import json
import shelve

### Частота лемм для устранения противоречий
Источник размеченных текстов: http://opencorpora.org/?page=downloads

In [5]:
class CountWords:
    '''Lemma frequencies collected from an annotated corpus.

    Counts live in ``self.words_count`` as
    ``{lemma: {part_of_speech: count}}`` and are cached on disk with
    ``shelve`` so that the corpus only has to be parsed once.
    '''

    # Corpus grammeme tag -> coarse part-of-speech label used as the counting key.
    map = {'adjf':'A', 'adjs':'A', 'advb':'ADV', 'comp':'ADV', 'conj':'CONJ', 'grnd':'V', 'infn':'V',
           'intj':'ADV', 'latn':'NI', 'noun':'S', 'npro':'NI', 'pnct':'NI', 'prcl':'ADV', 'prtf':'V',
           'prts':'V', 'romn':'NI', 'symb':'NI','verb':'V', 'prep':'PR', 'pred':'ADV', 'numr':'A'}

    # Base name of the shelve cache (the dbm backend appends its own suffixes).
    file_name = '../data/words_count.shelve'

    def __init__(self, file_count):
        '''Load the cached counts, or build them from *file_count* and cache them.

        Args:
            file_count: path to the annotated corpus file, read line by line.
        '''
        # NOTE(review): only the '.dat' suffix is probed; presumably a dbm
        # backend that names its files differently would make this rebuild
        # the cache on every run -- confirm on the target platform.
        if os.path.exists(self.file_name + '.dat'):
            self._load_in_dump()
            return
        self.words_count = {}
        self._parse_file(file_count)
        self._save_to_dump()

    def get_count(self, lemma, g):
        '''Return how often *lemma* occurred with part of speech *g* (0 if never).'''
        return self.words_count.get(lemma, {}).get(g, 0)

    def _parse_file(self, file_count):
        '''Feed every parsable line of *file_count* to ``_set_value``.

        Raises:
            FileNotFoundError: if *file_count* does not exist.
        '''
        # ``open`` itself reports a missing file; the context manager makes
        # sure the handle is released even if a line fails to parse.
        with open(file_count, 'r', encoding='utf-8') as file:
            for line in file:
                val = self._get_val(line)
                if not val:
                    continue
                token, lemma, g = val
                self._set_value(token, lemma, g)

    def _set_value(self, token, lemma, g):
        '''Increment the counter of *lemma* for the label mapped from tag *g*.

        Tags that are absent from ``map`` are ignored; *token* is unused here
        but is part of the signature that subclasses override.
        '''
        if g not in self.map:
            return
        label = self.map[g]
        per_lemma = self.words_count.setdefault(lemma, {})
        per_lemma[label] = per_lemma.get(label, 0) + 1

    def _get_val(self, line):
        '''Extract ``(token, lemma, grammeme)`` in lower case from one corpus line.

        Returns:
            The 3-tuple, or ``False`` when the line holds no ``<token>``
            element or the element lacks the expected children/attributes.
        '''
        token_tag = BeautifulSoup(line, 'lxml').find('token')
        if token_tag is None:
            return False
        lemma_tag = token_tag.find('l')
        gram_tag = token_tag.find('g')
        if lemma_tag is None or gram_tag is None:
            return False
        token = token_tag.get('text')
        lemma = lemma_tag.get('t')
        g = gram_tag.get('v')
        # Check for missing attributes *before* calling .lower() on them.
        if token is None or lemma is None or g is None:
            return False
        return token.lower(), lemma.lower(), g.lower()

    def _save_to_dump(self):
        '''Replace the on-disk cache with the current ``words_count``.'''
        for suffix in ('.bak', '.dat', '.dir'):
            stale = self.file_name + suffix
            if os.path.exists(stale):
                os.remove(stale)
        # Closing the shelf guarantees the data is flushed to disk.
        with shelve.open(self.file_name) as dump:
            dump.update(self.words_count)

    def _load_in_dump(self):
        '''Read the cached counts into an in-memory dict and close the shelf.'''
        with shelve.open(self.file_name) as dump:
            self.words_count = dict(dump)
        

In [6]:
# obj = CountWords('../data/annot.opcorpora.no_ambig.xml')

### Дополнение словаря odict леммами из opcorpora

In [7]:
class ADDLemm(CountWords):
    '''Append corpus lemmas that are missing from the odict dictionary to its CSV.

    Instantiating the class does all the work: the corpus is parsed into
    ``self.opcorpora`` (``{lemma: {tag: {token: 0}}}``), the lemmas already
    present in the dictionary are read into ``self.odict``, and every corpus
    lemma not found there is appended to ``'../data/odict.csv'``.
    '''

    # Grammeme tags whose tokens are never added (attribute name kept as spelled).
    exclade_g = {'pnct', 'latn', 'unkn', 'numb', 'symb'}

    def __init__(self, file_words, file_count):
        '''Run the whole merge.

        Args:
            file_words: path to the existing odict CSV (read for known lemmas).
            file_count: path to the annotated corpus file.
        '''
        # The parent constructor is deliberately not called: it would load or
        # build the frequency cache, which this class does not use.
        self.opcorpora = {}
        self._parse_file(file_count)
        self.odict = {}
        self._parse_odict(file_words)
        # NOTE(review): the output path is hard-coded instead of reusing
        # ``file_words`` -- confirm this is intended.
        with open('../data/odict.csv', 'a') as file:
            for lemma, by_tag in self.opcorpora.items():
                if lemma in self.odict:
                    continue
                for ps, words in by_tag.items():
                    line = self._get_format_odict(lemma, ps, words)
                    try:
                        file.write(line + '\n')
                    except UnicodeEncodeError:
                        # Best effort: skip entries the file's default
                        # encoding cannot represent; real I/O errors propagate.
                        continue

    def _get_format_odict(self, lemma, ps, words):
        '''Return one CSV row: lemma, tag, then every word form.'''
        return ','.join([lemma, ps, *words])

    def _set_value(self, token, lemma, g):
        '''Record *token* under ``opcorpora[lemma][g]``.

        Single-character lemmas and tags listed in ``exclade_g`` are skipped.
        '''
        if len(lemma) <= 1:
            return
        if g in self.exclade_g:
            return
        self.opcorpora.setdefault(lemma, {}).setdefault(g, {})[token] = 0

    def _parse_odict(self, file_words):
        '''Collect the first CSV field (the lemma) of every line of *file_words*.'''
        with open(file_words) as file:
            for line in file:
                # rstrip instead of [:-1]: a last line without a trailing
                # newline must not lose its final character.
                fields = line.rstrip('\n').lower().split(',')
                self.odict[fields[0]] = 0
    

In [8]:
# ADDLemm('../data/odict.csv', '../data/annot.opcorpora.no_ambig.xml')

### Сохранение словаря в дерево
Источник словаря: http://odict.ru/

In [10]:

class MorphAnalyzerFit:
    '''Build the word tree used for lemmatisation and persist it as JSON files.

    ``self.words`` is a nested dict keyed by the characters of a word form
    taken from the last one to the first.  The node reached after consuming a
    whole word holds a ``'val'`` dict that maps an *instruction* string to a
    count.  An instruction has the form ``'<-n>|<ending>|<POS>'``: drop the
    last ``n`` characters of the word form, append ``ending`` to obtain the
    lemma, and ``POS`` is the part-of-speech label.
    '''

    # Dictionary / corpus part-of-speech tag -> coarse label stored in instructions.
    map = {'вводн.': 'ADV', 'ж': 'S','жо': 'S','м': 'S','межд.': 'ADV','мн.': 'S','мо': 'S',
       'мо-жо': 'S','мс-п': 'A','н': 'ADV', 'нсв': 'V','п': 'A','предик.': 'ADV',
       'предл.': 'PR','с': 'S','св': 'V','св-нсв': 'V','со': 'S','союз': 'CONJ',
       'сравн.': 'A','част.': 'ADV','числ.': 'ADV','числ.-п': 'A',
       'adjf':'A', 'adjs':'A', 'advb':'ADV', 'comp':'ADV', 'conj':'CONJ', 'grnd':'V', 'infn':'V',
       'intj':'ADV', 'latn':'NI', 'noun':'S', 'npro':'NI', 'pnct':'NI', 'prcl':'ADV', 'prtf':'V',
       'prts':'V', 'romn':'NI', 'symb':'NI','verb':'V', 'prep':'PR', 'pred':'ADV', 'numr':'A'}

    # Directory that receives one JSON file per top-level tree key.
    path = '../data/words/'

    def fit(self, file_words, file_count):
        '''Build the tree from the dictionary CSV and write it to ``path``.

        Args:
            file_words: CSV with rows ``lemma,tag,form1,form2,...``.
            file_count: annotated corpus used for lemma frequencies.
        '''
        self.words_count = CountWords(file_count)
        self.words = {}
        with open(file_words) as file:
            for line in file:
                l_arr = line.rstrip('\n').lower().split(',')
                if len(l_arr) < 2:
                    # Blank or malformed row: nothing to index.
                    continue
                ps = self._get_ps(l_arr[1], l_arr[0])
                if not ps:
                    continue
                self._set_words(l_arr[0], ps, l_arr[2:])
        # The frequency table is only needed while the tree is being built.
        del self.words_count
        self._save_to_dump()

    def _get_ps(self, ps, words):
        '''Map tag *ps* to its coarse label; report and return ``False`` if unknown.

        *words* is only used in the diagnostic message.
        '''
        if ps not in self.map:
            print(f'Ключ "{ps}" для слова "{words}" не размечен')
            return False
        return self.map[ps]

    def _set_words(self, lemma, ps, words):
        '''Insert every form in *words*, plus the lemma itself, into the tree.

        For verbs the stored count is the number of forms (lemma included);
        for other parts of speech it is the corpus frequency of the lemma.
        '''
        # Work on a copy so the caller's list is left untouched.
        forms = [*words, lemma]
        if ps == 'V':
            l_count = len(forms)
        else:
            l_count = self.words_count.get_count(lemma, ps)
        for word in forms:
            instruction = self._get_instruction(word, lemma, ps)
            self._set_word(self.words, word, instruction, l_count)

    def _get_instruction(self, word, lemma, ps):
        '''Return the ``'<-n>|<ending>|<POS>'`` recipe turning *word* into *lemma*.'''
        # Length of the common prefix; also well defined for empty strings.
        shared = 0
        for w_ch, l_ch in zip(word, lemma):
            if w_ch != l_ch:
                break
            shared += 1
        return str(-len(word[shared:])) + '|' + lemma[shared:] + '|' + ps

    def _get_lemma(self, word, instruction):
        '''Apply *instruction* to *word*; return ``(lemma, POS)``.'''
        l, end, ps = instruction.split('|')
        l = int(l)
        # A cut of 0 must keep the whole word (word[:0] would be empty).
        return (word if l == 0 else word[:l]) + end, ps

    def _set_word(self, words, word, instruction, l_count):
        '''Store ``instruction -> l_count`` at the node addressed by *word* reversed.

        Missing intermediate nodes are created; an empty *word* is ignored.
        '''
        if len(word) == 0:
            return
        node = words
        for ch in reversed(word):
            node = node.setdefault(ch, {})
        node.setdefault('val', {})[instruction] = l_count

    def _save_to_dump(self):
        '''Write each top-level subtree to ``path/<key>`` as JSON.'''
        os.makedirs(self.path, exist_ok=True)
        for k, v in self.words.items():
            with open(os.path.join(self.path, k), 'w') as file:
                file.write(json.dumps(v))

    def _load_in_dump(self):
        '''Rebuild ``self.words`` from every file found in ``path``.'''
        self.words = {}
        for filename in os.listdir(self.path):
            with open(os.path.join(self.path, filename), 'r') as file:
                self.words[filename] = json.loads(file.read())

### Поиск или генерация лемм

In [11]:
class MorphAnalyzer(MorphAnalyzerFit):
    '''Look up a word form in the saved tree, or guess its lemma from similar forms.'''

    def __init__(self):
        '''Load the tree from the JSON dumps.'''
        self._load_in_dump()

    def parse(self, word):
        '''Return ``(lemma, POS)`` for *word*.

        A form found in the tree uses its stored instruction with the highest
        count.  Otherwise known forms sharing the ending of *word* vote with
        their instructions, provided their similarity ratio to *word* is at
        least 0.75.  With no candidate at all the result is ``(word, 'ADV')``.
        '''
        candidates = self._get_val_in_dict(self.words, word)
        if not candidates:
            similar = self._get_variants(self.words, word, '', -2)
            if not similar:
                return word, 'ADV'
            candidates = {}
            for reversed_form, rules in similar.items():
                # Keys come back in tree order (last letter first).
                known_form = reversed_form[::-1]
                if sm(None, word, known_form).ratio() < 0.75:
                    continue
                for rule in rules:
                    candidates[rule] = candidates.get(rule, 0) + 1
        best_rule, best_score = False, -1
        for rule, score in candidates.items():
            # Strict comparison: the first rule with the top score wins.
            if score > best_score:
                best_rule, best_score = rule, score
        if not best_rule:
            return word, 'ADV'
        return self._get_lemma(word, best_rule)

    def _get_val_in_dict(self, words, word):
        '''Return the ``'val'`` dict stored for exactly *word*, or ``False``.'''
        if len(word) == 0:
            return False
        node = words
        for ch in reversed(word):
            if ch not in node:
                return False
            node = node[ch]
        if 'val' in node:
            return node['val']
        return False

    def _get_variants(self, words, word, w, step):
        '''Collect known forms that share an ending with *word*.

        Walks the tree along the trailing letters of *word*, one per call,
        accumulating them in *w*.  Nothing is collected while *step* is
        negative; once it reaches 0 everything below the deepest matching
        node is returned as ``{reversed_form: val_dict}``.  Returns ``False``
        when *word* runs out or the path breaks before that point.
        '''
        if len(word) == 0:
            return False
        last = word[-1]
        collecting = step >= 0
        if last not in words:
            return self._get_all(words, w, {}) if collecting else False
        w = w + last
        subtree = words[last]
        if collecting:
            return self._get_all(subtree, w, {})
        if len(word) == 1:
            return False
        return self._get_variants(subtree, word[:-1], w, step + 1)

    def _get_all(self, words, w, instructions):
        '''Add every ``'val'`` dict under *words* to *instructions*, keyed by path.

        *w* is the path that led to *words*; the same *instructions* dict is
        filled in place and returned.
        '''
        for key, child in words.items():
            if key == 'val':
                instructions[w] = child
            else:
                self._get_all(child, w + key, instructions)
        return instructions

    def _get_value(self):
        '''Placeholder; does nothing.'''
        pass
            
                

### Выполнение тестового задания

In [8]:
# Build the analyzer; its constructor loads the tree from the JSON files
# found under MorphAnalyzerFit.path.
obj = MorphAnalyzer()
# One-off (re)build of those JSON files from the dictionary and the corpus:
# obj.fit('../data/odict.csv', '../data/annot.opcorpora.no_ambig.xml')

In [18]:
# Annotate every word of the input file as word{lemma=POS}, one output line
# per input line.  Commas, full stops, question and exclamation marks are
# dropped and the newline becomes a space before splitting on spaces.
punct_table = str.maketrans({',': None, '.': None, '?': None, '!': None, '\n': ' '})
# ``with`` closes both files even if parsing a word raises.
with open('dataset_37845_1.txt', 'r', encoding='utf-8') as file_in, \
        open('answer.txt', 'w', encoding='utf-8') as file_out:
    for line in file_in:
        annotated = []
        for word in line.translate(punct_table).split(' '):
            if len(word) == 0:
                continue
            lemma, pos = obj.parse(word.lower())
            # The original spelling is kept in the output; only the lookup is lower-cased.
            annotated.append(word + '{' + lemma + '=' + pos + '}')
        file_out.write(' '.join(annotated) + '\n')

<center><img src="stepik.jpg"></center>