## HSE natural language processing
### HW 01

In [4]:
import os
import csv
import pickle
import xml.etree.ElementTree as ET

In [5]:
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm

In [6]:
# Directory (relative to the notebook) that holds the dictionaries and the test files.
RESOURCES_PATH = './resources'
# CSV dictionary read by load_odict (decoded there as cp1251).
ODICT_FILENAME = 'odict.csv'
# XML dictionary streamed with ET.iterparse by load_grammemes_parents and load_lemmas.
OPCORPORA_FILENAME_XML = 'dict.opcorpora.xml'
# NOTE(review): the next two names are not referenced by any visible cell — confirm they are still needed.
OPCORPORA_FILENAME = 'dict.opcorpora.txt'
OPCORPORA_DUMP_FILENAME = 'opcorpora.pkl'
# Base name of the test files: '<name>.in' is read and '<name>.out' is written.
TEST_FILENAME = 'test'

In [7]:
# Maps the grammatical comment stored in the second CSV column of the odict
# file to a coarse part-of-speech tag. Any comment that is not listed here
# resolves to 'NI' through the defaultdict factory.
# NOTE(review): trailing dots are inconsistent between keys ('мн' vs 'н.' and
# 'св-нсв.') — verify the spellings against the comments actually used in odict.csv.
COMMENT_TO_POS = defaultdict(
    lambda: 'NI',
    {
        # nouns
        'м': 'S',
        'ж': 'S',
        'с': 'S',
        'со': 'S',
        'жо': 'S',
        'мо': 'S',
        'мн': 'S',
        'мо-жо': 'S',
        # prepositions
        'предл.': 'PR',
        # adjectives
        'п': 'A',
        # adverbs and everything that is treated like one
        'межд.': 'ADV',
        'предик.': 'ADV',
        'н.': 'ADV',
        'вводн.': 'ADV',
        'сравн.': 'ADV',
        'част.': 'ADV',
        # verbs
        'св': 'V',
        'нсв': 'V',
        'св-нсв.': 'V',
        # conjunctions
        'союз': 'CONJ',
    },
)

In [8]:
# Characters removed from every input line before it is split into tokens.
# Files opened in text mode hand back '\n' as the line terminator on every
# platform, so the literal '\n' is listed here instead of os.linesep
# (which is '\r\n' on Windows and would therefore never match).
PUNCTUATION = ['.', ',', '?', '!', '\n']

In [9]:
def load_odict():
    """Build a word -> (lemma, POS tag) lookup from the odict CSV file.

    Every CSV row is read as ``lemma, comment, form, form, ...``: the first
    column is the lemma, the second one is translated to a tag through
    COMMENT_TO_POS and each remaining column is registered as a word form of
    that lemma. When the same string occurs in several rows, the last row wins.

    Returns:
        dict: maps the lemma and each listed form to a ``(lemma, tag)`` tuple.
    """
    odict = {}
    odict_path = os.path.join(RESOURCES_PATH, ODICT_FILENAME)
    # newline='' lets the csv module do its own newline handling, as its documentation requires.
    with open(odict_path, encoding='cp1251', newline='') as csv_file:
        for row in csv.reader(csv_file):
            if len(row) < 2:
                # Blank or truncated row: there is no lemma/comment pair to register.
                continue
            lemma = row[0]
            odict_value = (lemma, COMMENT_TO_POS[row[1]])
            odict[lemma] = odict_value
            for token in row[2:]:
                odict[token] = odict_value
    return odict

In [10]:
# Build the word -> (lemma, tag) lookup once; run_task receives it as an argument
# and process_word_odict reads it as a module-level name.
odict = load_odict()

In [11]:
def run_task(test_input_path, test_output_path, word_dict):
    """Lemmatise and tag every line of a text file with a plain dictionary lookup.

    Each input line is stripped of the characters in PUNCTUATION and split on
    single spaces. Every non-empty token is written as ``token{lemma=tag}``;
    the annotated tokens of a line are joined with spaces and terminated by a
    newline.

    Args:
        test_input_path: path of the text file to read.
        test_output_path: path of the file to (over)write with the annotations.
        word_dict: mapping from a lower-cased word form to ``(lemma, tag)``.

    A token missing from ``word_dict`` is retried with 'ё' replaced by 'е';
    if that fails as well it is emitted with its lower-cased form as the lemma
    and the tag 'NI'.
    """
    # NOTE(review): both files are opened with the platform default encoding — confirm
    # that this matches the encoding of the test data.
    with open(test_input_path, 'r') as input_file, \
            open(test_output_path, 'w') as output_file:
        for line in input_file:
            # Drop the terminator explicitly so the last token is clean even if
            # PUNCTUATION does not contain '\n'.
            line = line.rstrip('\r\n')
            for character in PUNCTUATION:
                line = line.replace(character, '')
            tokens = []
            for token in line.split(' '):
                if token == '':
                    continue
                token_lower = token.lower()
                lemma, tag = token_lower, 'NI'
                if token_lower in word_dict:
                    lemma, tag = word_dict[token_lower]
                else:
                    # str.replace returns a new string; the result has to be kept
                    # for the fallback lookup to have any effect.
                    normalized = token_lower.replace('ё', 'е')
                    if normalized in word_dict:
                        lemma, tag = word_dict[normalized]
                tokens.append('{}{{{}={}}}'.format(token, lemma, tag))
            output_file.write(' '.join(tokens))
            # Text mode translates '\n' to the platform terminator; writing
            # os.linesep here would produce '\r\r\n' on Windows.
            output_file.write('\n')

In [12]:
# Baseline run: annotate '<TEST_FILENAME>.in' using only the odict lookup and write '<TEST_FILENAME>.out'.
run_task(os.path.join(RESOURCES_PATH, TEST_FILENAME + '.in'), os.path.join(RESOURCES_PATH, TEST_FILENAME + '.out'), odict)

```You've got the score 0.7444979367262724 (88.5% correct lemmas and 76.1% correct POS tags)```

Loading the whole OpenCorpora dictionary into an ElementTree runs out of memory on my PC, so the XML file is parsed incrementally with `ET.iterparse` instead.

In [13]:
def load_grammemes_parents():
    """Read the grammeme hierarchy from the XML dictionary.

    The file is streamed with ``ET.iterparse``. For every ``<grammeme>``
    element the value of its ``parent`` attribute is remembered, and the text
    of the following ``<name>`` element is mapped to that value. Parsing stops
    as soon as the closing ``</grammemes>`` tag is reached, so the rest of the
    file is never read.

    Returns:
        dict: grammeme name -> value of the ``parent`` attribute.

    Raises:
        KeyError: if a ``<grammeme>`` element has no ``parent`` attribute.
    """
    xml_path = os.path.join(RESOURCES_PATH, OPCORPORA_FILENAME_XML)
    grammemes_parents = {}
    grammeme_parent = None
    # Opening the file ourselves guarantees it is closed when the loop breaks early.
    with open(xml_path, 'rb') as xml_file:
        for event, elem in ET.iterparse(xml_file, events=("start", "end")):
            if event == 'start':
                # Attributes are already available on 'start' events.
                if elem.tag == 'grammeme':
                    grammeme_parent = elem.attrib['parent']
            elif elem.tag == 'name':
                # Element text is only guaranteed to be populated on 'end' events,
                # so the name is read here rather than on 'start'.
                grammemes_parents[''.join(elem.itertext())] = grammeme_parent
            elif elem.tag == 'grammemes':
                break
    return grammemes_parents

In [14]:
# Grammeme name -> parent mapping; process_grammeme reads it as a module-level name.
grammemes_info = load_grammemes_parents()

In [15]:
def load_lemmas():
    """Stream the XML dictionary and collect lemma, word-form and link tables.

    The file is read with ``ET.iterparse``. Only attributes are used, which are
    available on 'start' events:

    * ``<lemma id=...>`` sets the id used for the lemma that follows;
    * ``<l t=...>`` is the lemma text, ``<f t=...>`` is a word form of it;
    * ``<g v=...>`` is a grammeme, attached to the enclosing ``<l>`` or ``<f>``;
    * ``<link from=... to=...>`` is stored as ``links[to] = from``.

    Texts that occur more than once overwrite earlier entries in
    ``lemma_to_id`` and ``word_to_lemma``, while their grammemes are appended
    to the same list.

    Returns:
        tuple: ``(lemma_to_id, id_to_lemma, lemma_to_grammemes, word_to_lemma,
        word_to_grammemes, links)``, all plain dicts keyed by strings.
    """
    xml_path = os.path.join(RESOURCES_PATH, OPCORPORA_FILENAME_XML)
    lemma_to_id = {}
    id_to_lemma = {}
    lemma_to_grammemes = {}
    word_to_grammemes = {}
    word_to_lemma = {}
    links = {}
    lemma_id = None
    lemma = None
    word = None
    in_l = False
    in_f = False
    with open(xml_path, 'rb') as xml_file:
        for event, elem in ET.iterparse(xml_file, events=("start", "end")):
            tag = elem.tag
            if event == 'start':
                if tag == 'lemma':
                    lemma_id = elem.attrib['id']
                elif tag == 'l':
                    in_l = True
                    lemma = elem.attrib['t']
                    lemma_to_id[lemma] = lemma_id
                    id_to_lemma[lemma_id] = lemma
                elif tag == 'g':
                    if in_l:
                        lemma_to_grammemes.setdefault(lemma, []).append(elem.attrib['v'])
                    if in_f:
                        word_to_grammemes.setdefault(word, []).append(elem.attrib['v'])
                elif tag == 'f':
                    in_f = True
                    word = elem.attrib['t']
                    word_to_lemma[word] = lemma
                elif tag == 'link':
                    links[elem.attrib['to']] = elem.attrib['from']
            elif tag == 'l':
                in_l = False
            elif tag == 'f':
                in_f = False
            elif tag == 'lemma':
                # iterparse keeps building the full tree; dropping each finished
                # <lemma> subtree keeps memory use from growing with every word form.
                elem.clear()
    return lemma_to_id, id_to_lemma, lemma_to_grammemes, word_to_lemma, word_to_grammemes, links

In [16]:
# Unpack the six lookup tables into module-level names; process_word_opcorpora and
# get_word_pos_opcorpora read several of them as globals.
lemma_to_id, id_to_lemma, lemma_to_grammemes, word_to_lemma, word_to_gramemmes, links = load_lemmas()

In [110]:
# Print every grammeme whose recorded parent is 'POST'.
for key, value in grammemes_info.items():
    if value == 'POST':
        print(key)

NOUN
ADJF
ADJS
COMP
VERB
INFN
PRTF
PRTS
GRND
NUMR
ADVB
NPRO
PRED
PREP
CONJ
PRCL
INTJ


In [212]:
# Spot check of the lookup on a single word form.
# NOTE(review): process_word_opcorpora is defined further down in this notebook, so on a
# fresh top-to-bottom run this cell raises NameError — move it below that definition.
process_word_opcorpora('долог')

('долгий', 'A')

In [214]:
# Maps a part-of-speech grammeme to the coarse tag used in the output.
# Grammemes that are not listed resolve to 'NI' through the defaultdict factory.
GRAMMEME_TO_POS = defaultdict(
    lambda: 'NI',
    {
        # nouns
        'NOUN': 'S',
        # adjectives (full, short, comparative)
        'ADJF': 'A',
        'ADJS': 'A',
        'COMP': 'A',
        # verbs, infinitives, participles and gerunds
        'VERB': 'V',
        'INFN': 'V',
        'PRTF': 'V',
        'PRTS': 'V',
        'GRND': 'V',
        # adverbs and everything that is tagged like one
        'ADVB': 'ADV',
        'PRED': 'ADV',
        'INTJ': 'ADV',
        'PRCL': 'ADV',
        # prepositions and conjunctions
        'PREP': 'PR',
        'CONJ': 'CONJ',
    },
)

In [163]:
# Word forms that are always tagged as conjunctions, regardless of what the
# dictionaries say. Built directly as a set so the name never holds a list.
CONJS = {
    # Cyrillic 'а', written as an escape so that it cannot be confused with the
    # look-alike Latin letter 'a' (which would never match Russian text).
    '\u0430',
    'благо', 'буде', 'будто', 'вдобавок', 'да', 'дабы', 'даже', 'же', 'едва', 'ежели', 'если', 'зато',
    'зачем', 'и', 'ибо', 'или', 'кабы', 'как', 'когда', 'коли', 'либо', 'лишь', 'нежели', 'но', 'однако',
    'особенно', 'оттого', 'отчего', 'пока', 'покуда', 'поскольку', 'потому', 'почему', 'притом',
    'причем', 'пускай', 'пусть', 'раз', 'словно', 'также', 'тоже', 'только', 'точно', 'хотя', 'чем', 'что',
    'чтоб', 'чтобы',
}

# Word forms that are always tagged as prepositions; checked after CONJS, so a
# form present in both sets ends up as a preposition.
PRS = {
    'а-ля', 'без', 'безо', 'благодаря', 'близ', 'в', 'вблизи', 'ввиду', 'вглубь', 'вдогон', 'вдоль', 'взамен',
    'включая', 'вкруг', 'вместо', 'вне', 'внизу', 'внутри', 'внутрь', 'во', 'вовнутрь', 'возле', 'вокруг',
    'вопреки', 'вразрез', 'вроде', 'вслед', 'вследствие', 'для', 'для-ради', 'до', 'за', 'замест', 'заместо',
    'из', 'из-за', 'из-под', 'из-подо', 'изо', 'исключая', 'к', 'касаемо', 'касательно', 'ко',
    'кроме', 'кругом', 'меж', 'между', 'мимо', 'на', 'наверху', 'навроде', 'навстречу', 'над', 'надо', 'назад',
    'назади', 'назло', 'накануне', 'наместо', 'наперекор', 'наперерез', 'наперехват', 'наподобие', 'наподобье',
    'напротив', 'наряду', 'насупротив', 'насчет', 'несмотря', 'ниже', 'о', 'об', 'обо', 'обок', 'обочь',
    'около', 'окрест', 'окроме', 'окромя', 'округ', 'опосля', 'опричь', 'от', 'ото', 'перед', 'передо',
    'по', 'повдоль', 'поверх', 'под', 'подле', 'подо', 'подобно', 'позади', 'помимо', 'поперёд', 'поперёк',
    'порядка', 'посереди', 'посередине', 'посредь', 'после', 'посреди', 'посредине', 'посредством', 'пред',
    'предо', 'прежде', 'при', 'про', 'промеж', 'помежду', 'против', 'ради', 'с', 'со', 'сверх', 'сверху',
}

In [119]:
def process_grammeme(grammeme):
    """Climb the grammeme hierarchy up to the part-of-speech level.

    Starting from ``grammeme``, parents are followed through ``grammemes_info``
    until a grammeme whose parent is 'POST' is found.

    Returns:
        The grammeme whose direct parent is 'POST', or None when the chain
        reaches an empty parent or a name missing from ``grammemes_info``.
    """
    current = grammeme
    while current in grammemes_info:
        parent = grammemes_info[current]
        if parent == 'POST':
            return current
        if parent == '':
            # Top of a hierarchy that is not rooted in 'POST'.
            return None
        current = parent
    return None

In [186]:
def get_word_pos_opcorpora(word):
    """Return the coarse POS tag of a lemma based on its grammemes.

    The grammemes recorded for ``word`` in ``lemma_to_grammemes`` are scanned in
    order. Each one is resolved to its part-of-speech level grammeme with
    ``process_grammeme`` and then translated through ``GRAMMEME_TO_POS``; the
    first translation that is not 'NI' is returned.

    Args:
        word: lemma text used as the key of ``lemma_to_grammemes``.

    Returns:
        str: the tag, or 'NI' when the lemma is unknown or none of its
        grammemes maps to a tag.
    """
    for raw_grammeme in lemma_to_grammemes.get(word, ()):
        pos_grammeme = process_grammeme(raw_grammeme)
        if pos_grammeme is None:
            continue
        # Look up the resolved grammeme, not the raw one, and use .get() so the
        # defaultdict is not filled with every grammeme that is ever queried.
        tag = GRAMMEME_TO_POS.get(pos_grammeme, 'NI')
        if tag != 'NI':
            return tag
    return 'NI'

In [196]:
def process_word_opcorpora(word):
    """Look a word form up in the tables built by load_lemmas.

    The word is lower-cased first. A known form is mapped to its lemma; if the
    id of that lemma is a key of ``links``, the lemma is replaced by the one the
    link points to. Only a single link is followed.
    NOTE(review): longer link chains are not followed — confirm that one hop is intended.

    An unknown form containing 'ё' is retried with 'ё' replaced by 'е'.

    Returns:
        tuple: ``(lemma, tag)``; for an unknown form the lower-cased word
        itself (after the 'ё' retry, if one happened) and 'NI'.
    """
    word = word.lower()
    if word in word_to_lemma:
        lemma = word_to_lemma[word]
        lemma_id = lemma_to_id[lemma]
        if lemma_id in links:
            lemma = id_to_lemma[links[lemma_id]]
        return lemma, get_word_pos_opcorpora(lemma)
    normalized = word.replace('ё', 'е')
    if normalized != word:
        return process_word_opcorpora(normalized)
    return word, 'NI'

In [188]:
def process_word_odict(word):
    """Look a word form up in the module-level ``odict`` table.

    The word is lower-cased first. If it is missing from ``odict``, the lookup
    is retried with 'ё' replaced by 'е'.

    Returns:
        tuple: ``(lemma, tag)`` from ``odict``, or the lower-cased word and
        'NI' when neither spelling is present.
    """
    word = word.lower()
    if word in odict:
        lemma, tag = odict[word]
        return lemma, tag
    # str.replace returns a new string; the result has to be kept for the
    # fallback lookup to have any effect.
    normalized = word.replace('ё', 'е')
    if normalized in odict:
        lemma, tag = odict[normalized]
        return lemma, tag
    return word, 'NI'

In [223]:
def run_task_both(test_input_path, test_output_path):
    """Lemmatise and tag a text file using both dictionaries plus word lists.

    Each input line is stripped of the characters in PUNCTUATION and split on
    single spaces. For every non-empty token:

    1. ``process_word_opcorpora`` supplies the lemma and the tag;
    2. if that tag is 'NI', only the tag is replaced by the one from
       ``process_word_odict`` (the lemma is kept);
    3. if the token does not start with a lower-case letter and is not at
       position 0 of the split line, the lemma is capitalised;
    4. membership in CONJS, then in PRS, overrides both lemma and tag;
    5. a tag that is still 'NI' becomes 'ADV'.

    Tokens are written as ``token{lemma=tag}``, joined with spaces, one line of
    output per line of input.

    Args:
        test_input_path: path of the text file to read.
        test_output_path: path of the file to (over)write with the annotations.
    """
    # NOTE(review): both files are opened with the platform default encoding — confirm
    # that this matches the encoding of the test data.
    with open(test_input_path, 'r') as input_file, \
            open(test_output_path, 'w') as output_file:
        # readlines() gives tqdm a sized sequence, so the progress bar knows the total.
        for line in tqdm(input_file.readlines()):
            # Drop the terminator explicitly so the last token is clean even if
            # PUNCTUATION does not contain '\n'.
            line = line.rstrip('\r\n')
            for character in PUNCTUATION:
                line = line.replace(character, '')
            tokens = []
            for ind, token in enumerate(line.split(' ')):
                if token == '':
                    continue
                token_lower = token.lower()
                lemma, tag = process_word_opcorpora(token)
                if tag == 'NI':
                    tag = process_word_odict(token)[1]
                if not token[0].islower() and ind != 0:
                    lemma = lemma[0].upper() + lemma[1:]
                if token_lower in CONJS:
                    lemma, tag = token_lower, 'CONJ'
                if token_lower in PRS:
                    lemma, tag = token_lower, 'PR'
                if tag == 'NI':
                    tag = 'ADV'
                tokens.append('{}{{{}={}}}'.format(token, lemma, tag))
            output_file.write(' '.join(tokens))
            # Text mode translates '\n' to the platform terminator; writing
            # os.linesep here would produce '\r\r\n' on Windows.
            output_file.write('\n')

In [224]:
# Combined run over '<TEST_FILENAME>.in'. It writes to the same '<TEST_FILENAME>.out' path as the
# earlier run_task call, so that earlier output is overwritten.
run_task_both(os.path.join(RESOURCES_PATH, TEST_FILENAME + '.in'), os.path.join(RESOURCES_PATH, TEST_FILENAME + '.out'))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


