## HSE natural language processing
### HW 02

In [58]:
import os
import re
import string
import numpy as np

In [59]:
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
from pymystem3 import Mystem

In [3]:
# Directory that holds every input/output file used by this notebook.
RESOURCES_PATH = './resources'
# Training corpus read by load_texts() and the matching scores read by load_scores().
TRAIN_TEXTS_PATH = os.path.join(RESOURCES_PATH, 'texts_train.txt')
TRAIN_SCORES_PATH = os.path.join(RESOURCES_PATH, 'scores_train.txt')
# Test sentences consumed by run_task() and the file it writes predictions to.
TEST_INPUT_FILENAME = os.path.join(RESOURCES_PATH, 'test.in')
TEST_OUTPUT_FILENAME = os.path.join(RESOURCES_PATH, 'test.out')

In [4]:
def load_scores(path=None):
    """Load integer scores, one per line, into a NumPy array.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``TRAIN_SCORES_PATH``.

    Returns
    -------
    numpy.ndarray
        The scores in file order.

    Raises
    ------
    ValueError
        If a non-blank line cannot be parsed as an integer.
    """
    if path is None:
        path = TRAIN_SCORES_PATH
    with open(path, 'r') as scores_file:
        # Skip blank lines (e.g. a trailing empty line), which int() would
        # otherwise reject with a ValueError.
        return np.array([int(line) for line in scores_file if line.strip()])

In [5]:
def load_texts(path=None, encoding='utf-8'):
    """Load a text corpus as a NumPy array with one element per line.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``TRAIN_TEXTS_PATH``.
    encoding : str, optional
        Text encoding of the file. Passed explicitly so the result does not
        depend on the platform's default locale encoding.
        NOTE(review): UTF-8 is assumed for the corpus -- confirm.

    Returns
    -------
    numpy.ndarray
        The lines of the file in order, each still carrying its trailing
        newline (as returned by ``readlines``).
    """
    if path is None:
        path = TRAIN_TEXTS_PATH
    with open(path, 'r', encoding=encoding) as texts_file:
        return np.array(texts_file.readlines())

In [6]:
def run_task(evaluator, input_path=None, output_path=None, encoding='utf-8'):
    """Apply ``evaluator`` to every input line and write one result per line.

    Parameters
    ----------
    evaluator : callable
        Called with each raw input line (trailing newline included); must
        return the string to write for that line.
    input_path : str, optional
        File with the sentences to evaluate. Defaults to ``TEST_INPUT_FILENAME``.
    output_path : str, optional
        File to (over)write with the results. Defaults to ``TEST_OUTPUT_FILENAME``.
    encoding : str, optional
        Text encoding used for both files.
        NOTE(review): UTF-8 is assumed for the test input -- confirm.
    """
    if input_path is None:
        input_path = TEST_INPUT_FILENAME
    if output_path is None:
        output_path = TEST_OUTPUT_FILENAME
    with open(input_path, 'r', encoding=encoding) as input_file, \
            open(output_path, 'w', encoding=encoding) as output_file:
        for line in input_file:
            output_file.write(evaluator(line))
            # Text-mode files translate '\n' to the platform line ending on
            # their own; writing os.linesep here would yield '\r\r\n' on Windows.
            output_file.write('\n')

#### Base approach: average mark

In [7]:
# Baseline prediction: mean of the training scores, rounded to an integer.
avg_mark = int(round(np.mean(load_scores())))
print('Average mark: {}'.format(avg_mark))

Average mark: 8


In [8]:
# Baseline submission: ignore the sentence and emit the rounded average mark for every line.
run_task(lambda sentence: str(avg_mark))

#### Simple approach: average unigram score

In [9]:
# Training texts as returned by load_texts(): one array element per line of the corpus file.
texts = load_texts()

In [10]:
# Integer scores as returned by load_scores(); paired positionally with `texts` further down.
scores = load_scores()

In [33]:
def parse_line_to_tokens(line):
    """Split a line into lowercase word tokens.

    A token is a maximal run of word characters (``\\w``: letters, digits and
    underscore, Unicode-aware for ``str`` input). Punctuation and whitespace,
    including the trailing line terminator, act only as separators.

    Parameters
    ----------
    line : str
        Raw text, optionally ending with a newline.

    Returns
    -------
    list of str
        The tokens in order of appearance; empty if the line has no word
        characters.
    """
    # A raw-string pattern avoids the invalid '\W' escape warning, and matching
    # the tokens directly needs neither the empty-string filter nor the
    # platform-dependent os.linesep stripping.
    return re.findall(r'\w+', line.lower())

In [36]:
# marks[token] collects the score of every training text containing the token;
# a text contributes at most once per token, however often the token repeats.
marks = defaultdict(list)
# last[token] is the index of the most recent text in which the token was
# recorded (-1 if never), used to skip repeats within the same text.
last = defaultdict(lambda: -1)

# Wrap the zip (not the enumerate) and pass total= so tqdm knows the length
# and can render real progress instead of an indeterminate bar.
pairs = tqdm(zip(texts, scores), total=min(len(texts), len(scores)))
for ind, (text, score) in enumerate(pairs):
    for token in parse_line_to_tokens(text):
        if last[token] != ind:
            marks[token].append(score)
            last[token] = ind

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [39]:
# Mean score per token, computed over the texts in which the token occurs.
average_mark = {
    token: 1. * sum(token_scores) / len(token_scores)
    for token, token_scores in marks.items()
}

In [48]:
def get_average_mark(word):
    """Return the mean training score for ``word``.

    Words absent from ``average_mark`` fall back to the global ``avg_mark``.
    """
    try:
        return average_mark[word]
    except KeyError:
        return avg_mark

In [51]:
def get_text_average_mark(line):
    """Predict a mark for ``line`` as the rounded mean of its token marks.

    Parameters
    ----------
    line : str
        Raw sentence, tokenised with ``parse_line_to_tokens``.

    Returns
    -------
    str
        The predicted integer mark as a string. A line without any tokens
        (empty or punctuation-only) gets the global ``avg_mark``.
    """
    token_marks = [get_average_mark(token) for token in parse_line_to_tokens(line)]
    if not token_marks:
        # The mean of an empty array is NaN, and int(round(nan)) raises
        # ValueError; fall back to the baseline prediction instead.
        return str(avg_mark)
    return str(int(round(np.array(token_marks).mean())))

In [53]:
# Unigram submission: score every test sentence with get_text_average_mark.
run_task(get_text_average_mark)