## HSE natural language processing
### HW 01

In [1]:
import os
import string
import re

In [42]:
from pymystem3 import Mystem
from operator import itemgetter
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict

In [44]:
# Root folder for every data file this notebook reads or writes.
RESOURCES_PATH = './resources'

# Test split: read by run_task (.in), written by run_task (.out) and by
# substitute_tags (.tst).
TEST_IN_FILENAME, TEST_OUT_FILENAME, TEST_TST_FILENAME = (
    os.path.join(RESOURCES_PATH, 'test.' + extension)
    for extension in ('in', 'out', 'tst')
)

# Training split: sentences and their line-aligned tag annotations.
TRAIN_SENTENCES_FILENAME = os.path.join(RESOURCES_PATH, 'train_sentences.txt')
TRAIN_TAGS_FILE = os.path.join(RESOURCES_PATH, 'train_nes.txt')

# Extra corpora scanned for additional dictionary entries.
DEVSET_DIR = os.path.join(RESOURCES_PATH, 'factRuEval-2016', 'devset')
NE3_DIR = os.path.join(RESOURCES_PATH, 'ne3', 'Collection3')

In [4]:
# Single Mystem instance shared through the global name `m`; it is used by
# lemmatize_sentence (m.lemmatize) and run_task (m.analyze).
m = Mystem()

In [5]:
# Strings treated as punctuation by lemmatize_sentence: every character of
# string.punctuation, the platform line separator and a literal '...'.
punct_set = {*string.punctuation, os.linesep, '...'}

In [6]:
def lemmatize_sentence(sentence):
    """Return *sentence* lemmatized by Mystem with punctuation removed.

    The items returned by ``m.lemmatize`` are filtered and joined:

    * items that are themselves members of ``punct_set`` are dropped;
    * a lone space is kept as a separator;
    * any other item has its spaces removed; if what remains is in
      ``punct_set`` the item is replaced by a single space.

    Finally every ``os.linesep`` is stripped from the joined string.

    This is a best-effort helper: if anything goes wrong while lemmatizing,
    the input is returned unchanged instead of raising.
    """
    def clean(lemma):
        # Lone spaces are the separators between words; keep them.
        if lemma == ' ':
            return ' '
        squeezed = lemma.replace(' ', '')
        # Punctuation padded with spaces (e.g. ' (') collapses to one space.
        return ' ' if squeezed in punct_set else squeezed

    try:
        lemmas = m.lemmatize(sentence)
        kept = [clean(lemma) for lemma in lemmas if lemma not in punct_set]
        return ''.join(kept).replace(os.linesep, '')
    except Exception:
        # Deliberate fallback, but no longer a bare `except:` -- Ctrl-C and
        # SystemExit must still be able to stop a long-running cell.
        return sentence

In [43]:
def _alpha_spans(line):
    """Return (start, end) offsets of every maximal alphabetic run in *line*."""
    spans = []
    start = None
    for pos, char in enumerate(line):
        if char.isalpha():
            if start is None:
                start = pos
        elif start is not None:
            spans.append((start, pos))
            start = None
    if start is not None:
        spans.append((start, len(line)))
    return spans


def run_task(lemmatize_dict, max_span_tokens=8):
    """Tag every line of TEST_IN_FILENAME and write the spans to TEST_OUT_FILENAME.

    Each line is cut into tokens (maximal runs of alphabetic characters).
    Starting at each token, the text running from that token through each of
    the following tokens (at most ``max_span_tokens`` tokens in total) is
    lemmatized with ``lemmatize_sentence`` and looked up in
    ``lemmatize_dict``. The longest span that hits wins: every token inside
    it is emitted once per tag stored under that key, and scanning resumes
    after the span.

    A token that starts no matching span is emitted as ``PERSON`` when the
    first Mystem analysis of the word carries the grammeme ``'фам'`` or
    ``'имя'``.

    Output file format: one line per input line, consisting of
    ``"<offset> <length> <tag> "`` triples followed by ``EOL``.

    Args:
        lemmatize_dict: mapping from lemmatized phrase to a list of tags.
        max_span_tokens: maximum number of consecutive tokens tried as one
            phrase (defaults to 8).

    Returns:
        A ``defaultdict`` mapping the input line index to the list of
        ``(offset, length, tag)`` tuples emitted for that line. Lines with
        no emitted span have no entry.
    """
    output = defaultdict(list)

    with open(TEST_IN_FILENAME, 'r') as in_file, open(TEST_OUT_FILENAME, 'w') as out_file:
        for line_ind, line in tqdm(enumerate(in_file)):
            tokens = _alpha_spans(line)
            token_ind = 0
            while token_ind < len(tokens):
                span_start = tokens[token_ind][0]
                window_end = min(token_ind + max_span_tokens, len(tokens))

                # Remember the longest matching span and its key so the
                # phrase does not have to be lemmatized a second time.
                matched_end = -1
                matched_key = None
                for end_ind in range(token_ind, window_end):
                    candidate = lemmatize_sentence(line[span_start:tokens[end_ind][1]])
                    if candidate in lemmatize_dict:
                        matched_end = end_ind
                        matched_key = candidate

                if matched_end == -1:
                    tok_from, tok_to = tokens[token_ind]
                    word = line[tok_from:tok_to]
                    try:
                        grammemes = set(m.analyze(word)[0]['analysis'][0]['gr'].split(','))
                    except Exception:
                        # Best effort: words without a usable analysis are
                        # simply left untagged.
                        grammemes = set()
                    if 'фам' in grammemes or 'имя' in grammemes:
                        output[line_ind].append((tok_from, tok_to - tok_from, 'PERSON'))
                        out_file.write('{} {} PERSON '.format(tok_from, tok_to - tok_from))
                    token_ind += 1
                else:
                    all_tags = lemmatize_dict[matched_key]
                    for span_ind in range(token_ind, matched_end + 1):
                        tok_from, tok_to = tokens[span_ind]
                        for tag in all_tags:
                            output[line_ind].append((tok_from, tok_to - tok_from, tag))
                            out_file.write('{} {} {} '.format(tok_from, tok_to - tok_from, tag))
                    token_ind = matched_end + 1

            # The file is in text mode, which already translates '\n' to the
            # platform line ending; writing os.linesep would double the '\r'
            # on Windows.
            out_file.write('EOL\n')

    return output

In [8]:
def load_train_dict():
    """Build phrase -> tags dictionaries from the training files.

    TRAIN_TAGS_FILE and TRAIN_SENTENCES_FILENAME are read in lockstep. Each
    tag line is a space-separated list of ``<offset> <length> <tag>``
    triples followed by one trailing field that is discarded. Spans are
    sorted by (tag, offset).

    Consecutive spans are glued into one phrase when they carry the same
    tag, the next span starts 0 to 5 characters after the current one ends,
    and the gap contains no letter, comma or parenthesis.

    Returns:
        ``(lemmatize_train_dict, train_dict)`` where ``train_dict`` maps the
        surface phrase to the list of tags seen for it (with repeats) and
        ``lemmatize_train_dict`` maps ``lemmatize_sentence(phrase)`` to the
        de-duplicated tags of all phrases sharing that lemma form.
    """
    max_gap = 5
    train_dict = {}
    with open(TRAIN_SENTENCES_FILENAME, 'r') as train_sentences_file, \
            open(TRAIN_TAGS_FILE, 'r') as train_tags_file:
        for tag_line, sentence in zip(train_tags_file, train_sentences_file):
            tags = tag_line.split(' ')[:-1]
            tokens = [
                (int(tags[ind]), int(tags[ind]) + int(tags[ind + 1]), tags[ind + 2])
                for ind in range(0, len(tags), 3)
            ]
            tokens.sort(key=itemgetter(2, 0))

            pending = ''  # text of same-tag spans already glued to the left
            for ind, (word_from, word_to, tag) in enumerate(tokens):
                # Only a token that really has a successor may be glued to
                # it. The previous guard compared the token index with the
                # flat field count, so the last token was tested against a
                # fake successor at offset -1 and could be dropped.
                if ind + 1 < len(tokens):
                    next_from, _, next_tag = tokens[ind + 1]
                    gap = sentence[word_to:next_from]
                    if (
                        next_tag == tag
                        and 0 <= next_from - word_to <= max_gap
                        and not any(char.isalpha() or char in ',()' for char in gap)
                    ):
                        pending += sentence[word_from:next_from]
                        continue
                word = pending + sentence[word_from:word_to]
                train_dict.setdefault(word, []).append(tag)
                pending = ''

    lemmatize_train_dict = {}
    for word, word_tags in train_dict.items():
        lemma_key = lemmatize_sentence(word)
        # Different surface forms can share a lemma form; merge their tags
        # instead of letting the last one overwrite the others.
        merged = set(lemmatize_train_dict.get(lemma_key, ()))
        merged.update(word_tags)
        lemmatize_train_dict[lemma_key] = list(merged)
    return lemmatize_train_dict, train_dict

In [9]:
# Build both dictionaries from the training files; lemmatize_train_dict is
# the one extended by the later cells and passed to run_task.
lemmatize_train_dict, train_dict = load_train_dict()

In [187]:
# Tag the test file with the dictionary as it stands at this point; this
# rewrites TEST_OUT_FILENAME.
run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




In [10]:
# Raw string: in a normal literal "\." is an invalid escape sequence that
# only works by accident and triggers a warning on current Python versions.
pattern = re.compile(r".*\.objects")
# Paths of the files in DEVSET_DIR whose name matches the pattern.
OBJECT_FILENAMES = [
    os.path.join(DEVSET_DIR, filename)
    for filename in os.listdir(DEVSET_DIR)
    if pattern.match(filename)
]

In [11]:
def parse_object_files():
    """Collect lemmatized phrase -> tags entries from OBJECT_FILENAMES.

    Every line is split on single spaces. The second field is the object
    type; only ``LocOrg``, ``Org`` (both mapped to ``ORG``) and ``Person``
    (mapped to ``PERSON``) are kept. The phrase is everything after the
    first standalone ``#`` field at index 2 or later, lemmatized with
    ``lemmatize_sentence``.

    Returns:
        dict mapping the lemmatized phrase to the list of mapped tags, one
        entry per matching line (repeats are kept).
    """
    tag_to_real_tag = {'LocOrg': 'ORG', 'Org': 'ORG', 'Person': 'PERSON'}
    lemmatize_dict = {}
    for filename in OBJECT_FILENAMES:
        with open(filename, 'r') as object_file:
            for line in object_file:
                tokens = line.split(' ')
                # Blank or truncated lines have no type field.
                if len(tokens) < 2:
                    continue
                # Filter on the type before lemmatizing: the Mystem call is
                # the expensive part and its result would be thrown away.
                real_tag = tag_to_real_tag.get(tokens[1])
                if real_tag is None:
                    continue
                try:
                    marker_ind = tokens.index('#', 2)
                except ValueError:
                    continue
                text = ' '.join(tokens[marker_ind + 1:]).replace(os.linesep, '')
                word = lemmatize_sentence(text)
                lemmatize_dict.setdefault(word, []).append(real_tag)
    return lemmatize_dict

In [12]:
# Lemmatized phrase -> tags entries harvested from the files in
# OBJECT_FILENAMES.
object_files_dict = parse_object_files()

In [13]:
# Fold the entries of object_files_dict into the main dictionary.
# Tags are de-duplicated for new keys as well: run_task writes one record
# per stored tag, so a repeated tag would be emitted twice. Building a fresh
# list also keeps the two dictionaries from sharing (and later co-mutating)
# the same list object.
for key, new_tags in object_files_dict.items():
    lemmatize_train_dict[key] = list(
        set(new_tags) | set(lemmatize_train_dict.get(key, ()))
    )

In [233]:
# Re-tag the test file with the dictionary as extended so far (overwrites
# TEST_OUT_FILENAME).
run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




In [14]:
# Raw string: in a normal literal "\." is an invalid escape sequence that
# only works by accident and triggers a warning on current Python versions.
pattern = re.compile(r".*\.ann")
# Paths of the files in NE3_DIR whose name matches the pattern.
NE3_FILENAMES = [
    os.path.join(NE3_DIR, filename)
    for filename in os.listdir(NE3_DIR)
    if pattern.match(filename)
]

In [15]:
def parse_ne3_files():
    """Collect lemmatized phrase -> tags entries from NE3_FILENAMES.

    Tabs are turned into spaces, the newline is removed and the line is
    split on single spaces. The second field is the entity type; only
    ``ORG`` and ``PER`` (mapped to ``PERSON``) are kept. The phrase starts
    at the first purely alphabetic field at index 2 or later and runs to the
    end of the line; it is lemmatized with ``lemmatize_sentence``.

    Returns:
        dict mapping the lemmatized phrase to the list of mapped tags, one
        entry per matching line (repeats are kept).
    """
    # Loop-invariant, so built once rather than once per file.
    tag_to_real_tag = {'ORG': 'ORG', 'PER': 'PERSON'}
    lemmatize_dict = {}
    for filename in NE3_FILENAMES:
        with open(filename, 'r') as ne3_file:
            for line in ne3_file:
                tokens = line.replace('\t', ' ').replace('\n', '').split(' ')
                # Blank or truncated lines have no type field.
                if len(tokens) < 2:
                    continue
                # Filter on the type before the expensive Mystem call.
                real_tag = tag_to_real_tag.get(tokens[1])
                if real_tag is None:
                    continue
                # NOTE(review): a phrase whose first word is not purely
                # alphabetic (quoted names, names starting with a digit)
                # loses that word, or is skipped entirely if no field is
                # alphabetic -- confirm this is intended.
                for token_ind in range(2, len(tokens)):
                    if tokens[token_ind].isalpha():
                        text = ' '.join(tokens[token_ind:]).replace(os.linesep, '')
                        word = lemmatize_sentence(text)
                        lemmatize_dict.setdefault(word, []).append(real_tag)
                        break
    return lemmatize_dict

In [16]:
# Lemmatized phrase -> tags entries harvested from the files in
# NE3_FILENAMES.
ne3_files_dict = parse_ne3_files()

In [17]:
# Fold the entries of ne3_files_dict into the main dictionary.
# Tags are de-duplicated for new keys as well: run_task writes one record
# per stored tag, so a repeated tag would be emitted twice. Building a fresh
# list also keeps the two dictionaries from sharing (and later co-mutating)
# the same list object.
for key, new_tags in ne3_files_dict.items():
    lemmatize_train_dict[key] = list(
        set(new_tags) | set(lemmatize_train_dict.get(key, ()))
    )

In [18]:
# Re-tag the test file with the dictionary as extended so far (overwrites
# TEST_OUT_FILENAME).
run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




In [19]:
# For every key containing '(' also index, under the same tags, the text
# before the first '(' and the text between that '(' and the next ')' (or
# the next '(' if it comes first).
# Fragments are stripped and empty ones skipped: the phrases run_task looks
# up start and end on a letter, so a padded or empty key could never match.
keys = list(lemmatize_train_dict.keys())
for key in keys:
    if '(' not in key:
        continue
    pieces = key.split('(')
    outside = pieces[0].strip()
    inside = pieces[1].split(')')[0].strip()
    key_tags = list(lemmatize_train_dict[key])
    for token in (outside, inside):
        if not token:
            continue
        lemmatize_train_dict.setdefault(token, []).extend(key_tags)

In [20]:
# Collapse repeated tags under every key. Only values are replaced, so the
# set of keys is unchanged.
lemmatize_train_dict.update({
    phrase: list(set(phrase_tags))
    for phrase, phrase_tags in lemmatize_train_dict.items()
})

In [325]:
# Re-tag the test file with the dictionary as extended so far (overwrites
# TEST_OUT_FILENAME).
run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




In [21]:
# For every key containing a double quote, also index each quote-delimited
# fragment under the same tags.
# Fragments are stripped and empty ones skipped. The previous test compared
# against os.linesep only, so '' and space-padded fragments were inserted as
# keys that no lookup in run_task can ever hit.
keys = list(lemmatize_train_dict.keys())
for key in keys:
    if '"' not in key:
        continue
    key_tags = list(lemmatize_train_dict[key])
    for token in key.split('"'):
        token = token.strip()
        if not token:
            continue
        lemmatize_train_dict.setdefault(token, []).extend(key_tags)

# Collapse the repeated tags introduced above.
for key in lemmatize_train_dict.keys():
    lemmatize_train_dict[key] = list(set(lemmatize_train_dict[key]))

In [330]:
# Re-tag the test file with the dictionary as extended so far (overwrites
# TEST_OUT_FILENAME).
run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




In [23]:
# For every key containing a single or double quote, also index each
# quote-delimited fragment (re-lemmatized) under the same tags.
keys = list(lemmatize_train_dict.keys())
for key in keys:
    # Each quote character needs its own membership test. The previous form
    # `'\'' or '"' in key` was always true because a non-empty string
    # literal is truthy, so every key in the dictionary was re-lemmatized
    # and re-inserted.
    if '\'' in key or '"' in key:
        key_tags = list(lemmatize_train_dict[key])
        for token in key.replace('\'', '"').split('"'):
            token = lemmatize_sentence(token).replace(os.linesep, '')
            if token == '':
                continue
            lemmatize_train_dict.setdefault(token, []).extend(key_tags)

# Collapse the repeated tags introduced above.
for key in lemmatize_train_dict.keys():
    lemmatize_train_dict[key] = list(set(lemmatize_train_dict[key]))

In [345]:
# Re-tag the test file with the dictionary as extended so far (overwrites
# TEST_OUT_FILENAME).
run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




In [24]:
# For every key containing '<' or '>', also index each bracket-delimited
# fragment (re-lemmatized, empty results skipped) under the same tags.
keys = list(lemmatize_train_dict.keys())
for key in keys:
    if not ('<' in key or '>' in key):
        continue
    for fragment in key.replace('<', '>').split('>'):
        token = lemmatize_sentence(fragment).replace(os.linesep, '')
        if token == '':
            continue
        lemmatize_train_dict.setdefault(token, []).extend(lemmatize_train_dict[key])

# Collapse the repeated tags introduced above.
for key, key_tags in lemmatize_train_dict.items():
    lemmatize_train_dict[key] = list(set(key_tags))

In [41]:
# Tag the test file with the current dictionary and keep the per-line
# (offset, length, tag) spans returned by run_task.
output = run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [45]:
def substitute_tags(output):
    """Write TEST_IN_FILENAME to TEST_TST_FILENAME with tagged spans replaced by tags.

    For each input line the spans stored under its index are applied in the
    order given: the text up to the span is copied, the tag is written in
    place of the span, and copying resumes after the span. When several
    entries share one span (one per tag), the tags are written back to back.

    Args:
        output: mapping from line index to an iterable of
            ``(offset, length, tag)`` tuples, as returned by ``run_task``.
            Missing indices mean "no spans"; the mapping is not modified.
    """
    with open(TEST_IN_FILENAME, 'r') as in_file, open(TEST_TST_FILENAME, 'w') as out_file:
        for line_ind, line in tqdm(enumerate(in_file)):
            prev = 0
            # .get() instead of indexing: indexing a defaultdict would insert
            # an empty list for every untagged line, and a plain dict would
            # raise KeyError.
            for wf, wl, tag in output.get(line_ind, ()):
                out_file.write(line[prev:wf])
                out_file.write('{}'.format(tag))
                prev = wf + wl
            out_file.write(line[prev:])
            # Kept from the original layout: one extra newline after each
            # copied line. '\n' rather than os.linesep because the file is
            # in text mode and already translates line endings.
            out_file.write('\n')

In [46]:
# Re-run the tagger and immediately render its spans into TEST_TST_FILENAME.
substitute_tags(run_task(lemmatize_train_dict))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

ValueError: Single '}' encountered in format string