## HSE natural language processing
### HW 01

In [1]:
import os
import string
import re

In [2]:
from pymystem3 import Mystem
from operator import itemgetter
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict

In [3]:
RESOURCES_PATH = './resources'
TEST_IN_FILENAME = os.path.join(RESOURCES_PATH, 'test.in')
TEST_OUT_FILENAME = os.path.join(RESOURCES_PATH, 'test.out')
TEST_TST_FILENAME = os.path.join(RESOURCES_PATH, 'test.tst')
TRAIN_SENTENCES_FILENAME = os.path.join(RESOURCES_PATH, 'train_sentences.txt')
TRAIN_TAGS_FILE = os.path.join(RESOURCES_PATH, 'train_nes.txt')
DEVSET_DIR = os.path.join(RESOURCES_PATH, 'factRuEval-2016', 'devset')
NE3_DIR = os.path.join(RESOURCES_PATH, 'ne3', 'Collection3')

In [4]:
m = Mystem()

In [5]:
punct_set = set(string.punctuation)
punct_set.add(os.linesep)
punct_set.add('...')

In [6]:
def lemmatize_sentence(sentence):
    try:
        lemmas = m.lemmatize(sentence)
        process_lemma = lambda lemma: ' ' if lemma == ' ' else lemma.replace(' ', '') if lemma.replace(' ', '') not in punct_set else ' '
        return ''.join([process_lemma(lemma) for lemma in lemmas if lemma not in punct_set]).replace(os.linesep, '')
    except:
        return sentence

In [7]:
def run_task(lemmatize_dict):
    output = defaultdict(lambda: [])
    
    with open(TEST_IN_FILENAME, 'r') as in_file:
        with open(TEST_OUT_FILENAME, 'w') as out_file:            
            for line_ind, line in tqdm(enumerate(in_file.readlines())):
                n = len(line)
                ind = 0
                tokens = []
                while ind < n:
                    while ind < n and not line[ind].isalpha():
                        ind += 1
                    if ind >= n:
                        break
                    next_ind = ind
                    while next_ind < n and line[next_ind].isalpha():
                        next_ind += 1
                    tokens.append((ind, next_ind))
                    ind = next_ind
                token_ind = 0
                while token_ind < len(tokens):
                    marked = False
                    next_token_ind = token_ind
                    wf, wt = tokens[token_ind]
                    last_good_next_token = -1
                    while next_token_ind < min(token_ind + 8, len(tokens)):
                        current_word = lemmatize_sentence(line[wf:tokens[next_token_ind][1]])
                        if current_word in lemmatize_dict:
                            marked = True
                            last_good_next_token = next_token_ind
                        next_token_ind += 1
                    if not marked:
                        word = line[tokens[token_ind][0]:tokens[token_ind][1]]
                        try:
                            grammemas = set(m.analyze(word)[0]['analysis'][0]['gr'].split(','))
                        except:
                            grammemas = set()
                        if 'фам' in grammemas or 'имя' in grammemas:
                            output[line_ind].append((tokens[token_ind][0], tokens[token_ind][1] - tokens[token_ind][0], 'PERSON'))
                            out_file.write('{} {} PERSON '.format(tokens[token_ind][0], tokens[token_ind][1] - tokens[token_ind][0]))
                        elif tokens[token_ind][0] >= 1 and line[tokens[token_ind][0] - 1] == '«' and line[tokens[token_ind][1]] == '»':
                            output[line_ind].append((tokens[token_ind][0], tokens[token_ind][1] - tokens[token_ind][0], 'ORG'))
                            out_file.write('{} {} ORG '.format(tokens[token_ind][0], tokens[token_ind][1] - tokens[token_ind][0]))
#                         elif tokens[token_ind][0] >= 1 and line[tokens[token_ind][0] - 1] == '„' and line[tokens[token_ind][1]] == '“':
#                             output[line_ind].append((tokens[token_ind][0], tokens[token_ind][1] - tokens[token_ind][0], 'ORG'))
#                             out_file.write('{} {} ORG '.format(tokens[token_ind][0], tokens[token_ind][1] - tokens[token_ind][0]))
#                         elif word == word.capitalize() and len(word) >= 2:
#                             output[line_ind].append((tokens[token_ind][0], tokens[token_ind][1] - tokens[token_ind][0], 'ORG'))
#                             out_file.write('{} {} ORG '.format(tokens[token_ind][0], tokens[token_ind][1] - tokens[token_ind][0]))
                        token_ind += 1
                    else:
                        current_word = lemmatize_sentence(line[wf:tokens[last_good_next_token][1]])
                        all_tags = lemmatize_dict[current_word]
                        for ind in range(token_ind, last_good_next_token + 1):
                            for tag in all_tags:
                                output[line_ind].append((tokens[ind][0], tokens[ind][1] - tokens[ind][0], tag))
                                out_file.write('{} {} {} '.format(tokens[ind][0], tokens[ind][1] - tokens[ind][0], tag))
                        token_ind = last_good_next_token + 1                            
                
                out_file.write('EOL')
                out_file.write(os.linesep)
                
    return output

In [8]:
def load_train_dict():
    train_dict = {}
    with open(TRAIN_SENTENCES_FILENAME, 'r') as train_sentences_file:
        with open(TRAIN_TAGS_FILE, 'r') as train_tags_file:
            for tag_line, sentence in zip(train_tags_file.readlines(), train_sentences_file.readlines()):
                tags = tag_line.split(' ')[:-1]
                last_token = ''
                tokens = [(int(tags[ind]), int(tags[ind]) + int(tags[ind + 1]), tags[ind + 2]) for ind in range(0, len(tags), 3)]
                tokens.sort(key=itemgetter(2, 0))
                for ind, token in enumerate(tokens):
                    word_from, word_to, tag = token
                    next_from = -1 if ind == len(tokens) - 1 else tokens[ind + 1][0]
                    next_tag = '' if ind == len(tokens) - 1 else tokens[ind + 1][2]
                    word = sentence[word_from: word_to]
                    if ind + 3 < len(tags) and next_from - word_to <= 5:
                        all_not_alpha = True
                        for letter in sentence[word_to:next_from]:
                            all_not_alpha = all_not_alpha and not letter.isalpha() and letter != ',' and letter != '(' and letter != ')'
                        if all_not_alpha:
                            last_token += sentence[word_from:next_from]
                            continue
                    word = last_token + word
                    if word not in train_dict:
                        train_dict[word] = []
                    train_dict[word].append(tag)
                    last_token = ''
    
    lemmatize_train_dict = {}
    for key in train_dict.keys():
        lemmatize_train_dict[lemmatize_sentence(key)] = list(set(train_dict[key]))
    return lemmatize_train_dict, train_dict

In [9]:
lemmatize_train_dict, train_dict = load_train_dict()

In [11]:
len(lemmatize_train_dict)

1241

In [187]:
run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




In [13]:
pattern = re.compile(".*\.objects")
OBJECT_FILENAMES = [os.path.join(DEVSET_DIR, filename) for filename in os.listdir(DEVSET_DIR) if pattern.match(filename)]

In [14]:
def parse_object_files():
    tag_to_real_tag = {#'LocOrg' : 'ORG', 
        'Org' : 'ORG', 'Person' : 'PERSON'}
    lemmatize_dict = {}
    for filename in OBJECT_FILENAMES:
        with open(filename, 'r') as object_file:
            for line in object_file.readlines():
                tokens = line.split(' ')
                tag = tokens[1]
                word = ''
                for token_ind in range(2, len(tokens)):
                    if tokens[token_ind] == '#':
                        word = lemmatize_sentence(' '.join(tokens[token_ind + 1:]).replace(os.linesep, ''))
                        if tag not in tag_to_real_tag:
                            continue
                        tag = tag_to_real_tag[tag]
                        if word not in lemmatize_dict:
                            lemmatize_dict[word] = []
                        lemmatize_dict[word].append(tag)
                        break
    return lemmatize_dict

In [15]:
object_files_dict = parse_object_files()

In [16]:
for key in object_files_dict.keys():
    if key not in lemmatize_train_dict:
        lemmatize_train_dict[key] = object_files_dict[key]
    else:
        lemmatize_train_dict[key] = list(set(object_files_dict[key] + lemmatize_train_dict[key]))

In [17]:
len(lemmatize_train_dict)

1864

In [54]:
run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




defaultdict(<function __main__.run_task.<locals>.<lambda>()>,
            {1: [(15, 11, 'ORG')],
             5: [(13, 8, 'ORG'), (22, 9, 'ORG'), (103, 11, 'ORG')],
             6: [(10, 3, 'ORG'), (14, 9, 'PERSON'), (24, 9, 'PERSON')],
             7: [(96, 7, 'ORG'), (104, 5, 'ORG')],
             10: [(28, 7, 'ORG')],
             11: [(47, 9, 'PERSON'), (57, 5, 'PERSON')],
             16: [(11, 3, 'ORG'), (15, 6, 'PERSON')],
             18: [(73, 4, 'ORG')],
             19: [(41, 8, 'PERSON'), (50, 7, 'PERSON')],
             20: [(16, 2, 'ORG'),
              (81, 8, 'ORG'),
              (122, 8, 'ORG'),
              (150, 8, 'ORG'),
              (164, 11, 'ORG')],
             21: [(0, 6, 'ORG')],
             24: [(22, 5, 'ORG'),
              (28, 2, 'ORG'),
              (77, 4, 'PERSON'),
              (82, 7, 'PERSON'),
              (103, 6, 'PERSON'),
              (110, 8, 'PERSON'),
              (141, 7, 'ORG'),
              (161, 8, 'PERSON')],
             28: 

In [18]:
pattern = re.compile(".*\.ann")
NE3_FILENAMES = [os.path.join(NE3_DIR, filename) for filename in os.listdir(NE3_DIR) if pattern.match(filename)]

In [19]:
def parse_ne3_files():
    lemmatize_dict = {}
    for filename in NE3_FILENAMES:
        with open(filename, 'r') as ne3_file:
            tag_to_real_tag = {'ORG' : 'ORG', 'PER' : 'PERSON'}
            for line in ne3_file.readlines():
                line = line.replace('\t', ' ').replace('\n', '')
                tokens = line.split(' ')
                tag = tokens[1]
                word = ''
                for token_ind in range(2, len(tokens)):
                    if tokens[token_ind].isalpha():
                        word = lemmatize_sentence(' '.join(tokens[token_ind:]).replace(os.linesep, ''))
                        if tag not in tag_to_real_tag:
                            break
                        tag = tag_to_real_tag[tag]
                        if word not in lemmatize_dict:
                            lemmatize_dict[word] = []
                        lemmatize_dict[word].append(tag)
                        break
    return lemmatize_dict

In [20]:
ne3_files_dict = parse_ne3_files()

In [21]:
for key in ne3_files_dict.keys():
    if key not in lemmatize_train_dict:
        lemmatize_train_dict[key] = ne3_files_dict[key]
    else:
        lemmatize_train_dict[key] = list(set(ne3_files_dict[key] + lemmatize_train_dict[key]))

In [22]:
len(lemmatize_train_dict)

7940

In [18]:
run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




In [23]:
keys = list(lemmatize_train_dict.keys())
for key in keys:
    if '(' in key:
        tokens = key.split('(')
        tokens[1] = tokens[1].split(')')[0]
        for token in [tokens[0], tokens[1]]:
            if token not in lemmatize_train_dict:
                lemmatize_train_dict[token] = []
            lemmatize_train_dict[token] += lemmatize_train_dict[key]

In [24]:
for key in lemmatize_train_dict.keys():
    lemmatize_train_dict[key] = list(set(lemmatize_train_dict[key]))

In [325]:
run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




In [25]:
keys = list(lemmatize_train_dict.keys())
for key in keys:
    if '"' in key:
        for token in key.split('"'):
            if token == os.linesep:
                continue
            if token not in lemmatize_train_dict:
                lemmatize_train_dict[token] = []
            lemmatize_train_dict[token] += lemmatize_train_dict[key]
            
for key in lemmatize_train_dict.keys():
    lemmatize_train_dict[key] = list(set(lemmatize_train_dict[key]))

In [21]:
run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




defaultdict(<function __main__.run_task.<locals>.<lambda>()>,
            {0: [(102, 12, 'ORG')],
             2: [(11, 6, 'ORG'),
              (19, 4, 'ORG'),
              (34, 6, 'PERSON'),
              (41, 10, 'PERSON'),
              (52, 7, 'PERSON'),
              (94, 6, 'ORG')],
             4: [(50, 6, 'PERSON'), (58, 3, 'ORG')],
             6: [(7, 6, 'ORG')],
             7: [(41, 7, 'ORG'),
              (71, 9, 'PERSON'),
              (82, 3, 'ORG'),
              (88, 2, 'ORG')],
             10: [(25, 7, 'ORG')],
             11: [(30, 6, 'ORG')],
             12: [(3, 9, 'ORG'),
              (30, 6, 'ORG'),
              (55, 9, 'ORG'),
              (87, 7, 'ORG')],
             13: [(2, 5, 'PERSON'), (32, 6, 'ORG')],
             15: [(95, 3, 'ORG')],
             16: [(10, 5, 'ORG'), (16, 2, 'ORG')],
             17: [(17, 3, 'ORG'),
              (28, 3, 'ORG'),
              (39, 7, 'ORG'),
              (84, 6, 'ORG')],
             21: [(0, 7, 'ORG'), (43,

In [26]:
keys = list(lemmatize_train_dict.keys())
for key in keys:
    if '\'' or '"' in key:
        for token in key.replace('\'', '"').split('"'):
            token = lemmatize_sentence(token).replace(os.linesep, '')
            if token == '':
                continue
            if token not in lemmatize_train_dict:
                lemmatize_train_dict[token] = []
            lemmatize_train_dict[token] += lemmatize_train_dict[key]
            
for key in lemmatize_train_dict.keys():
    lemmatize_train_dict[key] = list(set(lemmatize_train_dict[key]))

In [345]:
run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




In [27]:
keys = list(lemmatize_train_dict.keys())
for key in keys:
    if '<' in key or '>' in key:
        for token in key.replace('<', '>').split('>'):
            token = lemmatize_sentence(token).replace(os.linesep, '')
            if token == '':
                continue
            if token not in lemmatize_train_dict:
                lemmatize_train_dict[token] = []
            lemmatize_train_dict[token] += lemmatize_train_dict[key]
            
for key in lemmatize_train_dict.keys():
    lemmatize_train_dict[key] = list(set(lemmatize_train_dict[key]))

In [41]:
output = run_task(lemmatize_train_dict)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [28]:
len(lemmatize_train_dict)

8522

In [33]:
pattern = re.compile(".*{.*}")

In [34]:
def parse_train_enhanced():
    with open(os.path.join(RESOURCES_PATH, 'train_sentences_enhanced.txt'), 'r') as train_file:
        for line in train_file.readlines():
            tokens = line.split()
            for token in tokens:
                if pattern.match(token):
                    token = token.replace('{', '}')
                    for cp in punct_set:
                        if cp == '}':
                            continue
                        token = token.replace(cp, '')
                    word, tag = token.split('}')[0], token.split('}')[1]
                    print(word, tag)
                    if word not in lemmatize_train_dict:
                        lemmatize_train_dict[word] = []
                    lemmatize_train_dict[word].append(tag)
    
    for key in lemmatize_train_dict.keys():
        lemmatize_train_dict[key] = list(set(lemmatize_train_dict[key]))

parse_train_enhanced()

СкотландЯрд ORG
Руперта PERSON
Мердока PERSON
Ванга PERSON
Сталина PERSON
Вертинского PERSON
совет ORG
координаторов ORG
общественного ORG
движения ORG
Сергея PERSON
Кургиняна PERSON
«Суть ORG
времени ORG
исполкома ORG
«Единой ORG
России ORG
«Холдинг ORG
ГазпромМедиа ORG
ЗАО ORG
Эхо ORG
Москвы ORG
радиостанции ORG
ФИФА ORG
Яндекса ORG
Mailru ORG
Google ORG
Rambler ORG
Спутник ORG
Ростелекому ORG
Форсайт PERSON
вещательной ORG
корпорации ORG
BBC ORG
Комитета ORG
по ORG
иностранным ORG
делам ORG
АльКайды ORG
Усамы PERSON
бин PERSON
Ладена PERSON
Марк PERSON
Викиновостям ORG
Vestas ORG
Социалистической ORG
рабочей ORG
партии ORG
Британии ORG
Конгресса ORG
профсоюзов ORG
профсоюза ORG
Юнайт ORG
Дмитрий PERSON
Ливанов PERSON
Владимира PERSON
Путина PERSON
Роснефть ORG
ТНКBP ORG
Лужков PERSON
Спаситель PERSON
Корпорация ORG
Google ORG
холдинга ORG
«Мегафоном ORG
Garsdale ORG
«Вымпелкомом ORG
МТС ORG
Garsdale ORG
Уго PERSON
Чавесу PERSON
компания ORG
Good ORG
Morning ORG
To ORG
You ORG
Produc

Telegram ORG
Верховного ORG
народного ORG
собрания ORG
КНДР ORG
Хван PERSON
Чжан PERSON
Еп PERSON
Blekko ORG
US ORG
Venture ORG
Partners ORG
CMEA ORG
Capital ORG
PivotNorth ORG
Capital ORG
компании ORG
завод ORG
Рязанову PERSON
comScore ORG
клуба ORG
«Перестройка ORG
«Социалдемократическая ORG
ассоциация ORG
СССР ORG
HP ORG
3Com ORG
Столтенбергу PERSON
руководства ORG
«Роснефти ORG
«Газпром ORG
нефтяной ORG
компании ORG
Лентару ORG
Bloomberg ORG
Шкрели PERSON
Лентару ORG
Си PERSON
Цзиньпина PERSON
Константин PERSON
Сакаев PERSON
Александр PERSON
Галкин PERSON
Владимир PERSON
Белов PERSON
Эрнесто PERSON
Инаркиев PERSON
Максим PERSON
Туров PERSON
Игорь PERSON
Курносов PERSON
Сергей PERSON
Искусных PERSON
Алексей PERSON
Илюшин PERSON
Евгений PERSON
Романов PERSON
Надежда PERSON
Косинцева PERSON
Татьяна PERSON
Косинцева PERSON
Александр PERSON
Брод PERSON
РПЦ ORG
агентство ORG
Reuters ORG
Прянишников PERSON
Фосс PERSON
Уилсоном PERSON
Nokia ORG
Прессслужба ORG
Кадырова ORG
Туркменбаши PERS

«Культура ORG
Елене PERSON
Ямпольской PERSON
Интернет ORG
БГ PERSON
организации ORG
«Билайн ORG
ОАО ORG
«ВымпелКом ORG
проекту ORG
0facebookcom ORG
социальной ORG
сети ORG
Facebook ORG
Сталину PERSON
Вертинский PERSON
ЦК ORG
Трудовой ORG
партии ORG
Кореи ORG
делегацию ORG
Южной ORG
Кореи ORG
Андрею PERSON
Санникову PERSON
Фёдора PERSON
Ивановича PERSON
Шаляпина PERSON
Соловцовском ORG
театре ORG
Театра ORG
Наций ORG
Дзержинского ORG
театра ORG
драмы ORG
Российским ORG
советом ORG
по ORG
международным ORG
делам ORG
РСМД ORG
Российской ORG
Академией ORG
наук ORG
МИД ORG
России ORG
Валерий PERSON
Гелетей PERSON
Джон PERSON
Бенер PERSON
ФИФА ORG
Зепп PERSON
Блаттер PERSON
Юрия PERSON
Лужкова PERSON
Мосгордумы ORG
Бориса PERSON
Громова PERSON
Сулима PERSON
Ямадаева PERSON
Генеральная ORG
прокуратура ORG
ОАЭ ORG
«GztRu ORG
Сулима PERSON
Ямадаева PERSON
Веселина PERSON
Топалова PERSON
News ORG
of ORG
the ORG
World ORG
Невилл PERSON
Терлбек PERSON
отдела ORG
новостей ORG
Джеймс PERSON
Уитерап 

Рабле PERSON
Клинсман PERSON
ВГТРК ORG
«Амедиа ORG
«Мосфильм ORG
«Каро ORG
Премьер ORG
«ТНТ ORG
интернеткомпании ORG
MailRu ORG
Group ORG
соцсетей ORG
«Одноклассники ORG
«Мой ORG
Мир ORG
RuTube ORG
Ivi ORG
Zoomby ORG
Megogo ORG
Театра ORG
Наций ORG
ГАБТа ORG
Анатолий PERSON
Иксанов PERSON
Чилингарова PERSON
ИМЭМО ORG
РАН ORG
Александр PERSON
Дынкин PERSON
Стратегический ORG
совет ORG
«Яндекса ORG
головной ORG
компании ORG
Yandex ORG
ньюйоркской ORG
бирже ORG
NASDAQ ORG
оператора ORG
телерекламы ORG
«Видео ORG
Интернешнл ORG
банка ORG
«Россия ORG
материнской ORG
Tele2 ORG
AB ORG
шведской ORG
инвестиционной ORG
группы ORG
Investment ORG
AB ORG
Kinnevik ORG
РБК ORG
Сенатом ORG
тюремные ORG
власти ORG
Алла PERSON
Бут PERSON
Совете ORG
Федерации ORG
ВР ORG
ТНКBP ORG
ТНКBP ORG
BP ORG
ВТБ ORG
российская ORG
« ORG
Tele2 ORG
NYSE ORG
Euronext ORG
«MTV ORG
информационноиздательского ORG
холдинга ORG
Николай PERSON
Картозия PERSON
ГАБТа ORG
Владимир PERSON
Урин PERSON
Большого ORG
«Взгляд ORG
Дми

In [35]:
len(lemmatize_train_dict)

10788

In [103]:
def substitute_tags(output):
    with open(TEST_IN_FILENAME, 'r') as in_file:
        with open(TEST_TST_FILENAME, 'w') as out_file:            
            for line_ind, line in tqdm(enumerate(in_file.readlines())):
                prev = 0
                for wf, wl, tag in output[line_ind]:
                    out_file.write(line[prev:wf + wl])
                    prev = wf + wl
                    out_file.write('{' + tag + '}')
                out_file.write(line[prev:])

In [104]:
substitute_tags(run_task(lemmatize_train_dict))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


