In [None]:
import re
import math
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer

# Shared word tokenizer: every token is a maximal run of `\w` characters,
# so punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r'\w+')

# Hunalign

### Text preparation

English text:

In [None]:
# Read the whole English source text ('utf-8-sig' strips a leading BOM if present).
with open('en.txt', 'r', encoding='utf-8-sig') as en_file:
    en_text = en_file.read()

In [None]:
# Lines for the hunalign input file: each sentence is preceded by a '<p>' entry.
en_ready = []
for sentence in sent_tokenize(en_text):
    en_ready.extend(['<p>', sentence])

In [None]:
# Write the English hunalign input, one list entry per line.
with open('en_hunalign.txt', 'w', encoding='utf-8-sig') as en_out:
    en_out.write('\n'.join(en_ready))

Russian text:

In [None]:
# Read the whole Russian source text ('utf-8-sig' strips a leading BOM if present).
with open('ru.txt', 'r', encoding='utf-8-sig') as ru_file:
    ru_text = ru_file.read()

In [None]:
# Lines for the hunalign input file: each sentence is preceded by a '<p>' entry.
# The Russian Punkt model is used for sentence splitting, matching the
# fast_align preparation cell further down (the default model is English).
ru_ready = []
for sentence in sent_tokenize(ru_text, language="russian"):
    # Surround punctuation with spaces so it becomes a separate token ...
    spaced = re.sub(r'([.,!?();])', r' \1 ', sentence)
    # ... then collapse the resulting runs of whitespace to a single space.
    # (Raw string: '\s' in a plain literal is an invalid escape sequence.)
    spaced = re.sub(r'\s{2,}', ' ', spaced)
    ru_ready.append('<p>')
    ru_ready.append(spaced)

In [None]:
# Write the Russian hunalign input, one list entry per line.
with open('ru_hunalign.txt', 'w', encoding='utf-8-sig') as ru_out:
    ru_out.write('\n'.join(ru_ready))

### Alignment

В папку с приложением hunalign.exe и нулевым словарем null.dic (так как для русского языка нет готового словаря) положить файлы en_hunalign и ru_hunalign, в которых находятся в готовом для элайнмента формате предложения на соответствующих языках, открыть в ней командную строку и ввести следующее (результат будет в файле hunalign.txt):

hunalign.exe null.dic en_hunalign.txt ru_hunalign.txt -text -utf -realign > hunalign.txt

### Hunalign table

In [None]:
# Load the raw hunalign output produced by the command above.
with open('hunalign.txt', 'r', encoding='utf-8-sig') as hunalign_file:
    aligned_text = hunalign_file.read()

In [None]:
# Split the hunalign output into rows and keep every second row starting at
# index 1 (the skipped rows are presumably the '<p>' marker rows — verify).
with_p = aligned_text.split('\n')
without_p = with_p[1::2]

# Each kept row is tab-separated; fields repeat in groups of three:
# English text, Russian text, score.
en_hunalign_df = []
ru_hunalign_df = []
score = []
for row in without_p:
    fields = row.split('\t')
    en_hunalign_df.extend(fields[0::3])
    ru_hunalign_df.extend(fields[1::3])
    score.extend(fields[2::3])

In [None]:
# Sentence-level alignment table: English, Russian and the hunalign score.
hunalign_columns = {'en': en_hunalign_df, 'ru': ru_hunalign_df, 'score': score}
df_hunalign = pd.DataFrame(hunalign_columns)

df_hunalign.to_excel('./hunalign.xlsx', index=False)

### Hunalign table with entities

In [None]:
# Character offsets in `en_text` of sentence-ending marks: a '.' that directly
# follows a space, or any '?' / '!'.
points = []

for count, letter in enumerate(en_text):
    # Guard count > 0: en_text[-1] would otherwise wrap around to the last
    # character of the text when looking "before" the first one.
    preceded_by_space = count > 0 and en_text[count - 1] == ' '
    if (letter == '.' and preceded_by_space) or letter in ('?', '!'):
        points.append(count)

# Keep `points` row-aligned with the hunalign table: insert an empty
# placeholder wherever the English side of a row is empty.
for row, value in enumerate(en_hunalign_df):
    if value == '':
        points.insert(row, '')

In [None]:
# Entity annotations without a header row, so columns are addressed by number.
# Later cells read column 1 as a space-separated string (label first, then
# integer offsets) and column 2 as text.
df_entities = pd.read_excel('entities.xlsx', index_col=None, header=None)

In [None]:
# Hunalign rows together with their end-of-sentence offsets. The 'entities'
# column is a placeholder: a single empty string broadcast to every row
# (''*len(points) is just '').
df_hunalign_entities = pd.DataFrame({
    'en': en_hunalign_df,
    'ru': ru_hunalign_df,
    'points': points,
    'entities': '',
})

In [None]:
# For every hunalign row, collect the entities that belong to that sentence.
entities = [[] for _ in range(len(points))]

# Column 1 of df_entities holds "<label> <start> ..." and column 2 the text.
for value, text in zip(df_entities[1], df_entities[2]):
    fields = value.split(' ')
    for row, p in enumerate(points):
        # The first sentence end located after the entity start owns the entity.
        # `row` is the position in `points`, which is also the row of the
        # hunalign table, so no DataFrame lookup is needed (and the cell no
        # longer depends on a 'points' column still being present).
        if p != '' and int(fields[1]) < int(p):
            entities[row].append(str(fields[0]) + '-' + text)
            break

In [None]:
# Final sentence-level table with the collected entities; replaces the
# intermediate frame that carried the 'points' column.
hunalign_entity_columns = {'en': en_hunalign_df, 'ru': ru_hunalign_df, 'entities': entities}
df_hunalign_entities = pd.DataFrame(hunalign_entity_columns)

df_hunalign_entities.to_excel('./hunalign_entities.xlsx', index=False)

# Fast_align

### Text preparation

In [None]:
# Lower-case the English text, split it into sentences and word tokens.
tokenized_en_text = [
    tokenizer.tokenize(sent) for sent in sent_tokenize(en_text.lower())
]

# One space-joined line of tokens per sentence.
clean_en_text = [' '.join(tokens) for tokens in tokenized_en_text]

In [None]:
# Lower-case the Russian text, split it into sentences (Russian Punkt model)
# and word tokens.
tokenized_ru_text = [
    tokenizer.tokenize(sent)
    for sent in sent_tokenize(ru_text.lower(), language="russian")
]

# One space-joined line of tokens per sentence.
clean_ru_text = [' '.join(tokens) for tokens in tokenized_ru_text]

In [None]:
# Write both tokenized corpora, one sentence per line.
with open('en_fastalign.txt', 'w', encoding='utf-8-sig') as en_out:
    en_out.write('\n'.join(clean_en_text))

with open('ru_fastalign.txt', 'w', encoding='utf-8-sig') as ru_out:
    ru_out.write('\n'.join(clean_ru_text))

### Alignment

Через консоль Ubuntu. Сначала приведем в нужный формат по предложениям:

paste en_fastalign.txt ru_fastalign.txt | sed "s/$(printf '\t')/ ||| /g" > source_targets.fastalign

Сначала мы рассматриваем английский язык как мишень (target), а русский язык как источник (source), поэтому используем reverse:

./fast_align -i source_targets.fastalign -d -o -v -r > fastalign_en_ru.txt

### Fast_align table

In [None]:
# Load the fast_align output and split it into one alignment string per line.
with open('fastalign_en_ru.txt', 'r', encoding='utf-8-sig') as align_file:
    aligned_words = align_file.read().split('\n')

In [None]:
# Per sentence: [[alignment string], [English tokens line, Russian tokens line]].
# The last element of `aligned_words` is skipped.
sootnosh = [
    [[aligned_words[n]], [clean_en_text[n], clean_ru_text[n]]]
    for n in range(len(aligned_words) - 1)
]

In [None]:
# Word-level alignment table. Each sentence contributes a marker row (k, k)
# holding its integer number, followed by one row per aligned word pair.
#
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is
# quadratic. The resulting frame has a plain 0..n-1 index, so positional
# lookups such as df['ru'][i] in later cells are unambiguous.
#
# For the ru_en direction swap the column order (and the order of the values
# inside each row).
fastalign_rows = []

for k, soot in enumerate(sootnosh):
    alignment = soot[0][0]
    en_tokens = soot[1][0].split(' ')
    ru_tokens = soot[1][1].split(' ')

    # Sentence marker row.
    fastalign_rows.append((k, k))

    if alignment == '':
        # No alignment for this sentence: list the English tokens with an
        # empty Russian side.
        fastalign_rows.extend((token, '') for token in en_tokens)
    else:
        # Each link has the form "<en index>-<ru index>".
        for link in alignment.split(' '):
            link_parts = link.split('-')
            fastalign_rows.append(
                (en_tokens[int(link_parts[0])], ru_tokens[int(link_parts[1])])
            )

df_fastalign_incomplete = pd.DataFrame(fastalign_rows, columns=['en', 'ru'])

### Fast_align complete table

In [None]:
# Prefix every hunalign English sentence with its running number ...
en_fastalign_df = [
    str(number) + ' ' + text for number, text in enumerate(en_hunalign_df)
]

# ... and split on single spaces, so element 0 of each list is that number.
sentences = [numbered.split(' ') for numbered in en_fastalign_df]

In [None]:
# Regroup the flat 'en' column into one list per sentence. A new group starts
# whenever the value equals the next expected integer sentence number.
incomplete_sentences = []
incomplete_one_sent = []

len_k = int(sentences[-1][0]) + 1  # NOTE(review): not read anywhere in this cell
k = 0
for token in df_fastalign_incomplete['en']:
    if token != k + 1:
        # Ordinary row: keep its string form.
        incomplete_one_sent.append(str(token))
        continue
    # Marker row for sentence k + 1: close the current group and open a new
    # one that starts with the marker itself.
    incomplete_sentences.append(incomplete_one_sent)
    k += 1
    incomplete_one_sent = [token]

incomplete_sentences.append(incomplete_one_sent)
# The very first marker went through the str() branch; store it as integer 0.
incomplete_sentences[0][0] = 0

In [None]:
# Build `incomplete`: for every sentence, a list of 'en-ru' strings that pair
# the English entry from `incomplete_sentences` with the value of the 'ru'
# column in the same row.
pair = []
incomplete = []

# k is the number of the next expected sentence marker; minus_len is the number
# of rows consumed by already finished sentences, so that i - minus_len is the
# position inside the current sentence group.
k = 1
minus_len = 0
for i, value in enumerate(df_fastalign_incomplete['ru']):
    if value == k:
        # Marker row of sentence k: close the previous sentence first.
        incomplete.append(pair)
        pair = []
        minus_len += len(incomplete[k-1])
        k += 1
        
    pair.append(str(incomplete_sentences[k-1][i-minus_len]) + '-' + str(value))

# Flush the last sentence and normalise the first marker pair.
incomplete.append(pair)
incomplete[0][0] = '0-0'

In [None]:
# Merge the full English sentences with the aligned pairs: every token of a
# sentence yields either its aligned 'en-ru' pair or 'token- ' when fast_align
# produced no pair for it.
complete = []

# enumerate() replaces sentences.index(one_sent), which rescanned the whole
# list for every sentence; element 0 of each sentence is its unique number, so
# the position is the same value.
for sent_idx, one_sent in enumerate(sentences):
    j = 0  # position of the next unused pair of this sentence

    for token in one_sent:
        word = token.lower()

        try:
            candidate = incomplete[sent_idx][j]
        except IndexError:
            # No pairs left (or no pair list) for this sentence.
            candidate = None

        # A pair is consumed when its English side equals the token, or when
        # the token is one of the clitics "'s" / "'d".
        if candidate is not None and (
            word == candidate.split('-')[0] or word == "'s" or word == "'d"
        ):
            complete.append(candidate)
            j += 1
        else:
            complete.append(word + '-' + ' ')

In [None]:
# Split the 'en-ru' pairs back into two columns. Sentence markers ('k-k' for
# the next expected k) are stored as integers, everything else as strings.
en_fastalign_complete = []
ru_fastalign_complete = []

k = 0
for pair in complete:
    halves = pair.split('-')
    left, right = halves[0], halves[1]
    marker = str(k)
    if left == marker and right == marker:
        en_fastalign_complete.append(int(left))
        ru_fastalign_complete.append(int(right))
        k += 1
    else:
        en_fastalign_complete.append(left)
        ru_fastalign_complete.append(right)

In [None]:
# Complete word-level table, English column first.
en_ru_columns = {'en': en_fastalign_complete, 'ru': ru_fastalign_complete}
df_fastalign_complete = pd.DataFrame(en_ru_columns)
df_fastalign_complete.to_excel('./fastalign_en_ru_complete.xlsx', index=False)

In [None]:
# For the ru_en direction: the same table with the Russian column first.
ru_en_columns = {'ru': ru_fastalign_complete, 'en': en_fastalign_complete}
df_fastalign_complete = pd.DataFrame(ru_en_columns)
df_fastalign_complete.to_excel('./fastalign_ru_en_complete.xlsx', index=False)

### Fast_align table with entities

In [None]:
# Build `all_tags`: one 'token-tag' string per processed English token. The tag
# part is ' ' for untagged tokens, otherwise one or more entity labels joined
# by '-'.
all_tags = []

# k is a running offset advanced by each token's width plus one
# (presumably the character position in the original English text — TODO confirm).
k = 0
full_word = ''  # NOTE(review): never read in this cell
for i, word in enumerate(df_fastalign_complete['en']):
    
    if type(word) != str and math.isnan(word):
        # Missing value (NaN): treat it as an empty token of width 2.
        word = ''
        len_token = 2
    elif word == 's' or word == 'd':
        # Bare 's' / 'd' tokens are counted with width 2
        # (presumably the "'s" / "'d" clitics — verify).
        len_token = 2
    else:
        len_token = len(str(word))
   
    try:
        k += len_token + 1
        # If this token is sentence-final punctuation and the next row is a 1-, 2-
        # or 3-digit number, subtract that number's width plus one in advance: the
        # next iteration will add exactly that amount for the number row.
        # The i + 1 lookup raises KeyError on the last row (handled below).
        if (word == '.' or word == '!' or word == '?') and (len(str(df_fastalign_complete['en'][i + 1])) == 1 and str(df_fastalign_complete['en'][i + 1]).isdigit()):
            k -= 2
        elif (word == '.' or word == '!' or word == '?') and (len(str(df_fastalign_complete['en'][i + 1])) == 2 and str(df_fastalign_complete['en'][i + 1]).isdigit()):
            k -= 3 
        elif (word == '.' or word == '!' or word == '?') and (len(str(df_fastalign_complete['en'][i + 1])) == 3 and str(df_fastalign_complete['en'][i + 1]).isdigit()):
            k -= 4

        # Start with an untagged placeholder for this token.
        all_tags.append(str(word) + '-' + ' ')
        # Each `start` is a space-separated string: label, then two integers that
        # are used here as the bounds of the entity span.
        for start in df_entities[1]:
            if k - 2 > int(start.split(' ')[1]) and k - 2 <= int(start.split(' ')[2])+1 and word != '.' and word != '!' and word != '?':
                fin = str(word) + '-' + start.split(' ')[0]
                
                if all_tags[len(all_tags) - 1].split('-')[0] == word and all_tags[len(all_tags) - 1].split('-')[1] == ' ':
                    # First matching entity: replace the placeholder.
                    all_tags[len(all_tags) - 1] = fin
                elif all_tags[len(all_tags) - 1].split('-')[0] == word and all_tags[len(all_tags) - 1].split('-')[1] != ' ':
                    # Further matching entity: append its label to the same entry.
                    all_tags[len(all_tags) - 1] = all_tags[len(all_tags) - 1] + '-' + start.split(' ')[0]
                else:
                    all_tags.append(fin)
                    
    except KeyError:
        # Raised by the i + 1 lookahead when a punctuation token is the last row;
        # the loop stops before that token is added to `all_tags`.
        break

In [None]:
# Split every 'token-tag[-tag...]' entry into the token and its tags:
# '' for an untagged token, otherwise the list of labels.
en_fastalign_entities = []
fastalign_entities = []
for entry in all_tags:
    parts = entry.split('-')
    en_fastalign_entities.append(parts[0])
    fastalign_entities.append('' if parts[1] == ' ' else parts[1:])

ru_fastalign_entities = list(df_fastalign_complete['ru'])
# Add a closing '.' row with no tag.
en_fastalign_entities.append('.')
fastalign_entities.append('')

In [None]:
# Word-level en_ru table with the entity tags of the English side.
en_ru_entity_columns = {
    'en': en_fastalign_entities,
    'ru': ru_fastalign_entities,
    'tags': fastalign_entities,
}
df_fastalign_entities = pd.DataFrame(en_ru_entity_columns)
df_fastalign_entities.to_excel('./fastalign_en_ru_entities.xlsx', index=False)

### Ru_en fast_align

Теперь проделываем то же самое, но рассматриваем русский язык как мишень (target), а английский как источник (source), используем forward:

./fast_align -i source_targets.fastalign -d -o -v > fastalign_ru_en.txt

In [None]:
# Load the forward (ru_en) fast_align output, one alignment string per line.
with open('fastalign_ru_en.txt', 'r', encoding='utf-8-sig') as align_file:
    aligned_words = align_file.read().split('\n')

In [None]:
# Per sentence, map every string tag to the list of English words carrying it.
# list_dict_tags[n] is the mapping for sentence n.
dict_tags = {}
list_dict_tags = []

k = 1  # number of the next expected sentence marker
for i, word in enumerate(df_fastalign_entities['en']):
    tag = df_fastalign_entities['tags'][i]

    if word == str(k) and df_fastalign_entities['ru'][i] == k:
        # Marker row of sentence k: the previous sentence is finished.
        list_dict_tags.append(dict_tags)
        dict_tags = {}
        k += 1

    # Check the type BEFORE using the tag as a dictionary key: testing a list
    # for membership in dict keys raises TypeError (unhashable type).
    elif isinstance(tag, str):
        dict_tags.setdefault(tag, []).append(word)

# Flush the mapping of the last sentence; no marker row follows it, so the
# loop above never appends it.
list_dict_tags.append(dict_tags)

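Перед следующей ячейкой заново выполнить ячейки раздела «Fast_align table» (построение `sootnosh` и `df_fastalign_incomplete`) для направления ru_en: с только что прочитанным `aligned_words` и поменяв местами столбцы, как указано в комментариях этих ячеек.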

In [None]:
# Transfer the entity tags to the ru_en word table: every row gets exactly one
# tag entry (possibly ''), so the three lists always stay the same length.
ru_fastalign_entities = []
en_fastalign_entities = []
fastalign_entities = []

k = 0  # number of the current sentence
# Iterate over both columns by position; this does not depend on the frame's
# index labels.
for word, ru_word in zip(df_fastalign_incomplete['en'], df_fastalign_incomplete['ru']):
    ru_fastalign_entities.append(ru_word)
    en_fastalign_entities.append(word)

    # A row holding the next sentence number in both columns starts a sentence.
    if word == k + 1 and ru_word == k + 1:
        k += 1

    # Take the first tag of the current sentence whose word list contains this
    # word. Stopping at the first match matters: appending one tag per matching
    # key would add several entries for a single row and shift all later tags.
    matched_tag = ''
    for key, value in list_dict_tags[k].items():
        if any(word == v for v in value):
            matched_tag = key
            break
    fastalign_entities.append(matched_tag)

In [None]:
# Word-level ru_en table with the transferred entity tags.
ru_en_entity_columns = {
    'ru': ru_fastalign_entities,
    'en': en_fastalign_entities,
    'tags': fastalign_entities,
}
df_fastalign_entities = pd.DataFrame(ru_en_entity_columns)
df_fastalign_entities.to_excel('./fastalign_ru_en_entities.xlsx', index=False)

# Models

### Preparation

In [None]:
# Flatten the ru_en table into 'ru-en-tag' strings, one per row.
dictionary = []

k = 0  # number of the next expected sentence marker
for i, line in enumerate(df_fastalign_entities['ru']):
    raw_tag = df_fastalign_entities['tags'][i]
    # A missing tag (NaN) becomes an empty string.
    tag = '' if str(raw_tag) == 'nan' else raw_tag
    en_value = df_fastalign_entities['en'][i]

    if line == k and en_value == k:
        # Sentence marker row.
        marker = str(k)
        dictionary.append(marker + '-' + marker + '-' + tag)
        k += 1
    else:
        dictionary.append(str(line) + '-' + str(en_value) + '-' + tag)

In [None]:
# Concatenate the Russian hunalign sentences, each followed by a space.
for_models = ''.join(sentence + ' ' for sentence in ru_hunalign_df)

# Unify the sentence terminators and drop the '~~~' and '. . .' sequences.
for needle, replacement in (('?', '.'), ('!', '.'), ('~~~', ''), ('. . .', '')):
    for_models = for_models.replace(needle, replacement)

# Split on '.', strip punctuation from every fragment and prefix it with its
# running number (only the first fragment gets a space after the number).
new_for_models = []
for k, fragment in enumerate(for_models.split('.')):
    cleaned = re.sub(r'[.«»,"\'?:!;—]', '', fragment)
    cleaned = re.sub(r'[-]', ' ', cleaned)
    cleaned = re.sub(r'[\n\xa0]', ' ', cleaned)
    cleaned = re.sub(r'  ', ' ', cleaned)
    prefix = str(k) + ' ' if k == 0 else str(k)
    new_for_models.append(prefix + cleaned)

In [None]:
# Glue the numbered fragments back into one string for the NER models.
for_models = ''.join(new_for_models)

In [None]:
# Walk over the words of the model input and attach the 'ru-en-tag' entry from
# `dictionary` whenever the next unused entry matches the word; unmatched words
# get empty English and tag parts.
df_fastalign_entities_complete = []

j = 0  # position of the next unused dictionary entry
for word in for_models.split(' '):
    lowered = word.lower()

    # Guard against running past the end of `dictionary`: once every entry has
    # been consumed, the remaining words are simply unmatched.
    if j < len(dictionary):
        parts = dictionary[j].split('-')
        if lowered == parts[0]:
            df_fastalign_entities_complete.append(parts[0] + '-' + parts[1] + '-' + parts[2])
            j += 1
            continue

    df_fastalign_entities_complete.append(lowered + '-' + '' + '-' + '')

### Natasha Slovnet

In [None]:
from navec import Navec
from slovnet import NER

# Load the Navec embeddings and the Slovnet NER model from local archives.
navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
ner = NER.load('slovnet_ner_news_v1.tar')
# Attach the embeddings to the NER model before running it.
ner.navec(navec)

# Run NER over the whole prepared text; the result's `.spans` are read below.
natasha_slovnet = ner(for_models)

In [None]:
# Attach the Slovnet label to every 'ru-en-tag' entry, giving 'ru-en-tag-label'.
df_natasha = []

# Running offset: the length of each Russian word plus one separator.
length = 0
for i in range(len(df_fastalign_entities_complete)):
    length += len(df_fastalign_entities_complete[i].split('-')[0]) + 1

    # Initialise the label before the search so it is defined (and not carried
    # over from the previous word) even when there are no spans at all.
    tag = ''
    for span in natasha_slovnet.spans:
        if span.start <= length <= span.stop:
            tag = span.type
            break

    try:
        # The label found at this offset is attached to the NEXT entry
        # (presumably because the offset already points past the current
        # word — TODO confirm).
        new_tag = df_fastalign_entities_complete[i + 1] + '-' + tag
        df_natasha.append(new_tag)
    except IndexError:
        # No entry follows the last word.
        break

In [None]:
# Unpack the 'ru-en-tag-label' entries into four parallel columns.
ru_full_tags = []
en_full_tags = []
en_tag_full_tags = []
natasha = []

for entry in df_natasha:
    fields = entry.split('-')
    ru_full_tags.append(fields[0])
    en_full_tags.append(fields[1])
    # Keep only characters 2..4 of the English tag field.
    en_tag_full_tags.append(fields[2][2:5])
    natasha.append(fields[3])

### Stanza

In [None]:
import stanza

# Russian Stanza pipeline restricted to tokenization and NER, run over the
# whole prepared text.
# NOTE(review): a later cell rebinds the name `stanza` to a list, so this cell
# cannot be re-run after it without importing the module again.
nlp = stanza.Pipeline(lang='ru', processors='tokenize,ner')
doc_stanza = nlp(for_models)

In [None]:
# Attach the Stanza label to every 'ru-en-tag' entry, giving 'ru-en-tag-label'.
df_stanza = []

# Running offset: the length of each Russian word plus one separator.
length = 0
for i in range(len(df_fastalign_entities_complete)):
    length += len(df_fastalign_entities_complete[i].split('-')[0]) + 1

    # Search the entities of ALL sentences. The label is initialised up front
    # so it is defined even when a sentence has no entities, and the sentence
    # loop stops only once a label has been found (an unconditional break here
    # would restrict the search to the first sentence).
    tag = ''
    for sent in doc_stanza.sentences:
        for ent in sent.ents:
            if ent.start_char <= length <= ent.end_char:
                tag = ent.type
                break
        if tag:
            break

    try:
        # The label found at this offset is attached to the NEXT entry
        # (presumably because the offset already points past the current
        # word — TODO confirm).
        new_tag = df_fastalign_entities_complete[i + 1] + '-' + tag
        df_stanza.append(new_tag)
    except IndexError:
        # No entry follows the last word.
        break

In [None]:
# Stanza label column: the fourth '-'-separated field of every entry.
# NOTE(review): this rebinds `stanza`, shadowing the imported module; the name
# is kept because the final table is built from it.
stanza = [entry.split('-')[3] for entry in df_stanza]

### Deeppavlov RuBert

In [None]:
from deeppavlov import configs, build_model

In [None]:
# Build the DeepPavlov Russian BERT NER model; download=True lets the call
# fetch the model files.
ner_model = build_model(configs.ner.ner_rus_bert, download=True)

In [None]:
# Cut the text into consecutive pieces of at most n characters.
n = 512
chunks = []
for start in range(0, len(for_models), n):
    chunks.append(for_models[start:start + n])

In [None]:
# Run the model on every chunk separately (each call gets a one-element batch).
doc_bert = [ner_model([chunk]) for chunk in chunks]

In [None]:
# Collect 'word-label' strings from the chunked model output.
# As indexed here, doc_bert[c][0][0] is the token list of chunk c and
# doc_bert[c][1][0] the matching label list.
df_bert = []
model_words = for_models.split(' ')

for i in range(len(doc_bert)-1):
    tokens = doc_bert[i][0][0]
    labels = doc_bert[i][1][0]
    for j in range(len(tokens)):
        for f in model_words:
            if f == tokens[j]:
                # 'O' is kept as is; other labels lose their two-character prefix.
                if labels[j] == 'O':
                    df_bert.append(f + '-' + labels[j])
                else:
                    df_bert.append(f + '-' + labels[j][2:])
                break
            elif f == tokens[j] + doc_bert[i+1][0][0][0]:
                # The word was cut by the chunk border: this token plus the first
                # token of the next chunk form it; it is recorded as untagged.
                df_bert.append(f + '-' + 'O')
                break

# The last chunk is handled separately (its first token is skipped). Labels are
# normalised exactly as above; keeping the raw prefixed label here would make
# the later split('-')[1] return only the prefix letter.
last_tokens = doc_bert[-1][0][0]
last_labels = doc_bert[-1][1][0]
for i in range(1, len(last_tokens)):
    label = last_labels[i]
    df_bert.append(last_tokens[i] + '-' + (label if label == 'O' else label[2:]))

In [None]:
# BERT label column: 'O' becomes an empty string, other labels are kept.
bert = [
    '' if entry.split('-')[1] == 'O' else entry.split('-')[1]
    for entry in df_bert
]

### Dataset with all tags

In [None]:
# One table with the words, the English reference tag and the three model tags.
all_tag_columns = {
    'ru': ru_full_tags,
    'en': en_full_tags,
    'en_tag': en_tag_full_tags,
    'natasha': natasha,
    'stanza': stanza,
    'bert': bert,
}
df_alltags = pd.DataFrame(all_tag_columns)
df_alltags.to_excel('./all_entities.xlsx', index=False)

# Metrics

In [None]:
# Reload the combined table from disk; the cells below replace NaN values in
# the tag columns with empty strings.
df_alltags = pd.read_excel('all_entities.xlsx', index_col=None)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Reference (English) tags with missing values replaced by ''.
en_tag = ['' if str(value) == 'nan' else value for value in df_alltags['en_tag']]

### Natasha

In [None]:
# Natasha tags with missing values replaced by ''.
natasha_tag = ['' if str(value) == 'nan' else value for value in df_alltags['natasha']]

In [None]:
# Build the (reference, prediction) pairs for the Natasha report:
#   * reference 'GPE' is reported as 'LOC';
#   * rows whose reference is 'FAC' or 'VEH' are skipped;
#   * rows where both tags are empty are skipped.
natasha_en_tag = []
natasha_natasha_tag = []
for gold, predicted in zip(en_tag, natasha_tag):
    if gold == 'GPE':
        natasha_en_tag.append('LOC')
        natasha_natasha_tag.append(predicted)
    elif gold == 'FAC' or gold == 'VEH' or (gold == '' and predicted == ''):
        continue
    else:
        natasha_en_tag.append(gold)
        natasha_natasha_tag.append(predicted)

In [None]:
# Per-label precision / recall / F1 of Natasha against the English reference tags.
print(classification_report(natasha_en_tag, natasha_natasha_tag))

### Stanza

In [None]:
# Stanza tags with missing values replaced by ''.
stanza_tag = ['' if str(value) == 'nan' else value for value in df_alltags['stanza']]

In [None]:
# Build the (reference, prediction) pairs for the Stanza report:
#   * a 'MISC' prediction on a row that has a reference tag counts as that
#     reference tag;
#   * reference 'GPE' is reported as 'LOC';
#   * rows whose reference is 'FAC' or 'VEH' are skipped;
#   * rows where both tags are empty are skipped.
stanza_en_tag = []
stanza_stanza_tag = []
for i in range(len(en_tag)):
    gold = en_tag[i]
    predicted = stanza_tag[i]

    # Compare the tag of THIS row: comparing the whole list `stanza_tag` with
    # 'MISC' is always False, so this branch could never run.
    if predicted == 'MISC' and gold != '':
        stanza_en_tag.append(gold)
        stanza_stanza_tag.append(gold)
    elif gold == 'GPE':
        stanza_en_tag.append('LOC')
        stanza_stanza_tag.append(predicted)
    elif gold == 'FAC' or gold == 'VEH' or (gold == '' and predicted == ''):
        continue
    else:
        stanza_en_tag.append(gold)
        stanza_stanza_tag.append(predicted)

In [None]:
# Per-label precision / recall / F1 of Stanza against the English reference tags.
print(classification_report(stanza_en_tag, stanza_stanza_tag))

### Bert

In [None]:
# BERT tags with missing values replaced by ''.
bert_tag = ['' if str(value) == 'nan' else value for value in df_alltags['bert']]

In [None]:
# Build the (reference, prediction) pairs for the BERT report:
#   * reference 'GPE' is reported as 'LOC';
#   * rows whose reference is 'FAC' or 'VEH' are skipped;
#   * rows where both tags are empty are skipped.
bert_en_tag = []
bert_bert_tag = []
for gold, predicted in zip(en_tag, bert_tag):
    if gold == 'GPE':
        bert_en_tag.append('LOC')
        bert_bert_tag.append(predicted)
    elif gold == 'FAC' or gold == 'VEH' or (gold == '' and predicted == ''):
        continue
    else:
        bert_en_tag.append(gold)
        bert_bert_tag.append(predicted)

In [None]:
# Per-label precision / recall / F1 of BERT against the English reference tags.
print(classification_report(bert_en_tag, bert_bert_tag))