In [7]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import string
from os.path import join as pathjoin
import tqdm

np.random.seed(0)

In [8]:
from natasha import Doc, Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

In [9]:
DATA_DIR = '/home/mlepekhin/data'
MODELS_DIR = '/home/mlepekhin/models'

Reading of the test datasets -- in English, and in Russian.

In [10]:
df_ru_test = pd.read_csv(pathjoin(DATA_DIR, 'ru_test'))

In [11]:
df_en_test.values.shape

(422, 3)

In [12]:
X_test_ru, y_test_ru =  df_ru_test['text'].values, df_ru_test['target'].values

Reading of the extracted keywords for both languages.

In [13]:
keywords_ru_noun = pd.read_csv(pathjoin(DATA_DIR, 'natasha_keywords_ru_noun.csv'))
keywords_ru_adj = pd.read_csv(pathjoin(DATA_DIR, 'natasha_keywords_ru_adj.csv'))
keywords_ru_adv = pd.read_csv(pathjoin(DATA_DIR, 'natasha_keywords_ru_adv.csv'))
keywords_ru_verb = pd.read_csv(pathjoin(DATA_DIR, 'natasha_keywords_ru_verb.csv'))

In [14]:
def get_keywords_by_topic_and_pos(keywords_df, tag, result_dict):
    for keyword, topic in zip(keywords_df['keyword'].values, keywords_df['topic'].values):
        if (topic, tag) not in result_dict:
            result_dict[(topic, tag)] = list()
        result_dict[(topic, tag)].append(keyword)

In [15]:
pos_set = {'ADV', 'ADJ', 'NOUN', 'VERB'}

In [16]:
ru_keywords_by_topic_pos = dict()
for pos in pos_set:
    get_keywords_by_topic_and_pos(keywords_ru_noun, pos, ru_keywords_by_topic_pos)

In [17]:
def make_replace_dict(keywords_by_topic_pos):
    replace_dict = dict()
    
    for key, word_list in keywords_by_topic_pos.items():
        replace_dict[key] = []
        for another_key, another_word_list in keywords_by_topic_pos.items():
            if key != another_key:
                replace_dict[key] += another_word_list
    return replace_dict

In [18]:
ru_replace_dict = make_replace_dict(ru_keywords_by_topic_pos)

Creation of datasets with substituted keywords.

In [24]:
def modify_sentence(sentence, keywords_dict, replace_dict, substitution_prob, topic):
    found_keywords = 0
    substituted_keywords = 0
    mod_sentence = []
    doc = Doc(sentence)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    for word in doc.tokens:
        if word.pos in pos_set and word.text in keywords_dict[(topic, word.pos)]:
            found_keywords += 1
            rand = np.random.uniform()
            if rand <= substitution_prob:
                substituted_keywords += 1
                mod_sentence.append(np.random.choice(replace_dict[(topic, word.pos)]))
            else:
                mod_sentence.append(word.text)
    return found_keywords, substituted_keywords, ' '.join(mod_sentence)


def make_dataset_with_substitution(X, y, keywords_dict, replace_dict, substitution_prob):
    result_df = pd.DataFrame()
    found_keywords = 0
    substituted_keywords = 0

    for sentence, topic in tqdm.tqdm(zip(X, y)):
        found, substituted, mod_sentence = modify_sentence(
            sentence, keywords_dict, replace_dict, substitution_prob, topic
        )
        result_df = result_df.append({'text': mod_sentence, 'target': topic}, ignore_index=True)
        found_keywords += found
        substituted_keywords += substituted
  
    print("found keywords: {}, substituted keywords: {}".format(found_keywords, substituted_keywords))
    return result_df

In [25]:
ru_substituted_100 = make_dataset_with_substitution(
    X_test_ru, y_test_ru, ru_keywords_by_topic_pos, ru_replace_dict, 1.0
)
ru_substituted_50 = make_dataset_with_substitution(
    X_test_ru, y_test_ru, ru_keywords_by_topic_pos, ru_replace_dict, 0.5
)
ru_substituted_25 = make_dataset_with_substitution(
    X_test_ru, y_test_ru, ru_keywords_by_topic_pos, ru_replace_dict, 0.25
)
ru_substituted_10 = make_dataset_with_substitution(
    X_test_ru, y_test_ru, ru_keywords_by_topic_pos, ru_replace_dict, 0.1
)
ru_substituted_5 = make_dataset_with_substitution(
    X_test_ru, y_test_ru, ru_keywords_by_topic_pos, ru_replace_dict, 0.05
)
ru_substituted_2 = make_dataset_with_substitution(
    X_test_ru, y_test_ru, ru_keywords_by_topic_pos, ru_replace_dict, 0.02
)

483it [00:45, 10.55it/s]
0it [00:00, ?it/s]

found keywords: 5966, substituted keywords: 5966


483it [00:46, 10.43it/s]
0it [00:00, ?it/s]

found keywords: 5966, substituted keywords: 3042


483it [00:46, 10.45it/s]
0it [00:00, ?it/s]

found keywords: 5966, substituted keywords: 1537


483it [00:46, 10.45it/s]

found keywords: 5966, substituted keywords: 594





In [26]:
ru_substituted_100.to_csv(pathjoin(DATA_DIR, 'natasha_ru_test_substitution100'))
ru_substituted_50.to_csv(pathjoin(DATA_DIR, 'natasha_ru_test_substitution50'))
ru_substituted_25.to_csv(pathjoin(DATA_DIR, 'natasha_ru_test_substitution25'))
ru_substituted_10.to_csv(pathjoin(DATA_DIR, 'natasha_ru_test_substitution10'))
ru_substituted_5.to_csv(pathjoin(DATA_DIR, 'natasha_ru_test_substitution5'))
ru_substituted_2.to_csv(pathjoin(DATA_DIR, 'natasha_ru_test_substitution2'))