In [23]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import string
from os.path import join as pathjoin
import tqdm
import spacy_udpipe

# Seed NumPy's global RNG: the keyword substitution further down draws from
# np.random.uniform / np.random.choice, so this fixes which words get replaced.
np.random.seed(0)

In [2]:
# Directory holding the test sets and keyword CSVs read below; the substituted
# datasets are written back into it as well.
# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
DATA_DIR = '/home/mlepekhin/data'
# Not referenced by any cell visible in this notebook.
MODELS_DIR = '/home/mlepekhin/models'

Read the test datasets for English and Russian.

In [3]:
# Test sets are CSV files stored without a file extension; the 'text' and
# 'target' columns are the ones used later in the notebook.
df_en_test = pd.read_csv(pathjoin(DATA_DIR, 'en_test'))
df_ru_test = pd.read_csv(pathjoin(DATA_DIR, 'ru_test'))

In [4]:
# (rows, columns) of the English test set.
df_en_test.shape

(422, 3)

In [5]:
# Split each test frame into raw texts (X) and topic labels (y) as NumPy arrays.
X_test_ru = df_ru_test['text'].values
y_test_ru = df_ru_test['target'].values
X_test_en = df_en_test['text'].values
y_test_en = df_en_test['target'].values

Read the extracted keywords for both languages.

In [6]:
# One keyword table per language and part of speech, read in the order
# noun, adjective, adverb, verb.
keywords_ru_noun, keywords_ru_adj, keywords_ru_adv, keywords_ru_verb = (
    pd.read_csv(pathjoin(DATA_DIR, filename))
    for filename in (
        'keywords_ru_noun.csv',
        'keywords_ru_adj.csv',
        'keywords_ru_adv.csv',
        'keywords_ru_verb.csv',
    )
)

keywords_en_noun, keywords_en_adj, keywords_en_adv, keywords_en_verb = (
    pd.read_csv(pathjoin(DATA_DIR, filename))
    for filename in (
        'keywords_en_noun.csv',
        'keywords_en_adj.csv',
        'keywords_en_adv.csv',
        'keywords_en_verb.csv',
    )
)

In [7]:
def get_keywords_by_topic_and_pos(keywords_df, tag, result_dict):
    """Group the keywords of ``keywords_df`` by topic under the POS tag ``tag``.

    Args:
        keywords_df: DataFrame with a 'keyword' and a 'topic' column.
        tag: part-of-speech tag used as the second element of every key.
        result_dict: dict updated in place; each ``(topic, tag)`` key maps to
            the list of keywords for that topic, in row order. Existing lists
            are extended rather than replaced.

    Returns:
        None. The result is delivered through ``result_dict``.
    """
    topics = keywords_df['topic'].values
    keywords = keywords_df['keyword'].values
    for topic, keyword in zip(topics, keywords):
        # setdefault creates the list the first time a (topic, tag) pair is seen.
        result_dict.setdefault((topic, tag), list()).append(keyword)

In [11]:
# Part-of-speech tags for which keywords are considered; modify_sentence only
# treats a token as a potential keyword when its pos_ is in this set.
pos_set = {'ADV', 'ADJ', 'NOUN', 'VERB'}

In [13]:
# Build {(topic, POS): [keywords]} for each language.
# Each POS tag is paired with the keyword table extracted for that tag; feeding
# the noun table to every tag would file nouns under ADJ/ADV/VERB and leave the
# other tables unused. A fixed tuple (rather than iterating a set) keeps the
# dict insertion order -- and therefore the seeded random replacements that
# depend on it -- identical across kernel restarts.
ru_keywords_by_topic_pos = dict()
for pos, keywords_df in (
    ('NOUN', keywords_ru_noun),
    ('ADJ', keywords_ru_adj),
    ('ADV', keywords_ru_adv),
    ('VERB', keywords_ru_verb),
):
    get_keywords_by_topic_and_pos(keywords_df, pos, ru_keywords_by_topic_pos)

en_keywords_by_topic_pos = dict()
for pos, keywords_df in (
    ('NOUN', keywords_en_noun),
    ('ADJ', keywords_en_adj),
    ('ADV', keywords_en_adv),
    ('VERB', keywords_en_verb),
):
    get_keywords_by_topic_and_pos(keywords_df, pos, en_keywords_by_topic_pos)

In [14]:
def make_replace_dict(keywords_by_topic_pos):
    """Build the pool of replacement words for every ``(topic, pos)`` key.

    The pool for ``(topic, pos)`` holds the keywords of all *other* topics that
    share the same part of speech. Keywords of the same topic filed under a
    different POS are excluded: substituting them would put a keyword of the
    very topic being masked back into the text, and would also break the POS
    of the replaced token.

    Args:
        keywords_by_topic_pos: dict mapping ``(topic, pos)`` to a list of
            keywords. It is not modified.

    Returns:
        dict with the same keys; each value is a new list of candidate
        replacements (empty when no other topic has keywords for that POS).
    """
    replace_dict = dict()

    for topic, pos in keywords_by_topic_pos:
        replace_dict[(topic, pos)] = [
            word
            for (other_topic, other_pos), other_words in keywords_by_topic_pos.items()
            if other_pos == pos and other_topic != topic
            for word in other_words
        ]
    return replace_dict

In [15]:
# Replacement pools per (topic, POS) key, derived from the keyword dicts above;
# modify_sentence samples substitutes from these lists.
ru_replace_dict = make_replace_dict(ru_keywords_by_topic_pos)
en_replace_dict = make_replace_dict(en_keywords_by_topic_pos)

Create datasets with substituted keywords.

In [24]:
# spacy_udpipe pipelines for Russian and English. They are passed as `model`
# to the substitution functions, which read each token's .pos_ and .text.
# NOTE(review): presumably the models must already be downloaded
# (spacy_udpipe.download) -- confirm for a fresh environment.
nlp_ru = spacy_udpipe.load("ru")
nlp_en = spacy_udpipe.load("en")

In [31]:
def modify_sentence(sentence, keywords_dict, replace_dict, substitution_prob, model, topic):
    """Randomly replace the keywords of ``topic`` found in ``sentence``.

    The text is tokenized and POS-tagged with ``model``. A token counts as a
    keyword when its POS tag is in ``pos_set`` and its text is listed in
    ``keywords_dict[(topic, pos)]``. Each keyword is replaced, with probability
    ``substitution_prob``, by a word drawn uniformly from
    ``replace_dict[(topic, pos)]``. All other tokens are copied unchanged.

    Args:
        sentence: text to process.
        keywords_dict: dict mapping ``(topic, pos)`` to a list of keywords.
        replace_dict: dict mapping ``(topic, pos)`` to candidate replacements.
        substitution_prob: probability of replacing a found keyword.
        model: callable that takes the text and returns an iterable of tokens
            exposing ``.pos_`` and ``.text``.
        topic: topic label of the sentence.

    Returns:
        Tuple ``(found_keywords, substituted_keywords, modified_text)``. The
        tokens of ``modified_text`` are joined with single spaces, so the
        original whitespace is not preserved.
    """
    found_keywords = 0
    substituted_keywords = 0
    mod_sentence = []

    for word in model(sentence):
        key = (topic, word.pos_)
        # .get(): a topic may have no keywords at all for a given POS tag.
        if word.pos_ not in pos_set or word.text not in keywords_dict.get(key, ()):
            # Non-keyword tokens must be kept; otherwise the result would
            # consist of the keywords only instead of the full text.
            mod_sentence.append(word.text)
            continue

        found_keywords += 1
        rand = np.random.uniform()
        candidates = replace_dict.get(key, ())
        # np.random.choice raises on an empty pool, so a keyword without any
        # replacement candidates is left in place.
        if rand <= substitution_prob and len(candidates) > 0:
            substituted_keywords += 1
            mod_sentence.append(np.random.choice(candidates))
        else:
            mod_sentence.append(word.text)
    return found_keywords, substituted_keywords, ' '.join(mod_sentence)


def make_dataset_with_substitution(X, y, keywords_dict, replace_dict, substitution_prob, model):
    """Apply ``modify_sentence`` to every text and collect the results.

    Args:
        X: iterable of texts.
        y: iterable of topic labels, aligned with ``X``.
        keywords_dict: dict mapping ``(topic, pos)`` to a list of keywords.
        replace_dict: dict mapping ``(topic, pos)`` to candidate replacements.
        substitution_prob: probability of replacing each found keyword.
        model: tokenizer / POS tagger forwarded to ``modify_sentence``.

    Returns:
        DataFrame with a 'text' column (modified text) and a 'target' column
        (the unchanged topic label), one row per input pair. The totals of
        found and substituted keywords are printed as a side effect.
    """
    # Rows are gathered in a list and turned into a frame once at the end:
    # growing a DataFrame row by row copies it on every step, and
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    found_keywords = 0
    substituted_keywords = 0

    # Give tqdm a total when the input has a length so it can show progress/ETA.
    total = len(X) if hasattr(X, '__len__') else None
    for sentence, topic in tqdm.tqdm(zip(X, y), total=total):
        found, substituted, mod_sentence = modify_sentence(
            sentence, keywords_dict, replace_dict, substitution_prob, model, topic
        )
        records.append({'text': mod_sentence, 'target': topic})
        found_keywords += found
        substituted_keywords += substituted

    print("found keywords: {}, substituted keywords: {}".format(found_keywords, substituted_keywords))
    # Explicit columns keep the schema stable even for empty input.
    return pd.DataFrame(records, columns=['text', 'target'])

In [None]:
def _substitute_en_keywords(substitution_prob):
    """Build the English test set with keywords replaced at the given probability."""
    return make_dataset_with_substitution(
        X_test_en, y_test_en, en_keywords_by_topic_pos, en_replace_dict, substitution_prob, nlp_en
    )


# The runs share NumPy's global RNG, so their order affects the sampled words.
en_substituted_100 = _substitute_en_keywords(1.0)
en_substituted_50 = _substitute_en_keywords(0.5)
en_substituted_25 = _substitute_en_keywords(0.25)
en_substituted_10 = _substitute_en_keywords(0.1)

413it [06:29,  4.05it/s]

In [None]:
# Write each English variant next to the original test set.
for filename, substituted_df in (
    ('en_test_substitution100', en_substituted_100),
    ('en_test_substitution50', en_substituted_50),
    ('en_test_substitution25', en_substituted_25),
    ('en_test_substitution10', en_substituted_10),
):
    substituted_df.to_csv(pathjoin(DATA_DIR, filename))

In [None]:
def _substitute_ru_keywords(substitution_prob):
    """Build the Russian test set with keywords replaced at the given probability."""
    return make_dataset_with_substitution(
        X_test_ru, y_test_ru, ru_keywords_by_topic_pos, ru_replace_dict, substitution_prob, nlp_ru
    )


# The runs share NumPy's global RNG, so their order affects the sampled words.
ru_substituted_100 = _substitute_ru_keywords(1.0)
ru_substituted_50 = _substitute_ru_keywords(0.5)
ru_substituted_25 = _substitute_ru_keywords(0.25)
ru_substituted_10 = _substitute_ru_keywords(0.1)

In [None]:
# Write each Russian variant next to the original test set.
for filename, substituted_df in (
    ('ru_test_substitution100', ru_substituted_100),
    ('ru_test_substitution50', ru_substituted_50),
    ('ru_test_substitution25', ru_substituted_25),
    ('ru_test_substitution10', ru_substituted_10),
):
    substituted_df.to_csv(pathjoin(DATA_DIR, filename))

In [33]:
# Leftover check cell: it only prints 1. TODO(review): remove if no longer needed.
print(1)

1
