In [33]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [62]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import string

np.random.seed(0)

Reading of the test datasets -- in English, and in Russian.

In [35]:
df_en_test = pd.read_csv('/content/drive/My Drive/abbyy/en_test')
df_ru_test = pd.read_csv('/content/drive/My Drive/abbyy/ru_test')

In [36]:
df_en_test.values.shape

(422, 3)

In [37]:
X_test_ru, y_test_ru =  df_ru_test['text'].values, df_ru_test['target'].values
X_test_en, y_test_en = df_en_test['text'].values, df_en_test['target'].values

Reading of the extracted keywords for both languages.

In [38]:
keywords_ru = pd.read_csv('/content/drive/My Drive/abbyy/keywords_ru.csv')
keywords_en = pd.read_csv('/content/drive/My Drive/abbyy/keywords_en.csv')

In [44]:
def get_keywords_by_topic(keywords_df):
  result = dict()

  for keyword, topic in zip(keywords_df['keyword'].values, keywords_df['topic'].values):
    if topic not in result:
      result[topic] = set()
    result[topic].add(keyword)
  return result

In [45]:
ru_keywords_by_topic = get_keywords_by_topic(keywords_ru)
en_keywords_by_topic = get_keywords_by_topic(keywords_en)

Reading of the most frequent words for both languages.

In [51]:
def read_list_from_file(filename):
  result_list = []
  with open(filename, 'r') as fin:
    for line in fin:
      result_list.append(line.strip())
  return result_list

In [52]:
frequent_words_ru = read_list_from_file('/content/drive/My Drive/abbyy/frequent_words_ru')
frequent_words_en = read_list_from_file('/content/drive/My Drive/abbyy/frequent_words_en')

In [55]:
!cat '/content/drive/My Drive/abbyy/frequent_words_en'

one
may
would
also
time
–
shall
said
new
work
like
people
first
use
nt
could
system
information
two
see
well
made
state
make
article
many
us
doc
united
military
order
international
even
police
states
must
need
way
data
c
children
take
world
get
used
years
court
good
case
n


Creation of datasets with substituted keywords.

In [64]:
def normalize_word(word):
  return word.translate(str.maketrans("", "", string.punctuation)).lower()

def modify_sentence(sentence, keywords_set, frequent_list, substitution_prob):
  found_keywords = 0
  substituted_keywords = 0
  mod_sentence = []

  for word in sentence.split():
    mod_word = normalize_word(word)
    if mod_word in keywords_set:
      found_keywords += 1
      rand = np.random.uniform()
      if rand <= substitution_prob:
        substituted_keywords += 1
        mod_sentence.append(np.random.choice(frequent_list))
      else:
        mod_sentence.append(word)
  return found_keywords, substituted_keywords, ' '.join(mod_sentence)


def make_dataset_with_substitution(X, y, keywords_dict, frequent_list, substitution_prob):
  result_df = pd.DataFrame()
  found_keywords = 0
  substituted_keywords = 0

  for sentence, topic in zip(X, y):
    found, substituted, mod_sentence = modify_sentence(
        sentence, keywords_dict[topic], frequent_list, substitution_prob
    )
    result_df = result_df.append({'text': mod_sentence, 'target': topic}, ignore_index=True)
    found_keywords += found
    substituted_keywords += substituted
  
  print("found keywords: {}, substituted keywords: {}".format(found_keywords, substituted_keywords))
  return result_df

In [65]:
en_substituted_100 = make_dataset_with_substitution(X_test_en, y_test_en, en_keywords_by_topic, frequent_words_en, 1.0)
en_substituted_50 = make_dataset_with_substitution(X_test_en, y_test_en, en_keywords_by_topic, frequent_words_en, 0.5)
en_substituted_25 = make_dataset_with_substitution(X_test_en, y_test_en, en_keywords_by_topic, frequent_words_en, 0.25)
en_substituted_10 = make_dataset_with_substitution(X_test_en, y_test_en, en_keywords_by_topic, frequent_words_en, 0.1)

found keywords: 13247, substituted keywords: 13247
found keywords: 13247, substituted keywords: 6603
found keywords: 13247, substituted keywords: 3418
found keywords: 13247, substituted keywords: 1347


In [69]:
en_substituted_100.to_csv('/content/drive/My Drive/abbyy/en_test_substitution100')
en_substituted_50.to_csv('/content/drive/My Drive/abbyy/en_test_substitution50')
en_substituted_25.to_csv('/content/drive/My Drive/abbyy/en_test_substitution25')
en_substituted_10.to_csv('/content/drive/My Drive/abbyy/en_test_substitution10')

In [66]:
ru_substituted_100 = make_dataset_with_substitution(X_test_ru, y_test_ru, ru_keywords_by_topic, frequent_words_ru, 1.0)
ru_substituted_50 = make_dataset_with_substitution(X_test_ru, y_test_ru, ru_keywords_by_topic, frequent_words_ru, 0.5)
ru_substituted_25 = make_dataset_with_substitution(X_test_ru, y_test_ru, ru_keywords_by_topic, frequent_words_ru, 0.25)
ru_substituted_10 = make_dataset_with_substitution(X_test_ru, y_test_ru, ru_keywords_by_topic, frequent_words_ru, 0.1)

found keywords: 12255, substituted keywords: 12255
found keywords: 12255, substituted keywords: 6041
found keywords: 12255, substituted keywords: 3100
found keywords: 12255, substituted keywords: 1194


In [70]:
ru_substituted_100.to_csv('/content/drive/My Drive/abbyy/ru_test_substitution100')
ru_substituted_50.to_csv('/content/drive/My Drive/abbyy/ru_test_substitution50')
ru_substituted_25.to_csv('/content/drive/My Drive/abbyy/ru_test_substitution25')
ru_substituted_10.to_csv('/content/drive/My Drive/abbyy/ru_test_substitution10')

In [72]:
!head '/content/drive/My Drive/abbyy/en_test_substitution100'

,target,text
0,A22,see need
1,A7,article may make information use well – good must military nt may even make people united one – information take international may must people doc must world time
2,A1,good also children well make
3,A12,made – one
4,A1,order need military world c one get make states order two well doc may state people system one court children article also get children international would also nt also state years system made system see even shall use article two could good may used article
5,A9,good first state – even well get
6,A4,united would many court military c take court used work years made states police need states system may must children case c order police military would first would made people information two see military work even united take state world article also doc court see make information one need world united time see use new people united like n must states get two may case system must world use data use system could must said would years time n 