In [None]:
import json

In [None]:
# Load the seed commands: a JSON object keyed by intent name; each value is
# the list of phrasings for that intent (some contain entity placeholders).
with open("commands_init.txt", "r") as commands_file:
    commands = json.load(commands_file)

In [None]:
# Values substituted for the "direction" placeholder (see entity_examples):
# the straight directions followed by the four diagonal ones.
directions = [
    "влево",
    "левее",
    "налево",
    "слева",
    "вправо",
    "правее",
    "направо",
    "справа",
    "вверх",
    "выше",
    "вперед",
    "наверх",
    "сверху",
    "вниз",
    "ниже",
    "назад",
    "снизу",
    "влево вверх",
    "по диагонали влево вверх",
    "влево вниз",
    "по диагонали влево вниз",
    "вправо вверх",
    "по диагонали вправо вверх",
    "вправо вниз",
    "по диагонали вправо вниз",
]

# Values substituted for the "ship_direction" placeholder (see entity_examples):
# straight directions only, no diagonal phrases.
ship_directions = [
    "влево",
    "левее",
    "налево",
    "слева",
    "вправо",
    "правее",
    "направо",
    "справа",
    "вверх",
    "выше",
    "вперед",
    "наверх",
    "сверху",
    "вниз",
    "ниже",
    "назад",
    "снизу",
]

# Values substituted for the "tile" placeholder (see entity_examples):
# names of board tiles, including alternative names for the same tile.
tiles = [
    "поляна",
    "пустышка",
    "стрелка",
    "указатель",
    "конь",
    "лошадь",
    "бочка",
    "лабиринт",
    "джунгли",
    "пустыня",
    "болото",
    "горы",
    "лед",
    "капкан",
    "ловушка",
    "крокодил",
    "людоед",
    "крепость",
    "сундук",
    "деньги",
    "сокровища",
    "воздушный шар",
    "шар",
    "самолет",
    "пушка",
]

In [None]:
# Intent name -> name of the entity placeholder used in that intent's commands.
entities = {
    "move_ship_by_direction": "ship_direction",
    "move_pirate_by_direction": "direction",
    "move_pirate_by_tile": "tile",
    "pirate_swim": "ship_direction",
    "choose_your_fighter":"fighter",
    "move_on_horse":"horse_direction"
}
# Entity placeholder name -> list of concrete values to substitute for it.
# NOTE(review): "fighter" and "horse_direction" have no entry here, so any
# lookup entity_examples[entity] for those two intents raises KeyError.
entity_examples = {
    "ship_direction": ship_directions,
    "direction": directions,
    "tile": tiles,
}

## Back Translation && Paraphrasing

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Paraphrasing model: load the T5 checkpoint and its tokenizer by name,
# move the model to the CUDA device and switch it to evaluation mode.
MODEL_NAME = "cointegrated/rut5-base-paraphraser"
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model.cuda()  # requires a CUDA-capable GPU
model.eval()


def paraphrase(text, beams=5, grams=4):
    """Generate one paraphrase of ``text`` with the module-level T5 model.

    Args:
        text: Input passed to the module-level ``tokenizer``.
        beams: Beam count forwarded to ``generate`` as ``num_beams``.
        grams: Forwarded as ``encoder_no_repeat_ngram_size``.

    Returns:
        The first generated sequence, decoded without special tokens.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True).to(model.device)
    # Output budget: 1.5x the input token count plus 10 tokens.
    token_budget = int(encoded.input_ids.shape[1] * 1.5 + 10)
    generated = model.generate(
        **encoded,
        max_length=token_budget,
        num_beams=beams,
        encoder_no_repeat_ngram_size=grams,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
def paraphrase_all(commands: list):
    """Paraphrase every command, one ``paraphrase`` call per item.

    Returns a new list with one paraphrase per input command, in input order.
    """
    return [paraphrase(command) for command in commands]

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Load the two Marian translation models used for back translation:
# Russian -> pivot language (target_*) and pivot language -> Russian (back_*).
# The models are not moved to the GPU here.
language_code = "sl"
target_model_name = f"Helsinki-NLP/opus-mt-ru-{language_code}"
target_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
target_model = MarianMTModel.from_pretrained(target_model_name)
back_model_name = f"Helsinki-NLP/opus-mt-{language_code}-ru"
back_tokenizer = MarianTokenizer.from_pretrained(back_model_name)
back_model = MarianMTModel.from_pretrained(back_model_name)


def back_translation(batch_texts: list):
    """Translate ``batch_texts`` with the module-level models and back again.

    The texts go through ``target_model`` first and the result goes through
    ``back_model``. The language tags passed along ("sl", "ru") are
    informational only: ``perform_translation`` does not read them.

    Returns:
        The back-translated texts, one per input text.
    """
    return perform_translation(
        perform_translation(batch_texts, target_model, target_tokenizer, "sl"),
        back_model,
        back_tokenizer,
        "ru",
    )


def perform_translation(batch_texts: list, model, tokenizer, target_language: str):
    """Run ``batch_texts`` through ``model`` and decode every generated sequence.

    Args:
        batch_texts: Texts to translate, tokenized as one padded batch.
        model: Object exposing ``generate(**encoded_inputs)``.
        tokenizer: Callable tokenizer that also exposes ``decode``.
        target_language: Not used by this function.

    Returns:
        One decoded string (special tokens skipped) per generated sequence.
    """
    encoded_batch = tokenizer(batch_texts, return_tensors="pt", padding=True)
    generated = model.generate(**encoded_batch)
    return [
        tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated
    ]

In [None]:
import re


def merge_2(original_commands: list, augmented_commands: list):
    """Append to ``original_commands`` the augmented items it does not contain.

    New items are appended in order of first occurrence in
    ``augmented_commands``, so the result is deterministic. A set difference
    would order them by string hash, which changes between interpreter runs.

    Args:
        original_commands: Base list; items must be hashable. Not modified.
        augmented_commands: Candidate items; duplicates are added only once.

    Returns:
        A new list: ``original_commands`` followed by the unseen items.
    """
    seen = set(original_commands)
    merge_result = list(original_commands)
    for item in augmented_commands:
        if item not in seen:
            seen.add(item)
            merge_result.append(item)
    return merge_result


def merge(lists_to_merge: list):
    """Normalize and merge several lists of ``(text, original)`` pairs.

    For every pair, each run of characters in ``text`` that is not a Cyrillic
    letter is replaced by a single space and the text is lower-cased; the
    original is lower-cased only. The normalized lists are then merged one by
    one with ``merge_2``, which drops exact duplicate pairs.

    Args:
        lists_to_merge: List of lists whose items are indexable as
            ``item[0]`` (text) and ``item[1]`` (original command).

    Returns:
        A single list of unique normalized ``(text, original)`` tuples.
    """
    res = []
    for pairs in lists_to_merge:
        # "ё"/"Ё" lie outside the А-я code point range and must be listed
        # explicitly; otherwise words like "вперёд" are split into "впер д".
        normalized = [
            (re.sub("[^А-Яа-яЁё]+", " ", item[0]).lower(), item[1].lower())
            for item in pairs
        ]
        res = merge_2(res, normalized)
    return res

In [None]:
def first_augmentation(commands: list):
    """Composition of back translation and paraphrasing.

    Builds four augmented variants of every command (back-translated,
    paraphrased, paraphrase of the back translation, back translation of the
    paraphrase), pairs each variant with the command it came from, and merges
    them with the originals.

    Args:
        commands: List of command strings for one intent.

    Returns:
        The list produced by ``merge``: unique normalized
        ``(variant, original command)`` pairs.

    Raises:
        AssertionError: If any variant list differs in length from ``commands``.
    """
    commands_sl = back_translation(commands)
    commands_paraphrased = paraphrase_all(commands)
    paraphrase_slovenian = paraphrase_all(commands_sl)
    to_slovenian_paraphrased = back_translation(commands_paraphrased)

    # Same order as before: originals first, then each augmentation.
    variant_lists = [
        commands,
        commands_paraphrased,
        commands_sl,
        paraphrase_slovenian,
        to_slovenian_paraphrased,
    ]
    # Check every list separately: a check on the summed length can pass when
    # one list is too short and another too long.
    assert all(
        len(variants) == len(commands) for variants in variant_lists
    ), "commands amounts don't match"

    all_commands = merge(
        [list(zip(variants, commands)) for variants in variant_lists]
    )
    return all_commands

In [None]:
# Expand entity placeholders: each command of an intent that has an entity is
# replaced by one copy per example value, with the placeholder text (the
# entity name) substituted by that value.
for command_with_entity, entity in entities.items():
    entity_variants = entity_examples.get(entity)
    if entity_variants is None:
        # No example values are registered for this entity ("fighter",
        # "horse_direction"); leave its commands unchanged rather than
        # failing with KeyError.
        continue
    commands[command_with_entity] = [
        variant.replace(entity, entity_variant)
        for variant in commands[command_with_entity]
        for entity_variant in entity_variants
    ]

In [None]:
# Run back translation + paraphrasing for every intent; the result maps
# intent -> list of (variant, original command) pairs.
firstly_augmented_commands = {}

for command, intent_variants in commands.items():
    firstly_augmented_commands[command] = first_augmentation(intent_variants)
    print(f"{command} -- done")

In [None]:
# Drop degenerate generations: a variant is removed when its first word occurs
# (as a substring) more than 4 times in the variant text.
deleted = 0  # number of variants removed across all intents
for intent in firstly_augmented_commands.keys():
    clean_commands = []
    for command in firstly_augmented_commands[intent]:
        # split() skips leading spaces. With split(" "), a text starting with a
        # space has "" as its first word, and str.count("") is len(text) + 1,
        # so such a command was always removed.
        words = command[0].split()
        first_word = words[0] if words else ""
        if first_word and command[0].count(first_word) > 4:
            deleted += 1
        else:
            clean_commands.append(command)
    firstly_augmented_commands[intent] = clean_commands

In [None]:
# Persist the first-stage augmentation as JSON. The (variant, original)
# tuples are serialized as two-element JSON arrays.
with open("first_augm_big.txt", "w") as output_file:
    json.dump(firstly_augmented_commands, output_file)

In [None]:
# Reload the first-stage augmentation from disk, so the expensive model steps
# can be skipped when the file already exists.
with open("first_augm_big.txt", "r") as input_file:
    commands = json.load(input_file)

In [None]:
# Use the in-memory result as the input of the EDA stage. This rebinds
# `commands`, replacing whatever it held before (including a copy loaded
# from "first_augm_big.txt"); it needs firstly_augmented_commands to exist.
commands = firstly_augmented_commands

## Define EDA

In [None]:
import random
from random import shuffle

# Fix the seed of Python's `random` module, which drives the random choices
# made by the EDA functions.
random.seed(1)

# Words that are never used as the source of a synonym replacement or
# insertion. They are compared against normal forms (lemmas). Besides function
# words, the list contains left/right direction words, presumably so that the
# direction of a command is not altered; verify.
stop_words = [
    "я",
    "мы",
    "наш",
    "мой",
    "сейчас",
    "тогда",
    "он",
    "его",
    "она",
    "ее",
    "для",
    "на",
    "в",
    "по",
    "это",
    "этот",
    "с",
    "от",
    "при",
    "к",
    "быть",
    "право",
    "вправо",
    "направо",
    "правый",
    "правее",
    "лево",
    "влево",
    "налево",
    "левый",
    "левее",
    "слева",
    "второй",
]

#### Formatting

In [None]:
import re
import pymorphy2


def format_command(command):
    """Collapse runs of spaces and remove leading/trailing spaces.

    Only the space character is handled; tabs and newlines are left as is.
    An input made only of spaces yields an empty string. The previous
    index-based trimming raised IndexError on such an input.

    Args:
        command: Text to normalize.

    Returns:
        The text with single spaces between words and none at either end.
    """
    collapsed = re.sub(" +", " ", command)  # delete extra spaces
    return collapsed.strip(" ")


# Shared pymorphy2 analyzer, used for lemmatization (to_normal_form) and
# part-of-speech checks (preposition_index).
morph = pymorphy2.MorphAnalyzer()


def to_normal_form(command: str):
    """Replace each space-separated word of ``command`` with its normal form.

    Each word is parsed with the module-level ``morph`` analyzer and the
    normal form of the first parse is taken.

    Returns:
        The normal forms joined by single spaces.
    """
    return " ".join(
        morph.parse(word)[0].normal_form for word in command.split(" ")
    )

#### Swap words

In [None]:
def preposition_index(words, word_idx):
    """Find the preposition/particle attached to the word at ``word_idx``.

    Returns:
        ``word_idx - 1`` when the previous word is tagged PREP or PRCL;
        otherwise ``word_idx`` when the word itself is tagged PREP or PRCL and
        is not the last word; otherwise ``-1``.
    """

    def _is_prep_or_particle(token):
        # Part of speech of the first parse returned by pymorphy2.
        pos = morph.parse(token)[0].tag.POS
        return pos == "PREP" or pos == "PRCL"

    has_previous = word_idx > 0
    if has_previous and _is_prep_or_particle(words[word_idx - 1]):
        return word_idx - 1

    has_next = word_idx != len(words) - 1
    if has_next and _is_prep_or_particle(words[word_idx]):
        return word_idx

    return -1

In [None]:
def random_swap(words, n):
    """Apply ``swap_word`` ``n`` times to a copy of ``words``.

    The input list is not modified; the swapped copy is returned.
    """
    swapped = words.copy()
    for _step in range(n):
        swapped = swap_word(swapped)
    return swapped


def swap_word(new_words):
    """Swap two randomly chosen words of ``new_words`` in place.

    When ``preposition_index`` reports a preposition/particle for a chosen
    position, the swap is done on that index and the word following it is
    moved along, so the pair stays together after the swap.

    Args:
        new_words: List of words; it is modified in place.

    Returns:
        The same list object. It is returned unchanged when it is empty or
        when no acceptable second position is found within the retry limit.
    """
    if not new_words:
        # random.randint(0, -1) raises ValueError; nothing to swap anyway.
        return new_words

    random_idx_1 = random.randint(0, len(new_words) - 1)
    random_idx_2 = random_idx_1
    prep_1 = -1
    prep_2 = -1
    counter = 0
    # Redraw the second position until it differs from the first and both
    # positions do not resolve to the same preposition index.
    while random_idx_2 == random_idx_1 or (prep_1 != -1 and prep_1 == prep_2):
        random_idx_2 = random.randint(0, len(new_words) - 1)
        counter += 1
        prep_1 = preposition_index(new_words, random_idx_1)
        prep_2 = preposition_index(new_words, random_idx_2)
        if counter > 5:
            # Give up (e.g. a single-word list) and leave the list untouched.
            return new_words

    # Temporarily glue "<preposition> <next word>" into one element so that
    # the pair is swapped as a unit.
    if prep_1 != -1:
        random_idx_1 = prep_1
        new_words[random_idx_1] += " " + new_words[random_idx_1 + 1]
    if prep_2 != -1:
        random_idx_2 = prep_2
        new_words[random_idx_2] += " " + new_words[random_idx_2 + 1]

    new_words[random_idx_1], new_words[random_idx_2] = (
        new_words[random_idx_2],
        new_words[random_idx_1],
    )

    # order indices
    if random_idx_1 > random_idx_2:
        random_idx_1, random_idx_2 = (random_idx_2, random_idx_1)
        prep_1, prep_2 = (prep_2, prep_1)

    # fix preps and save next words for insertion
    if prep_1 != -1:
        prep_1_next_word = new_words.pop(random_idx_1 + 1)
        random_idx_2 -= 1  # deleted element before it
        new_words[random_idx_2] = new_words[random_idx_2].replace(
            f" {prep_1_next_word}", ""
        )
    if prep_2 != -1:
        prep_2_next_word = new_words.pop(random_idx_2 + 1)
        new_words[random_idx_1] = new_words[random_idx_1].replace(
            f" {prep_2_next_word}", ""
        )

    # insert next words
    if prep_1 != -1:
        new_words.insert(random_idx_2 + 1, prep_1_next_word)
    if prep_2 != -1:
        new_words.insert(random_idx_1 + 1, prep_2_next_word)

    return new_words

#### Replace with synonyms

In [None]:
def replace_with_synonym(words: list, n: int):
    """
    Replaces n random words with a synonym.

    Words are lemmatized first. Candidates are the lemmas that are not stop
    words and are known to the vector model. Candidates are visited in random
    order; each one that has synonyms is replaced, at the position recorded
    for its lemma, by a randomly chosen synonym, until ``n`` replacements
    have been made.

    Returns a modified copy; ``words`` itself is left untouched.
    """
    result = words.copy()
    candidates = []
    position_of = {}
    for index, raw_word in enumerate(words):
        lemma = to_normal_form(raw_word)
        if lemma not in stop_words and is_in_vocab(lemma):
            candidates.append(lemma)
        # Recorded for every word; a repeated lemma keeps its last position.
        position_of[lemma] = index
    random.shuffle(candidates)

    replaced = 0
    for lemma in candidates:
        synonyms = get_synonyms(lemma)
        if len(synonyms) == 0:
            continue
        # pick random synonym
        result[position_of[lemma]] = format_command(random.choice(synonyms))
        replaced += 1
        if replaced == n:
            break

    return result

#### Insert words

In [None]:
def random_insertion(words, n):
    """Call ``add_word`` ``n`` times on a copy of ``words`` and return the copy.

    The input list is not modified.
    """
    extended = words.copy()
    for _step in range(n):
        add_word(extended)
    return extended


def add_word(new_words):
    """Insert a synonym of a random word of ``new_words`` at a random position.

    A word is drawn at random and lemmatized until one is found that is not a
    stop word, is known to the vector model and has synonyms. After 10 draws
    without success, nothing is inserted. The first synonym returned by
    ``get_synonyms`` is used.

    Args:
        new_words: List of words; it is modified in place.

    Returns:
        None.
    """
    if not new_words:
        # random.randint(0, -1) raises ValueError; an empty list has no word
        # to take a synonym from.
        return

    synonyms = []
    counter = 0
    while len(synonyms) < 1:
        random_word = to_normal_form(new_words[random.randint(0, len(new_words) - 1)])
        counter += 1
        if counter >= 10:
            return
        if random_word in stop_words or not is_in_vocab(random_word):
            continue
        synonyms = get_synonyms(random_word)
    random_synonym = synonyms[0]
    # randint is inclusive, so the insertion index ranges over 0..len-1: the
    # synonym is never appended after the last word.
    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, random_synonym)

#### Create pipeline

In [None]:
def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """Create augmented variants of ``sentence``.

    Three techniques are applied: synonym replacement, synonym insertion and
    word swap. Each enabled technique produces ``int(num_aug / 4) + 1``
    variants and changes ``max(1, int(alpha * number_of_words))`` words per
    variant.

    Args:
        sentence: Command text.
        alpha_sr: Share of words to replace with synonyms; ``0`` disables it.
        alpha_ri: Share of words used for insertions; ``0`` disables it.
        alpha_rs: Share of words to swap; ``0`` disables it.
        p_rd: Not used by this implementation.
        num_aug: Controls the number of variants per technique (see above).

    Returns:
        The shuffled augmented sentences followed by the formatted original
        sentence as the last element. For a sentence without words, only the
        formatted sentence is returned.
    """
    sentence = format_command(sentence)
    words = [word for word in sentence.split(" ") if word != ""]
    num_words = len(words)

    if num_words == 0:
        # Nothing to augment; the swap/insertion helpers draw a random index
        # with random.randint(0, len - 1), which fails on an empty list.
        return [sentence]

    augmented_sentences = []
    num_new_per_technique = int(num_aug / 4) + 1

    # synonyms
    if alpha_sr > 0:
        n_sr = max(1, int(alpha_sr * num_words))
        for _ in range(num_new_per_technique):
            a_words = replace_with_synonym(words, n_sr)
            augmented_sentences.append(" ".join(a_words))

    # insertions
    if alpha_ri > 0:
        n_ri = max(1, int(alpha_ri * num_words))
        for _ in range(num_new_per_technique):
            a_words = random_insertion(words, n_ri)
            augmented_sentences.append(" ".join(a_words))

    # swaps
    if alpha_rs > 0:
        n_rs = max(1, int(alpha_rs * num_words))
        for _ in range(num_new_per_technique):
            a_words = random_swap(words, n_rs)
            augmented_sentences.append(" ".join(a_words))

    augmented_sentences = [format_command(variant) for variant in augmented_sentences]
    shuffle(augmented_sentences)

    # append the original sentence
    augmented_sentences.append(sentence)

    return augmented_sentences

### For synonyms (word2vec)

In [None]:
# Maps the part-of-speech tag found in the Mystem analysis (see tag_word) to
# the tag appended to the lemma to form a "lemma_TAG" key for the vector
# model. The values look like Universal POS tags; confirm against the model.
tags_conversion = {
    "A": "ADJ",
    "ADV": "ADV",
    "ADVPRO": "ADV",
    "ANUM": "ADJ",
    "APRO": "DET",
    "COM": "ADJ",
    "CONJ": "SCONJ",
    "INTJ": "INTJ",
    "NONLEX": "X",
    "NUM": "NUM",
    "PART": "PART",
    "PR": "ADP",
    "S": "NOUN",
    "SPRO": "PRON",
    "UNKN": "X",
    "V": "VERB",
}

In [None]:
import gensim.downloader

# Word vectors used for synonym lookup; the model is fetched through the
# gensim downloader by name. Keys are looked up as "lemma_TAG" (see tag_word).
vectors = gensim.downloader.load("word2vec-ruscorpora-300")

In [None]:
from pymystem3 import Mystem

# Shared Mystem analyzer used by tag_word to get a lemma and a POS tag.
m = Mystem()


def tag_word(word: str):
    """Build the ``lemma_TAG`` key for ``word``.

    Takes the first analysis of the first token returned by Mystem, uses its
    lower-cased lemma and the leading part of its grammar string (up to the
    first "," or "=") converted through ``tags_conversion``.

    Raises:
        IndexError, KeyError: If Mystem returns no token or no analysis, or
            the part-of-speech tag is missing from ``tags_conversion``.
    """
    first_analysis = m.analyze(word)[0]["analysis"][0]
    lemma = first_analysis["lex"].lower().strip()
    mystem_pos = first_analysis["gr"].split(",")[0].split("=")[0].strip()
    return f"{lemma}_{tags_conversion[mystem_pos]}"


def is_in_vocab(word: str):
    """Return True when the tagged form of ``word`` is a key of ``vectors``.

    Args:
        word: Word (expected in normal form) to look up.

    Returns:
        False when the word cannot be tagged (``tag_word`` raises) or when
        its ``lemma_TAG`` key is absent from the vector model; True otherwise.
    """
    try:
        tagged = tag_word(word)
    except Exception:
        # Tagging fails for tokens without a Mystem analysis or with an
        # unmapped POS tag. Unlike a bare `except:`, this does not swallow
        # KeyboardInterrupt or SystemExit.
        return False
    # A membership test answers the question directly; a most_similar() call
    # would search the whole vocabulary only to discard the result.
    return tagged in vectors


def get_synonyms(word: str):
    """Return the lemmas of the entries most similar to ``word``.

    The word is tagged with ``tag_word`` and passed to
    ``vectors.most_similar``; the part after the first "_" of every returned
    key is dropped.
    """
    nearest = vectors.most_similar(positive=tag_word(word))
    return [entry[0].split("_")[0] for entry in nearest]

## Augment with EDA



In [None]:
# Total number of commands over all intents; used as the denominator of the
# progress percentage in the EDA loop.
total_commands_count = sum(
    len(intent_commands) for intent_commands in commands.values()
)

In [None]:
# Apply EDA to every (variant, original) pair of every intent. Each EDA output
# keeps the original command of the pair it was generated from.
count = 0
progress_percentage = 0
augmented_commands = {}

for intent, all_commands in commands.items():
    augmented_all = []
    for command in all_commands:
        augmented = [
            (format_command(variant), command[1]) for variant in eda(command[0])
        ]
        augmented_all = merge([augmented_all, augmented])

        # Progress report: printed when the rounded percentage increases and
        # the new value is a multiple of 10.
        count += 1
        current_percentage = round((count / total_commands_count) * 100)
        if current_percentage > progress_percentage:
            progress_percentage = current_percentage
            if progress_percentage % 10 == 0:
                print(f"{progress_percentage}% done...")
    augmented_commands[intent] = augmented_all

## Export

In [None]:
# Flatten augmented_commands into four parallel columns for the export table.
intent_column = []
command_column = []
orig_column = []
entity_column = []

# Intent name -> entity name; intents without an entity are exported as "none".
entities = {
    "move_ship_by_direction": "ship_direction",
    "move_pirate_by_direction": "direction",
    "move_pirate_by_tile": "tile",
    "pirate_swim": "ship_direction",
    "choose_your_fighter": "fighter",
    "move_on_horse": "horse_direction",
}

for intent, intent_commands in augmented_commands.items():
    entity_name = entities.get(intent, "none")
    for command in intent_commands:
        command_column.append(command[0])
        orig_column.append(command[1])
        intent_column.append(intent)
        entity_column.append(entity_name)

In [None]:
import pandas as pd

# Assemble the export table: one row per augmented command, with the command
# it was derived from, its intent and its entity name ("none" if absent).
df = pd.DataFrame(
    {
        "command": command_column,
        "orig": orig_column,
        "intent": intent_column,
        "entity": entity_column,
    }
)

In [None]:
# Write the dataset to CSV. No `index` argument is passed, so pandas also
# writes the row index as the first column.
df.to_csv('version-5.csv')

# Метрики

In [None]:
import pandas as pd

# Load an exported dataset for evaluation. This rebinds `df`.

df = pd.read_csv("commands-entity-version-2-with-origs.csv")

# Drop the rows where the command is identical to its original
eval_df = df[df["command"] != df["orig"]]

test_sentences = list(eval_df["command"])

## Перплексия

In [None]:
import numpy as np
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from tqdm.auto import tqdm

# Causal language model and tokenizer used to score the augmented sentences;
# the model is moved to the CUDA device.
mname = "sberbank-ai/rugpt3small_based_on_gpt2"
gpt_tokenizer = AutoTokenizer.from_pretrained(mname)
gpt_model = AutoModelForCausalLM.from_pretrained(mname)
gpt_model.cuda()  # requires a CUDA-capable GPU

In [None]:
# Habr version


def get_gpt2_ppl(test_sentences, aggregate=True, sep="\n"):
    """Score texts with the module-level GPT model.

    Each text is wrapped in ``sep`` on both sides and passed to the model
    with the input ids as labels; the first model output (the loss) is
    recorded together with a weight of ``number_of_tokens - 1``.

    Args:
        test_sentences: Iterable of texts.
        aggregate: When true, return a single weighted mean.
        sep: String placed before and after every text.

    Returns:
        With ``aggregate``: the weight-averaged loss over all texts. No
        exponentiation is applied, so this is a mean loss, not a perplexity.
        Otherwise: a tuple ``(losses, weights)`` of numpy arrays.
    """
    losses = []
    token_counts = []
    for text in tqdm(test_sentences):
        tokenized = gpt_tokenizer(f"{sep}{text}{sep}", return_tensors="pt")
        input_ids = tokenized.input_ids.to(gpt_model.device)

        n_predicted = max(0, len(input_ids[0]) - 1)
        loss_value = 0
        if n_predicted > 0:
            with torch.no_grad():
                model_output = gpt_model(input_ids, labels=input_ids.clone())
                loss_value = model_output[0].item()
        losses.append(loss_value)
        token_counts.append(n_predicted)

    likelihoods, weights = np.array(losses), np.array(token_counts)
    if not aggregate:
        return likelihoods, weights
    return sum(likelihoods * weights) / sum(weights)

In [None]:
# huggingface version


def ppl(test_sentences, sep="\n"):
    """Return the exponent of the mean per-text loss of the GPT model.

    Each text is wrapped in ``sep`` on both sides and scored with the input
    ids as labels. The per-text losses are averaged with equal weight and the
    mean is exponentiated.

    Returns:
        A scalar tensor.
    """
    per_text_losses = []
    for text in tqdm(test_sentences):
        tokenized = gpt_tokenizer(f"{sep}{text}{sep}", return_tensors="pt")
        input_ids = tokenized.input_ids.to(gpt_model.device)

        with torch.no_grad():
            model_output = gpt_model(input_ids, labels=input_ids.clone())
        per_text_losses.append(model_output.loss)
    mean_loss = torch.stack(per_text_losses).mean()
    return torch.exp(mean_loss)

In [None]:
# Token-weighted mean loss over all augmented sentences (aggregate=True).
get_gpt2_ppl(test_sentences)

## Сохранение смысла

In [None]:
import evaluate

# Similarity metrics used to compare augmented commands with their originals.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

In [None]:
def get_similarity_metrics(predictions, references):
    """Compute BLEU, ROUGE and BERTScore for ``predictions`` vs ``references``.

    ROUGE is computed with a whitespace tokenizer and BERTScore with
    ``lang="ru"``.

    Returns:
        Dict with keys "bleu", "rouge" and "bertscore", each holding the
        result returned by the corresponding metric.
    """
    return {
        "bleu": bleu.compute(predictions=predictions, references=references),
        "rouge": rouge.compute(
            predictions=predictions,
            references=references,
            tokenizer=lambda x: x.split(),
        ),
        "bertscore": bertscore.compute(
            predictions=predictions, references=references, lang="ru"
        ),
    }

In [None]:
# Originals are the references; augmented commands are the predictions.
references = eval_df["orig"]
predictions = eval_df["command"]

In [None]:
# Compute all three similarity metrics once; the cells below display them.
results = get_similarity_metrics(predictions, references)

### BLEU

precisions (list of floats): the n-gram precisions for n = 1…4. The overall `bleu` score is their geometric mean multiplied by the brevity penalty.

In [None]:
# Display the BLEU result.
results["bleu"]

### ROUGE

In [None]:
# Display the ROUGE result.
results["rouge"]

### BERTScore

In [None]:
# BERTScore returns per-pair values; report their means.
bertscore_results = results["bertscore"]
print(f"mean precision: {np.mean(bertscore_results['precision'])}")
print(f"mean recall: {np.mean(bertscore_results['recall'])}")
print(f"mean f1: {np.mean(bertscore_results['f1'])}")

## Currently Unused

### Back Translation: other languages

In [None]:
def back_traslation(batch_texts: list, language_code: str):
    """Back-translate ``batch_texts`` through the given pivot language.

    Loads the ``Helsinki-NLP/opus-mt-ru-<code>`` and
    ``Helsinki-NLP/opus-mt-<code>-ru`` Marian models on every call, translates
    the texts to the pivot language and back to Russian.

    Args:
        batch_texts: Russian texts to back-translate.
        language_code: Pivot language code used in the model names, e.g. 'en'.

    Returns:
        The back-translated texts, one per input text.
    """
    # load models
    target_model_name = f'Helsinki-NLP/opus-mt-ru-{language_code}'
    target_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
    target_model = MarianMTModel.from_pretrained(target_model_name)
    back_model_name = f'Helsinki-NLP/opus-mt-{language_code}-ru'
    back_tokenizer = MarianTokenizer.from_pretrained(back_model_name)
    back_model = MarianMTModel.from_pretrained(back_model_name)

    # translate the texts passed in. The global `original_commands` was used
    # here before, so the `batch_texts` argument was silently ignored.
    translated_commands = perform_translation(batch_texts, target_model, target_tokenizer, language_code)
    back_translated_commands = perform_translation(translated_commands, back_model, back_tokenizer, 'ru')
    return back_translated_commands
    


def perform_translation(batch_texts: list, model, tokenizer, target_language: str):
    """Translate a batch of texts with ``model`` and decode the output.

    This repeats the definition given earlier in the notebook and replaces it
    when the cell is run. ``target_language`` is not used.

    Returns:
        One decoded string (special tokens skipped) per generated sequence.
    """
    model_inputs = tokenizer(batch_texts, return_tensors="pt", padding=True)
    output_sequences = model.generate(**model_inputs)
    decoded_texts = []
    for output_sequence in output_sequences:
        decoded_texts.append(tokenizer.decode(output_sequence, skip_special_tokens=True))
    return decoded_texts

In [None]:
# Sample phrasings of a "move the ship to the right" command, used to compare
# back translation through different pivot languages.
original_commands = ['подвинь корабль вправо',
 'плыви вправо',
 'плыву направо',
 'двигаюсь кораблем вправо',
 'право руля',
 'уплыви вправо',
 'отчаливаю направо',
 'судно направо',
 'мы поплывем правее', 
 'на лодке вправо',
 'врубай мотор идем на восток',
 'кораблем направо',
 'я иду кораблем направо',
 'корабль на клетку вправо',
 'шаг правее кораблем']

In [None]:
# English
back_traslation(original_commands, 'en')

In [None]:
# French
back_traslation(original_commands, 'fr')

In [None]:
# Ukrainian
back_traslation(original_commands, 'uk')

In [None]:
# Slovenian (great!)
back_traslation(original_commands, 'sl')

In [None]:
# Latvian
back_traslation(original_commands, 'lv')

In [None]:
# merge() expects lists of (text, original) pairs. Given plain strings it
# indexes single characters (item[0], item[1]), so the strings are paired with
# their source command first.
back_translated_sl = back_traslation(original_commands, 'sl')
augmented_by_sl = merge(
    [
        [(command, command) for command in original_commands],
        list(zip(back_translated_sl, original_commands)),
    ]
)
augmented_by_sl

### Тезаурус RuWordNet

- [Тезаурус для русского языка](https://github.com/avidale/python-ruwordnet)
- Основные сущности:
    - Sense - одно слово/словосочетание с конкретным значением
    - Synset - множество Sense'ов с одинаковыми значениями и  частью речи
- Поддерживаются различные отношения между синсетами:
    <div>
<img src="attachment:image.png" width="700"/>
</div>

In [None]:
from ruwordnet import RuWordNet

# Shared RuWordNet thesaurus instance used by get_related.
wn = RuWordNet()

В EDA использовались такие методы вместо генерации синонимов word2vec:

In [None]:
def replace_with_related(words: list, n: int, relation_type: str):
    """
    Replaces n random words with a synonym or hyponym
    (based on relation_type param).

    Candidates are the words that are not stop words. They are visited in
    random order; each one that has related entries is replaced, at the
    position recorded for it, by a random related entry until ``n``
    replacements have been made. Returns a modified copy of ``words``.
    """
    result = words.copy()
    candidates = []
    position_of = {}
    for index, token in enumerate(words):
        if token not in stop_words:
            candidates.append(token)
        # Recorded for every word; a repeated word keeps its last position.
        position_of[token] = index
    random.shuffle(candidates)

    replaced = 0
    for token in candidates:
        related = get_related(token, relation_type)
        if len(related) == 0:
            continue
        # pick random synonym/hyponym
        result[position_of[token]] = format_command(random.choice(related))
        replaced += 1
        if replaced == n:
            break

    return result

def get_related(word: str, relation_type:str):
    """Collect RuWordNet titles related to ``word``.

    For ``relation_type == 'synonym'`` the title of the synset of every sense
    of the word is returned; for any other value, the titles of the hyponyms
    of those synsets.
    """
    senses = wn.get_senses(word)
    if relation_type == 'synonym':
        return [sense.synset.title for sense in senses]
    # hyponyms
    return [
        hyponym.title
        for sense in senses
        for hyponym in sense.synset.hyponyms
    ]