## Data Augmentation with MarianMT using Back-Translation

### Initialize the models for English <-> Romance languages


In [1]:
import warnings
import random
warnings.filterwarnings('ignore')

import pandas as pd
from tqdm import trange
from transformers import MarianMTModel, MarianTokenizer

# English -> Romance-language direction; the concrete target language is
# selected per sentence by the ">>xx<<" prefix that `translate` prepends.
target_model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
target_tokenizer, target_model = (
    MarianTokenizer.from_pretrained(target_model_name),
    MarianMTModel.from_pretrained(target_model_name),
)

# Romance-language -> English direction, used for the return trip.
en_model_name = 'Helsinki-NLP/opus-mt-ROMANCE-en'
en_tokenizer, en_model = (
    MarianTokenizer.from_pretrained(en_model_name),
    MarianMTModel.from_pretrained(en_model_name),
)

In [2]:
def translate(texts, model, tokenizer, language="fr"):
    """Translate a batch of texts with a model/tokenizer pair.

    Args:
        texts: Iterable of source texts. Each item is formatted into a string.
        model: Object exposing ``generate(**encoded)``.
        tokenizer: Callable tokenizer that accepts a list of strings and also
            provides ``batch_decode``.
        language: Target-language code. For any value other than ``"en"`` each
            text is prefixed with ``>>{language}<< ``; for ``"en"`` the texts
            are passed through without a prefix.

    Returns:
        The list of strings produced by ``tokenizer.batch_decode`` for the
        generated sequences. An empty input yields ``[]`` and neither the
        tokenizer nor the model is invoked.
    """
    # Build the model inputs; only non-English targets need the language token.
    if language == "en":
        src_texts = [f"{text}" for text in texts]
    else:
        src_texts = [f">>{language}<< {text}" for text in texts]

    # Nothing to translate: avoid handing an empty batch to the tokenizer.
    if not src_texts:
        return []

    # Calling the tokenizer directly replaces the deprecated
    # `prepare_seq2seq_batch`; padding/truncation are spelled out to keep the
    # same batching behaviour (pad to the longest text, truncate over-long ones).
    encoded = tokenizer(src_texts,
                        return_tensors='pt',
                        padding=True,
                        truncation=True)

    # Generate translation using model
    translated = model.generate(**encoded)

    # Convert the generated token indices back into text
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

In [3]:
def back_translate(texts, source_lang="en", target_lang="fr", verbose=False):
    """Round-trip ``texts`` through ``target_lang`` to obtain paraphrases.

    The texts are translated with ``target_model``/``target_tokenizer`` and the
    result is translated back with ``en_model``/``en_tokenizer``. Round-trip
    outputs that are identical to one of the input texts are dropped, so the
    returned list can be shorter than ``texts``.

    Args:
        texts: Source texts to paraphrase.
        source_lang: Language code passed to the return-trip translation.
        target_lang: Language code passed to the outbound translation.
        verbose: When true, print the intermediate target-language texts.

    Returns:
        List of back-translated texts that differ from every input text.
    """
    # Outbound leg: source language -> target language.
    forward = translate(texts, target_model, target_tokenizer, language=target_lang)

    if verbose:
        print('Intermediate Target Language texts:')
        print(forward)

    # Return leg: target language -> source language.
    round_trip = translate(forward, en_model, en_tokenizer, language=source_lang)

    # Keep only outputs that are not verbatim copies of an input text.
    paraphrases = []
    for candidate in round_trip:
        if candidate in texts:
            continue
        paraphrases.append(candidate)
    return paraphrases

### Perform Augmentation using English <-> Spanish

In [4]:
en_texts = ['Cannot access website', 'I hated the food', "I can't login to my vpn"]

In [5]:
aug_texts = back_translate(en_texts, source_lang="en", target_lang="es", verbose=True)
print(aug_texts)

Intermediate Target Language texts:
['No se puede acceder al sitio web', 'Odiaba la comida.', 'No puedo acceder a mi vpn']
['Cannot access the website', 'I hated food.', "I can't access my vpn"]


### Perform Augmentation using English <-> Italian

In [6]:
aug_texts = back_translate(en_texts, source_lang="en", target_lang="it", verbose=True)
print(aug_texts)

Intermediate Target Language texts:
['Impossibile accedere al sito web', 'Odiavo il cibo.', 'Non posso accedere al mio vpn']
['Unable to access website', 'I hated food.', "I can't access my vpn"]


### Perform Augmentation using English <-> French



In [7]:
aug_texts = back_translate(en_texts, source_lang="en", target_lang="fr", verbose=True)
print(aug_texts)

Intermediate Target Language texts:
["Impossible d'accéder au site Web", "J'ai détesté la nourriture.", 'Je ne peux pas me connecter à mon vpn']
['Unable to access website', 'I hated food.', "I can't connect to my vpn"]


In [8]:
dataset = pd.read_csv('./data/preprocessed_data_groups.csv')

In [9]:
# Descriptions of the rows labelled 1, as a plain Python list.
minority_mask = dataset["label"] == 1
minority_class_descr = dataset.loc[minority_mask, "translated_description"].tolist()

In [10]:
len(minority_class_descr)

2514

In [11]:
size = 3  # number of texts handed to back_translate per call
augmented = list()
# Walk the list in strides of `size`. Unlike looping `len(...)//size` times,
# this also covers a final batch that is shorter than `size`.
for start in trange(0, len(minority_class_descr), size):
    subset = minority_class_descr[start:start + size]
    # One list of paraphrases is appended per (batch, language) pair.
    for target_lang in ("es", "it", "fr"):
        augmented.append(back_translate(subset, source_lang="en", target_lang=target_lang))

len(augmented)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 838/838 [10:33:31<00:00, 45.36s/it]


2514

In [12]:
# Flatten the per-call result lists into a single list of texts.
augmented = [text for batch in augmented for text in batch]
# back_translate only compares against the small batch it was given, so filter
# again against every original text. A set makes each lookup O(1) instead of
# scanning the whole list for every augmented text.
original_texts = set(minority_class_descr)
augmented = [text for text in augmented if text not in original_texts]
len(augmented)

7202

In [13]:
dataset.label.value_counts()

0    5985
1    2514
Name: label, dtype: int64

In [14]:
pd.Series(augmented)

0       event critical hostname company with value mou...
1       duplicate soft network two devices try sharing...
2       problem solving printer work printer replaceme...
3       critical event hostname company with value mou...
4       duplicate delicate network two devices test sh...
                              ...                        
7197    no this operation impossible access macne util...
7198    multiple pc can not be opened versceden prgram...
7199              come receive e-mail send zz mail advice
7200    no this operation inaccessible macne utility f...
7201       multiple pc can not be opened prgramdntyme cnc
Length: 7202, dtype: object

In [15]:
# Start from an empty frame with the same columns as `dataset`, then fill in
# only the text column and the label.
augmented_df = pd.DataFrame(columns=dataset.columns)
augmented_df["translated_description"] = augmented
augmented_df["label"] = 1  # these texts were generated from label-1 rows
augmented_df.sample(7)

Unnamed: 0,translated_description,keywords,short_description,description,group,cleaned_description,cleaned_short_description,merged_description,char_length,word_length,short_char_length,short_word_length,language,language_confidence,label
3264,reset password sid sid bubble user kindly make...,,,,,,,,,,,,,,1
1296,hostname volume tag dat hostname server eafe s...,,,,,,,,,,,,,,1
4760,work hr payroll n or work programmer failure,,,,,,,,,,,,,,1
5656,The Commission has adopted a proposal for a Co...,,,,,,,,,,,,,,1
3027,appear pdf output engineering tool dear bhughj...,,,,,,,,,,,,,,1
779,system system system system system system syst...,,,,,,,,,,,,,,1
4860,work edit user hang forever user admin managem...,,,,,,,,,,,,,,1


In [16]:
# ignore_index=True renumbers the rows 0..n-1; without it the appended rows
# restart at 0 and share index labels with the rows coming from `dataset`.
augmented_df = pd.concat([dataset, augmented_df], ignore_index=True)

In [17]:
augmented_df.shape

(15701, 15)

In [18]:
# Description lists for label 0 and label 1, taken from the original dataset.
group0_descr, group1_descr = (
    dataset.loc[dataset["label"] == label_value, "translated_description"].tolist()
    for label_value in (0, 1)
)
len(group0_descr), len(group1_descr)

(5985, 2514)

In [19]:
# Draw as many label-0 texts as there are label-1 texts. A dedicated, seeded
# generator makes the draw reproducible after a kernel restart without
# altering the global `random` state.
group0_sample = random.Random(42).sample(group0_descr, len(group1_descr))

In [20]:
len(group0_sample)

2514

In [21]:
size = 3  # number of texts handed to back_translate per call
augmented = list()
# Walk the sample in strides of `size`. Unlike looping `len(...)//size` times,
# this also covers a final batch that is shorter than `size`.
for start in trange(0, len(group0_sample), size):
    subset = group0_sample[start:start + size]
    # One list of paraphrases is appended per (batch, language) pair.
    for target_lang in ("es", "it", "fr"):
        augmented.append(back_translate(subset, source_lang="en", target_lang=target_lang))

len(augmented)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 838/838 [2:35:41<00:00, 11.15s/it]


2514

In [22]:
# Flatten the per-call result lists into a single list of texts.
augmented = [text for batch in augmented for text in batch]
# back_translate only compares against the small batch it was given, so filter
# again against every sampled text. A set makes each lookup O(1) instead of
# scanning the whole list for every augmented text.
sampled_texts = set(group0_sample)
augmented = [text for text in augmented if text not in sampled_texts]
len(augmented)

7124

In [23]:
# Empty frame with the same columns as `augmented_df`; only the text column
# and the label are filled in.
augmented_df2 = pd.DataFrame(columns=augmented_df.columns)
augmented_df2["translated_description"] = augmented
augmented_df2["label"] = 0  # these texts were generated from label-0 rows
augmented_df2.sample(7)

Unnamed: 0,translated_description,keywords,short_description,description,group,cleaned_description,cleaned_short_description,merged_description,char_length,word_length,short_char_length,short_word_length,language,language_confidence,label
2868,work work failure work programmer,,,,,,,,,,,,,,0
6149,the user needs an access engineering tool,,,,,,,,,,,,,,0
2914,Account lock,,,,,,,,,,,,,,0
1608,unlock songyody user erp,,,,,,,,,,,,,,0
3971,erp sid password reset,,,,,,,,,,,,,,0
6776,intermittent computer shutdown,,,,,,,,,,,,,,0
1443,problem identifier check user details name che...,,,,,,,,,,,,,,0


In [24]:
# ignore_index=True renumbers the rows 0..n-1; both frames being joined carry
# index labels that start at 0, so they would otherwise collide.
augmented_df = pd.concat([augmented_df, augmented_df2], ignore_index=True)

In [25]:
augmented_df.shape

(22825, 15)

In [26]:
# Write the augmented frame to disk without the row index.
augmented_df.to_csv('./data/augmented_data.csv', index=False)

In [28]:
augmented_df.label.value_counts()

0    13109
1     9716
Name: label, dtype: int64

### Oversampling the minority class to balance the label distribution

In [40]:
import numpy as np
from imblearn.over_sampling import RandomOverSampler

In [41]:
# Turn the text and label columns into (n_rows, 1) column arrays.
X, y = (
    np.array(augmented_df[column]).reshape(-1, 1)
    for column in ("translated_description", "label")
)
X.shape, y.shape

((22825, 1), (22825, 1))

In [42]:
# random_state fixes which minority rows get duplicated, so re-running the
# notebook reproduces the same upsampled set.
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)

# The target is flattened to 1-D before resampling; `y` was built as a column
# vector, whereas the resampler's documented target shape is (n_samples,).
X_upsampled, y_upsampled = oversampler.fit_resample(X, np.ravel(y))
# Drop the singleton column axis so both results are flat arrays.
X_upsampled = np.squeeze(X_upsampled)
y_upsampled = np.squeeze(y_upsampled)
X_upsampled.shape, y_upsampled.shape

((26218,), (26218,))

In [43]:
X_upsampled[0], y_upsampled[0]

('verify user detail name check user name ad reset password advise user login check caller confirm able login issue resolve',
 0)

In [44]:
# Two-column frame holding the resampled texts and their labels.
upsampled_df = pd.DataFrame(
    dict(translated_description=X_upsampled, label=y_upsampled)
)
upsampled_df.shape

(26218, 2)

In [45]:
upsampled_df.label.value_counts()  # BALANCED DATA!

1    13109
0    13109
Name: label, dtype: int64

In [46]:
# Write the balanced frame to disk without the row index.
upsampled_df.to_csv('./data/augmented_upsampled_data.csv', index=False)