# NLP Data augmantation project 

This notebook presents some different services to use a backtranslation approach for aumenting data for NLP tasks.

__Warning:__ there are a few directories pointing to a private drive account but they can be easily changed to reproduce the work. API keys are not provided here.

## Importing initial data

In [5]:
import pandas as pd

data = pd.read_csv('/content/drive/My Drive/project_codebase/project/data/requests.csv', sep=';')
demandes = data['demande']
data

Unnamed: 0,demande,motif,groupe_motif
0,je viens enregistrer mon bac,Enregistrement de PACS,01c - Etat Civil PACS Enregistrement
1,je viens de pardon je viens pour déposer un do...,Enregistrement de PACS,01c - Etat Civil PACS Enregistrement
2,je souhaite enregistrer une convention de PACS...,Enregistrement de PACS,01c - Etat Civil PACS Enregistrement
3,bonjour je viens pour un enregistrement de PACS,Enregistrement de PACS,01c - Etat Civil PACS Enregistrement
4,je souhaite enregistrer une convention de pact...,Enregistrement de PACS,01c - Etat Civil PACS Enregistrement
...,...,...,...
1183,nous souhaitons annuler notre pacte civil de s...,"PACS (Dépôt de dossier, modification ou dissol...","01d - Etat Civil PACS Modification, dissolution"
1184,je viens de déposer mon dossier de PACS,"PACS (Dépôt de dossier, modification ou dissol...","01d - Etat Civil PACS Modification, dissolution"
1185,je me PACS,"PACS (Dépôt de dossier, modification ou dissol...","01d - Etat Civil PACS Modification, dissolution"
1186,je souhaite modifier mon contrat PACS,"PACS (Dépôt de dossier, modification ou dissol...","01d - Etat Civil PACS Modification, dissolution"


## Back translation with Yandex

In [0]:
import requests
import json

api = 'https://translate.yandex.net/api/v1.5/tr.json/translate'
key = '<Your API key>'

languages = ['en', 'es', 'ru', 'de', 'ar', 'it'] # , 'ja', 'ca', 'zh'

In [0]:
result = []

In [0]:
for i, request in demandes.loc[1168:].iteritems():
    for lang in languages:
        text = request
        lang, back = f'fr-{lang}', f'{lang}-fr'

        r = requests.post(api, data={'key':key,
                                     'text':text,
                                    'lang':lang})
        answer = json.loads(r.text)
        if answer['code'] != 200:
            print(answer)
        translated_text = answer['text'][0]

        r = requests.post(api, data={'key':key,
                                    'text':translated_text,
                                    'lang':back})
        answer = json.loads(r.text)
        if answer['code'] != 200:
            print(answer)
        final = answer['text'][0]
        result.append([i, final])
    print(i)


In [0]:
augmented = pd.DataFrame(result, columns=['corresponding_example_id', 'generated'])
# augmented.to_csv('/content/drive/My Drive/project_codebase/project/generated_yandex.csv', index=False)

## Back translation with Azure

In [0]:
import requests, uuid, json

subscription_key = '<Your API key>'
endpoint = '<Your endpoint>'

path = '/translate?api-version=3.0'
params = '&to=de&to=it'
constructed_url = endpoint + path + params

headers = {
    'Ocp-Apim-Subscription-Key': subscription_key,
    'Content-type': 'application/json',
    'X-ClientTraceId': str(uuid.uuid4())
}

# You can pass more than one object in body.
body = [{
    'text' : 'Hello World!'
}]
request = requests.post(constructed_url, headers=headers, json=body)
# response = request.json()

# print(json.dumps(response, sort_keys=True, indent=4, separators=(',', ': ')))
print(request.text)

{"error":{"code":"404","message": "Resource not found"}}


## Backtranslation with Goslate

In [0]:
pip install goslate

Collecting goslate
  Downloading https://files.pythonhosted.org/packages/39/0b/50af938a1c3d4f4c595b6a22d37af11ebe666246b05a1a97573e8c8944e5/goslate-1.5.1.tar.gz
Collecting futures
  Downloading https://files.pythonhosted.org/packages/05/80/f41cca0ea1ff69bce7e7a7d76182b47bb4e1a494380a532af3e8ee70b9ec/futures-3.1.1-py3-none-any.whl
Building wheels for collected packages: goslate
  Building wheel for goslate (setup.py) ... [?25l[?25hdone
  Created wheel for goslate: filename=goslate-1.5.1-cp36-none-any.whl size=11550 sha256=ffa17e20011a127799035cca55e239a353593a047eb978461ce7242d346a7853
  Stored in directory: /root/.cache/pip/wheels/4f/7f/28/6f52271012a7649b54b1a7adaae329b4246bbbf9d1e4f6e51a
Successfully built goslate
Installing collected packages: futures, goslate
Successfully installed futures-3.1.1 goslate-1.5.1


In [0]:
import goslate
import pandas as pd
import sys
import time

translator = goslate.Goslate()
d = pd.read_csv("/content/drive/My Drive/project_codebase/project/data/requests.csv", sep=';')
classes = d['motif'].unique()
langs= ['en', 'es', 'de', 'ru', 'ar', 'it', 'ja', 'ca', 'zh']
for cl in classes:
	di = d.loc[d['motif']==cl]
	di = di['demande'].values.tolist()
	# print("class : {}".format(cl), file=sys.stderr)
	for l in langs:
		# print("lang = {}".format(l), file=sys.stderr)
		inter = translator.translate(di, l)
		res = translator.translate(inter, 'fr')
		for x in res:
			print("{};{}".format(x, cl), file='results.csv')
		time.sleep(5)


## Augmented data

Putting the dataset back together.

In [18]:
generated = pd.read_csv('/content/drive/My Drive/project_codebase/project/generated_yandex.csv')
initial = pd.read_csv('/content/drive/My Drive/project_codebase/project/data/requests.csv', sep=';').reset_index()
initial.rename(columns={'index': 'corresponding_example_id'}, inplace=True)
generated = generated.join(initial, on='corresponding_example_id', lsuffix='0')
generated.drop(['corresponding_example_id0', 'corresponding_example_id'], axis=1, inplace=True)
generated.rename(columns={'demande':'demande_originale', 'generated':'demande'}, inplace=True)
generated

Unnamed: 0,demande,demande_originale,motif,groupe_motif
0,Je viens de sauver mon réservoir,je viens enregistrer mon bac,Enregistrement de PACS,01c - Etat Civil PACS Enregistrement
1,Je viens juste de garder mon dépôt,je viens enregistrer mon bac,Enregistrement de PACS,01c - Etat Civil PACS Enregistrement
2,je viens d'enregistrer mon bac,je viens enregistrer mon bac,Enregistrement de PACS,01c - Etat Civil PACS Enregistrement
3,je viens d'enregistrer mon aquarium,je viens enregistrer mon bac,Enregistrement de PACS,01c - Etat Civil PACS Enregistrement
4,Je ne sauvegarde que le réservoir,je viens enregistrer mon bac,Enregistrement de PACS,01c - Etat Civil PACS Enregistrement
...,...,...,...,...
7123,Je veux dissoudre mon pacte civil de solidarité,je souhaite faire dissoudre mon pacte civil de...,"PACS (Dépôt de dossier, modification ou dissol...","01d - Etat Civil PACS Modification, dissolution"
7124,"je voudrais, pour dissoudre mon pacte civil de...",je souhaite faire dissoudre mon pacte civil de...,"PACS (Dépôt de dossier, modification ou dissol...","01d - Etat Civil PACS Modification, dissolution"
7125,je veux résoudre mon pacte civil de solidarité,je souhaite faire dissoudre mon pacte civil de...,"PACS (Dépôt de dossier, modification ou dissol...","01d - Etat Civil PACS Modification, dissolution"
7126,Je veux que ma solution de la Charte de civil ...,je souhaite faire dissoudre mon pacte civil de...,"PACS (Dépôt de dossier, modification ou dissol...","01d - Etat Civil PACS Modification, dissolution"


## Evaluation

The evaluation is done by commenting and uncommenting some lines so that the `dev.txt` and `test.txt` always remain the same and `train.txt` change according to the data provided.

In [0]:
from os import path

import os
import numpy
import pandas
corpus_path = "corpus_splits/"
if not path.exists(corpus_path):
    os.mkdir(corpus_path)

#Loading dataset
# df = pandas.read_csv("/content/drive/My Drive/project_codebase/project/data/requests.csv", sep=";")
df = generated
df = df[["motif", "demande"]]
df["motif"] = "__label__" + df["motif"].astype("str")
df["motif"] = df["motif"].str.replace(" ","_",regex=False)

# Number of splits
num_splits = 10


for split in range(num_splits):
    base_path = corpus_path + "split_" + str(split)
    if not path.exists(base_path):
        os.mkdir(base_path)

    train, test, dev = numpy.split(df.sample(frac=1), [int(.7 * len(df)), int(.9 * len(df))])  # type: # DataFrame

    train.to_csv(base_path + "/train.txt", index=False, sep="\t", header=False)
    # test.to_csv(base_path + "/test.txt", index=False, sep="\t", header=False)
    # dev.to_csv(base_path + "/dev.txt", index=False, sep="\t", header=False)


In [0]:
!pip install flair

In [24]:
%tensorflow_version 2.x

from flair.data import Corpus
from flair.datasets import ClassificationCorpus
from flair.embeddings import CamembertEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from torch.optim import Adam
import numpy as np

TensorFlow 2.x selected.


In [0]:
data_folder = 'corpus_splits/'

# column format indicating which columns hold the text and label(s)
column_name_map = {1: "text", 2: "label_topic", }

# Camembert
camembert = CamembertEmbeddings(layers="-1,-2,-3,-4")

embedding_list = [camembert]

# Document embedding model
document_embeddings = DocumentRNNEmbeddings(embedding_list, hidden_size=750, bidirectional=True,
                                            rnn_layers=2,
                                            rnn_type='GRU',
                                            dropout=0.4,
                                            word_dropout=0.1)
results = []

# 10-fold cross validation
for root, dirs, files in os.walk(data_folder):
    for dir in dirs:
        if "split" in dir:
            print("Processing " + dir + " ...")
            corpus: Corpus = ClassificationCorpus(data_folder + "/" + dir,
                                                  test_file='test.txt',
                                                  dev_file='dev.txt',
                                                  train_file='train.txt', in_memory=True)

            classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(),
                                        multi_label=False)
            trainer = ModelTrainer(classifier, corpus)
            model_path = data_folder + "/" + dir + "/model/"
            scores = trainer.train(model_path, max_epochs=10,
                                   embeddings_storage_mode="cpu",
                                   learning_rate=0.3,
                                   mini_batch_size=32,
                                   anneal_factor=0.5,
                                   shuffle=False,
                                   patience=5, save_final_model=False, anneal_with_restarts=False)
            expected = [sentence.labels[0].value for sentence in corpus.test.sentences]
            predictions = [sentence.labels[0].value for sentence in classifier.predict(corpus.test.sentences)]
            scores['test_f1'] = f1_score(expected, predictions, average='micro')
            results.append(scores)


In [27]:
accuracies = []
f1s = []
for split in results:
    accuracies.append(split['test_score'])
    f1s.append(split['test_f1'])

print("| {:.3f} +- {:.3f} | {:.3f} +- {:.3f} |".format(float(np.mean(accuracies)), float(np.std(accuracies)), float(np.mean(f1s)), float(np.std(f1s))))


| 0.988 +- 0.015 | 0.988 +- 0.015 |
