In [1]:
# Install the required packages
%pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
# read data 
import pandas as pd

original_file_path = './data.csv'
save_home = './'
datatype = 'train'
df = pd.read_csv(original_file_path)
df.head()

Unnamed: 0,label,content
0,1,Reuters - Saboteurs blew up an internal oil\pi...
1,1,The UN envoy to Sudan urged the government to ...
2,1,Palestinian gunmen have released an Israeli Ar...
3,1,Washington on Tuesday swept aside objections f...
4,1,DEAF children helped by kidnapped Margaret Has...


In [3]:
# Data Augmentation Example: (1) Synonym Replacement; (2) Back Translation
df = df[:10]    # comment out this line to process the whole dataset; note that back translation is very time-consuming. 

In [4]:
# (1) Weak Augmentation: Synonym Replacement
import nlpaug.augmenter.word as naw
from tqdm import tqdm

df['synonym_aug'] = 0
aug = naw.SynonymAug(aug_src='wordnet')
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    df['synonym_aug'][idx] = aug.augment(row['content'])[0]
df.to_csv(save_home + datatype + ".csv",index=False)
# print saving path
print('Data Augmentation Done! Check the saved file at: ', save_home + datatype + ".csv")

  from .autonotebook import tqdm as notebook_tqdm
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['synonym_aug'][idx] = aug.augment(row['content'])[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
100%|██████████| 10/10 [00:00<00:00, 104.34it/s]

Data Augmentation Done! Check the saved file at:  ./train.csv





In [5]:
# (2) Strong Augmentation: Back Translation
import nlpaug.augmenter.word as naw
import pandas as pd
from tqdm import tqdm
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("gpu num: ", n_gpu)
# file = "unlabeled_data.csv"
# df = pd.read_csv(file)
df['back_translation'] = 0
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en',
    device=device
)

for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    df['back_translation'][idx] = back_translation_aug.augment(row['content'])[0]
df.to_csv(save_home + datatype + ".csv",index=False)
# print saving path
print('Data Augmentation Done! Check the saved file at: ', save_home + datatype + ".csv")

gpu num:  4


Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-en-de and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['back_translation'][idx] = back_translation_aug.augment(row['content'])[0]
100%|██████████| 10/10 [00:18<00:00,  1.87s/

Data Augmentation Done! Check the saved file at:  ./train.csv





In [6]:
# Display Augmented Examples
df.head()

Unnamed: 0,label,content,synonym_aug,back_translation
0,1,Reuters - Saboteurs blew up an internal oil\pi...,Reuters - Diversionist blew improving an inter...,Reuters - Saboteurs blew up an internal oil pi...
1,1,The UN envoy to Sudan urged the government to ...,The UN envoy to Sudan urged the government to ...,The UN envoy to Sudan urged the government to ...
2,1,Palestinian gunmen have released an Israeli Ar...,Palestinian gunmen have released an Israeli Ar...,Palestinian gunmen have freed an Israeli Arab ...
3,1,Washington on Tuesday swept aside objections f...,Washington on Tuesday swept aside objections f...,Washington brushed aside objections from Beiru...
4,1,DEAF children helped by kidnapped Margaret Has...,DEAF children helped by kidnapped Margaret Has...,Deaf children supported by kidnapped Margaret ...
