In [1]:
# read data 
import pandas as pd

original_file_path = './data.csv'
save_home = './'
datatype = 'train'
df = pd.read_csv(original_file_path)
df.head()

Unnamed: 0,label,content
0,1,Reuters - Saboteurs blew up an internal oil\pi...
1,1,The UN envoy to Sudan urged the government to ...
2,1,Palestinian gunmen have released an Israeli Ar...
3,1,Washington on Tuesday swept aside objections f...
4,1,DEAF children helped by kidnapped Margaret Has...


In [2]:
# Data Augmentation Example: (1) Synonym Replacement; (2) Back Translation
df = df[:10]    # comment out this line to process the whole dataset; note that back translation is very time-consuming. 

In [3]:
# (1) Weak Augmentation: Synonym Replacement
import nlpaug.augmenter.word as naw
from tqdm import tqdm

df['synonym_aug'] = 0
aug = naw.SynonymAug(aug_src='wordnet')
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    df['synonym_aug'][idx] = aug.augment(row['content'])[0]
df.to_csv(save_home + datatype + ".csv",index=False)

  from .autonotebook import tqdm as notebook_tqdm
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
100%|██████████| 10/10 [00:00<00:00, 97.56it/s]


In [4]:
# (2) Strong Augmentation: Back Translation
import nlpaug.augmenter.word as naw
import pandas as pd
from tqdm import tqdm
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("gpu num: ", n_gpu)
# file = "unlabeled_data.csv"
# df = pd.read_csv(file)
df['back_translation'] = 0
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en',
    device=device
)

for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    df['back_translation'][idx] = back_translation_aug.augment(row['content'])[0]
df.to_csv(save_home + datatype + ".csv",index=False)


gpu num:  4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 10/10 [00:08<00:00,  1.13it/s]


In [5]:
# Display Augmented Examples
df.head()

Unnamed: 0,label,content,synonym_aug,back_translation
0,1,Reuters - Saboteurs blew up an internal oil\pi...,Reuters - Saboteurs blew upwards an internal f...,Reuters - Saboteurs blew up an internal oil pi...
1,1,The UN envoy to Sudan urged the government to ...,The UN emissary to Soudan cheer the government...,The UN envoy to Sudan urged the government to ...
2,1,Palestinian gunmen have released an Israeli Ar...,Palestinian hired gun accept released an Israe...,Palestinian gunmen have freed an Israeli Arab ...
3,1,Washington on Tuesday swept aside objections f...,Washington on Tuesday swept aside objections f...,Washington brushed aside objections from Beiru...
4,1,DEAF children helped by kidnapped Margaret Has...,DEAF children help by kidnapped Margaret Hassa...,Deaf children supported by kidnapped Margaret ...
