In [97]:
import pandas as pd
import numpy as np
import os
import torch 
import torch.nn.functional as F

from config import DATA_FOLDER, DATA_PCL_NAME, DATA_CATEGORIES_NAME
from utils import Utils
from transformers import BertTokenizer

# Read the PCL CSV from DATA_FOLDER under the parent of the current working
# directory, and drop every row that contains a missing value.
df = pd.read_csv(
    os.path.join(os.path.dirname(os.getcwd()), DATA_FOLDER, DATA_PCL_NAME)
).dropna()
df

Unnamed: 0,par_id,art_id,keyword,country_code,text,label,binary_label
0,1,24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0.0,0.0
1,2,21968160,migrant,gh,"In Libya today , there are countless number of...",0.0,0.0
2,3,16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0.0,0.0
3,4,7811231,disabled,nz,Council customers only signs would be displaye...,0.0,0.0
4,5,1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0.0,0.0
...,...,...,...,...,...,...,...
10463,10464,19612634,disabled,ie,"""When Marie O'Donoghue went looking for a spec...",0.0,0.0
10464,10465,14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",1.0,0.0
10465,10466,70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0.0,0.0
10466,10467,20282330,in-need,ng,""""""" She has one huge platform , and informatio...",3.0,1.0


In [2]:
# pip install nlpaug==1.1.11
# nlpaug augmenter families used in the cells below: character, word and sentence level.
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

# Sample sentence that the augmenter demos below take as input.
text = 'The quick brown fox jumps over the lazy dog .'
print(text)

  from .autonotebook import tqdm as notebook_tqdm


The quick brown fox jumps over the lazy dog .


ALL TAKEN FROM: https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb

# Character Augmenter
Augmenting data at the character level. Possible scenarios include image-to-text and chatbots. To recognize text in an image we need an optical character recognition (OCR) model, but OCR introduces errors such as confusing "o" and "0". OCRAug simulates these errors to perform the data augmentation. For chatbots, typos still occur even though most applications come with word correction. Therefore, KeyboardAug is introduced to simulate this kind of error.

Keyboard Augmenter - Substitute character by keyboard distance

In [99]:
# Keyboard augmenter demo: one augmented variant (n=1) of the sample sentence.
aug = nac.KeyboardAug()
augmented_text = aug.augment(text, n=1)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The quick bgowJ fox jumps 0ve$ the Iasy dog.']


Random Augmenter - Insert character randomly

In [100]:
# Random character augmenter configured to insert characters.
aug = nac.RandomCharAug(action="insert")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The pquikck wbrOown fox jubmpPs over the lazy dog.']


Substitute character randomly

In [101]:
# Random character augmenter configured to substitute characters.
aug = nac.RandomCharAug(action="substitute")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The q3iak brown fox _umpY oge& the lazy dog.']


Swap character randomly

In [102]:
# Random character augmenter configured to swap characters.
aug = nac.RandomCharAug(action="swap")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The quick brown fox jusmp over the alyz dog.']


# Word Augmenter

Besides character augmentation, the word level is important as well. We make use of word2vec (Mikolov et al., 2013), GloVe (Pennington et al., 2014), fasttext (Joulin et al., 2016), BERT (Devlin et al., 2018) and WordNet to insert and substitute similar words. Word2vecAug, GloVeAug and FasttextAug use word embeddings to find the most similar group of words to replace the original word. On the other hand, BertAug uses language models to predict possible target words. WordNetAug uses a statistical approach to find a similar group of words.

## 1. Spelling Augmenter
Substitute word by spelling mistake words dictionary

In [103]:
# Spelling augmenter demo: request three augmented variants (n=3).
aug = naw.SpellingAug()
augmented_texts = aug.augment(text, n=3)
print("Original:", text, "Augmented Texts:", augmented_texts, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Texts:
['The quick brown fox jumps overt d lazy gog.', 'Ihe quikly brown fox jumps other the lazy dog.', 'The quick brown fox jumps other the laizy don.']


## 2. Contextual Word Embeddings Augmenter


Insert word by contextual word embeddings (BERT, DistilBERT, RoBERTA or XLNet)


In [104]:
# Contextual word-embedding augmenter (bert-base-uncased) in "insert" mode.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['the quick reacting brown fox momentarily jumps guard over the lazy dog.']


Substitute word by contextual word embeddings (BERT, DistilBERT, RoBERTA or XLNet)


In [105]:
# Contextual word-embedding augmenter (bert-base-uncased) in "substitute" mode;
# three augmented variants are requested (n=3).
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
augmented_text = aug.augment(text, n=3)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['that first brown fox jumps over the small dog.', 'the quick brown cat jumps over its panting dog.', 'the quick brown fox did like the yellow dog.']


In [106]:
# Same substitute-mode augmenter, backed by distilbert-base-uncased.
aug = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute")
augmented_text = aug.augment(text, n=1)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['the crazy falling fox jumps over the lazy duck.']


In [107]:
# Same substitute-mode augmenter, backed by roberta-base.
aug = naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute")
augmented_text = aug.augment(text, n=1)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The quick brown fox bolted toward the white dog.']


## 3. Random Word Augmenter


Swap word randomly


In [108]:
# Random word augmenter configured to swap words.
aug = naw.RandomWordAug(action="swap")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The quick brown over fox the jumps lazy dog.']


Delete word randomly


In [109]:
# Random word augmenter configured to delete words.
aug = naw.RandomWordAug(action="delete")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The quick brown fox jumps dog.']


Crop - delete a contiguous set of words at random

In [110]:
# Random word augmenter configured with the "crop" action.
aug = naw.RandomWordAug(action="crop")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The quick over the lazy dog.']


## 4. Synonym Augmenter

Substitute word by WordNet's synonym

In [111]:
# Synonym augmenter demo using WordNet as the synonym source (aug_src='wordnet').
#
# Run the code below if it does not work (presumably when the NLTK data is
# missing -- TODO confirm). The snippet is wrapped in a string literal, so this
# cell does NOT execute it; copy it into its own cell to run it.
# NOTE(review): the snippet replaces the default HTTPS context with an unverified
# one (certificate checks disabled) before calling nltk.download() -- only use it
# on a trusted network.
"""
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()
"""
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(text)
# Show the input sentence followed by the augmenter's output.
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The agile brown fox jumps complete the lazy blackguard.']


## 5. Antonym Augmenter
Substitute word by antonym

In [112]:
# Antonym augmenter demo; it uses its own short input instead of the shared `text`.
_text = 'Good boy'
aug = naw.AntonymAug()
augmented_text = aug.augment(_text)
print("Original:", _text, "Augmented Text:", augmented_text, sep="\n")

Original:
Good boy
Augmented Text:
['Bad boy']


# Sentence Augmentation

Contextual Word Embeddings for Sentence Augmenter


In [113]:
# Sentence-level contextual augmenter backed by gpt2.
aug = nas.ContextualWordEmbsForSentenceAug(model_path='gpt2')
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The quick brown fox jumps over the lazy dog . first first other .']


In [114]:
# Sentence-level contextual augmenter backed by distilgpt2.
aug = nas.ContextualWordEmbsForSentenceAug(model_path='distilgpt2')
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The quick brown fox jumps over the lazy dog . .']


# Chained Augmentations

In [115]:
import nlpaug.flow as naf

# Chain a character-level substitute augmenter with a default random word augmenter.
aug = naf.Sequential([nac.RandomCharAug(action='substitute'), naw.RandomWordAug()])
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['Quick ^ ovmd the HaBy dog.']


In [116]:
# `Sometimes` picks only some of the listed augmenters on each call, for when the
# same full set should not be executed every time.
aug = naf.Sometimes(
    [
        nac.RandomCharAug(action='delete'),
        nac.RandomCharAug(action='insert'),
        naw.RandomWordAug(),
    ]
)
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['quQichk brown fox jumps + l! ahzy dog.']


# DATA AUGMENTATION ON PCL DATA

### Augmentation

```
!pip install nlpaug==1.1.11
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf
```

Character: 
1. Keyboard Augmenter - Substitute character by keyboard distance 
    - ```aug = nac.KeyboardAug()```
2. Random Augmenter - Insert character randomly
    - ```aug = nac.RandomCharAug(action="insert")```

Word: 
1. Random Word Augmenter
    - Swap: ```aug = naw.RandomWordAug(action="swap")```
    - Delete: ```aug = naw.RandomWordAug(action="delete")```
    - Crop (delete a contiguous set of words at random): ```aug = naw.RandomWordAug(action="crop")```
2. Synonym Augmenter
    - ```aug = naw.SynonymAug(aug_src='wordnet')```
3. Antonym Augmenter
    - ```aug = naw.AntonymAug()```
4. Contextual Word Embeddings Augmenter - Insert or substitute word by contextual word embeddings (BERT, DistilBERT, RoBERTA or XLNet)
    - Insert: ```aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")```
    - Substitute: ```aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")```
5. Chained Augmentations - apply a chain of augmentations:  
    - All of them: 
        ```
        aug = naf.Sequential([
        nac.RandomCharAug(action='substitute'),
        naw.RandomWordAug()])
        ```
    - Sometimes: (If you do not want to execute the same set of augmenters all the time, sometimes will pick some of the augmenters every time.)
        ```
        aug = naf.Sometimes([
        nac.RandomCharAug(action='delete'),
        nac.RandomCharAug(action='insert'),
        naw.RandomWordAug()])
        ```

Sentence:
1. ```aug = nas.ContextualWordEmbsForSentenceAug(model_path='gpt2')```

In [117]:
def data_augmentation(df, augmentor, n):
    """
    Return ``df`` followed by ``n`` augmented copies of it.

    Every copy keeps all columns and index labels of ``df``; only the ``text``
    column is replaced by the result of ``augmentor.augment`` applied to the
    full list of original texts.

    Args:
        df: DataFrame with a ``text`` column.
        augmentor: object exposing ``augment(list_of_texts)``; it must return
            one value per input text (presumably an nlpaug augmenter).
        n: number of augmented copies to append.

    Returns:
        A new DataFrame with the original rows first and then each augmented
        copy in turn. Index labels are not reset, so they repeat.
    """
    def _augmented_copy():
        # Always augment the original texts, never an earlier augmented copy.
        return df.assign(text=augmentor.augment(df['text'].tolist()))

    return pd.concat([df] + [_augmented_copy() for _ in range(n)], axis=0)

In [118]:
def data_augmentation_class_rebalance(df, augmentor):
    """
    Oversample the positive class (``binary_label == 1``) with augmented texts.

    The original rows are kept, and augmented copies of the positive rows are
    appended so that the number of positive rows ends up as close as possible
    to the number of negative rows (``binary_label == 0``).

    Args:
        df: DataFrame with ``text`` and ``binary_label`` columns.
        augmentor: object exposing ``augment(list_of_texts)``; it must return
            one value per input text (presumably an nlpaug augmenter).

    Returns:
        A new DataFrame with the original rows first, followed by the augmented
        positive copies. Index labels are not reset, so they repeat. When ``df``
        has no positive rows, or positives already match or outnumber the
        negatives, no rows are added.
    """
    positives = df[df['binary_label'] == 1]
    n_pos = len(positives)
    n_neg = len(df[df['binary_label'] == 0])

    all_data = [df]
    # Without positive rows there is nothing to augment (and no ratio to compute).
    if n_pos > 0:
        # The original positives already count once, so ratio - 1 extra copies
        # bring the positive total closest to the negative total.
        n_copies = max(round(n_neg / n_pos) - 1, 0)
        # The source texts are the same for every copy; extract them once.
        texts = positives['text'].tolist()
        for _ in range(n_copies):
            df_new = positives.copy(deep=True)
            df_new['text'] = augmentor.augment(texts)
            all_data.append(df_new)

    return pd.concat(all_data, axis=0)

In [119]:
# Small 10-row sample (positions 240-249) for trying out the augmentation helpers.
test_df = df.iloc[240:250]
test_df

Unnamed: 0,par_id,art_id,keyword,country_code,text,label,binary_label
240,241,174310,migrant,ie,The incident has added significance because of...,0.0,0.0
241,242,7511755,in-need,ie,What Dublin needs to do to make itself more at...,0.0,0.0
242,243,19406751,homeless,nz,The Prime Minister Bill English has come out i...,0.0,0.0
243,244,18374174,in-need,nz,Mr Little said they would provide better and q...,1.0,0.0
244,245,15627978,women,nz,McAlister was also nominated for coach of the ...,0.0,0.0
245,246,20000497,homeless,au,What causes someone to become homeless ? Brain...,1.0,0.0
246,247,15992006,refugee,ie,A young child evacuated from Aleppo at a refug...,0.0,0.0
247,248,17606348,hopeless,ke,""""""" Clear evidence of outstanding miraculous o...",4.0,1.0
248,249,22546584,homeless,hk,Another collective sale leads the region 's re...,0.0,0.0
249,250,24477162,migrant,in,"""Bank of America 's biggest competitors do n't...",0.0,0.0


In [120]:
# Run both helpers on the sample with the "crop" random word augmenter.
aug = naw.RandomWordAug(action="crop")
df_out_n = data_augmentation(df=test_df, augmentor=aug, n=2)
df_out_rebalance = data_augmentation_class_rebalance(df=test_df, augmentor=aug)

In [121]:
# Display the result of data_augmentation on the sample (original rows plus n=2 augmented copies).
df_out_n

Unnamed: 0,par_id,art_id,keyword,country_code,text,label,binary_label
240,241,174310,migrant,ie,The incident has added significance because of...,0.0,0.0
241,242,7511755,in-need,ie,What Dublin needs to do to make itself more at...,0.0,0.0
242,243,19406751,homeless,nz,The Prime Minister Bill English has come out i...,0.0,0.0
243,244,18374174,in-need,nz,Mr Little said they would provide better and q...,1.0,0.0
244,245,15627978,women,nz,McAlister was also nominated for coach of the ...,0.0,0.0
245,246,20000497,homeless,au,What causes someone to become homeless ? Brain...,1.0,0.0
246,247,15992006,refugee,ie,A young child evacuated from Aleppo at a refug...,0.0,0.0
247,248,17606348,hopeless,ke,""""""" Clear evidence of outstanding miraculous o...",4.0,1.0
248,249,22546584,homeless,hk,Another collective sale leads the region 's re...,0.0,0.0
249,250,24477162,migrant,in,"""Bank of America 's biggest competitors do n't...",0.0,0.0


In [122]:
# Display the result of data_augmentation_class_rebalance on the sample.
df_out_rebalance

Unnamed: 0,par_id,art_id,keyword,country_code,text,label,binary_label
240,241,174310,migrant,ie,The incident has added significance because of...,0.0,0.0
241,242,7511755,in-need,ie,What Dublin needs to do to make itself more at...,0.0,0.0
242,243,19406751,homeless,nz,The Prime Minister Bill English has come out i...,0.0,0.0
243,244,18374174,in-need,nz,Mr Little said they would provide better and q...,1.0,0.0
244,245,15627978,women,nz,McAlister was also nominated for coach of the ...,0.0,0.0
245,246,20000497,homeless,au,What causes someone to become homeless ? Brain...,1.0,0.0
246,247,15992006,refugee,ie,A young child evacuated from Aleppo at a refug...,0.0,0.0
247,248,17606348,hopeless,ke,""""""" Clear evidence of outstanding miraculous o...",4.0,1.0
248,249,22546584,homeless,hk,Another collective sale leads the region 's re...,0.0,0.0
249,250,24477162,migrant,in,"""Bank of America 's biggest competitors do n't...",0.0,0.0
