In [12]:
from transformers import pipeline
from transformers import BertTokenizer
import random
import numpy as np
import string
import pandas as pd

In [110]:
class BERT_aug:
    """
    Use BERT MLM to augment data.
    Typical usage: :: 
        >>> aug_model = BERT_aug('path/to/gensim/model'or 'BERT MLM itself')
        >>> aug_model.augment('I love school')
        i adore school
    """
    
    def __init__(self, **kwargs):
        """
        A method to initialize a model on a given path.
        :type random_state: int, float, str, bytes, bytearray
        :param random_state: seed
        :param model: The path to the model or the model itself.
        :type runs: int, optional
        :param runs: The number of times to augment a sentence. By default is 1.
        :type p: float, optional
        :param p: The probability of success of an individual trial. (0<p<=1), default is 0.5
        """

        # Set random state
        if 'random_state' in kwargs:
            self.random_state = kwargs['random_state']
            if isinstance(self.random_state, int):
                random.seed(self.random_state)
                np.random.seed(self.random_state)
            else:
                raise TypeError("random_state must have type int")

        try:
            if "p" in kwargs:
                self.p = kwargs['p']                
            else:
                kwargs['p'] = 0.5  # Set default value
        except KeyError:
            raise

        # Error handling of given parameters
        try:
            if "runs" not in kwargs:
                kwargs["runs"] = 1  # Default value for runs
            elif type(kwargs["runs"]) is not int:
                raise TypeError("DataType for 'runs' must be an integer")
            if "model" not in kwargs:
                raise ValueError("Set the value of model. e.g model='path/to/model' or model itself")
        except (ValueError, TypeError):
            raise
        else:
            self.runs = kwargs["runs"] 
            self.model = kwargs["model"]
            self.p = kwargs["p"]


    def geometric(self, data):
        """
        Used to generate Geometric distribution.
        :type data: list
        :param data: Input data
        :rtype:   ndarray or scalar
        :return:  Drawn samples from the parameterized Geometric distribution.
        """

        data = np.array(data)
        first_trial = np.random.geometric(p=self.p, size=data.shape[0]) == 1  # Capture success after first trial
        return data[first_trial]

    def augment(self, data):
        """
        The method to replace words with similar words.
        
        :type data: str
        :param data: Input data
        :rtype:   str
        :return:  The augmented data
        """
        
        # Avoid nulls and other unsupported types
        if type(data) is not str: 
            raise TypeError("Only strings are supported")
        # get punctuations
        puncs = string.punctuation
        # Tokenize
        data_tokens = tokenizer.basic_tokenizer.tokenize(data.lower())
        # append a period if no punctuations at the the end
        if data_tokens[-1] not in puncs:
            data_tokens.append('.')
        data_tokens_idx = [[x, y] for (x, y) in enumerate(data_tokens)]  # Enumerate data
        
        # Randomly replace some words
        for _ in range(self.runs):
            data_tokens_idx_no_puncs = [[idx, w] for idx, w in data_tokens_idx if w not in puncs]
            words = self.geometric(data=data_tokens_idx_no_puncs).tolist()  # List of words indexed
            data_tokens2 = data_tokens.copy() # copy the tokens. Use the original tokens for prediction
            for w in words:
                data_tokens3 = data_tokens2.copy() # copy the tokens. only mask one token each time
                data_tokens3[int(w[0])] = '[MASK]' # only mask words, not punctuations
                sentence = " ".join(data_tokens3)
                pred_words = self.model(sentence) # make prediction with punctuations
                similar_words = [item['token_str'] for item in pred_words]
                similar_words_weights = [item['score'] for item in pred_words]
                word = random.choices(similar_words, similar_words_weights, k=1) # choose words based on weights
                data_tokens[int(w[0])] = word[0].lower() # Replace with a predicted word
        
        return " ".join(data_tokens)

In [27]:
# initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# initialize BERT masked language model
bert_mlm_model = pipeline('fill-mask', model='bert-base-uncased', top_k = 10)

### Augment one document

In [117]:
aug_model = BERT_aug(model=bert_mlm_model, runs=1, p=1, random_state = 123)
aug_model.augment('Once again Mr. Costner has dragged out a movie for far longer than necessary.')

'once again mr . grey had put in the confession for much longer than usual .'

### Augment documents in a dataframe

In [39]:
minority_texts = {
    'id': [1,2,3], 
    'label': [1,1,1], 
    'text':["Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters.","This film is absolutely appalling and awful. It's not low budget, it's a no budget film that makes Ed Wood's movies look like art. The acting is abysmal but sets and props are worse then anything I have ever seen.","When I saw the film, I was disappointed. The acting is stilted, and the attempts at comedy are woefully out of place and forced. And I'm sorry, but a boy being chased by a turd in a bedpan is not funny or scary, it's just stupid."]
} 
minority_df = pd.DataFrame(minority_texts)

In [40]:
minority_df

Unnamed: 0,id,label,text
0,1,1,Once again Mr. Costner has dragged out a movie...
1,2,1,This film is absolutely appalling and awful. I...
2,3,1,"When I saw the film, I was disappointed. The a..."


In [111]:
import time
start = time.time()
labels = []
texts = []
ids = []
orig = []
#change_counts = []
for index, row in minority_df.iterrows():
    print('Augmenting document ', row['id'])
    labels.append(row['label'])
    texts.append(row['text'])
    orig.append('original')
    ids.append(row['id'])
    random.seed(123)
    for i in range(2):
        aug_model = BERT_aug(model=bert_mlm_model, runs=1, p=0.9, random_state = random.randint(1, 9999))
        try:
            text = aug_model.augment(row['text'])
        except AttributeError or HTTPError or ValueError:
            text = "none"
            #words_num = 0
        
        texts.append(text)
        labels.append(row['label'])
        orig.append('generated')
        ids.append(row['id'])
        #change_counts.append(words_num)

end = time.time()
print(end - start)
aug_df = pd.DataFrame(data = zip(ids, texts, labels, orig), 
                      columns = ['id', 'review', 'label', 'original'])

Augmenting document  1
Augmenting document  2
Augmenting document  3
20.35181188583374


In [112]:
aug_df

Unnamed: 0,id,review,label,original
0,1,Once again Mr. Costner has dragged out a movie...,1,original
1,1,once again mr . andrews had put out the script...,1,generated
2,1,once again mr . edison had planned on the movi...,1,generated
3,2,This film is absolutely appalling and awful. I...,1,original
4,2,this movie is simply terrible and awful . it '...,1,generated
5,2,this movie is both terrible and disgusting . i...,1,generated
6,3,"When I saw the film, I was disappointed. The a...",1,original
7,3,"when i saw this film , i was shocked . the dia...",1,generated
8,3,"when i saw the film , i was shocked . the dial...",1,generated
