## Random Words Augmenter

In [63]:
import random

In [64]:
"""
SUBSTITUTE: words are substituted randomly
DELETE: words are removed randomly
SWAP: adjacent words swapped randomly
CROP: set of continuous words removed randomly
"""

class Action:
    """
    Namespace of the action-name strings that ``random_augment`` compares
    its ``action`` argument against.
    """
    # Replace randomly chosen words with a word drawn from ``target_words``.
    SUBSTITUTE = 'substitute'
    # Remove randomly chosen words.
    DELETE = 'delete'
    # Exchange randomly chosen words with their right-hand neighbour.
    SWAP = 'swap'
    # Remove one contiguous run of words starting at a random position.
    CROP = 'crop'

In [68]:
def random_augment(data, action, aug_percent=0.3, aug_min=1, aug_max=10, skipwords=None, target_words=None):
    """
    Randomly augment whitespace-separated text.

    The text is split on whitespace, the chosen action is applied to the
    resulting word list, and the words are re-joined with single spaces
    (so runs of whitespace in the input are normalised in the output).

    :param data: Input text to augment.
    :param action: Action to perform ('substitute', 'delete', 'swap', or 'crop').
        Any other value leaves the words unchanged.
    :param aug_percent: Fraction of the words to augment (0.3 means 30%).
    :param aug_min: Minimum number of words to augment.
    :param aug_max: Maximum number of words to augment.
    :param skipwords: Words that are never substituted, deleted or swapped.
        Not consulted by the 'crop' action.
    :param target_words: Replacement candidates (only for the 'substitute'
        action). Defaults to ``['_']`` when empty or None.
    :return: Tuple ``(original_text, augmented_text)``. For empty, None or
        whitespace-only input both elements are the input itself.
    """
    if not data or not data.strip():
        # Keep the return shape identical to the normal path so callers can
        # always unpack two values.
        return data, data

    words = data.split()

    # Number of words to touch: the requested percentage, bounded by
    # aug_min / aug_max, never more than the words available and never negative.
    aug_count = max(aug_min, int(len(words) * aug_percent))
    aug_count = max(0, min(aug_count, aug_max, len(words)))
    target_words = target_words or ['_']

    def is_skipped(token):
        """Return True when the token is protected by ``skipwords``."""
        return bool(skipwords) and token in skipwords

    def upper_first(token):
        """Upper-case the first character of the token (safe on '')."""
        return token[:1].upper() + token[1:]

    # https://arxiv.org/pdf/1703.02573.pdf, https://arxiv.org/pdf/1712.06751.pdf, https://arxiv.org/pdf/1806.09030.pdf
    # https://arxiv.org/pdf/1905.11268.pdf,
    def substitute(words):
        """
        Substitute selected words with random target words.
        """
        # Descending order is kept so seeded runs draw replacements in the
        # same sequence as before.
        aug_indices = sorted(random.sample(range(len(words)), aug_count), reverse=True)
        for idx in aug_indices:
            original_token = words[idx]
            if is_skipped(original_token):
                continue
            new_token = random.choice(target_words)
            # A capitalised sentence-initial word keeps the sentence capitalised.
            if idx == 0 and original_token[0].isupper():
                new_token = upper_first(new_token)
            words[idx] = new_token
        return words

    # https://arxiv.org/pdf/1905.11268.pdf, https://arxiv.org/pdf/1809.02079.pdf, https://arxiv.org/pdf/1903.09460.pdf
    def delete(words):
        """
        Delete selected words.
        """
        # Pop from the end first so earlier indices stay valid.
        aug_indices = sorted(random.sample(range(len(words)), aug_count), reverse=True)
        for idx in aug_indices:
            if is_skipped(words[idx]):
                continue
            words.pop(idx)
        return words

    # https://arxiv.org/pdf/1711.02173.pdf, https://arxiv.org/pdf/1809.02079.pdf, https://arxiv.org/pdf/1903.09460.pdf
    def swap(words):
        """
        Swap selected words with their right-hand neighbour.
        """
        # Only len(words) - 1 positions have a right-hand neighbour; clamp the
        # sample size so random.sample cannot raise on short inputs.
        swap_count = min(aug_count, len(words) - 1)
        aug_indices = sorted(random.sample(range(len(words) - 1), swap_count), reverse=True)
        for idx in aug_indices:
            left, right = words[idx], words[idx + 1]
            # Never move a protected word.
            if is_skipped(left) or is_skipped(right):
                continue
            # When a capitalised sentence-initial word moves away from the
            # front, hand its capitalisation to the word taking its place.
            if idx == 0 and left[0].isupper():
                right = upper_first(right)
                left = left.lower()
            words[idx], words[idx + 1] = right, left
        return words

    def crop(words):
        """
        Remove one contiguous run of ``aug_count`` words.
        """
        if len(words) < 2:
            return words  # Skip if not enough words to crop
        start_idx = random.randint(0, len(words) - aug_count)
        end_idx = start_idx + aug_count
        return words[:start_idx] + words[end_idx:]

    if action == Action.SUBSTITUTE:
        words = substitute(words)
    elif action == Action.DELETE:
        words = delete(words)
    elif action == Action.SWAP:
        words = swap(words)
    elif action == Action.CROP:
        words = crop(words)

    return data, ' '.join(words)

In [69]:
# Demo: replace randomly chosen words with a word drawn from target_words.
# random_augment returns an (original, augmented) tuple, which is printed as-is.
text = "This is a simple example sentence for testing."
augmented_text = random_augment(text, action=Action.SUBSTITUTE, target_words=["awesome", "great"])
print(augmented_text)

('This is a simple example sentence for testing.', 'This is a simple awesome sentence for great')


In [70]:
# Demo: swap randomly chosen words with their right-hand neighbour.
# target_words is only read by the 'substitute' action, so it has no effect here.
text = "This is a simple example sentence for testing."
augmented_text = random_augment(text, action=Action.SWAP, target_words=["awesome", "great"])
print(augmented_text)

('This is a simple example sentence for testing.', 'This is a example simple sentence testing. for')


In [71]:
# Demo: delete randomly chosen words.
# target_words is only read by the 'substitute' action, so it has no effect here.
text = "This is a simple example sentence for testing."
augmented_text = random_augment(text, action=Action.DELETE, target_words=["awesome", "great"])
print(augmented_text)

('This is a simple example sentence for testing.', 'This is a example sentence for')


In [72]:
# Demo: remove one contiguous run of words starting at a random position.
# target_words is only read by the 'substitute' action, so it has no effect here.
text = "This is a simple example sentence for testing."
augmented_text = random_augment(text, action=Action.CROP, target_words=["awesome", "great"])
print(augmented_text)

('This is a simple example sentence for testing.', 'This is a simple example testing.')
