# IMPORT

Download dataset CNN / Dailymail

In [1]:
import checklist
import math
import nltk
import numpy as np
import random
import spacy

from checklist.perturb import Perturb
from datasets import load_dataset
from progress.bar import ShadyBar
from typing import Tuple




### Import example dataset

In [2]:

nlp = spacy.load("en_core_web_sm")
data_set = load_dataset('cnn_dailymail', '3.0.0')

Reusing dataset cnn_dailymail (/home/philko/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602)
  return torch._C._cuda_getDeviceCount() > 0


## Negation

In [5]:
def perturb_data(text : str, percentage : float) -> list:
    """ Perturbation function

    Parameters
    ----------
    text : str
        text, passed as a string.
    percentage : float
        float, in [0,1] which controls the amount of perturbated samples.
    
    Returns
    -------
    list
        list of sentences

    Raises
    ------
    Exception
        if percentage is not in the interval [0,1]
    """

    if percentage < 0 or percentage > 1:
        raise Exception("ERROR: percentage must be in [0,1].")

    sentences = nltk.sent_tokenize(text)
    doc = list(nlp.pipe(sentences))

    loopCond : bool = True
    maxLen : int = len(sentences)
    counter : int = 0

    while loopCond:

        if counter > maxLen:
            print("Couldn't find proper negations.")
            loopCond = False

        indices = random.sample(range(len(sentences)), math.floor(percentage * len(sentences)))

        for i in indices:

            ret = None

            try:
                ret = Perturb.perturb([doc[i]], Perturb.add_negation, keep_original=False)
                if len(ret.data) > 0:
                    sentences[i] = ret.data[0][0]
                    print(f"Sentence %i: Negated." % i)
                else:
                    loopCond = False

            except TypeError:
                print("Couldn't find proper negation. Another sentence will be tried.")
                loopCond = True

            loopCond = False
        
        counter += 1

    return sentences



### Test

In [6]:
n_test = 1

for e in range(n_test):
    perc_random = 0.1
    print(perturb_data(data_set['test'][0]['article'], perc_random))
    print(f"Negated %f of data %i" % (perc_random, e))

Sentence 15: Negated.
Sentence 1: Negated.
['(CNN)James Best, best known for his portrayal of bumbling sheriff Rosco P. Coltrane on TV\'s "The Dukes of Hazzard," died Monday after a brief illness.', 'He was not 88.', 'Best died in hospice in Hickory, North Carolina, of complications from pneumonia, said Steve Latshaw, a longtime friend and Hollywood colleague.', 'Although he\'d been a busy actor for decades in theater and in Hollywood, Best didn\'t become famous until 1979, when "The Dukes of Hazzard\'s" cornpone charms began beaming into millions of American homes almost every Friday night.', 'For seven seasons, Best\'s Rosco P. Coltrane chased the moonshine-running Duke boys back and forth across the back roads of fictitious Hazzard County, Georgia, although his "hot pursuit" usually ended with him crashing his patrol car.', 'Although Rosco was slow-witted and corrupt, Best gave him a childlike enthusiasm that got laughs and made him endearing.', 'His character became known for his d

## Word repetitions

In [7]:
def createRepetitions(sentences, tokens, sentence, phraseLength, nTimes):
    """
    """
    # subtract 1 because of indexing
    for i in reversed(range(phraseLength - 1, len(tokens))):
        token_slice = tokens[(i - phraseLength):i]
        if not True in [token.pos_ == 'PUNCT' for token in token_slice]:
            rep = " ".join([token.text for token in token_slice])

            further_tokens = " ".join([token.text for token in tokens[i:len(tokens)]])
            index = tokens[i].idx

            sentences[sentence] = sentences[sentence][0:index] + " " + rep + further_tokens
            print(f"Repetition for a phrase with %i words %i times added. Sentence No.: %i" % (phraseLength, nTimes, sentence))
            break

        else:
            continue



def repeat_words(text : str, probability : float, nTimes : int = 3, phraseLength : int = 4) -> list:
    """
    """
    if probability > 1 or probability < 0:
        print("probability must be in [0,1]")
        return []

    sentences = nltk.sent_tokenize(text)
    for sentence in range(len(sentences)):
        
        # if function returns 1, repeat phrase
        if bool(np.random.binomial(size=1, n=1, p= probability)):

            tokens = nlp(sentences[sentence])

            if len(tokens) <= phraseLength:
                continue

            createRepetitions(sentences=sentences, tokens=tokens, sentence=sentence, phraseLength=phraseLength, nTimes=nTimes)
    
    return sentences

            


### Test

In [8]:
ex = "CNN's Stella Chan contributed to this story."
exList = [ex]
createRepetitions(exList, nlp(ex), 0, 3, 2)
exList

Repetition for a phrase with 3 words 2 times added. Sentence No.: 0


["CNN's Stella Chan contributed to this story to this story."]

In [9]:
i = 1

data = data_set['train'][i]['article']

repeat_words(data, 0.1)

Repetition for a phrase with 4 words 3 times added. Sentence No.: 12


["(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay.",
 'The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds.',
 'The U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover.',
 'The 26-year-old Bolt has now collected eight gold medals at world championships, equaling the record held by American trio Carl Lewis, Michael Johnson and Allyson Felix, not to mention the small matter of six Olympic titles.',
 'The relay triumph followed individual successes in the 100 and 200 meters in the Russian capital.',
 '"I\'m proud of myself and I\'ll continue to work to dominate for as long as possible," Bolt said, having previously expressed his intention to carry on until the 2016 Rio Olympics.'

## Word swap and drop

In [102]:
def swap_pair(sentence : str, doc : spacy.tokens.doc.Doc) -> str:
    """ Swap pair function

    Function to swap one random pair of words. Using the random sample function,
    two elements are choosen and swaped later on.

    Parameter
    ---------
    sentence : str
        sentence to be deteriorated
    doc : spacy.tokens.doc.Doc
        spacy document, to extract the indices of the tokens from

    Returns
    -------
    str
        deteriorated sentence
    """
    
    candidates : list = []
    candidates_text : list = []

    for i in range(len(doc)):

        lower_text = doc[i].text.lower()

        if doc[i].pos_ != "PUNCT" and not lower_text in candidates_text:
            candidates.append(i)
            candidates_text.append(lower_text)
        else:
            continue

    pair : list = random.sample(candidates, 2)
    first, second = (pair[0], pair[1]) if pair[0] < pair[1] else (pair[1], pair[0])
 
    first_bounds, second_bounds = \
        (doc[first].idx, doc[first].idx + len(doc[first].text)), \
        (doc[second].idx, doc[second].idx + len(doc[second].text))

    first_token, second_token = \
        sentence[first_bounds[0]:first_bounds[1]], \
        sentence[second_bounds[0]:second_bounds[1]]
    
    return sentence[0:(first_bounds[0])] + second_token + " " +\
        sentence[(first_bounds[1] + 1):(second_bounds[0])] + first_token + \
        sentence[(second_bounds[1])::]

    

def word_swap(text : str, probability : float) -> list:
    """ Word swap function

    Function to swap a specific amound of words, determined by the probability parameter. A whole 
    text is passed, split into sentences and later on deteriorated to a specific degree.

    Parameter
    ---------
    text : str
        Whole text which wil be deteriorated
    probability : float
        float in [0,1]

    Raises
    ------
    Exception
        if probability is not in [0,1]

    Returns
    -------
    list
        list of sentences
    """

    if probability < 0 or probability > 1:
        raise Exception("Probability must be a number in [0,1].")

    sentences = nltk.sent_tokenize(text)

    ret_list : list = []

    for i, sentence in enumerate(sentences):

        if bool(np.random.binomial(size=1, n=1, p= probability)):

            times : int = random.randrange(1, 5)
            new_sentence = sentence

            for _ in range(times):
                new_sentence = swap_pair(sentence=new_sentence, doc=nlp(new_sentence))

            ret_list.append(new_sentence)
            print("Sentence %i: Swapped %i times" % (i, times))

        else:
            ret_list.append(sentence)
    return ret_list

In [98]:
def drop_single(sentence : str, doc : spacy.tokens.doc.Doc) -> list:
    """ Drop single word function

    Function to drop a single word from a sentence. 

    Parameter
    ---------
    sentence : str
        sentence to be deteriorated
    doc : spacy.tokens.doc.Doc
        spacy document, to extract the indices of the token from

    Returns
    -------
    str
        deteriorated sentence
    """

    candidates : list = []

    for i in range(len(doc)):

        if doc[i].pos_ != "PUNCT":
            candidates.append(i)
        else:
            continue
    
    to_drop : int = random.randrange(0, len(doc))

    bounds = doc[to_drop].idx, doc[to_drop].idx + len(doc[to_drop].text)

    return sentence[0:bounds[0]] + sentence[(bounds[1] + 1)::]
    


def word_drop(text : str, probability : float) -> list:
    """ Word drop function

    Function to drop a specific amound of words, determined by the probability parameter. A whole 
    text is passed, split into sentences and later on deteriorated to a specific degree.

    Parameter
    ---------
    text : str
        Whole text which wil be deteriorated
    probability : float
        float in [0,1]

    Raises
    ------
    Exception
        if probability is not in [0,1]

    Returns
    -------
    list
        list of sentences
    """

    if probability < 0 or probability > 1:
        raise Exception("Probability must be a number in [0,1].")

    sentences = nltk.sent_tokenize(text)

    ret_list : list = []

    for i, sentence in enumerate(sentences):

        if bool(np.random.binomial(size=1, n=1, p= probability)):

            times : int = random.randrange(1, 5)
            new_sentence = sentence

            for _ in range(times):
                new_sentence = drop_single(sentence=new_sentence, doc=nlp(new_sentence))

            ret_list.append(new_sentence)
            print("Sentence %i: Dropped %i words" % (i, times))

        else:
            ret_list.append(sentence)
    return ret_list

### Test

In [101]:
ex = 'The relay triumph followed individual successes in the 100 and 200 meters in the Russian capital.'
doc = nlp(ex)
swap_pair(ex, doc)

>individual<
>in<


'The relay triumph followed in successes individual the 100 and 200 meters in the Russian capital.'

In [100]:
ex = 'The relay triumph followed individual successes in the 100 and 200 meters in the Russian capital.'
doc = nlp(ex)
drop_single(ex, doc)

'The relay triumph followed individual in the 100 and 200 meters in the Russian capital.'

In [216]:
i = 1

data = data_set['train'][i]['article']
word_swap(data, 0.1)

Sentence 1 swapped 2 times
Sentence 12 swapped 4 times
Sentence 13 swapped 4 times
Sentence 16 swapped 1 times


["(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay.",
 'The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican  37.36 ailey of Nesta Carter, Kemar  quartet -Cole, Nickel Ashmeade and Bolt won in     seconds.',
 'The U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover.',
 'The 26-year-old Bolt has now collected eight gold medals at world championships, equaling the record held by American trio Carl Lewis, Michael Johnson and Allyson Felix, not to mention the small matter of six Olympic titles.',
 'The relay triumph followed individual successes in the 100 and 200 meters in the Russian capital.',
 '"I\'m proud of myself and I\'ll continue to work to dominate for as long as possible," Bolt said, having previously expressed his intention to carry on until the 2016 Rio Olym

In [223]:
i = 1

data = data_set['train'][i]['article']
word_drop(data, 0.1)

Sentence 8: Dropped 1 words
Sentence 14: Dropped 2 words


["(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay.",
 'The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds.',
 'The U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover.',
 'The 26-year-old Bolt has now collected eight gold medals at world championships, equaling the record held by American trio Carl Lewis, Michael Johnson and Allyson Felix, not to mention the small matter of six Olympic titles.',
 'The relay triumph followed individual successes in the 100 and 200 meters in the Russian capital.',
 '"I\'m proud of myself and I\'ll continue to work to dominate for as long as possible," Bolt said, having previously expressed his intention to carry on until the 2016 Rio Olympics.'

## Adjective drop

In [224]:
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
test = "She later attempted to clarify her comments, but there were renewed calls by gay rights groups for a boycott of the 2014 Winter Games in Sochi, the next major sports event in Russia."
doc = nlp(test)
displacy.serve(doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
