# IMPORT

Download dataset CNN / Dailymail

In [36]:
import checklist
import math
import nltk
import numpy as np
import random
import spacy

from checklist.perturb import Perturb
from datasets import load_dataset
from progress.bar import ShadyBar




Reusing dataset cnn_dailymail (/home/philko/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602)


### Import example dataset

In [37]:

nlp = spacy.load("en_core_web_sm")
data_set = load_dataset('cnn_dailymail', '3.0.0')

Reusing dataset cnn_dailymail (/home/philko/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602)


## Negation

In [3]:

i = 1

sentences = nltk.sent_tokenize(data_set['train'][i]['article'])
data = list(nlp.pipe(sentences))

ret = Perturb.perturb([data[0]], Perturb.add_negation, keep_original=False)
ret.data

[["(CNN) -- Usain Bolt didn't round off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay."]]

In [8]:
def perturb_data(text : str, percentage : float) -> list:
    """ Perturbation function

    Parameters
    ----------
    text : str
        text, passed as a raw string.
    percentage : float
        float, in [0,1] which controls the amount of perturbated samples.
    
    Returns
    -------
    list
        list of sentences

    Raises
    ------
    Exception
        if percentage is not in the interval [0,1]
    """

    if percentage < 0 or percentage > 1:
        raise Exception("ERROR: percentage must be in [0,1].")

    sentences = nltk.sent_tokenize(text)
    # indices = random.sample(range(len(sentences)), math.floor(percentage * len(sentences)))
    doc = list(nlp.pipe(sentences)) # type some spacy doc

    loopCond : bool = True
    maxLen : int = len(sentences)
    counter : int = 0

    while loopCond:

        if counter > maxLen:
            print("Couldn't find proper negations.")
            loopCond = False

        indices = random.sample(range(len(sentences)), math.floor(percentage * len(sentences)))

        for i in indices:

            ret = None

            try:
                ret = Perturb.perturb([doc[i]], Perturb.add_negation, keep_original=False)
                if len(ret.data) > 0:
                    sentences[i] = ret.data[0][0]
                else:
                    loopCond = False

            except TypeError:
                # print("Couldn't find proper negation. Another sentence will be tried.")
                loopCond = True

            loopCond = False
        
        counter += 1

    return sentences

n_test = 1

for e in range(n_test):
    perc_random = round(random.random(), 2)
    print(perturb_data(data_set['test'][0]['article'], perc_random))
    print(f"Negated %f of data %i" % (perc_random, e))

['(CNN)James Best, best known for his portrayal of bumbling sheriff Rosco P. Coltrane on TV\'s "The Dukes of Hazzard," died Monday after a brief illness.', 'He was 88.', "Best died in hospice in Hickory, North Carolina, of complications from pneumonia, didn't say Steve Latshaw, a longtime friend and Hollywood colleague.", 'Although he\'d been a busy actor for decades in theater and in Hollywood, Best didn\'t become famous until 1979, when "The Dukes of Hazzard\'s" cornpone charms began beaming into millions of American homes almost every Friday night.', 'For seven seasons, Best\'s Rosco P. Coltrane didn\'t chase the moonshine-running Duke boys back and forth across the back roads of fictitious Hazzard County, Georgia, although his "hot pursuit" usually ended with him crashing his patrol car.', "Although Rosco was slow-witted and corrupt, Best didn't give him a childlike enthusiasm that got laughs and made him endearing.", 'His character became known for his distinctive "kew-kew-kew" ch

In [38]:
sentences = nltk.sent_tokenize(data_set['test'][0]['article'])
sentences

['(CNN)James Best, best known for his portrayal of bumbling sheriff Rosco P. Coltrane on TV\'s "The Dukes of Hazzard," died Monday after a brief illness.',
 'He was 88.',
 'Best died in hospice in Hickory, North Carolina, of complications from pneumonia, said Steve Latshaw, a longtime friend and Hollywood colleague.',
 'Although he\'d been a busy actor for decades in theater and in Hollywood, Best didn\'t become famous until 1979, when "The Dukes of Hazzard\'s" cornpone charms began beaming into millions of American homes almost every Friday night.',
 'For seven seasons, Best\'s Rosco P. Coltrane chased the moonshine-running Duke boys back and forth across the back roads of fictitious Hazzard County, Georgia, although his "hot pursuit" usually ended with him crashing his patrol car.',
 'Although Rosco was slow-witted and corrupt, Best gave him a childlike enthusiasm that got laughs and made him endearing.',
 'His character became known for his distinctive "kew-kew-kew" chuckle and for 

## Word repetitions

In [44]:
def createRepetitions(sentences, tokens, sentence, phraseLength, nTimes):

    # subtract 1 because of indexing
    for i in reversed(range(phraseLength - 1, len(tokens))):
        token_slice = tokens[(i - phraseLength):i]
        if not True in [token.pos_ == 'PUNCT' for token in token_slice]:
            rep = " ".join([token.text for token in token_slice])

            further_tokens = " ".join([token.text for token in tokens[i:len(tokens)]])
            index = tokens[i].idx

            sentences[sentence] = sentences[sentence][0:index] + " " + rep + further_tokens
            print(f"Repetition for a phrase with %i words %i times added. Sentence No.: %i" % (phraseLength, nTimes, sentence))
            break

        else:
            continue



def repeat_words(text : str, probability : float, nTimes : int = 3, phraseLength : int = 4) -> list:
    sentences = nltk.sent_tokenize(text)
    for sentence in range(len(sentences)):
        
        # if function returns 1, repeat phrase
        if bool(np.random.binomial(size=1, n=1, p= probability)):

            tokens = nlp(sentences[sentence])

            if len(tokens) <= phraseLength:
                continue

            createRepetitions(sentences=sentences, tokens=tokens, sentence=sentence, phraseLength=phraseLength, nTimes=nTimes)
    
    return sentences

            


### Small demonstration

In [34]:
ex = "CNN's Stella Chan contributed to this story."
exList = [ex]
createRepetitions(exList, nlp(ex), 0, 3, 2)
exList

Repetition for a phrase with 3 words 2 times added


["CNN's Stella Chan contributed to this story to this story."]

In [47]:
i = 1

data = data_set['train'][i]['article']

repeat_words(data, 0.1)

Repetition for a phrase with 4 words 3 times added. Sentence No.: 2
Repetition for a phrase with 4 words 3 times added. Sentence No.: 4
Repetition for a phrase with 4 words 3 times added. Sentence No.: 6
Repetition for a phrase with 4 words 3 times added. Sentence No.: 16


["(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay.",
 'The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds.',
 'The U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover for a faulty handover.',
 'The 26-year-old Bolt has now collected eight gold medals at world championships, equaling the record held by American trio Carl Lewis, Michael Johnson and Allyson Felix, not to mention the small matter of six Olympic titles.',
 'The relay triumph followed individual successes in the 100 and 200 meters in the Russian capital in the Russian capital.',
 '"I\'m proud of myself and I\'ll continue to work to dominate for as long as possible," Bolt said, having previously expressed his intent

'(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men\'s 4x100m relay. The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds. The U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover. The 26-year-old Bolt has now collected eight gold medals at world championships, equaling the record held by American trio Carl Lewis, Michael Johnson and Allyson Felix, not to mention the small matter of six Olympic titles. The relay triumph followed individual successes in the 100 and 200 meters in the Russian capital. "I\'m proud of myself and I\'ll continue to work to dominate for as long as possible," Bolt said, having previously expressed his intention to carry on until the 2016 Rio Olympics. Victory was never se