In [1]:
import checklist
import math
import nltk
import numpy as np
import random
import spacy

from checklist.perturb import Perturb
from datasets import load_dataset, dataset_dict
from progress.bar import ShadyBar
from typing import Tuple

In [2]:
nlp = spacy.load("en_core_web_sm")
data_set = load_dataset('cnn_dailymail', '3.0.0')

Reusing dataset cnn_dailymail (/home/philko/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602)
  return torch._C._cuda_getDeviceCount() > 0


In [3]:
type(data_set)

datasets.dataset_dict.DatasetDict

In [4]:
len(data_set['train']) + len(data_set['validation']) + len(data_set['test'])

311971

In [5]:


def concat_data_set(data_set : dataset_dict.DatasetDict, cutoff_fun : callable) -> list:

    data : list = list()

    bar = ShadyBar('Creating dataset', max=len(data_set['train']) + len(data_set['validation']) + len(data_set['test']))

    for e in data_set['train']:
        data.append(
            cutoff_fun(e)
        )
        bar.next()

    for e in data_set['validation']:
        data.append(
            cutoff_fun(e)
        )
        bar.next()

    for e in data_set['test']:
        data.append(
            cutoff_fun(e)
        )
        bar.next()
    bar.finish()

    return data

cnn_cutoff = lambda elem : {'article': elem['article'], 'id': elem['id']}
concat_data_set(data_set, cnn_cutoff)

Two of these three dizzying moves ended up in victory, and one did not. Two out of three isn\'t bad. Why shouldn\'t Palin think another gamble might pay off? Palin herself may not know her next move. Speculation about her presidential ambitions is premature, though it will be much easier for her to build a national organization now that she has no professional ties to Alaska. Whatever she does will be noticed, that\'s for sure. Because the attention lavished on Palin\'s decision is further evidence of her unwitting ability to bring out deep-seated feelings of admiration -- and loathing -- in people. We will be hearing from Palin, and from the Palin-haters, for a long while to come.',
  'id': '02c971cf94ad3b1696742544778f06cf8a2b1c23'},
 {'article': '(CNN) -- Holders Bayern Munich equaled the record for successive Champions League wins held by their manager Pep Guardiola\'s former side Barcelona as they reached the knockout stages Tuesday. Bayern eased past Czech champions Viktoria Pils

In [6]:
    def negate_data(sentences : list) -> Tuple[list, list]:
        """ Perturbation function

        Complexity is O(n^2).

        Parameters
        ----------
        text : str
            text, passed as a string.
        
        Returns
        -------
        list
            list of sentences

        Raises
        ------
        Exception
            if percentage is not in the interval [0,1]
        """
        return_list : list = list()
        not_negated : list = list()

        for index, sentence in enumerate(list(nlp.pipe(sentences))):
            sent_tmp : str = sentence.text
            try:
                ret = Perturb.perturb([sentence], Perturb.add_negation, keep_original=False)
                if len(ret.data) > 0:
                    sent_tmp = ret.data[0][0]
                else:
                    not_negated.append(index)
                    print("Couldn't find proper negation.")

            except TypeError:
                not_negated.append(index)
                print("Couldn't find proper negation.")

            return_list.append(sent_tmp)


        return return_list, not_negated 

In [7]:
def tokenize_sentences(text : str) -> list:
        return nltk.sent_tokenize(text)

In [8]:
data = data_set['train'][0]['article']

In [9]:
sentences = tokenize_sentences(data)

In [10]:
negate_data(sentences=sentences)
data

RuntimeError: generator raised StopIteration

In [41]:

    def createRepetitions(\
            sentences : list,\
            doc : spacy.tokens.doc.Doc,\
            sent_ind : int,\
            phraseLength : int,\
            nTimes : int) -> bool:
        """ Creating Repetitions in one sentence

        Function to create repetitions in one sentence. To avoid 
        the repitition of punctations, only phrase without punctuations 
        will be choosen. Alteration is done inplace.
        Complexity is O(n).

        Parameter
        ---------
        sentences : list
            list of sentences in which one sentence will be perturbated
        doc : pacy.tokens.doc.Doc
            parsed tokens as a spaCy doc
        sent_ind : int
            index of sentence to be perturbated
        phraseLength : int
            length of a phrase to be repeated
        nTimes : int
            number of times the phrase will be repeated
        """
        # subtract 1 because of indexing
        for i in reversed(range(phraseLength - 1, len(doc))):
            token_slice = doc[(i - phraseLength):i]
            if not True in [token.pos_ == 'PUNCT' for token in token_slice]:

                index = doc[i].idx

                rep = " ".join([token.text for token in token_slice])
                further_tokens = " ".join([token.text for token in doc[i:len(doc)]])
                sentences[sent_ind ] = sentences[sent_ind ][0:index] + " " + rep + further_tokens

                #print(f"Repetition for a phrase with %i words %i times added. Sentence No.: %i" % (phraseLength, nTimes, sent_ind))
                return True
        return False



    def repeat_words(sentences : list, nTimes : int = 3, phraseLength : int = 4):
        """ Repeat wordssfunction

        Function to repeats some words in every sentence of the text. With the passed probability, a sentence in the text is perturbated.
        Complexity is O(n^2) (function call).

        Parameter
        ---------
        sentences : list
            list of already tokenized sentence tokens
        nTimes : int
            number of repetitions of each phrase
        phraseLength : int
            length of phrase to be repeated
            
        Returns
        -------
        list, list
            list of sentences, list of sentences where a phrase is repeated

        """

        indices : list = []
        for i in range(len(sentences)):
            
            tokens = nlp(sentences[i])

            if len(tokens) <= phraseLength:
                continue

            if createRepetitions(sentences=sentences, doc=tokens, sent_ind =i, phraseLength=phraseLength, nTimes=nTimes):
                indices.append(i)

        return sentences, indices

In [42]:
repeat_words(sentences=sentences)

(["It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria military force in Syria.",
  'Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons use of chemical weapons.',
  'The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction weapons of mass destruction. "',
  "It's a step that is set to turn an international crisis into a fierce domestic political battle fierce domestic political battle.",
  'There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria inspectors find in Syria?',
  'What happens if Congress votes no if Congress votes no?',
  'And how will the Syrian gover

In [43]:
def swap_pair(sentence : str, doc : spacy.tokens.doc.Doc) -> str:
    """ Swap pair function

    Function to swap one random pair of words. Using the random sample function,
    two elements are choosen and swaped later on.

    Parameter
    ---------
    sentence : str
        sentence to be deteriorated
    doc : spacy.tokens.doc.Doc
        spacy document, to extract the indices of the tokens from

    Returns
    -------
    str
        deteriorated sentence
    """
    
    candidates : list = []
    candidates_text : list = []

    for i in range(len(doc)):

        lower_text = doc[i].text.lower()

        if doc[i].pos_ != "PUNCT" and not lower_text in candidates_text:
            candidates.append(i)
            candidates_text.append(lower_text)
        else:
            continue

    pair : list = random.sample(candidates, 2)
    first, second = (pair[0], pair[1]) if pair[0] < pair[1] else (pair[1], pair[0])
 
    first_bounds, second_bounds = \
        (doc[first].idx, doc[first].idx + len(doc[first].text)), \
        (doc[second].idx, doc[second].idx + len(doc[second].text))

    first_token, second_token = \
        sentence[first_bounds[0]:first_bounds[1]], \
        sentence[second_bounds[0]:second_bounds[1]]
    
    return sentence[0:(first_bounds[0])] + second_token + " " +\
        sentence[(first_bounds[1] + 1):(second_bounds[0])] + first_token + \
        sentence[(second_bounds[1])::]

    

def word_swap(sentences : list):
    """ Word swap function

    Function to swap words in every sentence as far as it is possible. 

    Parameter
    ---------
    sentences : list
        list of already tokenized sentence tokens

    Returns
    -------
    list, list
        list of sentences, list of indices of sentences with a swapped word
    """
    ret_list : list = []
    indices : list = []

    for i, sentence in enumerate(sentences):

        times : int = random.randrange(1, 5)
        new_sentence = sentence

        for _ in range(times):
            new_sentence = swap_pair(sentence=new_sentence, doc=nlp(new_sentence))

        ret_list.append(new_sentence)
        indices.append(i)
    return ret_list, indices

In [44]:
word_swap(sentences)

(["It's official: Barack President U.S. Obama wants lawmakers to weigh military on whether to use in force in Syria military force in Syria.",
  'is sent a letter right the heads of the House and Senate on Saturday night, hours after announcing that he believes military action weapons Syrian targets Obama the to step to take over the alleged use of chemical against use of chemical weapons.',
  'The proposed future from Obama asks Congress destruction approve the use of military force "to deter, disrupt, prevent and degrade the potential for legislation uses of chemical weapons or other weapons of mass to weapons of mass destruction. "',
  'political crisis a step that to set is domestic an international s into a fierce turn It battle fierce domestic political battle.',
  'There in key questions looming over inspectors debate: the did What weapons U.N. find are Syria inspectors find in Syria?',
  'no happens Congress if votes What if Congress votes no?',
  'the react how will Syrian gov

In [45]:
def drop_single(sentence : str, doc : spacy.tokens.doc.Doc) -> list:
    """ Drop single word function

    Function to drop a single word from a sentence. 

    Parameter
    ---------
    sentence : str
        sentence to be deteriorated
    doc : spacy.tokens.doc.Doc
        spacy document, to extract the indices of the token from

    Returns
    -------
    str
        deteriorated sentence
    """

    candidates : list = []

    for i in range(len(doc)):

        if doc[i].pos_ != "PUNCT":
            candidates.append(i)
        else:
            continue
    
    to_drop : int = random.randrange(0, len(doc))

    bounds = doc[to_drop].idx, doc[to_drop].idx + len(doc[to_drop].text)

    return sentence[0:bounds[0]] + sentence[(bounds[1] + 1)::]
    


def word_drop(text : str):
    """ Word drop function

    Function to drop random words from a sentence.

    Parameter
    ---------
    sentences : list
        list of already tokenized sentence tokens

    Returns
    -------
    list
        list of sentences
    """


    ret_list : list = []
    indices : list = []

    for i, sentence in enumerate(sentences):

            times : int = random.randrange(1, 5)
            new_sentence = sentence

            for _ in range(times):
                new_sentence = drop_single(sentence=new_sentence, doc=nlp(new_sentence))

            ret_list.append(new_sentence)
            indices.append(i)
    return ret_list, indices

In [46]:
word_drop(sentences)

(["It's official: U.S. Barack Obama wants lawmakers to weigh in on whether to use military force in Syria military force in Syria.",
  'Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons use chemical weapons.',
  'The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential future uses of chemical weapons or other weapons of mass destruction weapons of mass destruction. "',
  "It's a step is set to turn an international crisis into a fierce domestic political battle fierce domestic political battle.",
  'There are key questions over the debate: What did U.N. weapons inspectors find in Syria inspectors find in Syria?',
  'What happens Congress votes no Congress no',
  'And how will the Syrian government react the Syrian government ',
  'In 

In [None]:
def drop_single_pos(sentence : str, doc : spacy.tokens.doc.Doc, pos : str) -> list:
    """ Drop single word function

    Function to drop words based on their POS tag. 

    Parameter
    ---------
    sentence : str
        sentence to be deteriorated
    doc : spacy.tokens.doc.Doc
        spacy document, to extract the indices of the token from
    pos : str
        pos tag whose words will be dropped

    Returns
    -------
    str
        deteriorated sentence
    """

    candidates : list = []

    for i in range(len(doc)):

        if doc[i].pos_ == pos:
            candidates.append(i)
        else:
            continue
    
    new_sentence = sentence
    diff : int = 0
    for i in candidates:
        bounds = doc[i].idx - diff, doc[i].idx + len(doc[i].text) - diff
        print(sentence)
        print(diff)
        sentence = sentence[0:bounds[0]] + sentence[(bounds[1] + 1)::]
        diff += doc[i].idx + len(doc[i].text) + 1

    return sentence
    


def pos_drop(text : str, pos : str):
    """ POS drop function

    Function to drop random words with a specific POS-Tag from a sentence.

    Parameter
    ---------
    sentences : list
        list of already tokenized sentence tokens

    Returns
    -------
    list
        list of sentences
    """


    ret_list : list = []
    indices : list = []

    for i, sentence in enumerate(sentences):

            times : int = random.randrange(1, 5)
            new_sentence = sentence

            for _ in range(times):
                new_sentence = drop_single_pos(sentence=new_sentence, doc=nlp(new_sentence), pos=pos)

            ret_list.append(new_sentence)
            indices.append(i)
    return ret_list, indices

In [51]:
from spacy import displacy
print(pos_drop([sentences[2]], "ADJ")[0])

test = nlp(sentences[2])
displacy.serve(test, style="dep")

 proposetion from Obama asks Congress to approve the force "to deter, disrupt, preventdegrade the potential for uses of chemical weother weapons of mass destruction weapons of mass destruction. "



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

