In [32]:
import checklist
import math
import nltk
import numpy as np
import random
import spacy

from checklist.perturb import Perturb
from datasets import load_dataset, dataset_dict
from progress.bar import ShadyBar
from typing import Tuple

In [33]:
nlp = spacy.load("en_core_web_sm")
data_set = load_dataset('cnn_dailymail', '3.0.0')

Reusing dataset cnn_dailymail (/home/philko/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602)


In [34]:
type(data_set)

datasets.dataset_dict.DatasetDict

In [35]:
len(data_set['train']) + len(data_set['validation']) + len(data_set['test'])

311971

In [36]:


def concat_data_set(data_set : dataset_dict.DatasetDict, cutoff_fun : callable) -> list:

    data : list = list()

    bar = ShadyBar('Creating dataset', max=len(data_set['train']) + len(data_set['validation']) + len(data_set['test']))

    for e in data_set['train']:
        data.append(
            cutoff_fun(e)
        )
        bar.next()

    for e in data_set['validation']:
        data.append(
            cutoff_fun(e)
        )
        bar.next()

    for e in data_set['test']:
        data.append(
            cutoff_fun(e)
        )
        bar.next()
    bar.finish()

    return data

cnn_cutoff = lambda elem : {'article': elem['article'], 'id': elem['id']}
concat_data_set(data_set, cnn_cutoff)

[{'article': 'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction." It\'s a step that is set to turn an international crisis into a fierce domestic political battle. There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react? In a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but be

In [37]:
    def negate_data(sentences : list) -> Tuple[list, list]:
        """ Perturbation function

        Complexity is O(n^2).

        Parameters
        ----------
        text : str
            text, passed as a string.
        
        Returns
        -------
        list
            list of sentences

        Raises
        ------
        Exception
            if percentage is not in the interval [0,1]
        """
        return_list : list = list()
        not_negated : list = list()

        for index, sentence in enumerate(list(nlp.pipe(sentences))):
            sent_tmp : str = sentence.text
            try:
                ret = Perturb.perturb([sentence], Perturb.add_negation, keep_original=False)
                if len(ret.data) > 0:
                    sent_tmp = ret.data[0][0]
                else:
                    not_negated.append(index)
                    print("Couldn't find proper negation.")

            except TypeError:
                not_negated.append(index)
                print("Couldn't find proper negation.")

            return_list.append(sent_tmp)


        return return_list, not_negated 

In [38]:
def tokenize_sentences(text : str) -> list:
        return nltk.sent_tokenize(text)

In [39]:
data = data_set['train'][0]['article']

In [65]:
sentences = tokenize_sentences(data)

In [41]:
negate_data(sentences=sentences)
data

Couldn't find proper negation.
Couldn't find proper negation.
Couldn't find proper negation.
Couldn't find proper negation.
Couldn't find proper negation.
Couldn't find proper negation.
Couldn't find proper negation.
Couldn't find proper negation.
Couldn't find proper negation.


'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction." It\'s a step that is set to turn an international crisis into a fierce domestic political battle. There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react? In a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but because he want

In [42]:

    def createRepetitions(\
            sentences : list,\
            doc : spacy.tokens.doc.Doc,\
            sent_ind : int,\
            phraseLength : int,\
            nTimes : int) -> bool:
        """ Creating Repetitions in one sentence

        Function to create repetitions in one sentence. To avoid 
        the repitition of punctations, only phrase without punctuations 
        will be choosen. Alteration is done inplace.
        Complexity is O(n).

        Parameter
        ---------
        sentences : list
            list of sentences in which one sentence will be perturbated
        doc : pacy.tokens.doc.Doc
            parsed tokens as a spaCy doc
        sent_ind : int
            index of sentence to be perturbated
        phraseLength : int
            length of a phrase to be repeated
        nTimes : int
            number of times the phrase will be repeated
        """
        # subtract 1 because of indexing
        for i in reversed(range(phraseLength - 1, len(doc))):
            token_slice = doc[(i - phraseLength):i]
            if not True in [token.pos_ == 'PUNCT' for token in token_slice]:

                index = doc[i].idx

                rep = " ".join([token.text for token in token_slice])
                further_tokens = " ".join([token.text for token in doc[i:len(doc)]])
                sentences[sent_ind ] = sentences[sent_ind ][0:index] + " " + rep + further_tokens

                #print(f"Repetition for a phrase with %i words %i times added. Sentence No.: %i" % (phraseLength, nTimes, sent_ind))
                return True
        return False



    def repeat_words(sentences : list, nTimes : int = 3, phraseLength : int = 4):
        """ Repeat wordssfunction

        Function to repeats some words in every sentence of the text. With the passed probability, a sentence in the text is perturbated.
        Complexity is O(n^2) (function call).

        Parameter
        ---------
        sentences : list
            list of already tokenized sentence tokens
        nTimes : int
            number of repetitions of each phrase
        phraseLength : int
            length of phrase to be repeated
            
        Returns
        -------
        list, list
            list of sentences, list of sentences where a phrase is repeated

        """

        indices : list = []
        for i in range(len(sentences)):
            
            tokens = nlp(sentences[i])

            if len(tokens) <= phraseLength:
                continue

            if createRepetitions(sentences=sentences, doc=tokens, sent_ind =i, phraseLength=phraseLength, nTimes=nTimes):
                indices.append(i)

        return sentences, indices

In [44]:
repeat_words(sentences=sentences)

(["It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria military force in Syria military force in Syria.",
  'Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons use of chemical weapons use of chemical weapons.',
  'The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction weapons of mass destruction weapons of mass destruction. "',
  "It's a step that is set to turn an international crisis into a fierce domestic political battle fierce domestic political battle fierce domestic political battle.",
  'There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria inspectors f

In [45]:
def swap_pair(sentence : str, doc : spacy.tokens.doc.Doc) -> str:
    """ Swap pair function

    Function to swap one random pair of words. Using the random sample function,
    two elements are choosen and swaped later on.

    Parameter
    ---------
    sentence : str
        sentence to be deteriorated
    doc : spacy.tokens.doc.Doc
        spacy document, to extract the indices of the tokens from

    Returns
    -------
    str
        deteriorated sentence
    """
    
    candidates : list = []
    candidates_text : list = []

    for i in range(len(doc)):

        lower_text = doc[i].text.lower()

        if doc[i].pos_ != "PUNCT" and not lower_text in candidates_text:
            candidates.append(i)
            candidates_text.append(lower_text)
        else:
            continue

    pair : list = random.sample(candidates, 2)
    first, second = (pair[0], pair[1]) if pair[0] < pair[1] else (pair[1], pair[0])
 
    first_bounds, second_bounds = \
        (doc[first].idx, doc[first].idx + len(doc[first].text)), \
        (doc[second].idx, doc[second].idx + len(doc[second].text))

    first_token, second_token = \
        sentence[first_bounds[0]:first_bounds[1]], \
        sentence[second_bounds[0]:second_bounds[1]]
    
    return sentence[0:(first_bounds[0])] + second_token + " " +\
        sentence[(first_bounds[1] + 1):(second_bounds[0])] + first_token + \
        sentence[(second_bounds[1])::]

    

def word_swap(sentences : list):
    """ Word swap function

    Function to swap words in every sentence as far as it is possible. 

    Parameter
    ---------
    sentences : list
        list of already tokenized sentence tokens

    Returns
    -------
    list, list
        list of sentences, list of indices of sentences with a swapped word
    """
    ret_list : list = []
    indices : list = []

    for i, sentence in enumerate(sentences):

        times : int = random.randrange(1, 5)
        new_sentence = sentence

        for _ in range(times):
            new_sentence = swap_pair(sentence=new_sentence, doc=nlp(new_sentence))

        ret_list.append(new_sentence)
        indices.append(i)
    return ret_list, indices

In [46]:
word_swap(sentences)

(["It's on Obama .S. whether Barack   lawmakers wants to weigh in official President to use military force in Syria military force in Syria military force in Syria.",
  'hours sent a letter to the weapons of the House and Senate on Saturday night, Obama after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical heads use of chemical weapons use of chemical weapons.',
  'The proposed legislation mass Obama asks Congress military approve the disrupt of to force "to deter, use, prevent and degrade the potential for future uses of chemical weapons or other weapons of from destruction weapons of mass destruction weapons of mass destruction. "',
  "Itstep turn 's that is set to a political international crisis into a domestic fierce an battle fierce domestic political battle fierce domestic political battle.",
  'There are key weapons looming over questions debate: What the in did inspectors find U.N. Syria inspectors f

In [47]:
def drop_single(sentence : str, doc : spacy.tokens.doc.Doc) -> list:
    """ Drop single word function

    Function to drop a single word from a sentence. 

    Parameter
    ---------
    sentence : str
        sentence to be deteriorated
    doc : spacy.tokens.doc.Doc
        spacy document, to extract the indices of the token from

    Returns
    -------
    str
        deteriorated sentence
    """

    candidates : list = []

    for i in range(len(doc)):

        if doc[i].pos_ != "PUNCT":
            candidates.append(i)
        else:
            continue
    
    to_drop : int = random.randrange(0, len(doc))

    bounds = doc[to_drop].idx, doc[to_drop].idx + len(doc[to_drop].text)

    return sentence[0:bounds[0]] + sentence[(bounds[1] + 1)::]
    


def word_drop(text : str):
    """ Word drop function

    Function to drop random words from a sentence.

    Parameter
    ---------
    sentences : list
        list of already tokenized sentence tokens

    Returns
    -------
    list
        list of sentences
    """


    ret_list : list = []
    indices : list = []

    for i, sentence in enumerate(sentences):

            times : int = random.randrange(1, 5)
            new_sentence = sentence

            for _ in range(times):
                new_sentence = drop_single(sentence=new_sentence, doc=nlp(new_sentence))

            ret_list.append(new_sentence)
            indices.append(i)
    return ret_list, indices

In [48]:
word_drop(sentences)

(["It's  U.S. President Barack Obama wants lawmakers weigh in on whether to use military force Syria military force in Syria military force Syria.",
  'Obama sent a letter to the heads of the House and Senate night, hours after announcing that he believes military action against Syrian is the right step to take over the alleged use of chemical weapons use of chemical weapons use of chemical weapons.',
  'The proposed legislation from Obama asks Congress to approve the use of military force "to   prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction weapons of mass destruction weapons of mass destruction. ',
  "It's a is set to turn an crisis into a fierce political battle fierce domestic political battle fierce domestic political battle.",
  'There are key questions looming over the debate: What U.N. inspectors find in Syria inspectors find Syria inspectors find in Syria?',
  'What happens if Congress votes no votes no if Congress no

In [67]:
def drop_single_pos(sentence : str, doc : spacy.tokens.doc.Doc, pos : str) -> list:
    """ Drop single word function

    Function to drop words based on their POS tag. 

    Parameter
    ---------
    sentence : str
        sentence to be deteriorated
    doc : spacy.tokens.doc.Doc
        spacy document, to extract the indices of the token from
    pos : str
        pos tag whose words will be dropped

    Returns
    -------
    str
        deteriorated sentence
    """

    candidates : list = []

    for i in range(len(doc)):

        if doc[i].pos_ == pos:
            candidates.append(i)
        else:
            continue
    
    if len(candidates) == 0:
        return sentence, False
    
    diff : int = 0
    for i in candidates:
        bounds = doc[i].idx - diff, doc[i].idx + len(doc[i].text) - diff
        sentence = sentence[0:bounds[0]] + sentence[(bounds[1] + 1)::]
        diff += len(doc[i].text) + 1
    

    return sentence, True
    


def pos_drop(sentences : list, pos : str):
    """ POS drop function

    Function to drop random words with a specific POS-Tag from a sentence.

    Parameter
    ---------
    sentences : list
        list of already tokenized sentence tokens

    Returns
    -------
    list
        list of sentences
    """

    ret_list : list = []
    indices : list = []

    for i, sentence in enumerate(sentences):
            new_sentence = sentence
            new_sentence, success = drop_single_pos(sentence=new_sentence, doc=nlp(new_sentence), pos=pos)

            if success:
                indices.append(i)
            ret_list.append(new_sentence)
 
    return ret_list, indices

In [70]:
from spacy import displacy
print(sentences[2])
print("\n")
print(pos_drop(sentences, "ADJ"))


The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction."


(["It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use force in Syria.", 'Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes action against targets is the step to take over the alleged use of weapons.', 'The proposed legislation from Obama asks Congress to approve the use of force "to deter, disrupt, prevent and degrade the potential for uses of weapons or weapons of destruction."', "It's a step that is set to turn an crisis into a battle.", 'There are questions looming over the debate: What did U.N. weapons inspectors find in Syria?', 'What happens if Congress votes no?', 'And how will the government react?', 'In a televised address from the White House Rose Garden earli

In [73]:
test = nlp(sentences[5])
displacy.serve(test, style="dep")



OSError: [Errno 98] Address already in use