In [1]:

import csv
from datasets import load_dataset
from textblob import TextBlob
from transformers import pipeline, AutoTokenizer, TFAutoModelForSeq2SeqLM, AutoModelForSeq2SeqLM
from random import sample
from torch import load




INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


# Word lists
Custom (and hopefully eventually curated) word lists for finding corpus items that might cause various FATE issues.

In [2]:
# ethnicity related words
ewl = ['black', 'african-american', 'arab', 'mexican', 'asian', 'african', 'korean', 'japanese', 'chinese', 'china', 'korea', 'africa', 'japan', 'mexico', 'nigger', 'nigga', 'terrorist', 'islam', 'hindu', 'india'] 

# gender and gender identity related words
gwl = ['she', 'woman', 'transexual', 'trans', 'gender', 'transgender', 'asexual', 'non-binary',  'gender-fluid', 'lgbt', 'lgbtq', "lbtq+" ] 

# plain non-subsective adjectives (Nayak et al., 2014)
nsawl = ['alleged', 'apparent', 'arguable', 'assumed', 'believed', 'debatable', 'disputed', 'doubtful', 'dubious', 'erroneous', 'expected', 'faulty', 'future', 'historic', 'impossible', 'improbable', 'likely', 'mistaken', 'ostensible', 'plausible', 'possible', 'potential', 'predicted', 'presumed', 'probable', 'putative', 'questionable', 'seeming', 'so-called', 'supposed', 'suspicious', 'theoretical', 'uncertain', 'unlikely', 'unsuccessful']


# Datasets
Various datasets from different ATS sub-domains. 

In [3]:
# CNN/DM -> news
# Format: split (train,test,validation), features (article, highlights)
cnn_dm = load_dataset('cnn_dailymail', '3.0.0')
cnn_dm = cnn_dm['validation']['article'] + cnn_dm['test']['article']

# Reddit TIFU -> blogs
# Format: split (train), features (ups, num_comments, upvote_ratio, score, documents, tldr)
tifu = load_dataset('reddit_tifu', 'long')

# SamSum -> dialogue
# Format: split (train, test, validation), features (id, dialogue, summary)
samsum = load_dataset('samsum')
samsum = samsum['validation']['dialogue'] + samsum['test']['dialogue']

Reusing dataset cnn_dailymail (C:\Users\ajule\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
Reusing dataset reddit_tifu (C:\Users\ajule\.cache\huggingface\datasets\reddit_tifu\long\1.1.0\bb5bea66e93c55965332f70dc693c38b9e3930a16e9e8a1323ef1a2c8a2fcee6)
Reusing dataset samsum (C:\Users\ajule\.cache\huggingface\datasets\samsum\samsum\0.0.0\3f7dba43be72ab10ca66a2e0f8547b3590e96c2bd9f2cbb1f6bb1ec1f1488ba6)


# Models

In [12]:
# BART
# Trained on large CNN/DM dataset
BART_CNN_TOKENIZER = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
BART_CNN_MODEL = TFAutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# Base BART model finetuned on samsum dataset
BART_samsum_TOKENIZER = AutoTokenizer.from_pretrained("lidiya/bart-base-samsum")
BART_samsum_MODEL = AutoModelForSeq2SeqLM.from_pretrained("lidiya/bart-base-samsum")

# PEGASUS
# Finetuned for TIFU dialogue/blogging dataset
# PEGASUS_TIFU_TOKENIZER = AutoTokenizer.from_pretrained("google/pegasus-reddit_tifu")
# PEGASUS_TIFU_MODEL = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-reddit_tifu")

# MatchSum
#match_model = load('MatchSum_cnndm_bert.ckpt')

# SMMRY (simple extractive algorithm used by the reddit autotldr bot)
# smmry_api = 'F780F04404'

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.
Downloading: 100%|██████████| 295/295 [00:00<00:00, 308kB/s]
Downloading: 100%|██████████| 1.68k/1.68k [00:00<00:00, 1.68MB/s]
Downloading: 100%|██████████| 798k/798k [00:00<00:00, 5.30MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 4.43MB/s]
Downloading: 100%|██████████| 1.36M/1.36M [00:00<00:00, 7.72MB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 119kB/s]
Downloading: 100%|██████████| 558M/558M [00:42<00:00, 13.1MB/s]


In [40]:
def find_items(corpus, wordlist, length=1000): 
    """
    List of strings, List of strings -> List of tuples (corpus item [string], words present [list of strings], sentences where words present [list of strings])
    Returns a random sample of specified length with items in the corpus that contain words from the given wordlist.
    """
    res = []
    for item in sample(corpus, length):
        keywords = []
        key_sentences = []
        sentences = TextBlob(item).sentences

        for sentence in sentences:
            set_key = False
            words = sentence.words.lower()

            for word in words:
                if word in wordlist:
                    set_key = True 
                    if word not in keywords:
                        keywords.append(word)

            if set_key == True:
                key_sentences.append(sentence)

        if len(keywords) != 0:
            res.append((item, keywords, key_sentences))
    
    return res

def print_item(item): 
    """
    Given 4-tuple with text, keywords, key sentences and summary, print them in a readable manner.
    """
    print("Text: ", item[0], "\nKeywords: ", item[1], "\nKey sentences: ", item[2], "\nSummary: ", item[3])
    
def show_item(item):
    """
    Given 4-tuple with text, keywords, key sentences and summary, return concatenated string.
    """
    return "Text: ", item[0], "\nKeywords: ", item[1], "\nKey sentences: ", item[2], "\nSummary: ", item[3]
    
def summ_dialogue(text):
    summarizer = pipeline("summarization", model="lidiya/bart-base-samsum")
    return summarizer(text)

In [6]:

corpus = samsum
eitems = find_items(corpus, ewl, 400)
gitems = find_items(corpus, gwl, 400)
aitems = find_items(corpus, nsawl, 400)

print('items done')
print('e: ', len(eitems), 'g: ', len(gitems), 'a: ', len(aitems))
e_res = []
g_res = []
a_res = []

items done
e:  12 g:  47 a:  29


In [55]:

ex = "Jack: It ain't right man. She  \r\nJohn: She's an alleged convict?! That is crazy but we have to remember it is not sure yet. \r\nJack: That is true, let's not jump to conclusions yet."

print(summ_dialogue(ex))


Your max_length is set to 128, but you input_length is only 65. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


[{'summary_text': 'Lydia is an alleged convict. John and Jack are not sure yet.'}]


In [8]:
summarizer = pipeline("summarization", model="lidiya/bart-base-samsum")
for item in eitems:
    e_res.append((item[0], item[1], item[2], summarizer(item[0])))
for item in gitems:
    g_res.append((item[0], item[1], item[2], summarizer(item[0])))
for item in aitems:
    a_res.append((item[0], item[1], item[2], summarizer(item[0])))

Your max_length is set to 128, but you input_length is only 90. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ..\aten\src\ATen\native\BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
Your max_length is set to 128, but you input_length is only 86. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 128, but you input_length is only 96. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 128, but you input_length is only 111. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 128, but you input_length is only 106. You might consider decreasing max_length manually, e.g. summari

In [38]:
print(len(a_res))
item = a_res[13]
print_item(item)

29
Text:  Jeff: Have you heard about this Christmas at Wall Street?
Peter: Yup, was apparently the worst since the end of the 19th century
Miranda: but it's bounced back since, no panic
Jeff: I've listened to some radio podcasts about the American economy
Jeff: and it's pretty scary, I mean the economic war on China that Trump is waging and all his unpredictability
Jeff: but I'm not an economist
Peter: I know, it's hard to say what may happen because Trump is rather unpredictable
Peter: But people in my bank seem calm, at least 2019 should be calm
Miranda: yup, calm, but the global economy will get weaker
Miranda: and the growth will be weaker but not tragic, I suppose
Jeff: and the EU? How do you think?
Miranda: We just don't know what will happen to the UK and with Brexit 
Miranda: There are many different scenarios possible
Miranda: but it seems that 2020 may be much worse
Peter: I agree, I'd expect a global economic meltdown in the early 2020s 
Keywords:  ['possible'] 
Key sentence

In [22]:
with open("dialogue_nsa.txt", 'w') as file:
    for item in a_res:
        print(show_item(item), '\n', file=file)
        

UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f601' in position 280: character maps to <undefined>