In [16]:
import csv
from datasets import load_dataset
from textblob import TextBlob
from transformers import pipeline, AutoTokenizer, TFAutoModelForSeq2SeqLM, AutoModelForSeq2SeqLM, BartTokenizer, BartForConditionalGeneration, BartConfig
from random import sample
from torch import load
from time import sleep
import os
import keyboard
import testing

# Word lists
Custom (and hopefully eventually curated) word lists for finding corpus items that might cause various FATE issues.

In [3]:
# ethnicity related words

# gender and gender identity related words (excluding pronouns)
gwl = ['woman', 'transexual', 'trans', 'gender', 'transgender', 'asexual', 'non-binary',  'gender-fluid', 'lgbt', 'lgbtq', "lbtq+", "man", "feminine", "masculine"] 

# plain non-subsective adjectives (Nayak et al., 2014)
nsawl = ['alleged', 'apparent', 'arguable', 'assumed', 'believed', 'debatable', 'disputed', 'doubtful', 'dubious', 'erroneous', 'expected', 'faulty', 'future', 'historic', 'impossible', 'improbable', 'likely', 'mistaken', 'ostensible', 'plausible', 'possible', 'potential', 'predicted', 'presumed', 'probable', 'putative', 'questionable', 'seeming', 'so-called', 'supposed', 'suspicious', 'theoretical', 'uncertain', 'unlikely', 'unsuccessful']


# Datasets
Various datasets from different ATS sub-domains. 

In [4]:
# CNN/DM -> news
# Format: split (train,test,validation), features (article, highlights)
cnn_dm = load_dataset('cnn_dailymail', '3.0.0')
cnn_dm = cnn_dm['validation']['article'] + cnn_dm['test']['article']

# Reddit TIFU -> blogs
# Format: split (train), features (ups, num_comments, upvote_ratio, score, documents, tldr)
tifu = load_dataset('reddit_tifu', 'long')

# SamSum -> dialogue
# Format: split (train, test, validation), features (id, dialogue, summary)
samsum = load_dataset('samsum')
samsum = samsum['validation']['dialogue'] + samsum['test']['dialogue']

Reusing dataset cnn_dailymail (C:\Users\ajule\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
Reusing dataset reddit_tifu (C:\Users\ajule\.cache\huggingface\datasets\reddit_tifu\long\1.1.0\bb5bea66e93c55965332f70dc693c38b9e3930a16e9e8a1323ef1a2c8a2fcee6)
Reusing dataset samsum (C:\Users\ajule\.cache\huggingface\datasets\samsum\samsum\0.0.0\3f7dba43be72ab10ca66a2e0f8547b3590e96c2bd9f2cbb1f6bb1ec1f1488ba6)


# Models

In [5]:
# BART
# Trained on large CNN/DM dataset
BART_CNN_TOKENIZER = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
BART_CNN_MODEL = TFAutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# Base BART model finetuned on samsum dataset
BART_samsum_TOKENIZER = AutoTokenizer.from_pretrained("lidiya/bart-base-samsum")
BART_samsum_MODEL = AutoModelForSeq2SeqLM.from_pretrained("lidiya/bart-base-samsum")

# PEGASUS
# Finetuned for TIFU dialogue/blogging dataset
# PEGASUS_TIFU_TOKENIZER = AutoTokenizer.from_pretrained("google/pegasus-reddit_tifu")
# PEGASUS_TIFU_MODEL = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-reddit_tifu")

# MatchSum
#match_model = load('MatchSum_cnndm_bert.ckpt')

# SMMRY (simple extractive algorithm used by the reddit autotldr bot)
# smmry_api = 'F780F04404'

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [6]:
def find_items(corpus, wordlist, length=1000): 
    """
    List of strings, List of strings -> List of tuples (corpus item [string], words present [list of strings], sentences where words present [list of strings])
    Returns a random sample of specified length with items in the corpus that contain words from the given wordlist.
    """
    res = []
    for item in sample(corpus, length):
        keywords = []
        key_sentences = []
        sentences = TextBlob(item).sentences

        for sentence in sentences:
            set_key = False
            words = sentence.words.lower()

            for word in words:
                if word in wordlist:
                    set_key = True 
                    if word not in keywords:
                        keywords.append(word)

            if set_key == True:
                key_sentences.append(sentence)

        if len(keywords) != 0:
            res.append((item, keywords, key_sentences))
    
    return res

def print_item(item): 
    """
    Given 4-tuple with text, keywords, key sentences and summary, print them in a readable manner.
    """
    print("Keywords: ", item[1], "\nKey sentences: ", item[2], "\nSummary: ", item[3], "\nText: ", item[0])
    
def show_item(item):
    """
    Given 4-tuple with text, keywords, key sentences and summary, return concatenated string.
    """
    return "Keywords: ", item[1], "\nKey sentences: ", item[2], "\nSummary: ", item[3], "\nText: ", item[0]

def peep_item(item_list):
    """
    Given list of items described above, print first item in list and pop it. 
    Used for qualitative tests to read summaries one after the other.
    """
    print("Items left: ", len(item_list), "\n")
    if len(item_list) != 0:
        print_item(item_list.pop(0))

    
def summ_dialogue(text):
    summarizer = pipeline("summarization", model="lidiya/bart-base-samsum")
    return summarizer(text)

In [6]:
# samsum item finding
corpus = samsum
eitems = find_items(corpus, ewl, len(samsum))
gitems = find_items(corpus, gwl, len(samsum))
aitems = find_items(corpus, nsawl, len(samsum))

print('items done')
print('e: ', len(eitems), 'g: ', len(gitems), 'a: ', len(aitems))
dial_e_res = []
dial_g_res = []
dial_a_res = []

items done
e:  54 g:  109 a:  95


In [7]:
# samsum summarisation of found items
summarizer = pipeline("summarization", model="lidiya/bart-base-samsum")
for item in eitems:
    dial_e_res.append((item[0], item[1], item[2], summarizer(item[0])))
for item in gitems:
    dial_g_res.append((item[0], item[1], item[2], summarizer(item[0])))
for item in aitems:
    dial_a_res.append((item[0], item[1], item[2], summarizer(item[0])))

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ..\aten\src\ATen\native\BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
Your max_length is set to 128, but you input_length is only 57. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 128, but you input_length is only 86. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 128, but you input_length is only 110. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 128, but you input_length is only 90. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 128, but you input_length is only 64. You might consider decreasing max_length manually, e.g. summariz

In [7]:
# cnn_dm item finding
corpus = cnn_dm
eitems = find_items(corpus, ewl, 400)
gitems = find_items(corpus, gwl, 400)
aitems = find_items(corpus, nsawl, 400)

print('items done')
print('e: ', len(eitems), 'g: ', len(gitems), 'a: ', len(aitems))
news_e_res = []
news_g_res = []
news_a_res = []

items done
e:  101 g:  154 a:  208


In [8]:
# cnn_dm summarisation
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn', truncation=True)
summarizer = pipeline('summarization', model=model, tokenizer=tokenizer)

errors = 0

for item in eitems:
    # for indexing errors due to sequence length
    try:
        s = summarizer(item[0])   
    except IndexError:
        errors+=1
        next
    else:
        news_e_res.append((item[0], item[1], item[2], s))
print("e done, errors = ", errors)
for item in gitems:
    try:
        s = summarizer(item[0])   
    except IndexError:
        errors+=1
        next
    else:
        news_g_res.append((item[0], item[1], item[2], s))
print("g done, errors = ", errors)
for item in aitems:
    try:
        s = summarizer(item[0])   
    except IndexError:
        errors+=1
        next
    else:
        news_a_res.append((item[0], item[1], item[2], s))
print("a done, errors = ", errors)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ..\aten\src\ATen\native\BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
Token indices sequence length is longer than the specified maximum sequence length for this model (1880 > 1024). Running this sequence through the model will result in indexing errors


e done, errors =  42
g done, errors =  121


Your max_length is set to 142, but you input_length is only 118. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


a done, errors =  208


In [10]:
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn', truncation=True)
summarizer = pipeline('summarization', model=model, tokenizer=tokenizer)

In [81]:
peep_item(dial_g_res)


Items left:  88 

Keywords:  ['man'] 
Key sentences:  [Sentence("Sebastian: Hey man!")] 
Summary:  [{'summary_text': 'Sebastian wishes Casper happy birthday. Casper has to set up a meeting and drink a beer.'}] 
Text:  Sebastian: Hey man! I want to wish you happy birthday!
Sebastian: May this day will bring together your closest ones, so You can feel loved. Let the money never stop flowing into your life, and may Your dreams come true! I wish you all the best! Long lasting health, luck, and above all perseverence in all sections of life. I wish you also strength to make a perfect lemonade out of these lemons thrown by life!
Casper: Thank you!
Casper: Thank you for remembering about me, for you effort to write these kind words. I appreciate this and I already feel happy that I read this.
Casper: We have to set up a meeting and drink a beer or 2.
Sebastian: Sure! In touch. Happy Birthday!


In [9]:
peep_item(dial_a_res)

NameError: name 'dial_a_res' is not defined

In [2]:
peep_item(news_e_res)

NameError: name 'peep_item' is not defined