# Parsing Experiments

A variety of experiments in generating guided prompts. Unfortunately, it seems like unguided prompts generalize better.

In [1]:
from datasets import load_dataset

from preprocess import get_headline


def load_headlines(count=20):
    ds = load_dataset("Biddls/Onion_News")
    return [get_headline(ds['train'][i]['text']) for i in range(count)]
    
headlines = load_headlines()
headlines

['Relaxed Marie Kondo Now Says She Perfectly Happy Living In Waist-High Sewage',
 'U.S. Officials Call For Correct Amount Of Violence',
 'Kamala Harris Asks Communications Assistant If She Can Take Them Out For Coffee And Pick Their Brain Sometime',
 '25 Arrested In Fake Nursing School Diploma Scheme',
 'World’s Oldest American Dies At 72',
 'Report: Everyone Laughing At What Is A Very Silly Misunderstanding, But Don’t Be Fooled—Even Now, The Seeds Of Resentment Are Taking Root',
 'CEOs Explain How They Will Use ChatGPT',
 'FDA Moves To Ease Blood Donation Rules For Gay And Bisexual Men',
 'Study Shows Humans Still Have Genes To Grow Full Coat Of Body Hair',
 'Look What Happens When You Leave A McDonald’s Hamburger Out On A Counter For A Year',
 'Biden Secures Nation Extra Trash Can',
 'ChatGPT Forced To Take Bar Exam Even Though Dream Was To Be AI Art Bot',
 'Man Has Watched All 761 Movies',
 'David Cronenberg Once Again Leaves Doctor’s Appointment Disappointed By Lack Of Body Horror'

In [2]:
# Yake topics

import yake
import pprint

kw_extractor = yake.KeywordExtractor(lan='en', n=3, top=2)

for headline in headlines:
    print(headline)
    pprint.pp(kw_extractor.extract_keywords(headline))
    print()

Relaxed Marie Kondo Now Says She Perfectly Happy Living In Waist-High Sewage
[('Perfectly Happy Living', np.float64(0.0032173869679631944)),
 ('Relaxed Marie Kondo', np.float64(0.0035308295728302113))]

U.S. Officials Call For Correct Amount Of Violence
[('Officials Call', np.float64(0.012602360123953448)),
 ('Amount Of Violence', np.float64(0.012602360123953448))]

Kamala Harris Asks Communications Assistant If She Can Take Them Out For Coffee And Pick Their Brain Sometime
[('Harris Asks Communications', np.float64(0.02140921543860024)),
 ('Communications Assistant', np.float64(0.02140921543860024))]

25 Arrested In Fake Nursing School Diploma Scheme
[('School Diploma Scheme', np.float64(0.001881309737406442)),
 ('Fake Nursing School', np.float64(0.0032173869679631944))]

World’s Oldest American Dies At 72
[('Oldest American Dies', np.float64(0.0032173869679631944)),
 ('Oldest American', np.float64(0.02140921543860024))]

Report: Everyone Laughing At What Is A Very Silly Misunderstand

In [3]:
# SpaCy noun-chunk based topics

import spacy


nlp = spacy.load('en_core_web_sm')

def extract_topics(text, max_topics=2):
    doc = nlp(text.lower()) # lower casing the text seems to result in better parsing
    noun_chunks = [chunk for chunk in doc.noun_chunks if chunk.root.pos_ != 'PRON'] # get all noun chunks, ignoring pronouns\n",
    noun_chunks.sort(key=key, reverse=True)
    return noun_chunks

def key(span):
    # Returns the importance of a span. A span is important if it contains an entity or is long.
    entity_types = ['PERSON', 'ORG', 'GPE']
    is_entity = any(token.ent_type_ in entity_types for token in span)
    length = len(span.text.strip())
    return (is_entity, length)

for headline in headlines:
    print(headline)
    print(extract_topics(headline))
    print()

Relaxed Marie Kondo Now Says She Perfectly Happy Living In Waist-High Sewage
[relaxed marie kondo, waist-high sewage]

U.S. Officials Call For Correct Amount Of Violence
[u.s. officials, correct amount, violence]

Kamala Harris Asks Communications Assistant If She Can Take Them Out For Coffee And Pick Their Brain Sometime
[kamala harris, communications assistant, their brain, coffee]

25 Arrested In Fake Nursing School Diploma Scheme
[fake nursing school diploma scheme]

World’s Oldest American Dies At 72
[]

Report: Everyone Laughing At What Is A Very Silly Misunderstanding, But Don’t Be Fooled—Even Now, The Seeds Of Resentment Are Taking Root
[a very silly misunderstanding, resentment, the seeds, report, root]

CEOs Explain How They Will Use ChatGPT
[chatgpt, ceos]

FDA Moves To Ease Blood Donation Rules For Gay And Bisexual Men
[fda, blood donation rules, gay and bisexual men]

Study Shows Humans Still Have Genes To Grow Full Coat Of Body Hair
[full coat, body hair, humans, study, g

In [4]:
# SpaCy entity based topics

import spacy


nlp = spacy.load('en_core_web_sm')

def extract_topic(text):
    doc = nlp(text.lower()) # lower casing the text seems to result in better parsing
    entities = doc.ents
    for entity in entities:
        if entity.label_ == 'PERSON':
            return text[entity.start_char:entity.end_char]

for headline in headlines:
    print(headline)
    pprint.pp(extract_topic(headline))
    print()

Relaxed Marie Kondo Now Says She Perfectly Happy Living In Waist-High Sewage
'Marie Kondo'

U.S. Officials Call For Correct Amount Of Violence
None

Kamala Harris Asks Communications Assistant If She Can Take Them Out For Coffee And Pick Their Brain Sometime
'Kamala Harris'

25 Arrested In Fake Nursing School Diploma Scheme
None

World’s Oldest American Dies At 72
None

Report: Everyone Laughing At What Is A Very Silly Misunderstanding, But Don’t Be Fooled—Even Now, The Seeds Of Resentment Are Taking Root
None

CEOs Explain How They Will Use ChatGPT
None

FDA Moves To Ease Blood Donation Rules For Gay And Bisexual Men
None

Study Shows Humans Still Have Genes To Grow Full Coat Of Body Hair
None

Look What Happens When You Leave A McDonald’s Hamburger Out On A Counter For A Year
None

Biden Secures Nation Extra Trash Can
None

ChatGPT Forced To Take Bar Exam Even Though Dream Was To Be AI Art Bot
None

Man Has Watched All 761 Movies
None

David Cronenberg Once Again Leaves Doctor’s Appo