In [34]:
from pathlib import Path

import wikipedia
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec

from IPython.display import display

In [16]:
# for Amazon
main_wiki_link = 'https://en.wikipedia.org/wiki/Amazon_(company)'
entities_df = pd.read_pickle("../../data/temp/Amazon_ents.pkl")

display(entities_df.head())

Unnamed: 0,text,href
0,Rick Dalzell,/wiki/Rick_Dalzell
1,Comparison of Android e-book reader software,/wiki/Comparison_of_Android_e-reader_software
3,New Castle,"/wiki/New_Castle,_Delaware"
4,Mnet,/wiki/Mnet_(TV_channel)
5,Amazon Game Studios,/wiki/Amazon_Game_Studios


In [17]:
# load model
fname = '../../models/gensim_(100k_docs)/gensim_(100k_docs)_vs200_e20_a5e-100_PV-DM-A'
model = Doc2Vec.load(fname)

In [None]:
def tokenize(text):
    stop_words = set(stopwords.words('english'))
    tokenized = [token.lower() for token in word_tokenize(text) if token.isalpha()]
    filtered = [token for token in tokenized if not token in stop_words]
    return filtered


def valid_paragraph(paragraph):
    return len(paragraph) > 0 and not paragraph.isspace() and not paragraph.startswith('==')


def get_page_endpoint(url_or_uri):
    return url_or_uri.split('/')[-1]

In [31]:
get_page_endpoint(main_wiki_link)

'Amazon_(company)'

In [42]:
page = wikipedia.page(get_page_endpoint(main_wiki_link))
plaintext = '\n'.join([paragraph for paragraph in page.content.split('\n') 
                       if valid_paragraph(paragraph)])

main_doc = tokenize(plaintext)

print(len(plaintext_tokenized))

5519


In [53]:
def get_tokenized_doc(href_col):
    try:
        entity_page = wikipedia.page(get_page_endpoint(href_col))
    except wikipedia.PageError:
        print(f'Page not found {href_col}')
        pass
    except wikipedia.DisambiguationError:
        print(f'Page was disambiguous: {href_col}')
        pass
    else:
        plaintext = '\n'.join([paragraph for paragraph in page.content.split('\n') 
                       if valid_paragraph(paragraph)])
        entity_doc = tokenize(plaintext)
        return model.similarity_unseen_docs(main_doc, entity_doc)
    return 0

In [54]:
entities_df['similarity'] = entities_df.href.apply(get_tokenized_doc)

Page not found /wiki/New_Castle,_Delaware
Page not found /wiki/Mnet_(TV_channel)




  lis = BeautifulSoup(html).find_all('li')


Page was disambiguous: /wiki/CDNow
Page not found /wiki/Grand_Haven,_Michigan
Page was disambiguous: /wiki/LaTeX
Page was disambiguous: /wiki/Amgen
Page was disambiguous: /wiki/Jewelry
Page not found /wiki/Barnes_%26_Noble
Page not found /wiki/Groupon
Page not found /wiki/HP_Inc.
Page was disambiguous: /wiki/Jazz
Page not found /wiki/Ed_Markey
Page not found /wiki/Musical_instrument
Page not found /wiki/Amazon_Fire_TV
Page not found /wiki/Vox_Media
Page was disambiguous: /wiki/HTC
Page not found /wiki/VinSmart
Page not found /wiki/USA_Today
Page not found /wiki/CSX_Corporation
Page was disambiguous: /wiki/Shopee
Page not found /wiki/Redmi
Page not found /wiki/Syndicat_National_de_l%27%C3%89dition_Phonographique
Page was disambiguous: /wiki/Amazon_(disambiguation)#Organizations
Page not found /wiki/Lector_(software)
Page not found /wiki/Xiaomi
Page not found /wiki/Solidaires_Unitaires_D%C3%A9mocratiques
Page not found /wiki/Disco
Page was disambiguous: /wiki/.pl
Page not found /wiki/Tra

In [41]:
type(vector_to_compare)

numpy.ndarray

In [60]:
with pd.option_context('display.max_rows', 150):
    display(entities_df.sort_values('similarity', ascending=False).head(100))

Unnamed: 0,text,href,similarity
939,Lektz,/wiki/Lektz,0.991737
716,Trip.com,/wiki/Trip.com,0.991725
605,Epidemic Marketing,/wiki/Epidemic_Marketing,0.99163
357,brand valuation,/wiki/List_of_most_valuable_brands,0.991556
945,Phonograph record,/wiki/Phonograph_record,0.991358
733,Jim Rutt,/wiki/Jim_Rutt,0.991317
333,Music education,/wiki/Music_education,0.991278
1220,Scott D. Sullivan,/wiki/Scott_D._Sullivan,0.991258
1246,Fortune 1000,/wiki/Fortune_1000,0.991255
422,Texas Instruments,/wiki/Texas_Instruments,0.991215


In [66]:
with pd.option_context('display.max_rows', 150):
    display(entities_df[entities_df.similarity > 0].sort_values('similarity').head(150))

Unnamed: 0,text,href,similarity
1050,Transmeta,/wiki/Transmeta,0.984606
98,Statistically improbable phrase,/wiki/Statistically_improbable_phrase,0.984626
818,Snapplify,/wiki/Snapplify,0.984689
531,Souq.com,/wiki/Souq.com,0.984808
342,Fraternal Order of Police,/wiki/Fraternal_Order_of_Police,0.984889
1015,The Voice,/wiki/The_Voice_(franchise),0.98507
809,World,/wiki/World_music_(term),0.985128
522,SUDOC (France),/wiki/SUDOC_(identifier),0.985139
754,sporting goods,/wiki/Sports_equipment,0.985192
904,Applied Materials,/wiki/Applied_Materials,0.985212


In [68]:
article_1 = """Brisbane motorists filling up their tanks today will be hard-pressed to find fuel under $1.70 per litre, but experts say holding off just a few days could save your back pocket. 

Key points:
Eighty-five per cent of service stations were higher than the $1.70 mark, the RACQ said
Businesses are accused of price gouging and motorists are urged to hold off filling up
Petrol prices have peaked and are expected to fall in the coming days
The RACQ's Clare Hunter said it was hard to find a good deal even for those who shopped around.

"Eighty-five per cent of the market is higher than $1.70 so that is expensive and that means we shouldn't be filling the tank," Ms Hunter said.

She said the price hikes were due to a "triple whammy" with a combination of factors at play. 

"We were, before we went into lockdown, starting to go into the hike phase of the price cycle anyway," Ms Hunter said. 

"But what we're seeing is a combination of a really high terminal gate price, high price of oil … and of course that high point in the price cycle, which is not good news for motorists who need fuel … it's incredibly unfair."

Petrol prices advertised at a service station in Brisbane for more than $1.70 per litre.
The RACQ encouraged drivers to use apps and websites to shop around to avoid getting stung at the bowser.(Supplied)
But Ms Hunter said the high prices were no surprise with global factors pushing up the price Brisbane motorists were paying at the pump and many businesses taking full advantage of that.

"The high indicative retail margins — the extra money servos slap on what they're buying fuel for to sell it, to make a margin — those are very high, and prices we shouldn't be seeing," Ms Hunter said.

"It is too high, it is gouging, and that's why at this point in the price cycle we need everyone to avoid the bowser if you can. 

"Anything around that $1.40 to $1.50 mark is probably fair and there are a few servos that are serving that … so that is available, but you've got to make the choice about whether it's worth you driving [there] if you're not around that area."

Signage outside a service station advertising fuel for around $1.40 per litre.
The fuel prices at Metro Fuel Greenbank were considerably cheaper than competitors, with most servos around Brisbane selling E10 for an average $1.70 a litre.(Supplied: Raf Memon)
Raf Memon is the owner of Metro Fuel at Greenbank and its sister servo at Nerang and with the Brisbane servo selling E10 at 137.9 cents per litre, Mr Memon said competitive prices had never been bad for business.

"If we keep low prices [on fuel], we get more shop business," Mr Memon said.

"We get more money spent in the shop than fuel, so we still make money."

Instead of hiking up prices by 30 or 40 cents, Mr Memon said any increase was done gradually.

"We try to look after customers. If they are happy, we're happy," he said.

"If crude oil goes higher, then we would need to go higher but the last two years we've had a max price of $1.50 for unleaded and E10."

"We sacrifice our margin to give customers a better price [and] this way we get more business that way."

Find more local news
Tell us your location and find more local ABC News and information
Positive news after prices peak
Ms Hunter said prices should start coming down soon.

"Prices peaked yesterday and we're already seeing them starting to come down," Ms Hunter said.

"We have hit the worst of it, so prices will get cheaper over the next few days. 

"So, if anyone can hold off filling their tank, then it's worth doing that but people have got the school run to do, people have got things to do, so we understand that drivers will need fuel at some point."

Mr Hunter said taking advantage of real-time fuel price monitoring was a way to ensure motorists got more bang for their buck.

"With the data that we've now got available, we're able to see that in advance by jumping on those apps and websites and checking the prices in our area before we get stung at a servo that's too expensive when there was a cheaper one down the road," she said.

"With that knowledge you can make better market choices."""

In [70]:
article_1 = tokenize(article_1)

In [71]:
article_2 = """
More than $120,000 has been raised for the family of five-month-old Mia after her tragic death earlier this week following a magpie swooping incident in Brisbane's south. 
The outpouring of emotions from across Australia over the "unthinkable" accident comes as Brisbane City Council launches an "urgent" investigation. 
The extended family of Mia's parents, Simone Francis and Jacob Power, have today thanked over 2,500 donators who have contributed to the fund that will be used on the toddler's funeral. 
READ MORE: Cairns' snap lockdown to end as Queensland's first vaccination hub opens
Five-month-old Mia was being carried by her mother through  Glindemann Park at Holland Park on Sunday before the "aggressive" bird attacked, causing Mia's mother to duck and trip. 
Photos of five-month-old Mia were shared on an online fundraiser for the family. (GoFundMe)
A note with the words "Fly High Little Mia" with a knitted heart has been placed at a Brisbane park where a swooping magpie led to the death of a baby girl. 
A note with the words "Fly High Little Mia" with a knitted heart has been placed at the Brisbane park. (JOCELYN GARCIA)
"We are absolutely overwhelmed with the generosity of every single one of you," Katie Hunt, one of Mia's Aunties posted late Tuesday.
"The support that has been shown to Mia and her parents has well and truly surpassed any hope or expectation we had."
"Many many thanks from the bottom of our hearts."
Mia was being carried by her mother through Glindemann Park at Holland Park on Sunday before the "aggressive" bird attacked, causing Mia's mother to duck and trip.
Mia was rushed to Queensland Children's Hospital with serious injuries but later died.
A note with the words "Fly High Little Mia" with a knitted heart has now been placed at the park near where the accident took place.
"Everything about Mia was perfect, from her head full of hair to her tiny little nose, and her little long toes," family members said following the fundraiser.
"Beautiful she was and loved by all who got to meet her. The joy Mia brought to all of our lives cannot be described in words, but is certainly felt in all of our hearts."
Magpie breeding and swooping season traditionally starts as the days become warmer towards the end of winter.
The magpie in question has since been moved from the park by Brisbane City Council, and has been moved "a significant distance" away (9News)
"Her gentle soul will always be forever in our hearts. Rest in peace, beautiful Mia," other people posted.
Brisbane Mayor Adrian Schrinner yesterday said that the incident had shocked the city "to the absolute core".
"This is something that I'm aware has never happened before," Mayor Schrinner said.
"We know that this situation probably was impossible to have predicted."
Brisbane City Council will be undergoing a "urgent" investigation to ensure a tragedy like this is avoided in the future.
"I know there was signs put up but … we need to make sure it doesn't happen again," Mayor Schrinner said.
Magpie was a known threat to local residents
Holland Park West residents have said the magpie that caused the tragic accident was a known menace.
Locals who spoke with 9News said they had complained to the council about the bird, and that it was a persistent, aggressive swooper.
"This magpie swoops everyone every time," one man said, adding the bird would frequently attack kids and adults.
Since Sunday's tragic accident, council workers have installed extra signage and caution tape at Glindemann Park, just south of Brisbane's CBD.
The bird has since been removed.
READ MORE: Man dies following collision between motorbike and water truck on Gold Coast
Locals told 9News the bird had a reputation for being particularly aggressive.
Locals told 9News the bird had a reputation for being particularly aggressive. (9News)
Flowers to remember a baby girl who died after a magpie swooping incident at  Glindemann Park in Holland Park West.
Flowers have been left at Glindemann Park in Holland Park West. (9News)
"It's just a traumatic accident, it's a tragedy and really sad," another Holland Park West local told 9News.
A Queensland Ambulance Service spokesperson said paramedics were called to the park following "reports that a mum had fallen with a baby in her arms".
"(She was) reportedly ducking to avoid a magpie," the spokesperson said.
Magpie swooping season runs from July to December and peaks in September.
Police are preparing a report for the coroner.
"""

In [72]:
article_2 = tokenize(article_2)

In [73]:
model.similarity_unseen_docs(article_1, article_2)

0.15266876

In [74]:
page = wikipedia.page(get_page_endpoint(main_wiki_link))

In [75]:
dir(page)

['_WikipediaPage__continued_query',
 '_WikipediaPage__load',
 '_WikipediaPage__title_query_param',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'categories',
 'content',
 'coordinates',
 'html',
 'images',
 'links',
 'original_title',
 'pageid',
 'parent_id',
 'references',
 'revision_id',
 'section',
 'sections',
 'summary',
 'title',
 'url']

In [76]:
page.categories

['1994 establishments in Washington (state)',
 '3D publishing',
 'All Wikipedia articles written in American English',
 'All articles containing potentially dated statements',
 'All articles to be split',
 'All articles with unsourced statements',
 'Amazon (company)',
 'American companies established in 1994',
 'Android (operating system) software',
 'Articles containing potentially dated statements from 2015',
 'Articles containing potentially dated statements from 2018',
 'Articles containing potentially dated statements from 2020',
 'Articles containing potentially dated statements from October 2017',
 'Articles containing potentially dated statements from September 2013',
 'Articles containing potentially dated statements from September 2020',
 'Articles to be split from January 2021',
 'Articles with short description',
 'Articles with unsourced statements from May 2017',
 'Arts and crafts retailers',
 'Bookstores of the United States',
 'CS1 French-language sources (fr)',
 'CS1 G

In [81]:
page = wikipedia.WikipediaPage(title="Jeff Bezos")

In [82]:
page.summary

'Jeffrey Preston Bezos ( BAY-zohss; né Jorgensen; born January 12, 1964) is an American business magnate, media proprietor, and investor. Bezos is the founder and executive chairman of Amazon, having previously served as chairman, president and CEO of the company. With a net worth of about $200 billion as of August 2021, he is the richest person in the world according to both Forbes and Bloomberg\'s Billionaires Index.Born in Albuquerque and raised in Houston and later Miami, Bezos graduated from Princeton University in 1986. He holds a degree in electrical engineering and computer science. He worked on Wall Street in a variety of related fields from 1986 to early 1994. Bezos founded Amazon in late 1994, on a cross-country road trip from New York City to Seattle. The company began as an online bookstore and has since expanded to a wide variety of other e-commerce products and services, including video and audio streaming, cloud computing, and artificial intelligence. It is currently th