In [1]:
import re
import urllib3

import pandas as pd
from bs4 import BeautifulSoup

from IPython.display import display

In [2]:
# Sample companies (FAANG) and their respective Wikipedia pages

wiki_test_pages = {
    'Facebook': 'https://en.wikipedia.org/wiki/Facebook',
    'Apple': 'https://en.wikipedia.org/wiki/Apple_Inc.',
    'Amazon': 'https://en.wikipedia.org/wiki/Amazon_(company)',
    'Netflix': 'https://en.wikipedia.org/wiki/Netflix',
    'Google': 'https://en.wikipedia.org/wiki/Google'
}

In [3]:
http = urllib3.PoolManager()
url = wiki_test_pages['Amazon']

response = http.request('GET', url)
soup = BeautifulSoup(response.data, 'html.parser')

In [4]:
def keyword_ish(tag):
    not_empty = lambda x: x.text != ''
    links_to_wiki = lambda x: x.has_attr('href') and x.get('href').startswith('/wiki/')
   
    return not_empty(tag) & links_to_wiki(tag)

In [5]:
wiki_body = soup.find('div', {'id': 'mw-content-text'})
all_tags = wiki_body.find_all('a')

filtered_tags = [tag for tag in all_tags if keyword_ish(tag)]
filtered_tags = set(filtered_tags)

In [6]:
entities_df = pd.DataFrame({
    'text': [tag.text for tag in filtered_tags],
    'href': [tag.get('href') for tag in filtered_tags],
})

entities_df = entities_df[~entities_df.href.str.match('/wiki/\w+:\w+')]

print('Keyword-ish entities from Amazon Wikipedia page:', len(entities_df))

print('Sample entities from Amazon Wikipedia page:')
with pd.option_context('display.max_rows', 100):
    display(entities_df.sample(100))


Keyword-ish entities from Amazon Wikipedia page: 1209
Sample entities from Amazon Wikipedia page:


Unnamed: 0,text,href
23,Dash wand,/wiki/Amazon_Dash#Barcode_scanner
1142,Album,/wiki/Album
124,Usinternetworking Inc,/wiki/Usinternetworking_Inc
1171,NBC News,/wiki/NBC_News
1065,Record label,/wiki/Record_label
921,Black Friday,/wiki/Black_Friday_(shopping)
823,STDU Viewer,/wiki/STDU_Viewer
1166,Alteon WebSystems,/wiki/Alteon_WebSystems
687,Kobo,/wiki/Kobo_Inc.
890,Adobe Acrobat,/wiki/Adobe_Acrobat


In [8]:
entities_df.to_pickle('../../data/temp/Amazon_ents.pkl')

In [12]:
with pd.option_context('display.max_rows', 150):
    display(entities_df.head(150))

Unnamed: 0,text,href
0,Rick Dalzell,/wiki/Rick_Dalzell
1,Comparison of Android e-book reader software,/wiki/Comparison_of_Android_e-reader_software
3,New Castle,"/wiki/New_Castle,_Delaware"
4,Mnet,/wiki/Mnet_(TV_channel)
5,Amazon Game Studios,/wiki/Amazon_Game_Studios
6,A9.com,/wiki/A9.com
7,.it,/wiki/.it
8,US$,/wiki/United_States_dollar
9,PocketBook eReader,/wiki/PocketBook_International
10,COVID-19 pandemic,/wiki/COVID-19_pandemic


In [13]:
with pd.option_context('display.max_rows', 150):
    display(entities_df.tail(150))

Unnamed: 0,text,href
1095,Kraft Heinz,/wiki/Kraft_Heinz
1096,Domain name,/wiki/Domain_name
1097,BEZOS,/wiki/Jeff_Bezos
1099,Music store,/wiki/Music_store
1100,USA,/wiki/United_States
1101,Josh Harris (Internet),/wiki/Josh_Harris_(Internet)
1102,MusicBrainz,/wiki/MBL_(identifier)
1104,C++,/wiki/C%2B%2B
1105,Align Technology,/wiki/Align_Technology
1106,Seattle,/wiki/Seattle
