In [67]:
import urllib
import urllib.request
from io import BytesIO, StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

## Basic HTML notebook

---

In [94]:
# Read the sample page from disk; the file handle is only needed for the read,
# so parsing happens after the `with` block has closed it.
with open('basic_example.html', 'r') as f:
    contents = f.read()

# Parse with the lxml backend. BeautifulSoup is imported under the alias `bs`
# in the imports cell, so the bare name `BeautifulSoup` is undefined on a
# fresh kernel.
soup = bs(contents, 'lxml')

In [97]:
# Passing literal markup straight to read_html is deprecated in recent pandas;
# wrapping the string in StringIO is the supported way to parse in-memory HTML.
# Show the first table found in the document.
pd.read_html(StringIO(contents))[0]

Unnamed: 0,Firstname,Lastname,Gender,Age
0,Violet,Luca,F,32
1,Emmett,Otter,M,22


In [26]:
# Direct (non-recursive) children of <body>; display the first of them.
soup.body.find_all(recursive=False)[0]

<div style='\"background-color:black;color:white;padding:20px;\"'>
<h1>This is a Heading</h1>
<p>This is a paragraph.</p>
<p>This is a second paragraph</p>
</div>

In [25]:
## Extracting text from HTML documents

In [29]:
# Text inside the document's <title> element.
soup.title.get_text()

'Page Title'

In [30]:
# `soup.p` resolves to the first <p> in the document; show its text.
soup.p.get_text()

'This is a paragraph.'

In [38]:
# Print the text content of every <p> element, one per line.
for paragraph in soup.find_all('p'):
    print(paragraph.get_text())

This is a paragraph.
This is a second paragraph
                                                                     
Mouse over this paragraph, to display the title attribute as a tooltip.                         



In [39]:
## Extracting attributes

In [55]:
# Look up the `title` attribute on the third <p> (index 2) of the page.
soup.find_all('p')[2].attrs.get('title')

"I'm a tooltip"

In [52]:
# Print each paragraph's `title` attribute; paragraphs without one print None.
for paragraph in soup.find_all('p'):
    print(paragraph.attrs.get('title'))

None
None
I'm a tooltip


In [122]:
# Send a browser-like User-Agent in case the URL cannot be opened with
# urllib's default one.
cinema_url = 'https://www.cinemaclock.com/ont/toronto/movies/in-theatres'
req = urllib.request.Request(cinema_url, headers={'User-Agent': 'Mozilla/5.0'})
# Use the response as a context manager so the connection is closed as soon as
# the body has been read.
with urllib.request.urlopen(req) as response:
    content = response.read()
# BeautifulSoup is imported under the alias `bs`; the bare class name is not
# defined on a fresh kernel.
soup = bs(content, 'lxml')

In [None]:
# method 1: CSS selector - every <a> nested under an element with class "movietitle"
for link in soup.select(".movietitle a"):
    print(link.get_text())

In [104]:
# method 2: walk the tree with find_all instead of a CSS selector
# (class_="movietitle" is the keyword form of the {"class": "movietitle"} filter)
for heading in soup.find_all('h3', class_="movietitle"):
    for link in heading.find_all('a'):
        print(link.get_text())

2 Hearts PG-13GPGPGPGPGPG
100% Wolf PGGGGGGG
The Addams Family PG-13GPGPGPG
The Addams Family PGGPGPGPGPGG
After We Collided 13+14A14A14A14A14A
Alita: Battle Angel PG-13G14APG14A14A14A
Annabelle R13+14A14A14A14A14A
Annabelle: Creation R13+14A14A14A14A14A
Attack the Block R13+14A14A14A14A14A
Ballet: Dracula 
Beetlejuice PGGPGPGPGPG14A
Casper PGGPGPGGPG
Come Play PG-1313+
The Conjuring R13+14A14A14A14A14A
The Conjuring 2 R13+14A14A14A14A14A
Dawn of the Dead R13+18A18A18ARR
The Empty Man R13+14A14A14A14A14A
The Exorcist R13+14A18A18A14PAR
Ghostbusters PGGPGPGPGPGPG
Goodbye, America PG
Goosebumps PGGPGPGPGPGPG
Gudbay, Amerika PG
Halloween R13+R18A18AR18A
Harry Potter and the Half-Blood Prince PGGPGPGPGPGPG
Hocus Pocus PGGPGPGPGPGPG
Honest Thief GPGPGPGPGPG
Hotel Transylvania PGGPGPGGGG
I Am Greta GPGPGPGPG
The Last Shift RGPG14APG
Lupin III: The First GPGPGPGPGPG
Memories of Murder 14A14A14A14A14A
Monster House PGGPGPGPGPGPG
Monsters, Inc. GGGGGGG
My People, My Homeland PGPGPGPGPG
The New 

In [93]:
# For every link inside a "movietitle" heading, print the text of each
# <span class="rtUS"> it contains.
for heading in soup.find_all('h3', class_='movietitle'):
    for link in heading.find_all('a'):
        for rating_span in link.find_all('span', class_='rtUS'):
            print(rating_span.text)

PG-13
PG
PG-13
PG
PG-13
R
R
R
PG
PG
PG-13
R
R
R
R
PG
PG
R
PG
PG
PG
R
R
PG
G
PG-13
PG
R
R
R
R
R
PG-13
R
PG


## Client-facing web scraping

---

In [105]:
# Wikipedia page listing the 100 largest Canadian municipalities by population.
wiki_url = (
    'https://en.wikipedia.org/wiki/'
    'List_of_the_100_largest_municipalities_in_Canada_by_population'
)

In [130]:
# Fetch the Wikipedia page with a browser-like User-Agent header.
req = urllib.request.Request(wiki_url, headers={'User-Agent': 'Mozilla/5.0'})
# Use the response as a context manager so the connection is closed once the
# body has been read.
with urllib.request.urlopen(req) as response:
    content = response.read()
# `content` is raw bytes; pandas expects in-memory markup to be wrapped in a
# file-like object rather than passed as a literal, hence BytesIO.
tables = pd.read_html(BytesIO(content))

In [113]:
# Keep the first table pandas parsed from the page and preview its leading rows.
ca_table = tables[0]
ca_table.head(n=5)

Unnamed: 0,Rank(2016),Municipality,Province,Municipal status,"Land area(km2, 2011)",Growth Rate 2011–2016,Population(2016),Population(2011),Population(2006),Population(2001),Population(1996)
0,1,Toronto,Ontario,City,630.2,4.46%,2731571,2615060,2503281,2481494,2385421
1,2,Montreal,Quebec,Ville,365.1,3.34%,1704694,1649519,1620693,1583590,1547030
2,3,Calgary,Alberta,City,825.3,12.99%,1239220,1096833,988193,879003,768082
3,4,Ottawa,Ontario,City,2790.2,5.76%,934243,883391,812129,774072,721136
4,5,Edmonton,Alberta,City,684.4,14.82%,932546,812201,730372,666104,616306


In [116]:
## Extracting links

In [131]:
# Parse the downloaded page so its links can be queried. BeautifulSoup is
# imported under the alias `bs`; the bare class name is undefined on a fresh kernel.
soup = bs(content, 'lxml')

In [136]:
# href of every <a> found inside a cell that is the second child of its row.
wiki_links = [anchor.get('href') for anchor in soup.select('td:nth-child(2) a')]

In [137]:
# Keep only the first 100 hrefs and display them.
# NOTE(review): presumably anchors past the first 100 come from other tables or
# notes on the page - verify against the page layout.
wiki_links = wiki_links[:100]
wiki_links

['/wiki/Toronto',
 '/wiki/Montreal',
 '/wiki/Calgary',
 '/wiki/Ottawa',
 '/wiki/Edmonton',
 '/wiki/Mississauga',
 '/wiki/Winnipeg',
 '/wiki/Vancouver',
 '/wiki/Brampton',
 '/wiki/Hamilton,_Ontario',
 '/wiki/Quebec_City',
 '/wiki/Surrey,_British_Columbia',
 '/wiki/Laval,_Quebec',
 '/wiki/Halifax,_Nova_Scotia',
 '/wiki/London,_Ontario',
 '/wiki/Markham,_Ontario',
 '/wiki/Vaughan',
 '/wiki/Gatineau',
 '/wiki/Saskatoon',
 '/wiki/Longueuil',
 '/wiki/Kitchener,_Ontario',
 '/wiki/Burnaby',
 '/wiki/Windsor,_Ontario',
 '/wiki/Regina,_Saskatchewan',
 '/wiki/Richmond,_British_Columbia',
 '/wiki/Richmond_Hill,_Ontario',
 '/wiki/Oakville,_Ontario',
 '/wiki/Burlington,_Ontario',
 '/wiki/Greater_Sudbury',
 '/wiki/Sherbrooke',
 '/wiki/Oshawa',
 '/wiki/Saguenay,_Quebec',
 '/wiki/L%C3%A9vis,_Quebec',
 '/wiki/Barrie',
 '/wiki/Abbotsford,_British_Columbia',
 '/wiki/Coquitlam',
 '/wiki/Trois-Rivi%C3%A8res',
 '/wiki/St._Catharines',
 '/wiki/Guelph',
 '/wiki/Cambridge,_Ontario',
 '/wiki/Whitby,_Ontario',
 '/

In [138]:
## Extracting the latitude and longitude

In [139]:
# The scraped hrefs are site-relative ('/wiki/...'), so prepend the Wikipedia
# origin to get a fetchable URL.
base_url = 'https://en.wikipedia.org'
city_url = base_url + wiki_links[0]  # page of the first municipality in the list

In [148]:
# Text used to pick the GeoHack link out of the hrefs collected from a page.
search_pattern = 'geohack.toolforge.org/geohack'

In [140]:
# Download the city's Wikipedia page with a browser-like User-Agent header.
req = urllib.request.Request(city_url, headers={'User-Agent': 'Mozilla/5.0'})
# Use the response as a context manager so the connection is closed once the
# body has been read.
with urllib.request.urlopen(req) as response:
    content = response.read()
# BeautifulSoup is imported under the alias `bs`; the bare class name is not
# defined on a fresh kernel.
soup = bs(content, 'lxml')

In [146]:
import re

In [177]:
# Collect the href of every anchor on the page that points at GeoHack.
geo_hrefs = [
    href
    for href in (anchor.get('href') for anchor in soup.select('a'))
    # Anchors without an href yield None. `search_pattern` is literal text, so
    # a containment test is used instead of a regex, whose unescaped '.' would
    # match any character.
    if href is not None and search_pattern in href
]
# Fail with a clear message rather than an opaque IndexError when the page has
# no matching link.
if not geo_hrefs:
    raise ValueError('no GeoHack link found on the page')
# `url_l` ends up bound to the first matching href, as before, without first
# holding a list under the same name.
url_l = geo_hrefs[0]
# Protocol-relative hrefs ('//host/...') need a scheme; hrefs that are already
# absolute are used unchanged.
geo_link = 'https:' + url_l if url_l.startswith('//') else url_l

In [178]:
# Download the GeoHack page with a browser-like User-Agent header.
req = urllib.request.Request(geo_link, headers={'User-Agent': 'Mozilla/5.0'})
# Use the response as a context manager so the connection is closed once the
# body has been read.
with urllib.request.urlopen(req) as response:
    content = response.read()
# BeautifulSoup is imported under the alias `bs`; the bare class name is not
# defined on a fresh kernel.
soup = bs(content, 'lxml')

In [185]:
# Text of the first element carrying the CSS class "latitude" on the parsed page.
lat = soup.select('.latitude')[0].get_text()
print(lat)

43.741667
