# White House press briefings

In [1]:
import requests
url = 'https://www.whitehouse.gov/briefings-statements/page/0/'
resp = requests.get(url)

## 1. Fetching and saving each index page

In [2]:
with open('./index-pages/0.html','w') as wf:
    wf.write(resp.text)

In [3]:
import requests
from os import makedirs
INDEX_PAGES_DIR = 'index-pages'
makedirs(INDEX_PAGES_DIR, exist_ok=True)

THE_URL = 'https://www.whitehouse.gov/briefings-statements/page/'
MAX_PAGE_NUM = 50

for pagenum in range(MAX_PAGE_NUM):
    url = THE_URL + str(pagenum)+'/'
    print('downloading', url)
    resp = requests.get(url)
    
    fname = str(pagenum) + '.html'
    print('Saving to',fname)
    with open('index-pages/'+fname,'w') as wf:
        wf.write(resp.text)

downloading https://www.whitehouse.gov/briefings-statements/page/0/
Saving to 0.html
downloading https://www.whitehouse.gov/briefings-statements/page/1/
Saving to 1.html
downloading https://www.whitehouse.gov/briefings-statements/page/2/
Saving to 2.html
downloading https://www.whitehouse.gov/briefings-statements/page/3/
Saving to 3.html
downloading https://www.whitehouse.gov/briefings-statements/page/4/
Saving to 4.html
downloading https://www.whitehouse.gov/briefings-statements/page/5/
Saving to 5.html
downloading https://www.whitehouse.gov/briefings-statements/page/6/
Saving to 6.html
downloading https://www.whitehouse.gov/briefings-statements/page/7/
Saving to 7.html
downloading https://www.whitehouse.gov/briefings-statements/page/8/
Saving to 8.html
downloading https://www.whitehouse.gov/briefings-statements/page/9/
Saving to 9.html
downloading https://www.whitehouse.gov/briefings-statements/page/10/
Saving to 10.html
downloading https://www.whitehouse.gov/briefings-statements/pag

#### Join function

In [4]:
from os.path import join
join('index-pages', '42.html')
# 'index-pages/42.html'
join("this", "is", "a", "really", "nested", "file.txt")

'this/is/a/really/nested/file.txt'

In [5]:
import requests
from os import makedirs

INDEX_PAGES_DIR = 'index-pages'
makedirs(INDEX_PAGES_DIR, exist_ok=True)

THE_URL = 'https://www.whitehouse.gov/briefings-statements/page/{}/'
MAX_PAGE_NUM = 5

for pagenum in range(MAX_PAGE_NUM):
    url = THE_URL.format(pagenum) 
    print('downloading', url)
    resp = requests.get(url)
    
    fname = join(INDEX_PAGES_DIR,'{}.html'.format(pagenum))
    print(fname)
    print('Saving to',fname)
    with open(fname,'w') as wf:
        wf.write(resp.text)

downloading https://www.whitehouse.gov/briefings-statements/page/0/
index-pages/0.html
Saving to index-pages/0.html
downloading https://www.whitehouse.gov/briefings-statements/page/1/
index-pages/1.html
Saving to index-pages/1.html
downloading https://www.whitehouse.gov/briefings-statements/page/2/
index-pages/2.html
Saving to index-pages/2.html
downloading https://www.whitehouse.gov/briefings-statements/page/3/
index-pages/3.html
Saving to index-pages/3.html
downloading https://www.whitehouse.gov/briefings-statements/page/4/
index-pages/4.html
Saving to index-pages/4.html


In [6]:
THE_URL = 'https://www.whitehouse.gov/briefings-statements/page/{}/'
a= '123'

THE_URL.format(a)

'https://www.whitehouse.gov/briefings-statements/page/123/'

### Format example

In [7]:
URL_ENDPOINT = 'https://maps.googleapis.com/maps/api/streetview'
QUERY_STRING = 'size={size}&location={location}&heading={heading}'

q_str = QUERY_STRING.format(size='400x400', location='New_York', heading=0)
url = URL_ENDPOINT + '?' + q_str
print(url)

https://maps.googleapis.com/maps/api/streetview?size=400x400&location=New_York&heading=0


In [8]:
URL_ENDPOINT = 'https://maps.googleapis.com/maps/api/streetview'
QUERY_STRING = 'size={size}location={location}&heading={heading}'
q_str = QUERY_STRING.format(size='400x400', location='Chicago', heading="")
url = URL_ENDPOINT + '?' + q_str
print(url)

https://maps.googleapis.com/maps/api/streetview?size=400x400location=Chicago&heading=


In [9]:
import requests
URL_ENDPOINT = 'https://maps.googleapis.com/maps/api/streetview'
resp = requests.get(URL_ENDPOINT, params={'size': '600x300', 'location': 'Stanford, CA'})
print(resp.url)

https://maps.googleapis.com/maps/api/streetview?size=600x300&location=Stanford%2C+CA


### request.get() with parameters

In [10]:
payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.get('http://httpbin.org/get', params=payload)
print(r.url)

http://httpbin.org/get?key1=value1&key2=value2


## 2. Converting HTML text into a data object

In [11]:
from bs4 import BeautifulSoup
htmltxt = '<p>Hello World</p>'
# name of parser = lxml
soup = BeautifulSoup(htmltxt,'lxml')
soup.text

'Hello World'

Basically, the BeautifulSoup's text attribute will return a string stripped of any HTML tags and metadata.

In [12]:
# header
soup = BeautifulSoup("""<h1>Hello</h1><p>World</p>""", 'lxml')
soup.text

'HelloWorld'

In [13]:
# hyper link
mytxt = """
<h1>Hello World</h1>
<p>This is a <a href="http://example.com">link</a></p>"""

soup = BeautifulSoup(mytxt,'lxml')
soup.text

'Hello World\nThis is a link'

#### Basically, the BeautifulSoup's text attribute will return a string stripped of any HTML tags and metadata.

### 利用find() 抓去tag

In [14]:
mytxt = """
<h1>Hello World</h1>
<p>This is a <a href="http://example.com">link</a></p>"""
soup = BeautifulSoup(mytxt,'lxml')
print(soup.find('h1'))
print(soup.find('p'))
print(soup.find('a'))

<h1>Hello World</h1>
<p>This is a <a href="http://example.com">link</a></p>
<a href="http://example.com">link</a>


In [15]:
type(soup.find('a'))

bs4.element.Tag

In [16]:
soup.find('h1').text

'Hello World'

### 通过tag抓取特性(attribute)，特性一般用=链接

In [17]:
from bs4 import BeautifulSoup
mytxt = """
<h1>Hello World</h1>
<p>This is a <a href="http://example.com">link</a></p>
"""

soup = BeautifulSoup(mytxt,'lxml')
mylink = soup.find('a')

In [18]:
print(type(mylink))
print(type(mylink.attrs))
# href="http://example.com"
# {'href': 'http://example.com'}
print(mylink.attrs)
mylink.attrs['href']

<class 'bs4.element.Tag'>
<class 'dict'>
{'href': 'http://example.com'}


'http://example.com'

### Finding multiple elements with find_all 查找所有tags

In [19]:
from bs4 import BeautifulSoup
moretxt = """
<p>Visit the <a href='http://www.nytimes.com'>New York Times</a></p>
<p>Visit the <a href='http://www.wsj.com'>Wall Street Journal</a></p>
"""
soup = BeautifulSoup(moretxt,'lxml')
tags = soup.find_all('a')
tags

[<a href="http://www.nytimes.com">New York Times</a>,
 <a href="http://www.wsj.com">Wall Street Journal</a>]

In [20]:
print(len(tags))
print(tags[0].text)
print(tags[0].attrs['href'])

for t in tags:
    print(t.text,t.attrs['href'])

2
New York Times
http://www.nytimes.com
New York Times http://www.nytimes.com
Wall Street Journal http://www.wsj.com


However, be careful not to treat the ResultSet as if it were a Tag – try to understand why the following doesn't make much sense (nevermind results in an error):

`tags.attrs['href']`

AttributeError: 'ResultSet' object has no attribute 'attrs'

In [21]:
print(tags)
hrefs = []
for t in tags:
    hrefs.append(t)
hrefs

[<a href="http://www.nytimes.com">New York Times</a>, <a href="http://www.wsj.com">Wall Street Journal</a>]


[<a href="http://www.nytimes.com">New York Times</a>,
 <a href="http://www.wsj.com">Wall Street Journal</a>]

### Finding nested elements 查找嵌套

In [22]:
evenmoretxt = """
<h1><a href="http://www.a.com">Awesome</a></h1>
<h1><a href="http://www.b.com">Really Awesome</a></h1>

<div><a href="http://na.com">Ignore me</a></div>
<div><a href="http://127.0.0.1">Ignore me again</a></div>
"""

soup = BeautifulSoup(evenmoretxt, 'lxml')

In [23]:
# collect all the <h1> tags using find_all()
heds = soup.find_all('h1')

In [24]:
links = []
for h in heds:
    links.append(h.find('a'))
links

[<a href="http://www.a.com">Awesome</a>,
 <a href="http://www.b.com">Really Awesome</a>]

## Example

In [25]:
import requests

resp = requests.get('http://www.example.com')
txt = resp.text

In [26]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(txt, 'lxml')

In [27]:
print(len(soup.find_all('p')))
print('-' * 20)
print(soup.find_all('p')[0].text)
print('-' * 20)
print(len(soup.find_all('h1')))
print('-' * 20)
print(soup.find_all('a')[0].text)
print(soup.find('a').attrs['href'])

2
--------------------
This domain is established to be used for illustrative examples in documents. You may use this
    domain in examples without prior coordination or asking for permission.
--------------------
1
--------------------
More information...
http://www.iana.org/domains/example


{{
<article class="briefing-statement briefing-statement--results">
	<div class="briefing-statement__content">
					<p class="briefing-statement__type">Press Briefings</p>
		
		<h2 class="briefing-statement__title"><a href="https://www.whitehouse.gov/briefings-statements/press-briefing-press-secretary-sarah-sanders-050918/">
                                                    Press Briefing by Press Secretary Sarah Sanders</a></h2>
		<div class="meta meta--left">
						<p class="meta__date">
				<time>May 9, 2018</time>
			</p>
		</div>
	</div>
</article>
}}

In [28]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.whitehouse.gov/briefings-statements/page/0/'
resp = requests.get(url)
soup = BeautifulSoup(resp.text,'lxml')

In [29]:
len(soup.find_all('a'))

68

In [30]:
len(soup.find_all('h2'))

10

In [31]:
urls = []
for h in soup.find_all('h2'):
    urls.append(h.find('a').attrs['href'])
urls

['https://www.whitehouse.gov/briefings-statements/statement-regarding-summit-united-states-north-korea/',
 'https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-meeting-prime-minister-lee-hsien-loong-singapore/',
 'https://www.whitehouse.gov/briefings-statements/statement-first-lady-melania-trumps-visit-fords-theatre/',
 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-new-jersey-disaster-declaration/',
 'https://www.whitehouse.gov/briefings-statements/press-conference-president-trump-g7-summit/',
 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-new-hampshire-disaster-declaration-5/',
 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-alaska-disaster-declaration-2/',
 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-new-hampshire-disaster-declaration-4/',
 'https://www.whitehouse.gov/briefings-statements/remarks-president-t

In [32]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.whitehouse.gov/briefings-statements/page/0/'
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'lxml')

links = []
for h in soup.find_all('h2'):
    links.append(h.find('a').attrs['href'])
links

['https://www.whitehouse.gov/briefings-statements/statement-regarding-summit-united-states-north-korea/',
 'https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-meeting-prime-minister-lee-hsien-loong-singapore/',
 'https://www.whitehouse.gov/briefings-statements/statement-first-lady-melania-trumps-visit-fords-theatre/',
 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-new-jersey-disaster-declaration/',
 'https://www.whitehouse.gov/briefings-statements/press-conference-president-trump-g7-summit/',
 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-new-hampshire-disaster-declaration-5/',
 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-alaska-disaster-declaration-2/',
 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-new-hampshire-disaster-declaration-4/',
 'https://www.whitehouse.gov/briefings-statements/remarks-president-t

In [33]:
import os
from bs4 import BeautifulSoup

INDEX_PAGES_DIR = 'index-pages'

some_filename = os.path.join(INDEX_PAGES_DIR,'30.html')
with open(some_filename,'r') as rf:
    txt = rf.read()

soup = BeautifulSoup(txt,'lxml')

for h in soup.find_all('h2'):
    a = h.find('a')
    print(a.attrs['href'])

https://www.whitehouse.gov/briefings-statements/statement-press-secretary-israels-announcement-related-iranian-nuclear-weapons-development/
https://www.whitehouse.gov/briefings-statements/remarks-vice-president-pence-u-s-department-homeland-security-u-s-customs-border-protection-employees-imperial-ca/
https://www.whitehouse.gov/briefings-statements/remarks-president-trump-president-buhari-federal-republic-nigeria-joint-press-conference/
https://www.whitehouse.gov/briefings-statements/statement-press-secretary-regarding-united-states-delegation-china/
https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-president-emmanuel-macron-france-17/
https://www.whitehouse.gov/briefings-statements/remarks-president-trump-president-buhari-federal-republic-nigeria-bilateral-meeting/
https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-recognizes-importance-small-businesses-2018-small-business-week/
https://www.whitehouse.gov/briefings-statements/

### Using urllib.parse.urljoin()

In [34]:
from urllib.parse import urljoin
BASE_URL = 'https://www.whitehouse.gov/'
url = urljoin(BASE_URL, '/briefings-statements/president-donald-j-trump-fulfilling-promise-u-s-korea-free-trade-agreement-national-security/')
url

'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-fulfilling-promise-u-s-korea-free-trade-agreement-national-security/'

In [35]:
from urllib.parse import urljoin
URL_A = 'http://www.example.com'
URL_B = 'http://www.example.com/some/fun/page.html'

urljoin(URL_A, '/helpme')
# http://www.example.com/helpme
urljoin(URL_B, '/helpme')
# http://www.example.com/helpme

'http://www.example.com/helpme'

### Extracting and resolving absolute URLs

In [36]:
from bs4 import BeautifulSoup
from os.path import join
from urllib.parse import urljoin
WH_BASE_URL = 'https://www.whitehouse.gov/briefings-statements/'
INDEX_PAGES_DIR = 'index-pages'

some_filename = join(INDEX_PAGES_DIR,'30.html')
with open(some_filename,'r') as rf:
    txt = rf.read()
    
soup = BeautifulSoup(txt,'lxml')

for h in soup.find_all('h2'):
    url = urljoin(WH_BASE_URL,a.attrs['href'])
    print(url)


https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-prime-minister-shinzo-abe-japan-8/
https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-prime-minister-shinzo-abe-japan-8/
https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-prime-minister-shinzo-abe-japan-8/
https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-prime-minister-shinzo-abe-japan-8/
https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-prime-minister-shinzo-abe-japan-8/
https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-prime-minister-shinzo-abe-japan-8/
https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-prime-minister-shinzo-abe-japan-8/
https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-prime-minister-shinzo-abe-japan-8/
https://www.whitehouse.g

### Use glob() to get a list of files

In [37]:
from glob import glob
from os.path import join

INDEX_PAGES_DIR = 'index-pages'
gp = join(INDEX_PAGES_DIR,'*.html')
index_pages_filenames = glob(gp)
print(len(index_pages_filenames))
index_pages_filenames[10]

50


'index-pages/4.html'

In [38]:
# ---->
from glob import glob
from os.path import join
INDEX_PAGES_DIR = 'index-pages'
index_pages_filenames = glob(join(INDEX_PAGES_DIR,'*.html'))

## ALL together

In [39]:
from bs4 import BeautifulSoup
from glob import glob
from os.path import join
from urllib.parse import urljoin

INDEX_PAGES_DIR = 'index-pages'
WH_BASE_URL = 'https://www.whitehouse.gov/briefings-statements/'

links = []
ip_fnames = glob(join(INDEX_PAGES_DIR, '*.html'))
for fname in ip_fnames:
    with open(fname,'r') as rf:
        txt = rf.read()
    
    soup = BeautifulSoup(txt,'lxml')
    for h in soup.find_all('h2'):
        a = h.find('a')
        links.append(a.attrs['href'])
links

['https://www.whitehouse.gov/briefings-statements/presidential-message-congress-united-states-7/',
 'https://www.whitehouse.gov/briefings-statements/notice-regarding-continuation-national-emergency-respect-actions-government-syria-2/',
 'https://www.whitehouse.gov/briefings-statements/statement-vice-president-mike-pence-release-three-americans/',
 'https://www.whitehouse.gov/briefings-statements/statement-press-secretary-release-three-americans/',
 'https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-president-emmanuel-macron-france-18/',
 'https://www.whitehouse.gov/briefings-statements/gina-haspel-defended-national-security/',
 'https://www.whitehouse.gov/briefings-statements/press-briefing-national-security-advisor-john-bolton-iran/',
 'https://www.whitehouse.gov/briefings-statements/remarks-first-lady-melania-trump-todays-spouses-senate-luncheon/',
 'https://www.whitehouse.gov/briefings-statements/presidential-message-congress-united-states-6/',
 

In [40]:
from bs4 import BeautifulSoup
from glob import glob
from os.path import join
from urllib.parse import urljoin

INDEX_PAGES_DIR = 'index-pages'
WH_BASE_URL = 'https://www.whitehouse.gov/briefings-statements/'

links = []
ip_fnames = glob(join(INDEX_PAGES_DIR, '*.html'))
for fname in ip_fnames:
    with open(fname,'r') as rf:
        txt = rf.read()
    
    soup = BeautifulSoup(txt,'lxml')
    for h in soup.find_all('h2'):
        a = h.find('a')
        url = urljoin(WH_BASE_URL, a.attrs['href'])
        links.append(url)
links

['https://www.whitehouse.gov/briefings-statements/presidential-message-congress-united-states-7/',
 'https://www.whitehouse.gov/briefings-statements/notice-regarding-continuation-national-emergency-respect-actions-government-syria-2/',
 'https://www.whitehouse.gov/briefings-statements/statement-vice-president-mike-pence-release-three-americans/',
 'https://www.whitehouse.gov/briefings-statements/statement-press-secretary-release-three-americans/',
 'https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-president-emmanuel-macron-france-18/',
 'https://www.whitehouse.gov/briefings-statements/gina-haspel-defended-national-security/',
 'https://www.whitehouse.gov/briefings-statements/press-briefing-national-security-advisor-john-bolton-iran/',
 'https://www.whitehouse.gov/briefings-statements/remarks-first-lady-melania-trump-todays-spouses-senate-luncheon/',
 'https://www.whitehouse.gov/briefings-statements/presidential-message-congress-united-states-6/',
 

In [41]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-president-recep-tayyip-erdogan-turkey-5/'
reps = requests.get(url)
soup = BeautifulSoup(reps.text,'lxml')
# mydivs = soup.find_all("div", {"class": "page-content"})
for content in soup.find_all("div", {"class": "page-content"}):
    p = content.find('p')
    print(p.text)

President Donald J. Trump spoke today with President Recep Tayyip Erdogan of Turkey to reaffirm the importance of strong relations between the United States and Turkey, as NATO Allies and strategic partners, and to exchange views on regional developments.  The two leaders committed to continue efforts to intensify cooperation on shared strategic challenges and to address the concerns of both countries that affect the bilateral relations.


In [42]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.whitehouse.gov/briefings-statements/readout-president-donald-j-trumps-call-president-recep-tayyip-erdogan-turkey-5/'
reps = requests.get(url)
soup = BeautifulSoup(reps.text,'lxml')
# mydivs = soup.find_all("div", {"class": "page-content"})
# for content in soup.find_all("div", {"class": "page-content"}):
#     p = content.find('p')
#     print(p.text)
time = soup.find('time')
print(time.text)

March 22, 2018
