In [1]:
# Just stick some data there
with open('email_addresses.txt', 'w') as f:
    f.write("joelgrus@gmail.com\n")
    f.write("joel@m.datasciencester.com\n")
    f.write("joelgrus@m.datasciencester.com\n")

In [2]:
f = open('email_addresses.txt', "r")
print(f.read())

joelgrus@gmail.com
joel@m.datasciencester.com
joelgrus@m.datasciencester.com



In [3]:
def get_domain(email_address: str) -> str:
    """Split on '@' and return the last piece"""
    return email_address.lower().split("@")[-1]

In [4]:
# a couple of tests
assert get_domain('joelgrus@gmail.com') == 'gmail.com'

In [5]:
assert get_domain('joel@m.datasciencester.com') == 'm.datasciencester.com'

In [6]:
from collections import Counter

In [7]:
with open('email_addresses.txt', 'r') as f:
    domain_counts = Counter(get_domain(line.strip())
                            for line in f
                            if "@" in line)


In [8]:
domain_counts

Counter({'gmail.com': 1, 'm.datasciencester.com': 2})

In [9]:
with open('tab_delimited_stock_prices.txt', 'w') as f:
    f.write("""6/20/2014\tAAPL\t90.91
6/20/2014\tMSFT\t41.68
6/20/2014\tFB\t64.5
6/19/2014\tAAPL\t91.86
6/19/2014\tMSFT\t41.51
6/19/2014\tFB\t64.34
""")

In [10]:
f = open('tab_delimited_stock_prices.txt', "r")
print(f.read())

6/20/2014	AAPL	90.91
6/20/2014	MSFT	41.68
6/20/2014	FB	64.5
6/19/2014	AAPL	91.86
6/19/2014	MSFT	41.51
6/19/2014	FB	64.34



In [11]:
def process(date: str, symbol: str, closing_price: float) -> None:
    # Imaginge that this function actually does something.
    assert closing_price > 0.0

In [12]:
import csv

with open('tab_delimited_stock_prices.txt') as f:
    tab_reader = csv.reader(f, delimiter='\t')
    for row in tab_reader:
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])
        process(date, symbol, closing_price)

In [13]:
with open('colon_delimited_stock_prices.txt', 'w') as f:
    f.write("""date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
6/20/2014:FB:64.5
""")

In [14]:
f = open('colon_delimited_stock_prices.txt', "r")
print(f.read())

date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
6/20/2014:FB:64.5



In [15]:
with open('colon_delimited_stock_prices.txt') as f:
    colon_reader = csv.DictReader(f, delimiter=':')
    for dict_row in colon_reader:
        date = dict_row["date"]
        symbol = dict_row["symbol"]
        closing_price = float(dict_row["closing_price"])
        process(date, symbol, closing_price)

In [16]:
todays_prices = {'AAPL': 90.91, 'MSFT': 41.68, 'FB': 64.5 }

In [17]:
with open('comma_delimited_stock_prices.txt', 'w') as f:
    csv_writer = csv.writer(f, delimiter=',')
    for stock, price in todays_prices.items():
        csv_writer.writerow([stock, price])

In [18]:
f = open('comma_delimited_stock_prices.txt', "r")
print(f.read())

AAPL,90.91

MSFT,41.68

FB,64.5




In [19]:
results = [["test1", "success", "Monday"],
           ["test2", "success, kind of", "Tuesday"],
           ["test3", "failure, kind of", "Wednesday"],
           ["test4", "failure, utter", "Thursday"]]

In [20]:
# don't do this!
with open('bad_csv.txt', 'w') as f:
    for row in results:
        f.write(",".join(map(str, row))) # might have too many commas in it!
        f.write("\n")                    # row might have newlines as well!

In [21]:
f = open('bad_csv.txt', "r")
print(f.read())

test1,success,Monday
test2,success, kind of,Tuesday
test3,failure, kind of,Wednesday
test4,failure, utter,Thursday



In [22]:
from bs4 import BeautifulSoup
import requests

In [23]:
# I put the relevant HTML file on GitHub. In order to fit
# the URL in the book I had to split it across two lines.
# Recall that whitespace-separated strings get concatenated.
url = ("https://raw.githubusercontent.com/"
       "joelgrus/data/master/getting-data.html")
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')

first_paragraph = soup.find('p')        # or just soup.p

In [24]:
soup

<!DOCTYPE html>
<html lang="en-US"><head>
    <title>Getting Data</title>
    <meta charset="utf-8"/>
</head>
<body>
    <h1>Getting Data</h1>
    <div class="explanation">
        This is an explanation.
    </div>
    <div class="comment">
        This is a comment.
    </div>
    <div class="content">
        <p id="p1">This is the first paragraph.</p>
        <p class="important">This is the second paragraph.</p>
    </div>
    <div class="signature">
        <span id="name">Joel</span>
        <span id="twitter">@joelgrus</span>
        <span id="email">joelgrus-at-gmail</span>
    </div>


</body></html>

In [25]:
first_paragraph

<p id="p1">This is the first paragraph.</p>

In [26]:
type(first_paragraph)

bs4.element.Tag

In [27]:
assert str(soup.find('p')) == '<p id="p1">This is the first paragraph.</p>'

In [28]:
first_paragraph_text = soup.p.text
first_paragraph_words = soup.p.text.split()

In [29]:
first_paragraph_text

'This is the first paragraph.'

In [30]:
first_paragraph_words

['This', 'is', 'the', 'first', 'paragraph.']

In [31]:
assert first_paragraph_words == ['This', 'is', 'the', 'first', 'paragraph.']

In [32]:
first_paragraph_id = soup.p['id']       # raises KeyError if no 'id'
first_paragraph_id2 = soup.p.get('id')  # returns None if no 'id'

In [33]:
first_paragraph_id

'p1'

In [34]:
first_paragraph_id2

'p1'

In [35]:
assert first_paragraph_id == first_paragraph_id2 == 'p1'

In [36]:
all_paragraphs = soup.find_all('p')  # or just soup('p')
paragraphs_with_ids = [p for p in soup('p') if p.get('id')]

In [37]:
all_paragraphs

[<p id="p1">This is the first paragraph.</p>,
 <p class="important">This is the second paragraph.</p>]

In [38]:
paragraphs_with_ids

[<p id="p1">This is the first paragraph.</p>]

In [39]:
assert len(all_paragraphs) == 2

In [40]:
assert len(paragraphs_with_ids) == 1

In [41]:
important_paragraphs = soup('p', {'class' : 'important'})
important_paragraphs2 = soup('p', 'important')
important_paragraphs3 = [p for p in soup('p')
                         if 'important' in p.get('class', [])]

In [42]:
important_paragraphs

[<p class="important">This is the second paragraph.</p>]

In [43]:
important_paragraphs2

[<p class="important">This is the second paragraph.</p>]

In [44]:
important_paragraphs3

[<p class="important">This is the second paragraph.</p>]

In [45]:
assert important_paragraphs == important_paragraphs2 == important_paragraphs3

In [46]:
assert len(important_paragraphs) == 1

In [47]:
soup('div')

[<div class="explanation">
         This is an explanation.
     </div>,
 <div class="comment">
         This is a comment.
     </div>,
 <div class="content">
         <p id="p1">This is the first paragraph.</p>
         <p class="important">This is the second paragraph.</p>
     </div>,
 <div class="signature">
         <span id="name">Joel</span>
         <span id="twitter">@joelgrus</span>
         <span id="email">joelgrus-at-gmail</span>
     </div>]

In [48]:
# warning, will return the same span multiple times
# if it sits inside multiple divs
# be more clever if that's the case
spans_inside_divs = [span
                     for div in soup('div')     # for each <div> on the page
                     for span in div('span')]   # find each <span> inside it

In [49]:
spans_inside_divs

[<span id="name">Joel</span>,
 <span id="twitter">@joelgrus</span>,
 <span id="email">joelgrus-at-gmail</span>]

In [50]:
assert len(spans_inside_divs) == 3

In [51]:
def paragraph_mentions(text: str, keyword: str) -> bool:
    """
    Returns True if a <p> inside the text mentions {keyword}
    """
    soup = BeautifulSoup(text, 'html5lib')
    paragraphs = [p.get_text() for p in soup('p')]

    return any(keyword.lower() in paragraph.lower()
               for paragraph in paragraphs)

In [52]:
text = """<body><h1>Facebook</h1><p>Twitter</p>"""
soup = BeautifulSoup(text, 'html5lib')
soup

<html><head></head><body><h1>Facebook</h1><p>Twitter</p></body></html>

In [53]:
assert paragraph_mentions(text, "twitter")       # is inside a <p>

In [54]:
assert not paragraph_mentions(text, "facebook")  # not inside a <p>

In [55]:
{ "title" : "Data Science Book",
  "author" : "Joel Grus",
  "publicationYear" : 2019,
  "topics" : [ "data", "science", "data science"] }

{'title': 'Data Science Book',
 'author': 'Joel Grus',
 'publicationYear': 2019,
 'topics': ['data', 'science', 'data science']}

In [56]:
import json
serialized = """{ "title" : "Data Science Book",
                  "author" : "Joel Grus",
                  "publicationYear" : 2019,
                  "topics" : [ "data", "science", "data science"] }"""

In [57]:
# parse the JSON to create a Python dict
deserialized = json.loads(serialized)
print(deserialized)
type(deserialized)

{'title': 'Data Science Book', 'author': 'Joel Grus', 'publicationYear': 2019, 'topics': ['data', 'science', 'data science']}


dict

In [58]:
assert deserialized["publicationYear"] == 2019

In [59]:
assert "data science" in deserialized["topics"]

In [60]:
def main():
    from bs4 import BeautifulSoup
    import requests
    
    url = "https://www.house.gov/representatives"
    text = requests.get(url).text
    soup = BeautifulSoup(text, "html5lib")
    
    all_urls = [a['href']
                for a in soup('a')
                if a.has_attr('href')]
    
    print(len(all_urls))  # 965 for me, way too many
    
    import re
    
    # Must start with http:// or https://
    # Must end with .house.gov or .house.gov/
    regex = r"^https?://.*\.house\.gov/?$"
    
    # Let's write some tests!
    assert re.match(regex, "http://joel.house.gov")
    assert re.match(regex, "https://joel.house.gov")
    assert re.match(regex, "http://joel.house.gov/")
    assert re.match(regex, "https://joel.house.gov/")
    assert not re.match(regex, "joel.house.gov")
    assert not re.match(regex, "http://joel.house.com")
    assert not re.match(regex, "https://joel.house.gov/biography")
    
    # And now apply
    good_urls = [url for url in all_urls if re.match(regex, url)]
    

In [61]:
url = "https://www.house.gov/representatives"
text = requests.get(url).text
soup = BeautifulSoup(text, "html5lib")
    
all_urls = [a['href']
            for a in soup('a')
            if a.has_attr('href')]
    
print(len(all_urls))

965


In [62]:
import re
    
# Must start with http:// or https://
# Must end with .house.gov or .house.gov/
regex = r"^https?://.*\.house\.gov/?$"
    
# Let's write some tests!
assert re.match(regex, "http://joel.house.gov")
assert re.match(regex, "https://joel.house.gov")
assert re.match(regex, "http://joel.house.gov/")
assert re.match(regex, "https://joel.house.gov/")
assert not re.match(regex, "joel.house.gov")
assert not re.match(regex, "http://joel.house.com")
assert not re.match(regex, "https://joel.house.gov/biography")

In [63]:
# And now apply
good_urls = [url for url in all_urls if re.match(regex, url)]
    
print(len(good_urls))  # still 862 for me

878


In [64]:
num_original_good_urls = len(good_urls)
num_original_good_urls

878

In [65]:
good_urls = list(set(good_urls))
good_urls[:3]

['https://james.house.gov',
 'https://dustyjohnson.house.gov/',
 'https://vanduyne.house.gov']

In [66]:
print(len(good_urls))  # only 431 for me

439


In [67]:
assert len(good_urls) < num_original_good_urls

In [68]:
html = requests.get('https://jayapal.house.gov').text
soup = BeautifulSoup(html, 'html5lib')

In [69]:
# Use a set because the links might appear multiple times.
links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}
links

{'https://jayapal.house.gov/category/news/',
 'https://jayapal.house.gov/category/press-releases/'}

In [70]:
print(links) # {'/media/press-releases'}

{'https://jayapal.house.gov/category/news/', 'https://jayapal.house.gov/category/press-releases/'}


In [71]:
# I don't want this file to scrape all 400+ websites every time it runs.
# So I'm going to randomly throw out most of the urls.
# The code in the book doesn't do this.
import random
good_urls = random.sample(good_urls, 5)
print(f"after sampling, left with {good_urls}")

after sampling, left with ['https://babin.house.gov', 'https://sessions.house.gov', 'https://finstad.house.gov/', 'https://strong.house.gov', 'https://takano.house.gov']


In [72]:
from typing import Dict, Set

In [73]:
press_releases: Dict[str, Set[str]] = {}

In [74]:
for house_url in good_urls:
    html = requests.get(house_url).text
    soup = BeautifulSoup(html, 'html5lib')
    pr_links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}
    print(f"{house_url}: {pr_links}")
    press_releases[house_url] = pr_links

https://babin.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://sessions.house.gov: {'/press-releases'}
https://finstad.house.gov/: {'/press-releases'}
https://strong.house.gov: {'/media/press-releases'}
https://takano.house.gov: {'https://takano.house.gov/newsroom/press-releases'}


In [75]:
for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        url = f"{house_url}/{pr_link}"
        text = requests.get(url).text
    
        if paragraph_mentions(text, 'data'):
            print(f"{house_url}")
            break  # done with this house_url

In [76]:
import requests, json
    
github_user = "joelgrus"
endpoint = f"https://api.github.com/users/{github_user}/repos"

In [77]:
repos = json.loads(requests.get(endpoint).text)
repos[1]["created_at"]

'2018-11-30T22:41:16Z'

In [78]:
from collections import Counter
from dateutil.parser import parse

In [79]:
dates = [parse(repo["created_at"]) for repo in repos]
dates[:3]

[datetime.datetime(2017, 12, 2, 20, 13, 49, tzinfo=tzutc()),
 datetime.datetime(2018, 11, 30, 22, 41, 16, tzinfo=tzutc()),
 datetime.datetime(2019, 12, 1, 2, 57, 18, tzinfo=tzutc())]

In [80]:
month_counts = Counter(date.month for date in dates)
month_counts

Counter({12: 4, 11: 8, 2: 2, 1: 2, 9: 5, 7: 3, 5: 2, 6: 1, 8: 2, 4: 1})

In [81]:
weekday_counts = Counter(date.weekday() for date in dates)
weekday_counts

Counter({5: 5, 4: 6, 6: 4, 2: 8, 1: 6, 3: 1})

In [82]:
last_5_repositories = sorted(repos,
                                 key=lambda r: r["pushed_at"],
                                 reverse=True)[:5]

In [83]:
last_5_repositories[1]["language"]

'Python'

In [84]:
last_5_languages = [repo["language"]
                        for repo in last_5_repositories]

In [85]:
last_5_languages

['JavaScript', 'Python', 'Python', 'Python', 'Python']

In [86]:
# import os
    
# # Feel free to plug your key and secret in directly
# CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY")
# CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET")
    
# import webbrowser
# from twython import Twython
    
# # Get a temporary client to retrieve an authentication url
# temp_client = Twython(CONSUMER_KEY, CONSUMER_SECRET)
# temp_creds = temp_client.get_authentication_tokens()
# url = temp_creds['auth_url']
    
# # Now visit that URL to authorize the application and get a PIN
# print(f"go visit {url} and get the PIN code and paste it below")
# webbrowser.open(url)
# PIN_CODE = input("please enter the PIN code: ")
    
# # Now we use that PIN_CODE to get the actual tokens
# auth_client = Twython(CONSUMER_KEY,
#                         CONSUMER_SECRET,
#                         temp_creds['oauth_token'],
#                         temp_creds['oauth_token_secret'])
# final_step = auth_client.get_authorized_tokens(PIN_CODE)
# ACCESS_TOKEN = final_step['oauth_token']
# ACCESS_TOKEN_SECRET = final_step['oauth_token_secret']
    
# # And get a new Twython instance using them.
# twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    
# from twython import TwythonStreamer
    
# # Appending data to a global variable is pretty poor form
# # but it makes the example much simpler
# tweets = []
    
# class MyStreamer(TwythonStreamer):
#     def on_success(self, data):
#         """
#         What do we do when twitter sends us data?
#         Here data will be a Python dict representing a tweet
#         """
#         # We only want to collect English-language tweets
#         if data.get('lang') == 'en':
#             tweets.append(data)
#             print(f"received tweet #{len(tweets)}")
    
#         # Stop when we've collected enough
#         if len(tweets) >= 100:
#             self.disconnect()
    
#     def on_error(self, status_code, data):
#         print(status_code, data)
#         self.disconnect()
    
# stream = MyStreamer(CONSUMER_KEY, CONSUMER_SECRET,
#                     ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    
# # starts consuming public statuses that contain the keyword 'data'
# stream.statuses.filter(track='data')
    
# # if instead we wanted to start consuming a sample of *all* public statuses
# # stream.statuses.sample()
    
# if __name__ == "__main__": main()