# Getting Data

In [1]:
from Ch09_Getting_Data import *

## stdin and stdout

## Reading Files

### The Basics of Text Files

In [2]:
# 'r' means read-only; it is also the mode used when none is given
file_for_reading = open('reading_file.txt', 'r')
file_for_reading2 = open('reading_file.txt')

# 'w' is write -- an existing file of the same name is truncated
file_for_writing = open('writing_file.txt', 'w')

# 'a' is append -- new data is added at the end of the file
file_open_appending = open('appending_file.txt', 'a')

# don't forget to close your files when you're done -- every handle that was
# opened above, not only the writer, otherwise the descriptors stay open
file_for_reading.close()
file_for_reading2.close()
file_for_writing.close()
file_open_appending.close()

```
with open(filename) as f:
    data = function_that_gets_data_from(f)
    
# at this point f has already been closed, so don't try to use it
process(data)
```

In [3]:
# get_domain is not defined in any cell of this notebook; presumably it is
# supplied by the star import of Ch09_Getting_Data -- TODO confirm.
# In both checks the expected value is the part of the address after the '@'.
assert get_domain('cuixuanstephem@gmail.com') == 'gmail.com'
assert get_domain('joel@m.datasciencester.com') == 'm.datasciencester.com'

## Scraping the Web

### HTML and the Parsing Thereof

In [4]:
from bs4 import BeautifulSoup
import requests

In [5]:
url = ("https://raw.githubusercontent.com/joelgrus/data/master/getting-data.html")

# Bound the wait (seconds) so a stalled server cannot hang the notebook, and
# fail loudly on a 4xx/5xx instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html5lib')

# soup.find('p') and soup.p both refer to the first <p> tag in the document
first_paragraph = soup.find('p')
first_paragraph_text = soup.p.text
first_paragraph_words = soup.p.text.split()

# Two ways to read an attribute of a tag
first_paragraph_id = soup.p['id']  # raises KeyError if no 'id'
first_paragraph_id2 = soup.p.get('id')  # returns None if no 'id'

Frequently, you’ll want to find tags with a specific class:

In [6]:
# Select <p> tags by class using an explicit attribute dictionary.
important_paragraphs = soup.find_all('p', attrs={'class': 'important'})
important_paragraphs

[<p class="important">This is the second paragraph.</p>]

In [7]:
# Shorthand: a bare string in the second position is matched against the class.
important_paragraphs2 = soup.find_all('p', 'important')
important_paragraphs2

[<p class="important">This is the second paragraph.</p>]

In [8]:
# Manual variant: walk every <p> and keep those whose class list contains
# 'important' (tags without a class attribute fall back to an empty list).
important_paragraphs3 = list(filter(
    lambda paragraph: 'important' in paragraph.get('class', []),
    soup.find_all('p'),
))
important_paragraphs3

[<p class="important">This is the second paragraph.</p>]

In [9]:
# For each <div> in the document, gather the <span> tags found inside it.
spans_inside_divs = [inner_span
                     for outer_div in soup.find_all('div')
                     for inner_span in outer_div.find_all('span')]
spans_inside_divs

[<span id="name">Joel</span>,
 <span id="twitter">@joelgrus</span>,
 <span id="email">joelgrus-at-gmail</span>]

### Example: Keeping Tabs on Congress

In [10]:
url = "https://www.house.gov/representatives"

# Bound the wait (seconds) so a stalled server cannot hang the notebook, and
# fail loudly on a 4xx/5xx instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
text = response.text
soup = BeautifulSoup(text, 'html5lib')

In [11]:
# Collect the link target of every anchor that actually carries an href.
all_urls = [anchor['href'] for anchor in soup.find_all('a') if 'href' in anchor.attrs]
len(all_urls)

967

This returns way too many URLs. If you look at them, the ones we want start with either http:// or https://, have some kind of name, and end with either .house.gov or .house.gov/.

In [12]:
import re

In [13]:
# Must start with http:// or https://, and end with .house.gov or .house.gov/
regex = r"^https?://.*\.house\.gov/?$"

# URLs the pattern has to accept
assert all(re.match(regex, candidate) for candidate in (
    "http://joel.house.gov",
    "https://joel.house.gov",
    "http://joel.house.gov/",
    "https://joel.house.gov/",
))

# URLs the pattern has to reject: no scheme, wrong domain, trailing path
assert not any(re.match(regex, candidate) for candidate in (
    "joel.house.gov",
    "http://joel.house.com",
    "https://joel.house.gov/biography",
))

In [14]:
# Keep only the URLs that the representative-site pattern accepts.
good_urls = list(filter(lambda candidate: re.match(regex, candidate), all_urls))
len(good_urls)

880

If you look at the list, there are a lot of duplicates. Let’s use set to get rid of them:

In [15]:
# Deduplicate through a set, then sort: iteration order of a set of strings
# can differ between interpreter runs, so sorting keeps the list reproducible.
good_urls = sorted(set(good_urls))
len(good_urls)

440

In [16]:
# Bound the wait (seconds) so a stalled server cannot hang the notebook, and
# fail loudly on a 4xx/5xx instead of silently parsing an error page.
response = requests.get('https://jayapal.house.gov', timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html5lib')

# Anchors without an href are skipped; indexing them would raise KeyError.
links = {a['href']
         for a in soup('a')
         if a.has_attr('href') and 'press releases' in a.text.lower()}
links

{'https://jayapal.house.gov/category/news/',
 'https://jayapal.house.gov/category/press-releases/'}

In [17]:
# Takes a long time to run, so it is commented out for now
# from typing import Dict, Set
# press_releases: Dict[str, Set[str]] = {}

# for house_url in good_urls:
#     html = requests.get(house_url).text
#     soup = BeautifulSoup(html, 'html5lib')
#     pr_links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}

#     press_releases[house_url] = pr_links

In [18]:
# Small fixture: "Facebook" appears only in the <h1>, "Twitter" only in the <p>.
text = "<body><h1>Facebook</h1><p>Twitter</p>"

# The keyword inside the <p> is found even though its case differs ...
assert paragraph_mentions(text, "twitter")
# ... while the keyword that appears only in the <h1> is not.
assert not paragraph_mentions(text, "facebook")

In [19]:
# for house_url, pr_links in press_releases.items():
#     for pr_link in pr_links:
#         url = f'{house_url}/{pr_link}'
#         text = requests.get(url).text
#         if paragraph_mentions(text, 'data'):
#             print(f'{house_url}')
#             break

## Using APIs

### JSON and XML

In [21]:
# `deserialized` is not defined in any cell shown here; presumably it is
# supplied by the star import of Ch09_Getting_Data -- TODO confirm.
# The checks index it by the keys 'publicationYear' and 'topics'.
assert deserialized['publicationYear'] == 2019
assert "data science" in deserialized['topics']

### Using an Unauthenticated API

## Example: Using the Twitter APIs