# Data Gathering 1: Urllib3
Simple exercises to get familiar with fetching web pages using `urllib.request` (from the Python standard library) and `urllib3`.

In [5]:
from urllib.request import urlopen

# Fetch the New York Times front page with the standard library.
# The `with` block closes the HTTP response (and its socket) as soon as we
# are done with it, instead of leaving it open until garbage collection.
with urlopen('https://www.nytimes.com/') as response:
    html_doc = response.read()  # the whole response body, as bytes
    # geturl() reports the URL that was actually retrieved, which can differ
    # from the requested one if the server redirected us.
    print("Url is:", response.geturl())
print(html_doc)

Url is: https://www.nytimes.com/


In [13]:
# Request the Guardian home page and look at where we actually ended up.
# Only the final URL is needed, so the response is closed right away by the
# `with` block rather than being left open.
with urlopen('https://www.theguardian.com') as response:
    guardian_url = response.geturl()

# Last expression of the cell: Jupyter displays it as the cell output.
guardian_url

'https://www.theguardian.com/us'

In [6]:
import urllib3

# Same page as before, this time fetched through urllib3.
NYTIMES_URL = 'https://www.nytimes.com/'

# Requests in urllib3 are issued through a PoolManager instance.
http = urllib3.PoolManager()
response = http.request(method='GET', url=NYTIMES_URL)

# `.data` holds the response body as bytes.
html_doc = response.data

print("Url is:", response.geturl())
print(html_doc)

Url is: https://www.nytimes.com/


# Data Gathering 2: BeautifulSoup
Once you have retrieved an HTML document, you will want to process it. BeautifulSoup is a useful and reliable package for this.

In [7]:
from bs4 import BeautifulSoup

# Always name the parser explicitly. Without it BeautifulSoup picks whichever
# parser happens to be installed (and emits GuessedAtParserWarning), so the
# same notebook can behave differently on another machine. "html.parser"
# ships with Python, so it needs no extra install.
soup = BeautifulSoup("<html>data</html>", "html.parser")
soup = BeautifulSoup('<b class="boldest">Very bold</b>', "html.parser")

# Attribute-style access returns the first tag with that name.
tag = soup.b
print("Tag is:", tag)

# A Tag can be modified in place: rename it, replace its text content, and
# set attributes with dictionary-style assignment.
tag.name = "blockquote"
tag.string = "Extremely Bold"
tag['class'] = 'verybold'
tag['id'] = 1
print("Tag after changes is:", tag)

Tag is: <b class="boldest">Very bold</b>
Tag after changes is: <blockquote class="verybold" id="1">Extremely Bold</blockquote>


In [8]:
# Get content with a particular tag
from urllib.request import urlopen

# Download the page; the `with` block closes the connection once the body
# has been read.
with urlopen('https://www.charitynavigator.org/index.cfm?bay=topten.detail&listid=148') as response:
    html_doc = response.read()

# Name the parser explicitly so the result does not depend on which optional
# parsers are installed (and to avoid GuessedAtParserWarning).
soup = BeautifulSoup(html_doc, 'html.parser')

# Let's get the names of the charities in this category: they are the link
# texts inside the first <table> on the page.
# find() returns None when nothing matches, so a changed page layout gives a
# readable message instead of an IndexError from find_all(...)[0].
tbl = soup.find('table')
if tbl is None:
    print("No <table> element found - the page layout may have changed.")
else:
    for link in tbl.find_all('a'):
        print(link.text)


Doctors Without Borders, USA
American Red Cross
ALSAC - St. Jude Children's Research Hospital
The Nature Conservancy
Natural Resources Defense Council
World Wildlife Fund
UNICEF USA
Save the Children
DAV (Disabled American Veterans) Charitable Service Trust
American Society for the Prevention of Cruelty to Animals


# XML
Most content online is in HTML format, but it is by no means the only common one. XML is another useful format (and you should already be familiar with JSON).

In [11]:
# Don't forget: pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import xml.etree.ElementTree as elementtree

# You can get data for other years here: https://www.senate.gov/legislative/Public_Disclosure/database_download.htm
dom = elementtree.parse('senate-lobbying-2021_1_1_1.xml')
filinglist = dom.getroot()

# Collect the text of every 'SpecificIssue' attribute into a list and join it
# once at the end, rather than growing one big string inside the loop.
issue_texts = []
for filing in filinglist:
    # Only the first <Issues> element found under each filing is used.
    issues = next(filing.iter('Issues'), None)
    if issues is None:
        continue
    for issue in issues.iter('Issue'):
        # .get() returns None when the attribute is absent; skip those (and
        # empty values) instead of failing on `str + None`.
        text = issue.attrib.get('SpecificIssue')
        if text:
            issue_texts.append(text)

allText = ' '.join(issue_texts)

# Build the word cloud from the combined text and save it as a PNG file.
wordcloud = WordCloud().generate(allText)
img = wordcloud.to_image()
img.save("senate-wordcloud.png")