In [72]:
import bs4
import requests
import html5lib
import time
import json

In [73]:
base_url = "http://www.ilga.gov"

In [74]:
leg_url = base_url + "/legislation/"

In [75]:
print(leg_url)

http://www.ilga.gov/legislation/


In [76]:
r = requests.get(leg_url)

In [77]:
html_text = r.text.encode('iso-8859-1')

In [78]:
soup = bs4.BeautifulSoup(html_text, "html5lib")

In [79]:
links = soup.find_all("a")

In [80]:
mid_links = []
for a in links:
    if a.has_attr("href") and a['href'].startswith('grplist.asp'):
        mid_url = leg_url + a['href']
        mid_r = requests.get(mid_url)
        mid_html_text = mid_r.text.encode('iso-8859-1')
        mid_soup = bs4.BeautifulSoup(mid_html_text, "html5lib")
        mid_links.extend(mid_soup.find_all("a"))
        

In [81]:
sbill_links = []
hbill_links = []
for mid_a in mid_links:
    if mid_a.text.startswith("SB"):
        bill_url = base_url + mid_a['href']
        sbill_links.append(bill_url)
    elif mid_a.text.startswith("HB"):
        bill_url = base_url + mid_a['href']
        hbill_links.append(bill_url)

In [82]:
len(sbill_links)

3982

In [83]:
len(hbill_links)

5752

In [84]:
r_test = requests.get(sbill_links[1559])
html_test = r_test.text.encode('iso-8859-1')
soup_test = bs4.BeautifulSoup(html_test, "html5lib")

In [85]:
soup_test

<html lang="en"><!-- Trigger/Open The Modal --><head></head><body alink="#9933FF" bgcolor="#FFFFFF" leftmargin="0" link="#3366FF" onload="document.NumString.NumSearch.focus();" text="#000000" topmargin="0" vlink="#663366"><div style="position: fixed; z-index: 999; top: 5; left: 600; background-color: navy; display: block">
<button id="myBtn" style="color: white; background-color: navy; display: block">Translate Website</button></div>
<!-- The Modal -->
<div class="modal" id="myModal" style="display: none">
  <!-- Modal content -->
  <div class="modal-content">
      <div class="modal-header"><h3>
    <span class="close">×</span></h3></div>    
    <p>The Illinois General Assembly offers the Google Translate™ service for visitor convenience. In no way should it be considered accurate as to the translation of any content herein.</p>
    <p>Visitors of the Illinois General Assembly website are encouraged to use other translation services available on the internet.</p>
    <p>The English l

In [106]:
bill_info = {}
soup_test.find_all("span")

Bill Number

In [87]:
bill_number = soup_test.title.text.split()[-1]
bill_info[bill_number] = {}
print(bill_info)

{'SB1560': {}}


Bill Title (Short Description) and Status

In [88]:
heading_tags = soup_test.find_all('span', class_ = "heading2")
for tag in heading_tags:
    if tag.text.startswith("Short Description"):
        description_tag = tag
    if tag.text.startswith("Last Action"):
        last_action_marker = tag.next.next.next
        break

In [89]:
bill_short_descr = description_tag.next.next.next.text
bill_info[bill_number]['short description'] = bill_short_descr
print(bill_info)

{'SB1560': {'short description': 'SAFETY-TECH'}}


In [90]:
last_action_tags = last_action_marker.find_all('td', class_ = "content")
for tag in last_action_tags:
    if tag['align'] == 'left': last_action = tag.text
    if tag['align'] == 'right': last_action_date = tag.text.strip()
    if tag['align'] == 'center': last_action_body = tag.text

In [91]:
bill_info[bill_number]['last action'] = {'action': last_action, 
                                         'date': last_action_date, 
                                         'body': last_action_body}
if last_action.endswith('Assignments'): bill_status = 'In process - not yet assigned to committee'
elif last_action.startswith("Public Act"): bill_status = 'Passed and signed into law'
else: bill_status = 'In process'

In [92]:
bill_info['SB1560']

{'short description': 'SAFETY-TECH',
 'last action': {'action': 'Referred to Assignments',
  'date': '2/15/2019',
  'body': 'Senate'}}

Senate Sponsors

In [93]:
next_tag = None
for tag in heading_tags:
    if tag.text.startswith("Senate Sponsors"):
        next_tag = tag.next_sibling.next_sibling
        break
    

In [94]:
senate_sponsors = []
while True:
    if next_tag is None: break
    if isinstance(next_tag, bs4.element.Tag) and next_tag.has_attr("href"):
        senate_sponsors.append(next_tag.text)
    next_tag = next_tag.next_sibling
    if next_tag in heading_tags: break

In [95]:
bill_info[bill_number]['senate sponsors'] = senate_sponsors
print(bill_info)

{'SB1560': {'short description': 'SAFETY-TECH', 'last action': {'action': 'Referred to Assignments', 'date': '2/15/2019', 'body': 'Senate'}, 'senate sponsors': ['Thomas Cullerton']}}


House Sponsors

In [96]:
next_tag = None
for tag in heading_tags:
    if tag.text.startswith("House Sponsors"):
        next_tag = tag.next_sibling.next_sibling
        break 

In [97]:
house_sponsors = []
while True:
    if next_tag is None: break
    if isinstance(next_tag, bs4.element.Tag) and next_tag.has_attr("href"):
        house_sponsors.append(next_tag.text)
    next_tag = next_tag.next_sibling
    if next_tag in heading_tags: break

In [98]:
bill_info[bill_number]['house sponsors'] = house_sponsors
print(bill_info)

{'SB1560': {'short description': 'SAFETY-TECH', 'last action': {'action': 'Referred to Assignments', 'date': '2/15/2019', 'body': 'Senate'}, 'senate sponsors': ['Thomas Cullerton'], 'house sponsors': []}}


Synopsis

In [99]:
synopsis_list = []
for tag in soup_test.find_all('span', class_ = "content notranslate"):
    synopsis_list.append(tag.text.strip())
synopsis = "  ".join(synopsis_list)  

In [100]:
bill_info[bill_number]['synopsis'] = synopsis.strip()
print(bill_info)

{'SB1560': {'short description': 'SAFETY-TECH', 'last action': {'action': 'Referred to Assignments', 'date': '2/15/2019', 'body': 'Senate'}, 'senate sponsors': ['Thomas Cullerton'], 'house sponsors': [], 'synopsis': 'Amends the Environmental Protection Act. Makes a technical change in a Section concerning the short title.'}}


#### Last Action

In [102]:
t0 = time.time()

In [108]:
for link in sbill_links:
    r_bill = requests.get(link)
    html_bill = r_bill.text.encode('iso-8859-1')
    soup_bill = bs4.BeautifulSoup(html_bill, "html5lib")
    
    #Bill number
    bill_number = soup_bill.title.text.split()[-1]
    bill_info[bill_number] = {}
    
    #Bill description and last action
    heading_tags = soup_bill.find_all('span', class_ = "heading2")
    for tag in heading_tags:
        if tag.text.startswith("Short Description"):
            description_tag = tag
        if tag.text.startswith("Last Action"):
            last_action_marker = tag.next.next.next
            break
    bill_short_descr = description_tag.next.next.next.text
    bill_info[bill_number]['short description'] = bill_short_descr
    last_action_tags = last_action_marker.find_all('td', class_ = "content")
    for tag in last_action_tags:
        if tag['align'] == 'left': last_action = tag.text
        if tag['align'] == 'right': last_action_date = tag.text.strip()
        if tag['align'] == 'center': last_action_body = tag.text
    bill_info[bill_number]['last action'] = {'action': last_action, 
                                             'date': last_action_date, 
                                             'body': last_action_body}
    if last_action.endswith('Assignments'): bill_status = 'In process - not yet assigned to committee'
    elif last_action.startswith("Public Act"): bill_status = 'Passed and signed into law'
    else: bill_status = 'In process'
    bill_info[bill_number]['status'] = bill_status
    
    #Senate sponsors
    next_tag = None
    for tag in heading_tags:
        if tag.text.startswith("Senate Sponsors"):
            next_tag = tag.next_sibling.next_sibling
            break
    senate_sponsors = []
    while True:
        if next_tag is None: break
        if isinstance(next_tag, bs4.element.Tag) and next_tag.has_attr("href"):
            senate_sponsors.append(next_tag.text)
        next_tag = next_tag.next_sibling
        if next_tag in heading_tags: break
    bill_info[bill_number]['senate sponsors'] = senate_sponsors
    
    #House sponsors
    next_tag = None
    for tag in heading_tags:
        if tag.text.startswith("House Sponsors"):
            next_tag = tag.next_sibling.next_sibling
            break
    house_sponsors = []
    while True:
        if next_tag is None: break
        if isinstance(next_tag, bs4.element.Tag) and next_tag.has_attr("href"):
            house_sponsors.append(next_tag.text)
        next_tag = next_tag.next_sibling
        if next_tag in heading_tags: break
    bill_info[bill_number]['house sponsors'] = house_sponsors
    
    #Synopsis
    synopsis_list = []
    for tag in soup_bill.find_all('span', class_ = "content notranslate"):
        synopsis_list.append(tag.text.strip())
    synopsis = "  ".join(synopsis_list)
    bill_info[bill_number]['synopsis'] = synopsis.strip()
delta_t = time.time() - t0    

Export the bill info to a json file

In [109]:
with open('bill_info.json', 'w') as f:
    json.dump(bill_info, f)

In [31]:
print(delta_t)

752.4093079566956


In [105]:
bill_info

{'SB1560': {'short description': 'SAFETY-TECH',
  'last action': {'action': 'Referred to Assignments',
   'date': '2/15/2019',
   'body': 'Senate'},
  'senate sponsors': ['Thomas Cullerton'],
  'house sponsors': [],
  'synopsis': 'Amends the Environmental Protection Act. Makes a technical change in a Section concerning the short title.'},
 'SB0001': {'short description': 'MINIMUM WAGE/INCOME TAX CREDIT',
  'last action': {'action': 'Public Act . . . . . . . . . 101-0001',
   'date': '2/19/2019',
   'body': 'Senate'},
  'status': 'Passed and signed into law',
  'senate sponsors': ['Kimberly A. Lightford',
   'Jacqueline Y. Collins',
   'Antonio Muñoz',
   'Iris Y. Martinez',
   'Mattie Hunter',
   'Patricia Van Pelt',
   'Robert Peters',
   'Ann Gillespie',
   'Ram Villivalam',
   'Omar Aquino',
   'Martin A. Sandoval',
   'Terry Link',
   'Elgie R. Sims, Jr.',
   'Toi W. Hutchinson',
   'Cristina Castro',
   'Emil Jones, III',
   'Christopher Belt'],
  'house sponsors': ['Will Guzzardi

In [112]:
bill_info_2 = {}

In [113]:
for link in hbill_links:
    r_bill = requests.get(link)
    html_bill = r_bill.text.encode('iso-8859-1')
    soup_bill = bs4.BeautifulSoup(html_bill, "html5lib")
    
    #Bill number
    bill_number = soup_bill.title.text.split()[-1]
    bill_info_2[bill_number] = {}
    
    #Bill description and last action
    heading_tags = soup_bill.find_all('span', class_ = "heading2")
    for tag in heading_tags:
        if tag.text.startswith("Short Description"):
            description_tag = tag
        if tag.text.startswith("Last Action"):
            last_action_marker = tag.next.next.next
            break
    bill_short_descr = description_tag.next.next.next.text
    bill_info_2[bill_number]['short description'] = bill_short_descr
    last_action_tags = last_action_marker.find_all('td', class_ = "content")
    for tag in last_action_tags:
        if tag['align'] == 'left': last_action = tag.text
        if tag['align'] == 'right': last_action_date = tag.text.strip()
        if tag['align'] == 'center': last_action_body = tag.text
    bill_info_2[bill_number]['last action'] = {'action': last_action, 
                                             'date': last_action_date, 
                                             'body': last_action_body}
    if last_action.endswith('Assignments'): bill_status = 'In process - not yet assigned to committee'
    elif last_action.startswith("Public Act"): bill_status = 'Passed and signed into law'
    else: bill_status = 'In process'
    bill_info_2[bill_number]['status'] = bill_status
    
    #Senate sponsors
    next_tag = None
    for tag in heading_tags:
        if tag.text.startswith("Senate Sponsors"):
            next_tag = tag.next_sibling.next_sibling
            break
    senate_sponsors = []
    while True:
        if next_tag is None: break
        if isinstance(next_tag, bs4.element.Tag) and next_tag.has_attr("href"):
            senate_sponsors.append(next_tag.text)
        next_tag = next_tag.next_sibling
        if next_tag in heading_tags: break
    bill_info_2[bill_number]['senate sponsors'] = senate_sponsors
    
    #House sponsors
    next_tag = None
    for tag in heading_tags:
        if tag.text.startswith("House Sponsors"):
            next_tag = tag.next_sibling.next_sibling
            break
    house_sponsors = []
    while True:
        if next_tag is None: break
        if isinstance(next_tag, bs4.element.Tag) and next_tag.has_attr("href"):
            house_sponsors.append(next_tag.text)
        next_tag = next_tag.next_sibling
        if next_tag in heading_tags: break
    bill_info_2[bill_number]['house sponsors'] = house_sponsors
    
    #Synopsis
    synopsis_list = []
    for tag in soup_bill.find_all('span', class_ = "content notranslate"):
        synopsis_list.append(tag.text.strip())
    synopsis = "  ".join(synopsis_list)
    bill_info_2[bill_number]['synopsis'] = synopsis.strip()

In [115]:
len(bill_info_2)

5752

In [116]:
with open('bill_info_2.json', 'w') as f:
    json.dump(bill_info_2, f)