In [92]:
import bs4
import requests
import html5lib
import time
import json
import csv
import re

In [93]:
# Site root plus the two chamber listing pages that get scraped below.
base_url = "http://www.ilga.gov"
senate_url = "{}/senate/".format(base_url)
house_url = "{}/house/".format(base_url)

#### Grab HTML

In [94]:
def url_to_soup(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup/html5lib.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on an unresponsive host.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never parsed as if it were the requested content.
    requests.RequestException
        On connection failures or timeouts.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Hand the raw bytes to the parser. Re-encoding ``r.text`` as ISO-8859-1
    # only reproduces these bytes when requests decoded the body as Latin-1,
    # and raises UnicodeEncodeError for any character outside that range.
    return bs4.BeautifulSoup(r.content, "html5lib")


In [98]:
# Accumulator for the scraped member records, keyed by member name.
rep_info = dict()

In [96]:
def update_rep_dict(rep_info, url, chamber):
    """Scrape a chamber's member listing and add one record per member to ``rep_info``.

    For every member found on the listing page, the member's committee page is
    fetched as well and the committees linked from it are recorded.

    Parameters
    ----------
    rep_info : dict
        Mapping of member name -> record, updated in place. An existing entry
        with the same name is overwritten.
    url : str
        Address of the chamber listing page. Committee-page hrefs found on the
        listing are appended to this string to build the full address.
    chamber : str
        Label prefixed to the district number (e.g. ``'Senate'``).

    Returns
    -------
    None

    Each stored record has the keys ``'party'``, ``'district'``,
    ``'committees'`` (list of ``(committee name, committee id)`` tuples) and
    ``'bills'`` (empty tallies to be filled in later).
    """
    # The indexing below treats the "detail" cells as consecutive groups of
    # five per member: [0] name link, [2] committees link, [3] district, [4] party.
    cells_per_member = 5

    soup = url_to_soup(url)
    detail_cells = soup.find_all('td', class_="detail")

    # Stop at the last complete group so stray trailing cells cannot raise
    # IndexError after all the network requests have already been made.
    usable = len(detail_cells) - len(detail_cells) % cells_per_member

    for i in range(0, usable, cells_per_member):
        name = detail_cells[i].a.text
        committees_link = url + detail_cells[i + 2].a["href"]
        district = detail_cells[i + 3].text
        party = detail_cells[i + 4].text

        # Go to the committees page and collect (name, id) for every
        # committee whose link points at a hearing page.
        committee_soup = url_to_soup(committees_link)
        committees = []
        for cell in committee_soup.find_all('td', class_="billlist"):
            anchor = cell.a
            # Anchors without an href are ignored instead of raising KeyError.
            href = anchor.get('href', '') if anchor else ''
            if 'hearing' not in href:
                continue
            # Capture the whole numeric ID; a fixed four-digit pattern would
            # truncate longer IDs and fail on shorter ones.
            id_match = re.search(r'ID=(\d+)', href)
            if id_match is None:
                continue
            committees.append((cell.text.strip(), id_match.group(1)))

        # Store the record only once it is complete, so a failed committee
        # fetch never leaves a half-built entry behind.
        rep_info[name] = {
            'party': party,
            'district': str(chamber) + ' ' + str(district),
            'committees': committees,
            'bills': {'ids': [], 'count sponsored': 0, 'count passed': 0, 'topic counts': {}},
        }

In [99]:
# Scrape both chambers into the shared rep_info dictionary (Senate first, then House).
for chamber_url, chamber_label in ((senate_url, 'Senate'), (house_url, 'House')):
    update_rep_dict(rep_info, chamber_url, chamber_label)

In [100]:
# Show the scraped records; at this point every member's 'bills' tallies are still empty.
rep_info

{'Neil Anderson': {'party': 'R',
  'district': 'Senate 36',
  'committees': [('Commerce and Economic Development', '2358'),
   ('Committee of the Whole', '2332'),
   ('Energy and Public Utilities', '2343'),
   ('Licensed Activities', '2325'),
   ('Opioid Crisis Abatement Spec. Com.', '2644'),
   ('Oversight Medicaid Mang. Care, Spec', '2611'),
   ('Subcommittee on Capital (TR)', '2583'),
   ('Transportation', '2327'),
   ('Veterans Affairs', '2374')],
  'bills': {'ids': [],
   'count sponsored': 0,
   'count passed': 0,
   'topic counts': {}}},
 'Omar Aquino': {'party': 'D',
  'district': 'Senate 2',
  'committees': [('Appropriations I', '2317'),
   ('Appropriations II', '2329'),
   ('Committee of the Whole', '2332'),
   ('Education', '2318'),
   ('Executive', '2320'),
   ('Government Accountability/Pensions', '2373'),
   ('Higher Education', '2333'),
   ('Labor', '2334'),
   ('Opioid Crisis Abatement Spec. Com.', '2644'),
   ('Sub. on Traffic Safety Systems', '2651'),
   ('Subcommitte

#### Bring in bill data

In [101]:
# Load the bill records from disk. The encoding is given explicitly because
# open() otherwise falls back to the platform's locale encoding, which can
# fail or mis-decode non-ASCII text in the JSON file.
with open('bill_info.json', encoding='utf-8') as f:
    bill_info = json.load(f)

#### Iterate through bills and update rep dictionary

In [103]:
# Start every member from empty tallies so that re-running this cell
# (without re-scraping) does not double-count bills.
for rep_record in rep_info.values():
    rep_record['bills'] = {'ids': [], 'count sponsored': 0, 'count passed': 0, 'topic counts': {}}

# Sponsor names that have no entry in rep_info, mapped to the bills they were
# listed on. Kept for inspection instead of aborting the loop with a KeyError
# halfway through, which would leave the tallies partially updated.
unmatched_sponsors = {}

for bill_num, bill_dict in bill_info.items():
    # Only the first-listed sponsor from each chamber is credited with the bill.
    sponsors = []
    if bill_dict['senate sponsors']:
        sponsors.append(bill_dict['senate sponsors'][0])
    if bill_dict['house sponsors']:
        sponsors.append(bill_dict['house sponsors'][0])

    for sponsor in sponsors:
        if sponsor not in rep_info:
            unmatched_sponsors.setdefault(sponsor, []).append(bill_num)
            continue

        tallies = rep_info[sponsor]['bills']
        tallies['ids'].append(bill_num)
        tallies['count sponsored'] += 1
        if bill_dict['status'] == 'Passed and signed into law':
            tallies['count passed'] += 1

        # The bill's committee is used as its topic; bills without one add nothing.
        topic = bill_dict['committee']
        if topic:
            topic_counts = tallies['topic counts']
            topic_counts[topic] = topic_counts.get(topic, 0) + 1
        

In [104]:
# Show the records again, now with the per-member bill tallies filled in.
rep_info

{'Neil Anderson': {'party': 'R',
  'district': 'Senate 36',
  'committees': [('Commerce and Economic Development', '2358'),
   ('Committee of the Whole', '2332'),
   ('Energy and Public Utilities', '2343'),
   ('Licensed Activities', '2325'),
   ('Opioid Crisis Abatement Spec. Com.', '2644'),
   ('Oversight Medicaid Mang. Care, Spec', '2611'),
   ('Subcommittee on Capital (TR)', '2583'),
   ('Transportation', '2327'),
   ('Veterans Affairs', '2374')],
  'bills': {'ids': ['SB0040',
    'SB0058',
    'SB0120',
    'SB0148',
    'SB0149',
    'SB0160',
    'SB0167',
    'SB0173',
    'SB0944',
    'SB1042',
    'SB1137',
    'SB1138',
    'SB1219',
    'SB1435',
    'SB1492',
    'SB1815',
    'SB1860',
    'SB1870',
    'SB1872',
    'SB1873',
    'SB1986',
    'SB2117',
    'SB2535',
    'SB3033',
    'SB3242',
    'SB3444',
    'SB3445',
    'SB3695',
    'SB3705',
    'SB3706',
    'SB3707',
    'SB3708',
    'HB0271',
    'HB0822',
    'HB0889',
    'HB1554',
    'HB1659',
    'HB239

#### Rep database

In [None]:
# One row per member: [name, party, district, bills sponsored, bills passed].
rep_data = []
for name, record in rep_info.items():
    # The counts are stored under the nested 'bills' dict, not at the top
    # level of the record; reading them from the record itself raises KeyError.
    bill_tallies = record['bills']
    rep_data.append([name,
                     record['party'],
                     record['district'],
                     bill_tallies['count sponsored'],
                     bill_tallies['count passed']])