Permalink
Browse files

scrape_house beginnings

  • Loading branch information...
jamesturk committed Sep 29, 2010
1 parent 4007063 commit 27e40c7cf941bf4b32b197f56c8876bae35c55eb
Showing with 20 additions and 0 deletions.
  1. +20 −0 legislators/update_legislators.py
@@ -6,6 +6,7 @@
from collections import defaultdict
import string
from xml.dom import minidom
+import lxml.html
from votesmart import votesmart, VotesmartApiError
# NOTE(review): API key is hard-coded and committed to source control —
# it should be loaded from an environment variable or a config file,
# and the exposed key rotated.
votesmart.apikey = '496ec1875a7885ec65a4ead99579642c'
@@ -351,6 +352,25 @@ def standardize_file(csvfile):
l['birthdate'] = newdate
tbl.save_to(csvfile)
+def scrape_house(csvfile):
+ table = LegislatorTable(csvfile)
+ base_url = 'http://clerk.house.gov/member_info/mem_contact_info.html?statdis=%s%02d'
+ for leg in table.get_legislators(in_office='1', title='Rep'):
+ url = base_url % (leg['state'], int(leg['district']))
+ data = urllib2.urlopen(url).read()
+ doc = lxml.html.fromstring(data)
+ try:
+ # district, addr, city, phone, (optional note about subcommittees)
+ addr = doc.xpath('//div[@id="results"]/p/text()')[1]
+ addr = addr.strip().replace('HOB', 'House Office Building')
+ if addr != leg['congress_office']:
+ leg['congress_office'] = addr
+ print 'Updated', leg['firstname'], leg['lastname']
+ except Exception as e:
+ print 'encountered exception (%s) while processing %s %s' % (
+ e, leg['firstname'], leg['lastname'])
+ table.save_to(csvfile)
+
def main():
from optparse import OptionParser
parser = OptionParser()

0 comments on commit 27e40c7

Please sign in to comment.