forked from openstates/openstates-scrapers
-
Notifications
You must be signed in to change notification settings - Fork 1
/
legislators.py
79 lines (59 loc) · 2.65 KB
/
legislators.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import re
from billy.scrape import NoDataForPeriod
from billy.utils import urlescape
from billy.scrape.legislators import LegislatorScraper, Legislator
import lxml.html
class WVLegislatorScraper(LegislatorScraper):
    """Legislator scraper for West Virginia (``state = 'wv'``).

    Fetches the per-chamber district listing on www.legis.state.wv.us,
    follows every member link found there and saves one ``Legislator`` per
    member page.
    """

    state = 'wv'

    def scrape(self, chamber, term):
        """Scrape every member of ``chamber`` for ``term``.

        ``term`` is checked with ``validate_term(term, latest_only=True)``
        before anything is fetched.  ``'upper'`` maps to the ``sen_dist.cfm``
        listing and ``senmemview`` links; any other value maps to
        ``hse_dist.cfm`` and ``delmemview`` links.
        """
        self.validate_term(term, latest_only=True)

        # (listing-page prefix, member-link prefix) for the requested chamber.
        chamber_abbrev, title_abbrev = (
            ('sen', 'sen') if chamber == 'upper' else ('hse', 'del'))

        url = "http://www.legis.state.wv.us/districts/maps/%s_dist.cfm" % (
            chamber_abbrev)
        listing = lxml.html.fromstring(self.urlopen(url))
        listing.make_links_absolute(url)

        # Link texts that match the member-link pattern but are not people.
        not_members = ('Members', 'Senate Members', 'House Members',
                       'Vacancy', 'VACANT')

        view_url = '%smemview' % title_abbrev
        member_links = listing.xpath("//a[contains(@href, '%s')]" % view_url)
        for anchor in member_links:
            name = anchor.xpath("string()").strip()
            leg_url = urlescape(anchor.attrib['href'])
            if name not in not_members:
                self.scrape_legislator(chamber, term, name, leg_url)

    def scrape_legislator(self, chamber, term, name, url):
        """Scrape the member page at ``url`` and save the legislator.

        Every lookup below indexes straight into its XPath (or regex) result,
        so a page missing any of the expected elements raises instead of
        being skipped.
        """
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # District is the text of the first "dist=" link, minus leading zeros.
        district_anchor = doc.xpath("//a[contains(@href, 'dist=')]")[0]
        district = district_anchor.xpath('string()').strip().lstrip('0')

        # The party letter is read from the text that directly follows the
        # "memname" span, right after an opening parenthesis.
        name_span = doc.xpath("//span[contains(@class, 'memname')]")[0]
        after_name = name_span.tail.strip()
        party_code = re.match(r'\((R|D)', after_name).group(1)
        party = {'D': 'Democratic', 'R': 'Republican'}[party_code]

        portraits = doc.xpath("//img[contains(@src, 'images/members/')]")
        photo_url = portraits[0].attrib['src']

        # The second mailto link on the page is taken as the member's address.
        # NOTE(review): presumably the first one is a site-wide contact --
        # confirm against a live member page.
        mailto_links = doc.xpath("//a[contains(@href, 'mailto:')]")
        email = mailto_links[1].attrib['href'].split('mailto:')[1]

        leg = Legislator(term, chamber, district, name, party=party,
                         photo_url=photo_url, email=email, url=url)
        leg.add_source(url)

        for anchor in doc.xpath("//a[contains(@href, 'committee.cfm')]"):
            label = anchor.xpath("string()").strip()
            role = {'committee': label, 'chamber': chamber}
            # A label containing "Subcommittee" past its first character is
            # split there into committee / subcommittee; a label that starts
            # with the word (index 0) or lacks it (-1) is kept whole.
            cut = label.find('Subcommittee')
            if cut > 0:
                role['committee'] = label[:cut].strip()
                role['subcommittee'] = label[cut:].strip()
            leg.add_role('committee member', term, **role)

        self.save_legislator(leg)