Skip to content

Commit

Permalink
Add date scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
Vanuan committed Jun 23, 2014
1 parent 0b8994d commit 7c29105
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 11 deletions.
10 changes: 6 additions & 4 deletions scrapers/README.md
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -11,15 +11,17 @@ add2virtualenv .


``` ```
psql psql -U postgres -h localhost psql psql -U postgres -h localhost
# CREATE DATABASE pupa_ukraine; # CREATE DATABASE pupa_ukraine TEMPLATE=template_postgis;
# \q # \q
export DATABASE_URL=postgres://postgres@localhost/pupa_ukraine export DATABASE_URL=postgis://postgres@localhost/pupa_ukraine
pupa dbinit pupa dbinit ua
``` ```




### Scrape! ### Scrape!


``` ```
pupa update pupa update odessa_region
# use fastmode after first scraping (uses cache)
# pupa update --fastmode odessa_region
``` ```
2 changes: 1 addition & 1 deletion scrapers/odessa_region/__init__.py
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@


class OdessaOblRada(Jurisdiction): class OdessaOblRada(Jurisdiction):
classification = 'rada' classification = 'rada'
division_id = 'ocd-division/country:ua/region:od/' division_id = 'ocd-division/country:ua/region:od'
name = 'Одеська обласна рада' name = 'Одеська обласна рада'
url = 'http://oblrada.odessa.gov.ua/' url = 'http://oblrada.odessa.gov.ua/'
scrapers = { scrapers = {
Expand Down
26 changes: 21 additions & 5 deletions scrapers/odessa_region/people.py
Original file line number Original file line Diff line number Diff line change
@@ -1,6 +1,7 @@
#encoding=utf-8 #encoding=utf-8
from pupa.scrape import Scraper from pupa.scrape import Scraper
from pupa.scrape.helpers import Legislator, Organization from pupa.scrape.helpers import Organization
from pupa.scrape.popolo import Person
import logging import logging
import lxml.html import lxml.html


Expand All @@ -12,6 +13,14 @@
MEMBERLIST = 'http://oblrada.odessa.gov.ua/index.php?option=com_content&view=article&id=1054&Itemid=266&lang=uk' MEMBERLIST = 'http://oblrada.odessa.gov.ua/index.php?option=com_content&view=article&id=1054&Itemid=266&lang=uk'




# Ukrainian month names in the form used in this scraper's date strings
# (e.g. '12 січня 1970 року'), mapped to calendar month numbers.
_MONTH_NUMBERS = {
    'січня': 1,
    'лютого': 2,
    'березня': 3,
    'квітня': 4,
    'травня': 5,
    'червня': 6,
    'липня': 7,
    'серпня': 8,
    'вересня': 9,
    'жовтня': 10,
    'листопада': 11,
    'грудня': 12,
}


def parse_date(date_string):
    """Convert a Ukrainian textual date into an ISO ``YYYY-MM-DD`` string.

    The input is read as ``'<day> <month name> <year> [<anything>]'``, for
    example ``'12 січня 1970 року'``.  Only the first three whitespace
    separated tokens are used; any trailing words are ignored.

    Args:
        date_string: Text holding the day number, a month name from
            ``_MONTH_NUMBERS`` and the year number, in that order.

    Returns:
        The date formatted as ``'%04d-%02d-%02d'`` (year, month, day).

    Raises:
        ValueError: If fewer than three tokens are present, the month name
            is not recognised, or the day/year tokens are not integers.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, which scraped text frequently contains.
    parts = date_string.split()
    if len(parts) < 3:
        raise ValueError('Cannot parse date: %r' % (date_string,))
    day, month, year = parts[:3]

    month_number = _MONTH_NUMBERS.get(month.lower())
    if month_number is None:
        # Fail with a clear message instead of letting None reach the
        # '%02d' format below, which would raise an opaque TypeError.
        raise ValueError(
            'Unknown month name %r in date %r' % (month, date_string))

    return '%04d-%02d-%02d' % (int(year), month_number, int(day))


class OdessaOblRadaPersonScraper(Scraper): class OdessaOblRadaPersonScraper(Scraper):


def lxmlize(self, url): def lxmlize(self, url):
Expand All @@ -23,6 +32,7 @@ def lxmlize(self, url):
def scrape_person_details(self, url, name, post): def scrape_person_details(self, url, name, post):
page = self.lxmlize(url) page = self.lxmlize(url)
article = page.xpath("//div[@class='art-article']")[0] article = page.xpath("//div[@class='art-article']")[0]
birthdate = ''
if not article.text_content().strip() == u'Інформіція відсутня' and not article.text_content().strip() == name: if not article.text_content().strip() == u'Інформіція відсутня' and not article.text_content().strip() == name:
items = article.xpath("./p/span") items = article.xpath("./p/span")
img = article.xpath("./p//img") img = article.xpath("./p//img")
Expand All @@ -35,10 +45,13 @@ def scrape_person_details(self, url, name, post):
if not item.text_content().strip(): if not item.text_content().strip():
continue continue
text = item.text_content() text = item.text_content()
birthdate_string = 'Дата народження: '
if text.startswith(birthdate_string):
birthdate = parse_date(text.replace(birthdate_string, ''))
logger.info(text) logger.info(text)
else: else:
img_url = '' img_url = ''
p = Legislator(name=name, district=None, image=img_url) p = Person(name=name, image=img_url, birth_date=birthdate)
p.add_source(MEMBERLIST) p.add_source(MEMBERLIST)
p.add_source(url) p.add_source(url)
return p return p
Expand All @@ -57,13 +70,16 @@ def scrape(self):
if not name2: if not name2:
name1 = name.xpath("./p/span/span/span/a") name1 = name.xpath("./p/span/span/span/a")
name2 = name1[0].text_content() name2 = name1[0].text_content()
url = name.xpath("./p/span/span/span/a")[0].attrib["href"] a_link = name.xpath("./p/span/span/span/a")[0]
url = a_link.attrib["href"]
else: else:
url = name.xpath("./p/span/a")[0].attrib["href"] a_link = name.xpath("./p/span/a")[0]
url = a_link.attrib["href"]
name3 = name.xpath("./p/span")[0].text_content()
post = post.xpath("./p/span/span/a") post = post.xpath("./p/span/span/a")
if post: if post:
post = post[0].text_content() post = post[0].text_content()
else: else:
post = '' post = ''


yield self.scrape_person_details(url, name2, post) yield self.scrape_person_details(url, name3, post)
2 changes: 1 addition & 1 deletion scrapers/requirements.txt
Original file line number Original file line Diff line number Diff line change
@@ -1,3 +1,3 @@
-e git+git://github.com/django/django.git@1.7b4#egg=django -e git+https://github.com/django/django.git@stable/1.7.x#egg=django
-e git+https://github.com/opencivicdata/pupa.git@5c7295e484e721b9898980befcc45bf34115a46e#egg=pupa -e git+https://github.com/opencivicdata/pupa.git@5c7295e484e721b9898980befcc45bf34115a46e#egg=pupa
lxml lxml

0 comments on commit 7c29105

Please sign in to comment.