Permalink
Browse files

Add date scraping

  • Loading branch information...
Vanuan committed Jun 23, 2014
1 parent 0b8994d commit 7c2910541baf5cb8e0a69243c7a9889ace7de276
Showing with 29 additions and 11 deletions.
  1. +6 −4 scrapers/README.md
  2. +1 −1 scrapers/odessa_region/__init__.py
  3. +21 −5 scrapers/odessa_region/people.py
  4. +1 −1 scrapers/requirements.txt
View
@@ -11,15 +11,17 @@ add2virtualenv .
```
psql -U postgres -h localhost
-# CREATE DATABASE pupa_ukraine;
+# CREATE DATABASE pupa_ukraine TEMPLATE=template_postgis;
# \q
-export DATABASE_URL=postgres://postgres@localhost/pupa_ukraine
-pupa dbinit
+export DATABASE_URL=postgis://postgres@localhost/pupa_ukraine
+pupa dbinit ua
```
### Scrape!
```
-pupa update
+pupa update odessa_region
+# use fastmode after first scraping (uses cache)
+# pupa update --fastmode odessa_region
```
@@ -6,7 +6,7 @@
class OdessaOblRada(Jurisdiction):
classification = 'rada'
- division_id = 'ocd-division/country:ua/region:od/'
+ division_id = 'ocd-division/country:ua/region:od'
name = 'Одеська обласна рада'
url = 'http://oblrada.odessa.gov.ua/'
scrapers = {
@@ -1,6 +1,7 @@
#encoding=utf-8
from pupa.scrape import Scraper
-from pupa.scrape.helpers import Legislator, Organization
+from pupa.scrape.helpers import Organization
+from pupa.scrape.popolo import Person
import logging
import lxml.html
@@ -12,6 +13,14 @@
MEMBERLIST = 'http://oblrada.odessa.gov.ua/index.php?option=com_content&view=article&id=1054&Itemid=266&lang=uk'
+def parse_date(date_string):
+ day, month, year, _ = date_string.split(' ')
+ month_number = {'січня': 1, 'лютого': 2, 'березня': 3, 'квітня': 4, 'травня': 5, 'червня': 6, 'липня': 7, 'серпня': 8, 'вересня': 9, 'жовтня': 10, 'листопада': 11, 'грудня': 12}.get(month)
+ year = int(year)
+ day = int(day)
+ return '%04d-%02d-%02d' % (year, month_number, day)
+
+
class OdessaOblRadaPersonScraper(Scraper):
def lxmlize(self, url):
@@ -23,6 +32,7 @@ def lxmlize(self, url):
def scrape_person_details(self, url, name, post):
page = self.lxmlize(url)
article = page.xpath("//div[@class='art-article']")[0]
+ birthdate = ''
if not article.text_content().strip() == u'Інформіція відсутня' and not article.text_content().strip() == name:
items = article.xpath("./p/span")
img = article.xpath("./p//img")
@@ -35,10 +45,13 @@ def scrape_person_details(self, url, name, post):
if not item.text_content().strip():
continue
text = item.text_content()
+ birthdate_string = 'Дата народження: '
+ if text.startswith(birthdate_string):
+ birthdate = parse_date(text.replace(birthdate_string, ''))
logger.info(text)
else:
img_url = ''
- p = Legislator(name=name, district=None, image=img_url)
+ p = Person(name=name, image=img_url, birth_date=birthdate)
p.add_source(MEMBERLIST)
p.add_source(url)
return p
@@ -57,13 +70,16 @@ def scrape(self):
if not name2:
name1 = name.xpath("./p/span/span/span/a")
name2 = name1[0].text_content()
- url = name.xpath("./p/span/span/span/a")[0].attrib["href"]
+ a_link = name.xpath("./p/span/span/span/a")[0]
+ url = a_link.attrib["href"]
else:
- url = name.xpath("./p/span/a")[0].attrib["href"]
+ a_link = name.xpath("./p/span/a")[0]
+ url = a_link.attrib["href"]
+ name3 = name.xpath("./p/span")[0].text_content()
post = post.xpath("./p/span/span/a")
if post:
post = post[0].text_content()
else:
post = ''
- yield self.scrape_person_details(url, name2, post)
+ yield self.scrape_person_details(url, name3, post)
@@ -1,3 +1,3 @@
--e git+git://github.com/django/django.git@1.7b4#egg=django
+-e git+https://github.com/django/django.git@stable/1.7.x#egg=django
-e git+https://github.com/opencivicdata/pupa.git@5c7295e484e721b9898980befcc45bf34115a46e#egg=pupa
lxml

0 comments on commit 7c29105

Please sign in to comment.