Permalink
Fetching contributors…
Cannot retrieve contributors at this time
189 lines (152 sloc) 6.83 KB
"""
Purpose
-------
This fdw can be used to access items from an rss feed.
The column names are mapped to the elements inside an item.
An rss item has the following structure:
.. code-block:: xml
<item>
<title>Title</title>
<pubDate>2011-01-02</pubDate>
<link>http://example.com/test</link>
<guid>http://example.com/test</guid>
<description>Small description</description>
</item>
You can access every element by defining a column with the same name. Be
careful to match the case! Example: pubDate should be double-quoted like this:
``"pubDate"`` to preserve the uppercased ``D``.
.. api_compat::
:read:
Dependencies
------------
You will need the `lxml`_ library.
.. _lxml: http://lxml.de/
Required options
-----------------
``url`` (string)
The RSS feed URL.
Usage Example
-------------
.. _Radicale: http://radicale.org
If you want to parse the `radicale`_ rss feed, you can use the following
definition:
.. code-block:: sql
CREATE SERVER rss_srv foreign data wrapper multicorn options (
wrapper 'multicorn.rssfdw.RssFdw'
);
CREATE FOREIGN TABLE radicalerss (
"pubDate" timestamp,
description character varying,
title character varying,
link character varying
) server rss_srv options (
url 'http://radicale.org/rss/'
);
select "pubDate", title, link from radicalerss limit 10;
.. code-block:: bash
pubDate | title | link
---------------------+----------------------------------+----------------------------------------------
2011-09-27 06:07:42 | Radicale 0.6.2 | http://radicale.org/news#2011-09-27@06:07:42
2011-08-28 13:20:46 | Radicale 0.6.1, Changes, Future | http://radicale.org/news#2011-08-28@13:20:46
2011-08-01 08:54:43 | Radicale 0.6 Released | http://radicale.org/news#2011-08-01@08:54:43
2011-07-02 20:13:29 | Feature Freeze for 0.6 | http://radicale.org/news#2011-07-02@20:13:29
2011-05-01 17:24:33 | Ready for WSGI | http://radicale.org/news#2011-05-01@17:24:33
2011-04-30 10:21:12 | Apple iCal Support | http://radicale.org/news#2011-04-30@10:21:12
2011-04-25 22:10:59 | Two Features and One New Roadmap | http://radicale.org/news#2011-04-25@22:10:59
2011-04-10 20:04:33 | New Features | http://radicale.org/news#2011-04-10@20:04:33
2011-04-02 12:11:37 | Radicale 0.5 Released | http://radicale.org/news#2011-04-02@12:11:37
2011-02-03 23:35:55 | Jabber Room and iPhone Support | http://radicale.org/news#2011-02-03@23:35:55
(10 rows)
"""
from . import ForeignDataWrapper
from datetime import datetime, timedelta
from lxml import etree
try:
from urllib.request import urlopen
except ImportError:
from urllib import urlopen
from logging import ERROR, WARNING
from multicorn.utils import log_to_postgres
import json
def element_to_dict(element):
    """Convert an lxml element into a plain, JSON-serialisable dict.

    The returned dict has exactly four keys:

    ``tag``
        The local name of the element (any namespace part is stripped).
    ``text``
        The text found before the first child, or ``''`` when there is none.
        Tail text following child elements is not included.
    ``attributes``
        A dict copy of the element attributes.
    ``children``
        A list with the same structure for every child *element*, in
        document order. Comments and processing instructions are skipped:
        their ``tag`` is not a name, so it cannot be turned into a QName.

    >>> element_to_dict(etree.fromstring('<t a1="v1"/>')) == {
    ...     'tag': 't', 'text': '', 'attributes': {'a1': 'v1'},
    ...     'children': []}
    True
    >>> element_to_dict(etree.fromstring('<t a1="v1">Txt</t>')) == {
    ...     'tag': 't', 'text': 'Txt', 'attributes': {'a1': 'v1'},
    ...     'children': []}
    True
    >>> element_to_dict(etree.fromstring(
    ...     '<t>Txt<s1 a1="v1">Sub1</s1>Txt2<!-- note --></t>')) == {
    ...     'tag': 't', 'text': 'Txt', 'attributes': {},
    ...     'children': [{'tag': 's1', 'text': 'Sub1',
    ...                   'attributes': {'a1': 'v1'}, 'children': []}]}
    True
    """
    return {
        'tag': etree.QName(element.tag).localname,
        'text': element.text or '',
        'attributes': dict(element.attrib),
        # Restrict the iteration to real elements; plain iteration would
        # also yield comments and processing instructions.
        'children': [element_to_dict(child)
                     for child in element.iterchildren(tag=etree.Element)],
    }
class RssFdw(ForeignDataWrapper):
    """An rss foreign data wrapper.

    The following options are accepted:

    url                      -- The rss feed url (required).
    cache_duration           -- Optional number of seconds during which the
                                rows of the last successful fetch are reused
                                instead of fetching the feed again.
    default_namespace_prefix -- Optional prefix under which the default
                                namespace of the document is made available
                                to the xpath expressions.
    item_root                -- Name of the elements yielding one row each
                                (defaults to ``item``).

    The column names are used as xpath expressions on each item xml node.
    Example: a column named "pubDate" would return the pubDate element of
    an rss item.
    """

    def __init__(self, options, columns):
        """Read the table options and remember the column definitions.

        ``options`` is the mapping of table options, ``columns`` maps each
        column name to its definition. An error is reported through
        ``log_to_postgres`` when the ``url`` option is missing.
        """
        super(RssFdw, self).__init__(options, columns)
        self.url = options.get('url', None)
        # (fetch time, rows) of the last successful fetch.
        self.cache = (None, None)
        self.cache_duration = options.get('cache_duration', None)
        if self.cache_duration is not None:
            self.cache_duration = timedelta(seconds=int(self.cache_duration))
        if self.url is None:
            log_to_postgres("You MUST set an url when creating the table!",
                            ERROR)
        self.columns = columns
        # Options are only read, never removed from the caller's mapping.
        self.default_namespace_prefix = options.get(
            'default_namespace_prefix', None)
        self.item_root = options.get('item_root', 'item')

    def get_namespaces(self, xml):
        """Return the prefix -> uri mapping usable for xpath on ``xml``.

        XPath has no notion of a default namespace, so the ``None`` entry of
        the element's ``nsmap`` is re-registered under
        ``default_namespace_prefix`` when that option is set, and dropped
        otherwise (an empty prefix is rejected by the xpath engine).
        """
        ns = dict(xml.nsmap)
        if None in ns:
            default_uri = ns.pop(None)
            if self.default_namespace_prefix:
                ns[self.default_namespace_prefix] = default_uri
        return ns

    @staticmethod
    def _text_of(node):
        """Return the text of an element, or a non-element result as is."""
        return getattr(node, 'text', node)

    def make_item_from_xml(self, xml_elem):
        """Build one row (a dict keyed by column name) from an item element.

        Each column name is evaluated as an xpath expression relative to
        ``xml_elem``. Columns whose expression yields an empty or falsy
        result are left out of the row. Depending on the column type name:

        - ``json*`` columns receive a JSON array describing every match,
        - ``*[]`` columns receive the list of the matches' texts,
        - any other column receives the text of the first match.
        """
        item = {}
        # The namespace map only depends on the element: compute it once.
        namespaces = self.get_namespaces(xml_elem)
        for prop, column in self.columns.items():
            value = xml_elem.xpath(prop, namespaces=namespaces)
            if not value:
                continue
            if not isinstance(value, list):
                # Expressions such as string(...) or count(...) evaluate to
                # a scalar instead of a list of matches.
                value = [value]
            if column.type_name.startswith('json'):
                item[prop] = json.dumps([
                    element_to_dict(val) if etree.iselement(val) else val
                    for val in value])
            # Array columns are detected from the type name; there may be a
            # better way (matching on the type oid?).
            elif column.type_name.endswith('[]'):
                item[prop] = [self._text_of(elem) for elem in value]
            else:
                item[prop] = self._text_of(value[0])
        return item

    def execute(self, quals, columns):
        """Fetch the feed and return one dict per item element.

        Quals are ignored. When ``cache_duration`` is set and the previous
        fetch is recent enough, its rows are returned without any network
        access. If the feed cannot be retrieved or parsed, the problem is
        logged and an empty list is returned.
        """
        if self.cache_duration is not None:
            date, values = self.cache
            if values is not None:
                if (datetime.now() - date) < self.cache_duration:
                    return values
        try:
            response = urlopen(self.url)
            try:
                payload = response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()
            xml = etree.fromstring(payload)
            items = [self.make_item_from_xml(elem)
                     for elem in xml.xpath(
                         '//%s' % self.item_root,
                         namespaces=self.get_namespaces(xml))]
            self.cache = (datetime.now(), items)
            return items
        except etree.ParseError:
            log_to_postgres("Malformed xml, returning nothing")
        except IOError:
            log_to_postgres("Cannot retrieve '%s'" % self.url, WARNING)
        # The result is iterated by the caller: never hand back None.
        return []