Fetching contributors…
Cannot retrieve contributors at this time
189 lines (152 sloc) 6.83 KB
This fdw can be used to access items from an rss feed.
The column names are mapped to the elements inside an item.
An rss item has the following strcture:
.. code-block:: xml
<description>Small description</description>
You can access every element by defining a column with the same name. Be
careful to match the case! Example: pubDate should be quoted like this:
``pubDate`` to preserve the uppercased ``D``.
.. api_compat::
You will need the `lxml`_ library.
.. _lxml:
Required options
``url`` (string)
The RSS feed URL.
Usage Example
.. _Radicale:
If you want to parse the `radicale`_ rss feed, you can use the following
.. code-block:: sql
CREATE SERVER rss_srv foreign data wrapper multicorn options (
wrapper 'multicorn.rssfdw.RssFdw'
"pubDate" timestamp,
description character varying,
title character varying,
link character varying
) server rss_srv options (
url ''
select "pubDate", title, link from radicalerss limit 10;
.. code-block:: bash
pubDate | title | link
2011-09-27 06:07:42 | Radicale 0.6.2 |
2011-08-28 13:20:46 | Radicale 0.6.1, Changes, Future |
2011-08-01 08:54:43 | Radicale 0.6 Released |
2011-07-02 20:13:29 | Feature Freeze for 0.6 |
2011-05-01 17:24:33 | Ready for WSGI |
2011-04-30 10:21:12 | Apple iCal Support |
2011-04-25 22:10:59 | Two Features and One New Roadmap |
2011-04-10 20:04:33 | New Features |
2011-04-02 12:11:37 | Radicale 0.5 Released |
2011-02-03 23:35:55 | Jabber Room and iPhone Support |
(10 lignes)
from . import ForeignDataWrapper
from datetime import datetime, timedelta
from lxml import etree
from urllib.request import urlopen
except ImportError:
from urllib import urlopen
from logging import ERROR, WARNING
from multicorn.utils import log_to_postgres
import json
def element_to_dict(element):
This method takes a lxml element and return a json string containing
the element attributes and a text key and a child node.
>>> test = lambda x: sorted([(k, sorted(v.items())) if isinstance(v, dict) else (k, [sorted(e.items()) for e in v]) if isinstance(v, list) else (k, v) for k, v in element_to_dict(etree.fromstring(x)).items()])
>>> test('<t a1="v1"/>')
[('attributes', {'a1': 'v1'}), ('children', []), ('tag', 't'), ('text', '')]
>>> test('<t a1="v1">Txt</t>')
[('attributes', {'a1': 'v1'}), ('children', []), ('tag', 't'), ('text', 'Txt')]
>>> test('<t>Txt<s1 a1="v1">Sub1</s1>Txt2<s2 a2="v2">Sub2</s2>Txt3</t>')
[('attributes', {}), ('children', [[('attributes', {'a1': 'v1'}), ('children', []), ('tag', 's1'), ('text', 'Sub1')], [('attributes', {'a2': 'v2'}), ('children', []), ('tag', 's2'), ('text', 'Sub2')]]), ('tag', 't'), ('text', 'Txt')]
return {
'tag': etree.QName(element.tag).localname,
'text': element.text or '',
'attributes': dict(element.attrib),
'children': [element_to_dict(e) for e in element]
class RssFdw(ForeignDataWrapper):
"""An rss foreign data wrapper.
The following options are accepted:
url -- The rss feed urls.
The columns named are parsed, and are used as xpath expression on
each item xml node. Exemple: a column named "pubDate" would return the
pubDate element of an rss item.
def __init__(self, options, columns):
super(RssFdw, self).__init__(options, columns)
self.url = options.get('url', None)
self.cache = (None, None)
self.cache_duration = options.get('cache_duration', None)
if self.cache_duration is not None:
self.cache_duration = timedelta(seconds=int(self.cache_duration))
if self.url is None:
log_to_postgres("You MUST set an url when creating the table!",
self.columns = columns
self.default_namespace_prefix = options.pop(
'default_namespace_prefix', None)
self.item_root = options.pop('item_root', 'item')
def get_namespaces(self, xml):
ns = dict(xml.nsmap)
if None in ns:
ns[self.default_namespace_prefix] = ns.pop(None)
return ns
def make_item_from_xml(self, xml_elem):
"""Internal method used for parsing item xml element from the
columns definition."""
item = {}
for prop, column in self.columns.items():
value = xml_elem.xpath(
prop, namespaces=self.get_namespaces(xml_elem))
if value:
if column.type_name.startswith('json'):
item[prop] = json.dumps([
element_to_dict(val) for val in value])
# There should be a better way
# oid is 1009 ?
elif column.type_name.endswith('[]'):
item[prop] = [elem.text for elem in value]
item[prop] = getattr(value[0], 'text', value[0])
return item
def execute(self, quals, columns):
"""Quals are ignored."""
if self.cache_duration is not None:
date, values = self.cache
if values is not None:
if ( - date) < self.cache_duration:
return values
xml = etree.fromstring(urlopen(self.url).read())
items = [self.make_item_from_xml(elem)
for elem in xml.xpath(
'//%s' % self.item_root,
self.cache = (, items)
return items
except etree.ParseError:
log_to_postgres("Malformed xml, returning nothing")
except IOError:
log_to_postgres("Cannot retrieve '%s'" % self.url, WARNING)