-
Notifications
You must be signed in to change notification settings - Fork 145
/
rssfdw.py
188 lines (152 loc) · 6.83 KB
/
rssfdw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"""
Purpose
-------
This fdw can be used to access items from an rss feed.
The column names are mapped to the elements inside an item.
An rss item has the following structure:
.. code-block:: xml
<item>
<title>Title</title>
<pubDate>2011-01-02</pubDate>
<link>http://example.com/test</link>
<guid>http://example.com/test</guid>
<description>Small description</description>
</item>
You can access every element by defining a column with the same name. Be
careful to match the case! Example: pubDate should be double-quoted like this:
``"pubDate"`` to preserve the uppercase ``D``.
.. api_compat::
:read:
Dependencies
------------
You will need the `lxml`_ library.
.. _lxml: http://lxml.de/
Required options
-----------------
``url`` (string)
The RSS feed URL.
Usage Example
-------------
.. _Radicale: http://radicale.org
If you want to parse the `radicale`_ rss feed, you can use the following
definition:
.. code-block:: sql
CREATE SERVER rss_srv foreign data wrapper multicorn options (
wrapper 'multicorn.rssfdw.RssFdw'
);
CREATE FOREIGN TABLE radicalerss (
"pubDate" timestamp,
description character varying,
title character varying,
link character varying
) server rss_srv options (
url 'http://radicale.org/rss/'
);
select "pubDate", title, link from radicalerss limit 10;
.. code-block:: bash
pubDate | title | link
---------------------+----------------------------------+----------------------------------------------
2011-09-27 06:07:42 | Radicale 0.6.2 | http://radicale.org/news#2011-09-27@06:07:42
2011-08-28 13:20:46 | Radicale 0.6.1, Changes, Future | http://radicale.org/news#2011-08-28@13:20:46
2011-08-01 08:54:43 | Radicale 0.6 Released | http://radicale.org/news#2011-08-01@08:54:43
2011-07-02 20:13:29 | Feature Freeze for 0.6 | http://radicale.org/news#2011-07-02@20:13:29
2011-05-01 17:24:33 | Ready for WSGI | http://radicale.org/news#2011-05-01@17:24:33
2011-04-30 10:21:12 | Apple iCal Support | http://radicale.org/news#2011-04-30@10:21:12
2011-04-25 22:10:59 | Two Features and One New Roadmap | http://radicale.org/news#2011-04-25@22:10:59
2011-04-10 20:04:33 | New Features | http://radicale.org/news#2011-04-10@20:04:33
2011-04-02 12:11:37 | Radicale 0.5 Released | http://radicale.org/news#2011-04-02@12:11:37
2011-02-03 23:35:55 | Jabber Room and iPhone Support | http://radicale.org/news#2011-02-03@23:35:55
(10 lignes)
"""
from . import ForeignDataWrapper
from datetime import datetime, timedelta
from lxml import etree
try:
from urllib.request import urlopen
except ImportError:
from urllib import urlopen
from logging import ERROR, WARNING
from multicorn.utils import log_to_postgres
import json
def element_to_dict(element):
    """
    Convert an lxml element into a plain, JSON-serializable dictionary.

    The returned dictionary has four keys:

    - ``tag``: the element's local name (any namespace is stripped),
    - ``text``: the element's leading text, or ``''`` when there is none,
    - ``attributes``: a copy of the element's attributes as a dict,
    - ``children``: the same structure, recursively, for every child.

    Tail text (text that follows a child element) is not included.

    >>> element_to_dict(etree.fromstring('<t a1="v1">Txt<s>Sub</s>Tail</t>')) == {
    ...     'tag': 't', 'text': 'Txt', 'attributes': {'a1': 'v1'},
    ...     'children': [{'tag': 's', 'text': 'Sub', 'attributes': {},
    ...                   'children': []}]}
    True
    """
    local_name = etree.QName(element.tag).localname
    leading_text = element.text or ''
    attributes = dict(element.attrib)

    # Iterating an lxml element yields its direct children, in order.
    children = []
    for child in element:
        children.append(element_to_dict(child))

    return {
        'tag': local_name,
        'text': leading_text,
        'attributes': attributes,
        'children': children,
    }
class RssFdw(ForeignDataWrapper):
    """An rss foreign data wrapper.

    The following options are accepted:

    url                      -- The rss feed url (required).
    cache_duration           -- Optional number of seconds during which the
                                items of the last successful fetch are
                                returned instead of fetching the feed again.
    default_namespace_prefix -- Optional prefix under which the feed's
                                default (unprefixed) xml namespace is made
                                available to the xpath expressions.
    item_root                -- Optional name of the element representing
                                one row; defaults to ``item``.

    The column names are used as xpath expressions evaluated on each item
    xml node. Example: a column named "pubDate" would return the pubDate
    element of an rss item.
    """

    def __init__(self, options, columns):
        """Read the table options and keep the column definitions.

        ``options`` is the mapping of table options, ``columns`` maps each
        column name to its definition (only ``type_name`` is used here).
        """
        super(RssFdw, self).__init__(options, columns)
        self.url = options.get('url', None)
        # (fetch time, items) of the last successful fetch.
        self.cache = (None, None)
        self.cache_duration = options.get('cache_duration', None)
        if self.cache_duration is not None:
            self.cache_duration = timedelta(seconds=int(self.cache_duration))
        if self.url is None:
            log_to_postgres("You MUST set an url when creating the table!",
                            ERROR)
        self.columns = columns
        # Read with ``get`` (like the other options) rather than ``pop`` so
        # the caller's options mapping is not modified.
        self.default_namespace_prefix = options.get(
            'default_namespace_prefix', None)
        self.item_root = options.get('item_root', 'item')

    def get_namespaces(self, xml):
        """Return the prefix -> uri mapping to use for xpath on ``xml``.

        The mapping is a copy of ``xml.nsmap``. The default namespace (stored
        under the ``None`` key) is re-registered under
        ``default_namespace_prefix`` when that option is set, and dropped
        otherwise: lxml refuses an empty prefix in xpath namespace mappings.
        """
        ns = dict(xml.nsmap)
        if None in ns:
            default_uri = ns.pop(None)
            if self.default_namespace_prefix:
                ns[self.default_namespace_prefix] = default_uri
        return ns

    def make_item_from_xml(self, xml_elem):
        """Internal method used for parsing item xml element from the
        columns definition.

        Each column name is evaluated as an xpath expression on
        ``xml_elem``. Columns whose expression yields nothing (or a falsy
        value) are left out of the returned dict. Depending on the column
        type, the value is a json string describing every match (``json*``
        types), a list with the text of every match (``*[]`` types) or the
        text of the first match (any other type).
        """
        item = {}
        # The namespace mapping only depends on the element: compute it once
        # instead of once per column.
        namespaces = self.get_namespaces(xml_elem)
        for prop, column in self.columns.items():
            value = xml_elem.xpath(prop, namespaces=namespaces)
            if not value:
                continue
            # xpath returns a list for node sets, but a bare string, float
            # or boolean for expressions such as ``string(title)``.
            if not isinstance(value, list):
                value = [value]
            if column.type_name.startswith('json'):
                item[prop] = json.dumps([
                    element_to_dict(val) for val in value])
            # There should be a better way
            # oid is 1009 ?
            elif column.type_name.endswith('[]'):
                # Attribute and text() matches are plain strings without a
                # ``text`` attribute: fall back to the match itself, as the
                # scalar branch below does.
                item[prop] = [getattr(elem, 'text', elem) for elem in value]
            else:
                item[prop] = getattr(value[0], 'text', value[0])
        return item

    def execute(self, quals, columns):
        """Fetch the feed and return one dict per item. Quals are ignored.

        When ``cache_duration`` is set and the last successful fetch is
        recent enough, its items are returned without fetching again.
        Returns ``None`` (after logging) when the feed cannot be retrieved
        or is not well-formed xml.
        """
        if self.cache_duration is not None:
            date, values = self.cache
            if (values is not None
                    and (datetime.now() - date) < self.cache_duration):
                return values
        try:
            response = urlopen(self.url)
            try:
                payload = response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()
            xml = etree.fromstring(payload)
            items = [self.make_item_from_xml(elem)
                     for elem in xml.xpath(
                         '//%s' % self.item_root,
                         namespaces=self.get_namespaces(xml))]
            self.cache = (datetime.now(), items)
            return items
        except etree.ParseError:
            log_to_postgres("Malformed xml, returning nothing")
        except IOError:
            log_to_postgres("Cannot retrieve '%s'" % self.url, WARNING)
        return None