Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Add a reStructuredText metadata access point.

  • Loading branch information...
commit 14965d5add40e2acbba8251e51984fe25d151d6e 1 parent ad6341c
@SimonSapin SimonSapin authored
View
86 python/multicorn/fsfdw/__init__.py
@@ -68,25 +68,67 @@ def execute(self, quals, columns):
structure.
"""
- cond = dict((qual.field_name, unicode(qual.value)) for
- qual in quals if qual.operator == '=')
- if self.filename_column in cond:
- item = self.structured_directory.from_filename(
- cond[self.filename_column])
- if item is not None and os.path.exists(item.full_filename):
- new_item = dict(item)
- if self.content_column:
- new_item[self.content_column] = item.read()
- if self.filename_column:
- new_item[self.filename_column] = item.filename
- yield new_item
- return
- else:
- cond.pop(self.content_column, None)
- for item in self.structured_directory.get_items(**cond):
- new_item = dict(item)
- if self.content_column and self.content_column in columns:
- new_item[self.content_column] = item.read()
- if self.filename_column and self.filename_column in columns:
- new_item[self.filename_column] = item.filename
- yield new_item
+ return self.items_to_dicts(self.get_items(quals, columns), columns)
+
+ def get_items(self, quals, columns):
+ filename_column = self.filename_column
+ for qual in quals:
+ if qual.field_name == filename_column and qual.operator == '=':
+ item = self.structured_directory.from_filename(
+ unicode(qual.value))
+ if item is not None and os.path.exists(item.full_filename):
+ return [item]
+ else:
+ return []
+ properties = self.structured_directory.properties
+ return self.structured_directory.get_items(**dict(
+ (qual.field_name, unicode(qual.value)) for qual in quals
+ if qual.operator == '=' and qual.field_name in properties))
+
+
+ def items_to_dicts(self, columns, items):
+ content_column = self.content_column
+ filename_column = self.filename_column
+ has_content = content_column and content_column in columns
+ has_filename = filename_column and filename_column in columns
+ for item in items:
+ new_item = dict(item)
+ if has_content:
+ new_item[content_column] = item.read()
+ if has_filename:
+ new_item[filename_column] = item.filename
+ yield new_item
+
+
class ReStructuredTextFdw(FilesystemFdw):
    """A filesystem with reStructuredText metadata foreign data wrapper.

    The foreign data wrapper accepts the same options as FilesystemFdw.
    Any column with a name in rest_* is set to the metadata value with the
    corresponding key. (Eg. rest_title is set to the title of the document.)

    """
    def __init__(self, options, columns):
        from multicorn.fsfdw.docutils_meta import mtime_lru_cache, extract_meta
        # TODO: make max_size configurable?
        self.extract_meta = mtime_lru_cache(extract_meta, max_size=1000)
        # The base class only knows about filesystem-backed columns:
        # strip the rest_* metadata columns before delegating.
        columns = dict((name, column) for name, column in columns.items()
                       if not name.startswith('rest_'))
        # BUG FIX: was super(RestructuredText, self) — a name that does
        # not exist, raising NameError on instantiation.
        super(ReStructuredTextFdw, self).__init__(options, columns)

    def execute(self, quals, columns):
        items = self.get_items(quals, columns)
        # BUG FIX: use name[5:] to strip the 'rest_' prefix and get the
        # metadata key; the original name[:5] always yielded the literal
        # string 'rest_' itself.
        keys = [(name, name[5:])  # len('rest_') == 5
                for name in columns if name.startswith('rest_')]
        if keys:
            items = self.add_meta(items, keys)
        return self.items_to_dicts(items, columns)

    def add_meta(self, items, keys):
        """Decorate each item dict with the requested rest_* metadata.

        Metadata keys missing from a document map to None (SQL NULL).
        """
        extract_meta = self.extract_meta
        for item in items:
            meta = extract_meta(item.full_filename)
            item = dict(item)
            for column, key in keys:
                item[column] = meta.get(key)
            yield item
View
71 python/multicorn/fsfdw/docutils_meta.py
@@ -0,0 +1,71 @@
+"""
+Use low-level docutils API to extract metadata from ReStructuredText files.
+"""
+
+from collections import OrderedDict # Python 2.7 or 3.1+
+from threading import Lock
+from functools import wraps
+from threading import Lock
+from os.path import getmtime
+
+from docutils.core import publish_doctree
+
+
def extract_meta(filename):
    """Read meta-data from a reStructuredText file and return a dict.

    The 'title' and 'subtitle' keys are special-cased, but other keys
    are read from the `docinfo` element.

    """
    with open(filename) as file_obj:
        document = publish_doctree(file_obj.read())
    meta = {}
    for element in document:
        tag = element.tagname
        if tag in ('title', 'subtitle'):
            meta[tag] = element.astext()
        elif tag == 'docinfo':
            for field in element:
                if field.tagname == 'field':
                    # A generic field: first child is the name, second
                    # the body.
                    name, body = field.children
                    meta[name.astext().lower()] = body.astext()
                else:
                    # A bibliographic field like `author` or `date`.
                    meta[field.tagname.lower()] = field.astext()
    return meta
+
+
def mtime_lru_cache(function, max_size=100):
    """File mtime-based least-recently-used cache.

    :param function:
        A function that takes a filename as its single parameter.
        The file should exist, and the function's return value should
        only depend on the contents of the file.

    Return a decorated function that caches at most ``max_size`` values.
    Least recently used values are dropped first. Cached values are
    invalidated when the file's modification time changes.

    Inspired from functools.lru_cache, which only exists in Python 3.2+.

    """
    lock = Lock()  # OrderedDict isn't threadsafe
    cache = OrderedDict()  # ordered least recent to most recent

    @wraps(function)
    def wrapper(filename):
        mtime = getmtime(filename)
        with lock:
            entry = cache.pop(filename, None)
            if entry is not None:
                cached_mtime, value = entry
                if cached_mtime == mtime:
                    # Fresh hit: re-insert at the most-recent end.
                    cache[filename] = entry
                    return value
                # Stale entry: fall through and recompute (already popped).
        # Compute outside the lock so slow calls don't serialize readers.
        value = function(filename)
        with lock:
            cache[filename] = (mtime, value)  # most-recent end
            while len(cache) > max_size:
                cache.popitem(last=False)  # evict least recently used
        return value
    return wrapper
View
58 python/multicorn/fsfdw/test.py
@@ -17,6 +17,7 @@
import pytest
from .structuredfs import StructuredDirectory, Item
+from .docutils_meta import mtime_lru_cache, extract_meta
def with_tempdir(function):
@@ -303,5 +304,62 @@ def assert_listed(properties, expected_ids, expected_listed):
['nonexistent'])
@with_tempdir
def test_docutils_meta(tempdir):
    # Wrap extract_meta so we can tell real parses from cache hits.
    def counted(filename):
        counted.n_calls += 1
        return extract_meta(filename)
    counted.n_calls = 0
    cached = mtime_lru_cache(counted, max_size=2)

    def extract(filename):
        return cached(os.path.join(tempdir, filename))

    def write(filename, content):
        with open(os.path.join(tempdir, filename), 'w') as file_obj:
            file_obj.write(content)

    rest_1 = '''
The main title
==============

Second title
------------

:Author: Me

Content
'''
    meta_1 = {'title': 'The main title', 'subtitle': u'Second title',
              'author': u'Me'}
    rest_2 = '''
First title
===========

:Author: Myself
:foo: bar

Not a subtitle
--------------

Content
'''
    meta_2 = {'title': 'First title', 'author': 'Myself', 'foo': 'bar'}

    write('first.rst', rest_1)
    write('second.rst', rest_2)
    assert counted.n_calls == 0
    assert extract('first.rst') == meta_1
    assert counted.n_calls == 1
    assert extract('first.rst') == meta_1  # cached
    assert counted.n_calls == 1
    assert extract('second.rst') == meta_2
    assert counted.n_calls == 2
    write('third.rst', rest_1)
    assert extract('third.rst') == meta_1  # Exceeds the cache size
    assert counted.n_calls == 3
    write('third.rst', rest_2)
    assert extract('third.rst') == meta_2
    assert counted.n_calls == 4
    assert extract('first.rst') == meta_1  # Not cached anymore
    assert counted.n_calls == 5
+
if __name__ == '__main__':
pytest.main([__file__] + sys.argv)
Please sign in to comment.
Something went wrong with that request. Please try again.