Removed timeout, added ability to extract text from file-like objects…

…, changed zipfile handling, lots of docs.
JordanReiter · Feb 6, 2012 · fdce48b · fdce48b
1 parent 385eab1
commit fdce48b
Show file tree

Hide file tree

Showing 5 changed files with 270 additions and 107 deletions.
diff --git a/README.md b/README.md
@@ -88,19 +88,4 @@ the file contents.
 Future
 ----
 
-A feature to expect in the near future is support for file-like objects:
-
-```python
-> import fulltext
-> f = file('existing.pdf', 'r')
-> fulltext.get(f, '< no content >')
-'Lorem ipsum...'
-```
-
-Also, currently, plain text files are not detected or handled. The library should detect
-a plain text file and perform only post-processing on it.
-
-One last change I plan is to use the mimetypes library for detection rather than using
-the file extension directly. Commands and post-processing will then be mapped to mime types.
-Then we should also allow the library user to add their own commands for support of other
-file types (much like the mimetypes library does with the add_type function).
+Sometimes more than one tool can handle the same file type. For example, the catdoc package provides xls2csv, while the xls2csv package provides convertxls2csv. We should use whichever is installed.
diff --git a/files/test.txt b/files/test.txt
@@ -0,0 +1,8 @@
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ipsum augue, iaculis quis auctor eu, adipiscing non est.
+Nullam id sem diam, eget varius dui. Etiam sollicitudin sapien nec odio elementum sit amet luctus magna volutpat. Ut
+commodo nulla neque. Aliquam erat volutpat. Integer et nunc augue. Pellentesque habitant morbi tristique senectus et
+netus et malesuada fames ac turpis egestas. Quisque at enim nulla, vel tincidunt urna. Nam leo augue, elementum ut
+viverra eget, scelerisque in purus. In arcu orci, porta nec aliquet quis, pretium a sem. In fermentum nisl id diam
+luctus viverra. Nullam semper, metus at euismod vulputate, orci odio dignissim urna, quis iaculis neque lacus ut
+tortor. Ut a justo non dolor venenatis accumsan. Proin dolor eros, aliquam id condimentum et, aliquam quis metus.
+Vivamus eget purus diam.
diff --git a/files/test.zip b/files/test.zip
diff --git a/fulltext/__init__.py b/fulltext/__init__.py
@@ -1,63 +1,14 @@
-import os, os.path, subprocess, re, csv, time, select, mimetypes
-# TODO: support file-like objects (pipe file data to stdin).
+import os, os.path, subprocess, re, csv, time, select, mimetypes, tempfile
+# TODO: Sometimes multiple tools can be used, choose the one that is installed.
 
 mimetypes.add_type('application/rar', '.rar')
 
-PROG_MAP = {
-    (None, None): ('cat', '{0}'),
-    ('application/pdf', None): ('pdftotext', '-q', '-nopgbrk', '{0}', '-'),
-    ('application/msword', None): ('antiword', '{0}'),
-    ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', None): ('docx2txt', '{0}', '-'),  # http://sourceforge.net/projects/docx2txt/
-    ('application/vnd.ms-excel', None): ('convertxls2csv', '-q', '-x {0}', '-c -'),
-    ('application/rtf', None): ('unrtf', '--text', '--nopict', '{0}'),
-    ('application/vnd.oasis.opendocument.text', None): ('odt2txt', '{0}'),
-    ('application/vnd.oasis.opendocument.spreadsheet', None): ('odt2txt', '{0}'),
-    ('application/zip', None): ('zipinfo', '-2z', '{0}'),
-    ('application/x-tar', 'gzip'): ('tar', 'tzf', '{0}'),
-    ('application/x-tar', 'gzip2'): ('tar', 'tjf', '{0}'),
-    ('application/rar', None): ('unrar', 'vb', '-p-', '-ierr', '-y', '{0}'),
-    ('text/html', None): ('html2text', '-nobs', '{0}'),
-    ('text/xml', None): ('html2text', '-nobs', '{0}'),
-    ('image/jpeg', None): ('exiftool', '-s', '-s', '-s', '{0}'),
-    ('video/mpeg', None): ('exiftool', '-s', '-s', '-s', '{0}'),
-    ('audio/mpeg', None): ('exiftool', '-s', '-s', '-s', '{0}'),
-    ('application/octet-stream', None): ('strings', '-n 18', '{0}'),
-}
-
 STRIP_WHITE = re.compile(r'[ \t\v\f\r\n]+')
 UNRTF = re.compile(r'.*-+\n', flags=re.MULTILINE)
 
-PROC_TIMEOUT     = 5
-"How long to wait for command exectuion."
-
-def strip_unrtf_header(text):
-    """
-    Can't find a way to turn off the stupid header in unrtf.
-    """
-    return text.split('-----------------')[1]
-
-def csv_to_text(text):
-    """
-    Can convert xls to csv, but this will go from csv to plain old
-    text.
-    """
-    buffer = []
-    for row in csv.reader(text.splitlines(), dialect="excel"):
-        buffer.append(' '.join(row))
-    return ' '.join(buffer)
-
-FUNC_MAP = {
-    ('application/rtf', None): strip_unrtf_header,
-    ('application/vnd.ms-excel', None): csv_to_text,
-}
-
-class FullTextException(Exception):
-    pass
-
-
-# From:
 # http://stackoverflow.com/questions/377017/test-if-executable-exists-in-python
 def which(program):
+    "Simply checks if a given program exists within PATH and is executable."
     def is_exe(fpath):
         return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
     fpath, fname = os.path.split(program)
@@ -71,61 +22,248 @@ def is_exe(fpath):
                 return exe_file
     return None
 
+def read_content(f, type):
+    "A handler that simply reads a file's contents. Used on unrecognized types."
+    if isinstance(f, basestring):
+        f = file(f, 'r')
+    return f.read()
+
+def run_command(f, type):
+    "The default handler. It runs a command and reads its output."
+    cmds = PROG_MAP[type]
+    if isinstance(f, basestring):
+        cmd = cmds[0]
+        cmd = map(lambda x: x.format(f), cmd)
+        i = None
+    else:
+        assert hasattr(f, 'read'), 'File-like object must have read() method.'
+        cmd = cmds[1]
+        if cmd is None:
+            # Use temp file:
+            fd, fname = tempfile.mkstemp()
+            os.write(fd, f.read())
+            os.close(fd)
+            return run_command(fname, type)
+        i = f.read()
+    # We use the regular subprocess module here. communicate() does not allow a timeout.
+    # If there are problems with timeouts, I will investigate other options, like:
+    # http://pypi.python.org/pypi/EasyProcess
+    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    return p.communicate(i)[0]
+
+def strip_unrtf_header(f, type):
+    "Can't find a way to turn off the stupid header in unrtf."
+    text = run_command(f, type)
+    parts = text.split('-----------------')
+    return '-----------------'.join(parts[1:])
+
+def csv_to_text(f, type):
+    "Can convert xls to csv, but this will go from csv to plain old text."
+    text = run_command(f, type)
+    buffer = []
+    for row in csv.reader(text.splitlines(), dialect="excel"):
+        buffer.append(' '.join(row))
+    return ' '.join(buffer)
+
+PROG_MAP = {
+    ('application/pdf', None): (
+        ('pdftotext', '-q', '-nopgbrk', '{0}', '-'),
+        ('pdftotext', '-q', '-nopgbrk', '-', '-'),
+    ),
+    ('application/msword', None): (
+        ('antiword', '{0}'),
+        ('antiword', '-'),
+    ),
+    ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', None): (
+        ('docx2txt', '{0}', '-'),  # http://sourceforge.net/projects/docx2txt/
+        ('docx2txt', '{0}', '-'),  # http://sourceforge.net/projects/docx2txt/
+    ),
+    ('application/vnd.ms-excel', None): (
+        ('xls2csv', '{0}'),  # as provided by catdoc
+        None,  # Supposedly this works, but I get segmentation fault.
+    ),
+    ('application/rtf', None): (
+        ('unrtf', '--text', '--nopict', '{0}'),
+        ('unrtf', '--text', '--nopict'),
+    ),
+    ('application/vnd.oasis.opendocument.text', None): (
+        ('odt2txt', '{0}'),
+        None,
+    ),
+    ('application/vnd.oasis.opendocument.spreadsheet', None): (
+        ('odt2txt', '{0}'),
+        None,
+    ),
+    ('application/zip', None): (
+        ('funzip', '{0}', ),
+        ('funzip'),
+    ),
+    ('application/x-tar', 'gzip'): (
+        ('tar', 'tzf', '{0}'),
+        ('tar', 'tz'),
+    ),
+    ('application/x-tar', 'gzip2'): (
+        ('tar', 'tjf', '{0}'),
+        ('tar', 'tj'),
+    ),
+    ('application/rar', None): (
+        ('unrar', 'vb', '-p-', '-ierr', '-y', '{0}'),
+        None,
+    ),
+    ('text/html', None): (
+        ('html2text', '-nobs', '{0}'),
+        ('html2text', '-nobs'),
+    ),
+    ('text/xml', None): (
+        ('html2text', '-nobs', '{0}'),
+        ('html2text', '-nobs'),
+    ),
+    ('image/jpeg', None): (
+        ('exiftool', '-s', '-s', '-s', '{0}'),
+        ('exiftool', '-s', '-s', '-s', '-'),
+    ),
+    ('video/mpeg', None): (
+        ('exiftool', '-s', '-s', '-s', '{0}'),
+        ('exiftool', '-s', '-s', '-s', '-'),
+    ),
+    ('audio/mpeg', None): (
+        ('exiftool', '-s', '-s', '-s', '{0}'),
+        ('exiftool', '-s', '-s', '-s', '-'),
+    ),
+    ('application/octet-stream', None): (
+        ('strings', '-n 18', '{0}'),
+        ('strings', '-n 18'),
+    ),
+}
+"The command registry. Use add_commands to override this."
+
+FUNC_MAP = {
+    ('application/pdf', None): run_command,
+    ('application/msword', None): run_command,
+    ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', None): run_command,  # http://sourceforge.net/projects/docx2txt/
+    ('application/vnd.ms-excel', None): csv_to_text,
+    ('application/rtf', None): strip_unrtf_header,
+    ('application/vnd.oasis.opendocument.text', None): run_command,
+    ('application/vnd.oasis.opendocument.spreadsheet', None): run_command,
+    ('application/zip', None): run_command,
+    ('application/x-tar', 'gzip'): run_command,
+    ('application/x-tar', 'gzip2'): run_command,
+    ('application/rar', None): run_command,
+    ('text/html', None): run_command,
+    ('text/xml', None): run_command,
+    ('image/jpeg', None): run_command,
+    ('video/mpeg', None): run_command,
+    ('audio/mpeg', None): run_command,
+    ('application/octet-stream', None): run_command,
+}
+"The handler registry. Use add_handler to override this."
+
+def add_commands(mime, commands, enc=None):
+    """
+    Adds a set of commands to the command registry. These commands are used to extract
+    text from various file types. Each command set consists of two commands. The first
+    command is used to extract text from a file on disk. The second command is used to
+    extract text from a file-like object. If no suitable command can accept input via stdin
+    then the second command can be None. In this case, the file contents are written to
+    a temporary file, then the first command is used on that.
+
+    Each command is a tuple, which should represent the program name, and any arguments
+    needed to cause the program to print plain text to stdout. You need to put `{0}` at
+    the location where the input file should reside. str.format() is used to merge in
+    the file name being processed when the command is executed.
+
+    Although you would never want to do this, here is how you would register the `cat`
+    command for use with plain text files.
+
+    >>> import fulltext
+    >>> fulltext.add_commands('text/plain', (('cat', '{0}'), None))
+
+    The above is not yet complete, as you still need to register a handler using add_handler().
+    """
+    assert isinstance(commands, tuple), 'Commands must be tuple (Command for disk file, Command for use with piping).'
+    assert len(commands) == 2, 'Commands must contain two commands.'
+    assert isinstance(commands[0], tuple), 'Each command must be a tuple.'
+    assert isinstance(commands[1], tuple), 'Each command must be a tuple.'
+    assert '{0}' in command[0], 'Disk file command must contain {0}.'
+    PROG_MAP[(mime, enc)] = commands
+
+def add_handler(mime, handler, enc=None):
+    """
+    Adds a function to handle files of a specific type. Most file types use the built-in
+    run_command handler. This handler executes a command and reads the output in order
+    to convert the file to text. If you use this handler for your file type, then you
+    must also use add_commands to register a command to handle this type.
+
+    Here is how you could register a handler for plain text files.
+
+    >>> import fulltext
+    >>> fulltext.add_handler('text/plain', fulltext.run_command)
 
-def add_type(mime, command, encoding=None):
-    assert isinstance(command, tuple), 'Command must be tuple.'
-    assert '{0}' in command, 'Command must contain {0}.'
-    PROG_MAP[(mime, encoding)] = command
+    As mentioned, this is a contrived example. You would be better served by the following.
 
+    >>> import fulltext
+    >>> fulltext.add_handler('text/plain', fulltext.read_content)
+
+    But, since the default action is to read the content of an unrecognized file type,
+    registering text/plain is redundant.
+    """
+    assert callable(handler), 'Handler must be callable.'
+    FUNC_MAP[(mime, enc)] = handler
 
 def get_type(filename):
+    """
+    Gets the mimetype and encoding using the mimetypes module. Defined as a standalone
+    function for future expansion.
+    """
     return mimetypes.guess_type(filename)
 
 
-def get(filename, default=None, type=None):
-    # TODO: allow an open file to be passed in, most of the tools
-    # can accept data from stdin.
-    if not os.path.exists(filename):
-        if default is not None:
-            return default
-        raise FullTextException('File not found')
+class FullTextException(Exception):
+    pass
+
+
+def get(f, default=None, filename=None, type=None):
+    """
+    Gets text from a given file. The first parameter can be a path or a file-like object that
+    has a read method. Default is a way to suppress errors and just return the default text.
+    Filename can help figure out the type of file being used if you passed in a file-like
+    object. Type can be used to override the type guessing. Type should be a tuple: (mimetype, encoding)
+    as returned by the mimetypes library.
+
+    For any recognizable type, a command will be executed to extract the text. Then that text
+    will be post processed to remove redundant whitespace.
+
+    For zip files, the first member is extracted and post processed.
+
+    Any file whose type cannot be determined will simply be read then post processed.
+    """
+    if not isinstance(f, basestring) and filename is None:
+        # Try to help figure out the file type.
+        filename = getattr(f, 'name', None)
+    else:
+        # If user provided filename, don't override it.
+        if filename is None:
+            filename = f
+        if not os.path.exists(filename):
+            if default is not None:
+                return default
+            raise FullTextException('File not found')
     if type is None:
         type = get_type(filename)
+    handler = FUNC_MAP.get(type, read_content)
     try:
-        cmd = PROG_MAP[type]
-    except KeyError:
-        if default is not None:
-            return default
-        raise FullTextException('Unknown file type, known file types are: {0}'.format(' '.join(PROG_MAP.keys())))
-    cmd = map(lambda x: x.format(filename), cmd)
-    if which(cmd[0]) is None:
-        raise FullTextException('Cannot execute binary: {0}'.format(cmd[0]))
-    try:
-        p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
-        f = p.stdout.fileno()
-        s, b = time.time(), []
-        while True:
-            if time.time() - s >= PROC_TIMEOUT:
-                p.stdout.close()
-                p.terminate()
-                raise FullTextException('Timeout executing command.')
-            if f in select.select([f], [], [], 0)[0]:
-                b.append(p.stdout.read())
-            if p.poll() is not None:
-                break
-            time.sleep(0.1)
-        text = ''.join(b)
+        text = handler(f, type)
     except:
         if default is not None:
             return default
         raise
-    post = FUNC_MAP.get(type, None)
-    if post:
-        text = post(text)
     return STRIP_WHITE.sub(' ', text).strip()
 
-
 def check():
+    """
+    Checks for the existence of required tools, then reports missing tools to stdout. This
+    can help you determine what needs to be installed for fulltext to fully function.
+    """
     commands = {}
     for type, cmd in PROG_MAP.items():
         commands[cmd[0]] = None