Skip to content

Commit

Permalink
Removed timeout, added ability to extract text from file-like objects…
Browse files Browse the repository at this point in the history
…, changed zipfile handling, lots of docs.
  • Loading branch information
btimby committed Feb 6, 2012
1 parent 385eab1 commit fdce48b
Show file tree
Hide file tree
Showing 5 changed files with 270 additions and 107 deletions.
17 changes: 1 addition & 16 deletions README.md
Expand Up @@ -88,19 +88,4 @@ the file contents.
Future
----

A feature to expect in the near future is support for file-like objects:

```python
> import fulltext
> f = file('existing.pdf', 'r')
> fulltext.get(f, '< no content >')
'Lorem ipsum...'
```

Also, currently, plain text files are not detected or handled. The library should detect
a plain text file and perform only post-processing on it.

One last change I plan is to use the mimetypes library for detection rather than using
the file extension directly. Commands and post-processing will then be mapped to mime types.
Then we should also allow the library user to add their own commands for support of other
file types (much like the mimetypes library does with the add_type function).
Sometimes multiple tools can be used. For example, catdoc provides xls2csv, while xls2csv provides convertxls2csv. We should use whichever is present.
8 changes: 8 additions & 0 deletions files/test.txt
@@ -0,0 +1,8 @@
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ipsum augue, iaculis quis auctor eu, adipiscing non est.
Nullam id sem diam, eget varius dui. Etiam sollicitudin sapien nec odio elementum sit amet luctus magna volutpat. Ut
commodo nulla neque. Aliquam erat volutpat. Integer et nunc augue. Pellentesque habitant morbi tristique senectus et
netus et malesuada fames ac turpis egestas. Quisque at enim nulla, vel tincidunt urna. Nam leo augue, elementum ut
viverra eget, scelerisque in purus. In arcu orci, porta nec aliquet quis, pretium a sem. In fermentum nisl id diam
luctus viverra. Nullam semper, metus at euismod vulputate, orci odio dignissim urna, quis iaculis neque lacus ut
tortor. Ut a justo non dolor venenatis accumsan. Proin dolor eros, aliquam id condimentum et, aliquam quis metus.
Vivamus eget purus diam.
Binary file added files/test.zip
Binary file not shown.
318 changes: 228 additions & 90 deletions fulltext/__init__.py
@@ -1,63 +1,14 @@
import os, os.path, subprocess, re, csv, time, select, mimetypes
# TODO: support file-like objects (pipe file data to stdin).
import os, os.path, subprocess, re, csv, time, select, mimetypes, tempfile
# TODO: Sometimes multiple tools can be used, choose the one that is installed.

# Teach the mimetypes module about the .rar extension so that it maps to
# 'application/rar', the key used for rar archives in PROG_MAP below.
mimetypes.add_type('application/rar', '.rar')

# Maps a (mimetype, encoding) pair to the command line used to extract text from
# that kind of file. get() fills every '{0}' placeholder with the file name via
# str.format() and treats the first element as the program to execute.
# NOTE(review): entries such as '-x {0}', '-c -' and '-n 18' pass an option and its
# value as a single argv element -- confirm the tools accept that form.
PROG_MAP = {
(None, None): ('cat', '{0}'),
('application/pdf', None): ('pdftotext', '-q', '-nopgbrk', '{0}', '-'),
('application/msword', None): ('antiword', '{0}'),
('application/vnd.openxmlformats-officedocument.wordprocessingml.document', None): ('docx2txt', '{0}', '-'), # http://sourceforge.net/projects/docx2txt/
('application/vnd.ms-excel', None): ('convertxls2csv', '-q', '-x {0}', '-c -'),
('application/rtf', None): ('unrtf', '--text', '--nopict', '{0}'),
('application/vnd.oasis.opendocument.text', None): ('odt2txt', '{0}'),
('application/vnd.oasis.opendocument.spreadsheet', None): ('odt2txt', '{0}'),
('application/zip', None): ('zipinfo', '-2z', '{0}'),
('application/x-tar', 'gzip'): ('tar', 'tzf', '{0}'),
('application/x-tar', 'gzip2'): ('tar', 'tjf', '{0}'),
('application/rar', None): ('unrar', 'vb', '-p-', '-ierr', '-y', '{0}'),
('text/html', None): ('html2text', '-nobs', '{0}'),
('text/xml', None): ('html2text', '-nobs', '{0}'),
('image/jpeg', None): ('exiftool', '-s', '-s', '-s', '{0}'),
('video/mpeg', None): ('exiftool', '-s', '-s', '-s', '{0}'),
('audio/mpeg', None): ('exiftool', '-s', '-s', '-s', '{0}'),
('application/octet-stream', None): ('strings', '-n 18', '{0}'),
}

# Matches any run of whitespace; get() replaces each run with a single space.
STRIP_WHITE = re.compile(r'[ \t\v\f\r\n]+')
# NOTE(review): not referenced in the visible code -- presumably intended to match
# the dashed header emitted by unrtf; confirm before relying on it.
UNRTF = re.compile(r'.*-+\n', flags=re.MULTILINE)

PROC_TIMEOUT = 5
"How long to wait for command execution."

def strip_unrtf_header(text):
    """
    Remove the header that unrtf puts in front of the converted text.

    No way was found to switch the header off, so the output is cut at the
    dashed rule and the piece that follows the first rule is returned.
    Raises IndexError when the text contains no rule at all.
    """
    rule = '-----------------'
    pieces = text.split(rule)
    return pieces[1]

def csv_to_text(text):
    """
    Flatten CSV data into plain text.

    xls files can be converted to csv; this takes that csv and joins every
    cell of every row with single spaces.
    """
    rows = csv.reader(text.splitlines(), dialect="excel")
    return ' '.join(' '.join(cells) for cells in rows)

# Post-processing functions keyed by (mimetype, encoding). After the extraction
# command has run, get() looks the type up here and, if a function is registered,
# passes the command output through it.
FUNC_MAP = {
('application/rtf', None): strip_unrtf_header,
('application/vnd.ms-excel', None): csv_to_text,
}

class FullTextException(Exception):
    """Error raised by fulltext when text cannot be extracted from a file."""


# From:
# http://stackoverflow.com/questions/377017/test-if-executable-exists-in-python
def which(program):
"Simply checks if a given program exists within PATH and is executable."
def is_exe(fpath):
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
fpath, fname = os.path.split(program)
Expand All @@ -71,61 +22,248 @@ def is_exe(fpath):
return exe_file
return None

def read_content(f, type):
    """
    A handler that simply reads a file's content. Used on unrecognized types.

    f is either the path of a file on disk or a file-like object with a read()
    method. type is the (mimetype, encoding) tuple of the file; it is accepted
    so that every handler shares the same signature and is not used here.

    Returns whatever read() returns for the file.
    """
    try:
        string_types = basestring  # Python 2: covers both str and unicode.
    except NameError:
        string_types = str  # Python 3 has no basestring.
    if isinstance(f, string_types):
        # We opened this file ourselves, so close it again instead of leaking
        # the handle until the garbage collector gets around to it.
        with open(f, 'r') as fd:
            return fd.read()
    # File-like objects belong to the caller: read them, but leave them open.
    return f.read()

def run_command(f, type):
    """
    The default handler. It runs a command and reads its output.

    f is either the path of a file on disk or a file-like object with a read()
    method. type is the (mimetype, encoding) key used to look up the command
    pair in PROG_MAP: the first command works on a path, the second reads the
    file data from stdin.

    When a file-like object is given and no stdin command is registered (the
    second command is None), the data is spooled to a temporary file, the path
    command is run on that file, and the temporary file is removed afterwards.

    Returns the data the command wrote to stdout. Raises KeyError when no
    commands are registered for type.
    """
    try:
        string_types = basestring  # Python 2: covers both str and unicode.
    except NameError:
        string_types = str  # Python 3 has no basestring.
    cmds = PROG_MAP[type]
    if isinstance(f, string_types):
        # Merge the path into the command template.
        cmd = [arg.format(f) for arg in cmds[0]]
        data = None
    else:
        assert hasattr(f, 'read'), 'File-like object must have read() method.'
        cmd = cmds[1]
        if cmd is None:
            # Use temp file:
            fd, fname = tempfile.mkstemp()
            try:
                # A file object writes all of the data (os.write() may stop
                # short) and closes the descriptor even if the write fails.
                with os.fdopen(fd, 'wb') as tmp:
                    tmp.write(f.read())
                return run_command(fname, type)
            finally:
                # mkstemp() leaves clean-up to the caller.
                os.remove(fname)
        data = f.read()
    # We use regular subprocess module here. No timeout is allowed with communicate()
    # If there are problems with times, I will investigate other options, like:
    # http://pypi.python.org/pypi/EasyProcess
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    return p.communicate(data)[0]

def strip_unrtf_header(f, type):
    """
    Handler for rtf files: runs the registered command and drops the header.

    No way was found to turn the unrtf header off, so everything up to and
    including the first dashed rule is discarded. If the output contains no
    rule, an empty string is returned.
    """
    output = run_command(f, type)
    # partition() cuts at the first rule only, leaving any later rules intact.
    _, _, body = output.partition('-----------------')
    return body

def csv_to_text(f, type):
    """
    Handler for spreadsheets: runs the registered command, which produces csv,
    then flattens that csv into plain text by joining all cells with spaces.
    """
    output = run_command(f, type)
    rows = csv.reader(output.splitlines(), dialect="excel")
    return ' '.join(' '.join(cells) for cells in rows)

# Each value is a pair of commands: the first is run on a file on disk ('{0}' is
# replaced with the path), the second is fed the file data on stdin. A second
# command of None makes run_command spool the data to a temporary file and use
# the first command on it.
PROG_MAP = {
    ('application/pdf', None): (
        ('pdftotext', '-q', '-nopgbrk', '{0}', '-'),
        ('pdftotext', '-q', '-nopgbrk', '-', '-'),
    ),
    ('application/msword', None): (
        ('antiword', '{0}'),
        ('antiword', '-'),
    ),
    ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', None): (
        ('docx2txt', '{0}', '-'), # http://sourceforge.net/projects/docx2txt/
        # The stdin variant used to repeat the '{0}' template, which is never
        # formatted for file-like objects. Fall back to a temporary file.
        None,
    ),
    ('application/vnd.ms-excel', None): (
        ('xls2csv', '{0}'), # as provided by catdoc
        None, # Supposedly this works, but I get segmentation fault.
    ),
    ('application/rtf', None): (
        ('unrtf', '--text', '--nopict', '{0}'),
        ('unrtf', '--text', '--nopict'),
    ),
    ('application/vnd.oasis.opendocument.text', None): (
        ('odt2txt', '{0}'),
        None,
    ),
    ('application/vnd.oasis.opendocument.spreadsheet', None): (
        ('odt2txt', '{0}'),
        None,
    ),
    ('application/zip', None): (
        ('funzip', '{0}'),
        ('funzip',),  # Trailing comma: a command must be a tuple, not a str.
    ),
    ('application/x-tar', 'gzip'): (
        ('tar', 'tzf', '{0}'),
        ('tar', 'tz'),
    ),
    # mimetypes reports bzip2-compressed tarballs with the encoding 'bzip2'.
    ('application/x-tar', 'bzip2'): (
        ('tar', 'tjf', '{0}'),
        ('tar', 'tj'),
    ),
    # Kept for callers that pass the historical (misspelled) encoding explicitly.
    ('application/x-tar', 'gzip2'): (
        ('tar', 'tjf', '{0}'),
        ('tar', 'tj'),
    ),
    ('application/rar', None): (
        ('unrar', 'vb', '-p-', '-ierr', '-y', '{0}'),
        None,
    ),
    ('text/html', None): (
        ('html2text', '-nobs', '{0}'),
        ('html2text', '-nobs'),
    ),
    ('text/xml', None): (
        ('html2text', '-nobs', '{0}'),
        ('html2text', '-nobs'),
    ),
    ('image/jpeg', None): (
        ('exiftool', '-s', '-s', '-s', '{0}'),
        ('exiftool', '-s', '-s', '-s', '-'),
    ),
    ('video/mpeg', None): (
        ('exiftool', '-s', '-s', '-s', '{0}'),
        ('exiftool', '-s', '-s', '-s', '-'),
    ),
    ('audio/mpeg', None): (
        ('exiftool', '-s', '-s', '-s', '{0}'),
        ('exiftool', '-s', '-s', '-s', '-'),
    ),
    ('application/octet-stream', None): (
        ('strings', '-n 18', '{0}'),
        ('strings', '-n 18'),
    ),
}
"The command registry. Use add_commands to override this."

# Maps each (mimetype, encoding) pair to the handler that extracts its text.
# Handlers are called as handler(f, type); get() falls back to read_content for
# types that are not listed here.
FUNC_MAP = {
    ('application/pdf', None): run_command,
    ('application/msword', None): run_command,
    ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', None): run_command, # http://sourceforge.net/projects/docx2txt/
    ('application/vnd.ms-excel', None): csv_to_text,
    ('application/rtf', None): strip_unrtf_header,
    ('application/vnd.oasis.opendocument.text', None): run_command,
    ('application/vnd.oasis.opendocument.spreadsheet', None): run_command,
    ('application/zip', None): run_command,
    ('application/x-tar', 'gzip'): run_command,
    ('application/x-tar', 'bzip2'): run_command,
    ('application/x-tar', 'gzip2'): run_command,
    ('application/rar', None): run_command,
    ('text/html', None): run_command,
    ('text/xml', None): run_command,
    ('image/jpeg', None): run_command,
    ('video/mpeg', None): run_command,
    ('audio/mpeg', None): run_command,
    ('application/octet-stream', None): run_command,
}
"The handler registry. Use add_handler to override this."

def add_commands(mime, commands, enc=None):
    """
    Adds a set of commands to the command registry. These commands are used to extract
    text from various file types. Each command set consists of two commands. The first
    command is used to extract text from a file on disk. The second command is used to
    extract text from a file-like object by piping its data to stdin. If no suitable
    command can accept input via stdin then the second command can be None. In this
    case, the file contents are written to a temporary file, then the first command is
    used on that.

    Each command is a tuple, which should represent the program name, and any arguments
    needed to cause the program to print plain text to stdout. You need to put `{0}` at
    the location where the input file should reside. str.format() is used to merge in
    the file name being processed when the command is executed.

    mime and enc together form the registry key, matching the (mimetype, encoding)
    tuple returned by the mimetypes library.

    Although you would never want to do this, here is how you would register the `cat`
    command for use with plain text files.

        >>> import fulltext
        >>> fulltext.add_commands('text/plain', (('cat', '{0}'), None))

    The above is not yet complete, as you still need to register a handler using
    add_handler().

    Raises AssertionError if commands does not have the shape described above.
    """
    assert isinstance(commands, tuple), 'Commands must be tuple (Command for disk file, Command for use with piping).'
    assert len(commands) == 2, 'Commands must contain two commands.'
    disk_cmd, pipe_cmd = commands
    assert isinstance(disk_cmd, tuple), 'Each command must be a tuple.'
    # The piping command is optional: None selects the temporary file fallback.
    assert pipe_cmd is None or isinstance(pipe_cmd, tuple), 'Each command must be a tuple.'
    # The placeholder may be an argument of its own or embedded in one ('--in={0}').
    assert any('{0}' in arg for arg in disk_cmd), 'Disk file command must contain {0}.'
    PROG_MAP[(mime, enc)] = commands

def add_handler(mime, handler, enc=None):
    """
    Registers the function that handles files of a specific type.

    Most file types use the built-in run_command handler, which executes a command
    and reads its output in order to convert the file to text. If you use that
    handler for your file type, you must also call add_commands to register the
    command for the type.

    A (contrived) registration for plain text files looks like this.

        >>> import fulltext
        >>> fulltext.add_handler('text/plain', fulltext.run_command)

    You would be better served by the following.

        >>> import fulltext
        >>> fulltext.add_handler('text/plain', fulltext.read_content)

    But, since the default action is to read the content of an unrecognized file
    type, registering text/plain is redundant.
    """
    assert callable(handler), 'Handler must be callable.'
    key = (mime, enc)
    FUNC_MAP[key] = handler

def get_type(filename):
    """
    Gets the mimetype and encoding using the mimetypes module. Defined as a standalone
    function for future expansion.

    Returns a (mimetype, encoding) tuple; either element is None when it cannot be
    determined. A filename of None yields (None, None).
    """
    if filename is None:
        # get() ends up with no filename when it is handed a file-like object that
        # has no name attribute. guess_type() raises on None, so report the type
        # as unknown instead.
        return (None, None)
    return mimetypes.guess_type(filename)


def get(filename, default=None, type=None):
# TODO: allow an open file to be passed in, most of the tools
# can accept data from stdin.
if not os.path.exists(filename):
if default is not None:
return default
raise FullTextException('File not found')
class FullTextException(Exception):
    """Raised by fulltext when a file cannot be found or processed."""


def get(f, default=None, filename=None, type=None):
"""
Gets text from a given file. The first parameter can be a path or a file-like object that
has a read method. Default is a way to suppress errors and just return the default text.
Filename can help figure out the type of file being used if you passed in a file-like
object. Type can be used to override the type guessing. Type should be a tuple: (mimetype, encoding)
as returned by the mimetypes library.
For any recognizable type, a command will be executed to extract the text. Then that text
will be post processed to remove redundant whitespace.
For zip files, the first member is extracted and post processed.
Any file whose type cannot be determined will simply be read then post processed.
"""
if not isinstance(f, basestring) and filename is None:
# Try to help figure out the file type.
filename = getattr(f, 'name', None)
else:
# If user provided filename, don't override it.
if filename is None:
filename = f
if not os.path.exists(filename):
if default is not None:
return default
raise FullTextException('File not found')
if type is None:
type = get_type(filename)
handler = FUNC_MAP.get(type, read_content)
try:
cmd = PROG_MAP[type]
except KeyError:
if default is not None:
return default
raise FullTextException('Unknown file type, known file types are: {0}'.format(' '.join(PROG_MAP.keys())))
cmd = map(lambda x: x.format(filename), cmd)
if which(cmd[0]) is None:
raise FullTextException('Cannot execute binary: {0}'.format(cmd[0]))
try:
p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
f = p.stdout.fileno()
s, b = time.time(), []
while True:
if time.time() - s >= PROC_TIMEOUT:
p.stdout.close()
p.terminate()
raise FullTextException('Timeout executing command.')
if f in select.select([f], [], [], 0)[0]:
b.append(p.stdout.read())
if p.poll() is not None:
break
time.sleep(0.1)
text = ''.join(b)
text = handler(f, type)
except:
if default is not None:
return default
raise
post = FUNC_MAP.get(type, None)
if post:
text = post(text)
return STRIP_WHITE.sub(' ', text).strip()


def check():
"""
Checks for the existence of required tools, then reports missing tools to stdout. This
can help you determine what needs to be installed for fulltext to fully function.
"""
commands = {}
for type, cmd in PROG_MAP.items():
commands[cmd[0]] = None
Expand Down

0 comments on commit fdce48b

Please sign in to comment.