Skip to content

Commit

Permalink
Merge pull request #621 from matz-e/core-dataset-options
Browse files Browse the repository at this point in the history
Allow filtering on file extensions.
  • Loading branch information
annawoodard committed Jan 5, 2018
2 parents 94865bf + 227afe1 commit a331c39
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 8 deletions.
43 changes: 36 additions & 7 deletions lobster/core/dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import defaultdict
import fnmatch
import math
import os

Expand All @@ -11,17 +12,41 @@
]


def flatten(files, matches=None):
    """Flatten a list of directories or files to a single list of files.

    Parameters
    ----------
    files : str or list
        A list of paths to expand.  Can also be a string containing a
        path.
    matches : str or list
        A list of shell-style patterns (as understood by `fnmatch`) to
        match the base name of each file against.  Only successfully
        matched files will be returned.  A single pattern may be passed
        as a string.  If `None` or empty, no filtering is applied.

    Returns
    -------
    files : list
        A list of files found in the paths passed in the input
        parameter `files`, optionally restricted to those whose base
        name matches at least one of the patterns in `matches`.
    """
    if not isinstance(files, list):
        files = [files]
    # A bare string would otherwise be iterated character by character,
    # turning every character into its own pattern.
    if isinstance(matches, str):
        matches = [matches]

    res = []
    for entry in files:
        entry = os.path.expanduser(entry)
        if fs.isdir(entry):
            # Directories contribute whatever `fs.ls` lists for them.
            res.extend(fs.ls(entry))
        elif fs.isfile(entry):
            res.append(entry)
        # Entries that are neither a directory nor a file are dropped.

    if not matches:
        return res

    # Patterns are applied to the base name only, not to the full path.
    return [
        fn for fn in res
        if any(fnmatch.fnmatch(os.path.basename(fn), m) for m in matches)
    ]


Expand Down Expand Up @@ -67,23 +92,27 @@ class Dataset(Configurable):
A list of files or directories to process. May also be a `str`
pointing to a single file or directory.
files_per_task : int
How many files to process in one task
How many files to process in one task. Defaults to 1.
patterns : list
A list of shell-style file patterns to match filenames against.
Defaults to `None`, in which case all files found are used.
"""
_mutable = {}

def __init__(self, files, files_per_task=1, patterns=None):
    """Record the dataset configuration.

    Parameters
    ----------
    files : list or str
        Files or directories to process.
    files_per_task : int
        How many files to process in one task.  Defaults to 1.
    patterns : list
        Shell-style patterns used to filter file names.  Defaults to
        `None`.
    """
    # Nothing is expanded here; `validate` and `get_info` pass `files`
    # and `patterns` to `flatten` when they are called.
    self.files = files
    self.files_per_task = files_per_task
    self.patterns = patterns
    # Overwritten with the number of files found by `get_info`.
    self.total_units = 0

def validate(self):
    """Return `True` if at least one file remains after expanding
    `self.files` and filtering with `self.patterns`.
    """
    # The patterns must be applied here as well, so that a dataset whose
    # files are all filtered out is reported as invalid.
    return len(flatten(self.files, self.patterns)) > 0

def get_info(self):
dset = DatasetInfo()
dset.file_based = True

files = flatten(self.files)
files = flatten(self.files, self.patterns)
dset.tasksize = self.files_per_task
dset.total_units = len(files)
self.total_units = len(files)
Expand Down
29 changes: 28 additions & 1 deletion test/test_core_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,23 @@ def setUpClass(cls):
for i in range(5):
with open(os.path.join(cls.workdir, 'ham', str(i) + '.txt'), 'w') as f:
f.write('bacon')
os.makedirs(os.path.join(cls.workdir, 'spam'))
os.makedirs(os.path.join(cls.workdir, 'spam', 'log'))
for i in range(5):
with open(os.path.join(cls.workdir, 'spam', str(i) + '.txt'), 'w') as f:
f.write('mail')
for i in range(2):
with open(os.path.join(cls.workdir, 'spam', str(i) + '.trash'), 'w') as f:
f.write('mail')
for i in range(3):
with open(os.path.join(cls.workdir, 'spam', 'log', str(i) + '.log'), 'w') as f:
f.write('thing')

@classmethod
def tearDownClass(cls):
    """Delete the directory tree rooted at `cls.workdir`."""
    workdir = cls.workdir
    shutil.rmtree(workdir)

def runTest(self):
def test_basics(self):
with util.PartiallyMutable.unlock():
s = se.StorageConfiguration(
output=[], input=['file://' + self.workdir])
Expand All @@ -46,3 +57,19 @@ def runTest(self):

info = Dataset(files='eggs/1.txt').get_info()
assert len(info.files) == 1

def test_flatten(self):
    """Check the number of files `Dataset` reports for ``spam`` with
    and without file name patterns.
    """
    # NOTE(review): the block nesting below was reconstructed from a
    # diff view that lost its indentation -- confirm against the
    # repository.
    cases = [
        (None, 8),
        (['*.txt'], 5),
        (['[12].txt'], 2),
    ]
    with util.PartiallyMutable.unlock():
        storage = se.StorageConfiguration(
            output=[], input=['file://' + self.workdir])
        storage.activate()

        with fs.alternative():
            for patterns, expected in cases:
                info = Dataset(files=['spam'], patterns=patterns).get_info()
                assert len(info.files) == expected

0 comments on commit a331c39

Please sign in to comment.