Merge pull request #621 from matz-e/core-dataset-options

Allow filtering on file extensions.
NDCMS · Jan 5, 2018 · a331c39 · a331c39
2 parents 94865bf + 227afe1
commit a331c39
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 8 deletions.
diff --git a/lobster/core/dataset.py b/lobster/core/dataset.py
@@ -1,4 +1,5 @@
 from collections import defaultdict
+import fnmatch
 import math
 import os
 
@@ -11,17 +12,41 @@
 ]
 
 
-def flatten(files):
+def flatten(files, matches=None):
+    """Flatten a list of directories or files to a single list of files.
+
+    Parameters
+    ----------
+        files : str or list
+            A list of paths to expand. Can also be a string containing a path.
+        matches : list, optional
+            A list of shell-style patterns matched against each file's
+            basename. If given, only matching files will be returned.
+
+    Returns
+    -------
+        files : list
+            A list of files found in the paths passed in the input
+            parameter `files`, optionally restricted to those matching
+            the patterns in `matches`.
+    """
+    def matchfn(fn):
+        base = os.path.basename(fn)
+        for m in matches:
+            if fnmatch.fnmatch(base, m):
+                return True
+        return False
     res = []
     if not isinstance(files, list):
         files = [files]
     for entry in files:
         entry = os.path.expanduser(entry)
         if fs.isdir(entry):
-            res += fs.ls(entry)
+            res.extend(fs.ls(entry))
         elif fs.isfile(entry):
             res.append(entry)
-
+    if matches:
+        return [fn for fn in res if matchfn(fn)]
     return res
 
 
@@ -67,23 +92,27 @@ class Dataset(Configurable):
             A list of files or directories to process.  May also be a `str`
             pointing to a single file or directory.
         files_per_task : int
-            How many files to process in one task
+            How many files to process in one task. Defaults to 1.
+        patterns : list
+            A list of shell-style file patterns to match filenames against.
+            Defaults to `None`, in which case all files found are used.
     """
     _mutable = {}
 
-    def __init__(self, files, files_per_task=1):
+    def __init__(self, files, files_per_task=1, patterns=None):
         self.files = files
         self.files_per_task = files_per_task
+        self.patterns = patterns
         self.total_units = 0
 
     def validate(self):
-        return len(flatten(self.files)) > 0
+        return len(flatten(self.files, self.patterns)) > 0
 
     def get_info(self):
         dset = DatasetInfo()
         dset.file_based = True
 
-        files = flatten(self.files)
+        files = flatten(self.files, self.patterns)
         dset.tasksize = self.files_per_task
         dset.total_units = len(files)
         self.total_units = len(files)

diff --git a/test/test_core_dataset.py b/test/test_core_dataset.py
@@ -26,12 +26,23 @@ def setUpClass(cls):
         for i in range(5):
             with open(os.path.join(cls.workdir, 'ham', str(i) + '.txt'), 'w') as f:
                 f.write('bacon')
+        os.makedirs(os.path.join(cls.workdir, 'spam'))
+        os.makedirs(os.path.join(cls.workdir, 'spam', 'log'))
+        for i in range(5):
+            with open(os.path.join(cls.workdir, 'spam', str(i) + '.txt'), 'w') as f:
+                f.write('mail')
+        for i in range(2):
+            with open(os.path.join(cls.workdir, 'spam', str(i) + '.trash'), 'w') as f:
+                f.write('mail')
+        for i in range(3):
+            with open(os.path.join(cls.workdir, 'spam', 'log', str(i) + '.log'), 'w') as f:
+                f.write('thing')
 
     @classmethod
     def tearDownClass(cls):
         shutil.rmtree(cls.workdir)
 
-    def runTest(self):
+    def test_basics(self):
         with util.PartiallyMutable.unlock():
             s = se.StorageConfiguration(
                 output=[], input=['file://' + self.workdir])
@@ -46,3 +57,19 @@ def runTest(self):
 
                 info = Dataset(files='eggs/1.txt').get_info()
                 assert len(info.files) == 1
+
+    def test_flatten(self):
+        with util.PartiallyMutable.unlock():
+            s = se.StorageConfiguration(
+                output=[], input=['file://' + self.workdir])
+            s.activate()
+
+            with fs.alternative():
+                info = Dataset(files=['spam']).get_info()
+                assert len(info.files) == 8
+
+                info = Dataset(files=['spam'], patterns=['*.txt']).get_info()
+                assert len(info.files) == 5
+
+                info = Dataset(files=['spam'], patterns=['[12].txt']).get_info()
+                assert len(info.files) == 2