In [24]:
#! /Users/sophia/mambaforge/envs/viper-core/bin/python

#script to cleanup intermediate outputs from project directories of no longer active segmentations
#preserves all *final* results while removing any intermediate steps which take up a lot of memory

import sys, getopt
import argparse
import os

from tabulate import tabulate
from functools import partial
from concurrent.futures import ProcessPoolExecutor as Pool
from colorama import init
from colorama import Fore, Back, Style
import h5py

#check size of files contained in each directory

def generate_parser():
    # Instantiate the parser
    parser = argparse.ArgumentParser(description='Scan directory for viper projects.')
    
    # Required positional argument
    parser.add_argument('search_directory', type=str,nargs='?',
                        help='directory containing viper projects')
    
    parser.add_argument("-t","--threads", type=int, default=8, help="number of threads")
    
    parser.add_argument("-r","--recursion", type=int, default=5, help="levels of recursion")
    return parser
    
def main(dir):
   
    print(f"Viper-clean collecting information on current state of directory. This can take some time...")
    
# parser = generate_parser()
# args = parser.parse_args()

# global num_threads 
 #num_threads= args.threads
    num_threads =2

# if args.search_directory is None:
# search_directory = os.getcwd()

# else:
# try:
# search_directory = os.path.abspath(args.search_directory)
# except:
# print("search directory not a valid path")

    search_directory = dir
        
    table = scan_directory(1, search_directory)
    
    # check if any projects were found
    if len(table) > 0:
        
        table.sort(key=lambda x: x[0])
        
        for line in table:
            print_project(line)
    else:
        print("No projects found")

def scan_directory(levels_left, path):
    
    HDF_FILETYPES = ["hdf", "hf", "h5"]
    NUMPY_FILETYPES = [".npy"]

    if levels_left > 0:

        config_name = "config.yml"
        is_project_dir = os.path.isfile(os.path.join(path, config_name))

        if is_project_dir:     

            dir_name = os.path.basename(path)
            project_size = get_dir_size(path)

            #get sizes of subdirectories
            segmentation_dir = os.path.join(path, "segmentation")
            extraction_dir = os.path.join(path, "extraction")
            classification_dir = os.path.join(path, "classification")
            selection_dir = os.path.join(path, "selection")

            segmentation_exists = os.path.isdir(segmentation_dir)
            extraction_exists = os.path.isdir(extraction_dir)
            classification_exists = os.path.isdir(classification_dir)
            selection_exists = os.path.isdir(selection_dir)

            if segmentation_exists:
                segmentation_size = get_dir_size(segmentation_dir)
            
            if extraction_exists:
                extraction_size = get_dir_size(segmentation_dir)

            if classification_exists:
                classification_size = get_dir_size(segmentation_dir)

            if selection_exists:
                selection_size = get_dir_size(segmentation_dir)

            #check if segmentation was completed
            segmentation_name = "segmentation/segmentation.h5"
            segmentation_finished = os.path.isfile(os.path.join(path,segmentation_name))

            #get size of segmentation outputs
            if segmentation_exists: 
                #check if sharding was done
                sharding_dir = os.path.join(segmentation_dir, "tiles")
                sharding_exists = os.path.isdir(sharding_dir)
                
                if sharding_exists:
                    #get number of and size of shards
                    individual_shard_dirs = os.listdir(sharding_dir)
                    number_shards = len(individual_shard_dirs)

                    sizes_shards = []
                    for _dir in individual_shard_dirs:
                        _size = get_dir_size(os.path.join(sharding_dir, _dir))
                        sizes_shards.append(_size)

                segmentation_file_sizes = get_file_size(os.path.join(segmentation_dir, "segmentation.h5")) + get_file_size(os.path.join(segmentation_dir, "classes.csv"))
                input_file_size = get_file_size(os.path.join(segmentation_dir, "input_image.h5"))
            
            if segmentation_exists:
                if sharding_exists:
                    return [[dir_name, [[segmentation_exists, extraction_exists, classification_exists, selection_exists],[segmentation_size, extraction_size, classification_size, selection_size]], [sharding_exists, number_shards, sum(sizes_shards)],[segmentation_file_sizes, input_file_size]]]
                else:
                    return [[dir_name, [[segmentation_exists, extraction_exists, classification_exists, selection_exists],[segmentation_size, extraction_size, classification_size, selection_size]], [sharding_exists, 0, 0],[segmentation_file_sizes, input_file_size]]]
            else:
                return [[dir_name, [[segmentation_exists, extraction_exists, classification_exists, selection_exists],[segmentation_size, extraction_size, classification_size, selection_size]]]]

        else:        
            # iterate all subfolders
            current_level_directories = [os.path.join(path, name) for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]

            with Pool(max_workers=num_threads) as pool:
                projects = pool.map(partial(scan_directory, levels_left-1),current_level_directories)
            
            return list(flatten(projects))

def get_dir_size(path):
    total = 0
    for entry in os.scandir(path):
        if entry.is_file():
            total += entry.stat().st_size
        elif entry.is_dir():
            total += get_dir_size(entry.path)
    return total

def get_file_size(path):
    if path.isfile():
        return path.stat().st_size
    else:
        return None

def flatten(l):
    for el in l:
        if isinstance(el, Iterable):
            if len(el)>0:
                yield el[0]
        else:
            # pool.map might return None on subfolders
            if el is not None:
                yield el

def print_project(line):
    """
    Arguments:
        line (list): List of Lists with the following items:
        [project name, segmentation finished, number of cells in segmentation, project size, extractions]
        extraction is a list of tuples (extraction name , number of cells, size)
    """
    
    out_line = []
    Fore.RED + 'You can colorize a single line.' + Style.RESET_ALL
    out_line.append('\033[1m' + str(line[0]).rjust(15))
    
    color = Fore.GREEN if line[1] else Fore.RED
    
    out_line.append(color + str(line[1]).rjust(15) + Style.RESET_ALL)
    out_line.append('\033[1m' + str(line[2]).rjust(15) + Style.RESET_ALL)
    out_line.append('\033[1m' + str(line[3]).rjust(15) + Style.RESET_ALL)
    print("".join(out_line))
    
    pad = " "*15
    
    line[-1].sort(key=lambda x: x[0])
    
    for extract in line[-1]:
        print(Fore.BLUE +pad + str(extract[0]).rjust(15)+str(extract[1]).rjust(15)+str(extract[2]).rjust(15)+ Style.RESET_ALL)


#actually execute the programm if calles from command line
# if __name__ == "__main__":
#     init()
#     main("test")

In [28]:
dir = "/Volumes/pool-mann-viperssd/datasets/2021_10_13_autophagy_screen/Screen_0"
search_directory = dir
    
table = scan_directory(1, search_directory)

# check if any projects were found
if len(table) > 0:
    
    table.sort(key=lambda x: x[0])
    
    for line in table:
        print_project(line)
else:
    print("No projects found")

Process SpawnProcess-5:
Traceback (most recent call last):
  File "/Users/sophia/mambaforge/envs/viper-core/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/sophia/mambaforge/envs/viper-core/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/sophia/mambaforge/envs/viper-core/lib/python3.9/concurrent/futures/process.py", line 237, in _process_worker
    call_item = call_queue.get(block=True)
  File "/Users/sophia/mambaforge/envs/viper-core/lib/python3.9/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'scan_directory' on <module '__main__' (built-in)>
Process SpawnProcess-6:
Traceback (most recent call last):
  File "/Users/sophia/mambaforge/envs/viper-core/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/sophia/mambaforge/envs/viper-core/lib/python3.9/mu

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [26]:
num_threads = 2
main()

Viper-clean collecting information on current state of directory. This can take some time...


Process SpawnProcess-3:
Traceback (most recent call last):
  File "/Users/sophia/mambaforge/envs/viper-core/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/sophia/mambaforge/envs/viper-core/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/sophia/mambaforge/envs/viper-core/lib/python3.9/concurrent/futures/process.py", line 237, in _process_worker
    call_item = call_queue.get(block=True)
  File "/Users/sophia/mambaforge/envs/viper-core/lib/python3.9/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'scan_directory' on <module '__main__' (built-in)>
Process SpawnProcess-4:
Traceback (most recent call last):
  File "/Users/sophia/mambaforge/envs/viper-core/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/sophia/mambaforge/envs/viper-core/lib/python3.9/mu

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.