In [None]:
import os
import re
import io
import sys
import glob
import shutil
import hashlib
import numpy as np

import threading
import multiprocessing as mp

def get_files(root_dir, sort=True, re_expr='.*', exclude=()):
    """Recursively list the files under ``root_dir`` as '/'-separated paths.

    Args:
        root_dir: directory to walk; ``os.sep`` is replaced by '/' before walking.
        sort: when true, the resulting paths are returned in sorted order.
        re_expr: regular expression that must *fully* match a path (the
            '/'-joined directory and file name) for it to be kept.
        exclude: iterable of regular expressions; a path fully matching any
            of them is dropped. A single pattern string or ``None`` is also
            accepted.

    Returns:
        list of str: matching file paths, each starting with the normalized
        ``root_dir``.

    Raises:
        re.error: if ``re_expr`` or an ``exclude`` pattern is not a valid regex.
    """
    # A bare string would otherwise be iterated character by character,
    # turning every character into its own (usually invalid) pattern.
    if exclude is None:
        exclude = ()
    elif isinstance(exclude, str):
        exclude = (exclude,)

    # Compile once instead of re-resolving the patterns for every file.
    include_re = re.compile(re_expr)
    exclude_res = [re.compile(ex) for ex in exclude]

    root_dir = root_dir.replace(os.sep, '/')
    file_list = []
    for dir_n, _, fn_list in os.walk(root_dir):
        for fn in fn_list:
            # os.path.join inserts os.sep, so normalize again after joining.
            path = os.path.join(dir_n, fn).replace(os.sep, '/')
            if include_re.fullmatch(path) is None:
                continue
            if any(ex.fullmatch(path) for ex in exclude_res):
                continue
            file_list.append(path)
    if sort:
        file_list.sort()
    return file_list

def get_md5(fname, chunk_size=1024*10):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and fed to the hash in pieces of
    ``chunk_size`` bytes (coerced with ``int``), so the whole file never has
    to be held in memory at once.
    """
    block_len = int(chunk_size)
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        while True:
            piece = stream.read(block_len)
            if piece == b"":
                # An empty read marks end of file.
                break
            digest.update(piece)
    return digest.hexdigest()

def output_md5(file_list, output_file=sys.stdout, prefix=''):
    """Write one ``'<name>: <md5>'`` line per entry of ``file_list``.

    Args:
        file_list: iterable of file names; each is hashed via
            ``get_md5(prefix + name)`` but printed without the prefix.
        output_file: a writable text stream, or a path string. A path is
            opened for writing (truncating any existing file) and closed
            again before returning; a stream passed in by the caller is
            left open.
        prefix: string prepended to every name to form the path that is read.
    """
    opened_here = isinstance(output_file, str)
    if opened_here:
        output_file = open(output_file, 'w')
    try:
        for f in file_list:
            md5 = get_md5(prefix + f)
            print('%s: %s' % (f, md5), file=output_file)
    finally:
        # Only close what this function opened, so the data is flushed to
        # disk; caller-owned streams (sys.stdout, StringIO, ...) stay usable.
        if opened_here:
            output_file.close()

In [None]:
# Roots of the two directory trees to compare.
# src = '/media/tangliyao/bhfs'
# dst = '/media/tangliyao/TOSHIBA EXT'
# NOTE: on Windows a trailing '\\' must be appended -- presumably this is
# about bare drive roots such as 'D:\\'; TODO confirm.
src = 'D:\\Image'
dst = 'F:\\Image'

# Patterns are full-matched against '/'-separated paths by get_files.
# Raw strings keep the regex escapes (\$ and \.) from being read as
# invalid Python string escapes.
include = '.*'
exclude = [r'.*/System Volume Information/.*', r'.*/\$RECYCLE.BIN/.*', r'.*/\.Trash.*']

src_files = get_files(src, sort=True, re_expr=include, exclude=exclude)
dst_files = get_files(dst, sort=True, re_expr=include, exclude=exclude)

# Drop the root prefix so both listings hold comparable relative paths.
# get_files only swaps os.sep for '/', so the prefix length is unchanged.
src_files = [f[len(src):] for f in src_files]
dst_files = [f[len(dst):] for f in dst_files]

print(f'in src ({src}) but not dst ({dst}):')
for i in sorted(set(src_files) - set(dst_files)):
    print('\t', i)
print(f'in dst ({dst}) but not src ({src}):')
for i in sorted(set(dst_files) - set(src_files)):
    print('\t', i)

In [None]:
# Destination of the md5 listings: either path strings (files on disk) ...
# output_dir = '~'
# src_md5 = os.path.join(os.path.expanduser(output_dir), 'src_md5')
# dst_md5 = os.path.join(os.path.expanduser(output_dir), 'dst_md5')
# ... or in-memory text buffers (the active choice).
src_md5, dst_md5 = io.StringIO(), io.StringIO()

# Alternative 1 -- sequential:
# output_md5(src_files, src_md5, src)
# output_md5(dst_files, dst_md5, dst)

# Alternative 2 -- multi-processing (not working in IPython):
# worker = [
#     mp.Process(target=output_md5, args=(src_files, src_md5, src)),
#     mp.Process(target=output_md5, args=(dst_files, dst_md5, dst)),
# ]

# Active choice -- multi-threading: one thread per tree.
hash_jobs = (
    (src_files, src_md5, src),
    (dst_files, dst_md5, dst),
)
worker = [threading.Thread(target=output_md5, args=job) for job in hash_jobs]

# Start every thread before waiting on any of them, so both trees are
# hashed concurrently.
for thread in worker:
    thread.start()
for thread in worker:
    thread.join()

In [None]:
def _read_md5_lines(target):
    """Return the listing lines from a path string or a StringIO-like buffer."""
    if isinstance(target, str):
        # Context manager so the listing file is closed after reading.
        with open(target) as fh:
            text = fh.read()
    else:
        text = target.getvalue()
    return text.split('\n')


src_md5_f = _read_md5_lines(src_md5)
dst_md5_f = _read_md5_lines(dst_md5)

# Each non-empty line has the form '<relative path>: <md5>'. Index the dst
# side by path so the comparison does not depend on both listings having
# the same length or order.
dst_line_by_name = {}
for line in dst_md5_f:
    if line:
        dst_line_by_name[line.rpartition(': ')[0]] = line

# Report files present on both sides whose digests differ; `i` is the line
# index in the src listing. Files missing on one side are not reported here.
for i, src_line in enumerate(src_md5_f):
    if not src_line:
        continue
    dst_line = dst_line_by_name.get(src_line.rpartition(': ')[0])
    if dst_line is not None and dst_line != src_line:
        print(f'{i} ({src} <--> {dst})\nsrc_md5_f = {src_line}\ndst_md5_f = {dst_line}')

In [None]:
# Print the md5 of every tfevents file under the download directory.
# get_files has no `platform` parameter, so that keyword (which raised
# TypeError) is not passed.
output_md5(get_files('D:/Research/RandLA-Net/download/download', re_expr='.*tfevents.*'))