In [1]:
import os
import re
import glob
import shutil
import hashlib
import numpy as np

def get_files(root_dir, sort=True, re_expr='.*', exclude=''):
    """Recursively collect file paths under ``root_dir``, filtered by regex.

    Args:
        root_dir: Directory to traverse with ``os.walk``.
        sort: If true, the resulting list is sorted before being returned.
        re_expr: Regular expression a path has to match in full
            (``re.fullmatch``) to be kept. It is tested against the joined
            path (walked directory + file name), not the bare file name.
        exclude: Either a single regex string or an iterable of regexes.
            A path that fully matches any of them is dropped. An empty
            string or empty iterable excludes nothing.

    Returns:
        list of str: The retained paths, each built as
        ``os.path.join(<walked directory>, <file name>)``.
    """
    # Iterating a bare string yields its characters, which would turn one
    # pattern into many single-character patterns; wrap it so it acts as
    # exactly one pattern (and keep '' meaning "no exclusions").
    if isinstance(exclude, str):
        exclude = [exclude] if exclude else []

    # Compile once up front instead of re-parsing every pattern per file.
    include_re = re.compile(re_expr)
    exclude_res = [re.compile(ex) for ex in exclude]

    file_list = []
    for dir_n, _, fn_list in os.walk(root_dir):
        for fn in fn_list:
            path = os.path.join(dir_n, fn)
            if include_re.fullmatch(path) is None:
                continue
            if any(ex.fullmatch(path) for ex in exclude_res):
                continue
            file_list.append(path)
    if sort:
        file_list.sort()
    return file_list

def get_md5(fname, chunk_size=1024*10):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and fed to the hash in pieces of
    ``chunk_size`` bytes (coerced with ``int``), so the whole file is never
    held in memory at once.
    """
    block_len = int(chunk_size)
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        while True:
            block = stream.read(block_len)
            if block == b"":
                # An empty read marks end of file.
                break
            digest.update(block)
    return digest.hexdigest()

def output_md5(output_file, file_list, prefix=''):
    """Write an MD5 manifest for ``file_list`` to the path ``output_file``.

    Every entry is hashed at ``prefix + entry`` via ``get_md5`` and written
    as one ``<entry>: <md5>`` line; the prefix itself is not written, only
    the entry as given. The output file is opened in ``'w'`` mode.
    """
    with open(output_file, 'w') as manifest:
        manifest.writelines(
            '%s: %s\n' % (rel_path, get_md5(prefix + rel_path))
            for rel_path in file_list
        )

In [2]:
# Root directories of the two trees being compared, and where the MD5
# manifests get written ('~' is expanded later with os.path.expanduser).
src = '/media/tangliyao/bhfs'
dst = '/media/tangliyao/TOSHIBA EXT'
output_dir = '~'

# Regex every path must fully match to be listed.
include = '.*'
# Regexes for paths to skip. Raw strings keep the backslashes in `\$` and
# `\.` as regex escapes without relying on Python passing unknown string
# escapes through (which triggers an invalid-escape warning).
exclude = [r'.*/System Volume Information/.*', r'.*/\$RECYCLE.BIN/.*', r'.*/\.Trash.*']

src_files = get_files(src, sort=True, re_expr=include, exclude=exclude)
dst_files = get_files(dst, sort=True, re_expr=include, exclude=exclude)

# Strip each root prefix so the two listings hold root-relative paths that
# can be compared directly with one another.
src_files = [f[len(src):] for f in src_files]
dst_files = [f[len(dst):] for f in dst_files]

In [3]:
# Relative paths present under src but absent under dst (empty set = none missing).
set(src_files) - set(dst_files)

set()

In [4]:
# Hash every source file and write the manifest to <output_dir>/src_md5.
src_md5 = os.path.join(
    os.path.expanduser(output_dir),
    'src_md5',
)
output_md5(output_file=src_md5, file_list=src_files, prefix=src)

In [5]:
# Hash every destination file and write the manifest to <output_dir>/dst_md5.
dst_md5 = os.path.join(
    os.path.expanduser(output_dir),
    'dst_md5',
)
output_md5(output_file=dst_md5, file_list=dst_files, prefix=dst)

In [6]:
# Read both manifests as lists of lines; the `with` block closes the file
# handles deterministically instead of leaving them to the garbage collector.
with open(src_md5) as src_fh, open(dst_md5) as dst_fh:
    src_md5_f = src_fh.read().split('\n')
    dst_md5_f = dst_fh.read().split('\n')

# Indices of manifest lines that differ. Lines are compared by position, so
# this is only meaningful when both manifests list the same files in the
# same order; an empty index array means every line matched.
np.where(np.array(src_md5_f) != np.array(dst_md5_f))

(array([], dtype=int64),)