In [1]:
import os
import stat
from itertools import filterfalse
import hashlib


In [2]:
def hash_value_for_file(f, hash_function, block_size=2**20):
    """Feed the contents of an open file into a hash object and return the digest.

    Arguments:
    f -- open file-like object; only ``f.read(size)`` is called on it
    hash_function -- hash object exposing ``update()`` and ``digest()``
                     (for example ``hashlib.md5()``); it is updated in place
    block_size -- maximum amount requested per ``read`` call,
                  defaults to 2**20
    Return value:
    Whatever ``hash_function.digest()`` returns once the file is exhausted.
    """
    # Read ahead one block at a time so the whole file is never held in
    # memory; an empty read means there is nothing left to hash.
    chunk = f.read(block_size)
    while chunk:
        hash_function.update(chunk)
        chunk = f.read(block_size)
    return hash_function.digest()

In [3]:
# Results of earlier content comparisons, keyed by
# (local_file, remote_file, local signature, remote signature).
_cache = dict()
# Number of bytes requested per read when file contents are compared.
BUFSIZE = 8 * 1024


def cmp(local_file, remote_file, shallow=True):
    """Decide whether two files on the local filesystem are the same.

    Arguments:
    local_file -- path of the first file
    remote_file -- path of the second file (also read through os.stat/open)
    shallow -- when true, files whose stat signatures (type, size,
               modification time) are identical are treated as equal
               without reading them. Defaults to True.
    Return value:
    True if the files are considered the same, False otherwise.
    Paths that are not regular files never compare equal, and files of
    different size are reported as different without being read.
    Content comparison results are cached per (paths, signatures) key, so
    an entry is not reused once either file's stat signature changes.
    The cache may be cleared by calling clear_cache().
    """
    local_sig = _sig(os.stat(local_file))
    remote_sig = _sig(os.stat(remote_file))

    # Only regular files are eligible for comparison.
    both_regular = local_sig[0] == stat.S_IFREG and remote_sig[0] == stat.S_IFREG
    if not both_regular:
        return False
    # Shallow mode trusts a matching signature.
    if shallow and local_sig == remote_sig:
        return True
    # A size mismatch settles the question without reading anything.
    if local_sig[1] != remote_sig[1]:
        return False

    key = (local_file, remote_file, local_sig, remote_sig)
    cached = _cache.get(key)
    if cached is not None:
        return cached

    result = _do_cmp(local_file, remote_file)
    if len(_cache) > 100:
        # Keep the cache bounded by dropping everything once it grows
        # past 100 entries.
        clear_cache()
    _cache[key] = result
    return result


def _sig(st):
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)


def _do_cmp(f1, f2):
    """Compare the contents of two files, reading BUFSIZE bytes at a time.

    Both paths are opened in binary mode. Returns False as soon as a pair
    of chunks differs, and True once both files are exhausted without a
    mismatch.
    """
    chunk_size = BUFSIZE
    with open(f1, 'rb') as left, open(f2, 'rb') as right:
        left_chunk = left.read(chunk_size)
        right_chunk = right.read(chunk_size)
        while left_chunk == right_chunk:
            if not left_chunk:
                # Equal and empty: both files ended at the same point.
                return True
            left_chunk = left.read(chunk_size)
            right_chunk = right.read(chunk_size)
        return False


def clear_cache():
    """Empty the module-level cache of comparison results used by cmp().

    The dictionary is emptied in place rather than rebound.
    """
    _cache.clear()

In [9]:
def benchMD5(path_a='SSH_Connection.ipynb', path_b='SSH_Connection.ipynb'):
    """Benchmark helper: compute the MD5 digest of two files.

    Arguments:
    path_a -- path of the first file to hash
    path_b -- path of the second file to hash
    Both default to 'SSH_Connection.ipynb', so calling the function with
    no arguments hashes that file twice.
    Return value:
    None. The digests are computed only for their cost and then discarded.
    """
    for path in (path_a, path_b):
        with open(path, 'rb') as input_file:
            # Use a fresh hash object per file so the two digests are
            # independent of each other.
            hash_value_for_file(input_file, hashlib.md5())

In [11]:
def benchSHA1(path_a='SSH_Connection.ipynb', path_b='SSH_Connection.ipynb'):
    """Benchmark helper: compute the SHA-1 digest of two files.

    Arguments:
    path_a -- path of the first file to hash
    path_b -- path of the second file to hash
    Both default to 'SSH_Connection.ipynb', so calling the function with
    no arguments hashes that file twice.
    Return value:
    None. The digests are computed only for their cost and then discarded.
    """
    for path in (path_a, path_b):
        with open(path, 'rb') as input_file:
            # Use a fresh hash object per file so the two digests are
            # independent of each other.
            hash_value_for_file(input_file, hashlib.sha1())

In [14]:
# Time MD5-hashing the notebook file twice (100000 loops per run).
%timeit -n 100000 benchMD5()

105 µs ± 346 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [15]:
# Time SHA-1-hashing the notebook file twice (100000 loops per run).
%timeit -n 100000 benchSHA1()

93.8 µs ± 308 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [18]:
# Time the shallow (stat-only) comparison. A file compared with itself
# normally has identical signatures, so cmp() returns before it touches the
# cache; clear_cache() here only adds its own call cost to the measurement.
%timeit -n 100000 cmp('SSH_Connection.ipynb', 'SSH_Connection.ipynb'); clear_cache()  

3.2 µs ± 64.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [19]:
# Time the content comparison (shallow=False). clear_cache() is part of the
# timed statement so that every loop re-reads the files instead of returning
# a cached result.
%timeit -n 100000 cmp('SSH_Connection.ipynb', 'SSH_Connection.ipynb', shallow=False); clear_cache()  

25.7 µs ± 167 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
