In [102]:
# Demo for synchronization of two data directories
import io
import os
import subprocess
import time

import farmhash
import numpy as np
import pandas as pd

In [103]:
def create_dirdf(directory):
    """Build a table of content hashes for every non-hidden file under a directory.

    The tree rooted at ``directory`` is walked recursively. Every file whose
    name does not start with "." is read in binary mode and hashed with
    ``farmhash.hash64``.

    Parameters
    ----------
    directory : str
        Root of the tree to scan.

    Returns
    -------
    pandas.DataFrame or None
        One row per file, in sorted walk order. The index holds each file's
        path with the leading ``directory`` prefix removed, the single column
        ``"Hash"`` holds the hash value, and ``index.name`` is ``directory``.
        ``None`` is returned (after printing an error) when ``directory``
        does not exist.
    """
    if not os.path.exists(directory):
        print("Error: Directory '" + directory + "' does not exist.")
        return

    filenames = []
    hashvalues = []

    for root, subdirs, files in os.walk(directory):
        # os.walk reports entries in arbitrary filesystem order. Sorting the
        # sub-directory list in place steers the walk, and sorting the file
        # names fixes the order within each directory, so two scans of
        # equal trees always produce rows in the same order.
        subdirs.sort()
        for name in sorted(files):
            if name.startswith("."):  # ignore hidden files
                continue
            filepath = os.path.join(root, name)

            # The encoding of the data files is not known, so the raw bytes
            # are read. The context manager closes the handle right away
            # instead of leaving one open descriptor per scanned file.
            with open(filepath, "rb") as handle:
                contents = handle.read()

            # The hash is taken over str(bytes) rather than the bytes
            # themselves so that the values stay identical to the ones this
            # function has always produced.
            hashvalues.append(farmhash.hash64(str(contents)))

            # Every path produced by the walk starts with `directory`, so
            # dropping that many characters leaves the path relative to it.
            filenames.append(filepath[len(directory):])

    df = pd.DataFrame(data=hashvalues, index=filenames, columns=["Hash"])
    df.index.name = directory
    return df

In [104]:
# Time the scan of the first tree. perf_counter() is a monotonic,
# high-resolution clock, so the difference is not affected by adjustments
# to the system wall clock the way time.time() is.
start = time.perf_counter()
dir1df = create_dirdf("./testdir1")
end = time.perf_counter()
print("elapsed time: ", end - start)

# Scan the second tree and show both hash tables.
dir2df = create_dirdf("./testdir2")
print(dir1df)
print(dir2df)

elapsed time:  11.3596670627594
                                                                    Hash
./testdir1                                                              
/testfile1.txt                                      13167233149662072294
/testfile2.txt                                       2116770068367243914
/testfile3.txt                                      10117441339441774812
/testfile4.txt                                        407662078023551858
/testdir1A/testfile1A.txt                            1855841718642996950
/testdir1A/clem1-l-spice-6-v1.0/dsindex.lbl         16661501493414142325
/testdir1A/clem1-l-spice-6-v1.0/dsindex.tab         15056493393769822290
/testdir1A/clem1-l-spice-6-v1.0/clsp_1000/aarea...   9860181418246867054
/testdir1A/clem1-l-spice-6-v1.0/clsp_1000/aarea...   2900448953212383570
/testdir1A/clem1-l-spice-6-v1.0/clsp_1000/aarea...  17585396647624954768
/testdir1A/clem1-l-spice-6-v1.0/clsp_1000/errat...  14362732480219937772
/testdir1A/clem1-l-

In [95]:
def _label_value(line, fallback):
    """Return the stripped text after the first "=" in ``line``, or ``fallback`` if there is none."""
    _keyword, separator, value = line.partition("=")
    return value.strip() if separator else fallback


def create_datedf(directory):
    """Collect the product id and creation time recorded in each label file.

    The tree rooted at ``directory`` is walked recursively and every file
    whose name ends in ".lbl" is read as text. Lines starting with
    ``PRODUCT_ID`` and ``PRODUCT_CREATION_TIME`` supply the two values; when a
    keyword appears more than once the last occurrence wins. A label
    contributes a row only if both values were found and are non-empty.

    Parameters
    ----------
    directory : str
        Root of the tree to scan.

    Returns
    -------
    pandas.DataFrame or None
        One row per label, in sorted walk order. The index holds the product
        id exactly as written in the label (surrounding quotes included),
        the single column holds the creation time string, and ``index.name``
        is ``directory``. ``None`` is returned (after printing an error)
        when ``directory`` does not exist.
    """
    if not os.path.exists(directory):
        print("Error: Directory '" + directory + "' does not exist.")
        return

    fnames = []
    dates = []

    for root, subdirs, files in os.walk(directory):
        # Sort so that the row order is reproducible between runs; os.walk
        # itself reports entries in arbitrary filesystem order.
        subdirs.sort()
        for name in sorted(files):
            if not name.endswith(".lbl"):  # only parse labels
                continue

            product_id = ""
            product_time = ""

            # The context manager guarantees the label is closed even if
            # reading fails part-way through.
            with open(os.path.join(root, name), "r") as label:
                for line in label:
                    # A keyword line without "=" keeps the previous value
                    # instead of aborting the whole scan with an IndexError.
                    if line.startswith("PRODUCT_ID"):
                        product_id = _label_value(line, product_id)
                    if line.startswith("PRODUCT_CREATION_TIME"):
                        product_time = _label_value(line, product_time)

            if product_id and product_time:
                fnames.append(product_id)
                dates.append(product_time)

    # NOTE(review): the column holds creation times but is named "Hash";
    # the name is kept so existing consumers of this table keep working.
    df = pd.DataFrame(data=dates, index=fnames, columns=["Hash"])
    df.index.name = directory
    return df

# Build the label-date table for each test tree, then show them one after the other.
dir1_dates, dir2_dates = (
    create_datedf(tree) for tree in ("./testdir1", "./testdir2")
)
print(dir1_dates, dir2_dates, sep="\n")

                                              Hash
./testdir1                                        
"clem_act_ck3.bc"              2007-06-13T12:18:08
"clem_moon_940127_940312.bdb"  2007-06-13T12:18:17
"clem_moon_940312_940330.bdb"  2007-06-13T12:20:32
"clem_moon_940330_940415.bdb"  2007-06-13T12:22:41
"clem_moon_940415_940507.bdb"  2007-06-13T12:24:39
"clem_sky_940201_940507.bdb"   2007-06-13T12:26:37
"clem_v20.tf"                  2007-06-13T12:27:00
"moon_060721.tf"               2007-06-13T12:27:01
"moon_assoc_me.tf"             2007-06-13T12:27:01
"moon_assoc_pa.tf"             2007-06-13T12:27:01
"clem_astar_006.ti"            2007-06-13T12:27:01
"clem_bstar_006.ti"            2007-06-13T12:27:01
"clem_cpt_002.ti"              2007-06-13T12:27:01
"clem_hires_008.ti"            2007-06-13T12:27:02
"clem_lidar_005.ti"            2007-06-13T12:27:02
"clem_lwir_008.ti"             2007-06-13T12:27:02
"clem_nir_009.ti"              2007-06-13T12:27:02
"clem_uvvis_008.ti"            

In [17]:
def _dirdf_entries(dirdf):
    """Return the (index label, row values) pairs of a directory table in sorted order."""
    labels = [str(label) for label in dirdf.index]
    rows = [tuple(row) for row in dirdf.values.tolist()]
    # Sort on the label first; repr() of the row breaks ties without
    # requiring the row values themselves to be orderable.
    return sorted(zip(labels, rows), key=lambda entry: (entry[0], repr(entry[1])))


def synchronize_directory(dir1, dir2):
    """Copy the first directory onto the second with rsync unless their tables already match.

    Two tables match when they contain the same set of (index label, row
    values) pairs, regardless of row order. The comparison is done on the
    actual entries: hashing ``str(dir.values)`` is not reliable because numpy
    abbreviates the string form of large arrays with "...", and it ignores
    the file names in the index.

    Parameters
    ----------
    dir1 : pandas.DataFrame
        Table describing the source directory; ``dir1.index.name`` is the
        source path handed to rsync.
    dir2 : pandas.DataFrame
        Table describing the destination directory; ``dir2.index.name`` is
        the destination path handed to rsync.

    Returns
    -------
    None
        Progress and errors are reported with ``print``.
    """
    if dir1 is None or dir2 is None:
        # A table is None when the directory scan could not find its
        # directory; report it instead of failing with an AttributeError.
        print("Error: Cannot synchronize, a directory table is missing.")
        return

    if _dirdf_entries(dir1) == _dirdf_entries(dir2):
        print("Directories are identical")
        return

    # Trailing slashes make rsync copy the *contents* of the source into the
    # destination rather than nesting the source directory inside it.
    source = dir1.index.name + "/"
    destination = dir2.index.name + "/"
    print("Synchronizing Directories: rsync -av " + source + " " + destination)

    # The command is passed as an argument list, not a shell string, so
    # paths containing spaces or shell metacharacters are handled verbatim.
    try:
        result = subprocess.run(["rsync", "-av", source, destination], check=False)
    except FileNotFoundError:
        print("Error: rsync executable not found.")
        return

    if result.returncode != 0:
        print("Error: rsync exited with status " + str(result.returncode) + ".")

    ## From Remote -> local
    ## RSYNC Command: rsync -av remotehost@ipaddr:Filepath/to/sync ~/localfilepath/to/sync

    ## From Local -> Remote
    ## RSYNC Command: rsync -av ~/localfilepath/to/sync remotehost@ipaddr:Filepath/to/sync

In [18]:
# Bring ./testdir2 in line with ./testdir1, using the hash tables built above.
synchronize_directory(dir1=dir1df, dir2=dir2df)

Synchronizing Directories: rsync -av ./testdir1/ ./testdir2/


In [302]:
# Scan both trees again after the sync: the two hash tables should now
# match, so the final call is expected to report identical directories.
dir1df_after, dir2df_after = (
    create_dirdf(tree) for tree in ("./testdir1", "./testdir2")
)
print(dir1df_after, dir2df_after, sep="\n")
synchronize_directory(dir1=dir1df_after, dir2=dir2df_after)

                                           Hash
./testdir1                                     
/testfile1.txt              2453564951503530004
/testfile2.txt             12316122424624457986
/testfile3.txt             11557512894821542172
/testfile4.txt             18391644498650419282
/testdir1A/testfile1A.txt  10345096838406707354
                                           Hash
./testdir2                                     
/testfile1.txt              2453564951503530004
/testfile2.txt             12316122424624457986
/testfile3.txt             11557512894821542172
/testfile4.txt             18391644498650419282
/testdir1A/testfile1A.txt  10345096838406707354
Directories are identical
