In [26]:
# Demo for synchronization of two data directories
import os
import io
import pandas as pd
import numpy as np
import farmhash
import time
import sqlite3

In [4]:
def create_dirdf(directory):
    if not os.path.exists(directory):
        print("Error: Directory '" + directory + "' does not exist.")
        return
    
    filenames = []
    hashvalues = []
    
    for root, subdir, files in os.walk(directory):
        for name in files:
            if not name[0] == ".": # ignore hidden files
                filepath = os.path.join(root, name)
                
                # hash full file contents
                # note: spice data encoding is mixed, so read as binary
                file = str(io.open(filepath,'rb').read()) 
                filenames.append(filepath.split(directory, 1)[1])
                hashvalues.append(farmhash.hash64(file))
                
    df = pd.DataFrame(data=hashvalues, index = filenames, columns = ["Hash"])
    df.index.name = directory
    return df

start = time.time()
dir1df = create_dirdf("./testdir1")
print("elapsed time: ", time.time() - start)

dir2df = create_dirdf("./testdir2")

print(dir1df, dir2df)

elapsed time:  0.0032210350036621094
                                           Hash
./testdir1                                     
/testfile1.txt             13167233149662072294
/testfile2.txt              2116770068367243914
/testfile3.txt             10117441339441774812
/testfile4.txt               407662078023551858
/testdir1A/testfile1A.txt   1855841718642996950                                            Hash
./testdir2                                     
/testfile1.txt             13167233149662072294
/testfile2.txt              2116770068367243914
/testfile3.txt             10117441339441774812
/testfile4.txt               407662078023551858
/testdir1A/testfile1A.txt   1855841718642996950


In [42]:
conn = sqlite3.connect('./spicedb.sqlite')
c = conn.cursor()
c.execute('CREATE TABLE testdir1_hashes (Filename TEXT)')
c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}".format(tn='testdir1_hashes', cn='Hash', ct='TEXT'))
c.execute("INSERT OR IGNORE INTO {tn} ({idf}, {cn}) VALUES ('testfn', '12312312')".format(tn='testdir1_hashes', idf='Filename', cn='Hash'))

<sqlite3.Cursor at 0x114a651f0>

In [119]:
os.remove('./spicedb.sqlite')

In [117]:
def create_spicedb(spdir):
    if not os.path.exists(spdir):
        print("Error: Directory '" + spdir + "' does not exist.")
        return
    
    conn = sqlite3.connect('./spicedb.sqlite') # initialize db, this might move to an init func in the api
    c = conn.cursor()
    
    c.execute("CREATE TABLE SPICE (Mission TEXT)") # create missions table
    c.execute("ALTER TABLE SPICE ADD COLUMN Kernel TEXT")
    c.execute("ALTER TABLE SPICE ADD COLUMN File TEXT")
    c.execute("ALTER TABLE SPICE ADD COLUMN Path TEXT")
    c.execute("ALTER TABLE SPICE ADD COLUMN Hash TEXT") #theres probs a way to to this in one line


    # ooh spicy ~
    for mis in [m for m in os.listdir(spdir) if not m[0] == '.']:
        for ker in [k for k in os.listdir(spdir+'/'+mis) if not k[0] == '.']:
            
            # first two directory layers should always be mission and kernels - then it gets mixed
            for root, subdir, files in os.walk(spdir+'/'+mis+'/'+ker):
                for name in files:
                    if not name[0] == ".": # ignore hidden files
                        # more tabs pls
                        filepath = os.path.join(root, name)

                        # spice data encoding is mixed, so read as binary
                        fhash = farmhash.hash64(str(io.open(filepath,'rb').read()))
                        c.execute("INSERT OR IGNORE INTO SPICE (Mission, Kernel, File, Path, Hash) VALUES ('{mn}', '{kn}', '{fn}', '{fp}', '{fh}')"\
                                  .format(mn=mis, kn=ker, fn=name, fp=filepath, fh=fhash))
    
    conn.commit()

    
    
#     for root, subdir, files in os.walk(directory):
#         for name in files:
#             if not name[0] == ".": # ignore hidden files
#                 filepath = os.path.join(root, name)
#                 file = str(io.open(filepath,'rb').read())
#                 c.execute("INSERT OR IGNORE INTO {tn} (Filename, Hash) VALUES ('{fn}', '{hsh}')".format(tn='testdir1_hashes', fn = filepath,hsh = farmhash.hash64(file) ))

In [120]:
create_spicedb('./spice_data')
# c.execute("SELECT * FROM testdir1_hashes WHERE Hash= '13167233149662072294'")
# # Note: fetchall() will pull the whole buffer, if you SELECT ten times, the result will be in there ten times
# all_rows = c.fetchall() 
# print(all_rows)

In [108]:
conn.close()