In [118]:
import pickle
from collections import defaultdict

import pandas
import psycopg2
from datasketch import MinHashLSHEnsemble, MinHash, LeanMinHash
from psycopg2 import sql

In [119]:
def create_and_serialize_index(conn, table_name, num_perm=128,
                               threshold=0.8, num_part=32):
    """Index every column of every public base table and persist the MinHashes.

    For each base table in the ``public`` schema (other than ``table_name``),
    the distinct non-null values of every column are hashed into a MinHash.
    A pickled ``LeanMinHash`` copy of it is written to ``table_name`` together
    with the table name, column name and distinct-value count, and the MinHash
    is added to an in-memory ``MinHashLSHEnsemble`` under the key
    ``"<table>-<column>"``.

    ``table_name`` is dropped and recreated on every call. This function never
    calls ``conn.commit()``; enable autocommit on ``conn`` or commit afterwards
    if the stored rows should persist.

    Args:
        conn: Open psycopg2 connection.
        table_name: Name of the table that stores the serialized hashes. It is
            quoted as a single exact SQL identifier.
        num_perm: Number of permutations for each MinHash and the ensemble.
        threshold: Threshold passed to ``MinHashLSHEnsemble``.
        num_part: Number of partitions passed to ``MinHashLSHEnsemble``.

    Returns:
        The populated ``MinHashLSHEnsemble``.
    """
    index_ident = sql.Identifier(table_name)
    insert_q = sql.SQL(
        "INSERT INTO {} (tname, cname, dsize, hashval) VALUES (%s, %s, %s, %s)"
    ).format(index_ident)

    entries = []
    with conn.cursor() as cursor:
        # List the tables to scan, leaving out the index table itself.
        cursor.execute(
            """
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema = 'public'
              AND table_type = 'BASE TABLE'
              AND table_name != %s
            """,
            (table_name,),
        )
        source_tables = [row[0] for row in cursor.fetchall()]

        # Start from an empty index table on every run.
        cursor.execute(sql.SQL("DROP TABLE IF EXISTS {}").format(index_ident))
        cursor.execute(
            sql.SQL(
                "CREATE TABLE IF NOT EXISTS {} "
                "(tname text, cname text, dsize int, hashval bytea)"
            ).format(index_ident)
        )

        for tname in source_tables:
            print(f"table : {tname}")
            # Restrict to the public schema so a same-named table in another
            # schema cannot contribute columns that do not exist here.
            cursor.execute(
                """
                SELECT column_name
                FROM information_schema.columns
                WHERE table_schema = 'public'
                  AND table_name = %s
                """,
                (tname,),
            )
            for (cname,) in cursor.fetchall():
                # Identifiers are quoted by psycopg2 so mixed-case or unusual
                # table/column names are handled the same way.
                cursor.execute(
                    sql.SQL(
                        "SELECT DISTINCT {col} FROM {schema}.{tbl} "
                        "WHERE {col} IS NOT NULL"
                    ).format(
                        col=sql.Identifier(cname),
                        schema=sql.Identifier("public"),
                        tbl=sql.Identifier(tname),
                    )
                )
                values = [row[0] for row in cursor.fetchall()]
                if not values:
                    # A column with no non-null values has set size 0 and
                    # nothing to match against.
                    # NOTE(review): datasketch appears to expect positive set
                    # sizes; confirm skipping is the desired behaviour.
                    continue

                mh = MinHash(num_perm=num_perm)
                for value in values:
                    mh.update(_value_to_bytes(value))

                # LeanMinHash keeps only the seed and hash values, which makes
                # the pickled payload smaller than the full MinHash.
                lmh = LeanMinHash(seed=mh.seed, hashvalues=mh.hashvalues)
                cursor.execute(
                    insert_q,
                    (tname, cname, len(values),
                     psycopg2.Binary(pickle.dumps(lmh))),
                )
                entries.append((f"{tname}-{cname}", mh, len(values)))

    lshensemble = MinHashLSHEnsemble(threshold=threshold, num_perm=num_perm,
                                     num_part=num_part)
    lshensemble.index(entries)

    return lshensemble


def _value_to_bytes(value):
    """Return a UTF-8/bytes representation of one column value for hashing."""
    if isinstance(value, (bytes, bytearray, memoryview)):
        return bytes(value)
    # str() is a no-op for text values and makes numbers, dates, etc. hashable.
    return str(value).encode('utf8')

In [120]:
def load_index(conn, index_table, num_perm=128, threshold=0.8, num_part=32):
    """Rebuild an LSH Ensemble from MinHashes stored in ``index_table``.

    Reads every ``(tname, cname, dsize, hashval)`` row, unpickles ``hashval``
    and indexes it under the key ``"<tname>-<cname>"``.

    Args:
        conn: Open psycopg2 connection.
        index_table: Name of the table holding the serialized hashes. It is
            quoted as a single exact SQL identifier.
        num_perm: Number of permutations for the ensemble.
            NOTE(review): presumably must equal the value used when the stored
            hashes were created; confirm.
        threshold: Threshold passed to ``MinHashLSHEnsemble``.
        num_part: Number of partitions passed to ``MinHashLSHEnsemble``.

    Returns:
        Tuple ``(lshensemble, index_dict)`` where ``index_dict`` maps
        ``"<tname>-<cname>"`` to a dict with keys ``'table'``, ``'col'``,
        ``'mhash'`` and ``'dsize'``.
    """
    retrieve_q = sql.SQL(
        "SELECT tname, cname, dsize, hashval FROM {}"
    ).format(sql.Identifier(index_table))

    # The context manager closes the cursor once the rows are fetched.
    with conn.cursor() as cur:
        cur.execute(retrieve_q)
        rows = cur.fetchall()

    index_dict = defaultdict(dict)
    entries = []

    for tname, cname, dsize, blob in rows:
        # SECURITY: unpickling executes arbitrary code embedded in the payload.
        # Only load from an index table whose contents you trust.
        mhash = pickle.loads(bytes(blob))
        key = f'{tname}-{cname}'
        index_dict[key] = {'table': tname,
                           'col': cname,
                           'mhash': mhash,
                           'dsize': dsize}
        entries.append((key, mhash, dsize))

    lshensemble = MinHashLSHEnsemble(threshold=threshold, num_perm=num_perm,
                                     num_part=num_part)
    lshensemble.index(entries)

    return lshensemble, index_dict

In [125]:
def search_index(lshensemble, index_dict, tname, cname):
    """Print the keys the ensemble returns for one indexed column.

    Args:
        lshensemble: Object exposing ``query(minhash, size)`` that yields keys.
        index_dict: Mapping from ``"<tname>-<cname>"`` to a dict holding at
            least ``'mhash'`` and ``'dsize'``.
        tname: Table name of the query column.
        cname: Column name of the query column.

    Raises:
        KeyError: If ``"<tname>-<cname>"`` has no entry in ``index_dict``.
    """
    key = f'{tname}-{cname}'
    # Use .get so a defaultdict is not silently grown with an empty entry
    # when the column is unknown.
    entry = index_dict.get(key)
    if not entry:
        raise KeyError(f"column '{key}' is not present in the index")

    for match in lshensemble.query(entry['mhash'], entry['dsize']):
        print(match)

In [123]:
if __name__ == '__main__':
    # Connection settings and the name of the table that stores the hashes.
    DB_DSN = "dbname=test_hash_db user=postgres"
    INDEX_TABLE = 'test'

    conn = psycopg2.connect(DB_DSN)
    # Autocommit so the index rows are persisted without an explicit commit.
    conn.autocommit = True
    create_and_serialize_index(conn, INDEX_TABLE)

table : a
table : b
table : c
table : d


In [126]:
# Rebuild the ensemble and the lookup dict from the rows stored in the 'test'
# table, reusing the `conn` opened in the __main__ cell.
lshensemble, index_dict = load_index(conn, 'test')
# Print the keys of the indexed columns returned when querying with column 'a'
# of table 'a'.
search_index(lshensemble, index_dict, 'a', 'a')

a-a
b-c
