In [93]:
import logging
from data_handler import DataHandler
from cocoa import COCOA
from mate import MATE
import psycopg2
from IPython.display import display, HTML
import pandas as pd
import json
import qgrid


def highlight_columns(table, query_columns, target=None, rows=10):
    def highlight_query(s):
        color = 'lightgreen'
        return 'background-color: %s' % color
    
    def highlight_target(s):
        color = 'orange'
        return 'background-color: %s' % color
    
    sample = table.head(rows).style.applymap(highlight_query, subset=pd.IndexSlice[:, query_columns])
    if target:
        sample = sample.applymap(highlight_target, subset=pd.IndexSlice[:, target])

    return sample


logging.basicConfig(format='%(asctime)s %(message)s')
logging.getLogger().setLevel(logging.INFO)

db_config = json.load(open("/Users/jannisbecktepe/Developer/db_config.json"))

conn = psycopg2.connect(**db_config)
data_handler = DataHandler(
    conn,
    main_table='cafe_gittables2_main_tokenized',
    column_headers_table='cafe_gittables2_column_headers',
    table_info_table='cafe_gittables2_table_info',
    cocoa_index_table='cafe_gittables2_cocoa_index'
)

display(HTML("<style>.container { width:90% !important; }</style>"))


In [94]:
input_dataset_name, input_dataset = data_handler.read_csv('../datasets/movie.csv')

display(HTML(input_dataset.head(10).to_html()))

#qgrid.show_grid(input_dataset)

Unnamed: 0,movie_title,duration,director_name,genres,imdb_score
0,Avatar,178.0,James Cameron,Action|Adventure|Fantasy|Sci-Fi,7.9
1,Pirates of the Caribbean: At World's End,169.0,Gore Verbinski,Action|Adventure|Fantasy,7.1
2,Spectre,148.0,Sam Mendes,Action|Adventure|Thriller,6.8
3,The Dark Knight Rises,164.0,Christopher Nolan,Action|Thriller,8.5
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,Documentary,7.1
5,John Carter,132.0,Andrew Stanton,Action|Adventure|Sci-Fi,6.6
6,Spider-Man 3,156.0,Sam Raimi,Action|Adventure|Romance,6.2
7,Tangled,100.0,Nathan Greno,Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance,7.8
8,Avengers: Age of Ultron,141.0,Joss Whedon,Action|Adventure|Sci-Fi,7.5
9,Harry Potter and the Half-Blood Prince,153.0,David Yates,Adventure|Family|Fantasy|Mystery,7.5


# 1) Input Preparation

## Select input and target columns

In [95]:
input_columns = ['director_name', 'movie_title']
target_column = 'imdb_score'

input_sample = highlight_columns(input_dataset, input_columns, target=[target_column])
print(f'Input table {str(input_dataset.shape)}:')
display(HTML(input_sample.to_html()))

Input table (100, 5):


Unnamed: 0,movie_title,duration,director_name,genres,imdb_score
0,Avatar,178.0,James Cameron,Action|Adventure|Fantasy|Sci-Fi,7.9
1,Pirates of the Caribbean: At World's End,169.0,Gore Verbinski,Action|Adventure|Fantasy,7.1
2,Spectre,148.0,Sam Mendes,Action|Adventure|Thriller,6.8
3,The Dark Knight Rises,164.0,Christopher Nolan,Action|Thriller,8.5
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,Documentary,7.1
5,John Carter,132.0,Andrew Stanton,Action|Adventure|Sci-Fi,6.6
6,Spider-Man 3,156.0,Sam Raimi,Action|Adventure|Romance,6.2
7,Tangled,100.0,Nathan Greno,Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance,7.8
8,Avengers: Age of Ultron,141.0,Joss Whedon,Action|Adventure|Sci-Fi,7.5
9,Harry Potter and the Half-Blood Prince,153.0,David Yates,Adventure|Family|Fantasy|Mystery,7.5


# 2) Joinability Discovery

## Find joinable tables using Super Key Index and MATE Algorithm

In [96]:
mate = MATE(data_handler)
top_joinable_tables = mate.enrich(input_dataset,
                                  input_columns,
                                  10,
                                  dataset_name=input_dataset_name)

print(f'{len(top_joinable_tables)} found.' )

joinable_columns_dict = {}
tables_dict = {}
column_headers_dict = {}
for score, table_id, columns, join_map in top_joinable_tables:
    joinable_columns_dict[table_id] = columns

    try:
        table = data_handler.get_table(table_id)
    except:
        continue
    tables_dict[table_id] = table
    
    column_headers = [table.columns[int(col_id)] for col_id in columns.split('_')][:len(input_columns)]
    column_headers_dict[table_id] = column_headers
    highlight_sample = highlight_columns(table, column_headers)
    

movie.csv DATASET


 56%|██████████████████████▉                  | 168/300 [00:03<00:02, 48.08it/s]


10 found.


## Inspect top joinable tables
### #1

In [135]:
top_table_index = 1

score, table_id, columns, join_map = top_joinable_tables[top_table_index]

tables_dict[table_id].to_csv("../datasets/test.csv", index=False)
print(f'Score: {score}, table_id: {table_id}, joinable columns: {columns}, #rows: {tables_dict[table_id].shape[0]}, #columns: {tables_dict[table_id].shape[1]}')
highlight_sample = highlight_columns(tables_dict[table_id], column_headers_dict[table_id])


display(HTML(highlight_sample.to_html()))

Score: 101, table_id: 159543, joinable columns: 1_11, #rows: 149, #columns: 28


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,color,james cameron,723 0,178 0,0,855 0,joel david moore,1000,760505847 0,action adventure fantasy sci fi,cch pounder,avatar,886204,4834,wes studi,0 0,avatar future marine native paraplegic,http www imdb com title tt0499549 ref fn tt tt 1,3054 0,english,usa,pg 13,237000000 0,2009 0,936,7 9,1 78,33000
1,color,gore verbinski,302 0,169 0,563,1000 0,orlando bloom,40000,309404152 0,action adventure fantasy,johnny depp,pirates caribbean world s end,471220,48350,jack davenport,0 0,goddess marriage ceremony marriage proposal pirate singapore,http www imdb com title tt0449088 ref fn tt tt 1,1238 0,english,usa,pg 13,300000000 0,2007 0,5000,7 1,2 35,0
2,color,sam mendes,602 0,148 0,0,161 0,rory kinnear,11000,200074175 0,action adventure thriller,christoph waltz,spectre,275868,11700,stephanie sigman,1 0,bomb espionage sequel spy terrorist,http www imdb com title tt2379713 ref fn tt tt 1,994 0,english,uk,pg 13,245000000 0,2015 0,393,6 8,2 35,85000
3,color,christopher nolan,813 0,164 0,22000,23000 0,christian bale,27000,448130642 0,action thriller,tom hardy,dark knight rises,1144337,106759,joseph gordon levitt,0 0,deception imprisonment lawlessness police officer terrorist plot,http www imdb com title tt1345836 ref fn tt tt 1,2701 0,english,usa,pg 13,250000000 0,2012 0,23000,8 5,2 35,164000
4,none,doug walker,,,131,,rob walker,131,,documentary,doug walker,star wars episode vii force awakens,8,143,none,0 0,none,http www imdb com title tt5289954 ref fn tt tt 1,,none,none,none,,,12,7 1,,0
5,color,andrew stanton,462 0,132 0,475,530 0,samantha morton,640,73058679 0,action adventure sci fi,daryl sabara,john carter,212204,1873,polly walker,1 0,alien american civil war male nipple mars princess,http www imdb com title tt0401729 ref fn tt tt 1,738 0,english,usa,pg 13,263700000 0,2012 0,632,6 6,2 35,24000
6,color,sam raimi,392 0,156 0,0,4000 0,james franco,24000,336530303 0,action adventure romance,j k simmons,spider man 3,383056,46055,kirsten dunst,0 0,sandman spider man symbiote venom villain,http www imdb com title tt0413300 ref fn tt tt 1,1902 0,english,usa,pg 13,258000000 0,2007 0,11000,6 2,2 35,0
7,color,nathan greno,324 0,100 0,15,284 0,donna murphy,799,200807262 0,adventure animation comedy family fantasy musical romance,brad garrett,tangled,294810,2036,m c gainey,1 0,17th century based fairy tale disney flower tower,http www imdb com title tt0398286 ref fn tt tt 1,387 0,english,usa,pg,260000000 0,2010 0,553,7 8,1 85,29000
8,color,joss whedon,635 0,141 0,0,19000 0,robert downey jr,26000,458991599 0,action adventure sci fi,chris hemsworth,avengers age ultron,462669,92000,scarlett johansson,4 0,artificial intelligence based comic book captain america marvel cinematic universe superhero,http www imdb com title tt2395427 ref fn tt tt 1,1117 0,english,usa,pg 13,250000000 0,2015 0,21000,7 5,2 35,118000
9,color,david yates,375 0,153 0,282,10000 0,daniel radcliffe,25000,301956980 0,adventure family fantasy mystery,alan rickman,harry potter half blood prince,321795,58753,rupert grint,3 0,blood book love potion professor,http www imdb com title tt0417741 ref fn tt tt 1,973 0,english,uk,pg,250000000 0,2009 0,11000,7 5,2 35,10000


### #2

In [139]:
top_table_index = 2

score, table_id, columns, join_map = top_joinable_tables[top_table_index]
tables_dict[table_id].to_csv("../datasets/test.csv", index=False)
print(f'Score: {score}, table_id: {table_id}, joinable columns: {columns}, #rows: {tables_dict[table_id].shape[0]}, #columns: {tables_dict[table_id].shape[1]}')
highlight_sample = highlight_columns(tables_dict[table_id], column_headers_dict[table_id])


display(HTML(highlight_sample.to_html()))

Score: 101, table_id: 278123, joinable columns: 1_11, #rows: 149, #columns: 28


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,color,james cameron,723 0,178 0,0,855 0,joel david moore,1000,760505847 0,action adventure fantasy sci fi,cch pounder,avatar,886204,4834,wes studi,0 0,avatar future marine native paraplegic,http www imdb com title tt0499549 ref fn tt tt 1,3054 0,english,usa,pg 13,237000000 0,2009 0,936,7 9,1 78,33000
1,color,gore verbinski,302 0,169 0,563,1000 0,orlando bloom,40000,309404152 0,action adventure fantasy,johnny depp,pirates caribbean world s end,471220,48350,jack davenport,0 0,goddess marriage ceremony marriage proposal pirate singapore,http www imdb com title tt0449088 ref fn tt tt 1,1238 0,english,usa,pg 13,300000000 0,2007 0,5000,7 1,2 35,0
2,color,sam mendes,602 0,148 0,0,161 0,rory kinnear,11000,200074175 0,action adventure thriller,christoph waltz,spectre,275868,11700,stephanie sigman,1 0,bomb espionage sequel spy terrorist,http www imdb com title tt2379713 ref fn tt tt 1,994 0,english,uk,pg 13,245000000 0,2015 0,393,6 8,2 35,85000
3,color,christopher nolan,813 0,164 0,22000,23000 0,christian bale,27000,448130642 0,action thriller,tom hardy,dark knight rises,1144337,106759,joseph gordon levitt,0 0,deception imprisonment lawlessness police officer terrorist plot,http www imdb com title tt1345836 ref fn tt tt 1,2701 0,english,usa,pg 13,250000000 0,2012 0,23000,8 5,2 35,164000
4,none,doug walker,,,131,,rob walker,131,,documentary,doug walker,star wars episode vii force awakens,8,143,none,0 0,none,http www imdb com title tt5289954 ref fn tt tt 1,,none,none,none,,,12,7 1,,0
5,color,andrew stanton,462 0,132 0,475,530 0,samantha morton,640,73058679 0,action adventure sci fi,daryl sabara,john carter,212204,1873,polly walker,1 0,alien american civil war male nipple mars princess,http www imdb com title tt0401729 ref fn tt tt 1,738 0,english,usa,pg 13,263700000 0,2012 0,632,6 6,2 35,24000
6,color,sam raimi,392 0,156 0,0,4000 0,james franco,24000,336530303 0,action adventure romance,j k simmons,spider man 3,383056,46055,kirsten dunst,0 0,sandman spider man symbiote venom villain,http www imdb com title tt0413300 ref fn tt tt 1,1902 0,english,usa,pg 13,258000000 0,2007 0,11000,6 2,2 35,0
7,color,nathan greno,324 0,100 0,15,284 0,donna murphy,799,200807262 0,adventure animation comedy family fantasy musical romance,brad garrett,tangled,294810,2036,m c gainey,1 0,17th century based fairy tale disney flower tower,http www imdb com title tt0398286 ref fn tt tt 1,387 0,english,usa,pg,260000000 0,2010 0,553,7 8,1 85,29000
8,color,joss whedon,635 0,141 0,0,19000 0,robert downey jr,26000,458991599 0,action adventure sci fi,chris hemsworth,avengers age ultron,462669,92000,scarlett johansson,4 0,artificial intelligence based comic book captain america marvel cinematic universe superhero,http www imdb com title tt2395427 ref fn tt tt 1,1117 0,english,usa,pg 13,250000000 0,2015 0,21000,7 5,2 35,118000
9,color,david yates,375 0,153 0,282,10000 0,daniel radcliffe,25000,301956980 0,adventure family fantasy mystery,alan rickman,harry potter half blood prince,321795,58753,rupert grint,3 0,blood book love potion professor,http www imdb com title tt0417741 ref fn tt tt 1,973 0,english,uk,pg,250000000 0,2009 0,11000,7 5,2 35,10000


# 2) Duplicate Detection using Xash

In [1]:
import csv
import datetime
import hashlib
import json
import math
import re
import sys
from collections import defaultdict, Counter
from typing import Dict, Tuple, List
from util import get_cleaned_text
#import pyhash

import numpy as np
import psycopg2

hash_type = "xash"
input_table = "../datasets/test.csv"

#conn = psycopg2.connect(user="postgres", password="123456", database="postgres", host="localhost", port=5432, sslmode="disable")
conn = psycopg2.connect(user="jannis", password="hgjasldhz", database="pdb", host="herkules.dbs.uni-hannover.de")
conn.autocommit = True
print("Successfully connected!")
cursor = conn.cursor()

def genHash(token,hash_function):
    if hash_type == "xash":
        return XASH(str(token))

def XASH(token: str, hash_size: int = 128) -> int:
    """Computes XASH for given token.

    Parameters
    ----------
    token : str
        Token.

    hash_size : int
        Number of bits.

    Returns
    -------
    int
        XASH value.
    """
    number_of_ones = 5
    char = [' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
            'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    segment_size_dict = {64: 1, 128: 3, 256: 6, 512: 13}
    segment_size = segment_size_dict[hash_size]
    length_bit_start = 37 * segment_size
    result_ = 0
    cnt_dict = Counter(token)
    selected_chars = [y[0] for y in sorted(cnt_dict.items(), key=lambda x: (x[1], x[0]), reverse=False)[:number_of_ones]]
    for c in selected_chars:
        if c not in char:
            continue
        indices = [i for i, ltr in enumerate(token) if ltr == c]
        mean_index = np.mean(indices)
        token_size = len(token)
        for i in np.arange(segment_size):
            if mean_index <= ((i + 1) * token_size / segment_size):
                location = char.index(c) * segment_size + i
                break
        result_ = result_ | int(math.pow(2, location))

    # rotation
    n = int(result_)
    d = int((length_bit_start * (len(token) % (hash_size - length_bit_start))) / (
            hash_size - length_bit_start))
    int_bits = int(length_bit_start)
    x = n << d
    y = n >> (int_bits - d)
    r = int(math.pow(2, int_bits))
    result_ = int((x | y) % r)

    result_ = int(result_) | int(math.pow(2, len(token) % (hash_size - length_bit_start)) * math.pow(2, length_bit_start))

    return result_

def fpCheck(rowArray1,rowArray2):
    # Check values to check false positive
    rowvalues_t1 = rowArray1 # both are already sorted
    rowvalues_t2 = rowArray2

    ## Duplicate detection
    if len(rowvalues_t1) > len(rowvalues_t2):
        bigger_row = rowvalues_t1
        smaller_row = rowvalues_t2
    else:
        bigger_row = rowvalues_t2
        smaller_row = rowvalues_t1

    for i in range(0,len(bigger_row)):
        if i >= len(smaller_row):
            # fail
            return False
        if bigger_row[i] != smaller_row[i]:
            # fail, different values
            return False

    return True

i = 0
counter_fp = 0
counter_superkey = 0
rows = defaultdict(dict)
superKeyMapping = defaultdict(list)
dup = []
duplicate_tables = []

had_header = False

with open(input_table) as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        # skip column titles
        #if not had_header:
        #    had_header = True
        #    continue 
                
        row.sort()
        rows[0][i] = row
        rows[1][i] = 0
        for cell in row:
            #cell = get_cleaned_text(cell)
            rows[1][i] = rows[1][i] | genHash(str(cell),hash_type)
        superKeyMapping[int(rows[1][i])].append(i) # Map super key to rowid
        i = i+1

table = tables_dict[278123]
table = input_dataset
superKeyMapping = defaultdict(list)
rows = defaultdict(dict)

# generate superKeyMapping from dataframe
for row_id, row in table.iloc[:1,:].iterrows():
    super_key = 0
    for _, token in row.items():
        token = get_cleaned_text(str(token))
        super_key |= XASH(token)
        rows[0][row_id] = sorted(list(row))
        rows[1][row_id] = super_key
    superKeyMapping[super_key] += [row_id]
    

in_clause = ""
for v in rows[1].values():
    in_clause = in_clause + "'"+str(np.binary_repr(v).zfill(128))+"',"

#print(in_clause[:-1])

'''
cursor.execute(f'SELECT mate_main_tokenized.tableid, mate_main_tokenized.rowid, mate_main_tokenized.colid, mate_main_tokenized.tokenized, mate_main_tokenized_hashes.super_key_{hash_type} '
f'FROM "mate_main_tokenized", "mate_main_tokenized_hashes" '
f'WHERE mate_main_tokenized_hashes.super_key_{hash_type} IN ({in_clause[:-1]}) '                                                                                                                                                                             
f'AND mate_main_tokenized.tableid = mate_main_tokenized_hashes.tableid AND mate_main_tokenized.rowid = mate_main_tokenized_hashes.rowid '
f'ORDER BY tableid, rowid, colid LIMIT 1000000')
'''
cursor.execute(f'SELECT tableid, rowid, colid, tokenized, super_key FROM "cafe_gittables2_main_tokenized" WHERE super_key IN ({in_clause[:-1]}) ORDER BY tableid, rowid, colid')
#results = cursor.fetchall()
tmp_rowid = -1
tmp_tableid = -1
tmp_superkey = 0
row = []
tableIds_length_to_load = set()
for result in cursor:
    if (tmp_tableid != -1 and tmp_tableid != result[0]) or (tmp_rowid != -1 and tmp_rowid != result[1]):
        row.sort()
        for rowId in superKeyMapping[int(tmp_superkey,2)]:
            if fpCheck(rows[0][rowId],row):
                dup.append((rowId,(tmp_tableid,tmp_rowid)))
                tableIds_length_to_load.add(tmp_tableid)
            else:
                counter_fp = counter_fp+1
            counter_superkey = counter_superkey+1
        row = []
    tmp_tableid = result[0]
    tmp_rowid = result[1]
    tmp_superkey = result[4]
    row.append(str(result[3]))

    
print(tableIds_length_to_load)    
    
row.sort()
if tmp_tableid != -1: # check that at least one row is found
    row.sort()
    for rowId in superKeyMapping[int(tmp_superkey,2)]:
        if fpCheck(rows[0][rowId],row):
            dup.append((rowId,(tmp_tableid,tmp_rowid)))
            tableIds_length_to_load.add(tmp_tableid)
        else:
            counter_fp = counter_fp+1
        counter_superkey = counter_superkey+1

    duplicates = defaultdict(list)
    for i in dup:
        duplicates[i[1][0]].append((i[0],i[1][1]))

    if len(dup) > 0:
        # Check duplicate rows for duplicate tables
        in_clause_tableids = ', '.join(str(s) for s in tableIds_length_to_load)
        # Get number of rows in table:
        cursor.execute(f'SELECT tableid, MAX(rowid) FROM "cafe_gittables2_main_tokenized" WHERE tableid IN ({in_clause_tableids}) GROUP BY tableid')
        for result in cursor:
            t1_dup = []
            t2_dup = []
            for value in duplicates[result[0]]:
                t1_dup.append(value[0])
                t2_dup.append(value[1])
                
            #if (len(set(t1_dup)) >= len(rows[0]) or len(set(t2_dup)) >= result[1]):
            #    if (len(set(t1_dup)) >= len(t2_dup) or len(set(t2_dup)) >= len(t1_dup)):
            #        #print("found duplicate table: " + str(result[0]))
            #        duplicate_tables.append(result[0])
            duplicate_tables.append(result[0])

if len(duplicate_tables) == 0:
    print("NO DUPLICATE TABLES FOUND")

print("\n\nFound duplicate tables:")
print(json.dumps(duplicate_tables))
print("FP: "+str(counter_fp))
print("SUM: "+str(counter_superkey))

duplicate_tables_first = duplicate_tables

Successfully connected!


NameError: name 'tables_dict' is not defined

In [176]:
def getTableData(tableIds):
    rowValues = defaultdict(lambda: defaultdict(dict))
    rowSuperKeys = defaultdict(dict)
    in_tableids = ",".join(str(v) for v in tableIds)

    cursor.execute(f'SELECT tableid, rowid, colid, tokenized, super_key FROM "cafe_gittables2_main_tokenized" WHERE tableid IN ({in_tableids})')
    results = cursor.fetchall()

    for row in results:
        rowValues[row[0]][row[1]][row[2]] = str(row[3])
        rowSuperKeys[row[0]][row[1]] = int(row[4],2) # convert to int

    return [rowValues,rowSuperKeys]

table_data = getTableData(duplicate_tables)

counter_superkey = 0
counter_fp = 0
duplicates = []
duplicate_tables = []

def compareTables(t1,t2,data):
    global counter_fp
    global counter_superkey
    global duplicates
    global duplicate_tables

    duplicates_local = []

    t1_data = data[1][t1]
    t2_data = data[1][t2]

    # Compare num of columns:
    if len(data[0][t1][0]) != len(data[0][t2][0]):
        return None # Number of columns is different
    # End compare num of columns

    for row_t1 in t1_data:
        super_key_t1 = t1_data[row_t1]
        for row_t2 in t2_data:
            super_key_t2 = t2_data[row_t2]
            if len(t2_data) < 1:
                continue

            # Compare super keys:
            if super_key_t1 == super_key_t2:
                counter_superkey = counter_superkey+1

                # Check values to check false positive
                rowvalues_t1 = list(data[0][t1][row_t1].values())
                rowvalues_t2 = list(data[0][t2][row_t2].values())

                rowvalues_t1.sort()
                rowvalues_t2.sort()

                ## Duplicate detection
                if len(rowvalues_t1) > len(rowvalues_t2):
                    bigger_row = rowvalues_t1
                    smaller_row = rowvalues_t2
                else:
                    bigger_row = rowvalues_t2
                    smaller_row = rowvalues_t1

                fail = False
                for i in range(0,len(bigger_row)):
                    if i >= len(smaller_row):
                        # fail
                        fail = True
                        break
                    if bigger_row[i] != smaller_row[i]:
                        # fail, different values
                        fail = True
                        break
                if not fail:
                    duplicates.append({"tableid_1": t1, "rowid_1": row_t1, "tableid_2": t2, "rowid_2": row_t2})
                    duplicates_local.append({"tableid_1": t1, "rowid_1": row_t1, "tableid_2": t2, "rowid_2": row_t2})
                else:
                    ## If only duplicate tables need to be found (Important: this will probably not work for subset duplicates correctly), we can completely skip this table == table comparison
                    ''' if duplicate_tables_only:
                        if enable_print:
                            print("Skipping, ")
                        counter_fp = counter_fp+1
                        return '''
                    counter_fp = counter_fp+1
                ## End duplicate

    num_rows_min = min(len(t1_data),len(t2_data))
    if len(duplicates_local) >= num_rows_min and num_rows_min > 0:
        t1_dup = []
        t2_dup = []
        for value in duplicates_local:
            t1_dup.append(value['rowid_1'])
            t2_dup.append(value['rowid_2'])

        #if (len(set(t1_dup)) >= len(t1_data) or len(set(t2_dup)) >= len(t2_data)):
        #    if (len(set(t1_dup)) >= len(t2_dup) or len(set(t2_dup)) >= len(t1_dup)):
        #        duplicate_tables.append((t1,t2))
        duplicate_tables.append((t1,t2))

for tableIds1 in table_data[0]:
    for tableIds2 in table_data[0]:
        if tableIds1 < tableIds2:
            compareTables(tableIds1,tableIds2,table_data)

print("\n\nFound duplicate tables:")
print(json.dumps(duplicate_tables))
print("FP: "+str(counter_fp))
print("SUM: "+str(counter_superkey))



Found duplicate tables:
[[21812, 159543], [21812, 230999], [21812, 278123], [159543, 230999], [159543, 278123], [230999, 278123]]
FP: 0
SUM: 906


# 3) Correlation Calculation
## Find top correlating columns using Order Index and COCOA Algorithm

In [76]:
from util import get_cleaned_text
cocoa = COCOA(data_handler)
top_correlating_columns = cocoa.enrich_multicolumn(input_dataset, top_joinable_tables, 10, target_column=target_column)
    
# add tokenized input columns for the join
output_dataset = input_dataset.copy()
for input_column in input_columns:
    output_dataset[input_column + "_tokenized"] = input_dataset[input_column].apply(get_cleaned_text)

2022-08-28 12:00:25,878 === Starting COCOA multicolumn ===
2022-08-28 12:00:25,880 Fetching number of columns for each table...
2022-08-28 12:00:25,905 Finished.
2022-08-28 12:00:25,906 Fetching cocoa index...
2022-08-28 12:00:25,986 Finished.
2022-08-28 12:00:25,988 Calculating correlations...
100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 238.17it/s]
2022-08-28 12:00:26,032 Finished.


## Materialize join for top correlating features

In [77]:
# add top features that are not contained in input table

external_columns = []
for cor, table_col_id in top_correlating_columns[:5]:
    table_id = int(table_col_id.split('_')[0])
    column_id = int(table_col_id.split('_')[1])
    table = tables_dict[table_id]
    
    # add correlation info
    new_col_name = f"{table_id}_{table.columns[column_id]}, cor: {cor:.2f}"
    external_columns += [new_col_name]
    table = table.rename(columns={table.columns[column_id]: new_col_name})
        
 
    table = table.loc[:, column_headers_dict[table_id] + [table.columns[column_id]]]
    

    output_dataset = output_dataset.merge(
        table,
        how="left",
        left_on=[col + "_tokenized" for col in input_columns],
        right_on=column_headers_dict[table_id],
        suffixes=('', '_extern')
    )
    output_dataset = output_dataset[[c for c in output_dataset.columns if not c.endswith('_extern')]]
        
output_dataset = output_dataset[[c for c in output_dataset.columns if not c.endswith('_tokenized')]]

print(output_dataset.columns)

output_sample = highlight_columns(output_dataset, input_columns, target=external_columns)
display(HTML(output_sample.to_html()))

Index(['movie_title', 'duration', 'director_name', 'genres', 'imdb_score',
       '230999_num_voted_users, cor: 0.63',
       '159543_num_voted_users, cor: 0.63',
       '278123_num_voted_users, cor: 0.63', '21812_num_voted_users, cor: 0.63',
       '230999_movie_facebook_likes, cor: 0.45'],
      dtype='object')


Unnamed: 0,movie_title,duration,director_name,genres,imdb_score,"230999_num_voted_users, cor: 0.63","159543_num_voted_users, cor: 0.63","278123_num_voted_users, cor: 0.63","21812_num_voted_users, cor: 0.63","230999_movie_facebook_likes, cor: 0.45"
0,Avatar,178.0,James Cameron,Action|Adventure|Fantasy|Sci-Fi,7.9,886204,886204,886204,886204,33000
1,Pirates of the Caribbean: At World's End,169.0,Gore Verbinski,Action|Adventure|Fantasy,7.1,471220,471220,471220,471220,0
2,Spectre,148.0,Sam Mendes,Action|Adventure|Thriller,6.8,275868,275868,275868,275868,85000
3,The Dark Knight Rises,164.0,Christopher Nolan,Action|Thriller,8.5,1144337,1144337,1144337,1144337,164000
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,Documentary,7.1,8,8,8,8,0
5,John Carter,132.0,Andrew Stanton,Action|Adventure|Sci-Fi,6.6,212204,212204,212204,212204,24000
6,Spider-Man 3,156.0,Sam Raimi,Action|Adventure|Romance,6.2,383056,383056,383056,383056,0
7,Tangled,100.0,Nathan Greno,Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance,7.8,294810,294810,294810,294810,29000
8,Avengers: Age of Ultron,141.0,Joss Whedon,Action|Adventure|Sci-Fi,7.5,462669,462669,462669,462669,118000
9,Harry Potter and the Half-Blood Prince,153.0,David Yates,Adventure|Family|Fantasy|Mystery,7.5,321795,321795,321795,321795,10000
