In [1]:
import json
import logging
import os

import pandas as pd
import psycopg2
import qgrid
from IPython.display import display, HTML

from cocoa import COCOA
from data_handler import DataHandler
from mate import MATE


def highlight_columns(table, query_columns, target=None, rows=10):
    """Return a styled preview of ``table`` with selected columns colour-coded.

    Parameters
    ----------
    table
        Object exposing ``head(rows).style`` (pandas DataFrames in this notebook).
    query_columns
        Column label(s) shaded light green.
    target
        Optional column label(s) shaded orange. ``None`` or an empty
        collection/string skips this step; array-likes and falsy labels
        such as ``0`` are accepted.
    rows
        Number of leading rows included in the preview.

    Returns
    -------
    The styler returned by the last styling call.
    """
    def constant_background(color):
        # Every cell of the subset receives the same CSS rule.
        css = 'background-color: %s' % color
        return lambda _cell: css

    def shade(styler, color, columns):
        # Styler.applymap is deprecated since pandas 2.1 in favour of
        # Styler.map; prefer the new name and fall back on older releases.
        elementwise = getattr(styler, 'map', None) or styler.applymap
        return elementwise(constant_background(color),
                           subset=pd.IndexSlice[:, columns])

    sample = shade(table.head(rows).style, 'lightgreen', query_columns)

    # Plain truthiness would raise for arrays / pandas Index objects and would
    # skip a legitimate column label like 0, so test emptiness explicitly.
    has_target = target is not None and not (
        hasattr(target, '__len__') and len(target) == 0
    )
    if has_target:
        sample = shade(sample, 'orange', target)

    return sample


# Uncomment to enable timestamped INFO-level logging.
#logging.basicConfig(format='%(asctime)s %(message)s')
#logging.getLogger().setLevel(logging.INFO)

# Location of the JSON file with the database connection settings. Set the
# DB_CONFIG_PATH environment variable to point elsewhere; the default is the
# path this notebook was originally written against.
DB_CONFIG_PATH = os.environ.get(
    "DB_CONFIG_PATH",
    "/Users/jannisbecktepe/Developer/db_config.json",
)

# Read inside a context manager so the file handle is closed deterministically.
with open(DB_CONFIG_PATH) as config_file:
    db_config = json.load(config_file)

# The JSON keys are forwarded verbatim as keyword arguments to psycopg2.connect.
conn = psycopg2.connect(**db_config)
data_handler = DataHandler(
    conn,
    main_table='cafe_gittables2_main_tokenized',
    column_headers_table='cafe_gittables2_column_headers',
    table_info_table='cafe_gittables2_table_info',
    cocoa_index_table='cafe_gittables2_cocoa_index'
)

# Widen the notebook container so wide tables stay readable.
display(HTML("<style>.container { width:90% !important; }</style>"))


# Demonstrating Multi-Attribute and Order Indexes for Data Discovery
## 1) Input Preparation
### Select input and target columns

In [6]:
# read_csv hands back a (name, table) pair for the demo input file.
input_dataset_name, input_dataset = data_handler.read_csv('../datasets/movie.csv')

# Show the first ten rows as an HTML table.
input_preview_html = input_dataset.head(10).to_html()
display(HTML(input_preview_html))

Unnamed: 0,movie_title,duration,director_name,genres,imdb_score
0,Avatar,178.0,James Cameron,Action|Adventure|Fantasy|Sci-Fi,7.9
1,Pirates of the Caribbean: At World's End,169.0,Gore Verbinski,Action|Adventure|Fantasy,7.1
2,Spectre,148.0,Sam Mendes,Action|Adventure|Thriller,6.8
3,The Dark Knight Rises,164.0,Christopher Nolan,Action|Thriller,8.5
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,Documentary,7.1
5,John Carter,132.0,Andrew Stanton,Action|Adventure|Sci-Fi,6.6
6,Spider-Man 3,156.0,Sam Raimi,Action|Adventure|Romance,6.2
7,Tangled,100.0,Nathan Greno,Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance,7.8
8,Avengers: Age of Ultron,141.0,Joss Whedon,Action|Adventure|Sci-Fi,7.5
9,Harry Potter and the Half-Blood Prince,153.0,David Yates,Adventure|Family|Fantasy|Mystery,7.5


In [7]:
# Columns used as the composite join key, and the column the discovered
# features are later correlated against.
input_columns = ['director_name', 'movie_title']
target_column = 'imdb_score'

input_sample = highlight_columns(
    input_dataset,
    input_columns,
    target=[target_column],
)
styled_input_html = input_sample.to_html()
display(HTML(styled_input_html))

Unnamed: 0,movie_title,duration,director_name,genres,imdb_score
0,Avatar,178.0,James Cameron,Action|Adventure|Fantasy|Sci-Fi,7.9
1,Pirates of the Caribbean: At World's End,169.0,Gore Verbinski,Action|Adventure|Fantasy,7.1
2,Spectre,148.0,Sam Mendes,Action|Adventure|Thriller,6.8
3,The Dark Knight Rises,164.0,Christopher Nolan,Action|Thriller,8.5
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,Documentary,7.1
5,John Carter,132.0,Andrew Stanton,Action|Adventure|Sci-Fi,6.6
6,Spider-Man 3,156.0,Sam Raimi,Action|Adventure|Romance,6.2
7,Tangled,100.0,Nathan Greno,Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance,7.8
8,Avengers: Age of Ultron,141.0,Joss Whedon,Action|Adventure|Sci-Fi,7.5
9,Harry Potter and the Half-Blood Prince,153.0,David Yates,Adventure|Family|Fantasy|Mystery,7.5


## 2) Joinability Discovery

### Find top-50 joinable tables using Super Key Index and MATE Algorithm

In [8]:
# Number of joinable tables requested from MATE.
TOP_K_JOINABLE = 50

mate = MATE(data_handler)
top_joinable_tables = mate.enrich(input_dataset,
                                  input_columns,
                                  TOP_K_JOINABLE,
                                  dataset_name=input_dataset_name)

# Per-table lookups filled from the MATE result:
#   joinable_columns_dict: table_id -> '_'-separated joinable column ids
#   tables_dict:           table_id -> loaded table
#   column_headers_dict:   table_id -> headers of the joinable columns
joinable_columns_dict = {}
tables_dict = {}
column_headers_dict = {}
for score, table_id, columns, join_map in top_joinable_tables:
    joinable_columns_dict[table_id] = columns

    try:
        table = data_handler.get_table(table_id)
    except Exception as exc:
        # Best effort: a table that cannot be loaded is skipped, but the
        # failure is reported instead of vanishing silently. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupts working.
        logging.warning("Skipping joinable table %s: %s", table_id, exc)
        continue
    tables_dict[table_id] = table

    # Keep at most one external column id per input column, then resolve the
    # positional ids to header names.
    joinable_ids = columns.split('_')[:len(input_columns)]
    column_headers_dict[table_id] = [table.columns[int(col_id)] for col_id in joinable_ids]
    

Running MATE on the movie.csv dataset.


 61%|█████████████████████████▊                | 184/300 [00:03<00:02, 46.29it/s]


MATE runtime:     3.89s

Hash-based filtered rows: 3086
Hash-based approved rows: 977
Matching rows:            862
FP rows:                  115


## Inspect top joinable tables
### #1

In [9]:
# Position of the inspected entry in the MATE result list (0-based).
top_table_index = 1
score, table_id, columns, join_map = top_joinable_tables[top_table_index]

# Look the table up once and reuse it for the export, the summary and the preview.
inspected_table = tables_dict[table_id]
inspected_table.to_csv("../datasets/test.csv", index=False)
print(f'Score: {score}, table_id: {table_id}, joinable columns: {columns}, #rows: {tables_dict[table_id].shape[0]}, #columns: {tables_dict[table_id].shape[1]}')

# Preview with the joinable columns highlighted.
highlight_sample = highlight_columns(inspected_table, column_headers_dict[table_id])
display(HTML(highlight_sample.to_html()))

Score: 100, table_id: 21812, joinable columns: 1_11, #rows: 149, #columns: 28


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,color,james cameron,723 0,178 0,0,855 0,joel david moore,1000,760505847 0,action adventure fantasy sci fi,cch pounder,avatar,886204,4834,wes studi,0 0,avatar future marine native paraplegic,http www imdb com title tt0499549 ref fn tt tt 1,3054 0,english,usa,pg 13,237000000 0,2009 0,936,7 9,1 78,33000
1,color,gore verbinski,302 0,169 0,563,1000 0,orlando bloom,40000,309404152 0,action adventure fantasy,johnny depp,pirates caribbean world s end,471220,48350,jack davenport,0 0,goddess marriage ceremony marriage proposal pirate singapore,http www imdb com title tt0449088 ref fn tt tt 1,1238 0,english,usa,pg 13,300000000 0,2007 0,5000,7 1,2 35,0
2,color,sam mendes,602 0,148 0,0,161 0,rory kinnear,11000,200074175 0,action adventure thriller,christoph waltz,spectre,275868,11700,stephanie sigman,1 0,bomb espionage sequel spy terrorist,http www imdb com title tt2379713 ref fn tt tt 1,994 0,english,uk,pg 13,245000000 0,2015 0,393,6 8,2 35,85000
3,color,christopher nolan,813 0,164 0,22000,23000 0,christian bale,27000,448130642 0,action thriller,tom hardy,dark knight rises,1144337,106759,joseph gordon levitt,0 0,deception imprisonment lawlessness police officer terrorist plot,http www imdb com title tt1345836 ref fn tt tt 1,2701 0,english,usa,pg 13,250000000 0,2012 0,23000,8 5,2 35,164000
4,none,doug walker,,,131,,rob walker,131,,documentary,doug walker,star wars episode vii force awakens,8,143,none,0 0,none,http www imdb com title tt5289954 ref fn tt tt 1,,none,none,none,,,12,7 1,,0
5,color,andrew stanton,462 0,132 0,475,530 0,samantha morton,640,73058679 0,action adventure sci fi,daryl sabara,john carter,212204,1873,polly walker,1 0,alien american civil war male nipple mars princess,http www imdb com title tt0401729 ref fn tt tt 1,738 0,english,usa,pg 13,263700000 0,2012 0,632,6 6,2 35,24000
6,color,sam raimi,392 0,156 0,0,4000 0,james franco,24000,336530303 0,action adventure romance,j k simmons,spider man 3,383056,46055,kirsten dunst,0 0,sandman spider man symbiote venom villain,http www imdb com title tt0413300 ref fn tt tt 1,1902 0,english,usa,pg 13,258000000 0,2007 0,11000,6 2,2 35,0
7,color,nathan greno,324 0,100 0,15,284 0,donna murphy,799,200807262 0,adventure animation comedy family fantasy musical romance,brad garrett,tangled,294810,2036,m c gainey,1 0,17th century based fairy tale disney flower tower,http www imdb com title tt0398286 ref fn tt tt 1,387 0,english,usa,pg,260000000 0,2010 0,553,7 8,1 85,29000
8,color,joss whedon,635 0,141 0,0,19000 0,robert downey jr,26000,458991599 0,action adventure sci fi,chris hemsworth,avengers age ultron,462669,92000,scarlett johansson,4 0,artificial intelligence based comic book captain america marvel cinematic universe superhero,http www imdb com title tt2395427 ref fn tt tt 1,1117 0,english,usa,pg 13,250000000 0,2015 0,21000,7 5,2 35,118000
9,color,david yates,375 0,153 0,282,10000 0,daniel radcliffe,25000,301956980 0,adventure family fantasy mystery,alan rickman,harry potter half blood prince,321795,58753,rupert grint,3 0,blood book love potion professor,http www imdb com title tt0417741 ref fn tt tt 1,973 0,english,uk,pg,250000000 0,2009 0,11000,7 5,2 35,10000


### #2

In [10]:
# Position of the inspected entry in the MATE result list (0-based).
top_table_index = 2
selected_entry = top_joinable_tables[top_table_index]
score, table_id, columns, join_map = selected_entry

# Export the table, print its summary line, then render the preview.
candidate_table = tables_dict[table_id]
candidate_table.to_csv("../datasets/test.csv", index=False)
print(f'Score: {score}, table_id: {table_id}, joinable columns: {columns}, #rows: {tables_dict[table_id].shape[0]}, #columns: {tables_dict[table_id].shape[1]}')

joinable_headers = column_headers_dict[table_id]
highlight_sample = highlight_columns(candidate_table, joinable_headers)
display(HTML(highlight_sample.to_html()))

Score: 100, table_id: 159543, joinable columns: 1_11, #rows: 149, #columns: 28


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,color,james cameron,723 0,178 0,0,855 0,joel david moore,1000,760505847 0,action adventure fantasy sci fi,cch pounder,avatar,886204,4834,wes studi,0 0,avatar future marine native paraplegic,http www imdb com title tt0499549 ref fn tt tt 1,3054 0,english,usa,pg 13,237000000 0,2009 0,936,7 9,1 78,33000
1,color,gore verbinski,302 0,169 0,563,1000 0,orlando bloom,40000,309404152 0,action adventure fantasy,johnny depp,pirates caribbean world s end,471220,48350,jack davenport,0 0,goddess marriage ceremony marriage proposal pirate singapore,http www imdb com title tt0449088 ref fn tt tt 1,1238 0,english,usa,pg 13,300000000 0,2007 0,5000,7 1,2 35,0
2,color,sam mendes,602 0,148 0,0,161 0,rory kinnear,11000,200074175 0,action adventure thriller,christoph waltz,spectre,275868,11700,stephanie sigman,1 0,bomb espionage sequel spy terrorist,http www imdb com title tt2379713 ref fn tt tt 1,994 0,english,uk,pg 13,245000000 0,2015 0,393,6 8,2 35,85000
3,color,christopher nolan,813 0,164 0,22000,23000 0,christian bale,27000,448130642 0,action thriller,tom hardy,dark knight rises,1144337,106759,joseph gordon levitt,0 0,deception imprisonment lawlessness police officer terrorist plot,http www imdb com title tt1345836 ref fn tt tt 1,2701 0,english,usa,pg 13,250000000 0,2012 0,23000,8 5,2 35,164000
4,none,doug walker,,,131,,rob walker,131,,documentary,doug walker,star wars episode vii force awakens,8,143,none,0 0,none,http www imdb com title tt5289954 ref fn tt tt 1,,none,none,none,,,12,7 1,,0
5,color,andrew stanton,462 0,132 0,475,530 0,samantha morton,640,73058679 0,action adventure sci fi,daryl sabara,john carter,212204,1873,polly walker,1 0,alien american civil war male nipple mars princess,http www imdb com title tt0401729 ref fn tt tt 1,738 0,english,usa,pg 13,263700000 0,2012 0,632,6 6,2 35,24000
6,color,sam raimi,392 0,156 0,0,4000 0,james franco,24000,336530303 0,action adventure romance,j k simmons,spider man 3,383056,46055,kirsten dunst,0 0,sandman spider man symbiote venom villain,http www imdb com title tt0413300 ref fn tt tt 1,1902 0,english,usa,pg 13,258000000 0,2007 0,11000,6 2,2 35,0
7,color,nathan greno,324 0,100 0,15,284 0,donna murphy,799,200807262 0,adventure animation comedy family fantasy musical romance,brad garrett,tangled,294810,2036,m c gainey,1 0,17th century based fairy tale disney flower tower,http www imdb com title tt0398286 ref fn tt tt 1,387 0,english,usa,pg,260000000 0,2010 0,553,7 8,1 85,29000
8,color,joss whedon,635 0,141 0,0,19000 0,robert downey jr,26000,458991599 0,action adventure sci fi,chris hemsworth,avengers age ultron,462669,92000,scarlett johansson,4 0,artificial intelligence based comic book captain america marvel cinematic universe superhero,http www imdb com title tt2395427 ref fn tt tt 1,1117 0,english,usa,pg 13,250000000 0,2015 0,21000,7 5,2 35,118000
9,color,david yates,375 0,153 0,282,10000 0,daniel radcliffe,25000,301956980 0,adventure family fantasy mystery,alan rickman,harry potter half blood prince,321795,58753,rupert grint,3 0,blood book love potion professor,http www imdb com title tt0417741 ref fn tt tt 1,973 0,english,uk,pg,250000000 0,2009 0,11000,7 5,2 35,10000


# 3) Duplicate Detection using Xash
## Discover duplicate tables and their relationship

In [11]:
from duplicate_detection import DuplicateDetection
import time

dup = DuplicateDetection(data_handler)

# perf_counter is monotonic, so the reported duration cannot be distorted by
# system clock adjustments.
duplicate_detection_start = time.perf_counter()
duplicate_tables = []
# Named throwaway variables are used instead of `_`, which IPython reserves
# for the last cell output.
for _score, table_id, _columns, _join_map in top_joinable_tables:
    # tables_dict only holds the tables that were loaded successfully; ids
    # that were skipped during loading would otherwise raise KeyError here.
    if table_id not in tables_dict:
        continue
    table = tables_dict[table_id]
    duplicate_tables.extend(dup.get_duplicate_tables(table))

duplicate_relations = dup.get_relations(duplicate_tables)
print(f"\nTotal runtime: {time.perf_counter() - duplicate_detection_start:.2f}s")


Total runtime: 13.80s


## Prepare Duplicates Graph

In [12]:
from pyvis.network import Network
import pandas as pd

net = Network(height='1000px', width='100%', notebook=True)

# Add both endpoints of every duplicate relation as nodes (labelled with the
# table id) and connect them with an edge.
for relation in duplicate_relations:
    source_id, duplicate_id = relation[0], relation[1]
    net.add_node(source_id, str(source_id))
    net.add_node(duplicate_id, str(duplicate_id))
    net.add_edge(source_id, duplicate_id)

#net.add_node(0,"0")
#for t in duplicate_tables_first:
#    net.add_node(t,str(t))
#    net.add_edge(0, t)

# Layout / interaction settings of the pyvis network.
net.show_buttons(filter_=['physics'])
net.set_edge_smooth("dynamic")
net.toggle_stabilization(False)
net.toggle_physics(False)

# Collect one HTML table per duplicate table; the element id "t<table_id>"
# is what the template uses to address it.
html_fragments = []
for table_id in duplicate_tables:
    duplicate_table = data_handler.get_table(table_id)
    html_fragments.append(duplicate_table.to_html(table_id=f"t{table_id}", index=None))

# NOTE(review): `table` is not assigned in this cell; it holds whatever an
# earlier cell's loop left behind. The 't0' id and the commented-out node 0
# above suggest the input dataset may have been intended here. Confirm.
html_fragments.append(table.iloc[:10, :].to_html(table_id='t0', index=None))
output = "".join(html_fragments)

# Fill the placeholder of the HTML template with the generated tables and
# write the result next to it.
with open("template.html", 'r') as template_file:
    filedata = template_file.read()

filedata = filedata.replace('%%tables_placeholder%%', output)

with open("template_new.html", 'w') as rendered_file:
    rendered_file.write(filedata)

net.prep_notebook(custom_template=True, custom_template_path="template_new.html")

## Draw duplicates graph

In [13]:
# Render the pyvis network built in the preceding cell; "nb.html" is the file name passed to pyvis.
net.show("nb.html")

## Remove duplicates from joinable tables

In [14]:
from collections import defaultdict

# Map every table that appears first in a relation to the tables reported as
# its duplicates.
duplicates_dict = defaultdict(list)
for first_id, second_id in duplicate_relations:
    duplicates_dict[first_id].append(second_id)

# Fold chained groups together. The inner loop iterates a list that is
# extended while it is being walked, so members pulled in from another group
# are visited as well; groups that were absorbed are emptied.
remove_tables = []    # tables that will be removed
for t1 in duplicates_dict:
    for t2 in duplicates_dict[t1]:
        if t2 in duplicates_dict:
            duplicates_dict[t1].extend(duplicates_dict[t2])
            duplicates_dict[t2] = []
    duplicates_dict[t1] = list(set(duplicates_dict[t1]))
    remove_tables.extend(duplicates_dict[t1])

# Keep only the joinable tables whose id (second tuple field) is not marked
# for removal.
removed_ids = set(remove_tables)
top_joinable_tables_filtered = [
    entry for entry in top_joinable_tables if entry[1] not in removed_ids
]

print("Original joinable tables:")
print([entry[1] for entry in top_joinable_tables])

print("\nJoinable tables without duplicates:")
print([entry[1] for entry in top_joinable_tables_filtered])

Original joinable tables:
[278123, 21812, 159543, 230999, 94283, 281512, 313030, 213621, 161110, 323299, 154592, 201131, 3524, 23637, 36168, 73429, 4049, 190417, 130692, 13831, 89831, 76730, 282751, 205005, 141638, 104916, 224501, 119727, 293100, 276042, 325716, 310980, 301769, 261321, 224797, 228622, 245827, 191042, 294701, 278190, 127400, 201125, 216798, 203660, 263674, 220337, 283965, 295504, 226530, 323079]

Joinable tables without duplicates:
[21812, 94283, 281512, 213621, 161110, 154592, 3524, 23637, 36168, 73429, 4049, 130692, 13831, 141638, 104916, 224797, 294701, 216798, 203660]


# 4) Correlation Calculation
## Obtain top-10 correlating features using Order Index and COCOA Algorithm

In [15]:
from util import get_cleaned_text

cocoa = COCOA(data_handler)

# Ask for the ten best features among the de-duplicated joinable tables,
# correlated against the chosen target column.
top_correlating_columns = cocoa.enrich_multicolumn(
    input_dataset,
    top_joinable_tables_filtered,
    10,
    target_column=target_column,
)

100%|███████████████████████████████████████████| 19/19 [00:00<00:00, 329.93it/s]

Total runtime: 0.28s
Preparation runtime: 0.22s
Correlation calculation runtime: 0.06s

Evaluated features: 433
Max. correlation coefficient: 0.6324
Min. correlation coefficient: 0.0000





## Visualize top correlating features

In [16]:
# Number of top correlating features joined onto the input for display.
N_FEATURES_TO_SHOW = 3

# Add tokenized copies of the input columns; they serve as join keys.
join_keys = [f"{col}_tokenized" for col in input_columns]
output_dataset = input_dataset.copy()
for input_column, join_key in zip(input_columns, join_keys):
    output_dataset[join_key] = input_dataset[input_column].apply(get_cleaned_text)

external_columns = []
for cor, table_col_id in top_correlating_columns[:N_FEATURES_TO_SHOW]:
    # table_col_id has the form "<table_id>_<column_id>".
    id_parts = table_col_id.split('_')
    table_id, column_id = int(id_parts[0]), int(id_parts[1])
    table = tables_dict[table_id]

    # Column label carrying the origin and the correlation of the feature.
    new_col_name = f"{table_id}_{table.columns[column_id]}, cor: {cor:.2f}"
    if new_col_name in output_dataset.columns:
        # Already joined; a second merge would only create suffixed copies.
        continue
    external_columns.append(new_col_name)

    # Build the right-hand side from exactly the external join columns plus
    # the feature. The join columns are renamed to the key names used on the
    # left, so the merge introduces no extra columns: nothing has to be
    # dropped afterwards and same-named input columns cannot be lost.
    # Taking the feature by position also works when it is itself one of the
    # join columns.
    feature_table = table.loc[:, column_headers_dict[table_id]].copy()
    feature_table.columns = join_keys
    feature_table[new_col_name] = table.iloc[:, column_id]

    # Several external rows with the same key would multiply input rows in a
    # left join; keep the first match so the input keeps its row count.
    feature_table = feature_table.drop_duplicates(subset=join_keys)

    output_dataset = output_dataset.merge(feature_table, how="left", on=join_keys)

# Remove exactly the helper key columns added above.
output_dataset = output_dataset.drop(columns=join_keys)

output_sample = highlight_columns(output_dataset, input_columns, target=external_columns)
display(HTML(output_sample.to_html()))

Unnamed: 0,movie_title,duration,director_name,genres,imdb_score,"21812_num_voted_users, cor: 0.63","21812_movie_facebook_likes, cor: 0.45","281512_Year, cor: -0.36"
0,Avatar,178.0,James Cameron,Action|Adventure|Fantasy|Sci-Fi,7.9,886204,33000,2009.0
1,Pirates of the Caribbean: At World's End,169.0,Gore Verbinski,Action|Adventure|Fantasy,7.1,471220,0,2007.0
2,Spectre,148.0,Sam Mendes,Action|Adventure|Thriller,6.8,275868,85000,
3,The Dark Knight Rises,164.0,Christopher Nolan,Action|Thriller,8.5,1144337,164000,2012.0
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,Documentary,7.1,8,0,
5,John Carter,132.0,Andrew Stanton,Action|Adventure|Sci-Fi,6.6,212204,24000,
6,Spider-Man 3,156.0,Sam Raimi,Action|Adventure|Romance,6.2,383056,0,2007.0
7,Tangled,100.0,Nathan Greno,Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance,7.8,294810,29000,
8,Avengers: Age of Ultron,141.0,Joss Whedon,Action|Adventure|Sci-Fi,7.5,462669,118000,
9,Harry Potter and the Half-Blood Prince,153.0,David Yates,Adventure|Family|Fantasy|Mystery,7.5,321795,10000,


## Materialize join for selected features

In [17]:
# Keep the columns at positions 0-6 and show the first ten rows.
selected_positions = list(range(7))
output_dataset = output_dataset.iloc[:, selected_positions]

materialized_preview_html = output_dataset.head(10).to_html()
display(HTML(materialized_preview_html))

Unnamed: 0,movie_title,duration,director_name,genres,imdb_score,"21812_num_voted_users, cor: 0.63","21812_movie_facebook_likes, cor: 0.45"
0,Avatar,178.0,James Cameron,Action|Adventure|Fantasy|Sci-Fi,7.9,886204,33000
1,Pirates of the Caribbean: At World's End,169.0,Gore Verbinski,Action|Adventure|Fantasy,7.1,471220,0
2,Spectre,148.0,Sam Mendes,Action|Adventure|Thriller,6.8,275868,85000
3,The Dark Knight Rises,164.0,Christopher Nolan,Action|Thriller,8.5,1144337,164000
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,Documentary,7.1,8,0
5,John Carter,132.0,Andrew Stanton,Action|Adventure|Sci-Fi,6.6,212204,24000
6,Spider-Man 3,156.0,Sam Raimi,Action|Adventure|Romance,6.2,383056,0
7,Tangled,100.0,Nathan Greno,Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance,7.8,294810,29000
8,Avengers: Age of Ultron,141.0,Joss Whedon,Action|Adventure|Sci-Fi,7.5,462669,118000
9,Harry Potter and the Half-Blood Prince,153.0,David Yates,Adventure|Family|Fantasy|Mystery,7.5,321795,10000
