In [None]:
# Based on some preliminary tests, my idea for table & entity selection is as follows:

# - start with top 30 tables with largets 'tableNclusters' as candidates; in each iteration, randomly select 500 entities 
#   calculate the number of cluster occurences for each table; save the results
# - redo the above step for N=10000 times, and calculate the mean of cluster occurences for each table
# - select the 15 tables with largest mean occurence. The underlying assumption is that, if a table is more likely to 
#   cover some randomly selected entities, it should have a higher probability of including the 500 selected entities
#   in the end.
# - do the above random sampling again on entities and select the 500 entities that are most likely to be included in the
#   15 tables selected from the last step.

In [1]:
import os
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
source_path = r"C:\Users\chench10\Downloads" 
file = 'MFile_cleaned_upper'

df = pd.read_json(os.path.join(source_path, file), compression='gzip', orient='records', lines=True) 

In [9]:
df_copy = df.copy()

In [34]:
df = df_copy

tmp = pd.DataFrame(df.groupby(['table_id'])['cluster_id'].nunique()).reset_index()
tmp.columns = ['table_id','tableNclusters']
df = df.merge(tmp, on='table_id', how='left').sort_values(by='tableNclusters', ascending=False)

tmp = pd.DataFrame(df.groupby(['cluster_id'])['table_id'].nunique()).reset_index()
tmp.columns = ['cluster_id','clusterNtables']
df = df.merge(tmp, on='cluster_id', how='left')

In [53]:
df

Unnamed: 0,cluster_id,table_id,row_id,page_url,tableNclusters,clusterNtables
0,1,Hotel_marriott.com_September2020.json.gz,5860,https://whattoexpect.marriott.com/rommd,3096,2
1,4342,Hotel_marriott.com_September2020.json.gz,5229,https://www.marriott.com/hotels/local-things-t...,3096,3
2,4359,Hotel_marriott.com_September2020.json.gz,5971,https://www.marriott.com/hotels/travel/bosfx-r...,3096,3
3,4358,Hotel_marriott.com_September2020.json.gz,7383,https://www.marriott.com/hotels/hotel-reviews/...,3096,2
4,4354,Hotel_marriott.com_September2020.json.gz,5552,https://www.marriott.com/hotels/hotel-deals/bo...,3096,2
...,...,...,...,...,...,...
40506,10126,Restaurant_acquaefarinaparis.com_September2020...,14,http://www.acquaefarinaparis.com/el/address-co...,1,2
40507,3864,LocalBusiness_potspace.com_September2020.json.gz,93,https://www.potspace.com/dispensaries/los-ange...,1,2
40508,10126,Restaurant_acquaefarinaparis.com_September2020...,0,https://www.acquaefarinaparis.com/nl/,1,2
40509,10126,Restaurant_acquaefarinaparis.com_September2020...,13,http://www.acquaefarinaparis.com/,1,2


### Table Selection

In [128]:
ncluster = df['cluster_id'].nunique()
ntables  = df['table_id'].nunique()

In [72]:
# np.random.seed(42)
# table_idx = np.random.randint(0, ntables, 15)   # 15 is the number of selected tables
# table_candidates = tables[table_idx]

# cluster_idx = np.random.randint(0, ntables, 500)   # 500 is the number of selected clusters
# cluster_candidates = clusters[cluster_idx]

In [29]:
# initialize a dictionary to store cluster frequency
# cluster_fre = {key: 0 for key in range(ncluster)}

# for t in table_candidates:
#     clusters = df[df['table_id']==t]['cluster_id'].unique()
#     for c in clusters:
#         cluster_fre[c] += 1
        
# results =pd.DataFrame.from_dict(cluster_fre, orient='index', columns=['frequency']).sort_values('frequency', ascending=False)

In [160]:
np.random.seed(42)

sampling_tables = 30     # number of table candidates
sampling_times = 30

tables = np.array(df['table_id'].unique())[:sampling_tables]   
clusters = np.array(df['cluster_id'].unique())

array1 = np.ndarray((sampling_times, sampling_tables))
for i in range(sampling_times):             # number of sampling attempts, e,g. N=10000
    table_candidates = np.random.choice(tables, sampling_tables, replace=False)  
    cluster_candidates = np.random.choice(clusters, 500, replace=False)   # 500 is the number of selected clusters

    list_frequency =[]
    for t in table_candidates:
        l1 = set(df[df['table_id']==t].cluster_id.unique())  # clusters that are included in the table  
        l2 = set(cluster_candidates)                         # choosen clusters

        list_frequency.append(len(l1.intersection(l2)))      

    #if min(list_frequency) > 200:
    #    break
    # comment: seems that the brute force method doesn't work
    
    array2 = np.append(array1, list_frequency).reshape(-1,30)

# calculate average cluster occurence for the 30 tables
print(np.mean(array2, axis=0))

[0.22580645 0.67741935 0.35483871 0.61290323 1.83870968 0.32258065
 3.32258065 1.93548387 2.61290323 0.22580645 0.22580645 0.22580645
 0.32258065 0.87096774 0.87096774 0.4516129  3.5483871  0.48387097
 0.35483871 0.58064516 0.25806452 0.96774194 0.35483871 0.35483871
 0.87096774 0.61290323 1.32258065 0.87096774 0.29032258 0.25806452]


### Entity Selection

In [None]:
# do the above random sampling again on entities and select the 500 entities that are most likely to be included in the
# 15 tables selected from the last step.