In [1]:
import pickle
import pandas as pd
import numpy as np

import glob

In [2]:
def jaccard_similarity(set1, set2):
    
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
     
    return intersection / union

In [3]:
def calculate_similarities(sameSTColumns, df_left, df_right):
    similarity = []
    for i in range(sameSTColumns.shape[0]):
        try:
            similarity.append(jaccard_similarity(set(sameSTColumns.iloc[i,4].split(',')), set(sameSTColumns.iloc[i,5].split(','))))
        except:
            similarity.append(jaccard_similarity(set(sameSTColumns.iloc[i,4].split(',')), set(sameSTColumns.iloc[i,5].split(','))))

    return similarity

In [4]:
def generate_possible_joinable_columns(predictions_left, predictions_right, filename_left, filename_right):
    predicted_joinable_columns = []

    for i in range(predictions_left.shape[0]):
        column = predictions_left.iloc[i,0]
        predicted_semantic_type = predictions_left.iloc[i,1]
        for j in range(predictions_right.shape[0]):
            for st in predicted_semantic_type:
                if(st in predictions_right.iloc[j,1]):
                    predicted_joinable_columns.append([filename_left, column,
                                                       filename_right, predictions_right.iloc[j,:].Column, 
                                                       ",".join(predicted_semantic_type), ",".join(predictions_right.iloc[j,:].Predicted)])
                    

    return predicted_joinable_columns

In [5]:
files = glob.glob('GPT3.5Turbo_predictions/*')

In [6]:
ST_predictions_dict = {}

In [7]:
for file in files:
    with open(file, 'rb') as file:
        ST_predictions_dict = ST_predictions_dict | pickle.load(file)

In [8]:
ST_predictions_dict.keys()

dict_keys(['community-gardens-and-food-trees.csv', 'cultural-spaces.csv', 'eo4.csv', 'eo_pr.csv', 'street-intersections.csv', 'eo_xx.csv', 'libraries.csv', 'population-census-of-botswana-2011.csv', 'public-art-artists.csv', 'public-art.csv', 'rental-standards-current-issues.csv', 'schools.csv', 'statewise-census-data-in-india-1901-2011.csv', 'community-centres.csv'])

In [9]:
ST_predictions_dict['community-gardens-and-food-trees.csv']

Unnamed: 0,Column,Predicted
0,MAPID,[Text]
1,YEAR_CREATED,"[Text, Integer]"
2,NAME,"[Place/name, Text]"
3,STREET_NUMBER,[Integer]
4,STREET_DIRECTION,[none]
5,STREET_NAME,[streetAddress]
6,STREET_TYPE,[streetAddress]
7,MERGED_ADDRESS,"[addressLocality, streetAddress, addressRegion]"
8,NUMBER_OF_PLOTS,[Integer]
9,NUMBER_OF_FOOD_TREES,"[Text, Integer]"


In [10]:
df_joinable_columns = pd.read_csv('joinable_columns_90containment.csv')
all_joinable_files = np.concatenate((df_joinable_columns['ds_name'], df_joinable_columns['ds_name_2']), axis=0)
all_joinable_files = np.unique(all_joinable_files)
all_joinable_files

array(['community-centres.csv', 'community-gardens-and-food-trees.csv',
       'cultural-spaces.csv', 'eo4.csv', 'eo_pr.csv', 'eo_xx.csv',
       'libraries.csv', 'population-census-of-botswana-2011.csv',
       'public-art-artists.csv', 'public-art.csv',
       'rental-standards-current-issues.csv', 'schools.csv',
       'statewise-census-data-in-india-1901-2011.csv',
       'street-intersections.csv'], dtype=object)

In [11]:
df_joinable_columns = df_joinable_columns[(df_joinable_columns['ds_name'].isin(all_joinable_files)) & (df_joinable_columns['ds_name_2'].isin(all_joinable_files))]

In [12]:
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')

In [13]:
all_predicted_joinable_columns = pd.DataFrame([])
similarity_calculations = 0
brute_force_calculations = 0

for i in range(len(all_joinable_files)-1):
    left_info  = df_dsInformation[df_dsInformation['filename'] == all_joinable_files[i]]
    df_left = pd.read_csv(f'datasets/{all_joinable_files[i]}', delimiter=left_info['delimiter'].values[0])
    
    for j in range(i+1, len(all_joinable_files)):
        print(f'Calculating Similarities for tables {all_joinable_files[i]} and {all_joinable_files[j]}.')
        
        right_info = df_dsInformation[df_dsInformation['filename'] == all_joinable_files[j]]
        df_right = pd.read_csv(f'datasets/{all_joinable_files[j]}', delimiter=right_info['delimiter'].values[0])
    
        predictions_left = ST_predictions_dict[all_joinable_files[i]]
        predictions_right = ST_predictions_dict[all_joinable_files[j]]

        predicted_joinable_columns = generate_possible_joinable_columns(predictions_left, predictions_right, all_joinable_files[i], all_joinable_files[j])

        try:
            sameSTColumns = pd.DataFrame(np.array(predicted_joinable_columns), columns=['FilenameLeft', 'ColumnLeft', 
                                                                                        'FilenameRight','ColumnRight',
                                                                                        'SemanticTypeLeft', 'SemanticTypeRight'])
        except ValueError:
            print('No matches found, skipping to next column.')
            continue
            
        similarity = calculate_similarities(sameSTColumns, df_left, df_right)
        sameSTColumns['JaccardSimilarity'] = similarity
        joinableColumns = sameSTColumns[sameSTColumns['JaccardSimilarity'] == 1]

        similarity_calculations += joinableColumns.shape[0]
        brute_force_calculations += df_left.shape[1] * df_right.shape[1]
        
        if(len(joinableColumns) > 0):
            print(f'Adding {joinableColumns.shape[0]} columns')
        
        if(len(all_predicted_joinable_columns) == 0):
            all_predicted_joinable_columns = joinableColumns
        else:
            all_predicted_joinable_columns = pd.concat((all_predicted_joinable_columns, joinableColumns), axis=0)
            print(f'New size {all_predicted_joinable_columns.shape[0]}')

Calculating Similarities for tables community-centres.csv and community-gardens-and-food-trees.csv.
Adding 4 columns
Calculating Similarities for tables community-centres.csv and cultural-spaces.csv.
Adding 4 columns
New size 8
Calculating Similarities for tables community-centres.csv and eo4.csv.
Adding 6 columns
New size 14
Calculating Similarities for tables community-centres.csv and eo_pr.csv.
Adding 6 columns
New size 20
Calculating Similarities for tables community-centres.csv and eo_xx.csv.
Adding 5 columns
New size 25
Calculating Similarities for tables community-centres.csv and libraries.csv.
Adding 2 columns
New size 27
Calculating Similarities for tables community-centres.csv and population-census-of-botswana-2011.csv.
Adding 2 columns
New size 29
Calculating Similarities for tables community-centres.csv and public-art-artists.csv.
Adding 1 columns
New size 30
Calculating Similarities for tables community-centres.csv and public-art.csv.
Adding 9 columns
New size 39
Calculati

In [14]:
all_predicted_joinable_columns

Unnamed: 0,FilenameLeft,ColumnLeft,FilenameRight,ColumnRight,SemanticTypeLeft,SemanticTypeRight,JaccardSimilarity
17,community-centres.csv,Geom,community-gardens-and-food-trees.csv,STREET_DIRECTION,none,none,1.0
18,community-centres.csv,Geom,community-gardens-and-food-trees.csv,OTHER_FOOD_ASSETS,none,none,1.0
19,community-centres.csv,Geom,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL,none,none,1.0
21,community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area,addressLocality,addressLocality,1.0
7,community-centres.csv,URLLINK,cultural-spaces.csv,WEBSITE,"Text,URL","Text,URL",1.0
...,...,...,...,...,...,...,...
5,schools.csv,Geom,statewise-census-data-in-india-1901-2011.csv,VALUE,none,none,1.0
1,schools.csv,Geom,street-intersections.csv,AT_STREET,none,none,1.0
3,schools.csv,Geo Local Area,street-intersections.csv,Geo Local Area,addressLocality,addressLocality,1.0
0,statewise-census-data-in-india-1901-2011.csv,DATE,street-intersections.csv,ON_STREET,Integer,Integer,1.0


In [15]:
print(similarity_calculations, brute_force_calculations)

1733 15868


In [16]:
all_predicted_joinable_columns.SemanticTypeLeft.value_counts()

SemanticTypeLeft
Integer                          914
none                             454
Text                             239
addressLocality                   54
URL                               17
streetAddress                     14
addressLocality,streetAddress      6
Text,URL                           6
Date                               5
Country                            5
Text,IdentifierAT                  4
Organization                       3
postalCode                         3
addressLocality,Place/name         2
Integer,Date                       2
Place/name,Text                    2
Event/name,Text                    2
CoordinateAT                       1
Name: count, dtype: int64

In [17]:
all_predicted_joinable_columns

Unnamed: 0,FilenameLeft,ColumnLeft,FilenameRight,ColumnRight,SemanticTypeLeft,SemanticTypeRight,JaccardSimilarity
17,community-centres.csv,Geom,community-gardens-and-food-trees.csv,STREET_DIRECTION,none,none,1.0
18,community-centres.csv,Geom,community-gardens-and-food-trees.csv,OTHER_FOOD_ASSETS,none,none,1.0
19,community-centres.csv,Geom,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL,none,none,1.0
21,community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area,addressLocality,addressLocality,1.0
7,community-centres.csv,URLLINK,cultural-spaces.csv,WEBSITE,"Text,URL","Text,URL",1.0
...,...,...,...,...,...,...,...
5,schools.csv,Geom,statewise-census-data-in-india-1901-2011.csv,VALUE,none,none,1.0
1,schools.csv,Geom,street-intersections.csv,AT_STREET,none,none,1.0
3,schools.csv,Geo Local Area,street-intersections.csv,Geo Local Area,addressLocality,addressLocality,1.0
0,statewise-census-data-in-india-1901-2011.csv,DATE,street-intersections.csv,ON_STREET,Integer,Integer,1.0


In [18]:
all_predicted_joinable_columns_joins = []
excluding_types = ['Text', 'none', 'Integer', 'Text,Integer']
for i in range(len(all_predicted_joinable_columns)):
    if((all_predicted_joinable_columns.iloc[i,:].SemanticTypeLeft not in excluding_types) and
       (all_predicted_joinable_columns.iloc[i,:].SemanticTypeRight not in excluding_types)):
        all_predicted_joinable_columns_joins.append(';'.join(all_predicted_joinable_columns.iloc[i,:4].values))
all_predicted_joinable_columns_joins = np.array(all_predicted_joinable_columns_joins)
all_predicted_joinable_columns_joins

array(['community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;Geo Local Area',
       'community-centres.csv;URLLINK;cultural-spaces.csv;WEBSITE',
       'community-centres.csv;URLLINK;cultural-spaces.csv;WEBSITE',
       'community-centres.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
       'community-centres.csv;Geo Local Area;eo_pr.csv;CITY',
       'community-centres.csv;Geo Local Area;libraries.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;population-census-of-botswana-2011.csv;REGION_NAME',
       'community-centres.csv;NAME;public-art.csv;SiteName',
       'community-centres.csv;NAME;public-art.csv;SiteName',
       'community-centres.csv;ADDRESS;public-art.csv;SiteAddress',
       'community-centres.csv;ADDRESS;public-art.csv;SiteAddress',
       'community-centres.csv;Geo Local Area;public-art.csv;Neighbourhood',
       'community-centres.csv;Geo Local Area;public-art.csv;GeoLocalArea',
       'community-centres.csv;URLLINK;rental

In [19]:
all_predicted_joinable_columns_joins.shape

(126,)

In [20]:
ids = [0,1,4,11,12,13,14,15,16,17,18,19,20,21,22,23]

In [21]:
df_joinable_columns

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
0,eo_pr.csv,NAME,eo4.csv,NAME
1,eo_pr.csv,ICO,eo4.csv,ICO
2,eo_pr.csv,STREET,eo4.csv,STREET
3,eo_pr.csv,CITY,eo4.csv,CITY
4,eo_pr.csv,STATE,eo4.csv,STATE
...,...,...,...,...
65,eo4.csv,STATE,eo_xx.csv,STATE
66,eo_xx.csv,ZIP,eo4.csv,ZIP
67,eo_xx.csv,NTEE_CD,eo4.csv,NTEE_CD
68,eo_xx.csv,SORT_NAME,eo4.csv,SORT_NAME


In [22]:
all_predicted_joinable_columns_joins = pd.DataFrame([line.split(';') for line in all_predicted_joinable_columns_joins], columns=['ds_name', 'att_name', 'ds_name_2','att_name_2'])
all_predicted_joinable_columns_joins

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
0,community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area
1,community-centres.csv,URLLINK,cultural-spaces.csv,WEBSITE
2,community-centres.csv,URLLINK,cultural-spaces.csv,WEBSITE
3,community-centres.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA
4,community-centres.csv,Geo Local Area,eo_pr.csv,CITY
...,...,...,...,...
121,public-art.csv,GeoLocalArea,street-intersections.csv,Geo Local Area
122,rental-standards-current-issues.csv,Street,schools.csv,ADDRESS
123,rental-standards-current-issues.csv,Geo Local Area,schools.csv,Geo Local Area
124,rental-standards-current-issues.csv,Geo Local Area,street-intersections.csv,Geo Local Area


In [23]:
new_df = pd.concat((all_predicted_joinable_columns_joins.iloc[:,:2],all_predicted_joinable_columns_joins.iloc[:,2:].rename(columns={'ds_name_2':'ds_name','att_name_2':'att_name'})))
unique_columns = np.unique([f'{table_name}_{column_name}' for table_name,column_name in new_df.values])
unique_columns

array(['community-centres.csv_ADDRESS',
       'community-centres.csv_Geo Local Area',
       'community-centres.csv_NAME', 'community-centres.csv_URLLINK',
       'community-gardens-and-food-trees.csv_Geo Local Area',
       'community-gardens-and-food-trees.csv_Geom',
       'community-gardens-and-food-trees.csv_NAME',
       'community-gardens-and-food-trees.csv_STREET_NAME',
       'community-gardens-and-food-trees.csv_STREET_TYPE',
       'community-gardens-and-food-trees.csv_WEBSITE',
       'cultural-spaces.csv_CULTURAL_SPACE_NAME',
       'cultural-spaces.csv_Geom', 'cultural-spaces.csv_LOCAL_AREA',
       'cultural-spaces.csv_WEBSITE', 'eo4.csv_CITY', 'eo4.csv_NAME',
       'eo4.csv_RULING', 'eo4.csv_TAX_PERIOD', 'eo4.csv_ZIP',
       'eo_pr.csv_CITY', 'eo_pr.csv_RULING', 'eo_pr.csv_STREET',
       'eo_pr.csv_TAX_PERIOD', 'eo_pr.csv_ZIP', 'eo_xx.csv_CITY',
       'eo_xx.csv_NAME', 'eo_xx.csv_NTEE_CD', 'eo_xx.csv_RULING',
       'eo_xx.csv_STATE', 'eo_xx.csv_TAX_PERIOD', 'eo_xx

In [30]:
np.save('unique_joinable_columns.npy',unique_columns)

In [31]:
all_predicted_joinable_columns_joins.to_csv('predicted_joinable_columns.csv', index=False)

In [24]:
len(unique_columns)

57

In [25]:
n_columns = 0
for key in ST_predictions_dict.keys():
    n_columns += ST_predictions_dict[key].Column.shape[0]
n_columns

188

In [26]:
df_joinable_columns_joins = []
for i in range(len(df_joinable_columns)):
    df_joinable_columns_joins.append(';'.join(df_joinable_columns.iloc[i,:4].values))
    df_joinable_columns_joins.append(';'.join(df_joinable_columns.iloc[i,2:].values)+';'+';'.join(df_joinable_columns.iloc[i,:2].values))
df_joinable_columns_joins = np.array(df_joinable_columns_joins)
df_joinable_columns_joins = np.unique(df_joinable_columns_joins)
df_joinable_columns_joins = df_joinable_columns_joins[ids] 
df_joinable_columns_joins

array(['community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
       'community-centres.csv;Geo Local Area;public-art.csv;Neighbourhood',
       'community-gardens-and-food-trees.csv;Geo Local Area;libraries.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;Geo Local Area;public-art.csv;GeoLocalArea',
       'community-gardens-and-food-trees.csv;Geo Local Area;public-art.csv;Neighbourhood',
       'community-gardens-and-food-trees.csv;Geo Local Area;rental-standards-current-issues.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;Geo Local Area;schools.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;Geo Local Area;street-intersections.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;NUMBER_OF_FOOD_TREES;cultural-spaces.csv;ACTIVE_SPACE',
       'cultural-spaces.csv;ACTIVE_SPACE;community-gardens-and-food-tr

In [27]:
all_predicted_joinable_columns_joins.shape

(126, 4)

In [28]:
df_joinable_columns_joins.shape

(16,)

In [29]:
tp = 0
fp = 0
fn = 0

for i in range(len(all_predicted_joinable_columns_joins)):
    if(";".join(all_predicted_joinable_columns_joins.iloc[i,:]) in df_joinable_columns_joins):
        tp += 1
    else:
        fp += 1
    fn = len(df_joinable_columns_joins) - tp

precision = (tp)/(tp+fp)
recall = (tp)/(tp+fn)
f1_score = 2 * (precision*recall)/(precision+recall)

print(precision, recall, f1_score)

0.0873015873015873 0.6875 0.15492957746478872
