In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
path = '/home/manoelflorencio/cta_for_jd/testbedXS'
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [53]:
def generate_prompt_predict_possible_joins(target_description, candidate_descriptions, candidate_tables):

    system_msg = f"""
            Given one target column description and many candidate column descriptions, predict all the pairs (candidate table name, candidate 
            description column name) that could be joined.

            Task: Look carefully at the target column description and candidate column descriptions and use this information to identify 
            patterns and relationships between the descriptions, the result must be a list of all the JOINable pairs found. If no joinable pair is 
            found the result should be just the word "none".

            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
            based on related columns between them.
            
            Instructions: 
                1. Look at the target description given to you. 
                2. Look at the candidate descriptions in detail. 
                3. Predict if the target column description belongs to a column that may be used in join. 
                4. Select all the highly likely JOINs between these columns based only on these descriptions. Disregard the column names.

            Example 1:
                Target description: this column represents a worker's id
                Candidate tables: ['salary.csv','salary.csv','hospital.csv']
                Candidate description: ['the column worker_id represents the worker's id', 'this column represents a worker's salary', 'this column represents a hospital location']
                Possible JOINs: ('salary.csv', 'worker_id')
            """
    
    user_msg = f"""Target description:      {target_description}
                   Candidate table:         {candidate_tables.values}
                   Candidate descriptions:  {candidate_descriptions.values}
                   Possible JOINs: """.strip()
    
    return system_msg, user_msg

In [None]:
def generate_predictions(target_descriptions, candidate_tables, candidate_descriptions, client):
    
    system_msg_predict_joins, user_msg_predict_joins = generate_prompt_predict_possible_joins(target_descriptions, candidate_tables, candidate_descriptions)
    result = execute_prompt(client, system_msg_predict_joins, user_msg_predict_joins)
    joins = result.choices[0].message.content.split('Possible JOINs: ')[-1].strip()
    
    return joins

In [None]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg):
    completion = client.chat.completions.create(
                                            model="gpt-4o",
                                            temperature=0.2,
                                            messages=[
                                                        {
                                                            "role": "system", 
                                                             "content": f"{system_msg}"
                                                        },
                                                        {
                                                            "role": "user",
                                                            "content": f"{user_msg}"
                                                        }
                                                    ]
                                            )
    return completion

In [None]:
client = OpenAI()

In [None]:
descriptions = pd.read_csv('Description_test/all_descriptions.csv')
descriptions.head()

In [5]:
files = ['eo_pr.csv', 'cultural-spaces.csv', 'public-art.csv', 'libraries.csv', 'schools.csv']
files

['eo_pr.csv',
 'cultural-spaces.csv',
 'public-art.csv',
 'libraries.csv',
 'schools.csv']

In [None]:
sample_descriptions = descriptions[descriptions['TableName'].isin(files)]
sample_descriptions = sample_descriptions.reset_index(drop=True)
sample_descriptions.head()

In [None]:
sample_descriptions['TableName'].value_counts()

In [None]:
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')

In [58]:
# joins_dict = {}

for file in files:
    info  = df_dsInformation[df_dsInformation['filename'] == file]                
    table = pd.read_csv(f'datasets/{file}', delimiter=info['delimiter'].values[0])

    table_descriptions = sample_descriptions[sample_descriptions['TableName'] == file]
    candidate_tables       = sample_descriptions[sample_descriptions['TableName'] != file].iloc[:,0]
    candidate_descriptions = sample_descriptions[sample_descriptions['TableName'] != file].iloc[:,2]

    joins_dict[f'{file}'] = {}
    
    for i in tqdm(range(table_descriptions.shape[0])):
        target_description = table_descriptions.iloc[i, 2]
        joins = generate_predictions(target_description, candidate_tables, candidate_descriptions, client)
        joins_dict[f'{file}'][f'{table_descriptions.iloc[i, 1]}'] = joins

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [00:29<00:00,  1.56s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.88s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:08<00:00,  1.76s/it]


In [60]:
with open('Description_test/join_prediction_v3.json', 'w') as file:
    json.dump(joins_dict, file)

In [15]:
with open('Description_test/join_prediction_v3.json', 'r') as file:
    joins_dict = json.load(file)

In [16]:
joins_dict

{'eo_pr.csv': {'EIN': 'none',
  'NAME': "('libraries.csv', 'NAME')",
  'ICO': 'none',
  'STREET': "('public-art.csv', 'SiteAddress'), ('schools.csv', 'ADDRESS'), ('libraries.csv', 'ADDRESS'), ('cultural-spaces.csv', 'ADDRESS')",
  'CITY': "('public-art.csv', 'Neighbourhood')\n('public-art.csv', 'SiteName')\n('public-art.csv', 'GeoLocalArea')\n('schools.csv', 'Geo Local Area')\n('libraries.csv', 'Geo Local Area')\n('cultural-spaces.csv', 'LOCAL_AREA')",
  'STATE': 'none',
  'ZIP': 'None',
  'GROUP': 'none',
  'SUBSECTION': 'none',
  'AFFILIATION': 'none',
  'CLASSIFICATION': 'none',
  'RULING': 'none',
  'DEDUCTIBILITY': 'none',
  'FOUNDATION': 'none',
  'ACTIVITY': 'none',
  'ORGANIZATION': 'none',
  'STATUS': 'none',
  'TAX_PERIOD': 'none',
  'ASSET_CD': 'none',
  'INCOME_CD': 'none',
  'FILING_REQ_CD': 'none',
  'PF_FILING_REQ_CD': 'none',
  'ACCT_PD': 'none',
  'ASSET_AMT': 'none',
  'INCOME_AMT': 'none',
  'REVENUE_AMT': 'none',
  'NTEE_CD': 'none',
  'SORT_NAME': 'none'},
 'cultur

In [80]:
groundTruth = pd.read_csv('joinable_columns_gt3_quality.csv')
groundTruth

Unnamed: 0,ds_name,att_name,sizeDistinct1,ds_name_2,att_name_2,sizeDistinct2,joinSize,trueContainment,trueQuality
0,road-ahead-current-road-closures.csv,COMP_DATE,12,road-ahead-upcoming-projects.csv,COMP_DATE,35,6,0.500000,3.0
1,eo_pr.csv,NTEE_CD,302,eo_xx.csv,NTEE_CD,397,168,0.556291,3.0
2,eo_pr.csv,NAME,1270,eo4.csv,NAME,3097,1270,1.000000,4.0
3,eo_pr.csv,ICO,784,eo4.csv,ICO,2050,784,1.000000,4.0
4,eo_pr.csv,STREET,1266,eo4.csv,STREET,3045,1266,1.000000,4.0
...,...,...,...,...,...,...,...,...,...
93,eo_xx.csv,ZIP,634,eo4.csv,ZIP,1745,634,1.000000,4.0
94,eo_xx.csv,NTEE_CD,397,eo4.csv,NTEE_CD,531,397,1.000000,4.0
95,eo4.csv,NTEE_CD,531,eo_xx.csv,NTEE_CD,397,397,0.747646,3.0
96,eo_xx.csv,SORT_NAME,391,eo4.csv,SORT_NAME,667,391,1.000000,4.0


In [81]:
groundTruth = groundTruth[groundTruth['ds_name'].isin(files) & groundTruth['ds_name_2'].isin(files)]
groundTruth = groundTruth.reset_index(drop=True)
groundTruth

Unnamed: 0,ds_name,att_name,sizeDistinct1,ds_name_2,att_name_2,sizeDistinct2,joinSize,trueContainment,trueQuality
0,cultural-spaces.csv,LOCAL_AREA,23,public-art.csv,Neighbourhood,24,21,0.913043,4.0
1,public-art.csv,Neighbourhood,24,cultural-spaces.csv,LOCAL_AREA,23,21,0.875,4.0
2,cultural-spaces.csv,LOCAL_AREA,23,public-art.csv,GeoLocalArea,20,19,0.826087,4.0
3,public-art.csv,GeoLocalArea,20,cultural-spaces.csv,LOCAL_AREA,23,19,0.95,4.0
4,cultural-spaces.csv,LOCAL_AREA,23,schools.csv,Geo Local Area,22,21,0.913043,4.0
5,schools.csv,Geo Local Area,22,cultural-spaces.csv,LOCAL_AREA,23,21,0.954545,4.0
6,cultural-spaces.csv,LOCAL_AREA,23,libraries.csv,Geo Local Area,19,19,0.826087,4.0
7,libraries.csv,Geo Local Area,19,cultural-spaces.csv,LOCAL_AREA,23,19,1.0,4.0
8,public-art.csv,Neighbourhood,24,schools.csv,Geo Local Area,22,20,0.833333,4.0
9,schools.csv,Geo Local Area,22,public-art.csv,Neighbourhood,24,20,0.909091,4.0


In [82]:
groundTruth = groundTruth[['ds_name', 'att_name', 'ds_name_2', 'att_name_2']]
groundTruth

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
0,cultural-spaces.csv,LOCAL_AREA,public-art.csv,Neighbourhood
1,public-art.csv,Neighbourhood,cultural-spaces.csv,LOCAL_AREA
2,cultural-spaces.csv,LOCAL_AREA,public-art.csv,GeoLocalArea
3,public-art.csv,GeoLocalArea,cultural-spaces.csv,LOCAL_AREA
4,cultural-spaces.csv,LOCAL_AREA,schools.csv,Geo Local Area
5,schools.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA
6,cultural-spaces.csv,LOCAL_AREA,libraries.csv,Geo Local Area
7,libraries.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA
8,public-art.csv,Neighbourhood,schools.csv,Geo Local Area
9,schools.csv,Geo Local Area,public-art.csv,Neighbourhood


In [19]:
def remove_extra_quote(string):
    return string.replace('"','').replace("'",'').strip()

In [23]:
left_table_name = []
left_column_name = []
right_table_name = []
right_column_name = []

for left_table in joins_dict.keys():
    for left_column in joins_dict[left_table].keys():
        if(joins_dict[left_table][left_column] != 'none'):
            for predicted_joins in joins_dict[left_table][left_column].split('\n'):
                try:
                    rtn = remove_extra_quote(predicted_joins.split('(')[1].split(',')[0])
                    right_table_name.append(rtn if rtn > left_table else left_table)
                    left_table_name.append(rtn if rtn < left_table else left_table)

                    rcn = remove_extra_quote(predicted_joins.split(')')[0].split(',')[1])
                    
                    right_column_name.append(rcn if rcn > left_column else left_column)
                    left_column_name.append(rcn if rcn < left_column else left_column)
                except:
                    continue

In [24]:
d = {'LEFT_TABLE':left_table_name,
     'LEFT_COLUMN':left_column_name,
     'RIGHT_TABLE':right_table_name,
     'RIGHT_COLUMN':right_column_name}
predicted_joins_df = pd.DataFrame(d)

In [25]:
predicted_joins_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,eo_pr.csv,NAME,libraries.csv,NAME
1,eo_pr.csv,STREET,public-art.csv,SiteAddress
2,eo_pr.csv,CITY,public-art.csv,Neighbourhood
3,eo_pr.csv,CITY,public-art.csv,SiteName
4,eo_pr.csv,CITY,public-art.csv,GeoLocalArea
5,eo_pr.csv,CITY,schools.csv,Geo Local Area
6,eo_pr.csv,CITY,libraries.csv,Geo Local Area
7,cultural-spaces.csv,CITY,eo_pr.csv,LOCAL_AREA
8,cultural-spaces.csv,ADDRESS,public-art.csv,SiteAddress
9,cultural-spaces.csv,GeoLocalArea,public-art.csv,LOCAL_AREA


In [79]:
predicted_joins_df = predicted_joins_df.drop_duplicates()
predicted_joins_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,eo_pr.csv,NAME,libraries.csv,NAME
1,eo_pr.csv,STREET,public-art.csv,SiteAddress
2,eo_pr.csv,CITY,public-art.csv,Neighbourhood
3,eo_pr.csv,CITY,public-art.csv,SiteName
4,eo_pr.csv,CITY,public-art.csv,GeoLocalArea
5,eo_pr.csv,CITY,schools.csv,Geo Local Area
6,eo_pr.csv,CITY,libraries.csv,Geo Local Area
7,cultural-spaces.csv,CITY,eo_pr.csv,LOCAL_AREA
8,cultural-spaces.csv,ADDRESS,public-art.csv,SiteAddress
9,cultural-spaces.csv,GeoLocalArea,public-art.csv,LOCAL_AREA


In [83]:
left_table_name = []
left_column_name = []
right_table_name = []
right_column_name = []
groundTruth_df = pd.DataFrame((), columns=groundTruth.columns)

for i in range(groundTruth.shape[0]):
    rtn = groundTruth.iloc[i, 2]
    ltn = groundTruth.iloc[i, 0]
    right_table_name.append(rtn if rtn > ltn else ltn)
    left_table_name.append(rtn if rtn < ltn else ltn)

    rcn = groundTruth.iloc[i, 3]
    lcn = groundTruth.iloc[i, 1]
    right_column_name.append(rcn if rcn > lcn else lcn)
    left_column_name.append(rcn if rcn < lcn else lcn)

In [90]:
d = {'LEFT_TABLE':left_table_name,
     'LEFT_COLUMN':left_column_name,
     'RIGHT_TABLE':right_table_name,
     'RIGHT_COLUMN':right_column_name}
groundTruth_df = pd.DataFrame(d)
groundTruth_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,cultural-spaces.csv,LOCAL_AREA,public-art.csv,Neighbourhood
1,cultural-spaces.csv,LOCAL_AREA,public-art.csv,Neighbourhood
2,cultural-spaces.csv,GeoLocalArea,public-art.csv,LOCAL_AREA
3,cultural-spaces.csv,GeoLocalArea,public-art.csv,LOCAL_AREA
4,cultural-spaces.csv,Geo Local Area,schools.csv,LOCAL_AREA
5,cultural-spaces.csv,Geo Local Area,schools.csv,LOCAL_AREA
6,cultural-spaces.csv,Geo Local Area,libraries.csv,LOCAL_AREA
7,cultural-spaces.csv,Geo Local Area,libraries.csv,LOCAL_AREA
8,public-art.csv,Geo Local Area,schools.csv,Neighbourhood
9,public-art.csv,Geo Local Area,schools.csv,Neighbourhood


In [92]:
groundTruth_df = groundTruth_df.drop_duplicates()
groundTruth_df = groundTruth_df.reset_index(drop=True)
groundTruth_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,cultural-spaces.csv,LOCAL_AREA,public-art.csv,Neighbourhood
1,cultural-spaces.csv,GeoLocalArea,public-art.csv,LOCAL_AREA
2,cultural-spaces.csv,Geo Local Area,schools.csv,LOCAL_AREA
3,cultural-spaces.csv,Geo Local Area,libraries.csv,LOCAL_AREA
4,public-art.csv,Geo Local Area,schools.csv,Neighbourhood
5,public-art.csv,Geo Local Area,schools.csv,GeoLocalArea
6,libraries.csv,Geo Local Area,public-art.csv,Neighbourhood
7,libraries.csv,Geo Local Area,public-art.csv,GeoLocalArea
8,libraries.csv,Geo Local Area,schools.csv,Geo Local Area


In [27]:
key = []
for i in range(predicted_joins_df.shape[0]):
    key.append("#".join(predicted_joins_df.iloc[i,:]))
predicted_joins_df['KEY'] = key
predicted_joins_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY
0,eo_pr.csv,NAME,libraries.csv,NAME,eo_pr.csv#NAME#libraries.csv#NAME
1,eo_pr.csv,STREET,public-art.csv,SiteAddress,eo_pr.csv#STREET#public-art.csv#SiteAddress
2,eo_pr.csv,CITY,public-art.csv,Neighbourhood,eo_pr.csv#CITY#public-art.csv#Neighbourhood
3,eo_pr.csv,CITY,public-art.csv,SiteName,eo_pr.csv#CITY#public-art.csv#SiteName
4,eo_pr.csv,CITY,public-art.csv,GeoLocalArea,eo_pr.csv#CITY#public-art.csv#GeoLocalArea
5,eo_pr.csv,CITY,schools.csv,Geo Local Area,eo_pr.csv#CITY#schools.csv#Geo Local Area
6,eo_pr.csv,CITY,libraries.csv,Geo Local Area,eo_pr.csv#CITY#libraries.csv#Geo Local Area
7,cultural-spaces.csv,CITY,eo_pr.csv,LOCAL_AREA,cultural-spaces.csv#CITY#eo_pr.csv#LOCAL_AREA
8,cultural-spaces.csv,ADDRESS,public-art.csv,SiteAddress,cultural-spaces.csv#ADDRESS#public-art.csv#Sit...
9,cultural-spaces.csv,GeoLocalArea,public-art.csv,LOCAL_AREA,cultural-spaces.csv#GeoLocalArea#public-art.cs...


In [106]:
predicted_joins_df.to_csv('Description_test/predicted_joins_df_20241030.csv', index=False)

In [94]:
key = []
for i in range(groundTruth_df.shape[0]):
    key.append("#".join(groundTruth_df.iloc[i,:]))
groundTruth_df['KEY'] = key
groundTruth_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY
0,cultural-spaces.csv,LOCAL_AREA,public-art.csv,Neighbourhood,cultural-spaces.csv#LOCAL_AREA#public-art.csv#...
1,cultural-spaces.csv,GeoLocalArea,public-art.csv,LOCAL_AREA,cultural-spaces.csv#GeoLocalArea#public-art.cs...
2,cultural-spaces.csv,Geo Local Area,schools.csv,LOCAL_AREA,cultural-spaces.csv#Geo Local Area#schools.csv...
3,cultural-spaces.csv,Geo Local Area,libraries.csv,LOCAL_AREA,cultural-spaces.csv#Geo Local Area#libraries.c...
4,public-art.csv,Geo Local Area,schools.csv,Neighbourhood,public-art.csv#Geo Local Area#schools.csv#Neig...
5,public-art.csv,Geo Local Area,schools.csv,GeoLocalArea,public-art.csv#Geo Local Area#schools.csv#GeoL...
6,libraries.csv,Geo Local Area,public-art.csv,Neighbourhood,libraries.csv#Geo Local Area#public-art.csv#Ne...
7,libraries.csv,Geo Local Area,public-art.csv,GeoLocalArea,libraries.csv#Geo Local Area#public-art.csv#Ge...
8,libraries.csv,Geo Local Area,schools.csv,Geo Local Area,libraries.csv#Geo Local Area#schools.csv#Geo L...


In [107]:
groundTruth_df.to_csv('Description_test/groundTruth_df_20241030.csv', index=False)

In [103]:
tp = 0
fp = 0
fn = 0

for i in range(len(predicted_joins_df)):
    if(predicted_joins_df.iloc[i,4] in groundTruth_df.iloc[:,4].values):
        tp += 1
    else:
        fp += 1
    
for i in range(len(groundTruth_df)):
    if(groundTruth_df.iloc[i,4] not in predicted_joins_df.iloc[:,4].values):
        fn += 1
    
precision = (tp)/(tp+fp)
recall = (tp)/(tp+fn)
f1_score = 2 * (precision*recall)/(precision+recall)

print(precision, recall, f1_score)

0.3103448275862069 1.0 0.4736842105263158


In [37]:
gt = pd.read_csv('groundTruth_testbedXS.csv')
gt

Unnamed: 0,ds_name,att_name,sizeDistinct1,ds_name_2,att_name_2,sizeDistinct2,joinSize,trueContainment,trueQuality
0,property-tie-lines.csv,Geom,3311,datasets_579296_1047868_authors.csv,NAME,107,0,0.0,0.0
1,datasets_579296_1047868_authors.csv,NAME,107,property-tie-lines.csv,Geom,3311,0,0.0,0.0
2,property-tie-lines.csv,P_ANNOTATION_LBL_ID,3311,datasets_579296_1047868_authors.csv,NAME,107,0,0.0,0.0
3,datasets_579296_1047868_authors.csv,NAME,107,property-tie-lines.csv,P_ANNOTATION_LBL_ID,3311,0,0.0,0.0
4,property-tie-lines.csv,Geom,3311,road-ahead-current-road-closures.csv,PROJECT,17,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
23935,community-centres.csv,URLLINK,26,eo4.csv,SORT_NAME,667,0,0.0,0.0
23936,eo4.csv,SORT_NAME,667,community-centres.csv,Geom,27,0,0.0,0.0
23937,community-centres.csv,Geom,27,eo4.csv,SORT_NAME,667,0,0.0,0.0
23938,eo4.csv,SORT_NAME,667,community-centres.csv,Geo Local Area,18,0,0.0,0.0


In [38]:
gt = gt[gt['ds_name'].isin(files) & gt['ds_name_2'].isin(files)]
gt = gt.reset_index(drop=True)
gt

Unnamed: 0,ds_name,att_name,sizeDistinct1,ds_name_2,att_name_2,sizeDistinct2,joinSize,trueContainment,trueQuality
0,eo_pr.csv,NAME,1270,cultural-spaces.csv,CULTURAL_SPACE_NAME,463,0,0.000000,0.0
1,cultural-spaces.csv,CULTURAL_SPACE_NAME,463,eo_pr.csv,NAME,1270,0,0.000000,0.0
2,eo_pr.csv,NAME,1270,cultural-spaces.csv,WEBSITE,350,0,0.000000,0.0
3,cultural-spaces.csv,WEBSITE,350,eo_pr.csv,NAME,1270,0,0.000000,0.0
4,eo_pr.csv,NAME,1270,cultural-spaces.csv,TYPE,8,0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...
1447,libraries.csv,URLLINK,22,schools.csv,Geo Local Area,22,0,0.000000,0.0
1448,schools.csv,Geo Local Area,22,libraries.csv,Geom,21,0,0.000000,0.0
1449,libraries.csv,Geom,21,schools.csv,Geo Local Area,22,0,0.000000,0.0
1450,schools.csv,Geo Local Area,22,libraries.csv,Geo Local Area,19,19,0.863636,4.0


In [39]:
gt[gt['trueQuality'] >= 0]

Unnamed: 0,ds_name,att_name,sizeDistinct1,ds_name_2,att_name_2,sizeDistinct2,joinSize,trueContainment,trueQuality
0,eo_pr.csv,NAME,1270,cultural-spaces.csv,CULTURAL_SPACE_NAME,463,0,0.000000,0.0
1,cultural-spaces.csv,CULTURAL_SPACE_NAME,463,eo_pr.csv,NAME,1270,0,0.000000,0.0
2,eo_pr.csv,NAME,1270,cultural-spaces.csv,WEBSITE,350,0,0.000000,0.0
3,cultural-spaces.csv,WEBSITE,350,eo_pr.csv,NAME,1270,0,0.000000,0.0
4,eo_pr.csv,NAME,1270,cultural-spaces.csv,TYPE,8,0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...
1447,libraries.csv,URLLINK,22,schools.csv,Geo Local Area,22,0,0.000000,0.0
1448,schools.csv,Geo Local Area,22,libraries.csv,Geom,21,0,0.000000,0.0
1449,libraries.csv,Geom,21,schools.csv,Geo Local Area,22,0,0.000000,0.0
1450,schools.csv,Geo Local Area,22,libraries.csv,Geo Local Area,19,19,0.863636,4.0


In [40]:
left_table_name = []
left_column_name = []
right_table_name = []
right_column_name = []
gt_df = pd.DataFrame((), columns=gt.columns)

for i in range(gt.shape[0]):
    rtn = gt.iloc[i, 3]
    ltn = gt.iloc[i, 0]
    right_table_name.append(rtn if rtn > ltn else ltn)
    left_table_name.append(rtn if rtn < ltn else ltn)

    rcn = gt.iloc[i, 4]
    lcn = gt.iloc[i, 1]
    right_column_name.append(rcn if rcn > lcn else lcn)
    left_column_name.append(rcn if rcn < lcn else lcn)

In [41]:
d = {'LEFT_TABLE':left_table_name,
     'LEFT_COLUMN':left_column_name,
     'RIGHT_TABLE':right_table_name,
     'RIGHT_COLUMN':right_column_name}
gt_df = pd.DataFrame(d)
gt_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo_pr.csv,NAME
1,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo_pr.csv,NAME
2,cultural-spaces.csv,NAME,eo_pr.csv,WEBSITE
3,cultural-spaces.csv,NAME,eo_pr.csv,WEBSITE
4,cultural-spaces.csv,NAME,eo_pr.csv,TYPE
...,...,...,...,...
1447,libraries.csv,Geo Local Area,schools.csv,URLLINK
1448,libraries.csv,Geo Local Area,schools.csv,Geom
1449,libraries.csv,Geo Local Area,schools.csv,Geom
1450,libraries.csv,Geo Local Area,schools.csv,Geo Local Area


In [42]:
gt_df = gt_df.drop_duplicates()
gt_df = gt_df.reset_index(drop=True)
gt_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo_pr.csv,NAME
1,cultural-spaces.csv,NAME,eo_pr.csv,WEBSITE
2,cultural-spaces.csv,NAME,eo_pr.csv,TYPE
3,cultural-spaces.csv,NAME,eo_pr.csv,PRIMARY_USE
4,cultural-spaces.csv,ADDRESS,eo_pr.csv,NAME
...,...,...,...,...
716,libraries.csv,Geom,schools.csv,Geom
717,libraries.csv,Geo Local Area,schools.csv,Geom
718,libraries.csv,Geo Local Area,schools.csv,NAME
719,libraries.csv,Geo Local Area,schools.csv,URLLINK


In [43]:
key = []
for i in range(gt_df.shape[0]):
    key.append("#".join(gt_df.iloc[i,:]))
gt_df['KEY'] = key
gt_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY
0,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo_pr.csv,NAME,cultural-spaces.csv#CULTURAL_SPACE_NAME#eo_pr....
1,cultural-spaces.csv,NAME,eo_pr.csv,WEBSITE,cultural-spaces.csv#NAME#eo_pr.csv#WEBSITE
2,cultural-spaces.csv,NAME,eo_pr.csv,TYPE,cultural-spaces.csv#NAME#eo_pr.csv#TYPE
3,cultural-spaces.csv,NAME,eo_pr.csv,PRIMARY_USE,cultural-spaces.csv#NAME#eo_pr.csv#PRIMARY_USE
4,cultural-spaces.csv,ADDRESS,eo_pr.csv,NAME,cultural-spaces.csv#ADDRESS#eo_pr.csv#NAME
...,...,...,...,...,...
716,libraries.csv,Geom,schools.csv,Geom,libraries.csv#Geom#schools.csv#Geom
717,libraries.csv,Geo Local Area,schools.csv,Geom,libraries.csv#Geo Local Area#schools.csv#Geom
718,libraries.csv,Geo Local Area,schools.csv,NAME,libraries.csv#Geo Local Area#schools.csv#NAME
719,libraries.csv,Geo Local Area,schools.csv,URLLINK,libraries.csv#Geo Local Area#schools.csv#URLLINK


In [44]:
tp = 0
fp = 0
fn = 0

for i in range(len(predicted_joins_df)):
    if(predicted_joins_df.iloc[i,4] in gt_df.iloc[:,4].values):
        tp += 1
    else:
        fp += 1
    
for i in range(len(gt_df)):
    if(gt_df.iloc[i,4] not in predicted_joins_df.iloc[:,4].values):
        fn += 1
    
precision = (tp)/(tp+fp)
recall = (tp)/(tp+fn)
f1_score = 2 * (precision*recall)/(precision+recall)

print(precision, recall, f1_score)

1.0 0.0546448087431694 0.10362694300518134
