In [6]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [13]:
pd.set_option('display.max_rows', None)

In [7]:
path = '/home/manoelflorencio/cta_for_jd/testbedXS'
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [3]:
def generate_prompt_predict_possible_joins(target_description, candidate_descriptions, candidate_tables):

    system_msg = f"""
            Given one target column description and many candidate column descriptions, predict all the pairs (candidate table name, candidate 
            description column name) that could be joined.

            Task: Look carefully at the target column description and candidate column descriptions and use this information to identify 
            patterns and relationships between the descriptions, the result must be a list of all the JOINable pairs found. If no joinable pair is 
            found the result should be just the word "none".

            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
            based on related columns between them.
            
            Instructions: 
                1. Look at the target description given to you. 
                2. Look at the candidate descriptions in detail. 
                3. Predict if the target column description belongs to a column that may be used in join. 
                4. Select all the highly likely JOINs between these columns based only on these descriptions. Disregard the column names.
                #####

            Example 1:
                Target description: this column represents a worker's id
                Candidate tables: ['salary.csv','salary.csv','hospital.csv']
                Candidate description: ['the column worker_id represents the worker's id', 'this column represents a worker's salary', 'this column represents a hospital location']
                Possible JOINs: ('salary.csv', 'worker_id')
            """
    
    user_msg = f"""Target description:      {target_description}
                   Candidate table:         {candidate_tables.values}
                   Candidate descriptions:  {candidate_descriptions.values}
                   Possible JOINs: """.strip()
    
    return system_msg, user_msg

In [4]:
def generate_predictions(target_descriptions, candidate_tables, candidate_descriptions, client):
    
    system_msg_predict_joins, user_msg_predict_joins = generate_prompt_predict_possible_joins(target_descriptions, candidate_tables, candidate_descriptions)
    result = execute_prompt(client, system_msg_predict_joins, user_msg_predict_joins)
    joins = result.choices[0].message.content.split('Possible JOINs: ')[-1].strip()
    
    return joins

In [5]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg):
    completion = client.chat.completions.create(
                                            model="gpt-4o",
                                            temperature=0.2,
                                            messages=[
                                                        {
                                                            "role": "system", 
                                                             "content": f"{system_msg}"
                                                        },
                                                        {
                                                            "role": "user",
                                                            "content": f"{user_msg}"
                                                        }
                                                    ]
                                            )
    return completion

In [6]:
client = OpenAI()

In [7]:
descriptions = pd.read_csv('Description_test/all_descriptions.csv')
descriptions.head()

Unnamed: 0,TableName,Column,Description
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...
1,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...
2,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres..."
3,statewise-census-data-in-india-1901-2011.csv,LOCATION_F5,"The column ""LOCATION_F5"" in the table appears ..."
4,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,"The ""LOCATION_ID"" column contains identifiers ..."


In [10]:
# files = ['eo_pr.csv', 'cultural-spaces.csv', 'public-art.csv', 'libraries.csv', 'schools.csv']
files = [file.split('/')[-1] for file in glob.glob('datasets/*')]
files

['statewise-census-data-in-india-1901-2011.csv',
 'road-ahead-current-road-closures.csv',
 'property-tie-lines.csv',
 'public-art.csv',
 'gvrd-sewer-trunk-mains.csv',
 'SCS_Staff_Salaries_data_30th_June 2010.csv',
 'schools.csv',
 'rental-standards-current-issues.csv',
 'datasets_579296_1047868_authors.csv',
 'survey_results_schema.csv',
 'animal-control-inventory-lost-and-found.csv',
 'glassdoor_wwfu_val_captions.csv',
 'eo_xx.csv',
 'community-gardens-and-food-trees.csv',
 'road-ahead-upcoming-projects.csv',
 'libraries.csv',
 'cultural-spaces.csv',
 'datasets_517172_952401_train.csv',
 'public-art-artists.csv',
 'eo4.csv',
 'currency_exchange.csv',
 'eo_pr.csv',
 'road-ahead-projects-under-construction.csv',
 'ability_ids.csv',
 'population-by-governorate-citizenship-and-gender.csv',
 'community-centres.csv',
 'street-intersections.csv',
 'population-census-of-botswana-2011.csv']

In [9]:
sample_descriptions = descriptions[descriptions['TableName'].isin(files)]
sample_descriptions = sample_descriptions.reset_index(drop=True)
sample_descriptions.head()

Unnamed: 0,TableName,Column,Description
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...
1,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...
2,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres..."
3,statewise-census-data-in-india-1901-2011.csv,LOCATION_F5,"The column ""LOCATION_F5"" in the table appears ..."
4,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,"The ""LOCATION_ID"" column contains identifiers ..."


In [10]:
sample_descriptions['TableName'].value_counts()

TableName
eo4.csv                                                 28
eo_pr.csv                                               28
eo_xx.csv                                               28
public-art.csv                                          19
community-gardens-and-food-trees.csv                    19
cultural-spaces.csv                                     12
SCS_Staff_Salaries_data_30th_June 2010.csv              10
public-art-artists.csv                                   9
statewise-census-data-in-india-1901-2011.csv             9
population-census-of-botswana-2011.csv                   8
rental-standards-current-issues.csv                      8
gvrd-sewer-trunk-mains.csv                               7
animal-control-inventory-lost-and-found.csv              7
datasets_579296_1047868_authors.csv                      6
road-ahead-projects-under-construction.csv               6
road-ahead-upcoming-projects.csv                         6
road-ahead-current-road-closures.csv          

In [11]:
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')

In [13]:
joins_dict = {}

for file in files:
    info  = df_dsInformation[df_dsInformation['filename'] == file]                
    table = pd.read_csv(f'datasets/{file}', delimiter=info['delimiter'].values[0])

    table_descriptions = sample_descriptions[sample_descriptions['TableName'] == file]
    candidate_tables       = sample_descriptions[sample_descriptions['TableName'] != file].iloc[:,0]
    candidate_descriptions = sample_descriptions[sample_descriptions['TableName'] != file].iloc[:,2]

    joins_dict[f'{file}'] = {}
    
    for i in tqdm(range(table_descriptions.shape[0])):
        target_description = table_descriptions.iloc[i, 2]
        joins = generate_predictions(target_description, candidate_tables, candidate_descriptions, client)
        joins_dict[f'{file}'][f'{table_descriptions.iloc[i, 1]}'] = joins

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:40<00:00,  4.50s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:25<00:00,  4.24s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.39s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [01:14<00:00,  3.92s/it]
100%|███████████████████████████████████████████

In [14]:
with open('Description_test/join_prediction_v4.json', 'w') as file:
    json.dump(joins_dict, file)

In [74]:
with open('Description_test/join_prediction_v4.json', 'r') as file:
    joins_dict = json.load(file)

In [49]:
joins_dict.keys()

dict_keys(['statewise-census-data-in-india-1901-2011.csv', 'road-ahead-current-road-closures.csv', 'property-tie-lines.csv', 'public-art.csv', 'gvrd-sewer-trunk-mains.csv', 'SCS_Staff_Salaries_data_30th_June 2010.csv', 'schools.csv', 'rental-standards-current-issues.csv', 'datasets_579296_1047868_authors.csv', 'survey_results_schema.csv', 'animal-control-inventory-lost-and-found.csv', 'glassdoor_wwfu_val_captions.csv', 'eo_xx.csv', 'community-gardens-and-food-trees.csv', 'road-ahead-upcoming-projects.csv', 'libraries.csv', 'cultural-spaces.csv', 'datasets_517172_952401_train.csv', 'public-art-artists.csv', 'eo4.csv', 'currency_exchange.csv', 'eo_pr.csv', 'road-ahead-projects-under-construction.csv', 'ability_ids.csv', 'population-by-governorate-citizenship-and-gender.csv', 'community-centres.csv', 'street-intersections.csv', 'population-census-of-botswana-2011.csv'])

In [84]:
joins_dict['community-gardens-and-food-trees.csv']

{'MAPID': 'none',
 'YEAR_CREATED': 'none',
 'NAME': 'none',
 'STREET_NUMBER': 'none',
 'STREET_DIRECTION': 'none',
 'STREET_NAME': 'none',
 'STREET_TYPE': 'none',
 'MERGED_ADDRESS': 'none',
 'NUMBER_OF_PLOTS': 'none',
 'NUMBER_OF_FOOD_TREES': 'none',
 'NOTES': 'none',
 'FOOD_TREE_VARIETIES': 'none',
 'OTHER_FOOD_ASSETS': 'none',
 'JURISDICTION': 'none',
 'STEWARD_OR_MANAGING_ORGANIZATION': 'none',
 'PUBLIC_E_MAIL': 'none',
 'WEBSITE': "('cultural-spaces.csv', 'WEBSITE')",
 'Geo Local Area': "('community-centres.csv', 'Geo Local Area')\n('cultural-spaces.csv', 'Geo Local Area')\n('street-intersections.csv', 'Geo Local Area')",
 'Geom': "('road-ahead-current-road-closures.csv', 'Geom')\n('road-ahead-upcoming-projects.csv', 'Geom')\n('road-ahead-projects-under-construction.csv', 'Geom')\n('community-centres.csv', 'Geom')\n('street-intersections.csv', 'Geom')"}

In [16]:
joins_dict

{'statewise-census-data-in-india-1901-2011.csv': {'FREQUENCY': 'none',
  'DATE': "('road-ahead-current-road-closures.csv', 'COMP_DATE')\n('road-ahead-upcoming-projects.csv', 'COMP_DATE')\n('road-ahead-projects-under-construction.csv', 'COMP_DATE')",
  'LOCATION_NAME': 'none',
  'LOCATION_F5': 'none',
  'LOCATION_ID': "('road-ahead-current-road-closures.csv', 'LOCATION')\n('road-ahead-upcoming-projects.csv', 'LOCATION')\n('road-ahead-projects-under-construction.csv', 'LOCATION')",
  'LOCATION_REGIONID': "('population-census-of-botswana-2011.csv', 'REGION_REGIONID')",
  'VARIABLE_NAME': 'none',
  'VALUE': 'none',
  'VARIABLE_ID': 'none'},
 'road-ahead-current-road-closures.csv': {'PROJECT': 'none',
  'STREET': 'none',
  'LOCATION': 'none',
  'COMP_DATE': "('road-ahead-upcoming-projects.csv', 'COMP_DATE')\n('road-ahead-projects-under-construction.csv', 'COMP_DATE')",
  'URL_LINK': "('road-ahead-upcoming-projects.csv', 'URL_LINK')",
  'Geom': 'none'},
 'property-tie-lines.csv': {'Geom': "(

In [63]:
groundTruth = pd.read_csv('joinable_columns_gt3_quality.csv')
groundTruth

Unnamed: 0,ds_name,att_name,sizeDistinct1,ds_name_2,att_name_2,sizeDistinct2,joinSize,trueContainment,trueQuality
0,road-ahead-current-road-closures.csv,COMP_DATE,12,road-ahead-upcoming-projects.csv,COMP_DATE,35,6,0.5,3.0
1,eo_pr.csv,NTEE_CD,302,eo_xx.csv,NTEE_CD,397,168,0.556291,3.0
2,eo_pr.csv,NAME,1270,eo4.csv,NAME,3097,1270,1.0,4.0
3,eo_pr.csv,ICO,784,eo4.csv,ICO,2050,784,1.0,4.0
4,eo_pr.csv,STREET,1266,eo4.csv,STREET,3045,1266,1.0,4.0
5,eo_pr.csv,CITY,107,eo4.csv,CITY,322,107,1.0,4.0
6,eo_pr.csv,ZIP,1111,eo4.csv,ZIP,1745,1111,1.0,4.0
7,eo4.csv,ZIP,1745,eo_pr.csv,ZIP,1111,1111,0.636676,3.0
8,eo_pr.csv,NTEE_CD,302,eo4.csv,NTEE_CD,531,302,1.0,4.0
9,eo4.csv,NTEE_CD,531,eo_pr.csv,NTEE_CD,302,302,0.568738,3.0


In [64]:
groundTruth = groundTruth[groundTruth['ds_name'].isin(files) & groundTruth['ds_name_2'].isin(files)]
groundTruth = groundTruth.reset_index(drop=True)
groundTruth

Unnamed: 0,ds_name,att_name,sizeDistinct1,ds_name_2,att_name_2,sizeDistinct2,joinSize,trueContainment,trueQuality
0,road-ahead-current-road-closures.csv,COMP_DATE,12,road-ahead-upcoming-projects.csv,COMP_DATE,35,6,0.5,3.0
1,eo_pr.csv,NTEE_CD,302,eo_xx.csv,NTEE_CD,397,168,0.556291,3.0
2,eo_pr.csv,NAME,1270,eo4.csv,NAME,3097,1270,1.0,4.0
3,eo_pr.csv,ICO,784,eo4.csv,ICO,2050,784,1.0,4.0
4,eo_pr.csv,STREET,1266,eo4.csv,STREET,3045,1266,1.0,4.0
5,eo_pr.csv,CITY,107,eo4.csv,CITY,322,107,1.0,4.0
6,eo_pr.csv,ZIP,1111,eo4.csv,ZIP,1745,1111,1.0,4.0
7,eo4.csv,ZIP,1745,eo_pr.csv,ZIP,1111,1111,0.636676,3.0
8,eo_pr.csv,NTEE_CD,302,eo4.csv,NTEE_CD,531,302,1.0,4.0
9,eo4.csv,NTEE_CD,531,eo_pr.csv,NTEE_CD,302,302,0.568738,3.0


In [66]:
groundTruth = groundTruth[['ds_name', 'att_name', 'ds_name_2', 'att_name_2']]
groundTruth

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
0,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-upcoming-projects.csv,COMP_DATE
1,eo_pr.csv,NTEE_CD,eo_xx.csv,NTEE_CD
2,eo_pr.csv,NAME,eo4.csv,NAME
3,eo_pr.csv,ICO,eo4.csv,ICO
4,eo_pr.csv,STREET,eo4.csv,STREET
5,eo_pr.csv,CITY,eo4.csv,CITY
6,eo_pr.csv,ZIP,eo4.csv,ZIP
7,eo4.csv,ZIP,eo_pr.csv,ZIP
8,eo_pr.csv,NTEE_CD,eo4.csv,NTEE_CD
9,eo4.csv,NTEE_CD,eo_pr.csv,NTEE_CD


In [19]:
def remove_extra_quote(string):
    return string.replace('"','').replace("'",'').strip()

In [75]:
left_table_name = []
left_column_name = []
right_table_name = []
right_column_name = []

for left_table in joins_dict.keys():
    for left_column in joins_dict[left_table].keys():
        if(joins_dict[left_table][left_column] != 'none'):
            for predicted_joins in joins_dict[left_table][left_column].split('\n'):
                try:
                    rtn = remove_extra_quote(predicted_joins.split('(')[1].split(',')[0])
                    right_table_name.append(rtn if rtn > left_table else left_table)
                    left_table_name.append(rtn if rtn < left_table else left_table)

                    rcn = remove_extra_quote(predicted_joins.split(')')[0].split(',')[1])
                    
                    right_column_name.append(rcn if rtn > left_table else left_column)
                    left_column_name.append(rcn if rtn < left_table else left_column)
                except:
                    continue

In [76]:
d = {'LEFT_TABLE':left_table_name,
     'LEFT_COLUMN':left_column_name,
     'RIGHT_TABLE':right_table_name,
     'RIGHT_COLUMN':right_column_name}
predicted_joins_df = pd.DataFrame(d)

In [77]:
predicted_joins_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,road-ahead-current-road-closures.csv,COMP_DATE,statewise-census-data-in-india-1901-2011.csv,DATE
1,road-ahead-upcoming-projects.csv,COMP_DATE,statewise-census-data-in-india-1901-2011.csv,DATE
2,road-ahead-projects-under-construction.csv,COMP_DATE,statewise-census-data-in-india-1901-2011.csv,DATE
3,road-ahead-current-road-closures.csv,LOCATION,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID
4,road-ahead-upcoming-projects.csv,LOCATION,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID
5,road-ahead-projects-under-construction.csv,LOCATION,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID
6,population-census-of-botswana-2011.csv,REGION_REGIONID,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID
7,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-upcoming-projects.csv,COMP_DATE
8,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-projects-under-construction.csv,COMP_DATE
9,road-ahead-current-road-closures.csv,URL_LINK,road-ahead-upcoming-projects.csv,URL_LINK


In [78]:
predicted_joins_df = predicted_joins_df.drop_duplicates()
predicted_joins_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,road-ahead-current-road-closures.csv,COMP_DATE,statewise-census-data-in-india-1901-2011.csv,DATE
1,road-ahead-upcoming-projects.csv,COMP_DATE,statewise-census-data-in-india-1901-2011.csv,DATE
2,road-ahead-projects-under-construction.csv,COMP_DATE,statewise-census-data-in-india-1901-2011.csv,DATE
3,road-ahead-current-road-closures.csv,LOCATION,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID
4,road-ahead-upcoming-projects.csv,LOCATION,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID
5,road-ahead-projects-under-construction.csv,LOCATION,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID
6,population-census-of-botswana-2011.csv,REGION_REGIONID,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID
7,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-upcoming-projects.csv,COMP_DATE
8,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-projects-under-construction.csv,COMP_DATE
9,road-ahead-current-road-closures.csv,URL_LINK,road-ahead-upcoming-projects.csv,URL_LINK


In [67]:
left_table_name = []
left_column_name = []
right_table_name = []
right_column_name = []
groundTruth_df = pd.DataFrame((), columns=groundTruth.columns)

for i in range(groundTruth.shape[0]):
    rtn = groundTruth.iloc[i, 2]
    ltn = groundTruth.iloc[i, 0]
    right_table_name.append(rtn if rtn > ltn else ltn)
    left_table_name.append(rtn if rtn < ltn else ltn)

    rcn = groundTruth.iloc[i, 3]
    lcn = groundTruth.iloc[i, 1]
    right_column_name.append(rcn if rtn > ltn else lcn)
    left_column_name.append(rcn if rtn < ltn else lcn)

In [68]:
d = {'LEFT_TABLE':left_table_name,
     'LEFT_COLUMN':left_column_name,
     'RIGHT_TABLE':right_table_name,
     'RIGHT_COLUMN':right_column_name}
groundTruth_df = pd.DataFrame(d)
groundTruth_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-upcoming-projects.csv,COMP_DATE
1,eo_pr.csv,NTEE_CD,eo_xx.csv,NTEE_CD
2,eo4.csv,NAME,eo_pr.csv,NAME
3,eo4.csv,ICO,eo_pr.csv,ICO
4,eo4.csv,STREET,eo_pr.csv,STREET
5,eo4.csv,CITY,eo_pr.csv,CITY
6,eo4.csv,ZIP,eo_pr.csv,ZIP
7,eo4.csv,ZIP,eo_pr.csv,ZIP
8,eo4.csv,NTEE_CD,eo_pr.csv,NTEE_CD
9,eo4.csv,NTEE_CD,eo_pr.csv,NTEE_CD


In [69]:
groundTruth_df = groundTruth_df.drop_duplicates()
groundTruth_df = groundTruth_df.reset_index(drop=True)
groundTruth_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-upcoming-projects.csv,COMP_DATE
1,eo_pr.csv,NTEE_CD,eo_xx.csv,NTEE_CD
2,eo4.csv,NAME,eo_pr.csv,NAME
3,eo4.csv,ICO,eo_pr.csv,ICO
4,eo4.csv,STREET,eo_pr.csv,STREET
5,eo4.csv,CITY,eo_pr.csv,CITY
6,eo4.csv,ZIP,eo_pr.csv,ZIP
7,eo4.csv,NTEE_CD,eo_pr.csv,NTEE_CD
8,eo4.csv,SORT_NAME,eo_pr.csv,SORT_NAME
9,cultural-spaces.csv,LOCAL_AREA,street-intersections.csv,Geo Local Area


In [79]:
key = []
for i in range(predicted_joins_df.shape[0]):
    key.append("#".join(predicted_joins_df.iloc[i,:]))
predicted_joins_df['KEY'] = key
predicted_joins_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_joins_df['KEY'] = key


Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY
0,road-ahead-current-road-closures.csv,COMP_DATE,statewise-census-data-in-india-1901-2011.csv,DATE,road-ahead-current-road-closures.csv#COMP_DATE...
1,road-ahead-upcoming-projects.csv,COMP_DATE,statewise-census-data-in-india-1901-2011.csv,DATE,road-ahead-upcoming-projects.csv#COMP_DATE#sta...
2,road-ahead-projects-under-construction.csv,COMP_DATE,statewise-census-data-in-india-1901-2011.csv,DATE,road-ahead-projects-under-construction.csv#COM...
3,road-ahead-current-road-closures.csv,LOCATION,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,road-ahead-current-road-closures.csv#LOCATION#...
4,road-ahead-upcoming-projects.csv,LOCATION,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,road-ahead-upcoming-projects.csv#LOCATION#stat...
5,road-ahead-projects-under-construction.csv,LOCATION,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,road-ahead-projects-under-construction.csv#LOC...
6,population-census-of-botswana-2011.csv,REGION_REGIONID,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-census-of-botswana-2011.csv#REGION_...
7,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-upcoming-projects.csv,COMP_DATE,road-ahead-current-road-closures.csv#COMP_DATE...
8,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-projects-under-construction.csv,COMP_DATE,road-ahead-current-road-closures.csv#COMP_DATE...
9,road-ahead-current-road-closures.csv,URL_LINK,road-ahead-upcoming-projects.csv,URL_LINK,road-ahead-current-road-closures.csv#URL_LINK#...


In [80]:
predicted_joins_df.to_csv('Description_test/predicted_joins_df_20241104.csv', index=False)

In [45]:
predicted_joins_df = pd.read_csv('Description_test/predicted_joins_df_20241104.csv')

In [71]:
key = []
for i in range(groundTruth_df.shape[0]):
    key.append("#".join(groundTruth_df.iloc[i,:]))
groundTruth_df['KEY'] = key
groundTruth_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY
0,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-upcoming-projects.csv,COMP_DATE,road-ahead-current-road-closures.csv#COMP_DATE...
1,eo_pr.csv,NTEE_CD,eo_xx.csv,NTEE_CD,eo_pr.csv#NTEE_CD#eo_xx.csv#NTEE_CD
2,eo4.csv,NAME,eo_pr.csv,NAME,eo4.csv#NAME#eo_pr.csv#NAME
3,eo4.csv,ICO,eo_pr.csv,ICO,eo4.csv#ICO#eo_pr.csv#ICO
4,eo4.csv,STREET,eo_pr.csv,STREET,eo4.csv#STREET#eo_pr.csv#STREET
5,eo4.csv,CITY,eo_pr.csv,CITY,eo4.csv#CITY#eo_pr.csv#CITY
6,eo4.csv,ZIP,eo_pr.csv,ZIP,eo4.csv#ZIP#eo_pr.csv#ZIP
7,eo4.csv,NTEE_CD,eo_pr.csv,NTEE_CD,eo4.csv#NTEE_CD#eo_pr.csv#NTEE_CD
8,eo4.csv,SORT_NAME,eo_pr.csv,SORT_NAME,eo4.csv#SORT_NAME#eo_pr.csv#SORT_NAME
9,cultural-spaces.csv,LOCAL_AREA,street-intersections.csv,Geo Local Area,cultural-spaces.csv#LOCAL_AREA#street-intersec...


In [24]:
groundTruth_df.to_csv('Description_test/groundTruth_df_20241104.csv', index=False)

# Calculando Métricas (Precisão, Revocação e F1-Score)

In [82]:
tp = 0
fp = 0
fn = 0

for i in range(len(predicted_joins_df)):
    if(predicted_joins_df.iloc[i,4] in groundTruth_df.iloc[:,4].values):
        tp += 1
    else:
        fp += 1
    
for i in range(len(groundTruth_df)):
    if(groundTruth_df.iloc[i,4] not in predicted_joins_df.iloc[:,4].values):
        fn += 1
    
precision = (tp)/(tp+fp)
recall = (tp)/(tp+fn)
f1_score = 2 * (precision*recall)/(precision+recall)

print(precision, recall, f1_score)

0.22321428571428573 0.4716981132075472 0.30303030303030304


# Linhas Falsos Positivos

In [83]:
ids = []
for i in range(len(predicted_joins_df)):
    if(predicted_joins_df.iloc[i,4] not in groundTruth_df.iloc[:,4].values):
        ids.append(i)
fp_rows = predicted_joins_df.iloc[ids,:]
fp_rows

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY
0,road-ahead-current-road-closures.csv,COMP_DATE,statewise-census-data-in-india-1901-2011.csv,DATE,road-ahead-current-road-closures.csv#COMP_DATE...
1,road-ahead-upcoming-projects.csv,COMP_DATE,statewise-census-data-in-india-1901-2011.csv,DATE,road-ahead-upcoming-projects.csv#COMP_DATE#sta...
2,road-ahead-projects-under-construction.csv,COMP_DATE,statewise-census-data-in-india-1901-2011.csv,DATE,road-ahead-projects-under-construction.csv#COM...
3,road-ahead-current-road-closures.csv,LOCATION,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,road-ahead-current-road-closures.csv#LOCATION#...
4,road-ahead-upcoming-projects.csv,LOCATION,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,road-ahead-upcoming-projects.csv#LOCATION#stat...
5,road-ahead-projects-under-construction.csv,LOCATION,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,road-ahead-projects-under-construction.csv#LOC...
6,population-census-of-botswana-2011.csv,REGION_REGIONID,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-census-of-botswana-2011.csv#REGION_...
8,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-projects-under-construction.csv,COMP_DATE,road-ahead-current-road-closures.csv#COMP_DATE...
9,road-ahead-current-road-closures.csv,URL_LINK,road-ahead-upcoming-projects.csv,URL_LINK,road-ahead-current-road-closures.csv#URL_LINK#...
10,property-tie-lines.csv,Geom,road-ahead-current-road-closures.csv,Geom,property-tie-lines.csv#Geom#road-ahead-current...


# Linhas Falsos Negativos

In [61]:
ids = []
for i in range(len(groundTruth_df)):
    if(groundTruth_df.iloc[i,4] not in predicted_joins_df.iloc[:,4].values):
        ids.append(i)
fn_rows = groundTruth_df.iloc[ids,:]
fn_rows

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY
0,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-upcoming-projects.csv,COMP_DATE,road-ahead-current-road-closures.csv#COMP_DATE...
1,eo_pr.csv,NTEE_CD,eo_xx.csv,NTEE_CD,eo_pr.csv#NTEE_CD#eo_xx.csv#NTEE_CD#eo_pr.csv#...
2,eo4.csv,NAME,eo_pr.csv,NAME,eo4.csv#NAME#eo_pr.csv#NAME#eo4.csv#NAME#eo_pr...
3,eo4.csv,ICO,eo_pr.csv,ICO,eo4.csv#ICO#eo_pr.csv#ICO#eo4.csv#ICO#eo_pr.cs...
4,eo4.csv,STREET,eo_pr.csv,STREET,eo4.csv#STREET#eo_pr.csv#STREET#eo4.csv#STREET...
5,eo4.csv,CITY,eo_pr.csv,CITY,eo4.csv#CITY#eo_pr.csv#CITY#eo4.csv#CITY#eo_pr...
6,eo4.csv,ZIP,eo_pr.csv,ZIP,eo4.csv#ZIP#eo_pr.csv#ZIP#eo4.csv#ZIP#eo_pr.cs...
7,eo4.csv,NTEE_CD,eo_pr.csv,NTEE_CD,eo4.csv#NTEE_CD#eo_pr.csv#NTEE_CD#eo4.csv#NTEE...
8,eo4.csv,SORT_NAME,eo_pr.csv,SORT_NAME,eo4.csv#SORT_NAME#eo_pr.csv#SORT_NAME#eo4.csv#...
9,cultural-spaces.csv,LOCAL_AREA,street-intersections.csv,Geo Local Area,cultural-spaces.csv#LOCAL_AREA#street-intersec...
