In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from pydantic import BaseModel
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
pd.set_option('display.max_rows', None)

In [3]:
path = '/home/manoelflorencio/cta_for_jd/testbedXS'
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [4]:
class join_schema(BaseModel):
    table_name: str
    column_name: str

In [5]:
def generate_prompt_predict_possible_joins(target_description, candidate_descriptions, candidate_tables):

    system_msg = f"""
            Given one target column description and many candidate column descriptions, predict all the pairs (candidate table name, candidate 
            description column name) that could be joined.

            Task: Look carefully at the target column description and candidate column descriptions and use this information to identify 
            patterns and relationships between the descriptions, the result must be a list of all the JOINable pairs found. If no joinable pair is 
            found the result should be just the word "none".

            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
            based on related columns between them.
            
            Instructions: 
                1. Look at the target description given to you. 
                2. Look at the candidate descriptions in detail. 
                3. Predict if the target column description belongs to a column that may be used in join. 
                4. Select all the highly likely JOINs between these columns based only on these descriptions. Disregard the column names.

            Example 1:
                Target description: this column represents a worker's id
                Candidate tables: ['salary.csv','salary.csv','hospital.csv']
                Candidate description: ['the column worker_id represents the worker's id', 'this column represents a worker's salary', 'this column represents a hospital location']
                Possible JOINs: ('salary.csv', 'worker_id')
            """
    
    user_msg = f"""Target description:      {target_description}
                   Candidate table:         {candidate_tables.values}
                   Candidate descriptions:  {candidate_descriptions.values}
                   Possible JOINs: """.strip()
    
    return system_msg, user_msg

In [6]:
def generate_predictions(target_descriptions, candidate_tables, candidate_descriptions, client):
    
    system_msg_predict_joins, user_msg_predict_joins = generate_prompt_predict_possible_joins(target_descriptions, candidate_tables, candidate_descriptions)
    result = execute_prompt(client, system_msg_predict_joins, user_msg_predict_joins)
    joins = result.choices[0].message.content.split('Possible JOINs: ')[-1].strip()
    
    return joins

In [7]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg):
    completion = client.chat.completions.create(
                                            model="gpt-4o",
                                            temperature=0.2,
                                            messages=[
                                                        {
                                                            "role": "system", 
                                                             "content": f"{system_msg}"
                                                        },
                                                        {
                                                            "role": "user",
                                                            "content": f"{user_msg}"
                                                        }
                                                    ]
                                            )
    return completion

In [8]:
client = OpenAI()

In [9]:
descriptions = pd.read_csv('Description_test/all_descriptions.csv')
descriptions.head()

Unnamed: 0,TableName,Column,Description
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...
1,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...
2,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres..."
3,statewise-census-data-in-india-1901-2011.csv,LOCATION_F5,"The column ""LOCATION_F5"" in the table appears ..."
4,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,"The ""LOCATION_ID"" column contains identifiers ..."


In [10]:
# files = ['eo_pr.csv', 'cultural-spaces.csv', 'public-art.csv', 'libraries.csv', 'schools.csv']
files = [file.split('/')[-1] for file in glob.glob('datasets/*')]
files

['statewise-census-data-in-india-1901-2011.csv',
 'road-ahead-current-road-closures.csv',
 'property-tie-lines.csv',
 'public-art.csv',
 'gvrd-sewer-trunk-mains.csv',
 'SCS_Staff_Salaries_data_30th_June 2010.csv',
 'schools.csv',
 'rental-standards-current-issues.csv',
 'datasets_579296_1047868_authors.csv',
 'survey_results_schema.csv',
 'animal-control-inventory-lost-and-found.csv',
 'glassdoor_wwfu_val_captions.csv',
 'eo_xx.csv',
 'community-gardens-and-food-trees.csv',
 'road-ahead-upcoming-projects.csv',
 'libraries.csv',
 'cultural-spaces.csv',
 'datasets_517172_952401_train.csv',
 'public-art-artists.csv',
 'eo4.csv',
 'currency_exchange.csv',
 'eo_pr.csv',
 'road-ahead-projects-under-construction.csv',
 'ability_ids.csv',
 'population-by-governorate-citizenship-and-gender.csv',
 'community-centres.csv',
 'street-intersections.csv',
 'population-census-of-botswana-2011.csv']

In [11]:
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')

In [12]:
joins_dict = {}
step = 20

exclude_files = []
# exclude_files = ['cultural-spaces.csv', 'statewise-census-data-in-india-1901-2011.csv']
# for file in ['cultural-spaces.csv', 'street-intersections.csv', 'public-art.csv']:
# for file in ['cultural-spaces.csv']:
for file in files:
    if(file in exclude_files):
        continue
    
    info  = df_dsInformation[df_dsInformation['filename'] == file]                
    table = pd.read_csv(f'datasets/{file}', delimiter=info['delimiter'].values[0])

    table_descriptions = descriptions[descriptions['TableName'] == file]
    candidate_tables       = descriptions[descriptions['TableName'] != file].iloc[:,0]
    candidate_descriptions = descriptions[descriptions['TableName'] != file].iloc[:,2]

    joins_dict[f'{file}'] = {}
    
    for i in tqdm(range(table_descriptions.shape[0])):

        joins_dict[f'{file}'][f'{table_descriptions.iloc[i, 1]}'] = ''
        
        for j in range(0, len(candidate_tables), step):
            target_description = table_descriptions.iloc[i, 2]

            sample_cadidate_tables = candidate_tables[j:j+step]
            sample_candidate_descriptions = candidate_descriptions[j:j+step]
            
            sample_joins = generate_predictions(target_description, sample_cadidate_tables, sample_candidate_descriptions, client)
            
            if((sample_joins != 'none') or (sample_joins != 'None')):
                joins_dict[f'{file}'][f'{table_descriptions.iloc[i, 1]}'] += ', ' + sample_joins

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [01:25<00:00,  9.49s/it]
  0%|                                                                                                                                                                                                                  | 0/6 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [20]:
with open('Description_test/join_prediction_v7.json', 'w') as file:
    json.dump(joins_dict, file)

In [22]:
with open('Description_test/join_prediction_v7.json', 'r') as file:
    joins_dict = json.load(file)

In [25]:
joins_dict.keys()

dict_keys(['statewise-census-data-in-india-1901-2011.csv', 'cultural-spaces.csv', 'road-ahead-current-road-closures.csv', 'property-tie-lines.csv', 'public-art.csv', 'gvrd-sewer-trunk-mains.csv', 'SCS_Staff_Salaries_data_30th_June 2010.csv', 'schools.csv', 'rental-standards-current-issues.csv', 'datasets_579296_1047868_authors.csv', 'survey_results_schema.csv', 'animal-control-inventory-lost-and-found.csv', 'glassdoor_wwfu_val_captions.csv', 'eo_xx.csv', 'community-gardens-and-food-trees.csv', 'road-ahead-upcoming-projects.csv', 'libraries.csv', 'datasets_517172_952401_train.csv', 'public-art-artists.csv', 'eo4.csv', 'currency_exchange.csv', 'eo_pr.csv', 'road-ahead-projects-under-construction.csv', 'ability_ids.csv', 'population-by-governorate-citizenship-and-gender.csv', 'community-centres.csv', 'street-intersections.csv', 'population-census-of-botswana-2011.csv'])

In [26]:
groundTruth = pd.read_csv('joinable_columns_gt3_quality.csv')
groundTruth

Unnamed: 0,ds_name,att_name,sizeDistinct1,ds_name_2,att_name_2,sizeDistinct2,joinSize,trueContainment,trueQuality
0,road-ahead-current-road-closures.csv,COMP_DATE,12,road-ahead-upcoming-projects.csv,COMP_DATE,35,6,0.5,3.0
1,eo_pr.csv,NTEE_CD,302,eo_xx.csv,NTEE_CD,397,168,0.556291,3.0
2,eo_pr.csv,NAME,1270,eo4.csv,NAME,3097,1270,1.0,4.0
3,eo_pr.csv,ICO,784,eo4.csv,ICO,2050,784,1.0,4.0
4,eo_pr.csv,STREET,1266,eo4.csv,STREET,3045,1266,1.0,4.0
5,eo_pr.csv,CITY,107,eo4.csv,CITY,322,107,1.0,4.0
6,eo_pr.csv,ZIP,1111,eo4.csv,ZIP,1745,1111,1.0,4.0
7,eo4.csv,ZIP,1745,eo_pr.csv,ZIP,1111,1111,0.636676,3.0
8,eo_pr.csv,NTEE_CD,302,eo4.csv,NTEE_CD,531,302,1.0,4.0
9,eo4.csv,NTEE_CD,531,eo_pr.csv,NTEE_CD,302,302,0.568738,3.0


In [27]:
groundTruth = groundTruth[groundTruth['ds_name'].isin(files) & groundTruth['ds_name_2'].isin(files)]
groundTruth = groundTruth.reset_index(drop=True)
groundTruth

Unnamed: 0,ds_name,att_name,sizeDistinct1,ds_name_2,att_name_2,sizeDistinct2,joinSize,trueContainment,trueQuality
0,road-ahead-current-road-closures.csv,COMP_DATE,12,road-ahead-upcoming-projects.csv,COMP_DATE,35,6,0.5,3.0
1,eo_pr.csv,NTEE_CD,302,eo_xx.csv,NTEE_CD,397,168,0.556291,3.0
2,eo_pr.csv,NAME,1270,eo4.csv,NAME,3097,1270,1.0,4.0
3,eo_pr.csv,ICO,784,eo4.csv,ICO,2050,784,1.0,4.0
4,eo_pr.csv,STREET,1266,eo4.csv,STREET,3045,1266,1.0,4.0
5,eo_pr.csv,CITY,107,eo4.csv,CITY,322,107,1.0,4.0
6,eo_pr.csv,ZIP,1111,eo4.csv,ZIP,1745,1111,1.0,4.0
7,eo4.csv,ZIP,1745,eo_pr.csv,ZIP,1111,1111,0.636676,3.0
8,eo_pr.csv,NTEE_CD,302,eo4.csv,NTEE_CD,531,302,1.0,4.0
9,eo4.csv,NTEE_CD,531,eo_pr.csv,NTEE_CD,302,302,0.568738,3.0


In [28]:
groundTruth = groundTruth[['ds_name', 'att_name', 'ds_name_2', 'att_name_2']]
groundTruth

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
0,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-upcoming-projects.csv,COMP_DATE
1,eo_pr.csv,NTEE_CD,eo_xx.csv,NTEE_CD
2,eo_pr.csv,NAME,eo4.csv,NAME
3,eo_pr.csv,ICO,eo4.csv,ICO
4,eo_pr.csv,STREET,eo4.csv,STREET
5,eo_pr.csv,CITY,eo4.csv,CITY
6,eo_pr.csv,ZIP,eo4.csv,ZIP
7,eo4.csv,ZIP,eo_pr.csv,ZIP
8,eo_pr.csv,NTEE_CD,eo4.csv,NTEE_CD
9,eo4.csv,NTEE_CD,eo_pr.csv,NTEE_CD


In [29]:
def remove_extra_quote(string):
    return string.replace('"','').replace("'",'').strip()

In [30]:
left_table_name = []
left_column_name = []
right_table_name = []
right_column_name = []

for left_table in joins_dict.keys():
    for left_column in joins_dict[left_table].keys():
        if(joins_dict[left_table][left_column] != 'none'):
            for predicted_joins in joins_dict[left_table][left_column].split('#'):
                try:
                    rtn = remove_extra_quote(predicted_joins.split('(')[1].split(',')[0])
                    right_table_name.append(rtn if rtn > left_table else left_table)
                    left_table_name.append(rtn if rtn < left_table else left_table)

                    rcn = remove_extra_quote(predicted_joins.split(')')[0].split(',')[1])
                    
                    right_column_name.append(rcn if rtn > left_table else left_column)
                    left_column_name.append(rcn if rtn < left_table else left_column)
                except:
                    continue

In [31]:
d = {'LEFT_TABLE':left_table_name,
     'LEFT_COLUMN':left_column_name,
     'RIGHT_TABLE':right_table_name,
     'RIGHT_COLUMN':right_column_name}
predicted_joins_df = pd.DataFrame(d)

In [33]:
predicted_joins_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,population-by-governorate-citizenship-and-gend...,Year,statewise-census-data-in-india-1901-2011.csv,DATE
1,community-gardens-and-food-trees.csv,MAPID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID
2,population-census-of-botswana-2011.csv,REGION_ID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID
3,population-by-governorate-citizenship-and-gend...,Governorate,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID
4,population-census-of-botswana-2011.csv,REGION_REGIONID,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID
5,datasets_579296_1047868_authors.csv,Author_ID,statewise-census-data-in-india-1901-2011.csv,VARIABLE_ID
6,cultural-spaces.csv,YEAR,population-by-governorate-citizenship-and-gend...,Year
7,cultural-spaces.csv,YEAR,population-census-of-botswana-2011.csv,DATE
8,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo_xx.csv,NAME
9,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo4.csv,NAME


In [34]:
predicted_joins_df = predicted_joins_df.drop_duplicates()
predicted_joins_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,population-by-governorate-citizenship-and-gend...,Year,statewise-census-data-in-india-1901-2011.csv,DATE
1,community-gardens-and-food-trees.csv,MAPID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID
2,population-census-of-botswana-2011.csv,REGION_ID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID
3,population-by-governorate-citizenship-and-gend...,Governorate,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID
4,population-census-of-botswana-2011.csv,REGION_REGIONID,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID
5,datasets_579296_1047868_authors.csv,Author_ID,statewise-census-data-in-india-1901-2011.csv,VARIABLE_ID
6,cultural-spaces.csv,YEAR,population-by-governorate-citizenship-and-gend...,Year
7,cultural-spaces.csv,YEAR,population-census-of-botswana-2011.csv,DATE
8,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo_xx.csv,NAME
9,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo4.csv,NAME


In [35]:
left_table_name = []
left_column_name = []
right_table_name = []
right_column_name = []
groundTruth_df = pd.DataFrame((), columns=groundTruth.columns)

for i in range(groundTruth.shape[0]):
    rtn = groundTruth.iloc[i, 2]
    ltn = groundTruth.iloc[i, 0]
    right_table_name.append(rtn if rtn > ltn else ltn)
    left_table_name.append(rtn if rtn < ltn else ltn)

    rcn = groundTruth.iloc[i, 3]
    lcn = groundTruth.iloc[i, 1]
    right_column_name.append(rcn if rtn > ltn else lcn)
    left_column_name.append(rcn if rtn < ltn else lcn)

In [36]:
d = {'LEFT_TABLE':left_table_name,
     'LEFT_COLUMN':left_column_name,
     'RIGHT_TABLE':right_table_name,
     'RIGHT_COLUMN':right_column_name}
groundTruth_df = pd.DataFrame(d)
groundTruth_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-upcoming-projects.csv,COMP_DATE
1,eo_pr.csv,NTEE_CD,eo_xx.csv,NTEE_CD
2,eo4.csv,NAME,eo_pr.csv,NAME
3,eo4.csv,ICO,eo_pr.csv,ICO
4,eo4.csv,STREET,eo_pr.csv,STREET
5,eo4.csv,CITY,eo_pr.csv,CITY
6,eo4.csv,ZIP,eo_pr.csv,ZIP
7,eo4.csv,ZIP,eo_pr.csv,ZIP
8,eo4.csv,NTEE_CD,eo_pr.csv,NTEE_CD
9,eo4.csv,NTEE_CD,eo_pr.csv,NTEE_CD


In [37]:
groundTruth_df = groundTruth_df.drop_duplicates()
groundTruth_df = groundTruth_df.reset_index(drop=True)
groundTruth_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN
0,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-upcoming-projects.csv,COMP_DATE
1,eo_pr.csv,NTEE_CD,eo_xx.csv,NTEE_CD
2,eo4.csv,NAME,eo_pr.csv,NAME
3,eo4.csv,ICO,eo_pr.csv,ICO
4,eo4.csv,STREET,eo_pr.csv,STREET
5,eo4.csv,CITY,eo_pr.csv,CITY
6,eo4.csv,ZIP,eo_pr.csv,ZIP
7,eo4.csv,NTEE_CD,eo_pr.csv,NTEE_CD
8,eo4.csv,SORT_NAME,eo_pr.csv,SORT_NAME
9,cultural-spaces.csv,LOCAL_AREA,street-intersections.csv,Geo Local Area


In [38]:
key = []
for i in range(predicted_joins_df.shape[0]):
    key.append("#".join(predicted_joins_df.iloc[i,:]))
predicted_joins_df['KEY'] = key
predicted_joins_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_joins_df['KEY'] = key


Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY
0,population-by-governorate-citizenship-and-gend...,Year,statewise-census-data-in-india-1901-2011.csv,DATE,population-by-governorate-citizenship-and-gend...
1,community-gardens-and-food-trees.csv,MAPID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,community-gardens-and-food-trees.csv#MAPID#sta...
2,population-census-of-botswana-2011.csv,REGION_ID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,population-census-of-botswana-2011.csv#REGION_...
3,population-by-governorate-citizenship-and-gend...,Governorate,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-by-governorate-citizenship-and-gend...
4,population-census-of-botswana-2011.csv,REGION_REGIONID,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-census-of-botswana-2011.csv#REGION_...
5,datasets_579296_1047868_authors.csv,Author_ID,statewise-census-data-in-india-1901-2011.csv,VARIABLE_ID,datasets_579296_1047868_authors.csv#Author_ID#...
6,cultural-spaces.csv,YEAR,population-by-governorate-citizenship-and-gend...,Year,cultural-spaces.csv#YEAR#population-by-governo...
7,cultural-spaces.csv,YEAR,population-census-of-botswana-2011.csv,DATE,cultural-spaces.csv#YEAR#population-census-of-...
8,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo_xx.csv,NAME,cultural-spaces.csv#CULTURAL_SPACE_NAME#eo_xx....
9,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo4.csv,NAME,cultural-spaces.csv#CULTURAL_SPACE_NAME#eo4.cs...


In [39]:
predicted_joins_df.to_csv('Description_test/predicted_joins_df_20241111.csv', index=False)

In [40]:
predicted_joins_df = pd.read_csv('Description_test/predicted_joins_df_20241111.csv')

In [41]:
key = []
for i in range(groundTruth_df.shape[0]):
    key.append("#".join(groundTruth_df.iloc[i,:]))
groundTruth_df['KEY'] = key
groundTruth_df

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY
0,road-ahead-current-road-closures.csv,COMP_DATE,road-ahead-upcoming-projects.csv,COMP_DATE,road-ahead-current-road-closures.csv#COMP_DATE...
1,eo_pr.csv,NTEE_CD,eo_xx.csv,NTEE_CD,eo_pr.csv#NTEE_CD#eo_xx.csv#NTEE_CD
2,eo4.csv,NAME,eo_pr.csv,NAME,eo4.csv#NAME#eo_pr.csv#NAME
3,eo4.csv,ICO,eo_pr.csv,ICO,eo4.csv#ICO#eo_pr.csv#ICO
4,eo4.csv,STREET,eo_pr.csv,STREET,eo4.csv#STREET#eo_pr.csv#STREET
5,eo4.csv,CITY,eo_pr.csv,CITY,eo4.csv#CITY#eo_pr.csv#CITY
6,eo4.csv,ZIP,eo_pr.csv,ZIP,eo4.csv#ZIP#eo_pr.csv#ZIP
7,eo4.csv,NTEE_CD,eo_pr.csv,NTEE_CD,eo4.csv#NTEE_CD#eo_pr.csv#NTEE_CD
8,eo4.csv,SORT_NAME,eo_pr.csv,SORT_NAME,eo4.csv#SORT_NAME#eo_pr.csv#SORT_NAME
9,cultural-spaces.csv,LOCAL_AREA,street-intersections.csv,Geo Local Area,cultural-spaces.csv#LOCAL_AREA#street-intersec...


# Avaliação de Métricas (Precisão, Revocação e F1-Score)

In [47]:
tp = 0
fp = 0
fn = 0

for i in range(len(predicted_joins_df)):
    if(predicted_joins_df.iloc[i,4] in groundTruth_df.iloc[:,4].values):
        tp += 1
    else:
        fp += 1
    
for i in range(len(groundTruth_df)):
    if(groundTruth_df.iloc[i,4] not in predicted_joins_df.iloc[:,4].values):
        fn += 1
    
precision = (tp)/(tp+fp)
recall = (tp)/(tp+fn)
f1_score = 2 * (precision*recall)/(precision+recall)

print(precision, recall, f1_score)

0.08787346221441125 0.9433962264150944 0.1607717041800643


# Análise Falso Positivo

In [45]:
len(predicted_joins_df)

569

In [48]:
len(groundTruth_df)

53

In [49]:
for i in range(len(predicted_joins_df)):
    if(predicted_joins_df.iloc[i,4] not in groundTruth_df.iloc[:,4].values):
        print(predicted_joins_df.iloc[i,4])

population-by-governorate-citizenship-and-gender.csv#Year#statewise-census-data-in-india-1901-2011.csv#DATE
community-gardens-and-food-trees.csv#MAPID#statewise-census-data-in-india-1901-2011.csv#LOCATION_ID
population-census-of-botswana-2011.csv#REGION_ID#statewise-census-data-in-india-1901-2011.csv#LOCATION_ID
population-by-governorate-citizenship-and-gender.csv#Governorate#statewise-census-data-in-india-1901-2011.csv#LOCATION_REGIONID
population-census-of-botswana-2011.csv#REGION_REGIONID#statewise-census-data-in-india-1901-2011.csv#LOCATION_REGIONID
datasets_579296_1047868_authors.csv#Author_ID#statewise-census-data-in-india-1901-2011.csv#VARIABLE_ID
cultural-spaces.csv#YEAR#population-by-governorate-citizenship-and-gender.csv#Year
cultural-spaces.csv#YEAR#population-census-of-botswana-2011.csv#DATE
cultural-spaces.csv#CULTURAL_SPACE_NAME#eo_xx.csv#NAME
cultural-spaces.csv#CULTURAL_SPACE_NAME#eo4.csv#NAME
community-centres.csv#NAME#cultural-spaces.csv#CULTURAL_SPACE_NAME
cultural-s

# Análise Falso Negativo

In [50]:
for i in range(len(groundTruth_df)):
    if(groundTruth_df.iloc[i,4] not in predicted_joins_df.iloc[:,4].values):
        print(groundTruth_df.iloc[i,4])

eo4.csv#ICO#eo_pr.csv#ICO
public-art.csv#GeoLocalArea#street-intersections.csv#Geo Local Area
population-census-of-botswana-2011.csv#FREQUENCY#statewise-census-data-in-india-1901-2011.csv#FREQUENCY
