In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from pydantic import BaseModel
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
# Lift pandas' row cap so displayed frames are not truncated.
pd.options.display.max_rows = None

In [3]:
# Project root that the relative CSV paths in later cells are resolved against.
# The CTA_FOR_JD_ROOT environment variable overrides the machine-specific
# default, so the notebook can run on another machine without editing this cell.
path = os.environ.get('CTA_FOR_JD_ROOT', '/home/manoelflorencio/cta_for_jd/testbedXS')
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [4]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg, model="gpt-4o"):
    """Send one system + user message pair to the chat-completions endpoint.

    Args:
        client: Object exposing ``chat.completions.create`` (the notebook
            passes an ``OpenAI`` instance).
        system_msg: Text placed in the "system" message.
        user_msg: Text placed in the "user" message.
        model: Model name forwarded to the API. Defaults to "gpt-4o", the
            value that used to be hard-coded, so existing callers are unaffected.

    Returns:
        The object returned by ``client.chat.completions.create``, unmodified.

    Note:
        The ``retry`` decorator re-invokes this function whenever it raises,
        waiting a randomized exponential interval (min=1, max=60) between
        tries and giving up after 6 attempts.
    """
    messages = [
        {"role": "system", "content": f"{system_msg}"},
        {"role": "user", "content": f"{user_msg}"},
    ]
    return client.chat.completions.create(model=model, messages=messages)

In [5]:
# Shared API client; the prediction loop passes it to generate_predictions for every table pair.
# NOTE(review): constructed with no arguments, so credentials presumably come from the environment — confirm.
client = OpenAI()

In [6]:
def generate_prompt_JD_prediction(target_table_description, target_column_descriptions, candidate_table_description, candidate_column_descriptions):
    """Build the (system, user) prompt pair for the joinability question.

    The system text is a fixed instruction block. The user text lists the four
    arguments under labelled headings, one per line, and ends with "Answer:";
    leading and trailing whitespace of the assembled user text is stripped.

    Returns:
        tuple: ``(system_msg, user_msg)``, both ``str``.
    """
    # Every line after the first is indented by 20 spaces in the prompt text.
    line_break = "\n" + " " * 20
    labelled_lines = (
        f"Target Table description: {target_table_description}",
        f"Target columns descriptions: {target_column_descriptions}",
        f"Candidate Table description: {candidate_table_description}",
        f"Candidate columns descriptions: {candidate_column_descriptions}",
        "Answer:",
    )
    request_text = line_break.join(labelled_lines).strip()

    instructions = """
            Given a target table and a candidate table, predict if the pair (target table, candidate table) could be joined and answer with only "Yes" or "No".

            Task: Look carefully at the target table description and candidate table description column and use this information to identify 
            patterns and relationships between the descriptions, the result must be only the word "Yes" if the JOIN is possible or "No" otherwise.

            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
            based on related  columns between them.
            
           Instructions: 
                1. Look at the target table description given to you.
                2. Look at the target column descriptions.
                3. Look at the candidate table description in detail. 
                4. Look at the candidate column descriptions.
                5. Predict if the target table and candidate tables are joinable, answer with "Yes" or "No".          
            """

    return instructions, request_text

In [7]:
def generate_predictions(target_table_description, target_column_descriptions, candidate_table_description, candidate_column_descriptions, client):
    """Ask the model whether two tables are joinable and return its answer text.

    Builds the prompt pair with ``generate_prompt_JD_prediction``, sends it
    through ``execute_prompt``, and post-processes the first choice's message.

    Args:
        target_table_description: Description of the target table.
        target_column_descriptions: Descriptions of the target table's columns.
        candidate_table_description: Description of the candidate table.
        candidate_column_descriptions: Descriptions of the candidate table's columns.
        client: API client forwarded to ``execute_prompt``.

    Returns:
        str: The text after the last ``'Answer: '`` marker in the reply (the
        whole reply when the marker is absent), stripped of surrounding
        whitespace. An empty string when the reply carries no text content.
    """
    system_msg_jd_prediction, user_msg_jd_prediction = generate_prompt_JD_prediction(
        target_table_description,
        target_column_descriptions,
        candidate_table_description,
        candidate_column_descriptions,
    )
    result = execute_prompt(client, system_msg_jd_prediction, user_msg_jd_prediction)

    # The message content may be None (a reply without text); fall back to ""
    # so the loop gets an inspectable empty answer instead of an AttributeError.
    reply_text = result.choices[0].message.content or ""
    jd_prediction = reply_text.split('Answer: ')[-1].strip()

    return jd_prediction

In [8]:
# Column-level descriptions; preview the first five rows.
column_descriptions = pd.read_csv(
    'Description_test/CTA_from_descriptions/all_junio_descriptions_with_cta.csv'
)
column_descriptions.head(5)

Unnamed: 0,TableName,Column,Description,CTA
0,eo_xx,EIN,The Employer Identification Number (EIN) is a ...,Identifier
1,eo_xx,NAME,The NAME column contains the official name of ...,Organization
2,eo_xx,ICO,The ICO column appears to contain names of ind...,Person
3,eo_xx,STREET,The STREET column provides the street address ...,streetAddress
4,eo_xx,CITY,The CITY column specifies the city where the o...,Text


In [9]:
# Table-level descriptions; preview the first five rows.
table_descriptions = pd.read_csv(
    'Description_test/CTA_from_descriptions/table_descriptions.csv'
)
table_descriptions.head(5)

Unnamed: 0,TableName,Description
0,statewise-census-data-in-india-1901-2011.csv,"The table ""statewise-census-data-in-india-1901..."
1,road-ahead-current-road-closures.csv,"The table ""road-ahead-current-road-closures.cs..."
2,property-tie-lines.csv,"The table ""property-tie-lines.csv"" contains in..."
3,public-art.csv,The public-art.csv table provides detailed inf...
4,gvrd-sewer-trunk-mains.csv,"The ""gvrd-sewer-trunk-mains.csv"" table provide..."


In [10]:
# Column-level join candidates (left/right table and column); preview the first five rows.
matches = pd.read_csv(
    'Description_test/predicted_joins_df_20241111.csv'
)
matches.head(5)

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY
0,population-by-governorate-citizenship-and-gend...,Year,statewise-census-data-in-india-1901-2011.csv,DATE,population-by-governorate-citizenship-and-gend...
1,community-gardens-and-food-trees.csv,MAPID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,community-gardens-and-food-trees.csv#MAPID#sta...
2,population-census-of-botswana-2011.csv,REGION_ID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,population-census-of-botswana-2011.csv#REGION_...
3,population-by-governorate-citizenship-and-gend...,Governorate,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-by-governorate-citizenship-and-gend...
4,population-census-of-botswana-2011.csv,REGION_REGIONID,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-census-of-botswana-2011.csv#REGION_...


In [11]:
# Collapse the column-level matches to the distinct (left table, right table) pairs.
table_matches = (
    matches[['LEFT_TABLE', 'RIGHT_TABLE']]
    .drop_duplicates()
    .reset_index(drop=True)
)
table_matches

Unnamed: 0,LEFT_TABLE,RIGHT_TABLE
0,population-by-governorate-citizenship-and-gend...,statewise-census-data-in-india-1901-2011.csv
1,community-gardens-and-food-trees.csv,statewise-census-data-in-india-1901-2011.csv
2,population-census-of-botswana-2011.csv,statewise-census-data-in-india-1901-2011.csv
3,datasets_579296_1047868_authors.csv,statewise-census-data-in-india-1901-2011.csv
4,cultural-spaces.csv,population-by-governorate-citizenship-and-gend...
5,cultural-spaces.csv,population-census-of-botswana-2011.csv
6,cultural-spaces.csv,eo_xx.csv
7,cultural-spaces.csv,eo4.csv
8,community-centres.csv,cultural-spaces.csv
9,cultural-spaces.csv,public-art.csv


In [13]:
asnwers = []  # (sic) spelling kept: later cells read this exact name


def _table_key(table_name):
    """Return the table name without a trailing '.csv' so both naming styles compare equal."""
    table_name = str(table_name)
    return table_name[:-4] if table_name.lower().endswith('.csv') else table_name


# The column-description preview shows table names without an extension
# (e.g. 'eo_xx') while the match tables carry '.csv' (e.g. 'eo_xx.csv').
# Comparing on normalized keys keeps the lookup from silently coming back empty.
table_description_keys = table_descriptions['TableName'].map(_table_key)
column_description_keys = column_descriptions['TableName'].map(_table_key)
tables_without_column_descriptions = set()


def _descriptions_for(table_name):
    """Return (table description, array of column descriptions) for one table name."""
    key = _table_key(table_name)

    table_rows = table_descriptions.loc[table_description_keys == key, 'Description']
    if table_rows.empty:
        raise KeyError(f"no row in table_descriptions for table {table_name!r}")

    column_rows = column_descriptions.loc[column_description_keys == key, 'Description']
    if column_rows.empty:
        # Not fatal (the prompt is still built), but worth reporting after the loop.
        tables_without_column_descriptions.add(str(table_name))

    # First matching table description, as before; all column descriptions as an array.
    return table_rows.iloc[0], column_rows.values


# One API call per distinct table pair; columns are selected by name rather than position.
table_pairs = table_matches[['LEFT_TABLE', 'RIGHT_TABLE']].itertuples(index=False, name=None)
for left_table_name, right_table_name in tqdm(table_pairs, total=len(table_matches)):
    target_table_description, target_column_descriptions = _descriptions_for(left_table_name)
    candidate_table_description, candidate_column_descriptions = _descriptions_for(right_table_name)

    result = generate_predictions(target_table_description, target_column_descriptions,
                                  candidate_table_description, candidate_column_descriptions,
                                  client)
    asnwers.append(result)

if tables_without_column_descriptions:
    print(f"No column descriptions found for: {sorted(tables_without_column_descriptions)}")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 136/136 [01:21<00:00,  1.66it/s]


In [14]:
# Show the raw answers collected by the prediction loop (one entry per table pair).
asnwers

['No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No']

In [15]:
# Attach the per-pair answers as a JOINABLE column (aligned by position with the rows).
table_matches = table_matches.assign(JOINABLE=asnwers)

In [16]:
# Display the table pairs together with their JOINABLE answer.
table_matches

Unnamed: 0,LEFT_TABLE,RIGHT_TABLE,JOINABLE
0,population-by-governorate-citizenship-and-gend...,statewise-census-data-in-india-1901-2011.csv,No
1,community-gardens-and-food-trees.csv,statewise-census-data-in-india-1901-2011.csv,No
2,population-census-of-botswana-2011.csv,statewise-census-data-in-india-1901-2011.csv,No
3,datasets_579296_1047868_authors.csv,statewise-census-data-in-india-1901-2011.csv,No
4,cultural-spaces.csv,population-by-governorate-citizenship-and-gend...,No
5,cultural-spaces.csv,population-census-of-botswana-2011.csv,No
6,cultural-spaces.csv,eo_xx.csv,No
7,cultural-spaces.csv,eo4.csv,No
8,community-centres.csv,cultural-spaces.csv,Yes
9,cultural-spaces.csv,public-art.csv,Yes


In [17]:
# Preview the column-level matches again before mapping table-level answers onto them.
matches.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY
0,population-by-governorate-citizenship-and-gend...,Year,statewise-census-data-in-india-1901-2011.csv,DATE,population-by-governorate-citizenship-and-gend...
1,community-gardens-and-food-trees.csv,MAPID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,community-gardens-and-food-trees.csv#MAPID#sta...
2,population-census-of-botswana-2011.csv,REGION_ID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,population-census-of-botswana-2011.csv#REGION_...
3,population-by-governorate-citizenship-and-gend...,Governorate,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-by-governorate-citizenship-and-gend...
4,population-census-of-botswana-2011.csv,REGION_REGIONID,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-census-of-botswana-2011.csv#REGION_...


In [19]:
# Give every column-level match the answer of its (left table, right table) pair.
# A left merge keeps the row order of `matches`, so the resulting list lines up
# with the frame positionally; columns are addressed by name, not position.
pair_columns = ['LEFT_TABLE', 'RIGHT_TABLE']
pair_answers = matches[pair_columns].merge(
    table_matches[pair_columns + ['JOINABLE']],
    on=pair_columns,
    how='left',
    validate='many_to_one',  # fail fast if a table pair appears twice in table_matches
    indicator=True,
)

# Every pair in `matches` must have an answer; surface the offenders otherwise.
unmatched = pair_answers.loc[pair_answers['_merge'] != 'both', pair_columns]
if not unmatched.empty:
    raise LookupError(
        f"no JOINABLE answer for table pairs: {unmatched.drop_duplicates().values.tolist()}"
    )

answers = pair_answers['JOINABLE'].tolist()

In [20]:
# Store the table-level answer next to each column-level match.
matches = matches.assign(JOINABLE_TD=answers)

In [21]:
# Display the column-level matches, now including the JOINABLE_TD column.
matches

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY,JOINABLE_TD
0,population-by-governorate-citizenship-and-gend...,Year,statewise-census-data-in-india-1901-2011.csv,DATE,population-by-governorate-citizenship-and-gend...,No
1,community-gardens-and-food-trees.csv,MAPID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,community-gardens-and-food-trees.csv#MAPID#sta...,No
2,population-census-of-botswana-2011.csv,REGION_ID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,population-census-of-botswana-2011.csv#REGION_...,No
3,population-by-governorate-citizenship-and-gend...,Governorate,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-by-governorate-citizenship-and-gend...,No
4,population-census-of-botswana-2011.csv,REGION_REGIONID,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-census-of-botswana-2011.csv#REGION_...,No
5,datasets_579296_1047868_authors.csv,Author_ID,statewise-census-data-in-india-1901-2011.csv,VARIABLE_ID,datasets_579296_1047868_authors.csv#Author_ID#...,No
6,cultural-spaces.csv,YEAR,population-by-governorate-citizenship-and-gend...,Year,cultural-spaces.csv#YEAR#population-by-governo...,No
7,cultural-spaces.csv,YEAR,population-census-of-botswana-2011.csv,DATE,cultural-spaces.csv#YEAR#population-census-of-...,No
8,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo_xx.csv,NAME,cultural-spaces.csv#CULTURAL_SPACE_NAME#eo_xx....,No
9,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo4.csv,NAME,cultural-spaces.csv#CULTURAL_SPACE_NAME#eo4.cs...,No


In [22]:
# Persist the annotated matches; the frame's index is not written.
output_csv = 'Description_test/CTA_from_descriptions/junio_JD_matches_with_table_descriptions_without_cta.csv'
matches.to_csv(output_csv, index=False)