In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from pydantic import BaseModel
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
pd.set_option('display.max_rows', None)

In [3]:
path = '/home/manoelflorencio/cta_for_jd/testbedXS'
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [4]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg):
    completion = client.chat.completions.create(
                                            model="gpt-4o",
                                            messages=[
                                                        {
                                                            "role": "system", 
                                                             "content": f"{system_msg}"
                                                        },
                                                        {
                                                            "role": "user",
                                                            "content": f"{user_msg}"
                                                        }
                                                    ]
                                            )
    return completion

In [5]:
client = OpenAI()

In [6]:
def generate_prompt_JD_prediction(target_table_description, target_column_descriptions, candidate_table_description, candidate_column_descriptions):

    system_msg = """
            Given a target table and a candidate table, predict if the pair (target table, candidate table) could be joined and answer with only "Yes" or "No".

            Task: Look carefully at the target table description and candidate table description column and use this information to identify 
            patterns and relationships between the descriptions, the result must be only the word "Yes" if the JOIN is possible or "No" otherwise.

            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
            based on related  columns between them.
            
           Instructions: 
                1. Look at the target table description given to you.
                2. Look at the target column descriptions.
                3. Look at the candidate table description in detail. 
                4. Look at the candidate column descriptions.
                5. Predict if the target table and candidate tables are joinable, answer with "Yes" or "No".          
            """
            
    user_msg =  f"""Target Table description: {target_table_description}
                    Target columns descriptions: {target_column_descriptions}
                    Candidate Table description: {candidate_table_description}
                    Candidate columns descriptions: {candidate_column_descriptions}
                    Answer:""".strip()
    
    return system_msg, user_msg

In [7]:
def generate_predictions(target_table_description, target_column_descriptions, candidate_table_description, candidate_column_descriptions, client):
    
    system_msg_jd_prediction, user_msg_jd_prediction = generate_prompt_JD_prediction(target_table_description, target_column_descriptions, candidate_table_description, candidate_column_descriptions)
    result = execute_prompt(client, system_msg_jd_prediction, user_msg_jd_prediction)
    jd_prediction = result.choices[0].message.content.split('Answer: ')[-1].strip()
    
    return jd_prediction

In [8]:
column_descriptions = pd.read_csv('Description_test/CTA_from_descriptions/all_junio_descriptions_with_cta.csv')
column_descriptions.head()

Unnamed: 0,TableName,Column,Description,CTA
0,eo_xx,EIN,The Employer Identification Number (EIN) is a ...,Identifier
1,eo_xx,NAME,The NAME column contains the official name of ...,Organization
2,eo_xx,ICO,The ICO column appears to contain names of ind...,Person
3,eo_xx,STREET,The STREET column provides the street address ...,streetAddress
4,eo_xx,CITY,The CITY column specifies the city where the o...,Text


In [9]:
table_descriptions = pd.read_csv('Description_test/CTA_from_descriptions/table_descriptions.csv')
table_descriptions.head()

Unnamed: 0,TableName,Description
0,statewise-census-data-in-india-1901-2011.csv,"The table ""statewise-census-data-in-india-1901..."
1,road-ahead-current-road-closures.csv,"The table ""road-ahead-current-road-closures.cs..."
2,property-tie-lines.csv,"The table ""property-tie-lines.csv"" contains in..."
3,public-art.csv,The public-art.csv table provides detailed inf...
4,gvrd-sewer-trunk-mains.csv,"The ""gvrd-sewer-trunk-mains.csv"" table provide..."


In [10]:
matches = pd.read_csv('Description_test/CTA_from_descriptions/junio_JD_from_cta_matches_corrigido.csv', sep=';')
matches = matches[matches['JOINABLE'] == 'Yes']
matches.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION,JOINABLE
212,community-centres,ADDRESS,The full postal address of the community garde...,community-gardens-and-food-trees,MERGED_ADDRESS,The ADDRESS column provides the street address...,Yes
221,community-centres,Geo Local Area,The neighbourhood or broader geographical area...,community-gardens-and-food-trees,Geo Local Area,The Geo Local Area column identifies the gener...,Yes
223,community-centres,ADDRESS,This column contains the street address of the...,cultural-spaces,ADDRESS,The ADDRESS column provides the street address...,Yes
226,community-centres,Geo Local Area,This column identifies the local area or neigh...,cultural-spaces,LOCAL_AREA,The Geo Local Area column identifies the gener...,Yes
229,community-centres,ADDRESS,The street address where the organization is l...,eo_pr,STREET,The ADDRESS column provides the street address...,Yes


In [11]:
table_matches = matches[['LEFT_TABLE', 'RIGHT_TABLE']]
table_matches = table_matches.drop_duplicates()
table_matches = table_matches.reset_index(drop=True)
table_matches

Unnamed: 0,LEFT_TABLE,RIGHT_TABLE
0,community-centres,community-gardens-and-food-trees
1,community-centres,cultural-spaces
2,community-centres,eo_pr
3,community-centres,eo4
4,community-centres,libraries
5,community-centres,public-art
6,community-centres,rental-standards-current-issues
7,community-centres,schools
8,community-centres,street-intersections
9,community-gardens-and-food-trees,cultural-spaces


In [12]:
asnwers = []

for i in tqdm(range(len(table_matches))):
    left_table_name  = table_matches.iloc[i, 0] + '.csv' 
    right_table_name = table_matches.iloc[i, 1] + '.csv' 
    
    target_table_description = table_descriptions[table_descriptions['TableName'] == left_table_name].iloc[0,1]
    target_column_descriptions = column_descriptions[column_descriptions['TableName'] == left_table_name].Description.values    
    
    candidate_table_description = table_descriptions[table_descriptions['TableName'] == right_table_name].iloc[0,1]
    candidate_column_descriptions = column_descriptions[column_descriptions['TableName'] == right_table_name].Description.values

    # print(target_table_description + '\n######################\n' +
    #       target_column_descriptions[0] + '\n######################\n' + 
    #       candidate_table_description + '\n######################\n' + 
    #       candidate_column_descriptions[0])
    
    result = generate_predictions(target_table_description, target_column_descriptions, 
                                  candidate_table_description, candidate_column_descriptions, 
                                  client)
    asnwers.append(result)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 59/59 [00:26<00:00,  2.19it/s]


In [13]:
asnwers

['Yes',
 'Yes',
 'No',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'No']

In [14]:
table_matches['JOINABLE'] = asnwers

In [15]:
table_matches

Unnamed: 0,LEFT_TABLE,RIGHT_TABLE,JOINABLE
0,community-centres,community-gardens-and-food-trees,Yes
1,community-centres,cultural-spaces,Yes
2,community-centres,eo_pr,No
3,community-centres,eo4,No
4,community-centres,libraries,Yes
5,community-centres,public-art,No
6,community-centres,rental-standards-current-issues,Yes
7,community-centres,schools,Yes
8,community-centres,street-intersections,Yes
9,community-gardens-and-food-trees,cultural-spaces,No


In [17]:
matches.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION,JOINABLE
212,community-centres,ADDRESS,The full postal address of the community garde...,community-gardens-and-food-trees,MERGED_ADDRESS,The ADDRESS column provides the street address...,Yes
221,community-centres,Geo Local Area,The neighbourhood or broader geographical area...,community-gardens-and-food-trees,Geo Local Area,The Geo Local Area column identifies the gener...,Yes
223,community-centres,ADDRESS,This column contains the street address of the...,cultural-spaces,ADDRESS,The ADDRESS column provides the street address...,Yes
226,community-centres,Geo Local Area,This column identifies the local area or neigh...,cultural-spaces,LOCAL_AREA,The Geo Local Area column identifies the gener...,Yes
229,community-centres,ADDRESS,The street address where the organization is l...,eo_pr,STREET,The ADDRESS column provides the street address...,Yes


In [19]:
answers = []
for i in range(len(matches)):
    answer = table_matches[(table_matches['LEFT_TABLE'] == matches.iloc[i,0]) & (table_matches['RIGHT_TABLE'] == matches.iloc[i,3])].iloc[0,2]
    answers.append(answer)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 149/149 [00:00<00:00, 2117.17it/s]


In [20]:
matches['JOINABLE_TD'] = answers

In [21]:
matches

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION,JOINABLE,JOINABLE_TD
212,community-centres,ADDRESS,The full postal address of the community garde...,community-gardens-and-food-trees,MERGED_ADDRESS,The ADDRESS column provides the street address...,Yes,Yes
221,community-centres,Geo Local Area,The neighbourhood or broader geographical area...,community-gardens-and-food-trees,Geo Local Area,The Geo Local Area column identifies the gener...,Yes,Yes
223,community-centres,ADDRESS,This column contains the street address of the...,cultural-spaces,ADDRESS,The ADDRESS column provides the street address...,Yes,Yes
226,community-centres,Geo Local Area,This column identifies the local area or neigh...,cultural-spaces,LOCAL_AREA,The Geo Local Area column identifies the gener...,Yes,Yes
229,community-centres,ADDRESS,The street address where the organization is l...,eo_pr,STREET,The ADDRESS column provides the street address...,Yes,No
239,community-centres,ADDRESS,This column specifies the street addresses of ...,eo4,STREET,The ADDRESS column provides the street address...,Yes,No
248,community-centres,Geo Local Area,"In this column, the local areas or neighborhoo...",libraries,Geo Local Area,The Geo Local Area column identifies the gener...,Yes,Yes
256,community-centres,Geo Local Area,The neighborhood within Vancouver where the ar...,public-art,Neighbourhood,The Geo Local Area column identifies the gener...,Yes,No
261,community-centres,Geo Local Area,A broader geographical classification indicati...,public-art,GeoLocalArea,The Geo Local Area column identifies the gener...,Yes,No
274,community-centres,Geo Local Area,The broader neighborhood or community area whe...,rental-standards-current-issues,Geo Local Area,The Geo Local Area column identifies the gener...,Yes,Yes


In [22]:
matches.to_csv('Description_test/CTA_from_descriptions/junio_JD_matches_with_table_descriptions.csv', index=False)

In [None]:
matches