In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from pydantic import BaseModel
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
# Lift pandas' row cap so displayed frames are never truncated.
pd.options.display.max_rows = None

In [3]:
# Working directory of the project: every relative path used further down
# (e.g. 'Description_test/...') is resolved against it.
# The CTA_FOR_JD_TESTBED environment variable can override the location so the
# notebook is not tied to a single machine; when it is unset the original
# directory is used, so existing runs behave exactly as before.
path = os.environ.get('CTA_FOR_JD_TESTBED', '/home/manoelflorencio/cta_for_jd/testbedXS')
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [4]:
# Pydantic model pairing a table name with a column name.
# NOTE(review): nothing in the visible cells references this class — presumably
# meant as a structured-output schema for a join candidate; confirm before removing.
class join_schema(BaseModel):
    table_name: str   # table identifier, as text
    column_name: str  # column identifier, as text

In [14]:
def generate_prompt_predict_possible_joins(target_description, candidate_description):
    """Build the prompt pair for a single joinability question.

    Args:
        target_description: free-text description of the target column.
        candidate_description: free-text description of the candidate column.

    Returns:
        A ``(system_msg, user_msg)`` tuple of strings. The system message holds
        the fixed task instructions and one worked example; the user message
        embeds both descriptions and ends with ``Answer:`` (surrounding
        whitespace stripped).
    """
    # The instructions contain no placeholders, so a plain string literal is enough.
    system_prompt = """
            Given one target column description and one candidate column description, predict if they could be joined, Answer with only 'Yes' or 'No'.

            Task: Look carefully at the target column description and the candidate column description and use this information to identify patterns and relationships between the descriptions, the result must be Yes 
            or No depending if the pair is JOINable.
            
            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them based on related columns between them.
            
            Instructions: 
                1. Look at the target description given to you. 
                2. Look at the candidate description in detail. 
                3. Predict if the target column description belongs to a column that may be used in join. 
                4. Predict if the target column description and the candidate column description are JOINable. Disregard the column names.
                
            Example 1:
                Target description: this column represents a worker's id.
                Candidate description: 'the column worker_id represents the worker's id'.
                Answer: Yes
            """

    # Fill in the two descriptions, then trim the outer whitespace of the template.
    user_prompt = f"""Target description:     {target_description}
                   Candidate description:  {candidate_description}
                   Answer: """
    user_prompt = user_prompt.strip()

    return system_prompt, user_prompt

In [16]:
def generate_predictions(target_description, candidate_description, client):
    """Ask the model whether two column descriptions are joinable.

    Args:
        target_description: description of the target column.
        candidate_description: description of the candidate column.
        client: API client forwarded unchanged to ``execute_prompt``.

    Returns:
        ``'Yes'`` or ``'No'`` when the reply is one of those words (ignoring
        case and surrounding punctuation/quotes); otherwise the stripped reply
        text as-is, which is an empty string when the reply has no text.
    """
    system_msg_predict_joins, user_msg_predict_joins = generate_prompt_predict_possible_joins(target_description, candidate_description)
    result = execute_prompt(client, system_msg_predict_joins, user_msg_predict_joins)

    # The reply text may be missing (None); fall back to '' instead of
    # raising AttributeError in the middle of a long batch run.
    content = result.choices[0].message.content or ''
    # Keep only what follows the last 'Answer: ' marker in case the model echoes the prompt.
    answer = content.split('Answer: ')[-1].strip()

    # Downstream cells filter with an exact == 'Yes' comparison, so fold trivial
    # variants such as "yes", "Yes." or "'No'" into the canonical spelling.
    canonical = answer.strip(" \t\r\n.!'\"`*").lower()
    if canonical == 'yes':
        return 'Yes'
    if canonical == 'no':
        return 'No'

    return answer

In [7]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg, model="gpt-4o"):
    """Send one system/user message pair to the chat-completions endpoint.

    The call is wrapped by ``retry``: failed attempts are repeated with a
    randomized exponential wait (configured with min=1, max=60) and the
    decorator stops after 6 attempts.

    Args:
        client: object exposing ``chat.completions.create`` (the notebook
            passes an ``OpenAI`` instance).
        system_msg: content of the system message.
        user_msg: content of the user message.
        model: model name passed to the API; defaults to ``"gpt-4o"``, the
            value that used to be hard-coded.

    Returns:
        The completion object returned by ``client.chat.completions.create``.
    """
    # f-string formatting is kept on purpose: it coerces non-string inputs to
    # text exactly as the original code did.
    messages = [
        {"role": "system", "content": f"{system_msg}"},
        {"role": "user", "content": f"{user_msg}"},
    ]
    return client.chat.completions.create(model=model, messages=messages)

In [8]:
# API client passed to generate_predictions in the prediction loop.
# NOTE(review): constructed without arguments, so credentials are presumably read
# from the environment (OPENAI_API_KEY) — confirm it is set before running.
client = OpenAI()

In [9]:
# Load the candidate column pairs (path is relative to the working directory set by os.chdir).
# Each row holds a LEFT_* and a RIGHT_* table/column/description triple.
cta_matches = pd.read_csv('Description_test/CTA_from_descriptions/cta_matches.csv')
# Preview the first rows.
cta_matches.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,public-art.csv,Type,"The ""Type"" column in the table represents the ..."
1,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,public-art.csv,Status,The 'Status' column in this table indicates th...
2,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,gvrd-sewer-trunk-mains.csv,EFFLUENT_TYPE,"The ""EFFLUENT_TYPE"" column in the table provid..."
3,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,gvrd-sewer-trunk-mains.csv,MATERIAL,"The ""MATERIAL"" column in the table represents ..."
4,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,SCS_Staff_Salaries_data_30th_June 2010.csv,Grade,"The ""Grade"" column in the table represents the..."


In [17]:
# Query the model once per (target, candidate) description pair, in row order.
# This issues one API call per row, so a full run is slow.
# Columns are selected by name instead of by position, so a change in column
# order cannot silently feed the wrong text to the model.
answers = []

description_pairs = zip(cta_matches['LEFT_DESCRIPTION'], cta_matches['RIGHT_DESCRIPTION'])
for target_description, candidate_description in tqdm(description_pairs, total=len(cta_matches)):
    answers.append(generate_predictions(target_description, candidate_description, client))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6246/6246 [56:37<00:00,  1.84it/s]


In [18]:
# Attach the model's verdict to each pair; `answers` was filled in row order, one entry per row.
cta_matches['JOINABLE'] = answers

In [20]:
# Preview the first rows, now including the JOINABLE column.
cta_matches.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION,JOINABLE
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,public-art.csv,Type,"The ""Type"" column in the table represents the ...",No
1,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,public-art.csv,Status,The 'Status' column in this table indicates th...,No
2,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,gvrd-sewer-trunk-mains.csv,EFFLUENT_TYPE,"The ""EFFLUENT_TYPE"" column in the table provid...",No
3,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,gvrd-sewer-trunk-mains.csv,MATERIAL,"The ""MATERIAL"" column in the table represents ...",No
4,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,SCS_Staff_Salaries_data_30th_June 2010.csv,Grade,"The ""Grade"" column in the table represents the...",No


In [22]:
# First few pairs the model answered 'Yes' for.
is_joinable = cta_matches['JOINABLE'].eq('Yes')
cta_matches[is_joinable].head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION,JOINABLE
69,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...,population-by-governorate-citizenship-and-gend...,Year,"The ""Year"" column in the table represents the ...",Yes
74,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres...",currency_exchange.csv,Country,"The ""Country"" column in the table specifies th...",Yes
214,road-ahead-current-road-closures.csv,PROJECT,"The ""PROJECT"" column in the given table lists ...",road-ahead-upcoming-projects.csv,PROJECT,"The ""PROJECT"" column in the table provides det...",Yes
216,road-ahead-current-road-closures.csv,PROJECT,"The ""PROJECT"" column in the given table lists ...",road-ahead-upcoming-projects.csv,LOCATION,"The ""LOCATION"" column in the table appears to ...",Yes
224,road-ahead-current-road-closures.csv,PROJECT,"The ""PROJECT"" column in the given table lists ...",road-ahead-projects-under-construction.csv,PROJECT,"The target column, 'PROJECT,' contains textual...",Yes


In [23]:
# (rows, columns) of the subset answered 'Yes'.
cta_matches.loc[cta_matches['JOINABLE'].eq('Yes')].shape

(408, 7)

In [19]:
# Persist the annotated pairs next to the input file (path is relative to the
# working directory set by os.chdir); index=False keeps the row index out of the CSV.
cta_matches.to_csv('Description_test/CTA_from_descriptions/JD_from_cta_matches.csv',index=False)