In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from pydantic import BaseModel
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
# Lift pandas' row-display cap so frames shown in this notebook are never truncated.
pd.options.display.max_rows = None

In [3]:
# Working directory holding the testbed data; the relative file paths used in
# later cells are resolved against it. Set the CTA_TESTBED_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original location is used unchanged.
path = os.environ.get('CTA_TESTBED_DIR', '/home/manoelflorencio/cta_for_jd/testbedXS')
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [4]:
# Pydantic model with two required string fields.
# NOTE(review): no visible cell references this class — presumably meant as a
# structured-output schema naming a table/column pair; confirm before removing.
class join_schema(BaseModel):
    # presumably the name of a table — confirm against intended use
    table_name: str
    # presumably the name of a column within that table — confirm
    column_name: str

In [5]:
def generate_prompt_predict_possible_joins(target_description, candidate_description):
    """Compose the system and user messages for one pairwise joinability query.

    The system message is fixed text: the task statement, a short definition
    of a JOIN, step-by-step instructions and one worked example. The user
    message carries the two descriptions followed by an open "Answer:" slot.

    Args:
        target_description: text describing the target column.
        candidate_description: text describing the candidate column.

    Returns:
        Tuple ``(system_msg, user_msg)`` of strings.
    """
    # The leading and trailing whitespace on every prompt line is part of the
    # text sent to the model, so it is spelled out explicitly here.
    pad = " " * 12
    deep = " " * 16
    gap = " " * 19

    system_lines = [
        "",
        pad + "Given one target column description and one candidate column description, predict if they could be joined, Answer with only 'Yes' or 'No'.",
        "",
        pad + "Task: Look carefully at the target column description and the candidate column description and use this information to identify patterns and ",
        pad + "relationships between the descriptions, the result must be Yes or No depending if the pair is JOINable.",
        pad,
        pad + "Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them based on related ",
        pad + "columns between them.",
        pad,
        pad + "Instructions: ",
        deep + "1. Look at the target description given to you. ",
        deep + "2. Look at the candidate description in detail. ",
        deep + "3. Predict if the target column description belongs to a column that may be used in join. ",
        deep + "4. Predict if the target column description and the candidate column description are JOINable. Disregard the column names.",
        deep,
        pad + "Example 1:",
        deep + "Target description: this column represents a worker's id.",
        deep + "Candidate description: 'the column worker_id represents the worker's id'.",
        deep + "Answer: Yes",
        pad,
    ]
    system_msg = "\n".join(system_lines)

    # The message ends right after the colon: no trailing whitespace is kept.
    user_lines = [
        f"Target description:     {target_description}",
        f"{gap}Candidate description:  {candidate_description}",
        f"{gap}Answer:",
    ]
    user_msg = "\n".join(user_lines)

    return system_msg, user_msg

In [6]:
def generate_predictions(target_description, candidate_description, client):
    """Ask the chat model whether two column descriptions are joinable.

    Builds the prompt pair with generate_prompt_predict_possible_joins, sends
    it through execute_prompt, and extracts the answer text from the first
    choice of the completion.

    Args:
        target_description: text describing the target column.
        candidate_description: text describing the candidate column.
        client: client object forwarded unchanged to execute_prompt.

    Returns:
        The reply text after the last 'Answer: ' marker (or the whole reply if
        the marker is absent), stripped of surrounding whitespace. An empty
        string is returned when the reply carries no text content.
    """
    system_msg_predict_joins, user_msg_predict_joins = generate_prompt_predict_possible_joins(target_description, candidate_description)
    result = execute_prompt(client, system_msg_predict_joins, user_msg_predict_joins)

    # The message content is nullable in the chat-completions response; guard
    # against None so one empty reply does not abort a long batch run with an
    # AttributeError.
    content = result.choices[0].message.content
    if content is None:
        return ''

    # The model may echo the "Answer: " label from the prompt; keep only what follows it.
    answer = content.split('Answer: ')[-1].strip()

    return answer

In [7]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg, model="gpt-4o"):
    """Send one system/user message pair to the chat-completions endpoint.

    The call is wrapped by tenacity's ``retry``: up to 6 attempts, with a
    randomised exponential wait bounded between 1 and 60 seconds between
    attempts. No ``retry=`` filter is given, so tenacity's default retry
    condition applies, and no ``reraise`` flag is set, so exhausting the
    attempts surfaces tenacity's own error rather than the last exception.

    Args:
        client: object exposing ``chat.completions.create`` (an OpenAI client
            in this notebook).
        system_msg: content of the system message.
        user_msg: content of the user message.
        model: model identifier passed to the API; defaults to "gpt-4o", the
            value that was previously hard-coded.

    Returns:
        The completion object returned by ``client.chat.completions.create``.
    """
    # f-string wrapping is kept so non-str inputs are formatted to text exactly as before.
    messages = [
        {"role": "system", "content": f"{system_msg}"},
        {"role": "user", "content": f"{user_msg}"},
    ]
    completion = client.chat.completions.create(model=model, messages=messages)
    return completion

In [8]:
# Single client instance reused for every request issued by this notebook.
# No arguments are passed, so credentials/configuration are presumably picked
# up from the environment (e.g. OPENAI_API_KEY) — confirm for your setup.
client = OpenAI()

In [9]:
# Load the candidate column pairs together with their left/right descriptions.
# The path is relative, so it resolves against the working directory set by
# the earlier os.chdir call.
cta_matches = pd.read_csv('Description_test/CTA_from_descriptions/junio_description_cta_matches.csv')
# Preview the first rows to check the columns that were read.
cta_matches.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION
0,datasets_579296_1047868_authors,Author_ID,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,This column contains a unique identifier for e...
1,datasets_517172_952401_train,id_code,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,The 'id_code' column contains unique identifie...
2,eo_pr,EIN,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,The Employer Identification Number (EIN) uniqu...
3,eo_pr,GROUP,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,"Group exemption number, which indicates if the..."
4,eo_pr,SUBSECTION,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,Numerical code indicating the subsection of th...


In [10]:
# (rows, columns) of the loaded frame; the row count is the number of model calls made below.
cta_matches.shape

(2841, 6)

In [11]:
# One model reply per row of cta_matches, collected in row order.
answers = []

# Select the two description columns by label rather than by position, so a
# reordered or extended CSV cannot silently feed the wrong columns to the model.
description_pairs = zip(cta_matches['LEFT_DESCRIPTION'], cta_matches['RIGHT_DESCRIPTION'])

# `total` is passed explicitly because a zip object has no length for tqdm to read.
for target_description, candidate_description in tqdm(description_pairs, total=len(cta_matches)):
    answers.append(generate_predictions(target_description, candidate_description, client))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2841/2841 [27:22<00:00,  1.73it/s]


In [12]:
# Attach the model replies as a new column; `answers` was built one entry per
# row, in row order, so it lines up with cta_matches positionally.
cta_matches['JOINABLE'] = answers

In [13]:
# Preview the frame now that the JOINABLE column has been added.
cta_matches.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION,JOINABLE
0,datasets_579296_1047868_authors,Author_ID,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,This column contains a unique identifier for e...,No
1,datasets_517172_952401_train,id_code,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,The 'id_code' column contains unique identifie...,No
2,eo_pr,EIN,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,The Employer Identification Number (EIN) uniqu...,Yes
3,eo_pr,GROUP,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,"Group exemption number, which indicates if the...",No
4,eo_pr,SUBSECTION,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,Numerical code indicating the subsection of th...,No


In [14]:
# Preview the pairs whose reply is exactly 'Yes'.
cta_matches.loc[cta_matches['JOINABLE'].eq('Yes')].head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION,JOINABLE
2,eo_pr,EIN,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,The Employer Identification Number (EIN) uniqu...,Yes
24,eo4,EIN,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,This column contains the Employer Identificati...,Yes
34,eo_xx,NAME,The SCHOOL_NAME column lists the official name...,schools,SCHOOL_NAME,The NAME column contains the official name of ...,Yes
35,eo_pr,NAME,The NAME column contains the official name of ...,eo_xx,NAME,The official name of the organization or busin...,Yes
36,eo_xx,NAME,The name of the person or company responsible ...,rental-standards-current-issues,BUSINESSOPERATOR,The NAME column contains the official name of ...,Yes


In [15]:
# (rows, columns) of the subset whose reply is exactly 'Yes'.
cta_matches.loc[cta_matches['JOINABLE'].eq('Yes')].shape

(189, 7)

In [16]:
# Persist every pair with its JOINABLE reply (not only the 'Yes' rows).
# index=False keeps the row index out of the file; the path is relative to the
# current working directory.
cta_matches.to_csv('Description_test/CTA_from_descriptions/junio_JD_from_cta_matches.csv',index=False)