In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle

from tqdm import tqdm

In [2]:
# Disable row truncation so displayed DataFrames/Series show every row.
pd.options.display.max_rows = None

In [3]:
# Working directory of the notebook: the relative CSV/NPY paths used in the
# following cells are resolved against it.
# The location defaults to the original author's checkout but can be overridden
# with the CTA_FOR_JD_TESTBED environment variable so the notebook can run on
# another machine without editing this cell.
path = os.environ.get('CTA_FOR_JD_TESTBED', '/home/manoelflorencio/cta_for_jd/testbedXS')
os.chdir(path)  # raises FileNotFoundError early if the directory is missing
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [4]:
def generate_prompt_JD_prediction(target_table_description, target_column_descriptions, candidate_table_description, candidate_column_descriptions):
    """Build the (system, user) message pair asking whether two tables are joinable.

    Args:
        target_table_description: Description of the target table.
        target_column_descriptions: Descriptions of the target table's columns.
        candidate_table_description: Description of the candidate table.
        candidate_column_descriptions: Descriptions of the candidate table's columns.

    All four arguments are interpolated into the user message with default
    string formatting, so non-string values appear as their ``str()`` form.

    Returns:
        tuple: ``(system_msg, user_msg)`` where ``system_msg`` is a fixed
        instruction text and ``user_msg`` contains the four descriptions
        followed by the ``Answer:`` cue.
    """
    # NOTE(review): there is no separator between the candidate column
    # descriptions and "Answer:", and the trailing .strip() never removes
    # anything (the text starts with "Target" and ends with "Answer:").
    # Both are kept as-is so the generated prompts stay unchanged.
    request = """Target Table description: {}
                  Target Table columns descriptions: {}
                  Candidate Table description: {}
                  Candidate Table columns descriptions: {}Answer:""".format(
        target_table_description,
        target_column_descriptions,
        candidate_table_description,
        candidate_column_descriptions,
    ).strip()

    # Fixed task instructions; whitespace inside the literal is part of the prompt.
    instructions = """
            Given a target table and a candidate table, predict if the pair (target table, candidate table) could be joined and answer with only "Yes" or "No".

            Task: Look carefully at the target table description and the candidate table description, as well as the description of their columns, and 
            use these information to identify patterns and relationships between the descriptions. The result must be only the word "Yes" if the JOIN is 
            possible or "No" otherwise.

            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
            based on related columns they may have between them.

            Additional info: consider location as JOINable as if the values are standardized.

           Instructions: 
                1. Look at the description of the target table given to you.
                2. Look at the description of the columns of the target table given to you.
                3. Look at the description of the candidate table given to you.
                4. Look at the description of the columns of the candidate table given to you.
                5. Predict if the target table and the candidate tables are joinable, answer with "Yes" or "No".
    """

    return instructions, request

In [5]:
# Load the per-column descriptions (the preview shows the columns TableName,
# Column, Description and CTA) and display the first five rows.
column_descriptions_csv = 'Description_test/table_descriptions_test_2025_01_05/all_junio_descriptions_with_cta.csv'
column_descriptions = pd.read_csv(column_descriptions_csv)
column_descriptions.head(5)

Unnamed: 0,TableName,Column,Description,CTA
0,eo_xx,EIN,The Employer Identification Number (EIN) is a ...,Identifier
1,eo_xx,NAME,The NAME column contains the official name of ...,Organization
2,eo_xx,ICO,The ICO column appears to contain names of ind...,Person
3,eo_xx,STREET,The STREET column provides the street address ...,streetAddress
4,eo_xx,CITY,The CITY column specifies the city where the o...,Text


In [6]:
# Load the per-table descriptions (the preview shows the columns TableName and
# Description) and display the first five rows.
table_descriptions_csv = 'Description_test/table_descriptions_test_2025_01_05/table_descriptions.csv'
table_descriptions = pd.read_csv(table_descriptions_csv)
table_descriptions.head(5)

Unnamed: 0,TableName,Description
0,statewise-census-data-in-india-1901-2011.csv,"The table ""statewise-census-data-in-india-1901..."
1,road-ahead-current-road-closures.csv,"The table ""road-ahead-current-road-closures.cs..."
2,property-tie-lines.csv,"The table ""property-tie-lines.csv"" contains in..."
3,public-art.csv,The public-art.csv table provides detailed inf...
4,gvrd-sewer-trunk-mains.csv,"The ""gvrd-sewer-trunk-mains.csv"" table provide..."


In [7]:
# Load the table pairs to evaluate (the preview shows the columns LEFT_TABLE
# and RIGHT_TABLE) and display the first five rows.
table_matches_csv = 'Description_test/table_descriptions_test_2025_01_05/table_cartesians.csv'
table_matches = pd.read_csv(table_matches_csv)
table_matches.head(5)

Unnamed: 0,LEFT_TABLE,RIGHT_TABLE
0,statewise-census-data-in-india-1901-2011.csv,road-ahead-current-road-closures.csv
1,statewise-census-data-in-india-1901-2011.csv,property-tie-lines.csv
2,statewise-census-data-in-india-1901-2011.csv,public-art.csv
3,statewise-census-data-in-india-1901-2011.csv,gvrd-sewer-trunk-mains.csv
4,statewise-census-data-in-india-1901-2011.csv,SCS_Staff_Salaries_data_30th_June 2010.csv


In [8]:
# Build one (system_msg, user_msg) prompt per (LEFT_TABLE, RIGHT_TABLE) pair.
prompts = []


def _lookup_descriptions(table_file_name):
    """Return ``(table description, column descriptions)`` for one table file name.

    Columns are addressed by name rather than by position, so an extra or
    re-ordered column in the CSVs cannot silently shift the lookup.

    Args:
        table_file_name: Table name as listed in ``table_matches`` /
            ``table_descriptions`` (the previews show names ending in ``.csv``).

    Returns:
        tuple: the first matching table description and an array with the
        ``Description`` values of that table's rows in ``column_descriptions``
        (empty when the table has no column descriptions).

    Raises:
        KeyError: if ``table_descriptions`` has no row for ``table_file_name``.
    """
    table_rows = table_descriptions.loc[
        table_descriptions['TableName'] == table_file_name, 'Description'
    ]
    if table_rows.empty:
        raise KeyError(f"No table description found for {table_file_name!r}")

    # column_descriptions appears to key tables without the '.csv' extension
    # (e.g. 'eo_xx'), so drop the suffix, but only when it is actually there.
    if table_file_name.lower().endswith('.csv'):
        column_table_name = table_file_name[:-4]
    else:
        column_table_name = table_file_name

    column_rows = column_descriptions.loc[
        column_descriptions['TableName'] == column_table_name, 'Description'
    ]
    return table_rows.iloc[0], column_rows.values


# NOTE(review): the column descriptions are passed as NumPy arrays and end up in
# the prompt through plain string formatting (array repr, no commas); confirm
# this is the intended prompt layout before changing it.
for left_table_name, right_table_name in tqdm(
        zip(table_matches['LEFT_TABLE'], table_matches['RIGHT_TABLE']),
        total=len(table_matches)):
    target_table_description, target_column_descriptions = _lookup_descriptions(left_table_name)
    candidate_table_description, candidate_column_descriptions = _lookup_descriptions(right_table_name)

    prompt = generate_prompt_JD_prediction(target_table_description, target_column_descriptions,
                                           candidate_table_description, candidate_column_descriptions)
    prompts.append(prompt)

100%|███████████████████████████████████████████████████████████████████████████████| 378/378 [00:00<00:00, 1047.50it/s]


In [9]:
# Sanity check: display the first (system message, user message) prompt pair.
prompts[0]

('\n            Given a target table and a candidate table, predict if the pair (target table, candidate table) could be joined and answer with only "Yes" or "No".\n\n            Task: Look carefully at the target table description and the candidate table description, as well as the description of their columns, and \n            use these information to identify patterns and relationships between the descriptions. The result must be only the word "Yes" if the JOIN is \n            possible or "No" otherwise.\n\n            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them \n            based on related columns they may have between them.\n\n            Additional info: consider location as JOINable as if the values are standardized.\n\n           Instructions: \n                1. Look at the description of the target table given to you.\n                2. Look at the description of the columns of the target ta

In [10]:
# Persist the generated prompts as a .npy file (np.save converts the list of
# (system, user) tuples to an array before writing).
prompts_output_path = 'Description_test/table_descriptions_test_2025_01_05/Table_JOIN_Prompts/table_JD_with_columns_v3.npy'
# Create the output directory first: np.save fails with FileNotFoundError when
# the target directory does not exist.
os.makedirs(os.path.dirname(prompts_output_path), exist_ok=True)
np.save(prompts_output_path, prompts)