In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle

from tqdm import tqdm

In [2]:
# Lift pandas' cap on displayed rows so frames/series shown in this notebook are never truncated.
pd.options.display.max_rows = None

In [3]:
# Working directory for the notebook: every relative file path used below resolves against it.
# Set the LAKEBENCH_DIR environment variable to run on another machine; when it is unset
# the original author's location is used, so existing runs behave exactly as before.
path = os.environ.get('LAKEBENCH_DIR', '/home/manoelflorencio/cta_for_jd/LakeBench')
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/LakeBench


In [4]:
def generate_prompt_JD_prediction(target_table_description, target_column_descriptions, candidate_table_description, candidate_column_descriptions):
    """Build the (system, user) message pair asking whether two tables can be joined.

    Args:
        target_table_description: Description of the target table.
        target_column_descriptions: Descriptions of the target table's columns.
        candidate_table_description: Description of the candidate table.
        candidate_column_descriptions: Descriptions of the candidate table's columns.

    All four arguments are interpolated into the user message with an f-string,
    so any object is accepted and rendered through its ``str()`` form.

    Returns:
        tuple: ``(system_msg, user_msg)``. ``system_msg`` is the fixed task
        instruction; ``user_msg`` holds the four descriptions followed by a
        final ``Answer:`` cue on its own line.
    """
    system_msg = """
            Given a target table and a candidate table, predict if the pair (target table, candidate table) could be joined and answer with only "Yes" or "No".

            Task: Look carefully at the target table description and the candidate table description, as well as the description of their columns, and 
            use these information to identify patterns and relationships between the descriptions. The result must be only the word "Yes" if the JOIN is 
            possible or "No" otherwise.

            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
            based on related columns they may have between them.

            Additional info: consider location as JOINable as if the values are standardized.

           Instructions: 
                1. Look at the description of the target table given to you.
                2. Look at the description of the columns of the target table given to you.
                3. Look at the description of the candidate table given to you.
                4. Look at the description of the columns of the candidate table given to you.
                5. Predict if the target table and the candidate tables are joinable, answer with "Yes" or "No".
    """
    # "Answer:" sits on its own line: previously it was concatenated directly onto the
    # candidate column descriptions, leaving no separator between the data and the cue.
    user_msg =  f"""Target Table description: {target_table_description}
                  Target Table columns descriptions: {target_column_descriptions}
                  Candidate Table description: {candidate_table_description}
                  Candidate Table columns descriptions: {candidate_column_descriptions}
                  Answer:""".strip()

    return system_msg, user_msg

In [5]:
# Load the per-column descriptions (relative path, resolved against the current working
# directory) and preview the first rows.
column_descriptions = pd.read_csv('column_descriptions_with_cta.csv')
column_descriptions.head()

Unnamed: 0,TableName,Column,Description,CTA
0,SG_CSV0000000000000925.csv,respondent_serial,The 'respondent_serial' column contains unique...,Identifier
1,SG_CSV0000000000000925.csv,respondent_id,The 'respondent_id' column contains unique ide...,Identifier
2,SG_CSV0000000000000925.csv,country,"The ""country"" column contains the name of the ...",Country
3,SG_CSV0000000000000925.csv,yearbornin_1_slice,The 'yearbornin_1_slice' column indicates a sl...,Date
4,SG_CSV0000000000000925.csv,yearbornin_1_slice1,The 'yearbornin_1_slice1' column represents th...,Number


In [6]:
# (rows, columns) of the column-description frame.
column_descriptions.shape

(6756, 4)

In [7]:
# Load the per-table descriptions (relative path, resolved against the current working
# directory) and preview the first rows.
table_descriptions = pd.read_csv('table_descriptions.csv')
table_descriptions.head()

Unnamed: 0,TableName,Description
0,SG_CSV0000000000001178.csv,The table SG_CSV0000000000001178.csv presents ...
1,SG_CSV0000000000000451.csv,"The table ""SG_CSV0000000000000451.csv"" contain..."
2,SG_CSV0000000000000147.csv,The table contains data on the average monthly...
3,SG_CSV0000000000000048.csv,"The table ""SG_CSV0000000000000048.csv"" contain..."
4,SG_CSV0000000000001638.csv,The table SG_CSV0000000000001638.csv contains ...


In [8]:
# (rows, columns) of the table-description frame.
table_descriptions.shape

(1256, 2)

In [9]:
# Load the table pairs to evaluate (LEFT_TABLE / RIGHT_TABLE columns) and preview the first rows.
# NOTE(review): the file name suggests this is the full cartesian product of tables — confirm.
table_matches = pd.read_csv('table_cartesians.csv')
table_matches.head()

Unnamed: 0,LEFT_TABLE,RIGHT_TABLE
0,SG_CSV0000000000001178.csv,SG_CSV0000000000000451.csv
1,SG_CSV0000000000001178.csv,SG_CSV0000000000000147.csv
2,SG_CSV0000000000001178.csv,SG_CSV0000000000000048.csv
3,SG_CSV0000000000001178.csv,SG_CSV0000000000001638.csv
4,SG_CSV0000000000001178.csv,SG_CSV0000000000000744.csv


In [10]:
# Keep only the pairs in which SG_CSV0000000000000925.csv appears on either side.
table_matches = table_matches[
    table_matches[['LEFT_TABLE', 'RIGHT_TABLE']].eq('SG_CSV0000000000000925.csv').any(axis=1)
]

In [11]:
# (rows, columns) of the pair list after filtering.
table_matches.shape

(1255, 2)

In [12]:
# Index both description frames once, so each table pair costs a few dictionary lookups
# instead of a full boolean-mask scan of each frame for each side of the pair.
# drop_duplicates keeps the first row per table, matching the original `.iloc[0, 1]` choice.
table_description_by_name = (
    table_descriptions
    .drop_duplicates(subset='TableName')
    .set_index('TableName')['Description']
    .to_dict()
)
column_descriptions_by_table = {
    table_name: descriptions.values
    for table_name, descriptions in column_descriptions.groupby('TableName', sort=False)['Description']
}
# Fallback for tables with no described columns: an empty array of the same dtype,
# which is what the mask-based selection used to return.
no_column_descriptions = column_descriptions['Description'].values[:0]

prompts = []  # one (system_msg, user_msg) tuple per table pair, in table_matches row order
table_pairs = zip(table_matches['LEFT_TABLE'], table_matches['RIGHT_TABLE'])

for left_table_name, right_table_name in tqdm(table_pairs, total=len(table_matches)):
    for table_name in (left_table_name, right_table_name):
        if table_name not in table_description_by_name:
            # Fail with the offending table name rather than an opaque positional IndexError.
            raise KeyError(f'No table description found for {table_name!r}')

    # The left table plays the "target" role and the right table the "candidate" role.
    prompt = generate_prompt_JD_prediction(
        table_description_by_name[left_table_name],
        column_descriptions_by_table.get(left_table_name, no_column_descriptions),
        table_description_by_name[right_table_name],
        column_descriptions_by_table.get(right_table_name, no_column_descriptions),
    )
    prompts.append(prompt)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1255/1255 [00:03<00:00, 324.06it/s]


In [13]:
# The prompts are built; drop the references to the source frames.
del column_descriptions, table_descriptions, table_matches

In [14]:
# Sanity check: number of prompts built (the loop appends one per table pair).
len(prompts)

1255

In [15]:
# Inspect the first (system_msg, user_msg) pair.
prompts[0]

('\n            Given a target table and a candidate table, predict if the pair (target table, candidate table) could be joined and answer with only "Yes" or "No".\n\n            Task: Look carefully at the target table description and the candidate table description, as well as the description of their columns, and \n            use these information to identify patterns and relationships between the descriptions. The result must be only the word "Yes" if the JOIN is \n            possible or "No" otherwise.\n\n            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them \n            based on related columns they may have between them.\n\n            Additional info: consider location as JOINable as if the values are standardized.\n\n           Instructions: \n                1. Look at the description of the target table given to you.\n                2. Look at the description of the columns of the target ta

In [16]:
# Persist the prompts. np.save converts the list of (system_msg, user_msg) tuples to an
# array before writing, so the file loads back as an array rather than a list.
# np.save does not create missing parent directories, so make sure the target folder
# exists first (a no-op when it is already there).
output_path = 'Table_JOIN_Prompts/table_JD_with_columns_SG_CSV0000000000000925.npy'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.save(output_path, prompts)