In [1]:
import pandas as pd
import numpy as np
import json

import pickle
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
def generate_prompt_column_description(table, column, max_rows=30):
    """Build the (system, user) prompt pair asking the model to describe one column.

    Parameters
    ----------
    table : pandas.DataFrame
        Table that contains the target column.
    column : hashable
        Label of the column to describe; interpolated into the prompt as-is.
    max_rows : int, default 30
        Number of leading rows of ``table`` included in the prompt as sample values.

    Returns
    -------
    tuple[str, str]
        ``(system_msg, user_msg)``; ``user_msg`` ends with the ``Description:``
        cue that the model is expected to complete.
    """
    system_msg = """
            Describe the semantics of a target column.
            Task: Describe the information within a column in a given table using continuos text, without itemization.
            Instructions: 
                1. Look at the input given to you. 
                2. Look at the column values in detail. 
                3. Describe the target column. 
            """

    # Render plain Python lists instead of the Index / ndarray objects: their
    # reprs are elided with "..." for wide tables (pandas display.max_seq_items,
    # numpy print threshold), which silently removed most columns -- often the
    # target column itself -- from the prompt.
    column_names = list(table.columns)
    sample_rows = table.iloc[:max_rows, :].values.tolist()

    user_msg = f"""Table columns: {column_names}
                   Table values:  {sample_rows}
                   Target column: {column}
                   Description: """.strip()

    return system_msg, user_msg

In [44]:
def generate_prompt_predict_possible_joins(target_description, candidate_descriptions):
    """Build the (system, user) prompt pair asking the model for joinable column pairs.

    Parameters
    ----------
    target_description : str
        Description of the target column.
    candidate_descriptions : str or mapping-like or iterable
        Candidate column descriptions. A string is used verbatim. An object
        exposing ``.items()`` (e.g. a pandas Series or a dict) is rendered as
        one ``label: description`` line per entry. Any other iterable is
        rendered as one description per line.

    Returns
    -------
    tuple[str, str]
        ``(system_msg, user_msg)``; ``user_msg`` ends with the ``Possible JOINs:``
        cue that the model is expected to complete.
    """
    system_msg = """
            Given one target column description and many candidate column descriptions, predict all the pairs (target description column name, candidate description column name) 
            that could be joined.

            Task: Look carefully at the target description of the target column and candidate column descriptions and use this information to identify patterns and 
            relationships between the descriptions, the result must be a list of all the JOINable pairs found. If no joinable pair is found the result
            should be just the word "none".

            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
            based on related  columns between them.
            
            Instructions: 
                1. Look at the target description given to you. 
                2. Look at the candidate descriptions in detail. 
                3. Predict all the possible JOIN between those descriptions. 
            """

    # Interpolating a pandas Series directly puts its *display* repr into the
    # prompt, which drops rows and clips long strings. Render every candidate
    # explicitly so the model sees all of them in full.
    if isinstance(candidate_descriptions, str):
        candidates_text = candidate_descriptions
    elif hasattr(candidate_descriptions, "items"):
        candidates_text = "\n".join(
            f"{label}: {text}" for label, text in candidate_descriptions.items()
        )
    else:
        try:
            candidates_text = "\n".join(str(text) for text in candidate_descriptions)
        except TypeError:
            # Not iterable: fall back to the plain string form, as before.
            candidates_text = str(candidate_descriptions)

    user_msg = f"""Target description:      {target_description}
                   Candidate descriptions:  {candidates_text}
                   Possible JOINs: """.strip()

    return system_msg, user_msg

In [45]:
def generate_predictions(df, client):
    """Request a model-written description for every column of ``df``.

    One prompt is built and executed per column. The text after the last
    ``'Description: '`` marker in the reply (or the whole reply when the
    marker is absent) is kept, stripped of surrounding whitespace.

    Returns a DataFrame with a ``Column`` field holding ``df.columns`` and a
    ``Description`` field holding the corresponding descriptions.
    """
    marker = 'Description: '
    descriptions = []

    for column_name in tqdm(df.columns):
        system_prompt, user_prompt = generate_prompt_column_description(df, column_name)
        response = execute_prompt(client, system_prompt, user_prompt)
        reply_text = response.choices[0].message.content
        descriptions.append(reply_text.split(marker)[-1].strip())

    return pd.DataFrame({"Column": df.columns, "Description": descriptions})

In [46]:
def generate_join_predictions(df1, df2, client):
    """Ask the model, for each described column of ``df1``, which columns of ``df2`` it could join with.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Description tables with a ``Description`` field and, normally, a
        ``Column`` field holding the column names. When ``Column`` is missing
        the row index is used as the label instead.
    client
        Client object forwarded to ``execute_prompt``.

    Returns
    -------
    list[str]
        One raw model answer per row of ``df1``, in order: the text after the
        last ``'Possible JOINs: '`` marker, stripped.
    """
    def _labelled(descriptions_df):
        # The prompt asks for pairs of column *names*, so every description is
        # prefixed with the name it belongs to.
        if "Column" in descriptions_df.columns:
            names = descriptions_df["Column"]
        else:
            names = descriptions_df.index
        return [
            f"{name}: {text}"
            for name, text in zip(names, descriptions_df["Description"])
        ]

    # Built once as a plain string: interpolating the Series itself would put
    # its truncated display repr (clipped strings, elided rows) into the prompt.
    candidates_text = "\n".join(_labelled(df2))

    joins = []
    for target_text in tqdm(_labelled(df1)):
        system_msg_predict_join, user_msg_predict_join = generate_prompt_predict_possible_joins(target_text, candidates_text)
        result = execute_prompt(client, system_msg_predict_join, user_msg_predict_join)
        join = result.choices[0].message.content.split('Possible JOINs: ')[-1].strip()
        joins.append(join)

    return joins

In [5]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg, model="gpt-4o"):
    """Send one system + user message pair to the chat completions endpoint.

    The call is retried on any exception, with a randomised exponential wait
    between 1 and 60 seconds, for at most 6 attempts.

    Parameters
    ----------
    client
        Object exposing ``chat.completions.create`` (an ``OpenAI`` client in
        this notebook).
    system_msg, user_msg
        Message contents; converted to ``str`` before sending.
    model : str, default "gpt-4o"
        Model identifier passed to the API. The default keeps the previous
        hard-coded behaviour.

    Returns
    -------
    The completion object returned by ``client.chat.completions.create``.
    """
    messages = [
        {"role": "system", "content": str(system_msg)},
        {"role": "user", "content": str(user_msg)},
    ]
    return client.chat.completions.create(model=model, messages=messages)

In [6]:
# Shared API client used by every prompt call below. No credentials are passed
# explicitly -- presumably the SDK reads them from the environment
# (e.g. OPENAI_API_KEY); confirm before running on a fresh machine.
client = OpenAI()

In [7]:
# Load the reference list of joinable (query column, candidate column) pairs and preview it.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a row
# index written into the CSV; consider index_col=0 if it is not real data -- confirm.
ground_truth = pd.read_csv('opendata_join_ground_truth_clean.csv')
ground_truth.head()

Unnamed: 0,query_table,candidate_table,query_column,candidate_column
0,SG_CSV0000000000000925.csv,SG_CSV0000000000001714.csv,ban_education4,ban_education4
1,SG_CSV0000000000000925.csv,SG_CSV0000000000001714.csv,levelsch,levelsch
2,SG_CSV0000000000000925.csv,SG_CSV0000000000001714.csv,socialnetwork_5_slice,socialnetwork_5_slice
3,SG_CSV0000000000000925.csv,SG_CSV0000000000001714.csv,ban_children4,ban_children4
4,SG_CSV0000000000000925.csv,SG_CSV0000000000001714.csv,q20_q212,q20_q212


In [8]:
# Load the query table and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved row index; as loaded, it is a regular column of the frame and will be
# treated like any other column by code that iterates over df_0925.columns -- confirm intent.
df_0925 = pd.read_csv('datasets_SG/SG_CSV0000000000000925.csv')
df_0925.head()

Unnamed: 0,respondent_serial,respondent_id,country,yearbornin_1_slice,yearbornin_1_slice1,ageofrespondent,indicateage,gender,occstatus,levelsch,...,ban_q30_4,ban_q32,ban_q431,ban_q432,ban_q433,ban_q434,ban_q435,ban_q436,ban_q437,md_segment
0,1184,GEN24_ 2833,Singapore,1961,60,60,55-64 years old,Female,Housewife / homemaker,-,...,T2B,Neutral,No,No,Yes,No,No,No,Yes,Loyal_betrayal
1,730,GEN24_.2328.,Singapore,1951,70,70,65 years old and above,Female,Employee (working full-time in private sector),-,...,T2B,Neutral,No,No,Yes,No,No,No,Yes,Harm_Care
2,15,GEN24_1001,Singapore,1983,38,38,35-44 years old,Male,Employee (working full-time in private sector),-,...,Neutral,B2B,No,No,No,Yes,No,Yes,No,Fairness_Cheating
3,18,GEN24_1002,Singapore,1955,66,66,65 years old and above,Male,Retiree,-,...,T2B,Neutral,No,No,Yes,No,No,Yes,No,Harm_Care
4,23,GEN24_1003,Singapore,1965,56,56,55-64 years old,Female,Housewife / homemaker,-,...,T2B,B2B,No,No,No,Yes,No,Yes,No,Purity_Degradation


In [9]:
# Load the candidate table and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved row index; as loaded, it is a regular column of the frame and will be
# treated like any other column by code that iterates over df_1714.columns -- confirm intent.
df_1714 = pd.read_csv('datasets_SG/SG_CSV0000000000001714.csv')
df_1714.head()

Unnamed: 0,respondent_serial,respondent_id,country,yearbornin_1_slice,yearbornin_1_slice1,ageofrespondent,indicateage,gender,occstatus,levelsch,...,ban_q30_3,ban_q30_4,ban_q32,ban_q431,ban_q432,ban_q433,ban_q434,ban_q435,ban_q436,ban_q437
0,3,GEN24_4311,Singapore,1988,33,33,25-34 years old,Female,Housewife / homemaker,-,...,T2B,T2B,B2B,No,No,No,Yes,No,Yes,No
1,4,GEN24_4312,Singapore,1977,44,44,35-44 years old,Female,Employee (working full-time in private sector),-,...,T2B,T2B,B2B,No,No,No,Yes,No,Yes,No
2,5,GEN24_2966,Singapore,1968,53,53,45-54 years old,Female,"Unemployed, for more than 6 months",-,...,Neutral,T2B,Neutral,No,No,Yes,No,No,No,Yes
3,6,GEN24_4313,Singapore,1987,34,34,25-34 years old,Male,Employee (working full-time in private sector),-,...,T2B,T2B,Neutral,No,No,No,Yes,No,Yes,No
4,7,GEN24_4314,Singapore,1958,63,63,55-64 years old,Female,"Unemployed, for more than 6 months",-,...,T2B,T2B,Neutral,No,No,Yes,No,No,No,Yes


In [18]:
# Describe every column of the query table (one model request per column).
# The recorded run processed 101 columns in roughly seven minutes.
df_0925_descriptions = generate_predictions(df_0925, client)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 101/101 [06:53<00:00,  4.10s/it]


In [19]:
# Persist the query-table descriptions so the model calls need not be repeated.
# The row index is not written (index=False).
df_0925_descriptions.to_csv('descriptions/SG_CSV0000000000000925.csv', index=False)

In [20]:
# Describe every column of the candidate table (one model request per column).
# The recorded run processed 100 columns in roughly seven minutes.
df_1714_descriptions = generate_predictions(df_1714, client)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [06:45<00:00,  4.05s/it]


In [21]:
# Persist the candidate-table descriptions so the model calls need not be repeated.
# The row index is not written (index=False).
df_1714_descriptions.to_csv('descriptions/SG_CSV0000000000001714.csv', index=False)

In [48]:
# For each described column of the query table, ask the model which described
# columns of the candidate table it could be joined with. The result is a list
# of raw, free-form model answers (one per query column), displayed below.
joins = generate_join_predictions(df_0925_descriptions, df_1714_descriptions, client)
joins

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 101/101 [02:13<00:00,  1.32s/it]


['1. (respondent_serial, respondent_serial)\n2. (respondent_serial, respondent_id)',
 '1. (Target: "respondent_id", Candidate: "respondent_serial")\n2. (Target: "respondent_id", Candidate: "respondent_id")',
 '- (country, country): The target description and candidate description index 2 both refer to a "country" column that contains information related to countries. This commonality suggests that these columns can be joined based on their shared focus on country information.',
 '1. (yearbornin_1_slice, yearbornin_1_slice1)',
 '1. (yearbornin_1_slice1_slice1, yearbornin_1_slice)',
 '- (ageofrespondent, respondent_id)',
 'none',
 'none',
 'none',
 'none',
 'Given the target description and the candidate descriptions, I\'ll look for possible JOINs based on potential keys or related fields. From the descriptions provided, there are no direct connections or keys mentioned that could be used to JOIN with the "highestlevel" target column description.\n\nWithout specific information about the

In [53]:
# Save the raw model answers to disk.
# NOTE(review): the file name is spelled 'predicetd' (sic). It is left as-is
# because other code may load the file under this exact name -- confirm before renaming.
np.save('predicetd_joins.npy', joins)

In [None]:
# Score the predicted join pairs against the ground truth: precision, recall, F1.
# NOTE(review): `all_predicted_joinable_columns_joins` and
# `df_joinable_columns_joins` are not defined in any cell shown here, so this
# cell fails on a fresh kernel -- they must be built (or loaded) first.
# NOTE(review): the `in` test below is only a value lookup if
# `df_joinable_columns_joins` is a list/set of "a;b;..." strings; on a
# DataFrame `in` checks column labels and on a Series it checks the index -- confirm.
tp = 0
fp = 0

for i in range(len(all_predicted_joinable_columns_joins)):
    # Each predicted row is flattened into a single ";"-separated key.
    if(";".join(all_predicted_joinable_columns_joins.iloc[i,:]) in df_joinable_columns_joins):
        tp += 1
    else:
        fp += 1

# Depends only on the final tp, so it is computed once after the loop.
fn = len(df_joinable_columns_joins) - tp

# Guard every ratio: with no predictions, an empty ground truth, or no true
# positives, the unguarded divisions raise ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

print(precision, recall, f1_score)