In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
pd.set_option('display.max_rows', None)

In [3]:
path = '/home/manoelflorencio/cta_for_jd/LakeBench'
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/LakeBench


In [4]:
def generate_prompt_cta_prediction(description, cta_types):

    system_msg = f"""
            Your task is to classify a column's description of a given table with only one of the following classes that are separated 
            with comma: {",".join(cta_types)}.

            Your instructions are:
                1. Look at the input given to you.
                2. Look at the description semantics in detail.
                3. Predict a class among the following classes {",".join(cta_types)} for the target column.
                4. Answer only with the predicted class.

            Example 1:
                Description: "The 'Sex' column in the table represents the gender of the animal listed in each row. The values in this column consist of abbreviations that denote both the sex and reproductive status of the animals. 'M' indicates a male animal, and 'F' stands for a female animal.
                Class: category

            Example 2:
                Description: "The ""GeoLocalArea"" column in the table represents the geographical local area within which the public artworks or installations are situated or associated.
                Class: streetAddress"""
            
    user_msg =  f"""Description: {description}
                    Class: """.strip()
    
    return system_msg, user_msg

In [5]:
def generate_predictions(description, cta_types, client):
    
    system_msg_predict_cta, user_msg_predict_cta = generate_prompt_cta_prediction(description, cta_types)
    result = execute_prompt(client, system_msg_predict_cta, user_msg_predict_cta)
    cta = result.choices[0].message.content.split('Class: ')[-1].strip()
    
    return cta

In [6]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg):
    completion = client.chat.completions.create(
                                            model="gpt-4o",
                                            messages=[
                                                        {
                                                            "role": "system", 
                                                             "content": f"{system_msg}"
                                                        },
                                                        {
                                                            "role": "user",
                                                            "content": f"{user_msg}"
                                                        }
                                                    ]
                                            )
    return completion

In [7]:
client = OpenAI()

In [8]:
descriptions = pd.read_csv('column_descriptions.csv')
descriptions.head()

Unnamed: 0,TableName,Column,Description
0,SG_CSV0000000000000925.csv,respondent_serial,The 'respondent_serial' column contains unique...
1,SG_CSV0000000000000925.csv,respondent_id,The 'respondent_id' column contains unique ide...
2,SG_CSV0000000000000925.csv,country,"The ""country"" column contains the name of the ..."
3,SG_CSV0000000000000925.csv,yearbornin_1_slice,The 'yearbornin_1_slice' column indicates a sl...
4,SG_CSV0000000000000925.csv,yearbornin_1_slice1,The 'yearbornin_1_slice1' column represents th...


In [10]:
cta_types = np.load('cta_types.npy')

In [11]:
cta = []
for table_name, column_name, description in tqdm(descriptions.values):
    target_description = description
    cta.append(generate_predictions(target_description, cta_types, client))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 201/201 [01:58<00:00,  1.70it/s]


In [12]:
descriptions['CTA'] = cta

In [13]:
descriptions.to_csv('column_descriptions_with_cta.csv',index=False)

In [14]:
descriptions_with_cta = pd.read_csv('column_descriptions_with_cta.csv')

In [15]:
descriptions_with_cta.head()

Unnamed: 0,TableName,Column,Description,CTA
0,SG_CSV0000000000000925.csv,respondent_serial,The 'respondent_serial' column contains unique...,Identifier
1,SG_CSV0000000000000925.csv,respondent_id,The 'respondent_id' column contains unique ide...,Identifier
2,SG_CSV0000000000000925.csv,country,"The ""country"" column contains the name of the ...",Country
3,SG_CSV0000000000000925.csv,yearbornin_1_slice,The 'yearbornin_1_slice' column indicates a sl...,Date
4,SG_CSV0000000000000925.csv,yearbornin_1_slice1,The 'yearbornin_1_slice1' column represents th...,Date


In [16]:
table_names = descriptions_with_cta.TableName.unique()
table_names

array(['SG_CSV0000000000000925.csv', 'SG_CSV0000000000001714.csv'],
      dtype=object)

In [17]:
matches = pd.DataFrame({'LEFT_TABLE':[],'LEFT_COLUMN':[],'RIGHT_TABLE':[],'RIGHT_COLUMN':[]})
for table_name in table_names:
    target_table = descriptions_with_cta[descriptions_with_cta['TableName'] == table_name]
    candidate_tables = descriptions_with_cta[descriptions_with_cta['TableName'] != table_name]
    for i in range(len(target_table.Column)):
        for j in range(len(candidate_tables.Column)):
            if(target_table.CTA.values[i] == candidate_tables.CTA.values[j]):
                match = pd.DataFrame({'LEFT_TABLE':[table_name],
                                      'LEFT_COLUMN':[target_table.Column.values[i]],
                                      'LEFT_DESCRIPTION':[target_table.Description.values[i]],
                                      'RIGHT_TABLE':[candidate_tables.TableName.values[j]],
                                      'RIGHT_COLUMN':[candidate_tables.Column.values[j]],
                                      'RIGHT_DESCRIPTION':[candidate_tables.Description.values[j]]})
                if(len(matches) == 0):
                    matches = match
                else:
                    matches = pd.concat([matches, match], axis=0)

In [18]:
matches.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION
0,SG_CSV0000000000000925.csv,respondent_serial,The 'respondent_serial' column contains unique...,SG_CSV0000000000001714.csv,respondent_serial,"The ""respondent_serial"" column contains unique..."
0,SG_CSV0000000000000925.csv,respondent_serial,The 'respondent_serial' column contains unique...,SG_CSV0000000000001714.csv,respondent_id,The 'respondent_id' column contains unique alp...
0,SG_CSV0000000000000925.csv,respondent_id,The 'respondent_id' column contains unique ide...,SG_CSV0000000000001714.csv,respondent_serial,"The ""respondent_serial"" column contains unique..."
0,SG_CSV0000000000000925.csv,respondent_id,The 'respondent_id' column contains unique ide...,SG_CSV0000000000001714.csv,respondent_id,The 'respondent_id' column contains unique alp...
0,SG_CSV0000000000000925.csv,country,"The ""country"" column contains the name of the ...",SG_CSV0000000000001714.csv,country,"The ""country"" column indicates the country whe..."


In [19]:
matches.shape

(7444, 6)

In [20]:
def generate_key_for_evaluation(df):
    left_table_name = []
    left_column_name = []
    left_description = []
    right_table_name = []
    right_column_name = []
    right_description = []
    df_key = pd.DataFrame((), columns=df.columns)
    
    for i in range(df.shape[0]):
        rtn = df.iloc[i, 3]
        ltn = df.iloc[i, 0]
        right_table_name.append(rtn if rtn > ltn else ltn)
        left_table_name.append(rtn if rtn < ltn else ltn)
        
    
        rcn = df.iloc[i, 4]
        lcn = df.iloc[i, 1]
        right_column_name.append(rcn if rtn > ltn else lcn)
        left_column_name.append(rcn if rtn < ltn else lcn)

        rcd = df.iloc[i, 5]
        lcd = df.iloc[i, 2]
        left_description.append(rcd if rtn > ltn else lcd)
        right_description.append(rcd if rtn < ltn else lcd)
    
    d = {'LEFT_TABLE':left_table_name,
         'LEFT_COLUMN':left_column_name,
         'LEFT_DESCRIPTION':left_description,
         'RIGHT_TABLE':right_table_name,
         'RIGHT_COLUMN':right_column_name,
         'RIGHT_DESCRIPTION':right_description}
    df_key = pd.DataFrame(d)
    df_key = df_key.drop_duplicates()
    df_key = df_key.reset_index(drop=True)

    return df_key

In [21]:
matches = generate_key_for_evaluation(matches)
matches.shape

(3722, 6)

In [22]:
matches.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION
0,SG_CSV0000000000000925.csv,respondent_serial,"The ""respondent_serial"" column contains unique...",SG_CSV0000000000001714.csv,respondent_serial,The 'respondent_serial' column contains unique...
1,SG_CSV0000000000000925.csv,respondent_serial,The 'respondent_id' column contains unique alp...,SG_CSV0000000000001714.csv,respondent_id,The 'respondent_serial' column contains unique...
2,SG_CSV0000000000000925.csv,respondent_id,"The ""respondent_serial"" column contains unique...",SG_CSV0000000000001714.csv,respondent_serial,The 'respondent_id' column contains unique ide...
3,SG_CSV0000000000000925.csv,respondent_id,The 'respondent_id' column contains unique alp...,SG_CSV0000000000001714.csv,respondent_id,The 'respondent_id' column contains unique ide...
4,SG_CSV0000000000000925.csv,country,"The ""country"" column indicates the country whe...",SG_CSV0000000000001714.csv,country,"The ""country"" column contains the name of the ..."


In [23]:
matches.to_csv('column_description_cta_matches.csv', index=False)