In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
# def generate_prompt_predict_possible_joins(target_description, candidate_descriptions):

#     system_msg = f"""
#             Given one target column description and many candidate column descriptions, predict all the pairs (target description column name, candidate description column name) 
#             that could be joined.

#             Task: Look carefully at the target description of the target column and candidate column descriptions and use this information to identify patterns and 
#             relationships between the descriptions, the result must be a list of all the JOINable pairs found. If no joinable pair is found the result
#             should be just the word "none".

#             Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
#             based on related  columns between them.
            
#             Instructions: 
#                 1. Look at the target description given to you. 
#                 2. Look at the candidate descriptions in detail. 
#                 3. Predict all the possible JOIN between those descriptions. 
#             """
    
#     user_msg = f"""Target description:      {target_description}
#                    Candidate descriptions:  {candidate_descriptions}
#                    Possible JOINs: """.strip()
    
#     return system_msg, user_msg

In [3]:
def generate_prompt_predict_possible_joins(target_description, candidate_descriptions, candidate_tables):

    system_msg = f"""
            Given one target column description and many candidate column descriptions, predict all the pairs (candidate table name, candidate description column name) 
            that could be joined.

            Task: Look carefully at the target description of the target column and candidate column descriptions and use this information to identify patterns and 
            relationships between the descriptions, the result must be a list of all the JOINable pairs found. If no joinable pair is found the result
            should be just the word "none".

            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
            based on related  columns between them.
            
            Instructions: 
                1. Look at the target description given to you. 
                2. Look at the candidate descriptions in detail. 
                3. Predict all the possible JOIN between those descriptions. 
            """
    
    user_msg = f"""Target description:      {target_description}
                   Candidate table:         {candidate_tables}
                   Candidate descriptions:  {candidate_descriptions}
                   Possible JOINs: """.strip()
    
    return system_msg, user_msg

In [4]:
def generate_predictions(target_descriptions, candidate_tables, candidate_descriptions, client):
    
    system_msg_predict_joins, user_msg_predict_joins = generate_prompt_predict_possible_joins(target_descriptions, candidate_tables, candidate_descriptions)
    result = execute_prompt(client, system_msg_predict_joins, user_msg_predict_joins)
    joins = result.choices[0].message.content.split('Possible JOINs: ')[-1].strip()
    
    return joins

In [5]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg):
    completion = client.chat.completions.create(
                                            model="gpt-4o",
                                            messages=[
                                                        {
                                                            "role": "system", 
                                                             "content": f"{system_msg}"
                                                        },
                                                        {
                                                            "role": "user",
                                                            "content": f"{user_msg}"
                                                        }
                                                    ]
                                            )
    return completion

In [6]:
client = OpenAI()

In [7]:
path = '/home/manoelflorencio/cta_for_jd/testbedXS'
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [8]:
descriptions = pd.read_csv('Description_test/all_descriptions.csv')
descriptions.head()

Unnamed: 0,TableName,Column,Description
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...
1,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...
2,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres..."
3,statewise-census-data-in-india-1901-2011.csv,LOCATION_F5,"The column ""LOCATION_F5"" in the table appears ..."
4,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,"The ""LOCATION_ID"" column contains identifiers ..."


In [9]:
files = ['eo_pr.csv', 'cultural-spaces.csv', 'public-art.csv', 'libraries.csv', 'schools.csv']
# files = ['cultural-spaces.csv', 'schools.csv']

In [10]:
sample_descriptions = descriptions[descriptions['TableName'].isin(files)]
sample_descriptions = sample_descriptions.reset_index(drop=True)
sample_descriptions.head()

Unnamed: 0,TableName,Column,Description
0,public-art.csv,RegistryID,"The target column, ""RegistryID"", contains a un..."
1,public-art.csv,ArtistProjectStatement,"The ""ArtistProjectStatement"" column in the tab..."
2,public-art.csv,Type,"The ""Type"" column in the table represents the ..."
3,public-art.csv,Status,The 'Status' column in this table indicates th...
4,public-art.csv,SiteName,"The ""SiteName"" column in the table provides th..."


In [11]:
sample_descriptions['TableName'].value_counts()

TableName
eo_pr.csv              28
public-art.csv         19
cultural-spaces.csv    12
libraries.csv           5
schools.csv             5
Name: count, dtype: int64

In [12]:
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')

In [19]:
joins_dict = {}

for file in files:
    info  = df_dsInformation[df_dsInformation['filename'] == file]                
    table = pd.read_csv(f'datasets/{file}', delimiter=info['delimiter'].values[0])

    table_descriptions = sample_descriptions[sample_descriptions['TableName'] == file]
    candidate_tables       = sample_descriptions[sample_descriptions['TableName'] != file].iloc[:,0]
    candidate_descriptions = sample_descriptions[sample_descriptions['TableName'] != file].iloc[:,2]

    joins_dict[f'{file}'] = {}
    
    for i in tqdm(range(table_descriptions.shape[0])):
        target_description = table_descriptions.iloc[i, 2]
        joins = generate_predictions(target_description, candidate_tables, candidate_descriptions, client)
        joins_dict[f'{file}'][f'{table_descriptions.iloc[i, 1]}'] = joins

100%|███████████████████████████████████████████████████████████████████████████████████| 12/12 [00:06<00:00,  1.77it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 19/19 [00:17<00:00,  1.06it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  2.51it/s]


In [20]:
joins_dict

{'eo_pr.csv': {'EIN': 'none',
  'NAME': '- ("schools.csv", "SCHOOL_NAME")\n- ("libraries.csv", "NAME")\n- ("cultural-spaces.csv", "CULTURAL_SPACE_NAME")',
  'ICO': '1. ("libraries.csv", "NAME")',
  'STREET': '1. ("schools.csv", "ADDRESS")\n2. ("libraries.csv", "ADDRESS")\n3. ("cultural-spaces.csv", "ADDRESS")',
  'CITY': '1. ("libraries.csv", "ADDRESS")\n2. ("cultural-spaces.csv", "ADDRESS")',
  'STATE': 'none',
  'ZIP': '- ("libraries.csv", "ADDRESS")',
  'GROUP': 'none',
  'SUBSECTION': '- ("cultural-spaces.csv", "TYPE")\n- ("cultural-spaces.csv", "PRIMARY_USE")',
  'AFFILIATION': 'none',
  'CLASSIFICATION': '- ("schools.csv", "SCHOOL_CATEGORY")',
  'RULING': '- ("cultural-spaces.csv", "YEAR")',
  'DEDUCTIBILITY': 'none',
  'FOUNDATION': '- ("schools.csv", "SCHOOL_CATEGORY")',
  'ACTIVITY': 'none',
  'ORGANIZATION': '- ("cultural-spaces.csv", "TYPE")\n- ("cultural-spaces.csv", "PRIMARY_USE")\n- ("schools.csv", "SCHOOL_CATEGORY")',
  'STATUS': '- ("public-art.csv", "Status")',
  'TAX_

In [17]:
groundTruth = pd.read_csv('joinable_columns_90containment.csv')
groundTruth

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
0,eo_pr.csv,NAME,eo4.csv,NAME
1,eo_pr.csv,ICO,eo4.csv,ICO
2,eo_pr.csv,STREET,eo4.csv,STREET
3,eo_pr.csv,CITY,eo4.csv,CITY
4,eo_pr.csv,STATE,eo4.csv,STATE
...,...,...,...,...
65,eo4.csv,STATE,eo_xx.csv,STATE
66,eo_xx.csv,ZIP,eo4.csv,ZIP
67,eo_xx.csv,NTEE_CD,eo4.csv,NTEE_CD
68,eo_xx.csv,SORT_NAME,eo4.csv,SORT_NAME


In [21]:
groundTruth[groundTruth['ds_name'].isin(files) & groundTruth['ds_name_2'].isin(files)]

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
11,cultural-spaces.csv,LOCAL_AREA,public-art.csv,Neighbourhood
12,public-art.csv,GeoLocalArea,cultural-spaces.csv,LOCAL_AREA
13,cultural-spaces.csv,LOCAL_AREA,schools.csv,Geo Local Area
14,schools.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA
19,libraries.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA
41,schools.csv,Geo Local Area,public-art.csv,Neighbourhood
42,public-art.csv,GeoLocalArea,schools.csv,Geo Local Area
43,schools.csv,Geo Local Area,public-art.csv,GeoLocalArea
47,libraries.csv,Geo Local Area,public-art.csv,Neighbourhood
54,libraries.csv,Geo Local Area,schools.csv,Geo Local Area
