In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
pd.set_option('display.max_rows', None)

In [3]:
path = '/home/manoelflorencio/cta_for_jd/testbedXS'
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [4]:
def generate_prompt_JD_prediction(target_table_description, target_column_descriptions, candidate_table_description, candidate_column_descriptions):

    system_msg = """
            Given a target table and a candidate table, predict if the pair (target table, candidate table) could be joined and list the possible pair
            of columns that could be used in those JOINs.

            Task: Look carefully at the target table description and candidate table description column and use this information to identify 
            patterns and relationships between the descriptions, the result must be a list of all the JOINable pairs found. If no joinable pair is 
            found the result should be just the word "none".

            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
            based on related  columns between them.
            
           Instructions: 
                1. Look at the target table description given to you.
                2. Look at the target column descriptions.
                3. Look at the candidate table description in detail. 
                4. Look at the candidate column descriptions.
                5. Predict all the highly likely JOINs between these columns based only on these descriptions. Disregard the column names.
                6. Return the JOINable pairs in plain text using the JSON schema below. Do not output markdown.

                "target_table_name": {
                    "target_column_name01": {"candidate_table_name01", "candidate_column_name01"},
                    "target_column_name01": {"candidate_table_name01", "candidate_column_name02"},
                    "target_column_name02": {"candidate_table_name02", "candidate_column_name01"}
                }
                
            """
            
    user_msg =  f"""Target Table description: {target_table_description}
                    Target columns descriptions: {target_column_descriptions}
                    Candidate Table description: {candidate_table_description}
                    Candidate columns descriptions: {candidate_column_descriptions}
                    Predicted JOINs:""".strip()
    
    return system_msg, user_msg

In [5]:
def generate_predictions(target_table_description, target_column_descriptions, candidate_table_description, candidate_column_descriptions, client):
    
    system_msg_jd_prediction, user_msg_jd_prediction = generate_prompt_JD_prediction(target_table_description, target_column_descriptions, candidate_table_description, candidate_column_descriptions)
    result = execute_prompt(client, system_msg_jd_prediction, user_msg_jd_prediction)
    jd_prediction = result.choices[0].message.content.split(' Predicted JOINs: ')[-1].strip()
    
    return jd_prediction

In [6]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg):
    completion = client.chat.completions.create(
                                            model="gpt-4o",
                                            messages=[
                                                        {
                                                            "role": "system", 
                                                             "content": f"{system_msg}"
                                                        },
                                                        {
                                                            "role": "user",
                                                            "content": f"{user_msg}"
                                                        }
                                                    ]
                                            )
    return completion

In [7]:
client = OpenAI()

In [8]:
# filenames = glob.glob('datasets/*')
filenames = ['public-art.csv','schools.csv','cultural-spaces.csv', 'ability_ids.csv', 'eo4.csv']
filenames

['public-art.csv',
 'schools.csv',
 'cultural-spaces.csv',
 'ability_ids.csv',
 'eo4.csv']

In [9]:
column_descriptions = pd.read_csv('Description_test/all_descriptions.csv')
column_descriptions.head()

Unnamed: 0,TableName,Column,Description
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...
1,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...
2,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres..."
3,statewise-census-data-in-india-1901-2011.csv,LOCATION_F5,"The column ""LOCATION_F5"" in the table appears ..."
4,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,"The ""LOCATION_ID"" column contains identifiers ..."


In [10]:
sample_descriptions = column_descriptions[column_descriptions['TableName'].isin(filenames)]
sample_descriptions = sample_descriptions.reset_index(drop=True)
sample_descriptions

Unnamed: 0,TableName,Column,Description
0,public-art.csv,RegistryID,"The target column, ""RegistryID"", contains a un..."
1,public-art.csv,ArtistProjectStatement,"The ""ArtistProjectStatement"" column in the tab..."
2,public-art.csv,Type,"The ""Type"" column in the table represents the ..."
3,public-art.csv,Status,The 'Status' column in this table indicates th...
4,public-art.csv,SiteName,"The ""SiteName"" column in the table provides th..."
5,public-art.csv,SiteAddress,"The ""SiteAddress"" column in the table displays..."
6,public-art.csv,PrimaryMaterial,"The ""PrimaryMaterial"" column in the table prov..."
7,public-art.csv,URL,"The ""URL"" column in the given table contains h..."
8,public-art.csv,PhotoURL,"The ""PhotoURL"" column contains web links, spec..."
9,public-art.csv,Ownership,"The ""Ownership"" column in the table provides i..."


In [11]:
table_descriptions = pd.read_csv('Description_test/table_descriptions_test/table_descriptions.csv')
table_descriptions

Unnamed: 0,TableName,Description
0,statewise-census-data-in-india-1901-2011.csv,"The table ""statewise-census-data-in-india-1901..."
1,road-ahead-current-road-closures.csv,"The table ""road-ahead-current-road-closures.cs..."
2,property-tie-lines.csv,"The table ""property-tie-lines.csv"" contains in..."
3,public-art.csv,The public-art.csv table provides detailed inf...
4,gvrd-sewer-trunk-mains.csv,"The ""gvrd-sewer-trunk-mains.csv"" table provide..."
5,SCS_Staff_Salaries_data_30th_June 2010.csv,"The table ""SCS_Staff_Salaries_data_30th_June 2..."
6,schools.csv,"The table ""schools.csv"" provides detailed info..."
7,rental-standards-current-issues.csv,"The ""rental-standards-current-issues.csv"" tabl..."
8,datasets_579296_1047868_authors.csv,"The table ""datasets_579296_1047868_authors.csv..."
9,survey_results_schema.csv,"The table ""survey_results_schema.csv"" contains..."


In [12]:
sample_table_descriptions = table_descriptions[table_descriptions['TableName'].isin(filenames)]
sample_table_descriptions = sample_table_descriptions.reset_index(drop=True)
sample_table_descriptions

Unnamed: 0,TableName,Description
0,public-art.csv,The public-art.csv table provides detailed inf...
1,schools.csv,"The table ""schools.csv"" provides detailed info..."
2,cultural-spaces.csv,"The table ""cultural-spaces.csv"" contains data ..."
3,eo4.csv,"The table named ""eo4.csv"" contains detailed in..."
4,ability_ids.csv,"The table ""ability_ids.csv"" contains a list of..."


In [13]:
joins_dict = {}

for i in tqdm(range(len(filenames)-1)):
    joins_dict[filenames[i]] = []
    target_table_description = sample_table_descriptions[sample_table_descriptions['TableName'] == filenames[i]].iloc[0,1]
    target_column_descriptions = sample_descriptions[sample_descriptions['TableName'] == filenames[i]].Description.values
    for j in range(i+1, len(filenames)):
        candidate_table_description = sample_table_descriptions[sample_table_descriptions['TableName'] == filenames[j]].iloc[0,1]
        candidate_column_descriptions = sample_descriptions[sample_descriptions['TableName'] == filenames[j]].Description.values

        # print(target_table_description + '\n######################\n' +
        #       target_column_descriptions[0] + '\n######################\n' + 
        #       candidate_table_description + '\n######################\n' + 
        #       candidate_column_descriptions[0])
        
        result = generate_predictions(target_table_description, target_column_descriptions, candidate_table_description, candidate_column_descriptions, client)
   
        joins_dict[filenames[i]].append(result)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:14<00:00,  3.50s/it]


In [14]:
joins_dict

{'public-art.csv': ['```json\n{\n    "public-art": {\n        "SiteAddress": {"schools", "ADDRESS"},\n        "GeoLocalArea": {"schools", "Geo Local Area"},\n        "Geom": {"schools", "Geom"}\n    }\n}\n```',
  '```json\n{\n    "public-art.csv": {\n        "SiteAddress": {"cultural-spaces.csv", "ADDRESS"},\n        "Neighbourhood": {"cultural-spaces.csv", "LOCAL_AREA"},\n        "GeoLocalArea": {"cultural-spaces.csv", "LOCAL_AREA"},\n        "Ownership": {"cultural-spaces.csv", "OWNERSHIP"},\n        "Geom": {"cultural-spaces.csv", "Geom"}\n    }\n}\n```',
  'none',
  '```json\n{\n    "public-art.csv": {\n        "ZIP": {"eo4.csv", "ZIP"}\n    }\n}\n```'],
 'schools.csv': ['{\n    "schools.csv": {\n        "ADDRESS": {"cultural-spaces.csv", "ADDRESS"},\n        "Geo Local Area": {"cultural-spaces.csv", "LOCAL_AREA"},\n        "Geom": {"cultural-spaces.csv", "Geom"}\n    }\n}',
  'none',
  '{\n    "schools.csv": {\n        "ADDRESS": {"eo4.csv", "STREET"}\n    }\n}'],
 'cultural-space

In [15]:
with open('Description_test/table_descriptions_test/join_test02.json', 'w') as f:
    json.dump(joins_dict, f)