In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle

from tqdm import tqdm

In [2]:
path = '/home/manoelflorencio/cta_for_jd/testbedXS'
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [3]:
def generate_prompt_predict_possible_joins(target_description, candidate_descriptions, candidate_tables):

    system_msg = f"""
            Given one target column description and many candidate column descriptions, predict all the pairs (candidate table name, candidate 
            description column name) that could be joined.

            Task: Look carefully at the target column description and candidate column descriptions and use this information to identify 
            patterns and relationships between the descriptions, the result must be a list of all the JOINable pairs found. If no joinable pair is 
            found the result should be just the word "none".

            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
            based on related columns between them.
            
            Instructions: 
                1. Look at the target description given to you. 
                2. Look at the candidate descriptions in detail. 
                3. Predict if the target column description belongs to a column that may be used in join. 
                4. Select all the highly likely JOINs between these columns based only on these descriptions. Disregard the column names.

            Example 1:
                Target description: this column represents a worker's id
                Candidate tables: ['salary.csv','salary.csv','hospital.csv']
                Candidate description: ['the column worker_id represents the worker's id', 'this column represents a worker's salary', 'this column represents a hospital location']
                Possible JOINs: ('salary.csv', 'worker_id')
            """
    
    user_msg = f"""Target description:      {target_description}
                   Candidate table:         {candidate_tables.values}
                   Candidate descriptions:  {candidate_descriptions.values}
                   Possible JOINs: """.strip()
    
    return system_msg, user_msg

In [4]:
descriptions = pd.read_csv('Description_test/all_descriptions.csv')
descriptions.head()

Unnamed: 0,TableName,Column,Description
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...
1,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...
2,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres..."
3,statewise-census-data-in-india-1901-2011.csv,LOCATION_F5,"The column ""LOCATION_F5"" in the table appears ..."
4,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,"The ""LOCATION_ID"" column contains identifiers ..."


In [13]:
# files = ['eo_pr.csv', 'cultural-spaces.csv', 'public-art.csv', 'libraries.csv', 'schools.csv']
files = [file.split('/')[-1] for file in glob.glob('datasets/*')]
files

['statewise-census-data-in-india-1901-2011.csv',
 'road-ahead-current-road-closures.csv',
 'property-tie-lines.csv',
 'public-art.csv',
 'gvrd-sewer-trunk-mains.csv',
 'SCS_Staff_Salaries_data_30th_June 2010.csv',
 'schools.csv',
 'rental-standards-current-issues.csv',
 'datasets_579296_1047868_authors.csv',
 'survey_results_schema.csv',
 'animal-control-inventory-lost-and-found.csv',
 'glassdoor_wwfu_val_captions.csv',
 'eo_xx.csv',
 'community-gardens-and-food-trees.csv',
 'road-ahead-upcoming-projects.csv',
 'libraries.csv',
 'cultural-spaces.csv',
 'datasets_517172_952401_train.csv',
 'public-art-artists.csv',
 'eo4.csv',
 'currency_exchange.csv',
 'eo_pr.csv',
 'road-ahead-projects-under-construction.csv',
 'ability_ids.csv',
 'population-by-governorate-citizenship-and-gender.csv',
 'community-centres.csv',
 'street-intersections.csv',
 'population-census-of-botswana-2011.csv']

In [6]:
sample_descriptions = descriptions[descriptions['TableName'].isin(files)]
sample_descriptions = sample_descriptions.reset_index(drop=True)
sample_descriptions.head()

Unnamed: 0,TableName,Column,Description
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...
1,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...
2,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres..."
3,statewise-census-data-in-india-1901-2011.csv,LOCATION_F5,"The column ""LOCATION_F5"" in the table appears ..."
4,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,"The ""LOCATION_ID"" column contains identifiers ..."


In [7]:
sample_descriptions['TableName'].value_counts()

TableName
eo4.csv                                                 28
eo_pr.csv                                               28
eo_xx.csv                                               28
public-art.csv                                          19
community-gardens-and-food-trees.csv                    19
cultural-spaces.csv                                     12
SCS_Staff_Salaries_data_30th_June 2010.csv              10
public-art-artists.csv                                   9
statewise-census-data-in-india-1901-2011.csv             9
population-census-of-botswana-2011.csv                   8
rental-standards-current-issues.csv                      8
gvrd-sewer-trunk-mains.csv                               7
animal-control-inventory-lost-and-found.csv              7
datasets_579296_1047868_authors.csv                      6
road-ahead-projects-under-construction.csv               6
road-ahead-upcoming-projects.csv                         6
road-ahead-current-road-closures.csv          

In [8]:
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')

In [9]:
prompt_path = 'Description_test/Prompts/JoinDiscovery/'

In [20]:
table_descriptions.columns[0]

'TableName'

In [21]:
table_descriptions

Unnamed: 0,TableName,Column,Description
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...
1,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...
2,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres..."
3,statewise-census-data-in-india-1901-2011.csv,LOCATION_F5,"The column ""LOCATION_F5"" in the table appears ..."
4,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,"The ""LOCATION_ID"" column contains identifiers ..."
5,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,"The ""LOCATION_REGIONID"" column in the table co..."
6,statewise-census-data-in-india-1901-2011.csv,VARIABLE_NAME,"The ""VARIABLE_NAME"" column provides a descript..."
7,statewise-census-data-in-india-1901-2011.csv,VALUE,"The target column, ""VALUE,"" represents numeric..."
8,statewise-census-data-in-india-1901-2011.csv,VARIABLE_ID,"The ""VARIABLE_ID"" column in the table represen..."


In [26]:
for file in files:
    info  = df_dsInformation[df_dsInformation['filename'] == file]                
    table = pd.read_csv(f'datasets/{file}', delimiter=info['delimiter'].values[0])

    table_descriptions = sample_descriptions[sample_descriptions['TableName'] == file]
    candidate_tables       = sample_descriptions[sample_descriptions['TableName'] != file].iloc[:,0]
    candidate_descriptions = sample_descriptions[sample_descriptions['TableName'] != file].iloc[:,2]

    for i in tqdm(range(table_descriptions.shape[0])):
        target_description = table_descriptions.iloc[i, 2]
        system_msg, user_msg = generate_prompt_predict_possible_joins(target_description, candidate_descriptions, candidate_tables)
        prompt = system_msg + '\n' + user_msg

        with open(f'{prompt_path}{file[:-4]}_{table_descriptions.iloc[i,1]}.txt', 'w') as f:
            f.write(prompt)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 232.07it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 111.76it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 243.75it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [00:00<00:00, 308.52it/s]
100%|███████████████████████████████████████████