In [13]:
import re

import numpy as np
import pandas as pd
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

In [19]:
def generate_prompt_column_description(table, column, n_rows=30):
    """Build the chat prompt asking the model to describe one column of a table.

    Parameters
    ----------
    table : pandas.DataFrame
        Table containing the target column. Its column index and its first
        ``n_rows`` rows are embedded verbatim in the user message.
    column : str
        Name of the target column; interpolated as-is into the prompt.
    n_rows : int, optional
        Number of leading rows shown to the model. Defaults to 30, the value
        that used to be hard-coded.

    Returns
    -------
    tuple of (str, str)
        ``(system_msg, user_msg)``. The user message is stripped at both ends
        and therefore ends with ``"Description:"``.
    """
    # Plain literal: nothing is interpolated into the system message.
    system_msg = """
            Describe the semantics of a target column.
            Task: Describe the informations within a column in a given table.
            Instructions: 1. Look at the input given to you. 2. Look at the column values in detail. 3. Describe the target column. 
            """

    # NOTE(review): the sample is rendered with numpy's default array formatting,
    # which may abbreviate large arrays with "..." - confirm wide tables are not
    # truncated in the prompt.
    sample_values = table.iloc[:n_rows, :].values

    user_msg = f"""Table columns: {table.columns}
                   Table values:  {sample_values}
                   Target column: {column}
                   Description: """.strip()

    return system_msg, user_msg

In [73]:
def generate_prompt_description_similarity(c1, c2):
    """Build the chat prompt that asks the model to score how joinable two columns are.

    Parameters
    ----------
    c1, c2 : str
        Semantic descriptions of the first and second column; both are
        interpolated as-is into the user message.

    Returns
    -------
    tuple of (str, str)
        ``(system message, user message)``. The system message holds the
        scoring instructions (0 to 10) and one worked example; the user message
        is stripped at both ends and therefore ends with ``"Likelihood Score:"``.
    """
    # Fixed instructions; no values are interpolated here.
    instructions = """
            Given the semantic description of two columns in a dataset, determine whether the columns are joinable based on their semantics 
            and provide a likelihood score between 0 (not joinable) and 10 (highly joinable). Asnwer only with the likelihood score.
            Example 1:
                Semantic description of column 1: the column represents an ID.
                Semantic description of column 2: the column represents an ID.
                Likelihood Score: 10
            """

    # Mirrors the layout of the worked example so the model completes the score.
    question = f"""Semantic description of column 1: {c1}
                   Semantic description of column 2: {c2}
                   Likelihood Score: """
    question = question.strip()

    return instructions, question

In [3]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg, model="gpt-3.5-turbo-0125"):
    """Send one system + user message pair to the chat completions endpoint.

    The call is wrapped by tenacity's ``retry`` with randomized exponential
    waits (``min=1``, ``max=60``) and stops after 6 attempts.

    Parameters
    ----------
    client
        Object exposing ``chat.completions.create`` (an ``OpenAI`` client in
        this notebook).
    system_msg, user_msg
        Message contents; each is formatted to a string before being sent.
    model : str, optional
        Model identifier passed to the API. Defaults to the value that used
        to be hard-coded, so existing three-argument calls are unaffected.

    Returns
    -------
    The completion object returned by ``client.chat.completions.create``.
    """
    messages = [
        {"role": "system", "content": f"{system_msg}"},
        {"role": "user", "content": f"{user_msg}"},
    ]
    completion = client.chat.completions.create(model=model, messages=messages)
    return completion

In [20]:
def generate_predictions(dataframe, column, client):
    """Ask the model to describe ``column`` of ``dataframe`` and return the description text.

    Builds the prompt with ``generate_prompt_column_description``, sends it
    through ``execute_prompt`` and reads the first choice of the reply. If the
    reply repeats the ``'Description: '`` label, only the text after its last
    occurrence is kept. The result is stripped of surrounding whitespace.
    """
    prompts = generate_prompt_column_description(dataframe, column)
    response = execute_prompt(client, *prompts)
    answer = response.choices[0].message.content

    # rpartition yields the whole answer as the tail when the label is absent.
    _, _, description = answer.rpartition('Description: ')
    return description.strip()

In [74]:
def generate_likelihood(c1, c2, client):
    """Ask the model how joinable two described columns are and return its score.

    Parameters
    ----------
    c1, c2 : str
        Semantic descriptions of the two columns.
    client
        Chat client forwarded to ``execute_prompt``.

    Returns
    -------
    str
        The score as text. When the reply (after the last ``'Likelihood Score: '``
        label, if present) starts with a number, only that number is returned,
        so answers such as ``'8.'`` or ``'7/10'`` become ``'8'`` and ``'7'``
        and can be passed to ``float()``. Otherwise the stripped reply is
        returned unchanged.
    """
    system_msg_predict_likelihood, user_msg_predict_likelihood = generate_prompt_description_similarity(c1, c2)
    result = execute_prompt(client, system_msg_predict_likelihood, user_msg_predict_likelihood)
    answer = result.choices[0].message.content.split('Likelihood Score: ')[-1].strip()

    # Only a number at the very start is trusted; digits appearing later in a
    # free-text reply (e.g. "column 1 ...") must not be mistaken for the score.
    leading_number = re.match(r'\d+(?:\.\d+)?', answer)
    likelihood = leading_number.group(0) if leading_number else answer

    return likelihood

In [5]:
# Joinable column pairs; collect the sorted, de-duplicated dataset file names
# that appear on either side of a pair.
df_joinable_columns = pd.read_csv('joinable_columns_90containment.csv')
all_joinable_files = np.unique(
    df_joinable_columns[['ds_name', 'ds_name_2']].to_numpy().ravel()
)
all_joinable_files

array(['community-centres.csv', 'community-gardens-and-food-trees.csv',
       'cultural-spaces.csv', 'eo4.csv', 'eo_pr.csv', 'eo_xx.csv',
       'libraries.csv', 'population-census-of-botswana-2011.csv',
       'public-art-artists.csv', 'public-art.csv',
       'rental-standards-current-issues.csv', 'schools.csv',
       'statewise-census-data-in-india-1901-2011.csv',
       'street-intersections.csv'], dtype=object)

In [6]:
# Per-dataset metadata; its 'filename' and 'delimiter' columns are used further
# down to parse each dataset file.
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')
df_dsInformation.head()

Unnamed: 0,filename,delimiter,multiline,file,nullVal,file_size,ignoreTrailing,source
0,survey_results_schema.csv,",",True,csv,"""""",23kb,True,https://www.kaggle.com/stackoverflow/so-survey...
1,datasets_517172_952401_train.csv,",",False,csv,"""""",55kb,True,https://www.kaggle.com/sovitrath/diabetic-reti...
2,ability_ids.csv,",",False,csv,"""""",19kb,True,https://www.kaggle.com/devinanzelmo/dota-2-mat...
3,public-art.csv,;,True,csv,"""""",669kb,True,https://opendata.vancouver.ca/explore/dataset/...
4,public-art-artists.csv,;,True,csv,"""""",243kb,True,https://opendata.vancouver.ca/explore/dataset/...


In [7]:
# Predicted joinable pairs: (ds_name, att_name) on one side and
# (ds_name_2, att_name_2) on the other.
predicted_joinable_columns = pd.read_csv('predicted_joinable_columns.csv')
predicted_joinable_columns.head()

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
0,community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area
1,community-centres.csv,URLLINK,cultural-spaces.csv,WEBSITE
2,community-centres.csv,URLLINK,cultural-spaces.csv,WEBSITE
3,community-centres.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA
4,community-centres.csv,Geo Local Area,eo_pr.csv,CITY


In [8]:
# Each stored entry has the form 'TABLE#COLUMN'. Split on the first '#' only,
# so a column name that itself contains '#' still yields exactly two fields
# instead of breaking the two-column constructor.
unique_joinable_columns = pd.DataFrame(
    [unique_info.split('#', 1) for unique_info in np.load('unique_joinable_columns.npy')],
    columns=['TABLE', 'COLUMN'],
)
unique_joinable_columns.head()

Unnamed: 0,TABLE,COLUMN
0,community-centres.csv,Geo Local Area
1,community-gardens-and-food-trees.csv,FOOD_TREE_VARIETIES
2,community-gardens-and-food-trees.csv,Geo Local Area
3,community-gardens-and-food-trees.csv,NUMBER_OF_FOOD_TREES
4,cultural-spaces.csv,ACTIVE_SPACE


In [11]:
# No credentials are passed explicitly; presumably the client picks them up
# from the environment - confirm before running on a new machine.
client = OpenAI()

In [23]:
# Generate one description per (TABLE, COLUMN) row, in row order.
descriptions = []
loaded_tables = {}  # filename -> parsed DataFrame, so each dataset file is read only once
for table_name, column_name in zip(unique_joinable_columns['TABLE'], unique_joinable_columns['COLUMN']):
    if table_name not in loaded_tables:
        info = df_dsInformation[df_dsInformation['filename'] == table_name]
        if info.empty:
            # Fail with the offending file name instead of an opaque IndexError.
            raise KeyError(f"'{table_name}' has no entry in df_dsInformation; cannot determine its delimiter")
        loaded_tables[table_name] = pd.read_csv(f'datasets/{table_name}', delimiter=info['delimiter'].values[0])
    df = loaded_tables[table_name]
    description = generate_predictions(df, column_name, client)
    descriptions.append(description)

In [24]:
# Plain assignment instead of insert(..., allow_duplicates=True): re-running this
# cell overwrites DESCRIPTION rather than silently adding a second column with
# the same name. A new column is still appended after TABLE and COLUMN.
unique_joinable_columns["DESCRIPTION"] = descriptions

In [27]:
# Preview the frame now that it carries the DESCRIPTION column.
unique_joinable_columns.head()

Unnamed: 0,TABLE,COLUMN,DESCRIPTION
0,community-centres.csv,Geo Local Area,"The target column ""Geo Local Area"" contains in..."
1,community-gardens-and-food-trees.csv,FOOD_TREE_VARIETIES,The target column 'FOOD_TREE_VARIETIES' in the...
2,community-gardens-and-food-trees.csv,Geo Local Area,"The target column ""Geo Local Area"" contains in..."
3,community-gardens-and-food-trees.csv,NUMBER_OF_FOOD_TREES,"The target column ""NUMBER_OF_FOOD_TREES"" in th..."
4,cultural-spaces.csv,ACTIVE_SPACE,"The target column ""ACTIVE_SPACE"" in the table ..."


In [28]:
# Persist the generated descriptions (without the index) so they can be reused
# without calling the model again.
unique_joinable_columns.to_csv('described_predicted_joinable_columns.csv', index=False)

In [94]:
# Print every description in full; the table preview truncates the text.
described_columns = zip(
    unique_joinable_columns['TABLE'],
    unique_joinable_columns['COLUMN'],
    unique_joinable_columns['DESCRIPTION'],
)
for table_name, column_name, column_description in described_columns:
    print(table_name, column_name)
    print(column_description)
    print('------------------------')

community-centres.csv Geo Local Area
The target column "Geo Local Area" contains information about the specific geographical local area or neighborhood to which each entry in the table belongs. It provides a categorization of different locations into their respective local areas within Vancouver. The values in this column seem to be descriptive of the neighborhood or area where each community center or park is located, such as "Dunbar-Southlands", "Hastings-Sunrise", "Mount Pleasant", "Kerrisdale", and others. This column helps in identifying and organizing these recreational facilities based on their respective local areas for better understanding and management.
------------------------
community-gardens-and-food-trees.csv FOOD_TREE_VARIETIES
The target column 'FOOD_TREE_VARIETIES' in the table contains information about the varieties of food trees planted in each location or garden. Each value in this column provides details about the specific types of fruit trees or food-producing 

In [34]:
# 'TABLE#COLUMN' lookup key used to join descriptions onto the predicted pairs.
# Assigned by name (not insert(..., allow_duplicates=True)) so that re-running
# the cell overwrites KEY instead of adding a duplicate column.
unique_joinable_columns["KEY"] = unique_joinable_columns.TABLE + "#" + unique_joinable_columns.COLUMN
unique_joinable_columns.head()

Unnamed: 0,TABLE,COLUMN,DESCRIPTION,KEY
0,community-centres.csv,Geo Local Area,"The target column ""Geo Local Area"" contains in...",community-centres.csv#Geo Local Area
1,community-gardens-and-food-trees.csv,FOOD_TREE_VARIETIES,The target column 'FOOD_TREE_VARIETIES' in the...,community-gardens-and-food-trees.csv#FOOD_TREE...
2,community-gardens-and-food-trees.csv,Geo Local Area,"The target column ""Geo Local Area"" contains in...",community-gardens-and-food-trees.csv#Geo Local...
3,community-gardens-and-food-trees.csv,NUMBER_OF_FOOD_TREES,"The target column ""NUMBER_OF_FOOD_TREES"" in th...",community-gardens-and-food-trees.csv#NUMBER_OF...
4,cultural-spaces.csv,ACTIVE_SPACE,"The target column ""ACTIVE_SPACE"" in the table ...",cultural-spaces.csv#ACTIVE_SPACE


In [35]:
# Build the 'dataset#attribute' key for each side of a predicted pair.
# Assigned by name (not insert(..., allow_duplicates=True)) so that re-running
# the cell overwrites the keys instead of adding duplicate columns.
predicted_joinable_columns["LEFT_KEY"] = predicted_joinable_columns.ds_name + "#" + predicted_joinable_columns.att_name
predicted_joinable_columns["RIGHT_KEY"] = predicted_joinable_columns.ds_name_2 + "#" + predicted_joinable_columns.att_name_2
predicted_joinable_columns.head()

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2,LEFT_KEY,RIGHT_KEY
0,community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area,community-centres.csv#Geo Local Area,community-gardens-and-food-trees.csv#Geo Local...
1,community-centres.csv,URLLINK,cultural-spaces.csv,WEBSITE,community-centres.csv#URLLINK,cultural-spaces.csv#WEBSITE
2,community-centres.csv,URLLINK,cultural-spaces.csv,WEBSITE,community-centres.csv#URLLINK,cultural-spaces.csv#WEBSITE
3,community-centres.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA,community-centres.csv#Geo Local Area,cultural-spaces.csv#LOCAL_AREA
4,community-centres.csv,Geo Local Area,eo_pr.csv,CITY,community-centres.csv#Geo Local Area,eo_pr.csv#CITY


In [46]:
# Attach the description of each side of a predicted pair by its key.
# The description columns are renamed by name before merging, rather than by
# overwriting `.columns` positionally afterwards, so an unexpected extra column
# can neither mislabel the result nor trigger a length-mismatch error.
key_descriptions = unique_joinable_columns[['KEY', 'DESCRIPTION']]

# Inner joins: pairs whose left or right column has no description are dropped.
df_left_description = predicted_joinable_columns.merge(
    key_descriptions.rename(columns={'KEY': 'LEFT_KEY', 'DESCRIPTION': 'LEFT_DESCRIPTION'}),
    on='LEFT_KEY',
    how='inner',
)

df_description = df_left_description.merge(
    key_descriptions.rename(columns={'KEY': 'RIGHT_KEY', 'DESCRIPTION': 'RIGHT_DESCRIPTION'}),
    on='RIGHT_KEY',
    how='inner',
)

df_description.head()

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2,LEFT_KEY,RIGHT_KEY,LEFT_DESCRIPTION,RIGHT_DESCRIPTION
0,community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area,community-centres.csv#Geo Local Area,community-gardens-and-food-trees.csv#Geo Local...,"The target column ""Geo Local Area"" contains in...","The target column ""Geo Local Area"" contains in..."
1,community-centres.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA,community-centres.csv#Geo Local Area,cultural-spaces.csv#LOCAL_AREA,"The target column ""Geo Local Area"" contains in...","The target column ""LOCAL_AREA"" provides inform..."
2,community-centres.csv,Geo Local Area,eo_pr.csv,CITY,community-centres.csv#Geo Local Area,eo_pr.csv#CITY,"The target column ""Geo Local Area"" contains in...",The target column is 'CITY'. It contains the n...
3,community-centres.csv,Geo Local Area,libraries.csv,Geo Local Area,community-centres.csv#Geo Local Area,libraries.csv#Geo Local Area,"The target column ""Geo Local Area"" contains in...","The target column ""Geo Local Area"" in the tabl..."
4,community-centres.csv,Geo Local Area,public-art.csv,Neighbourhood,community-centres.csv#Geo Local Area,public-art.csv#Neighbourhood,"The target column ""Geo Local Area"" contains in...","The ""Neighbourhood"" column in the given table ..."


In [59]:
# Number of predicted pairs that have a description on both sides.
len(df_description)

55

In [75]:
# Score every pair of descriptions, in row order. Columns 6 and 7 hold the
# left and right descriptions.
likelihoods = []
total_pairs = df_description.shape[0]
description_pairs = zip(df_description.iloc[:, 6], df_description.iloc[:, 7])
for position, (left_description, right_description) in enumerate(description_pairs, start=1):
    # '\r' keeps the progress message on a single, continuously updated line.
    print(f'Description {position} out of {total_pairs}.', end='\r')

    likelihood = generate_likelihood(left_description, right_description, client)

    likelihoods.append(likelihood)

Description 55 out of 55.

In [76]:
# Raw scores as returned by generate_likelihood (strings, one per pair).
likelihoods

['10',
 '10',
 '8',
 '10',
 '10',
 '10',
 '10',
 '9',
 '7',
 '10',
 '8',
 '10',
 '10',
 '10',
 '10',
 '10',
 '8',
 '8.',
 '10',
 '10',
 '10',
 '10',
 '9',
 '7',
 '10',
 '10',
 '10',
 '10',
 '10',
 '10',
 '10',
 '7',
 '3',
 '8',
 '7',
 '3',
 '3',
 '9',
 '8',
 '10',
 '10',
 '10',
 '10',
 '10',
 '10',
 '10',
 '9',
 '6',
 '8',
 '7',
 '10',
 '6',
 '8',
 '8',
 '2']

In [77]:
# Assigned by name (not insert(..., allow_duplicates=True)) so that re-running
# the cell overwrites LIKELIHOOD instead of adding a duplicate column. A new
# column is still appended as the ninth column.
df_description["LIKELIHOOD"] = likelihoods
df_description.head()

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2,LEFT_KEY,RIGHT_KEY,LEFT_DESCRIPTION,RIGHT_DESCRIPTION,LIKELIHOOD
0,community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area,community-centres.csv#Geo Local Area,community-gardens-and-food-trees.csv#Geo Local...,"The target column ""Geo Local Area"" contains in...","The target column ""Geo Local Area"" contains in...",10
1,community-centres.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA,community-centres.csv#Geo Local Area,cultural-spaces.csv#LOCAL_AREA,"The target column ""Geo Local Area"" contains in...","The target column ""LOCAL_AREA"" provides inform...",10
2,community-centres.csv,Geo Local Area,eo_pr.csv,CITY,community-centres.csv#Geo Local Area,eo_pr.csv#CITY,"The target column ""Geo Local Area"" contains in...",The target column is 'CITY'. It contains the n...,8
3,community-centres.csv,Geo Local Area,libraries.csv,Geo Local Area,community-centres.csv#Geo Local Area,libraries.csv#Geo Local Area,"The target column ""Geo Local Area"" contains in...","The target column ""Geo Local Area"" in the tabl...",10
4,community-centres.csv,Geo Local Area,public-art.csv,Neighbourhood,community-centres.csv#Geo Local Area,public-art.csv#Neighbourhood,"The target column ""Geo Local Area"" contains in...","The ""Neighbourhood"" column in the given table ...",10


In [80]:
# One line per pair: the four identifying fields joined by commas, then the score.
for record in df_description.itertuples(index=False, name=None):
    pair_fields, score = record[:4], record[8]
    print(",".join(pair_fields), score)

community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area 10
community-centres.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA 10
community-centres.csv,Geo Local Area,eo_pr.csv,CITY 8
community-centres.csv,Geo Local Area,libraries.csv,Geo Local Area 10
community-centres.csv,Geo Local Area,public-art.csv,Neighbourhood 10
community-centres.csv,Geo Local Area,public-art.csv,GeoLocalArea 10
community-centres.csv,Geo Local Area,rental-standards-current-issues.csv,Geo Local Area 10
community-centres.csv,Geo Local Area,schools.csv,Geo Local Area 9
community-centres.csv,Geo Local Area,street-intersections.csv,Geo Local Area 7
community-gardens-and-food-trees.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA 10
community-gardens-and-food-trees.csv,Geo Local Area,eo_pr.csv,CITY 8
community-gardens-and-food-trees.csv,Geo Local Area,libraries.csv,Geo Local Area 10
community-gardens-and-food-trees.csv,Geo Local Area,public-art.csv,Neighbourhood 10
community-gardens-

In [91]:
# Same listing, restricted to pairs whose score is at least 8.
for _, pair in df_description.iterrows():
    score = pair.iloc[8]
    if float(score) >= 8:
        print(",".join(pair.iloc[:4].values), score)

community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area 10
community-centres.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA 10
community-centres.csv,Geo Local Area,eo_pr.csv,CITY 8
community-centres.csv,Geo Local Area,libraries.csv,Geo Local Area 10
community-centres.csv,Geo Local Area,public-art.csv,Neighbourhood 10
community-centres.csv,Geo Local Area,public-art.csv,GeoLocalArea 10
community-centres.csv,Geo Local Area,rental-standards-current-issues.csv,Geo Local Area 10
community-centres.csv,Geo Local Area,schools.csv,Geo Local Area 9
community-gardens-and-food-trees.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA 10
community-gardens-and-food-trees.csv,Geo Local Area,eo_pr.csv,CITY 8
community-gardens-and-food-trees.csv,Geo Local Area,libraries.csv,Geo Local Area 10
community-gardens-and-food-trees.csv,Geo Local Area,public-art.csv,Neighbourhood 10
community-gardens-and-food-trees.csv,Geo Local Area,public-art.csv,GeoLocalArea 10
community-gard

In [81]:
# Persist the scored pairs (descriptions and likelihoods) without the index.
df_description.to_csv('likelihood_described_predicted_joinable_columns.csv',index=False)

In [92]:
# Pairs from the joinable-columns file whose left dataset is community-centres.csv.
df_joinable_columns.loc[df_joinable_columns['ds_name'].eq('community-centres.csv')]

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
20,community-centres.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA
31,community-centres.csv,Geo Local Area,street-intersections.csv,Geo Local Area
40,community-centres.csv,Geo Local Area,rental-standards-current-issues.csv,Geo Local Area
48,community-centres.csv,Geo Local Area,public-art.csv,Neighbourhood
49,community-centres.csv,Geo Local Area,public-art.csv,GeoLocalArea
55,community-centres.csv,Geo Local Area,schools.csv,Geo Local Area
57,community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area
58,community-centres.csv,Geo Local Area,libraries.csv,Geo Local Area
