In [1]:
import pandas as pd
import numpy as np
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

In [36]:
def generate_prompt_joins_from_description(table1, table2):
    """Build the prompt pair that asks the model for a join key between two tables.

    Args:
        table1: Column descriptions of the first table; rendered into the user
            message with default f-string formatting.
        table2: Column descriptions of the second table; rendered the same way.

    Returns:
        A ``(system_msg, user_msg)`` tuple of strings.
    """
    # Only the user message is stripped; the system message keeps its
    # surrounding newlines and indentation exactly as written.
    user_msg = f"""Table 1 columns' descriptions: {table1}
                   Table 2 columns' descriptions: {table2}
                   Potential keys: """.strip()

    # The system message has no placeholders, so a plain literal is sufficient.
    system_msg = """
            Based on the semantic descriptions of the columns in Table 1 and Table 2, identify the best key that would allow a join between these tables. 
            """

    return system_msg, user_msg

In [2]:
def generate_prompt_column_description(table, column):
    """Build the prompt pair asking the model to describe one column of a table.

    Args:
        table: Table providing context; ``.columns`` and ``.iloc`` are used on
            it, so a pandas DataFrame is expected.
        column: Identifies the target column; rendered into the user message
            with default f-string formatting.

    Returns:
        A ``(system_msg, user_msg)`` tuple of strings.
    """
    header = table.columns
    # Only the first 30 rows are shown to the model.
    sample_values = table.iloc[:30, :].values

    # The system message has no placeholders, so a plain literal is sufficient.
    system_msg = """
            Describe the semantics of a target column.
            Task: Describe the informations within a column in a given table.
            Instructions: 1. Look at the input given to you. 2. Look at the column values in detail. 3. Describe the target column. 
            """

    # Stripping removes the trailing space after the final "Description: " cue.
    user_msg = f"""Table columns: {header}
                   Table values:  {sample_values}
                   Target column: {column}
                   Description: """.strip()

    return system_msg, user_msg

In [3]:
def generate_prompt_description_similarity(c1, c2):
    """Build the prompt pair asking the model to score how joinable two columns are.

    Args:
        c1: Semantic description of the first column.
        c2: Semantic description of the second column.

    Returns:
        A ``(system_msg, user_msg)`` tuple of strings. The system message asks
        for a score between 0 and 10 and includes one worked example.
    """
    # Stripping removes the trailing space after the final "Likelihood Score: " cue.
    user_msg = f"""Semantic description of column 1: {c1}
                   Semantic description of column 2: {c2}
                   Likelihood Score: """.strip()

    # The system message has no placeholders, so a plain literal is sufficient.
    system_msg = """
            Given the semantic description of two columns in a dataset, determine whether the columns are joinable based on their semantics 
            and provide a likelihood score between 0 (not joinable) and 10 (highly joinable). Asnwer only with the likelihood score.
            Example 1:
                Semantic description of column 1: the column represents an ID.
                Semantic description of column 2: the column represents an ID.
                Likelihood Score: 10
            """

    return system_msg, user_msg

In [4]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg, model="gpt-3.5-turbo-0125"):
    """Send one system/user message pair to the chat-completions endpoint.

    The tenacity decorator re-invokes this function when it raises, waiting a
    randomized exponential interval (configured with min=1, max=60) between
    attempts and giving up after 6 attempts.

    Args:
        client: Object exposing ``chat.completions.create`` (an ``OpenAI``
            client in this notebook).
        system_msg: Content of the system message.
        user_msg: Content of the user message.
        model: Model identifier forwarded to the API. Defaults to the value
            that was previously hard-coded, so existing callers are unaffected.

    Returns:
        The completion object returned by ``client.chat.completions.create``.
    """
    messages = [
        {"role": "system", "content": f"{system_msg}"},
        {"role": "user", "content": f"{user_msg}"},
    ]
    return client.chat.completions.create(model=model, messages=messages)

In [5]:
def generate_predictions(dataframe, column, client):
    """Ask the model for a natural-language description of one column.

    Args:
        dataframe: Table forwarded to ``generate_prompt_column_description``.
        column: Target column forwarded to ``generate_prompt_column_description``.
        client: Chat client forwarded to ``execute_prompt``.

    Returns:
        The text of the first choice, reduced to whatever follows the last
        ``'Description: '`` marker (the whole reply if the marker is absent),
        with surrounding whitespace removed.
    """
    system_msg, user_msg = generate_prompt_column_description(dataframe, column)
    response = execute_prompt(client, system_msg, user_msg)
    reply = response.choices[0].message.content
    # rpartition yields the text after the last marker, or the full reply when
    # the marker does not occur.
    _, _, description = reply.rpartition('Description: ')
    return description.strip()

In [6]:
def generate_likelihood(c1, c2, client):
    """Ask the model how likely two described columns are to be joinable.

    Args:
        c1: Semantic description of the first column.
        c2: Semantic description of the second column.
        client: Chat client forwarded to ``execute_prompt``.

    Returns:
        The text of the first choice, reduced to whatever follows the last
        ``'Likelihood Score: '`` marker (the whole reply if the marker is
        absent), stripped. The value is returned as a string, not a number.
    """
    system_msg, user_msg = generate_prompt_description_similarity(c1, c2)
    response = execute_prompt(client, system_msg, user_msg)
    reply = response.choices[0].message.content
    # rpartition yields the text after the last marker, or the full reply when
    # the marker does not occur.
    _, _, likelihood = reply.rpartition('Likelihood Score: ')
    return likelihood.strip()

In [7]:
# Table of joinable column pairs; each row names two files ('ds_name', 'ds_name_2').
df_joinable_columns = pd.read_csv('joinable_columns_90containment.csv')

# Sorted, de-duplicated file names that appear on either side of a pair.
all_joinable_files = np.unique(
    np.concatenate(
        (df_joinable_columns['ds_name'], df_joinable_columns['ds_name_2']),
        axis=0,
    )
)
all_joinable_files

array(['community-centres.csv', 'community-gardens-and-food-trees.csv',
       'cultural-spaces.csv', 'eo4.csv', 'eo_pr.csv', 'eo_xx.csv',
       'libraries.csv', 'population-census-of-botswana-2011.csv',
       'public-art-artists.csv', 'public-art.csv',
       'rental-standards-current-issues.csv', 'schools.csv',
       'statewise-census-data-in-india-1901-2011.csv',
       'street-intersections.csv'], dtype=object)

In [8]:
# Per-file metadata table; its 'filename' and 'delimiter' columns are used by
# later cells to read each dataset with the right separator.
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')
df_dsInformation.head()

Unnamed: 0,filename,delimiter,multiline,file,nullVal,file_size,ignoreTrailing,source
0,survey_results_schema.csv,",",True,csv,"""""",23kb,True,https://www.kaggle.com/stackoverflow/so-survey...
1,datasets_517172_952401_train.csv,",",False,csv,"""""",55kb,True,https://www.kaggle.com/sovitrath/diabetic-reti...
2,ability_ids.csv,",",False,csv,"""""",19kb,True,https://www.kaggle.com/devinanzelmo/dota-2-mat...
3,public-art.csv,;,True,csv,"""""",669kb,True,https://opendata.vancouver.ca/explore/dataset/...
4,public-art-artists.csv,;,True,csv,"""""",243kb,True,https://opendata.vancouver.ca/explore/dataset/...


In [9]:
# Find this file's metadata row, then read the CSV with its declared delimiter.
is_community_centres = df_dsInformation['filename'] == 'community-centres.csv'
info = df_dsInformation[is_community_centres]
df_community_centres = pd.read_csv(
    'datasets/community-centres.csv',
    delimiter=info['delimiter'].values[0],
)
df_community_centres.head()

Unnamed: 0,NAME,ADDRESS,URLLINK,Geom,Geo Local Area
0,Dunbar,4747 Dunbar St,http://vancouver.ca/parks/cc/dunbar/index.htm,"{""type"": ""Point"", ""coordinates"": [-123.1883, 4...",Dunbar-Southlands
1,Hastings,3096 E Hastings St,http://vancouver.ca/parks/cc/hastings/index.htm,"{""type"": ""Point"", ""coordinates"": [-123.0393, 4...",Hastings-Sunrise
2,Gathering Place Community Centre,609 Helmcken St,http://vancouver.ca/parks-recreation-culture/g...,"{""type"": ""Point"", ""coordinates"": [-123.1235, 4...",Downtown
3,Thunderbird,2311 Cassiar St,http://vancouver.ca/parks/cc/thunderbird/index...,"{""type"": ""Point"", ""coordinates"": [-123.0321, 4...",Hastings-Sunrise
4,Douglas Park,801 W 22nd Av,http://vancouver.ca/parks/cc/douglas/index.htm,"{""type"": ""Point"", ""coordinates"": [-123.1213, 4...",South Cambie


In [10]:
# Find this file's metadata row, then read the CSV with its declared delimiter.
is_cultural_spaces = df_dsInformation['filename'] == 'cultural-spaces.csv'
info = df_dsInformation[is_cultural_spaces]
df_cultural_spaces = pd.read_csv(
    'datasets/cultural-spaces.csv',
    delimiter=info['delimiter'].values[0],
)
df_cultural_spaces.head()

Unnamed: 0,YEAR,CULTURAL_SPACE_NAME,WEBSITE,TYPE,PRIMARY_USE,ADDRESS,LOCAL_AREA,OWNERSHIP,SQUARE_FEET,NUMBER_OF_SEATS,ACTIVE_SPACE,Geom
0,2017,15th Field Artillery Regiment Museum and Archives,www.memorybc.ca/museum-of-15th-field-artillery...,Museum/Gallery,Museum/Gallery,"2025 W 11th Av, Vancouver, BC, V6J 2C7",Kitsilano,Privately Owned,,,Yes,"{""type"": ""Point"", ""coordinates"": [-123.151123,..."
1,2017,221A Artist Run Centre,www.221a.ca/,Museum/Gallery,Museum/Gallery,"221 E Georgia St, Vancouver, BC, V6A 1Z6",Strathcona,Privately Owned,9000.0,,Yes,"{""type"": ""Point"", ""coordinates"": [-123.098796,..."
2,2017,Acme Studios,https://www.facebook.com/ACMEstudiobuilding,Studio/Rehearsal,Artist Studio,"112 E Hastings St, Vancouver, BC, V6A 4J1",Downtown,Privately Owned,18000.0,,Yes,"{""type"": ""Point"", ""coordinates"": [-123.101791,..."
3,2017,AHVA Gallery,http://gallery.ahva.ubc.ca/about/,Museum/Gallery,Museum/Gallery,"6398 University Blvd, Vancouver, BC,",UBC,Other,1990.0,,Yes,"{""type"": ""Point"", ""coordinates"": [-123.2549451..."
4,2017,Al Mozaico Flamenco Dance Academy,http://www.mozaicoflamenco.com/,Educational,Educational Institution,"828 E Hastings St, Vancouver, BC, V6A 1R6",Strathcona,Privately Owned,,,Yes,"{""type"": ""Point"", ""coordinates"": [-123.0864434..."


In [11]:
# No credentials are passed explicitly here; presumably the client picks them
# up from the environment (e.g. OPENAI_API_KEY) — confirm for the installed SDK.
client = OpenAI()

In [13]:
# One model-generated description per column of df_community_centres, in column order.
# The per-iteration `info` lookup was removed: its result was never used.
# NOTE(review): the whole Series (not just the column label) is passed as the
# target column, so its printed form ends up in the prompt — confirm this is intended.
community_centres_descriptions = [
    generate_predictions(df_community_centres, df_community_centres.iloc[:, col_idx], client)
    for col_idx in range(df_community_centres.shape[1])
]

In [14]:
# Show the generated descriptions for the community-centres columns.
community_centres_descriptions

["The target column in the given table is named 'NAME'. This column contains the names of different community centers or parks in Vancouver. Each row in the column represents the name of a specific location such as Dunbar, Hastings, Gathering Place Community Centre, Thunderbird, etc. This column provides a unique identifier for each entry in the dataset, allowing users to easily distinguish between the different locations listed in the table.",
 "The target column is the 'ADDRESS' column in the given table. It contains the street addresses of various locations such as community centers, parks, and memorial sites in Vancouver. Each value in the 'ADDRESS' column represents the physical location address of the respective facility. The addresses include street numbers and names, along with any additional details such as building names or numbers.",
 "The target column is named 'URLLINK' and contains URLs linking to websites related to various community centers or parks in Vancouver. Each r

In [16]:
# One model-generated description per column of df_cultural_spaces, in column order.
# The per-iteration `info` lookup was removed: it was never used and filtered on
# 'community-centres.csv', a copy-paste leftover unrelated to this table.
# NOTE(review): the whole Series (not just the column label) is passed as the
# target column, so its printed form ends up in the prompt — confirm this is intended.
cultural_spaces_descriptions = [
    generate_predictions(df_cultural_spaces, df_cultural_spaces.iloc[:, col_idx], client)
    for col_idx in range(df_cultural_spaces.shape[1])
]

In [17]:
# Show the generated descriptions for the cultural-spaces columns.
cultural_spaces_descriptions

['The target column in the given table is named "YEAR". It contains the year associated with each entry in the table. In this specific case, all the values in the "YEAR" column are 2017, indicating that the data pertains to cultural spaces and related information from the year 2017. The data seems to be focused on cultural spaces, their types, addresses, ownership, square footage, and other relevant details from the year 2017.',
 "The target column is named 'CULTURAL_SPACE_NAME'. It contains the names of various cultural spaces such as museums, galleries, studios, community centers, and performance spaces in Vancouver. Each value in this column represents the name of a specific cultural space, providing a unique identifier for the cultural establishment. The values in this column are textual and descriptive, enabling easy identification and categorization of each cultural space mentioned in the dataset.",
 "The target column is 'WEBSITE' and it contains the website URLs associated with

In [19]:
# Score every (community-centres column, cultural-spaces column) pair.
# The result is a flat list in row-major order: all cultural-spaces columns for
# the first community-centres column, then for the second, and so on.
likelihoods = [
    generate_likelihood(centre_description, space_description, client)
    for centre_description in community_centres_descriptions
    for space_description in cultural_spaces_descriptions
]

In [22]:
# Report every column pair the model scored exactly '10'.
# `likelihoods` is a flat, row-major list, so the row stride is the number of
# cultural-spaces columns; it is derived from the data instead of hard-coding 12.
n_cultural_cols = len(cultural_spaces_descriptions)
for i in range(len(community_centres_descriptions)):
    for j in range(n_cultural_cols):
        score = likelihoods[n_cultural_cols * i + j]
        # Scores are strings as returned by the model, hence the string comparison.
        if score == '10':
            print(f'The similarity between column {df_community_centres.columns[i]} from community_centres.csv and the column {df_cultural_spaces.columns[j]} from cultural-spaces.csv is {score}.')

The similarity between column NAME from community_centres.csv and the column LOCAL_AREA from cultural-spaces.csv is 10.
The similarity between column ADDRESS from community_centres.csv and the column ADDRESS from cultural-spaces.csv is 10.
The similarity between column ADDRESS from community_centres.csv and the column Geom from cultural-spaces.csv is 10.
The similarity between column URLLINK from community_centres.csv and the column Geom from cultural-spaces.csv is 10.
The similarity between column Geom from community_centres.csv and the column CULTURAL_SPACE_NAME from cultural-spaces.csv is 10.
The similarity between column Geom from community_centres.csv and the column Geom from cultural-spaces.csv is 10.
The similarity between column Geo Local Area from community_centres.csv and the column LOCAL_AREA from cultural-spaces.csv is 10.


In [24]:
def generate_joins_predictions(table1, table2, client):
    """Ask the model which key(s) could join two tables, given their column descriptions.

    Args:
        table1: Column descriptions of the first table.
        table2: Column descriptions of the second table.
        client: Chat client forwarded to ``execute_prompt``.

    Returns:
        The text of the first choice, reduced to whatever follows the last
        ``'Potential keys: '`` marker (the whole reply if the marker is
        absent), with surrounding whitespace removed.
    """
    system_msg, user_msg = generate_prompt_joins_from_description(table1, table2)
    response = execute_prompt(client, system_msg, user_msg)
    reply = response.choices[0].message.content
    # rpartition yields the text after the last marker, or the full reply when
    # the marker does not occur.
    _, _, potential_keys = reply.rpartition('Potential keys: ')
    return potential_keys.strip()

In [37]:
# Ask for join keys using the two lists of column descriptions; the lists are
# passed as-is, so they are rendered into the prompt in their printed list form.
join_prediction = generate_joins_predictions(community_centres_descriptions, cultural_spaces_descriptions, client)
join_prediction

"Based on the descriptions of the columns in Table 1 and Table 2, the best key that would allow a join between these tables would be the combination of the following columns from each table:\n\n- From Table 1: 'NAME' column and 'ADDRESS' column (unique identifiers for community centers or parks in Vancouver).\n- From Table 2: 'CULTURAL_SPACE_NAME' column and 'ADDRESS' column (unique identifiers for cultural spaces in Vancouver).\n\nThese columns provide distinct names and physical addresses for the respective locations or facilities in each table, making them suitable keys for joining the two tables based on common identifiers."

In [38]:
# Break the free-text answer into paragraphs (split on blank lines) for readability.
join_prediction.split('\n\n')

['Based on the descriptions of the columns in Table 1 and Table 2, the best key that would allow a join between these tables would be the combination of the following columns from each table:',
 "- From Table 1: 'NAME' column and 'ADDRESS' column (unique identifiers for community centers or parks in Vancouver).\n- From Table 2: 'CULTURAL_SPACE_NAME' column and 'ADDRESS' column (unique identifiers for cultural spaces in Vancouver).",
 'These columns provide distinct names and physical addresses for the respective locations or facilities in each table, making them suitable keys for joining the two tables based on common identifiers.']

In [35]:
# Rows of the joinable-columns table for this file pair — presumably the
# reference answer to compare the model's suggestions against.
df_joinable_columns[(df_joinable_columns['ds_name'] == 'community-centres.csv') & (df_joinable_columns['ds_name_2'] == 'cultural-spaces.csv')]

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
20,community-centres.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA
