In [1]:
import json
import os
import pickle

import numpy as np
import pandas as pd
from openai import OpenAI
from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

In [2]:
def generate_prompt_predict_domain(table):
    """Build the prompt pair that asks the model for the domain of a table.

    The candidate domains are the top-level keys of
    ``cta_types_domain_reduced_6domain.json``. The user message embeds the
    values of at most the first 30 rows of ``table``.

    Args:
        table: pandas DataFrame whose domain should be predicted.

    Returns:
        Tuple ``(system_msg, user_msg)`` of prompt strings.
    """
    with open('cta_types_domain_reduced_6domain.json', 'r') as handle:
        domain_names = ", ".join(json.load(handle).keys())

    # Each prompt line is preceded by a newline plus a 12-space indent; the
    # empty first entry produces that separator in front of the first line.
    separator = "\n" + " " * 12
    system_msg = separator.join([
        "",
        'Answer the question based on the task and instructions below. If the question cannot be answered using the information provided answer with "Place".',
        f"Task: Classify the table given to you with only one of the following domains that are separated with comma: {domain_names}.",
        # The trailing space after the final sentence is part of the prompt.
        f"Instructions: 1. Look at the input given to you. 2. Look at the cell values in detail. 3. Decide if describes a {domain_names}. 4. Answer only with the predicted domain. ",
        'Example 1: Table: [["Friends Pizza", 2525, Cash Visa MasterCard, 7:30 AM]]',
        "Domain: Restaurant",
        "Example 2: Table: [[Museum/Gallery, Vancouver; BC; V6J 2C7, Kitsilano]]",
        "Domain: Place",
    ])

    preview = table.iloc[:30, :].values
    pad = " " * 19
    # strip() removes the space after "Domain:" at the end of the message.
    user_msg = f"Table: {preview}\n{pad}Domain: ".strip()

    return system_msg, user_msg

In [3]:
def generate_prompt_predict_cta(data_point, domain):
    """Build the prompt pair that asks the model to label one column.

    The candidate labels are the entries stored under ``domain`` in
    ``cta_types_domain_reduced_6domain.json``.

    Args:
        data_point: object exposing ``.values`` (the caller passes one
            DataFrame column); its values are embedded in the user message.
        domain: key into the JSON mapping that selects the label set.

    Returns:
        Tuple ``(system_msg, user_msg)`` of prompt strings.

    Raises:
        KeyError: if ``domain`` is not a key of the JSON mapping.
    """
    with open('cta_types_domain_reduced_6domain.json', 'r') as handle:
        label_names = ", ".join(json.load(handle)[domain])

    # Each prompt line is preceded by a newline plus a 12-space indent; the
    # empty first entry produces that separator in front of the first line.
    separator = "\n" + " " * 12
    system_msg = separator.join([
        "",
        "Answer the question based on the task, instructions and examples below. If the question cannot be answered using the information provided answer with \"I don't know\".",
        f"Task: Classify the text given to you with two of the following classes that are separated with comma: {label_names}.",
        "Instructions: 1. Look at the input given to you. 2. Look at the cell values in detail.",
        "Example 1: Column: [Kitsilano, Strathcona, Downtown, UBC, Downtown, Mount Pleasant]",
        "Label: addressLocality, streetAddress",
        "Example 2: Column: ['www.memorybc.ca/museum-of-15th-field-artillery-regiment','www.221a.ca/', 'https://www.facebook.com/ACMEstudiobuilding','http://gallery.ahva.ubc.ca/about/','http://www.mozaicoflamenco.com/', 'http://www.anzaclub.org','www.artbeatus.com', 'http://www.artsfactorysociety.ca/']",
        "Label: URL, Text",
    ])

    cells = data_point.values
    pad = " " * 19
    # strip() removes the space after "Label:" at the end of the message.
    user_msg = f"Column: {cells}\n{pad}Label: ".strip()

    return system_msg, user_msg

In [4]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg, model="gpt-3.5-turbo-0125"):
    """Send one system/user message pair to the chat-completions endpoint.

    The function is wrapped by tenacity's ``retry`` decorator, configured with
    ``wait_random_exponential(min=1, max=60)`` between attempts and
    ``stop_after_attempt(6)`` as the stop condition.

    Args:
        client: object exposing ``chat.completions.create`` (the notebook
            passes an ``OpenAI`` client).
        system_msg: content of the ``system`` message.
        user_msg: content of the ``user`` message.
        model: name of the chat model to query. Defaults to the model this
            notebook was originally run with, so existing calls are unchanged.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    messages = [
        {"role": "system", "content": f"{system_msg}"},
        {"role": "user", "content": f"{user_msg}"},
    ]
    return client.chat.completions.create(model=model, messages=messages)

In [5]:
def processing_output(prediction, domain):
    """Extract the known labels mentioned in a model answer.

    Labels are the entries stored under ``domain`` in
    ``cta_types_domain_reduced_6domain.json`` and are matched
    case-insensitively as substrings of ``prediction``. Longer labels are
    matched first and their occurrences masked, so a label that only appears
    inside a longer label (e.g. "Date" inside "DateTime") is not reported on
    its own.

    Args:
        prediction: text returned by the model; ``None`` or an empty string is
            treated as "no label mentioned".
        domain: key into the JSON mapping that selects the label set.

    Returns:
        List of matched labels in the order they are listed for the domain,
        or ``["none"]`` when no label matches.

    Raises:
        KeyError: if ``domain`` is not a key of the JSON mapping.
    """
    with open('cta_types_domain_reduced_6domain.json', 'r') as file:
        cta_type_domain = json.load(file)
    categories = cta_type_domain[domain]

    # Lower-case once instead of on every iteration; a missing completion
    # simply matches nothing.
    remaining = (prediction or "").lower()

    matched = set()
    for category in sorted(set(categories), key=len, reverse=True):
        needle = category.lower()
        if needle in remaining:
            matched.add(category)
            # Mask with spaces (not removal) so the neighbouring text cannot
            # fuse into an accidental new match.
            remaining = remaining.replace(needle, " " * len(needle))

    y_pred = [category for category in categories if category in matched]
    if not y_pred:
        y_pred.append("none")

    return y_pred

In [6]:
def _resolve_domain(raw_domain, known_domains):
    """Map the model's free-text domain answer onto one of ``known_domains``."""
    answer = raw_domain.strip()
    if answer in known_domains:
        return answer

    # Tolerate decoration such as quotes, a trailing period or different casing.
    cleaned = answer.strip(" \t\n\"'`.,;:!").lower()
    by_lower = {name.lower(): name for name in known_domains}
    if cleaned in by_lower:
        return by_lower[cleaned]

    # Accept a sentence-style answer when it names exactly one known domain.
    mentioned = [name for name in known_domains if name.lower() in cleaned]
    if len(mentioned) == 1:
        return mentioned[0]

    # The domain prompt designates "Place" as the answer of last resort.
    if "Place" in known_domains:
        return "Place"
    # Nothing usable: hand back the raw answer so the lookup downstream fails
    # exactly as it did before.
    return answer


def generate_predictions(dataframe, client):
    """Predict the domain of a table and then label each of its columns.

    One request asks for the table's domain; the answer is normalised against
    the keys of ``cta_types_domain_reduced_6domain.json`` so that minor
    formatting differences do not cause a ``KeyError`` further down. One more
    request per column (using at most the first 50 rows) asks for its labels.

    Args:
        dataframe: pandas DataFrame to annotate.
        client: client object forwarded to ``execute_prompt``.

    Returns:
        Tuple ``(X_test, y_pred, domain, predictions)``:
        ``X_test`` holds the per-column prompts, ``y_pred`` is a list with one
        list of labels per column, ``domain`` is the resolved domain and
        ``predictions`` is a DataFrame with columns "Column" and "Predicted".
    """
    system_msg_predict_domain, user_msg_predict_domain = generate_prompt_predict_domain(dataframe)
    result = execute_prompt(client, system_msg_predict_domain, user_msg_predict_domain)
    # NOTE(review): the SDK appears to type ``content`` as optional; treat a
    # missing answer as empty text instead of failing on ``None.split``.
    raw_domain = (result.choices[0].message.content or "").split('Domain: ')[-1]

    with open('cta_types_domain_reduced_6domain.json', 'r') as file:
        known_domains = list(json.load(file).keys())
    domain = _resolve_domain(raw_domain, known_domains)

    X_test = pd.DataFrame(dataframe.iloc[:50].apply(generate_prompt_predict_cta, args=(domain,), axis=0))

    # Transpose once: the loop reads position 0 as the system message and
    # position 1 as the user message of each column's prompt.
    prompts = X_test.T
    y_pred = []
    for i in range(X_test.shape[1]):
        result = execute_prompt(client, prompts.iloc[i, 0], prompts.iloc[i, 1])
        prediction = (result.choices[0].message.content or "").split('Label: ')[-1].strip()
        y_pred.append(processing_output(prediction, domain))

    predictions = pd.DataFrame({
                                "Column":dataframe.columns,
                                "Predicted":y_pred
                                })

    return X_test, y_pred, domain, predictions

In [7]:
# Create the API client with no explicit arguments.
# NOTE(review): presumably the API key is picked up from the environment — confirm.
client = OpenAI()

In [8]:
# Collect the sorted, de-duplicated file names that occur in either the
# 'ds_name' or the 'ds_name_2' column of the joinable-columns table.
df_joinable_columns = pd.read_csv('joinable_columns_90containment.csv')
all_joinable_files = np.unique(
    np.concatenate(
        (df_joinable_columns['ds_name'], df_joinable_columns['ds_name_2']),
        axis=0,
    )
)
all_joinable_files

array(['community-centres.csv', 'community-gardens-and-food-trees.csv',
       'cultural-spaces.csv', 'eo4.csv', 'eo_pr.csv', 'eo_xx.csv',
       'libraries.csv', 'population-census-of-botswana-2011.csv',
       'public-art-artists.csv', 'public-art.csv',
       'rental-standards-current-issues.csv', 'schools.csv',
       'statewise-census-data-in-india-1901-2011.csv',
       'street-intersections.csv'], dtype=object)

In [9]:
# Per-dataset metadata; the prediction loop in this notebook filters it by
# 'filename' and reads the 'delimiter' column to parse each dataset.
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')

In [10]:
# Index of the first table to process. With 13 only the last of the 14
# joinable tables is handled; set it to 0 to process every table.
START_INDEX = 13
PREDICTIONS_PATH = 'GPT3.5Turbo_predictions/GPT3.5Turbo_ST_predictions_dict_14.pkl'

# Create the output folder up front so a missing directory cannot discard
# predictions after the API calls have already been made.
os.makedirs(os.path.dirname(PREDICTIONS_PATH), exist_ok=True)

ST_predictions_dict = {}
for i in range(START_INDEX, len(all_joinable_files)):
    print(f'Making predictions for table {all_joinable_files[i]}. {i+1} out of {len(all_joinable_files)}')
    info  = df_dsInformation[df_dsInformation['filename'] == all_joinable_files[i]]
    if info.empty:
        # Fail with a readable message instead of an IndexError on .values[0].
        raise ValueError(
            f"No row for '{all_joinable_files[i]}' in datasetInformation_testbedXS.csv; "
            "cannot determine its delimiter."
        )
    df = pd.read_csv(f'datasets/{all_joinable_files[i]}', delimiter=info['delimiter'].values[0])
    X_test, y_pred, domain, predictions = generate_predictions(df, client)
    ST_predictions_dict[all_joinable_files[i]] = predictions

    # Checkpoint after every table so a later failure keeps finished results.
    with open(PREDICTIONS_PATH, 'wb') as f:
        pickle.dump(ST_predictions_dict, f)

Making predictions for table street-intersections.csv. 14 out of 14


In [11]:
# Display the per-table prediction frames collected by the loop above.
ST_predictions_dict

{'street-intersections.csv':            Column                         Predicted
 0       AT_STREET                            [none]
 1       ON_STREET                         [Integer]
 2         XSTREET  [addressLocality, streetAddress]
 3            Geom        [GeoCoordinates, GeoShape]
 4  Geo Local Area                 [addressLocality]}