In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from pydantic import BaseModel
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
# Lift the row cap so displayed DataFrames are never truncated.
pd.options.display.max_rows = None

In [3]:
# Project root: every relative path used in this notebook
# ('Description_test/...') is resolved against it after the chdir below.
# Set the CTA_TESTBED_DIR environment variable to run on another machine;
# without it the original location is used, so existing runs are unaffected.
path = os.environ.get('CTA_TESTBED_DIR', '/home/manoelflorencio/cta_for_jd/testbedXS')
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [4]:
def generate_prompt_cta_prediction(description, cta_types):
    """Build the (system, user) prompt pair for classifying one column description.

    Parameters
    ----------
    description : str
        Natural-language description of the target column.
    cta_types : iterable of str
        Allowed class labels; they are comma-joined and shown to the model twice.

    Returns
    -------
    tuple of (str, str)
        The system message (task, instructions and two worked examples) and the
        user message, which ends with ``Class:`` after whitespace stripping.
    """
    # Render the label list once; the template below references it twice.
    class_list = ",".join(cta_types)

    system_prompt = f"""
            Your task is to classify a column's description of a given table with only one of the following classes that are separated 
            with comma: {class_list}.

            Your instructions are:
                1. Look at the input given to you.
                2. Look at the description semantics in detail.
                3. Predict a class among the following classes {class_list} for the target column.
                4. Answer only with the predicted class.

            Example 1:
                Description: "The 'Sex' column in the table represents the gender of the animal listed in each row. The values in this column consist of abbreviations that denote both the sex and reproductive status of the animals. 'M' indicates a male animal, and 'F' stands for a female animal.
                Class: category

            Example 2:
                Description: "The ""GeoLocalArea"" column in the table represents the geographical local area within which the public artworks or installations are situated or associated.
                Class: streetAddress"""

    user_prompt = f"""Description: {description}
                    Class: """

    # Stripping removes the trailing space after "Class:" (and any outer
    # whitespace carried in by the description itself).
    return system_prompt, user_prompt.strip()

In [5]:
def generate_predictions(description, cta_types, client):
    """Ask the chat model for the class of one column description.

    Parameters
    ----------
    description : str
        Natural-language description of the target column.
    cta_types : iterable of str
        Allowed class labels, forwarded to the prompt builder.
    client :
        Client object forwarded unchanged to ``execute_prompt``.

    Returns
    -------
    str
        The model's answer with any echoed ``Class:`` prefix and surrounding
        whitespace removed. An empty string is returned when the response
        carries no text content.
    """
    system_msg_predict_cta, user_msg_predict_cta = generate_prompt_cta_prediction(description, cta_types)
    result = execute_prompt(client, system_msg_predict_cta, user_msg_predict_cta)

    # `content` may be None; fall back to '' so a single empty reply does not
    # raise AttributeError and abort a long batch of predictions.
    content = result.choices[0].message.content or ''

    # The user prompt ends with "Class:", so the reply may echo that prefix.
    # Splitting on 'Class:' (no trailing space) plus strip() handles both
    # "Class: x" and "Class:x".
    cta = content.split('Class:')[-1].strip()

    return cta

In [6]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg, model="gpt-4o", temperature=0.2):
    """Send one system + user message pair to the chat completions endpoint.

    The call is wrapped in a tenacity retry: up to 6 attempts, with a random
    exponential wait configured between 1 and 60 seconds.

    Parameters
    ----------
    client :
        Object exposing ``chat.completions.create`` (the notebook passes an
        ``OpenAI`` instance).
    system_msg, user_msg :
        Message contents; each is converted to ``str`` before sending.
    model : str, optional
        Model name; defaults to ``"gpt-4o"``, the value previously hard-coded.
    temperature : float, optional
        Sampling temperature; defaults to ``0.2``, the value previously
        hard-coded.

    Returns
    -------
    The completion object returned by ``client.chat.completions.create``.
    """
    messages = [
        {"role": "system", "content": str(system_msg)},
        {"role": "user", "content": str(user_msg)},
    ]
    completion = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=messages,
    )
    return completion

In [7]:
# Single client instance reused for every prediction request below.
# No credentials are passed explicitly; presumably they are picked up from
# the environment -- TODO confirm before running on a new machine.
client = OpenAI()

In [8]:
# Load the generated column descriptions: one row per table column, with the
# columns TableName, Column and Description (see the preview below).
descriptions = pd.read_csv('Description_test/all_descriptions.csv')
descriptions.head()

Unnamed: 0,TableName,Column,Description
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...
1,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...
2,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres..."
3,statewise-census-data-in-india-1901-2011.csv,LOCATION_F5,"The column ""LOCATION_F5"" in the table appears ..."
4,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,"The ""LOCATION_ID"" column contains identifiers ..."


In [9]:
# Candidate class labels offered to the model; they are comma-joined into the
# prompt by generate_prompt_cta_prediction.
# Presumably an array of label strings -- confirm against the .npy file.
cta_types = np.load('Description_test/CTA_from_descriptions/cta_types.npy')

In [10]:
# One model request per column description, in row order, so that `cta`
# lines up one-to-one with the rows of `descriptions`.
# The Description column is selected by name: unpacking `descriptions.values`
# positionally raises as soon as the CSV carries any extra column.
cta = [
    generate_predictions(description, cta_types, client)
    for description in tqdm(descriptions['Description'])
]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 257/257 [02:11<00:00,  1.95it/s]


In [11]:
# Attach the predictions as a new column; `cta` was built in the same row
# order as `descriptions`. Note this mutates `descriptions` in place.
descriptions['CTA'] = cta

In [12]:
# Persist the predictions so the matching step can reload them from disk;
# index=False keeps the row index out of the file.
descriptions.to_csv('Description_test/all_descriptions_with_cta.csv',index=False)

In [29]:
# Reload the saved predictions from disk; everything below works from this
# frame rather than from the in-memory `descriptions`.
descriptions_with_cta = pd.read_csv('Description_test/all_descriptions_with_cta.csv')

In [30]:
# Preview the first rows to check that the CTA column was saved and reloaded.
descriptions_with_cta.head()

Unnamed: 0,TableName,Column,Description,CTA
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,category
1,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...,Date
2,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres...",Country
3,statewise-census-data-in-india-1901-2011.csv,LOCATION_F5,"The column ""LOCATION_F5"" in the table appears ...",Boolean
4,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,"The ""LOCATION_ID"" column contains identifiers ...",Identifier


In [31]:
# Distinct table names; each one takes a turn as the "target" table when the
# cross-table matches are built.
table_names = descriptions_with_cta['TableName'].unique()
table_names

array(['statewise-census-data-in-india-1901-2011.csv',
       'road-ahead-current-road-closures.csv', 'property-tie-lines.csv',
       'public-art.csv', 'gvrd-sewer-trunk-mains.csv',
       'SCS_Staff_Salaries_data_30th_June 2010.csv', 'schools.csv',
       'rental-standards-current-issues.csv',
       'datasets_579296_1047868_authors.csv', 'survey_results_schema.csv',
       'animal-control-inventory-lost-and-found.csv',
       'glassdoor_wwfu_val_captions.csv', 'eo_xx.csv',
       'community-gardens-and-food-trees.csv',
       'road-ahead-upcoming-projects.csv', 'libraries.csv',
       'cultural-spaces.csv', 'datasets_517172_952401_train.csv',
       'public-art-artists.csv', 'eo4.csv', 'currency_exchange.csv',
       'eo_pr.csv', 'road-ahead-projects-under-construction.csv',
       'ability_ids.csv',
       'population-by-governorate-citizenship-and-gender.csv',
       'community-centres.csv', 'street-intersections.csv',
       'population-census-of-botswana-2011.csv'], dtype=object)

In [36]:
# Pair every column with each column of a *different* table that received the
# same predicted CTA. Every table takes a turn as the left-hand side, so each
# pair is emitted in both directions (A -> B and B -> A).
#
# Rows are collected in a plain list and the DataFrame is built once at the
# end: growing a frame with pd.concat inside the loop copies it on every
# match. Building it once also gives a running 0..n-1 index and the full
# six-column schema even when nothing matches.
match_columns = ['LEFT_TABLE', 'LEFT_COLUMN', 'LEFT_DESCRIPTION',
                 'RIGHT_TABLE', 'RIGHT_COLUMN', 'RIGHT_DESCRIPTION']
match_rows = []
for table_name in table_names:
    target_table = descriptions_with_cta[descriptions_with_cta['TableName'] == table_name]
    candidate_tables = descriptions_with_cta[descriptions_with_cta['TableName'] != table_name]
    for left_column, left_description, left_cta in zip(target_table['Column'],
                                                       target_table['Description'],
                                                       target_table['CTA']):
        # Element-wise == is False for missing values, so columns without a
        # prediction never match each other (same as the scalar comparison).
        same_cta = candidate_tables[candidate_tables['CTA'] == left_cta]
        for right_table, right_column, right_description in zip(same_cta['TableName'],
                                                                same_cta['Column'],
                                                                same_cta['Description']):
            match_rows.append((table_name, left_column, left_description,
                               right_table, right_column, right_description))

matches = pd.DataFrame(match_rows, columns=match_columns)

In [37]:
# Preview the first candidate pairs (left column vs. right column).
matches.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,public-art.csv,Type,"The ""Type"" column in the table represents the ..."
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,public-art.csv,Status,The 'Status' column in this table indicates th...
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,gvrd-sewer-trunk-mains.csv,EFFLUENT_TYPE,"The ""EFFLUENT_TYPE"" column in the table provid..."
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,gvrd-sewer-trunk-mains.csv,MATERIAL,"The ""MATERIAL"" column in the table represents ..."
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...,SCS_Staff_Salaries_data_30th_June 2010.csv,Grade,"The ""Grade"" column in the table represents the..."


In [38]:
# (rows, columns) of the match table -- a quick size check before saving.
matches.shape

(6246, 6)

In [40]:
# Write the candidate pairs to disk; index=False leaves the row index out.
matches.to_csv('Description_test/CTA_from_descriptions/cta_matches.csv', index=False)