In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from pydantic import BaseModel
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
pd.set_option('display.max_rows', None)

In [3]:
path = '/home/manoelflorencio/cta_for_jd/testbedXS'
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [4]:
def generate_prompt_cta_prediction(description, cta_types):

    system_msg = f"""
            Your task is to classify a column's description of a given table with only one of the following classes that are separated 
            with comma: {",".join(cta_types)}.

            Your instructions are:
                1. Look at the input given to you.
                2. Look at the description semantics in detail.
                3. Predict a class among the following classes {",".join(cta_types)} for the target column.
                4. Answer only with the predicted class.

            Example 1:
                Description: "The 'Sex' column in the table represents the gender of the animal listed in each row. The values in this column consist of abbreviations that denote both the sex and reproductive status of the animals. 'M' indicates a male animal, and 'F' stands for a female animal.
                Class: category

            Example 2:
                Description: "The ""GeoLocalArea"" column in the table represents the geographical local area within which the public artworks or installations are situated or associated.
                Class: streetAddress"""
            
    user_msg =  f"""Description: {description}
                    Class: """.strip()
    
    return system_msg, user_msg

In [5]:
def generate_predictions(description, cta_types, client):
    
    system_msg_predict_cta, user_msg_predict_cta = generate_prompt_cta_prediction(description, cta_types)
    result = execute_prompt(client, system_msg_predict_cta, user_msg_predict_cta)
    cta = result.choices[0].message.content.split('Class: ')[-1].strip()
    
    return cta

In [6]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg):
    completion = client.chat.completions.create(
                                            model="gpt-4o",
                                            temperature=0.2,
                                            messages=[
                                                        {
                                                            "role": "system", 
                                                             "content": f"{system_msg}"
                                                        },
                                                        {
                                                            "role": "user",
                                                            "content": f"{user_msg}"
                                                        }
                                                    ]
                                            )
    return completion

In [7]:
client = OpenAI()

In [32]:
descriptions = pd.read_csv('Description_test/CTA_from_descriptions/nextiajdxs_description_junio.csv')
descriptions.head()

Unnamed: 0,TableName,Column,Description
0,eo_xx,EIN,The Employer Identification Number (EIN) is a ...
1,eo_xx,NAME,The NAME column contains the official name of ...
2,eo_xx,ICO,The ICO column appears to contain names of ind...
3,eo_xx,STREET,The STREET column provides the street address ...
4,eo_xx,CITY,The CITY column specifies the city where the o...


In [10]:
cta_types = np.load('Description_test/CTA_from_descriptions/cta_types.npy')

In [11]:
cta = []
for table_name, column_name, description in tqdm(descriptions.values):
    target_description = description
    cta.append(generate_predictions(target_description, cta_types, client))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 265/265 [02:11<00:00,  2.02it/s]


In [33]:
descriptions['CTA'] = cta

In [13]:
descriptions.to_csv('Description_test/all_junio_descriptions_with_cta.csv',index=False)

In [75]:
descriptions_with_cta = pd.read_csv('Description_test/all_junio_descriptions_with_cta.csv')

In [76]:
descriptions_with_cta.head()

Unnamed: 0,TableName,Column,Description,CTA
0,eo_xx,EIN,The Employer Identification Number (EIN) is a ...,Identifier
1,eo_xx,NAME,The NAME column contains the official name of ...,Organization
2,eo_xx,ICO,The ICO column appears to contain names of ind...,Person
3,eo_xx,STREET,The STREET column provides the street address ...,streetAddress
4,eo_xx,CITY,The CITY column specifies the city where the o...,Text


In [77]:
table_names = descriptions_with_cta.TableName.unique()
table_names

array(['eo_xx', 'datasets_579296_1047868_authors', 'schools',
       'animal-control-inventory-lost-and-found',
       'datasets_517172_952401_train', 'eo_pr',
       'road-ahead-current-road-closures', 'street-intersections',
       'gvrd-sewer-trunk-mains', 'rental-standards-current-issues',
       'public-art', 'public-art-artists',
       'statewise-census-data-in-india-1901-2011', 'cultural-spaces',
       'libraries', 'population-census-of-botswana-2011', 'ability_ids',
       'survey_results_schema', 'glassdoor_wwfu_val_captions', 'eo4',
       'community-centres', 'road-ahead-projects-under-construction',
       'road-ahead-upcoming-projects',
       'SCS_Staff_Salaries_data_30th_June 2010', 'property-tie-lines',
       'population-by-governorate-citizenship-and-gender',
       'community-gardens-and-food-trees', 'currency_exchange'],
      dtype=object)

In [78]:
matches = pd.DataFrame({'LEFT_TABLE':[],'LEFT_COLUMN':[],'RIGHT_TABLE':[],'RIGHT_COLUMN':[]})
for table_name in table_names:
    target_table = descriptions_with_cta[descriptions_with_cta['TableName'] == table_name]
    candidate_tables = descriptions_with_cta[descriptions_with_cta['TableName'] != table_name]
    for i in range(len(target_table.Column)):
        for j in range(len(candidate_tables.Column)):
            if(target_table.CTA.values[i] == candidate_tables.CTA.values[j]):
                match = pd.DataFrame({'LEFT_TABLE':[table_name],
                                      'LEFT_COLUMN':[target_table.Column.values[i]],
                                      'LEFT_DESCRIPTION':[target_table.Description.values[i]],
                                      'RIGHT_TABLE':[candidate_tables.TableName.values[j]],
                                      'RIGHT_COLUMN':[candidate_tables.Column.values[j]],
                                      'RIGHT_DESCRIPTION':[candidate_tables.Description.values[j]]})
                if(len(matches) == 0):
                    matches = match
                else:
                    matches = pd.concat([matches, match], axis=0)

In [79]:
matches.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION
0,eo_xx,EIN,The Employer Identification Number (EIN) is a ...,datasets_579296_1047868_authors,Author_ID,This column contains a unique identifier for e...
0,eo_xx,EIN,The Employer Identification Number (EIN) is a ...,datasets_517172_952401_train,id_code,The 'id_code' column contains unique identifie...
0,eo_xx,EIN,The Employer Identification Number (EIN) is a ...,eo_pr,EIN,The Employer Identification Number (EIN) uniqu...
0,eo_xx,EIN,The Employer Identification Number (EIN) is a ...,eo_pr,GROUP,"Group exemption number, which indicates if the..."
0,eo_xx,EIN,The Employer Identification Number (EIN) is a ...,eo_pr,SUBSECTION,Numerical code indicating the subsection of th...


In [80]:
matches.shape

(5682, 6)

In [71]:
def generate_key_for_evaluation(df):
    left_table_name = []
    left_column_name = []
    left_description = []
    right_table_name = []
    right_column_name = []
    right_description = []
    df_key = pd.DataFrame((), columns=df.columns)
    
    for i in range(df.shape[0]):
        rtn = df.iloc[i, 3]
        ltn = df.iloc[i, 0]
        right_table_name.append(rtn if rtn > ltn else ltn)
        left_table_name.append(rtn if rtn < ltn else ltn)
        
    
        rcn = df.iloc[i, 4]
        lcn = df.iloc[i, 1]
        right_column_name.append(rcn if rtn > ltn else lcn)
        left_column_name.append(rcn if rtn < ltn else lcn)

        rcd = df.iloc[i, 5]
        lcd = df.iloc[i, 2]
        left_description.append(rcd if rtn > ltn else lcd)
        right_description.append(rcd if rtn < ltn else lcd)
    
    d = {'LEFT_TABLE':left_table_name,
         'LEFT_COLUMN':left_column_name,
         'LEFT_DESCRIPTION':left_description,
         'RIGHT_TABLE':right_table_name,
         'RIGHT_COLUMN':right_column_name,
         'RIGHT_DESCRIPTION':right_description}
    df_key = pd.DataFrame(d)
    df_key = df_key.drop_duplicates()
    df_key = df_key.reset_index(drop=True)

    return df_key

In [81]:
matches = generate_key_for_evaluation(matches)
matches.shape

(2841, 6)

In [83]:
matches.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,LEFT_DESCRIPTION,RIGHT_TABLE,RIGHT_COLUMN,RIGHT_DESCRIPTION
0,datasets_579296_1047868_authors,Author_ID,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,This column contains a unique identifier for e...
1,datasets_517172_952401_train,id_code,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,The 'id_code' column contains unique identifie...
2,eo_pr,EIN,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,The Employer Identification Number (EIN) uniqu...
3,eo_pr,GROUP,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,"Group exemption number, which indicates if the..."
4,eo_pr,SUBSECTION,The Employer Identification Number (EIN) is a ...,eo_xx,EIN,Numerical code indicating the subsection of th...


In [82]:
matches.to_csv('Description_test/CTA_from_descriptions/junio_description_cta_matches.csv', index=False)