In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from pydantic import BaseModel
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
# Show every row when a DataFrame/Series is rendered (no row truncation).
pd.options.display.max_rows = None

In [3]:
# Root directory of the test bed; every relative path used below is resolved
# against it. The location can be overridden through the CTA_TESTBED_DIR
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original location is used.
path = os.environ.get('CTA_TESTBED_DIR', '/home/manoelflorencio/cta_for_jd/testbedXS')
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [23]:
def generate_prompt_cta_prediction(description, cta_types):
    """Build the system and user messages for the column-type classification prompt.

    Args:
        description: Text describing the target column; it is interpolated
            into the user message as-is.
        cta_types: Iterable of class-name strings. They are joined with commas
            and listed twice inside the system message.

    Returns:
        A ``(system_msg, user_msg)`` tuple of strings.
    """
    # Comma-separated list of candidate classes, reused in both places below.
    class_list = ",".join(cta_types)

    # The instructions and the two few-shot examples are prompt text and are
    # kept verbatim (including their original quoting).
    system_prompt = f"""
            Your task is to classify a column's description of a given table with only one of the following classes that are separated 
            with comma: {class_list}.

            Your instructions are:
                1. Look at the input given to you.
                2. Look at the description semantics in detail.
                3. Predict a class among the following classes {class_list} for the target column.
                4. Answer only with the predicted class.

            Example 1:
                Description: "The 'Sex' column in the table represents the gender of the animal listed in each row. The values in this column consist of abbreviations that denote both the sex and reproductive status of the animals. 'M' indicates a male animal, and 'F' stands for a female animal.
                Class: category

            Example 2:
                Description: "The ""GeoLocalArea"" column in the table represents the geographical local area within which the public artworks or installations are situated or associated.
                Class: streetAddress"""

    # strip() removes the trailing space after "Class:"; the indentation in
    # front of "Class:" is inside the string and is left untouched.
    user_prompt = f"""Description: {description}
                    Class: """.strip()

    return system_prompt, user_prompt

In [24]:
def generate_predictions(description, cta_types, client):
    """Ask the model to classify one column description and return its answer.

    Args:
        description: Column description forwarded to the prompt builder.
        cta_types: Candidate class names forwarded to the prompt builder.
        client: Client object handed to ``execute_prompt``.

    Returns:
        The text of the first choice's message, reduced to whatever follows
        the last ``'Class: '`` marker (the whole text when the marker is
        absent), with surrounding whitespace removed.
    """
    prompts = generate_prompt_cta_prediction(description, cta_types)
    completion = execute_prompt(client, *prompts)
    answer_text = completion.choices[0].message.content

    # Keep only the part after the last 'Class: ' in case the model echoes the label.
    _, _, predicted_class = answer_text.rpartition('Class: ')
    return predicted_class.strip()

In [4]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg, model="gpt-4o", temperature=0.2):
    """Send one system/user message pair to the chat-completions endpoint.

    The call is wrapped by ``retry``: a failing call is re-attempted with a
    randomised exponential wait (bounds 1 and 60) and abandoned after 6
    attempts.

    Args:
        client: Object exposing ``chat.completions.create``.
        system_msg: Content of the system message (formatted to a string).
        user_msg: Content of the user message (formatted to a string).
        model: Model identifier passed to the endpoint. Defaults to the value
            that was previously hard-coded.
        temperature: Sampling temperature passed to the endpoint. Defaults to
            the value that was previously hard-coded.

    Returns:
        The completion object returned by ``client.chat.completions.create``.
    """
    messages = [
        {"role": "system", "content": f"{system_msg}"},
        {"role": "user", "content": f"{user_msg}"},
    ]
    return client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=messages,
    )

In [5]:
# Client shared by every prompt call below. No credentials are passed here;
# presumably they are picked up from the environment — TODO confirm.
client = OpenAI()

In [6]:
# Load the column descriptions; the prediction loop below filters this frame
# on its 'TableName' and 'Column' fields and reads 'Description'.
descriptions = pd.read_csv('Description_test/all_descriptions.csv')
descriptions.head()

Unnamed: 0,TableName,Column,Description
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...
1,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...
2,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres..."
3,statewise-census-data-in-india-1901-2011.csv,LOCATION_F5,"The column ""LOCATION_F5"" in the table appears ..."
4,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,"The ""LOCATION_ID"" column contains identifiers ..."


In [63]:
# NOTE(review): this cell was executed as In [63], i.e. after `columns` had
# already received its CTA column. In file order `columns` is only defined in
# a later cell, so on a fresh top-to-bottom run this line fails.
columns.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,CTA
0,population-by-governorate-citizenship-and-gend...,Year,Date
1,community-gardens-and-food-trees.csv,MAPID,Identifier
2,population-census-of-botswana-2011.csv,REGION_ID,Identifier
3,population-by-governorate-citizenship-and-gend...,Governorate,Country
4,population-census-of-botswana-2011.csv,REGION_REGIONID,Identifier


In [7]:
# Load the (table, column) pairs whose semantic type has to be predicted.
columns = pd.read_csv('Description_test/CTA_from_descriptions/columns_to_predict.csv')
columns.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN
0,population-by-governorate-citizenship-and-gend...,Year
1,community-gardens-and-food-trees.csv,MAPID
2,population-census-of-botswana-2011.csv,REGION_ID
3,population-by-governorate-citizenship-and-gend...,Governorate
4,population-census-of-botswana-2011.csv,REGION_REGIONID


In [28]:
# Candidate class labels offered to the model, loaded as a NumPy array.
cta_types = np.load('Description_test/CTA_from_descriptions/cta_types.npy')

array(['Boolean', 'Coordinates', 'Country', 'CreativeWork', 'Date',
       'Event', 'Gender', 'JobPosting', 'Language', 'Company', 'Number',
       'Organization', 'Person', 'Product', 'SportsTeam', 'Text', 'Time',
       'URL', 'category', 'currency', 'email', 'price', 'streetAddress',
       'telephone', 'Age', 'weight', 'zipCode', 'Identifier'],
      dtype='<U13')

In [35]:
# Index the description text by (table, column). When a pair occurs more than
# once the first occurrence wins.
description_lookup = {}
for table, column, text in descriptions[['TableName', 'Column', 'Description']].itertuples(index=False, name=None):
    description_lookup.setdefault((table, column), text)

# Select the two key columns explicitly so this cell can be re-run after the
# 'CTA' column has been added to `columns` further down.
target_columns = columns[['LEFT_TABLE', 'LEFT_COLUMN']].values

# Fail before any API call is made if a description is missing.
missing_descriptions = [
    (table_name, column_name)
    for table_name, column_name in target_columns
    if (table_name, column_name) not in description_lookup
]
if missing_descriptions:
    raise KeyError(f"No description found for: {missing_descriptions}")

cta = []
for table_name, column_name in tqdm(target_columns):
    # Pass the description *string* to the prompt. Passing the filtered Series
    # would embed its printed form (index label, name/dtype footer) instead of
    # the plain text.
    target_description = description_lookup[(table_name, column_name)]
    cta.append(generate_predictions(target_description, cta_types, client))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 202/202 [01:41<00:00,  1.99it/s]


In [36]:
# Inspect the raw predictions; they are in the same order as the rows of `columns`.
cta

['Date',
 'Identifier',
 'Identifier',
 'Country',
 'Identifier',
 'Identifier',
 'Date',
 'Text',
 'Text',
 'URL',
 'URL',
 'category',
 'streetAddress',
 'streetAddress',
 'streetAddress',
 'Coordinates',
 'streetAddress',
 'Text',
 'streetAddress',
 'streetAddress',
 'Text',
 'category',
 'Coordinates',
 'Coordinates',
 'CreativeWork',
 'streetAddress',
 'streetAddress',
 'streetAddress',
 'streetAddress',
 'Text',
 'Date',
 'Date',
 'URL',
 'URL',
 'Coordinates',
 'Coordinates',
 'Coordinates',
 'Coordinates',
 'Identifier',
 'Identifier',
 'URL',
 'streetAddress',
 'category',
 'streetAddress',
 'Text',
 'Coordinates',
 'streetAddress',
 'CreativeWork',
 'Date',
 'streetAddress',
 'streetAddress',
 'streetAddress',
 'streetAddress',
 'Coordinates',
 'Coordinates',
 'Coordinates',
 'streetAddress',
 'streetAddress',
 'Text',
 'Person',
 'Organization',
 'URL',
 'Text',
 'Text',
 'URL',
 'streetAddress',
 'streetAddress',
 'Number',
 'Person',
 'Person',
 'Text',
 'Text',
 'category

In [37]:
# Attach the predictions to `columns` (modifies the frame in place); relies on
# `cta` having one entry per row, in row order.
columns['CTA'] = cta

In [38]:
# Display the frame together with the newly added CTA column.
columns

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,CTA
0,population-by-governorate-citizenship-and-gend...,Year,Date
1,community-gardens-and-food-trees.csv,MAPID,Identifier
2,population-census-of-botswana-2011.csv,REGION_ID,Identifier
3,population-by-governorate-citizenship-and-gend...,Governorate,Country
4,population-census-of-botswana-2011.csv,REGION_REGIONID,Identifier
5,datasets_579296_1047868_authors.csv,Author_ID,Identifier
6,cultural-spaces.csv,YEAR,Date
7,cultural-spaces.csv,CULTURAL_SPACE_NAME,Text
8,community-centres.csv,NAME,Text
9,cultural-spaces.csv,WEBSITE,URL


In [64]:
# Persist the predicted types next to the input file, without the index column.
columns.to_csv('Description_test/CTA_from_descriptions/columns_with_cta.csv', index=False)

In [39]:
# Load the candidate joins that will be filtered by comparing the predicted
# type of the left column with that of the right column.
predicted_joins = pd.read_csv('Description_test/predicted_joins_df_20241111.csv')
predicted_joins.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY
0,population-by-governorate-citizenship-and-gend...,Year,statewise-census-data-in-india-1901-2011.csv,DATE,population-by-governorate-citizenship-and-gend...
1,community-gardens-and-food-trees.csv,MAPID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,community-gardens-and-food-trees.csv#MAPID#sta...
2,population-census-of-botswana-2011.csv,REGION_ID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,population-census-of-botswana-2011.csv#REGION_...
3,population-by-governorate-citizenship-and-gend...,Governorate,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-by-governorate-citizenship-and-gend...
4,population-census-of-botswana-2011.csv,REGION_REGIONID,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-census-of-botswana-2011.csv#REGION_...


In [50]:
# Build a (table, column) -> CTA lookup once instead of filtering `columns`
# for every join. When a pair occurs more than once the first occurrence wins,
# which matches taking the first match of a filtered frame.
cta_lookup = {}
for table, column, label in columns[['LEFT_TABLE', 'LEFT_COLUMN', 'CTA']].itertuples(index=False, name=None):
    cta_lookup.setdefault((table, column), label)

left_table_cta = []
right_table_cta = []

join_sides = predicted_joins[['LEFT_TABLE', 'LEFT_COLUMN', 'RIGHT_TABLE', 'RIGHT_COLUMN']]
for left_table, left_column, right_table, right_column in join_sides.itertuples(index=False, name=None):
    # A pair without a prediction raises KeyError naming the missing
    # (table, column) rather than an anonymous IndexError.
    left_table_cta.append(cta_lookup[(left_table, left_column)])
    right_table_cta.append(cta_lookup[(right_table, right_column)])

In [51]:
# Store the predicted type of each side of the join on `predicted_joins`
# (modifies the frame in place).
predicted_joins['LEFT_CTA'] = left_table_cta
predicted_joins['RIGHT_CTA'] = right_table_cta

In [66]:
# Joins whose two sides were given different predicted types.
predicted_joins.loc[predicted_joins['LEFT_CTA'].ne(predicted_joins['RIGHT_CTA'])]

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY,LEFT_CTA,RIGHT_CTA
3,population-by-governorate-citizenship-and-gend...,Governorate,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-by-governorate-citizenship-and-gend...,Country,Identifier
8,cultural-spaces.csv,CULTURAL_SPACE_NAME,eo_xx.csv,NAME,cultural-spaces.csv#CULTURAL_SPACE_NAME#eo_xx....,Text,Person
24,cultural-spaces.csv,ADDRESS,road-ahead-projects-under-construction.csv,LOCATION,cultural-spaces.csv#ADDRESS#road-ahead-project...,streetAddress,Coordinates
26,community-centres.csv,Geom,cultural-spaces.csv,ADDRESS,community-centres.csv#Geom#cultural-spaces.csv...,Coordinates,streetAddress
31,community-gardens-and-food-trees.csv,NAME,cultural-spaces.csv,LOCAL_AREA,community-gardens-and-food-trees.csv#NAME#cult...,Text,streetAddress
33,cultural-spaces.csv,LOCAL_AREA,libraries.csv,NAME,cultural-spaces.csv#LOCAL_AREA#libraries.csv#NAME,streetAddress,Text
35,community-centres.csv,NAME,cultural-spaces.csv,LOCAL_AREA,community-centres.csv#NAME#cultural-spaces.csv...,Text,streetAddress
39,community-gardens-and-food-trees.csv,JURISDICTION,cultural-spaces.csv,OWNERSHIP,community-gardens-and-food-trees.csv#JURISDICT...,category,Text
40,community-centres.csv,NAME,cultural-spaces.csv,NUMBER_OF_SEATS,community-centres.csv#NAME#cultural-spaces.csv...,Text,Number
50,road-ahead-current-road-closures.csv,PROJECT,schools.csv,ADDRESS,road-ahead-current-road-closures.csv#PROJECT#s...,CreativeWork,streetAddress


In [55]:
# Keep only the joins whose two sides share the same predicted type.
same_cta_mask = predicted_joins['LEFT_CTA'].eq(predicted_joins['RIGHT_CTA'])
filtered_predicted_joins = predicted_joins.loc[same_cta_mask]
filtered_predicted_joins.head()

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,RIGHT_TABLE,RIGHT_COLUMN,KEY,LEFT_CTA,RIGHT_CTA
0,population-by-governorate-citizenship-and-gend...,Year,statewise-census-data-in-india-1901-2011.csv,DATE,population-by-governorate-citizenship-and-gend...,Date,Date
1,community-gardens-and-food-trees.csv,MAPID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,community-gardens-and-food-trees.csv#MAPID#sta...,Identifier,Identifier
2,population-census-of-botswana-2011.csv,REGION_ID,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,population-census-of-botswana-2011.csv#REGION_...,Identifier,Identifier
4,population-census-of-botswana-2011.csv,REGION_REGIONID,statewise-census-data-in-india-1901-2011.csv,LOCATION_REGIONID,population-census-of-botswana-2011.csv#REGION_...,Identifier,Identifier
5,datasets_579296_1047868_authors.csv,Author_ID,statewise-census-data-in-india-1901-2011.csv,VARIABLE_ID,datasets_579296_1047868_authors.csv#Author_ID#...,Identifier,Identifier


In [62]:
# Persist the type-consistent joins, without the index column.
filtered_predicted_joins.to_csv('Description_test/CTA_from_descriptions/cta_filtered_join_prediction.csv', index=False)

In [56]:
# Load the ground-truth joinable pairs and keep only the four identifying fields.
groundTruth = pd.read_csv('joinable_columns_gt3_quality.csv')
groundTruth = groundTruth[['ds_name', 'att_name', 'ds_name_2', 'att_name_2']]

left_table_name = []
left_column_name = []
right_table_name = []
right_column_name = []

# Normalise the orientation of every pair: the table whose name sorts first
# goes on the left, and each column stays with its own table.
for ltn, lcn, rtn, rcn in groundTruth.itertuples(index=False, name=None):
    if ltn > rtn:
        ltn, lcn, rtn, rcn = rtn, rcn, ltn, lcn
    # When both table names are equal the pair is kept as given. Choosing each
    # side independently would put the left column on both sides and lose the
    # right column.
    left_table_name.append(ltn)
    left_column_name.append(lcn)
    right_table_name.append(rtn)
    right_column_name.append(rcn)

d = {'LEFT_TABLE':left_table_name,
     'LEFT_COLUMN':left_column_name,
     'RIGHT_TABLE':right_table_name,
     'RIGHT_COLUMN':right_column_name}
groundTruth_df = pd.DataFrame(d)
# After normalisation the same pair listed in both directions collapses to one row.
groundTruth_df = groundTruth_df.drop_duplicates()
groundTruth_df = groundTruth_df.reset_index(drop=True)

# KEY is the four fields joined with '#'; it is compared against the KEY
# column of the predicted joins.
key = ["#".join(row) for row in groundTruth_df.itertuples(index=False, name=None)]
groundTruth_df['KEY'] = key

In [57]:
# Compare predicted joins with the ground truth through their KEY strings.
predicted_keys = filtered_predicted_joins['KEY']
truth_keys = groundTruth_df['KEY']

# tp / fp are counted per predicted row, fn per ground-truth row.
tp = int(predicted_keys.isin(truth_keys).sum())
fp = len(predicted_keys) - tp
fn = int((~truth_keys.isin(predicted_keys)).sum())

# Fall back to 0.0 instead of raising ZeroDivisionError when there are no
# predictions, no ground-truth rows, or no true positives.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

print(precision, recall, f1_score)

0.13662790697674418 0.8867924528301887 0.2367758186397985


In [58]:
# Number of predicted joins kept after the type filter (equals tp + fp above).
len(filtered_predicted_joins)

344

In [59]:
# Number of distinct ground-truth pairs (equals tp + fn above when every
# matched key is predicted exactly once).
len(groundTruth_df)

53

# False positives (predicted joins that are not in the ground truth)

In [None]:
# List the predicted joins that do not appear in the ground truth.
# The frame used is `filtered_predicted_joins`, the same one the metrics are
# computed on; the name `predicted_joins_df` is not defined in this notebook.
is_false_positive = ~filtered_predicted_joins['KEY'].isin(groundTruth_df['KEY'])
for false_positive_key in filtered_predicted_joins.loc[is_false_positive, 'KEY']:
    print(false_positive_key)

# False negatives (ground-truth joins that are missing from the filtered predictions)

In [72]:
# Predicted types for every column of eo_xx.csv.
columns.loc[columns['LEFT_TABLE'].eq('eo_xx.csv')]

Unnamed: 0,LEFT_TABLE,LEFT_COLUMN,CTA
50,eo_xx.csv,STREET,streetAddress
59,eo_xx.csv,NAME,Person
73,eo_xx.csv,EIN,Identifier
78,eo_xx.csv,CITY,Text
81,eo_xx.csv,STATE,Country
107,eo_xx.csv,SORT_NAME,Text
147,eo_xx.csv,TAX_PERIOD,Date
181,eo_xx.csv,ICO,Text
182,eo_xx.csv,GROUP,category
183,eo_xx.csv,SUBSECTION,Text


In [61]:
# List the ground-truth joins that are absent from the filtered predictions.
# The comparison uses the named KEY column and a single membership test
# instead of rescanning the predicted keys for every ground-truth row.
is_false_negative = ~groundTruth_df['KEY'].isin(filtered_predicted_joins['KEY'])
for false_negative_key in groundTruth_df.loc[is_false_negative, 'KEY']:
    print(false_negative_key)

eo4.csv#NAME#eo_pr.csv#NAME
eo4.csv#ICO#eo_pr.csv#ICO
public-art.csv#GeoLocalArea#street-intersections.csv#Geo Local Area
population-census-of-botswana-2011.csv#FREQUENCY#statewise-census-data-in-india-1901-2011.csv#FREQUENCY
eo4.csv#NAME#eo_xx.csv#NAME
eo4.csv#ICO#eo_xx.csv#ICO
