In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
# Lift the row cap on DataFrame display so every row is rendered.
pd.options.display.max_rows = None

In [3]:
# Root of the LakeBench checkout. The notebook changes into it so that the
# relative paths used later ('datasets_SG/...', 'column_descriptions.csv')
# resolve against this directory.
# The LAKEBENCH_DIR environment variable overrides the default, so the notebook
# can run on machines where the original absolute path does not exist.
path = os.environ.get('LAKEBENCH_DIR', '/home/manoelflorencio/cta_for_jd/LakeBench')
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/LakeBench


In [4]:
def generate_prompt_column_description(table, column, max_rows=30):
    """Build the system and user messages asking an LLM to describe one column.

    Args:
        table: pandas DataFrame that contains the target column.
        column: Name of the column to describe.
        max_rows: Number of leading rows of ``table`` included in the prompt
            as sample values (default 30).

    Returns:
        A ``(system_msg, user_msg)`` tuple of strings. ``user_msg`` ends with
        the ``Description:`` label so the model continues from it.
    """
    system_msg = """
            Describe the semantics of a target column.
            Task: Describe in one phrase the information within a column in a given table using continuos text, without itemization.
            Instructions: 
                1. Look at the input given to you. 
                2. Look at the column values in detail. 
                3. Describe the target column. 
            """

    # Render names and values as plain Python lists. Interpolating the pandas
    # Index / NumPy array directly uses their reprs, which abbreviate long
    # content with "..." — on wide tables the target column's name and values
    # could be missing from the prompt entirely.
    column_names = table.columns.tolist()
    sample_values = table.iloc[:max_rows, :].values.tolist()

    user_msg = f"""Table columns: {column_names}
                   Table values:  {sample_values}
                   Target column: {column}
                   Description: """.strip()

    return system_msg, user_msg

In [5]:
def generate_predictions(df, table_name, client):
    """Request an LLM-written description for every column of ``df``.

    One prompt is built and sent per column, with a tqdm progress bar over
    the columns.

    Args:
        df: pandas DataFrame whose columns are described.
        table_name: Label stored in the ``TableName`` column of the result.
        client: Client object forwarded unchanged to ``execute_prompt``.

    Returns:
        DataFrame with one row per column of ``df`` and the columns
        ``TableName``, ``Column`` and ``Description``.
    """
    descriptions = []

    for column in tqdm(df.columns):
        system_msg, user_msg = generate_prompt_column_description(df, column)
        result = execute_prompt(client, system_msg, user_msg)
        # The message content is optional in the chat-completions response;
        # treat a missing value as an empty description instead of crashing.
        content = result.choices[0].message.content or ""
        # If the reply repeats the "Description: " label, keep only what follows it.
        descriptions.append(content.split('Description: ')[-1].strip())

    return pd.DataFrame({
        "TableName": [table_name] * len(descriptions),
        "Column": df.columns,
        "Description": descriptions,
    })

In [6]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg, model="gpt-4o"):
    """Send one system + user message pair to the chat-completions endpoint.

    The call is wrapped by tenacity's ``retry``: a failing attempt is retried
    with a randomized exponential wait (bounded by ``min=1`` / ``max=60``) and
    the decorator stops after 6 attempts.

    Args:
        client: Object exposing ``chat.completions.create`` (the notebook
            passes an ``OpenAI`` client).
        system_msg: Text sent with the ``system`` role.
        user_msg: Text sent with the ``user`` role.
        model: Model identifier passed to the API; defaults to ``"gpt-4o"``,
            the value that was previously hard-coded.

    Returns:
        The completion object returned by ``client.chat.completions.create``.
    """
    messages = [
        {"role": "system", "content": f"{system_msg}"},
        {"role": "user", "content": f"{user_msg}"},
    ]
    return client.chat.completions.create(model=model, messages=messages)

In [7]:
# API client shared by every request made in this notebook.
# No credentials are passed explicitly; presumably the SDK picks them up from
# the environment (e.g. OPENAI_API_KEY) — confirm before running elsewhere.
client = OpenAI()

In [8]:
# Every entry under datasets_SG/ (relative to the current working directory).
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the processing order reproducible across runs and machines.
filenames = sorted(glob.glob('datasets_SG/*'))

In [12]:
# Explicit list of tables to process; assigning it here replaces whatever
# `filenames` held before this cell ran.
filenames = [
    'datasets_SG/SG_CSV0000000000000925.csv',
    'datasets_SG/SG_CSV0000000000001714.csv',
]

In [13]:
# Combined result across all tables. Starts as an empty frame with the final
# schema so it is well-defined even when `filenames` is empty.
descriptions_df_complete = pd.DataFrame((), columns=['TableName','Column','Description'])
# Per-table results are collected here and concatenated in one call.
frames = []

for filename in filenames:
    # The file's base name identifies the table in the output.
    table_name = os.path.basename(filename)
    df = pd.read_csv(filename)
    descriptions_df = generate_predictions(df, table_name, client)
    frames.append(descriptions_df)
    # ignore_index=True gives the combined frame a clean 0..n-1 index instead
    # of repeating each table's own 0..k-1 labels.
    descriptions_df_complete = pd.concat(frames, ignore_index=True)
    # Rewrite the CSV after every table so finished work survives an
    # interruption of this long-running loop.
    descriptions_df_complete.to_csv('column_descriptions.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 101/101 [01:55<00:00,  1.15s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:53<00:00,  1.14s/it]
