In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

import pickle
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

from tqdm import tqdm

In [2]:
# Lift the row limit so displayed DataFrames are never truncated vertically.
pd.options.display.max_rows = None

In [3]:
# Root folder of the testbed. The relative paths used further down
# ('datasets/*', 'datasetInformation_testbedXS.csv' and the output CSV) are
# resolved against this directory because we chdir into it.
# The CTA_TESTBED_DIR environment variable overrides the location so the
# notebook can run on machines other than the original author's; when it is
# unset the original path is used unchanged.
path = os.environ.get('CTA_TESTBED_DIR', '/home/manoelflorencio/cta_for_jd/testbedXS')
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


In [4]:
def generate_prompt_table_prediction(table, table_name):
    """Build the system and user prompts asking for a one-phrase table description.

    Parameters
    ----------
    table : pandas.DataFrame
        Table to describe. Its column labels and its first 50 rows (rendered
        through the DataFrame's string representation) are embedded in the
        user prompt.
    table_name : str
        Name of the table, embedded verbatim in the user prompt.

    Returns
    -------
    tuple[str, str]
        ``(system_msg, user_msg)``. ``user_msg`` is stripped of leading and
        trailing whitespace and ends with ``Description:``.
    """

    # The prompt wording is intentionally left untouched (including the
    # "continuos" spelling) so outputs stay comparable with earlier runs.
    system_msg = """
            Given a table name, its respective columns names and some column values, describe the semantics of the given table.
            
            Describe the semantics of a given table.
            Task: Describe in one phrase the information within a table using continuos text, without itemization.
            Instructions: 
                1. Look at the input given to you.
                2. Look at the table and columns names.
                3. Look at the column values in detail. 
                4. Describe the target table. 
            """

    # Column labels are not guaranteed to be strings (e.g. integer labels on a
    # header-less table); str.join would raise TypeError on those, so convert.
    column_names = ','.join(map(str, table.columns))

    user_msg =  f"""Table Name: {table_name}
                    Columns Names: {column_names}
                    Table values: {table.iloc[:50,:]}
                    Description:""".strip()

    return system_msg, user_msg

In [12]:
def generate_predictions(table, table_name, client):
    """Ask the model for a description of ``table`` and return it as a one-row frame.

    Parameters
    ----------
    table : pandas.DataFrame
        Table to describe; forwarded to ``generate_prompt_table_prediction``.
    table_name : str
        Name of the table; used in the prompt and stored in the result.
    client
        Client object forwarded to ``execute_prompt``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``TableName`` and ``Description``.
        ``Description`` is an empty string when the response carries no text.
    """
    system_msg_describe_table, user_msg_describe_table = generate_prompt_table_prediction(table, table_name)
    result = execute_prompt(client, system_msg_describe_table, user_msg_describe_table)

    # The message content can be None (no text returned); calling .split on it
    # would raise AttributeError, so fall back to an empty string.
    content = result.choices[0].message.content or ''

    # Keep only the text after the last 'Description: ' label, in case the
    # model echoes the label from the prompt.
    description = content.split('Description: ')[-1].strip()

    description_df = pd.DataFrame({
                               "TableName": [table_name],
                               "Description":[description]
                               })

    return description_df

In [10]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg, model="gpt-4o"):
    """Send one system + user message pair to the chat-completions endpoint.

    The call is wrapped by ``retry``: it is attempted up to 6 times with a
    randomised exponential wait (min=1, max=60) between attempts.

    Parameters
    ----------
    client
        Object exposing ``chat.completions.create``.
    system_msg
        Content of the ``system`` message (converted to text).
    user_msg
        Content of the ``user`` message (converted to text).
    model : str, optional
        Model identifier passed to the API. Defaults to ``"gpt-4o"``, the
        value that used to be hard-coded.

    Returns
    -------
    The completion object returned by ``client.chat.completions.create``.
    """
    messages = [
        {"role": "system", "content": f"{system_msg}"},
        {"role": "user", "content": f"{user_msg}"},
    ]
    completion = client.chat.completions.create(model=model, messages=messages)
    return completion

In [7]:
# Client used for every chat-completion request in this notebook.
# No credentials are passed explicitly, so the SDK has to find them itself
# (presumably the OPENAI_API_KEY environment variable — confirm for your setup).
client = OpenAI()

In [8]:
# Every entry under datasets/ (relative to the current working directory).
# glob's result order depends on the file system, so sort it to make the
# processing order and the row order of the output CSV reproducible.
filenames = sorted(glob.glob('datasets/*'))

In [9]:
# Per-dataset metadata. The processing loop looks rows up by the 'filename'
# column and reads the CSV separator from the 'delimiter' column.
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')

In [13]:
# Destination of the accumulated descriptions (rewritten after every table).
output_csv = 'Description_test/table_descriptions_test/table_descriptions.csv'
# Create the output folder up front; otherwise the first save would fail only
# after an API request has already been made.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# Stays an empty, correctly-shaped frame when there are no input files.
descriptions_df_complete = pd.DataFrame((), columns=['TableName','Description'])
description_frames = []

for filename in tqdm(filenames):
    table_name = os.path.basename(filename)
    info = df_dsInformation[df_dsInformation['filename'] == table_name]
    if info.empty:
        # Without this check the failure is an opaque IndexError on .values[0].
        raise LookupError(
            f"No row for '{table_name}' in datasetInformation_testbedXS.csv; "
            "cannot determine its delimiter."
        )
    df = pd.read_csv(filename, delimiter=info['delimiter'].values[0])
    descriptions_df = generate_predictions(df, table_name, client)
    description_frames.append(descriptions_df)

    # Build the combined frame from the collected pieces: this avoids
    # concatenating onto an empty frame and yields a clean 0..n-1 index
    # (the previous reset_index call discarded its result).
    descriptions_df_complete = pd.concat(description_frames, ignore_index=True)
    # Checkpoint after each table so an interrupted run keeps finished work.
    descriptions_df_complete.to_csv(output_csv, index=False)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:47<00:00,  1.70s/it]
