In [None]:
import sys
sys.path.append('..')
from tools.llm_completions import get_gpt_completion, get_llama3_completion, get_claude3_completion, get_mixtral_completion
from tools.llm_prompting import llm_propose_features_get_prompt, llm_propose_features, llm_write_code_get_prompt, llm_write_code

In [None]:
# The JSON is just a structured form of column descriptions in the Kaggle data description page.
import json
# JSON text is UTF-8 by specification, so pin the encoding explicitly instead
# of relying on the platform's locale default (which is not UTF-8 everywhere).
with open('../descriptions/avito.json', encoding='utf-8') as f:
    table_desc = json.load(f)

In [None]:
from tools.evaluation import load_mock_data, load_metadata

# Load the metadata from the Avito schema file first, then hand it to the
# mock-data loader. Later cells use `dataframes` as a mapping keyed by table
# name whose values expose `.columns`.
avito_metadata = load_metadata('../schemas/avito.yaml')
dataframes = load_mock_data('avito', avito_metadata)

In [None]:
# Prune `table_desc` in place so that it only mentions tables and columns that
# actually exist in `dataframes`. We iterate over snapshots (`list(...)`) of the
# keys because entries are deleted during the loop.
for table_name in list(table_desc.keys()):
    if table_name not in dataframes:
        print(f'{table_name} not found in schema, removing...')
        del table_desc[table_name]
        # The table entry no longer exists: skip the column pass, which would
        # otherwise raise KeyError on both table_desc[table_name] and
        # dataframes[table_name].
        continue
    known_columns = dataframes[table_name].columns
    for col in list(table_desc[table_name].keys()):
        if col not in known_columns:
            print(f'{table_name}.{col} not found in schema, removing...')
            del table_desc[table_name][col]

In [None]:
# Build the feature-proposal prompt for "SearchStream" / "IsClick" (presumably
# the target table and target column -- TODO confirm) and print it for manual
# inspection. No completion function is passed here, unlike the
# llm_propose_features call in the next cell.
sample_prompt = llm_propose_features_get_prompt(table_desc, "SearchStream", "IsClick")
print(sample_prompt)

In [None]:
# Request feature proposals using the GPT completion function. Later cells
# index the result with [0] and use it as a DataFrame column, so it is
# presumably a list of feature descriptions -- TODO confirm.
gpt_feature_descs = llm_propose_features(table_desc, "SearchStream", "IsClick", get_gpt_completion)

In [None]:
# Preview the code-writing prompt for the first proposed feature only.
# NOTE(review): indexing [0] will fail if no features were proposed.
print(f'Prompt for feature {gpt_feature_descs[0]}')
print(llm_write_code_get_prompt(table_desc, "SearchStream", "IsClick", gpt_feature_descs[0]))

In [None]:
# Generate code for the proposed features with the GPT completion function.
# The result is later placed next to gpt_feature_descs as a DataFrame column,
# so presumably there is one code block per description -- TODO confirm.
gpt_code_blocks = llm_write_code(dataframes, table_desc, "SearchStream", "IsClick", gpt_feature_descs, get_gpt_completion)

In [None]:
# NOTE(review): the other project imports in this notebook come from the
# `tools` package (tools.llm_completions, tools.llm_prompting,
# tools.evaluation); this one imports a top-level `llm_executor` module.
# Confirm the module path is intended.
from llm_executor import collect_new_features
# Pass the mock dataframes, the 'SearchStream' table name and the GPT
# descriptions/code blocks to collect_new_features; presumably this executes
# the generated code -- TODO confirm. `gpt_results` is not read again in the
# cells shown here.
gpt_results = collect_new_features(dataframes, 'SearchStream', gpt_feature_descs, gpt_code_blocks)

In [None]:
import pandas as pd
# Put the GPT feature descriptions and their generated code side by side in a
# two-column frame and save it as CSV (to_csv writes the index by default).
gpt_code_df = pd.DataFrame(
    dict(feature_description=gpt_feature_descs, code=gpt_code_blocks)
)
gpt_code_df.to_csv(path_or_buf='../test_results/gpt.csv')

In [None]:
# Claude 3 run: propose features, generate code for them, then save the
# description/code pairs as CSV. Unlike the GPT cells, collect_new_features
# is not called here.
claude3_feature_descs = llm_propose_features(
    table_desc, "SearchStream", "IsClick", get_claude3_completion
)
claude3_code_blocks = llm_write_code(
    dataframes,
    table_desc,
    "SearchStream",
    "IsClick",
    claude3_feature_descs,
    get_claude3_completion,
)
claude3_code_df = pd.DataFrame(
    dict(feature_description=claude3_feature_descs, code=claude3_code_blocks)
)
claude3_code_df.to_csv(path_or_buf='../test_results/claude3.csv')

In [None]:
# Llama 3 run: propose features, generate code for them, then save the
# description/code pairs as CSV. Unlike the GPT cells, collect_new_features
# is not called here.
llama3_feature_descs = llm_propose_features(
    table_desc, "SearchStream", "IsClick", get_llama3_completion
)
llama3_code_blocks = llm_write_code(
    dataframes,
    table_desc,
    "SearchStream",
    "IsClick",
    llama3_feature_descs,
    get_llama3_completion,
)
llama3_code_df = pd.DataFrame(
    dict(feature_description=llama3_feature_descs, code=llama3_code_blocks)
)
llama3_code_df.to_csv(path_or_buf='../test_results/llama3.csv')

In [None]:
# Mixtral run: propose features, then generate code for them, both with the
# Mixtral completion function. The results are saved to CSV in the next cell.
mixtral_feature_descs = llm_propose_features(table_desc, "SearchStream", "IsClick", get_mixtral_completion)
mixtral_code_blocks = llm_write_code(dataframes, table_desc, "SearchStream", "IsClick", mixtral_feature_descs, get_mixtral_completion)


In [None]:
# Put the Mixtral feature descriptions and their generated code side by side
# in a two-column frame and save it as CSV (to_csv writes the index by default).
mixtral_code_df = pd.DataFrame(
    dict(feature_description=mixtral_feature_descs, code=mixtral_code_blocks)
)
mixtral_code_df.to_csv(path_or_buf='../test_results/mixtral.csv')