In [17]:
import openai

import random
import string
import tiktoken
import pandas as pd
import json
import os
from dotenv import load_dotenv


from datetime import datetime
from pathlib import Path
from scripts.fact_gen import generate_facts_k_tokens
from scripts.build_prompt import build_prompt_for_all_keys


load_dotenv() # Get dat api key


from openai import OpenAI


# OpenAI client. Passing the key explicitly mirrors the SDK default
# (it would read OPENAI_API_KEY from the environment anyway).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# Directory for experiment logs; created if it does not exist yet.
LOG_DIR = Path("experiment_logs")
LOG_DIR.mkdir(exist_ok=True)

# Model that is queried, and the model name used to pick the tokenizer.
MODEL_NAME = "gpt-3.5-turbo"
ENCODING_NAME = "gpt-3.5-turbo"

# Tokenizer used for counting prompt tokens.
encoding = tiktoken.encoding_for_model(ENCODING_NAME)


In [18]:
def count_gpt_tokens(text: str) -> int:
    """Count the tokens in `text` as produced by the module-level tiktoken `encoding`."""
    token_ids = encoding.encode(text)
    return len(token_ids)

def query_gpt(prompt: str, model: str = MODEL_NAME, temperature: float = 0.0) -> str:
    """
    Send `prompt` to the model via the OpenAI Responses API
    (`client.responses.create`) and return the response's `output_text`.

    A fixed instruction tells the model to reply with only the correct
    answer on each line. `model` and `temperature` are forwarded unchanged.
    """
    request_kwargs = {
        "model": model,
        "instructions": "You are a helpful assistant. Respond with only the correct answer on each line.",
        "input": prompt,
        "temperature": temperature,
    }
    return client.responses.create(**request_kwargs).output_text

In [19]:
# Load the token vocabulary from JSON and de-duplicate it into a set.
filename = "data/tokens/alpha_tokens.json"
# Read as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII tokens.
with open(filename, "r", encoding="utf-8") as f:
    token_set = set(json.load(f))

In [20]:
def run_experiments(facts_list_sizes=(5, 10), token_sizes=(2, 3), trials=3):
    """
    Run the key/value recall experiment over a grid of settings.

    For every (num_facts, k, trial) combination this generates facts, builds
    a prompt that asks for all keys, queries the model, grades the reply and
    collects one record describing the run.

    Args:
        facts_list_sizes: Iterable of fact counts to test.
        token_sizes: Iterable of `k` values forwarded to
            `generate_facts_k_tokens`.
        trials: Number of repetitions per (num_facts, k) pair.

    Returns:
        pandas.DataFrame with one row per run. Columns: num_facts, k, trial,
        prompt_text, question_keys, key_value_dict, prompt_len, accuracy,
        num_correct, num_questions, mismatches, response, error.
        `error` is None when the API call succeeded and the exception message
        otherwise, so failed calls can be told apart from wrong answers.
    """
    single_token_vocab = list(token_set)
    # Materialise once: the inner loop is re-entered for every fact count,
    # which would silently yield nothing for a one-shot iterator.
    token_sizes = list(token_sizes)

    records = []
    for num_facts in facts_list_sizes:
        for k in token_sizes:
            for trial_idx in range(trials):
                # Generate facts and prompt
                facts_list, key_value_dict = generate_facts_k_tokens(num_facts, k, single_token_vocab)
                prompt_text, question_keys = build_prompt_for_all_keys(facts_list)
                prompt_len = count_gpt_tokens(prompt_text)

                # Query GPT. A failed call must not abort the whole sweep, so
                # the failure is recorded and the loop keeps going.
                error = None
                try:
                    response = query_gpt(prompt_text, model=MODEL_NAME, temperature=0.0)
                except Exception as e:
                    error = str(e)
                    response = f"ERROR: {error}"

                # Grade response (an error string is graded like any other reply).
                # NOTE(review): grade_response is not defined or imported in the
                # cells visible here - confirm it exists on a fresh kernel.
                accuracy, num_matches, total_questions, mismatches = grade_response(
                    response, question_keys, key_value_dict
                )

                # Build complete record
                record = {
                    "num_facts": num_facts,
                    "k": k,
                    "trial": trial_idx,
                    "prompt_text": prompt_text,  # Store actual prompt
                    "question_keys": question_keys,  # Store expected questions
                    "key_value_dict": key_value_dict,  # Store correct answers
                    "prompt_len": prompt_len,
                    "accuracy": accuracy,
                    "num_correct": num_matches,
                    "num_questions": total_questions,
                    "mismatches": mismatches,
                    "response": response,
                    "error": error,  # None unless the API call raised
                }

                # Log everything
                #log_experiment(record)
                # if accuracy < 1:
                #     log_mismatch_details(record)

                records.append(record)
                print(f"Completed: facts={num_facts}, k={k}, trial={trial_idx}, accuracy={accuracy:.2f}")

    return pd.DataFrame(records)

# Example usage (Uncomment once you have your API key set):
#results_df = run_experiments(facts_list_sizes=[50, 100, 150 ,200, 250,300,350,400,450], token_sizes=[1,2,3,4], trials=4)
#results_df = run_experiments(facts_list_sizes=[2,5,10,25,50,100], token_sizes=[1,2,5,10,25], trials=10)
#results_df.head()


In [21]:
# EXAMPLE PROMPT AND RESPONSE
# One small run end to end: generate facts, show the prompt, list the
# reference answers, then print the model's reply.
single_token_vocab = list(token_set)
print("Single-token vocab size:", len(single_token_vocab))

# Five facts; each key and each value is made of k = 3 tokens.
num_facts, k = 5, 3
facts_list, key_value_dict = generate_facts_k_tokens(num_facts, k, single_token_vocab)

# Build the single prompt that asks for all keys; question_keys_in_order
# is the order in which the keys are asked.
prompt, question_keys_in_order = build_prompt_for_all_keys(facts_list)

print("=== Prompt (Debug) ===\n")
print(prompt)
prompt_len = count_gpt_tokens(prompt)
print(f"\nPrompt token length: {prompt_len}")

# Reference answers, listed in the same order as the questions.
print("\n=== The correct ordering of values (reference) ===")
for key in question_keys_in_order:
    print(key_value_dict[key])

# Query GPT. This is a live API call and needs OPENAI_API_KEY to be set.
answer = query_gpt(prompt, model=MODEL_NAME, temperature=0.0)
print("\n=== GPT RESPONSE ===")
print(answer)


Single-token vocab size: 10000
=== Prompt (Debug) ===

Below are some arbitrary assignments of Key => Value:

ligne|kill|untu => play|jobs|ieme
def|reich|blk => ampus|ale|aval
plans|pg|forc => rical|reeze|kre
parm|oure|oglob => ison|shops|ammad
ene|aper|ive => inned|orra|ackle

Your task is to provide the correct Values for each of the following Keys, in the exact same order they are listed:

1) plans|pg|forc
2) def|reich|blk
3) ene|aper|ive
4) parm|oure|oglob
5) ligne|kill|untu

Answer with one value per line, matching the order of the keys above.

Prompt token length: 154

=== The correct ordering of values (reference) ===
rical|reeze|kre
ampus|ale|aval
inned|orra|ackle
ison|shops|ammad
play|jobs|ieme

=== GPT RESPONSE ===
1) rical|reeze|kre
2) ampus|ale|aval
3) inned|orra|ackle
4) ison|shops|ammad
5) play|jobs|ieme
