In [1]:
from datetime import datetime # Results & Storage
import json # 
from pathlib import Path # 

from scripts.fact_gen import generate_facts_k_tokens # Helper Functions
from scripts.build_prompt import build_prompt_for_all_keys #
from scripts.eval import evaluate_token_sequences # 

In [2]:
import os
from dotenv import load_dotenv # APIs
from openai import OpenAI
import tiktoken # Tokenizer

# Load environment variables via python-dotenv; OPENAI_API_KEY is expected
# to be available in the environment after this call.
load_dotenv()

# Model queried in the experiments, and the model name used to select
# the matching tokenizer.
MODEL_NAME = "gpt-3.5-turbo"
ENCODING_NAME = "gpt-3.5-turbo"

# Reading OPENAI_API_KEY from the environment is also the client's default,
# so the explicit argument could be omitted.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Tokenizer used for token counting
encoding = tiktoken.encoding_for_model(ENCODING_NAME)

In [3]:
def count_gpt_tokens(text: str) -> int:
    """Count the tokens in `text` using the notebook-level `encoding` tokenizer.

    Args:
        text: The string to tokenize.

    Returns:
        The number of items produced by `encoding.encode(text)`.
    """
    token_ids = encoding.encode(text)
    return len(token_ids)

def query_gpt(prompt: str, model: str = MODEL_NAME, temperature: float = 0.0) -> str:
    """
    Send `prompt` to the model through `client.responses.create`
    and return the text of the reply.

    Args:
        prompt: Text passed as the request `input`.
        model: Model name for the request; defaults to `MODEL_NAME`.
        temperature: Sampling temperature forwarded to the API (default 0.0).

    Returns:
        The `output_text` attribute of the API response.
    """
    # The instruction string is fixed for every request; only the prompt,
    # model and temperature vary per call.
    request_kwargs = {
        "model": model,
        "instructions": "You are a helpful assistant. Respond with only the correct answer on each line.",
        "input": prompt,
        "temperature": temperature,
    }
    api_response = client.responses.create(**request_kwargs)
    return api_response.output_text

In [4]:
# Load the token vocabulary into a set (drops duplicate entries).
# The encoding is passed explicitly so the read does not depend on the
# platform's default text encoding.
# Presumably the file holds a JSON array of strings — verify against the data file.
filename = "data/tokens/alpha_tokens.json"
with open(filename, "r", encoding="utf-8") as f:
    token_set = set(json.load(f))

# Convert to list and report the vocabulary size
single_token_vocab = list(token_set)
print("Single-token vocab size:", len(single_token_vocab))

# Verify that every vocabulary entry encodes to exactly one token
check_good = True
for token in single_token_vocab:
    n_tokens = count_gpt_tokens(token)  # tokenize once and reuse in the message
    if n_tokens != 1:
        print(f"Token '{token}' has {n_tokens} tokens, not 1.")
        check_good = False
if check_good:
    print("All tokens are single tokens.")

Single-token vocab size: 10000
All tokens are single tokens.


In [5]:
def grade_response(response_text: str, question_keys_in_order: list, key_value_dict: dict, num_facts: int, k: int):
    """
    Grade a model response against the expected answers.

    Steps:
    -- Flags a major format flaw when the response is empty, or when its
       first or last character is not alphabetic
    -- Splits the response into stripped, non-empty lines
    -- Counts tokens in the expected answers and in the response lines
    -- Scores the response lines against the expected answers

    Args:
        response_text: Raw text returned by the model.
        question_keys_in_order: Keys in the order the questions were asked.
        key_value_dict: Maps each key to its expected answer string.
        num_facts: Not used by this function; kept so existing calls keep working.
        k: Not used by this function; kept so existing calls keep working.

    Returns:
        A 4-tuple (scores, major_format_flaw, expected_token_count, response_token_count).
        `scores` is whatever `evaluate_token_sequences` returns for
        (response lines, expected answers).

    Raises:
        KeyError: If a key in `question_keys_in_order` is missing from `key_value_dict`.
    """
    # Treat a missing response like an empty one so the checks below cannot raise.
    response_text = response_text or ""

    # A response is flawed when it is empty or when EITHER end is a non-letter.
    # NOTE: str.isalpha() also accepts uppercase and non-ASCII letters, so this is
    # looser than a strict "a-z only" check.
    major_format_flaw = (
        not response_text
        or not response_text[0].isalpha()
        or not response_text[-1].isalpha()
    )

    # Expected response, in question order
    correct_response_seqs = [key_value_dict[key] for key in question_keys_in_order]

    # Parse model response: one answer per non-blank line
    response_seqs = [line.strip() for line in response_text.strip().split("\n") if line.strip()]

    # Token counts (using the tokenizer)
    expected_token_count = sum(count_gpt_tokens(seq) for seq in correct_response_seqs)
    response_token_count = sum(count_gpt_tokens(seq) for seq in response_seqs)

    # Evaluate
    scores = evaluate_token_sequences(response_seqs, correct_response_seqs)
    return scores, major_format_flaw, expected_token_count, response_token_count


In [6]:
def run_experiments(
    facts_list_sizes=[3, 6],
    token_sizes=[2, 3],
    trials=1,
    output_dir="results"
):
    """Run GPT response evaluations and save structured JSON per (num_facts, k) combo.

    For every (num_facts, k) pair, runs `trials` independent trials: generates
    facts, builds a prompt, queries the model, grades the reply, and appends a
    record. One JSON file per pair is written to `output_dir`, named
    "<num_facts>facts_<k>tokens_<UTC timestamp>.json".

    Reads the notebook-level `token_set` and `MODEL_NAME`.

    Args:
        facts_list_sizes: Iterable of fact counts to test (only iterated, never mutated).
        token_sizes: Iterable of `k` values to test (only iterated, never mutated).
        trials: Number of trials per (num_facts, k) pair.
        output_dir: Directory for the JSON files; created if missing.

    Returns:
        A human-readable message naming `output_dir`.

    Each trial record has the keys: trial, sequence_accuracy, token_accuracy,
    major_format_flaw, prompt_text, response_text, response_token_count,
    expected_response_text, expected_token_count, error. `error` is None on
    success, or the exception message when the API call failed.
    """
    # Local import keeps this cell self-contained; the top-of-notebook import
    # only brings in `datetime`.
    from datetime import timezone

    output_root = Path(output_dir)
    output_root.mkdir(parents=True, exist_ok=True)
    single_token_vocab = list(token_set)

    for num_facts in facts_list_sizes:
        for k in token_sizes:
            group_id = f"{num_facts}facts_{k}tokens"
            # Timezone-aware replacement for the deprecated datetime.utcnow();
            # the formatted string is the same UTC timestamp.
            timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
            file_id = f"{group_id}_{timestamp}"
            output_path = output_root / f"{file_id}.json"

            results_group = {
                "id": file_id,
                "num_facts": num_facts,
                "k": k,
                "trials": []
            }

            for trial_idx in range(trials):
                # Generate data
                facts_list, key_value_dict = generate_facts_k_tokens(num_facts, k, single_token_vocab)
                prompt_text, question_keys = build_prompt_for_all_keys(facts_list)

                # Best-effort: an API failure must not abort the whole run. The
                # failure is still graded as a response (as before), but it is
                # also recorded in `error` so such trials can be filtered out.
                error_message = None
                try:
                    response = query_gpt(prompt_text, model=MODEL_NAME, temperature=0.0)
                except Exception as e:
                    error_message = str(e)
                    response = f"ERROR: {error_message}"

                correct_response_text = "\n".join(key_value_dict[key] for key in question_keys)

                # Grade and evaluate
                (seq_acc, tok_acc), major_format_flaw, expected_token_count, response_token_count = grade_response(
                    response, question_keys, key_value_dict, num_facts, k
                )

                trial_record = {
                    "trial": trial_idx,
                    "sequence_accuracy": seq_acc,
                    "token_accuracy": tok_acc,
                    "major_format_flaw": major_format_flaw,
                    "prompt_text": prompt_text,
                    "response_text": response,
                    "response_token_count": response_token_count,
                    "expected_response_text": correct_response_text,
                    "expected_token_count": expected_token_count,
                    "error": error_message
                }

                results_group["trials"].append(trial_record)

            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(results_group, f, indent=2)

    return f"Saved results grouped by (num_facts, k) in: {output_dir}"


# Small run: a single (3 facts, 2 tokens) configuration with one trial.
# The returned status message is the cell's displayed output.
# NOTE(review): the directory name contains "6f" while facts_list_sizes is [3] —
# confirm the label matches the intended configuration.
run_experiments(
    facts_list_sizes=[3],
    token_sizes=[2],
    trials=1,
    output_dir="results/gpt3.5_3k_6f_2t" 
)


  timestamp = datetime.utcnow().strftime("%Y%m%dT%H%M%S")


'Saved results grouped by (num_facts, k) in: results/gpt3.5_3k_6f_2t'

In [7]:
# Environment check: print the path of the Python interpreter running this kernel.
import sys
print(sys.executable)

c:\Users\Marco\Desktop\FTAT\FTAT\venv\Scripts\python.exe
