In [1]:
!pip install z3-solver

Collecting z3-solver
  Downloading z3_solver-4.15.1.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (602 bytes)
Downloading z3_solver-4.15.1.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 29.5/29.5 MB 58.9 MB/s eta 0:00:00
Installing collected packages: z3-solver
Successfully installed z3-solver-4.15.1.0


In [2]:
import os
from z3 import *
import time
import base64

In [5]:
from openai import OpenAI
# Module-level API client used by get_gpt_response. No credentials are passed
# here — presumably the SDK reads the key from the environment (OPENAI_API_KEY);
# TODO confirm it is set before this cell runs on a fresh kernel.
client = OpenAI()

In [6]:
def get_gpt_response(prompt, image_fp, model_str="gpt-4o-mini"):
  """Send a text prompt plus one image to the model and time the request.

  Args:
    prompt: Instruction text, sent as the first user message.
    image_fp: Path of the image file; its bytes are inlined as a base64 data URI.
    model_str: Model name passed to the Responses API.

  Returns:
    A tuple ``(output_text, total_tokens, elapsed_seconds)`` where
    ``elapsed_seconds`` covers only the API call itself.
  """
  import mimetypes  # local import so this cell does not depend on the imports cell

  with open(image_fp, "rb") as image_file:
    encoded_string = base64.b64encode(image_file.read()).decode()

  # Label the payload with the type implied by the file extension; fall back to
  # PNG (the previous hard-coded value) when the extension is unknown.
  mime_type = mimetypes.guess_type(image_fp)[0] or "image/png"
  data_uri = f"data:{mime_type};base64,{encoded_string}"

  # perf_counter is monotonic, so the measured latency cannot be skewed by
  # system clock adjustments the way time.time() differences can.
  start = time.perf_counter()
  response = client.responses.create(
    model=model_str,
    input=[
      {"role": "user", "content": prompt},
      {
        "role": "user",
        "content": [
          {
            "type": "input_image",
            "image_url": data_uri
          }
        ]
      }
    ]
  )
  elapsed = time.perf_counter() - start
  return response.output_text, response.usage.total_tokens, elapsed

In [7]:
def extract_solution(response):
  """Parse the last ``[[ ... ]]`` grid found in a model reply into rows of ints.

  The search starts at the last "[[" in the text and stops at the first "]]"
  after it, so any explanation or code fence around the grid is ignored.

  Args:
    response: Raw text returned by the model.

  Returns:
    A list of rows, each a list of ints, or None when no "[[ ... ]]" span exists.
  """
  import re  # local import so this cell does not depend on the imports cell

  strt = response.rfind("[[")
  if strt == -1:
    return None
  end = response.find("]]", strt)
  if end == -1:
    return None

  # The span stops just before the closing "]]", so every "]" left inside it
  # terminates one row. Numbers are read as whole tokens: scanning character by
  # character would turn "10" into the two cells 1 and 0.
  return [
    [int(token) for token in re.findall(r"\d+", row_text)]
    for row_text in response[strt:end].split("]")
  ]

In [8]:
def parse_block_constraints(puzzle, cells):
    """Translate every cage of a KenKen puzzle into a constraint over ``cells``.

    Args:
        puzzle: Iterable of cage dicts with the keys "op" (one of "", "add",
            "mul", "sub", "div"), "target" and "cells" (a list of
            ``(row, col)`` index pairs).
        cells: Grid indexed as ``cells[row][col]`` holding the variables the
            constraints are written over.

    Returns:
        A list with one constraint per cage, in puzzle order.

    Raises:
        ValueError: If a cage uses an unknown operator, or a "sub"/"div" cage
            does not have exactly two cells.
    """
    constraints = []
    for block in puzzle:
        op = block["op"]
        target = block["target"]
        vars_in_block = [cells[i][j] for i, j in block["cells"]]
        if op == "":
            # Single given cell: its value is the target itself.
            constraints.append(vars_in_block[0] == target)
        elif op == "add":
            constraints.append(Sum(vars_in_block) == target)
        elif op == "mul":
            product = vars_in_block[0]
            for v in vars_in_block[1:]:
                product *= v
            constraints.append(product == target)
        elif op == "sub" and len(vars_in_block) == 2:
            a, b = vars_in_block
            # Cage order is not significant, so accept either orientation.
            constraints.append(Or(a - b == target, b - a == target))
        elif op == "div" and len(vars_in_block) == 2:
            a, b = vars_in_block
            # `/` on z3 integer terms is integer division, so `a / b == target`
            # would accept inexact quotients (5 and 2 for target 2). Multiplying
            # back states the exact relation and keeps the constraint linear.
            constraints.append(Or(a == b * target, b == a * target))
        else:
            raise ValueError(f"Unsupported operation or malformed block: {block}")
    return constraints



In [9]:
def validate_solution(puzzle, size, solution):
  """Check a filled-in grid against the row/column rules and the puzzle's cages.

  Every cell is modelled as a solver integer, the KenKen rules are asserted,
  and each cell is then pinned to the proposed value; the grid is valid exactly
  when that combined system is satisfiable.

  Args:
    puzzle: Cage descriptions, forwarded to parse_block_constraints.
    size: Side length of the board.
    solution: Proposed grid, indexed as solution[row][col].

  Returns:
    True if the proposed grid satisfies every constraint, otherwise False.
  """
  side = range(size)
  grid = [[Int("x_%s_%s" % (r + 1, c + 1)) for c in side] for r in side]

  # Rules shared by every puzzle of this size.
  in_range = [And(1 <= var, var <= size) for row in grid for var in row]
  unique_rows = [Distinct(row) for row in grid]
  unique_cols = [Distinct([row[c] for row in grid]) for c in side]
  # Rules specific to this puzzle.
  cages = parse_block_constraints(puzzle, grid)
  # The candidate answer, one equality per cell.
  pinned = [grid[r][c] == solution[r][c] for r in side for c in side]

  checker = Solver()
  checker.add(in_range + unique_rows + unique_cols + cages + pinned)
  return checker.check() == sat

In [10]:
# Baseline prompt: explains the KenKen rules and asks for the answer as a
# 2-dimensional list. extract_solution depends on the reply containing a
# "[[ ... ]]" span in the format shown in the example below.
simple_prompt = '''
    You will be provided an empty KenKen puzzle board, which is a puzzle similar to Sudoku but with mathematical operations. Like Sudoku,
    every row and column must contain the numbers 1 through n, where n is the size of the grid. The thick border lines represent cages,
    which contain a target number and arithmetic operator (+-/*) in the top left cell of each cage. For a given cage, all of the numbers
    that will make up that cage must arrive at the target number through the arithmetic operator. For example in a cage with two cells
    and the symbol 5+, it could be filled in with a 2 and a 3 because 2 + 3 = 5. If there is only one cell in the cage, then it can be
    automatically filled in with the target number.

    Your task is to provide a correct solution to the puzzle provided. The puzzle could have size 3, 4, 5, 6, or 7. All puzzles have at least
    one solution. Format your response as a 2 dimensional list representing the solution for the puzzle. An example response for a 3x3 KenKen puzzle is:
    [[1, 2, 3],[3, 1, 2],[2, 3, 1]]

  '''

In [11]:
import json
# Load the puzzle definitions. puzzles_ds is indexed by the board size as a
# string (see puzzles_ds[str(size)] in the evaluation loop).
with open("/content/drive/MyDrive/Summer2025Research/KenKenSolver/images/puzzles_all_sizes.json", "r") as f:
    puzzles_ds = json.load(f)
# The 7x7 puzzles live in a separate file; merge them under the same '7' key so
# every size is reachable through puzzles_ds. Any '7' entry already present in
# puzzles_ds is overwritten.
with open("/content/drive/MyDrive/Summer2025Research/KenKenSolver/images/puzzles_7x7.json", "r") as f:
    puzzles_7x7 = json.load(f)
puzzles_ds['7'] = puzzles_7x7['7']

In [12]:
# Per-size result accumulators, keyed by board size (3-7).
# NOTE(review): sizes 3 and 4 start from literal non-zero values (accuracy 8,
# average times ~3.82 s and ~2.55 s) — presumably pasted in from earlier runs.
# Re-running this cell resets every other size to 0; TODO confirm the seeded
# figures are still current.
gpt_accuracy = {3:8, 4:0, 5:0, 6:0, 7:0}
gpt_avg_time = {3:3.8245480704307555, 4:2.55120950937271, 5:0, 6:0, 7:0}
gpt_responses = {3:[], 4:[], 5:[], 6:[], 7:[]}
gpt_tokens = {3:0, 4:0, 5:0, 6:0, 7:0}

In [24]:
# Settings for one evaluation run; edit `size` and re-run to evaluate another board size.
num_puzzles = 30  # upper bound on how many puzzles the evaluation loop processes
input_prompt = simple_prompt  # prompt text sent with every board image
total = 0  # puzzles attempted so far; incremented by the evaluation loop
size = 7  # board size under evaluation; selects the puzzle list and the image files

In [25]:
# Query the model once per board image of the current size, record the raw
# reply, latency and token usage, and count the replies that validate.
for i in range(min(num_puzzles, len(puzzles_ds[str(size)]))):
    filepath = f"/content/drive/MyDrive/Summer2025Research/KenKenSolver/images/boards_noto_sans/board{size}_{i}.png"
    res, tokens, res_time = get_gpt_response(input_prompt, filepath)

    gpt_responses[size].append(res)
    gpt_avg_time[size] += res_time  # running sum here; averaged in a later cell
    gpt_tokens[size] += tokens

    solution = extract_solution(res)
    # Check the grid shape first so the validator never indexes outside the reply.
    if solution and len(solution) == size:
        if all(len(row) == size for row in solution):
            if validate_solution(puzzles_ds[str(size)][i], size, solution):
                gpt_accuracy[size] += 1

    total += 1
    print(f"{gpt_accuracy[size]}/{total}")
    # Pause between requests.
    time.sleep(15)


0/1
0/2
0/3
0/4
0/5
0/6
0/7
0/8
0/9
0/10
0/11
0/12
0/13
0/14
0/15
0/16
0/17
0/18
0/19
0/20
0/21
0/22
0/23
0/24
0/25
0/26
0/27
0/28
0/29
0/30


In [None]:
# Spot-check: show the raw reply stored at index 5 for the current board size.
gpt_responses[size][5]

'Here is the solution to the provided KenKen puzzle:\n\n```python\n[[1, 2, 3],\n [3, 1, 2],\n [2, 3, 1]]\n```'

In [27]:
# Replace the summed response time for this size with the per-puzzle mean.
# NOTE: the accumulator is overwritten in place, so running this cell a second
# time divides by `total` again; it raises ZeroDivisionError when total == 0.
gpt_avg_time[size] = gpt_avg_time[size] / total
# The accuracy normalisation below is disabled, so gpt_accuracy keeps the value
# accumulated by the evaluation loop (incremented by 1 per validated reply).
# gpt_accuracy[size] = gpt_accuracy[size] / total

In [None]:
# Report the stored size-3 entries of gpt_accuracy and gpt_avg_time.
# NOTE(review): the evaluation loop adds 1 to gpt_accuracy per validated reply
# and the normalisation line is commented out, so the number printed before "%"
# looks like a count rather than a percentage — confirm the intended unit.
print("GPT 4o Mini 3x3 Results: \nAccuracy: ", gpt_accuracy[3], "%\nAverage Time: ", gpt_avg_time[3], "s")

GPT 4o Mini 3x3 Results: 
Accuracy:  8 %
Average Time:  3.8245480704307555 s


In [15]:
# Report the stored size-4 entries of gpt_accuracy and gpt_avg_time.
print("GPT 4o Mini 4x4 Results: \nAccuracy: ", gpt_accuracy[4], "%\nAverage Time: ", gpt_avg_time[4], "s")

GPT 4o Mini 4x4 Results: 
Accuracy:  0 %
Average Time:  2.55120950937271 s


In [18]:
# Report the stored size-5 entries of gpt_accuracy and gpt_avg_time.
print("GPT 4o Mini 5x5 Results: \nAccuracy: ", gpt_accuracy[5], "%\nAverage Time: ", gpt_avg_time[5], "s")

GPT 4o Mini 5x5 Results: 
Accuracy:  0 %
Average Time:  3.4007462906837462 s


In [23]:
# Report the stored size-6 entries of gpt_accuracy and gpt_avg_time.
print("GPT 4o Mini 6x6 Results: \nAccuracy: ", gpt_accuracy[6], "%\nAverage Time: ", gpt_avg_time[6], "s")

GPT 4o Mini 6x6 Results: 
Accuracy:  0 %
Average Time:  3.3614277267456054 s


In [28]:
# Report the stored size-7 entries of gpt_accuracy and gpt_avg_time.
print("GPT 4o Mini 7x7 Results: \nAccuracy: ", gpt_accuracy[7], "%\nAverage Time: ", gpt_avg_time[7], "s")

GPT 4o Mini 7x7 Results: 
Accuracy:  0 %
Average Time:  4.487039693196615 s


In [29]:
import pandas as pd

In [30]:
# Tabulate the per-size results: the dict keys (board sizes) become the row
# index and each dict becomes one column.
# NOTE(review): the 'accuracy (%)' column is filled straight from gpt_accuracy —
# confirm those values are percentages rather than validated-reply counts.
results = pd.DataFrame({
    'accuracy (%)': gpt_accuracy,
    'avg_time (s)': gpt_avg_time
})

In [31]:
# Persist the results table to Drive; index=True writes the board sizes as the first CSV column.
results.to_csv('/content/drive/MyDrive/Summer2025Research/SolverRepo/results/gpt_evaluation.csv', index=True)