In [6]:
!pip3 install z3-solver


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [7]:
import os
from z3 import *
import time
import base64

In [8]:
!pip3 install anthropic

Collecting anthropic
  Downloading anthropic-0.75.0-py3-none-any.whl (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.2/388.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting distro<2,>=1.7.0
  Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Collecting docstring-parser<1,>=0.15
  Downloading docstring_parser-0.17.0-py3-none-any.whl (36 kB)
Collecting sniffio
  Downloading sniffio-1.3.1-py3-none-any.whl (10 kB)
Collecting httpx<1,>=0.25.0
  Downloading httpx-0.28.1-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jiter<1,>=0.4.0
  Downloading jiter-0.12.0-cp310-cp310-macosx_11_0_arm64.whl (319 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.8/319.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydantic<3,>=1.9.0
  Downloading pydantic-2.12.5-py3-none-any.whl (463 kB)
[2K     [90m

In [9]:
import anthropic

In [None]:
# Read the key from the environment so no credential is ever stored in the
# notebook; export ANTHROPIC_API_KEY before starting the kernel. A missing
# variable fails here with a KeyError rather than later on the first request.
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

In [11]:
def get_claude_response(prompt, image_fp, model_str="claude-sonnet-4-20250514"):
    """Send a text prompt plus one image to Claude and time the request.

    Args:
        prompt: Instruction text placed before the image in the user message.
        image_fp: Path to the image file. It is read as bytes, base64-encoded
            and sent with media type ``image/png``.
        model_str: Model identifier passed to the Messages API.

    Returns:
        A tuple ``(text, total_tokens, elapsed_seconds)``: the concatenated
        text of the reply (empty string if the reply has no text block), the
        sum of input and output tokens reported by the API, and the duration
        of the API call in seconds.
    """
    with open(image_fp, "rb") as image_file:
        image_data = base64.b64encode(image_file.read()).decode("utf-8")

    user_content = [
        {"type": "text", "text": prompt},
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": image_data,
            },
        },
    ]

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    response = client.messages.create(
        model=model_str,
        max_tokens=2048,
        messages=[{"role": "user", "content": user_content}],
    )
    elapsed = time.perf_counter() - start

    # Join every text block instead of indexing content[0], which raises on an
    # empty reply and drops any text that arrives in later blocks.
    reply_text = "".join(
        block.text
        for block in response.content
        if getattr(block, "type", None) == "text"
    )
    total_tokens = response.usage.input_tokens + response.usage.output_tokens
    return reply_text, total_tokens, elapsed

In [12]:
def extract_solution(response):
    """Parse the last ``[[ ... ]]`` grid in a model reply into rows of ints.

    Args:
        response: Raw reply text.

    Returns:
        A list of rows, each a list of ints, or ``None`` when the text has no
        ``[[`` followed by a ``]]``. Rows are not checked for length; callers
        must verify the grid shape themselves.
    """
    import re

    # The grid starts at the last "[[" in the reply and is taken to end at
    # the first "]]" after it.
    strt = response.rfind("[[")
    end = response.find("]]", strt)
    if strt == -1 or end == -1:
        return None

    # Every "]" before the closing "]]" terminates a row. Whole digit runs are
    # read as one number so that a value such as 10 is not split into 1 and 0.
    return [
        [int(token) for token in re.findall(r"[0-9]+", row_text)]
        for row_text in response[strt:end].split("]")
    ]

In [13]:
def parse_block_constraints(puzzle, cells):
    """Translate KenKen cages into constraints over the grid's cell variables.

    Args:
        puzzle: Iterable of cage dicts with the keys ``"op"`` (one of ``""``,
            ``"add"``, ``"mul"``, ``"sub"``, ``"div"``), ``"target"`` and
            ``"cells"`` (a sequence of ``(row, col)`` index pairs).
        cells: 2-D grid indexed as ``cells[row][col]``; ``validate_solution``
            passes Z3 integer variables.

    Returns:
        A list with one constraint per cage, in the order the cages appear.

    Raises:
        ValueError: If a cage has an unknown operation, or a ``"sub"``/``"div"``
            cage does not contain exactly two cells.
    """
    constraints = []
    for block in puzzle:
        op = block["op"]
        target = block["target"]
        vars_in_block = [cells[i][j] for i, j in block["cells"]]
        is_pair = len(vars_in_block) == 2

        if op == "":
            # Single-cell cage: the cell simply equals the target.
            constraints.append(vars_in_block[0] == target)
        elif op == "add":
            constraints.append(Sum(vars_in_block) == target)
        elif op == "mul":
            product = vars_in_block[0]
            for v in vars_in_block[1:]:
                product = product * v
            constraints.append(product == target)
        elif op == "sub" and is_pair:
            a, b = vars_in_block
            # Cage order is not fixed, so either cell may be the minuend.
            constraints.append(Or(a - b == target, b - a == target))
        elif op == "div" and is_pair:
            a, b = vars_in_block
            # On Z3 Int terms "/" is integer division, so `a / b == target`
            # would also accept non-exact quotients (e.g. 5 / 2 == 2). Stating
            # the relation as a multiplication demands an exact quotient.
            constraints.append(Or(a == b * target, b == a * target))
        else:
            raise ValueError(f"Unsupported operation or malformed block: {block}")
    return constraints

In [14]:
def validate_solution(puzzle, size, solution):
    """Check a candidate grid against the Latin-square and cage rules with Z3.

    Args:
        puzzle: Cage descriptions, as accepted by ``parse_block_constraints``.
        size: Side length ``n`` of the grid; every cell must lie in ``1..n``.
        solution: Candidate grid as a list of ``size`` rows of ``size`` ints.

    Returns:
        ``True`` when the grid satisfies every constraint. ``False`` when it
        violates one, or when ``solution`` is ``None`` or not ``size`` x
        ``size``.
    """
    # Reject malformed grids before building any solver state; indexing
    # solution[i][j] below would otherwise raise on a short or missing row.
    if (
        solution is None
        or len(solution) != size
        or any(len(row) != size for row in solution)
    ):
        return False

    X = [[Int("x_%s_%s" % (i + 1, j + 1)) for j in range(size)]
         for i in range(size)]

    # Latin-square rules: values in 1..size, no repeats in a row or column.
    cells_c = [And(1 <= X[i][j], X[i][j] <= size)
               for i in range(size) for j in range(size)]
    rows_c = [Distinct(X[i]) for i in range(size)]
    cols_c = [Distinct([X[i][j] for i in range(size)])
              for j in range(size)]
    constraints = cells_c + rows_c + cols_c + parse_block_constraints(puzzle, X)

    # Pin every variable to the candidate value; the combined system is
    # satisfiable exactly when the candidate obeys all the rules.
    instance = [X[i][j] == solution[i][j]
                for i in range(size)
                for j in range(size)]

    s = Solver()
    s.add(constraints + instance)
    return s.check() == sat

In [15]:
# Instruction text sent together with each board image. It explains the KenKen
# rules and asks for the answer as a nested list of the form [[...],[...]].
simple_prompt = '''
    You will be provided an empty KenKen puzzle board, which is a puzzle similar to Sudoku but with mathematical operations. Like Sudoku,
    every row and column must contain the numbers 1 through n, where n is the size of the grid. The thick border lines represent cages,
    which contain a target number and arithmetic operator (+-/*) in the top left cell of each cage. For a given cage, all of the numbers
    that will make up that cage must arrive at the target number through the arithmetic operator. For example in a cage with two cells
    and the symbol 5+, it could be filled in with a 2 and a 3 because 2 + 3 = 5. If there is only one cell in the cage, then it can be
    automatically filled in with the target number.

    Your task is to provide a correct solution to the puzzle provided. The puzzle could have size 3, 4, 5, 6, or 7. All puzzles have at least
    one solution. Format your response as a 2 dimensional list representing the solution for the puzzle. An example response for a 3x3 KenKen puzzle is:
    [[1, 2, 3],[3, 1, 2],[2, 3, 1]]

  '''

In [17]:
import json
# Load the puzzle definitions. The evaluation loop looks puzzles up as
# puzzles_ds[str(size)][i], so the top-level keys are grid sizes as strings.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale.
with open("./puzzles/puzzles_dict.json", "r", encoding="utf-8") as f:
    puzzles_ds = json.load(f)


In [18]:
# Per-size result accumulators, keyed by grid size (3 to 7).
# NOTE(review): size 4 starts at 7 rather than 0. This looks like a previously
# recorded result carried over by hand; confirm. A count accumulated for
# size 4 would start from 7 unless the entry is reset first.
claude_accuracy = {3:0, 4:7, 5:0, 6:0, 7:0}
# Response time in seconds: summed while evaluating, later divided by `total`.
claude_avg_time = {3:0, 4:0, 5:0, 6:0, 7:0}
# Raw reply text, one entry per evaluated puzzle.
claude_responses = {3:[], 4:[], 5:[], 6:[], 7:[]}
# Input plus output tokens summed over all requests.
claude_tokens = {3:0, 4:0, 5:0, 6:0, 7:0}

In [19]:
# Settings for one evaluation run; change `size` and re-run to evaluate another grid size.
num_puzzles = 30  # upper bound on how many puzzles of this size are evaluated
input_prompt = simple_prompt  # prompt text sent with every board image
total = 0  # number of puzzles evaluated so far in this run
size = 7  # grid size evaluated by this run

In [None]:
# Evaluate Claude on the first `num_puzzles` boards of the configured `size`.
puzzles_for_size = puzzles_ds[str(size)]

# Start this size's accumulators from scratch so that re-running the cell
# replaces the previous run's numbers instead of adding to them.
claude_accuracy[size] = 0
claude_avg_time[size] = 0
claude_tokens[size] = 0
claude_responses[size] = []
total = 0

for i in range(min(num_puzzles, len(puzzles_for_size))):
    filepath = "./board_images/board" + str(size) + "_" + str(i) + ".png"
    res, tokens, res_time = get_claude_response(input_prompt, filepath)
    claude_responses[size].append(res)
    claude_avg_time[size] += res_time
    claude_tokens[size] += tokens

    # Only a well-formed size x size grid is handed to the Z3 validator.
    solution = extract_solution(res)
    is_square = (
        bool(solution)
        and len(solution) == size
        and all(len(row) == size for row in solution)
    )
    if is_square and validate_solution(puzzles_for_size[i], size, solution):
        claude_accuracy[size] += 1

    total += 1
    print(f"{claude_accuracy[size]}/{total}")
    # Pause between requests; presumably to stay under API rate limits.
    time.sleep(5)

In [None]:
# Turn the summed response time for `size` into a mean. Run this once per
# evaluation run: the sum is overwritten in place, so repeating the cell
# divides the value again. The guard avoids a ZeroDivisionError when no
# puzzle was evaluated.
if total > 0:
    claude_avg_time[size] = claude_avg_time[size] / total

In [None]:
# NOTE(review): claude_accuracy[3] is a count of solved puzzles, so the "%"
# label only holds when exactly 100 puzzles were evaluated; confirm.
print("Claude Sonnet 4.0 3x3 Results: \nAccuracy: ", claude_accuracy[3], "%\nAverage Time: ", claude_avg_time[3], "s")

Claude Sonnet 4.0 3x3 Results: 
Accuracy:  39 %
Average Time:  26.54918639421463 s


In [None]:
# NOTE(review): claude_accuracy[4] is a count of solved puzzles, so the "%"
# label only holds when exactly 100 puzzles were evaluated; confirm.
print("Claude Sonnet 4.0 4x4 Results: \nAccuracy: ", claude_accuracy[4], "%\nAverage Time: ", claude_avg_time[4], "s")

Claude Sonnet 4.0 4x4 Results: 
Accuracy:  7 %
Average Time:  26.978993196487426 s


In [None]:
# NOTE(review): claude_accuracy[5] is a count of solved puzzles, so the "%"
# label only holds when exactly 100 puzzles were evaluated; confirm.
print("Claude Sonnet 4.0 5x5 Results: \nAccuracy: ", claude_accuracy[5], "%\nAverage Time: ", claude_avg_time[5], "s")

Claude Sonnet 4.0 5x5 Results: 
Accuracy:  0 %
Average Time:  24.524857256412506 s


In [None]:
# NOTE(review): claude_accuracy[6] is a count of solved puzzles, so the "%"
# label only holds when exactly 100 puzzles were evaluated; confirm.
print("Claude Sonnet 4.0 6x6 Results: \nAccuracy: ", claude_accuracy[6], "%\nAverage Time: ", claude_avg_time[6], "s")

Claude Sonnet 4.0 6x6 Results: 
Accuracy:  0 %
Average Time:  22.186544198989868 s


In [None]:
# NOTE(review): claude_accuracy[7] is a count of solved puzzles, so the "%"
# label only holds when exactly 100 puzzles were evaluated; confirm.
print("Claude Sonnet 4.0 7x7 Results: \nAccuracy: ", claude_accuracy[7], "%\nAverage Time: ", claude_avg_time[7], "s")

Claude Sonnet 4.0 7x7 Results: 
Accuracy:  0 %
Average Time:  21.67326311270396 s


### Saving the results

In [None]:
import pandas as pd

In [None]:
# One row per grid size (the dict keys become the index), one column per metric.
# NOTE(review): the 'accuracy (%)' column holds the raw solved-puzzle counts
# from claude_accuracy, which equal percentages only when 100 puzzles were
# evaluated per size; confirm.
results = pd.DataFrame({
    'accuracy (%)': claude_accuracy,
    'avg_time (s)': claude_avg_time
})

In [None]:
# Create the output directory first so the write does not fail when
# ./results does not exist yet.
os.makedirs('./results', exist_ok=True)
results.to_csv('./results/claude_evaluation.csv', index=True)