In [1]:
!pip install z3-solver

Collecting z3-solver
  Downloading z3_solver-4.15.1.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (602 bytes)
Downloading z3_solver-4.15.1.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.5/29.5 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: z3-solver
Successfully installed z3-solver-4.15.1.0


In [2]:
import os
from z3 import *
import time
import base64

In [3]:
!pip install anthropic

Collecting anthropic
  Downloading anthropic-0.57.1-py3-none-any.whl.metadata (27 kB)
Downloading anthropic-0.57.1-py3-none-any.whl (292 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.8/292.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.57.1


In [4]:
import anthropic

In [22]:
# Module-level Anthropic API client; get_claude_response sends its requests
# through this object. No arguments (and so no credentials) are passed
# explicitly here.
client = anthropic.Anthropic()

In [17]:
def get_claude_response(prompt, image_fp, model_str="claude-sonnet-4-20250514"):
  """Send a text prompt plus one PNG image to Claude and time the call.

  Args:
    prompt: Text placed first in the user message.
    image_fp: Path of the image file; its bytes are base64-encoded and sent
      with media type ``image/png``.
    model_str: Model identifier forwarded to the API.

  Returns:
    A 3-tuple ``(text, total_tokens, seconds)``: the text of the first content
    block of the reply, the sum of reported input and output tokens, and the
    wall-clock duration of the API call measured with ``time.time()``.
  """
  with open(image_fp, "rb") as png_handle:
      raw_bytes = png_handle.read()
  encoded_image = base64.b64encode(raw_bytes).decode("utf-8")

  text_part = {"type": "text", "text": prompt}
  image_part = {
      "type": "image",
      "source": {
          "type": "base64",
          "media_type": "image/png",
          "data": encoded_image,
      },
  }
  user_message = {"role": "user", "content": [text_part, image_part]}

  # Only the request itself is timed; file reading and encoding are excluded.
  started_at = time.time()
  reply = client.messages.create(
      model=model_str,
      max_tokens=2048,
      messages=[user_message],
  )
  elapsed = time.time() - started_at

  usage = reply.usage
  token_count = usage.input_tokens + usage.output_tokens
  return reply.content[0].text, token_count, elapsed

In [7]:
def extract_solution(response):
  """Parse the last ``[[ ... ]]`` grid in a model response into a list of rows.

  The text between the last ``[[`` and the first ``]]`` after it is scanned.
  Every run of consecutive digits becomes one integer cell, and every ``]``
  inside that span starts a new row. Whole digit runs are read (rather than
  single characters) so that an answer such as ``12`` is kept as the one value
  12 instead of being split into the two cells 1 and 2, which could make a
  malformed row look well-formed.

  Args:
    response: The raw text returned by the model.

  Returns:
    A list of lists of ints, or ``None`` when no ``[[ ... ]]`` span is found.
  """
  import re  # local import: keeps this cell independent of the imports cell

  strt = response.rfind("[[")
  end = response.find("]]", strt)
  if strt == -1 or end == -1:
    return None

  solution = [[]]
  # The closing "]]" is excluded from the slice, so the final row does not
  # trigger a trailing empty row.
  for token in re.findall(r"\d+|\]", response[strt:end]):
    if token == "]":
      solution.append([])
    else:
      solution[-1].append(int(token))
  return solution

In [8]:
def parse_block_constraints(puzzle, cells):
    """Translate every cage of a puzzle into a z3 constraint.

    Args:
        puzzle: Iterable of cage dicts. Each has an ``"op"`` (``""``, ``"add"``,
            ``"mul"``, ``"sub"`` or ``"div"``), a ``"target"`` value and a
            ``"cells"`` list of ``(row, col)`` index pairs.
        cells: 2-D grid of z3 integer terms, indexed as ``cells[row][col]``.

    Returns:
        A list with one constraint per cage, in puzzle order.

    Raises:
        ValueError: If the operation is unknown, or a ``"sub"``/``"div"`` cage
            does not have exactly two cells.
    """
    constraints = []
    for block in puzzle:
        op = block["op"]
        target = block["target"]
        vars_in_block = [cells[i][j] for i, j in block["cells"]]
        if op == "":
            # Operator-less cage: the first listed cell must equal the target.
            constraints.append(vars_in_block[0] == target)
        elif op == "add":
            constraints.append(Sum(vars_in_block) == target)
        elif op == "mul":
            product = vars_in_block[0]
            for v in vars_in_block[1:]:
                product *= v
            constraints.append(product == target)
        elif op == "sub" and len(vars_in_block) == 2:
            a, b = vars_in_block
            # Order inside the cage is free, so accept either difference.
            constraints.append(Or(a - b == target, b - a == target))
        elif op == "div" and len(vars_in_block) == 2:
            a, b = vars_in_block
            # z3's `/` on Int terms is integer division, so `a / b == target`
            # would also accept inexact quotients (e.g. 5 and 2 for target 2).
            # Stating the relation as a multiplication requires exact division.
            constraints.append(Or(a == b * target, b == a * target))
        else:
            raise ValueError(f"Unsupported operation or malformed block: {block}")
    return constraints

In [9]:
def validate_solution(puzzle, size, solution):
  """Check a proposed grid against the puzzle's rules using z3.

  The grid is accepted when every cell lies in ``1..size``, every row and
  column holds distinct values, and every cage constraint produced by
  ``parse_block_constraints`` is satisfied.

  Args:
    puzzle: Cage description, passed unchanged to ``parse_block_constraints``.
    size: Side length of the board.
    solution: Candidate grid as a list of ``size`` rows of ``size`` values.

  Returns:
    ``True`` if the grid satisfies all constraints, otherwise ``False``. A grid
    whose shape is not exactly ``size`` x ``size`` is reported as ``False``.
  """
  # Reject wrongly shaped grids before building the model: a short row would
  # otherwise raise IndexError below, and extra cells would be silently ignored.
  if not solution or len(solution) != size:
    return False
  if any(len(row) != size for row in solution):
    return False

  X = [ [ Int("x_%s_%s" % (i+1, j+1)) for j in range(size) ]
      for i in range(size) ]
  cells_c  = [ And(1 <= X[i][j], X[i][j] <= size)
              for i in range(size) for j in range(size) ]
  rows_c   = [ Distinct(X[i]) for i in range(size) ]
  cols_c   = [ Distinct([ X[i][j] for i in range(size) ])
              for j in range(size) ]
  constraints = cells_c + rows_c + cols_c + parse_block_constraints(puzzle, X)
  # Pin every variable to the proposed value; the model is satisfiable only
  # if this exact grid obeys all of the rules above.
  instance = [
        X[i][j] == solution[i][j]
        for i in range(size)
        for j in range(size)
    ]
  s = Solver()
  s.add(constraints + instance)
  return s.check() == sat

In [10]:
# Instruction text for the model: explains the KenKen rules and asks for the
# answer as a 2-D list. It is assigned to `input_prompt` and sent with each
# board image via get_claude_response.
simple_prompt = '''
    You will be provided an empty KenKen puzzle board, which is a puzzle similar to Sudoku but with mathematical operations. Like Sudoku,
    every row and column must contain the numbers 1 through n, where n is the size of the grid. The thick border lines represent cages,
    which contain a target number and arithmetic operator (+-/*) in the top left cell of each cage. For a given cage, all of the numbers
    that will make up that cage must arrive at the target number through the arithmetic operator. For example in a cage with two cells
    and the symbol 5+, it could be filled in with a 2 and a 3 because 2 + 3 = 5. If there is only one cell in the cage, then it can be
    automatically filled in with the target number.

    Your task is to provide a correct solution to the puzzle provided. The puzzle could have size 3, 4, 5, 6, or 7. All puzzles have at least
    one solution. Format your response as a 2 dimensional list representing the solution for the puzzle. An example response for a 3x3 KenKen puzzle is:
    [[1, 2, 3],[3, 1, 2],[2, 3, 1]]

  '''

In [11]:
import json

# Main puzzle collection; the evaluation loop looks boards up by str(size).
with open("/content/drive/MyDrive/Summer2025Research/KenKenSolver/images/puzzles_all_sizes.json", "r") as f:
    puzzles_ds = json.loads(f.read())

# The 7x7 boards are stored in their own file; copy them into the main
# collection under the same '7' key.
with open("/content/drive/MyDrive/Summer2025Research/KenKenSolver/images/puzzles_7x7.json", "r") as f:
    puzzles_7x7 = json.loads(f.read())

puzzles_ds['7'] = puzzles_7x7['7']

In [19]:
# Per-board-size result stores, keyed by board size 3..7.
# The non-zero 3x3 and 4x4 entries are literal values (presumably carried over
# from earlier runs of those sizes -- confirm before relying on them).
claude_accuracy = dict(zip(range(3, 8), (39, 7, 0, 0, 0)))
claude_avg_time = dict(zip(range(3, 8), (26.54918639421463, 26.978993196487426, 0, 0, 0)))
# A comprehension (not dict.fromkeys) so every size gets its own list object.
claude_responses = {board_size: [] for board_size in range(3, 8)}
claude_tokens = dict.fromkeys(range(3, 8), 0)

In [18]:
# Settings for one evaluation run:
#   num_puzzles - upper bound on the number of boards evaluated
#   total       - running count of boards processed so far
#   size        - board side length being evaluated
num_puzzles, total, size = 100, 0, 5
# Prompt text sent with every board image.
input_prompt = simple_prompt

In [23]:
# Evaluate the model on the first `num_puzzles` boards of the configured `size`.
num_evaluated = min(num_puzzles, len(puzzles_ds[str(size)]))
elapsed_total = 0.0
for i in range(num_evaluated):
    filepath = "/content/drive/MyDrive/Summer2025Research/KenKenSolver/images/boards_noto_sans/board"+str(size)+"_"+str(i)+".png"
    res, tokens, res_time = get_claude_response(input_prompt, filepath)
    claude_responses[size].append(res)
    claude_tokens[size] += tokens

    # Store a true running mean. The value is reported and saved as an
    # "average time", so accumulating the raw sum here would overstate it by a
    # factor of the number of boards.
    elapsed_total += res_time
    claude_avg_time[size] = elapsed_total / (i + 1)

    solution = extract_solution(res)
    # Check every row, not just the first, so a ragged grid is counted as
    # wrong instead of crashing the run partway through.
    well_formed = (
        bool(solution)
        and len(solution) == size
        and all(len(row) == size for row in solution)
    )
    if well_formed and validate_solution(puzzles_ds[str(size)][i], size, solution):
        claude_accuracy[size] += 1

    total += 1
    print(str(claude_accuracy[size])+"/"+str(total))
    # Pause between requests.
    time.sleep(5)


0/1
0/2
0/3
0/4
0/5
0/6
0/7
0/8
0/9
0/10
0/11
0/12
0/13
0/14
0/15
0/16
0/17
0/18
0/19
0/20
0/21
0/22
0/23
0/24
0/25
0/26
0/27
0/28
0/29
0/30
0/31
0/32
0/33
0/34
0/35
0/36
0/37
0/38
0/39
0/40
0/41
0/42
0/43
0/44
0/45
0/46
0/47
0/48
0/49
0/50
0/51
0/52
0/53
0/54
0/55
0/56
0/57
0/58
0/59
0/60
0/61
0/62
0/63
0/64
0/65
0/66
0/67
0/68
0/69
0/70
0/71
0/72
0/73
0/74
0/75
0/76
0/77
0/78
0/79
0/80
0/81
0/82
0/83
0/84
0/85
0/86
0/87
0/88
0/89
0/90
0/91
0/92
0/93
0/94
0/95
0/96
0/97
0/98
0/99
0/100


In [46]:
# Report the stored 3x3 accuracy and timing figures.
print(
    " ".join(
        str(part)
        for part in (
            "Claude Sonnet 4.0 3x3 Results: \nAccuracy: ",
            claude_accuracy[3],
            "%\nAverage Time: ",
            claude_avg_time[3],
            "s",
        )
    )
)

Claude Sonnet 4.0 3x3 Results: 
Accuracy:  39 %
Average Time:  26.54918639421463 s


In [59]:
# Report the stored 4x4 accuracy and timing figures.
print(
    " ".join(
        str(part)
        for part in (
            "Claude Sonnet 4.0 4x4 Results: \nAccuracy: ",
            claude_accuracy[4],
            "%\nAverage Time: ",
            claude_avg_time[4],
            "s",
        )
    )
)

Claude Sonnet 4.0 4x4 Results: 
Accuracy:  7 %
Average Time:  26.978993196487426 s


In [26]:
# Report the stored 5x5 accuracy and timing figures.
print(
    " ".join(
        str(part)
        for part in (
            "Claude Sonnet 4.0 5x5 Results: \nAccuracy: ",
            claude_accuracy[5],
            "%\nAverage Time: ",
            claude_avg_time[5],
            "s",
        )
    )
)

Claude Sonnet 4.0 5x5 Results: 
Accuracy:  0 %
Average Time:  24.524857256412506 s


### Saving the results

In [27]:
import pandas as pd

In [28]:
# Summary table: one row per key of the two result dicts, one column per metric.
results = pd.DataFrame(
    data=dict(
        [
            ('accuracy (%)', claude_accuracy),
            ('avg_time (s)', claude_avg_time),
        ]
    )
)

In [29]:
# Write the summary table to disk; index=True keeps the DataFrame index as the
# first CSV column.
results.to_csv(
    path_or_buf='/content/drive/MyDrive/Summer2025Research/SolverRepo/results/claude_evaluation.csv',
    index=True,
)