## Layered Value Iteration on the Pig Game

In [1]:
# Choose the Plotly renderer up front; the figure built by the plotting
# cell further down is displayed through it when `fig.show()` is called.

import plotly.io as pio
pio.renderers.default = 'notebook_connected'  # switch to 'iframe' if figures do not render with this setting


In [2]:
# creating a function to run layered value iteration

import numpy as np

def pig_layered_value_iteration(target_score=100, die_sides=6, max_turn=100, epsilon=1e-6):
    """Solve two-player Pig with value iteration, one score-sum layer at a time.

    A state is ``(ps, os, t)``: the banked score of the player to move, the
    banked score of the opponent, and the current turn total.  States are
    grouped by ``ps + os``.  Holding (with ``t > 0``) moves to a strictly
    higher layer, so layers are solved from the highest sum downwards and each
    layer is swept in place until the largest value change in a sweep drops
    below ``epsilon``.

    Modelling choices made by this implementation:

    * Rolling a 1 ends the turn with nothing banked; the opponent then moves
      from ``(os, ps, 0)``.
    * Holding with ``t == 0`` is valued at 0, so the policy always rolls at
      the start of a turn.
    * Ties between rolling and holding are resolved in favour of rolling.
    * A roll that would push the turn total above ``max_turn`` without
      reaching ``target_score`` contributes nothing to the roll value (it is
      treated as a loss).  Use ``max_turn >= target_score - 1`` to make sure
      this case cannot occur.

    Args:
        target_score: Score needed to win.
        die_sides: Number of faces on the die (faces are ``1..die_sides``).
        max_turn: Largest turn total represented in the tables.
        epsilon: Convergence threshold for the sweeps of a layer.

    Returns:
        ``(V, policy)``, both of shape
        ``(target_score + 1, target_score + 1, max_turn + 1)`` and indexed
        ``[ps, os, t]``.  ``V`` is the estimated win probability of the player
        to move; ``policy`` is ``1`` for roll and ``0`` for hold.  Entries
        with ``ps + t >= target_score`` are ``V = 1`` / ``policy = 0``.
        Entries with ``os == target_score`` are never updated by the sweeps
        and keep their initial values.

    Raises:
        ValueError: If ``die_sides < 2``.  With a single face every roll is a
            1, the update for equal scores becomes ``V = 1 - V`` and the
            iteration would never converge.
    """
    if die_sides < 2:
        raise ValueError("die_sides must be at least 2, got {}".format(die_sides))

    roll_prob = 1 / die_sides

    V = np.zeros((target_score + 1, target_score + 1, max_turn + 1))
    policy = np.ones_like(V, dtype=int)  # default action everywhere: roll

    # Terminal states: banking the current turn total already reaches the
    # target.  The mask depends only on (ps, t) and is broadcast over os.
    banked = np.arange(target_score + 1)[:, None]
    pending = np.arange(max_turn + 1)[None, :]
    wins_by_holding = np.broadcast_to(
        ((banked + pending) >= target_score)[:, None, :], V.shape
    )
    V[wins_by_holding] = 1.0
    policy[wins_by_holding] = 0  # hold

    # Solve the layers from the highest total banked score down to zero.
    for score_sum in reversed(range(2 * target_score)):
        # Keep both ps and os = score_sum - ps strictly below target_score.
        ps_start = max(0, score_sum - target_score + 1)
        ps_stop = min(target_score, score_sum + 1)

        converged = False
        while not converged:
            max_diff = 0.0

            for ps in range(ps_start, ps_stop):
                opp = score_sum - ps

                # Turn totals with ps + t >= target_score are terminal and
                # were fixed above, so only the open ones are swept.
                last_open_t = min(max_turn, target_score - ps - 1)

                for t in range(last_open_t + 1):
                    # --- ROLL value ---
                    # A 1 forfeits the turn: the opponent moves next from the
                    # mirrored state, and our value is one minus theirs.
                    roll_value = roll_prob * (1 - V[opp, ps, 0])
                    for roll in range(2, die_sides + 1):
                        new_t = t + roll
                        if ps + new_t >= target_score:
                            roll_value += roll_prob * 1.0  # this roll wins
                        elif new_t <= max_turn:
                            roll_value += roll_prob * V[ps, opp, new_t]
                        # else: beyond the table; contributes 0 (see docstring)

                    # --- HOLD value ---
                    # Banking t points hands the move to the opponent in a
                    # higher, already solved layer.
                    if t > 0:
                        hold_value = 1 - V[opp, ps + t, 0]
                    else:
                        hold_value = 0.0

                    new_v = max(roll_value, hold_value)
                    new_policy = 1 if roll_value >= hold_value else 0  # 1 = roll, 0 = hold

                    max_diff = max(max_diff, abs(V[ps, opp, t] - new_v))
                    V[ps, opp, t] = new_v
                    policy[ps, opp, t] = new_policy

            if max_diff < epsilon:
                converged = True

    return V, policy


In [6]:
# creating a function to plot our results as a decision surface

import plotly.graph_objects as go

def plot_decision_surface_from_policy(policy, target_score=25, max_turn=25):
    """Show the hold threshold of a policy as an interactive 3-D surface.

    For every pair of player score and opponent score below ``target_score``
    the surface height is the smallest turn total ``t`` in ``0..max_turn``
    for which ``policy[ps, os, t]`` equals 0 (hold).  Pairs for which no such
    turn total exists get the height ``max_turn + 1``.

    Args:
        policy: Array indexed ``[player score, opponent score, turn total]``
            whose entries are compared against 0 (0 = hold).
        target_score: Number of scores plotted along each horizontal axis.
        max_turn: Largest turn total that is inspected.

    Returns:
        None.  The figure is displayed with ``fig.show()``.
    """
    never_holds = max_turn + 1

    Z = np.full((target_score, target_score), np.nan)
    for ps, opp in np.ndindex(target_score, target_score):
        # First turn total at which the policy holds, scanning upwards.
        Z[ps, opp] = next(
            (t for t in range(max_turn + 1) if policy[ps, opp, t] == 0),
            never_holds,
        )

    scores = np.arange(target_score)
    X, Y = np.meshgrid(scores, scores, indexing='ij')

    surface = go.Surface(
        z=Z,
        x=X,
        y=Y,
        colorscale='RdYlGn',
        colorbar=dict(title="Turn Total (Hold Threshold)"),
    )
    fig = go.Figure(data=[surface])

    axis_titles = dict(
        xaxis_title='Player Score',
        yaxis_title='Opponent Score',
        zaxis_title='Min Turn Total to Hold',
    )
    fig.update_layout(
        scene=axis_titles,
        title="Optimal Roll vs Hold Surface (0 = Hold, 1 = Roll)",
        width=900,
        height=750,
    )

    fig.show()


In [7]:
# Run the solver for a six-sided die, a target of 100 points and a
# turn-total axis capped at 100. `target_score` and `max_turn` are reused
# by the plotting and export cells that follow; `V` and `policy` are the
# value and action tables returned by the solver.

die_size = 6
target_score = 100
max_turn = 100
V, policy = pig_layered_value_iteration(target_score=target_score, die_sides=die_size, max_turn=max_turn, epsilon=1e-6)

In [8]:
# Plot the decision surface for the solved policy, using the same
# target score and turn-total cap that were passed to the solver.

plot_decision_surface_from_policy(policy, target_score=target_score, max_turn=max_turn)


In [14]:
# Export the solved policy as a dict and pickle it to 'layered_sol.pkl'.
# NOTE(review): the dict key is (os, ps, t) while the array is read as
# policy[ps, os, t], i.e. the two scores are swapped in the key. Confirm
# that whatever loads this file expects the opponent score first.
import pickle

policy_dict = {}
# np.ndindex walks the index triples in the same order as three nested
# range loops (last index varying fastest).
for ps, os, t in np.ndindex(target_score, target_score, max_turn + 1):
    policy_dict[(os, ps, t)] = policy[ps, os, t]

with open('layered_sol.pkl', 'wb') as f:
    pickle.dump(policy_dict, f)