In [1]:
import numpy as np

# Hyper-parameters and initial weights
gamma = 1  # Discount rate
alpha = 0.9  # Step size
w = np.zeros(2)  # Initial weight vector (float, one weight per feature)

# Tabular Q-values, all zero, for every (state, action) pair
Q = {
    (state, action): 0
    for state in (1, 2, 3)
    for action in ('ActL', 'ActR')
}

# Define the feature function
def x(s, a):
    """Return the feature vector for taking action ``a`` in state ``s``.

    'ActL' maps to ``[1, s]``; any other action is treated as 'ActR' and
    maps to ``[s, s**2]``.
    """
    features = [1, s] if a == 'ActL' else [s, s**2]
    return np.array(features)

# Define the approximate Q-function
def q_hat(s, a, w):
    """Return the linear action-value estimate: the dot product of ``w`` and ``x(s, a)``."""
    features = x(s, a)
    return np.dot(w, features)

# Define the gradient of q_hat with respect to w
def grad_q_hat(s, a):
    """Return the gradient of ``q_hat`` with respect to ``w``.

    ``q_hat`` is the dot product of ``w`` and ``x(s, a)``, so its gradient
    with respect to ``w`` is the feature vector itself.
    """
    gradient = x(s, a)
    return gradient

# Perform the episodic semi-gradient SARSA update for two steps.
# Each transition is (s, a, r, s_next, a_next); a_next is None when the
# episode ends at s_next.
transitions = [(1, 'ActR', -1, 2, 'ActR'), (2, 'ActR', 10, 3, None)]
for t, (s, a, r, s_next, a_next) in enumerate(transitions):
    # TD target. Bootstrap from the current approximation q_hat rather than
    # from a tabular Q that is never updated; on the terminal transition
    # there is no next action, so the target is just the reward.
    if a_next is None:
        Q_target = r
    else:
        Q_target = r + gamma * q_hat(s_next, a_next, w)

    # Semi-gradient update: the target is treated as a constant, so only
    # the gradient of q_hat(s, a, w) appears.
    w += alpha * (Q_target - q_hat(s, a, w)) * grad_q_hat(s, a)

    # Print the required values (Q_hat is evaluated with the updated weights)
    print(f"Time step {t}:")
    print(f"State: {s}, Action: {a}")
    print(f"Q_hat: {q_hat(s, a, w).round(2)}")
    print(f"Grad(Q_hat): {grad_q_hat(s, a).round(2)}")
    print(f"x(s, a): {x(s, a).round(2)}\n")

print(w.round(2))

Time step 0:
State: 1, Action: ActR
Q_hat: -1.8
Grad(Q_hat): [1 1]
x(s, a): [1 1]

Time step 1:
State: 2, Action: ActR
Q_hat: 271.8
Grad(Q_hat): [2 4]
x(s, a): [2 4]

[26.82 54.54]
