# Implementation: Q-Learning from Scratch

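**Goal**: Work through a single Q-learning update by hand on a tiny 4-state chain. This is the same update rule used to solve larger problems such as Cliff Walking.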

In [None]:
import numpy as np

# 1. Q-Table
# One row per state (4 states), one column per action (0 = Left, 1 = Right).
# State 3 is the Goal (+10); that reward is not used in this single-step demo.
Q = np.zeros((4, 2))

gamma = 0.9  # discount factor applied to the next state's best value
alpha = 0.1  # learning rate: fraction of the TD error applied per update

# 2. Mock Experience
# Agent was in S0, went Right (Act 1), got Reward 0, landed in S1
state = 0
action = 1
reward = 0
next_state = 1

# Suppose S1 is already known to be valuable
Q[1, 1] = 5.0  # Going right from S1 is good

# 3. Update
# Remember the estimate before the update so the report below shows the real
# starting value instead of a hard-coded literal.
old_q = Q[state, action]

# Target = R + gamma * Max(Next)
best_next_action_val = np.max(Q[next_state])
td_target = reward + gamma * best_next_action_val
td_error = td_target - old_q

# Move the current estimate a fraction `alpha` of the way toward the target.
Q[state, action] = old_q + alpha * td_error

# Every number in the report is derived from the variables above, so the
# printed explanation stays correct if the experience or hyper-parameters change.
print(f"Old Q({state}, Right): {old_q}")
print(f"Target: {td_target} ({reward} + {gamma} * {best_next_action_val})")
print(f"New Q({state}, Right): {Q[state, action]}")

## Conclusion
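After this single update, the agent's estimate for going Right from S0 rises from 0 to 0.45. The TD target is $0 + 0.9 \times 5.0 = 4.5$ (S1's best action is worth 5.0), and with $\alpha = 0.1$ the estimate moves 10% of the way from 0 toward that target. Repeating the same update would keep pulling Q(S0, Right) closer to 4.5.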