In [2]:
import gym
import numpy as np
import random

env = gym.make("Blackjack-v1", sab=True)

# --- Hyperparameters ---
EPISODES = 5_000  # training episodes per algorithm
GAMMA = 1.0       # discount factor applied to the bootstrapped value
ALPHA = 0.1       # step size of each Q update
EPSILON = 0.1     # probability of picking a random (exploratory) action
ACTIONS = [0, 1]  # stick, hit

# --- Reproducibility ---
# Training is stochastic (random exploration + random card draws), so fix the
# seeds once here to make a fresh "Restart & Run All" repeatable.
SEED = 42
random.seed(SEED)     # drives random.random() / random.choice() in epsilon_greedy
env.reset(seed=SEED)  # seeds the environment's RNG; later reset() calls pass no seed

# Build an empty Q-table
def init_Q():
    """Return a fresh, empty Q-table.

    The table is a plain dict keyed by state; each value is meant to be
    np.array([q_stick, q_hit]). A new dict is created on every call.
    """
    q_table = dict()
    return q_table

# Epsilon-greedy policy over the Q-table
def epsilon_greedy(state, Q):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    If ``state`` has no entry in ``Q`` yet, a zero vector with one slot per
    entry of ACTIONS is stored for it first (so this function mutates ``Q``).
    With probability EPSILON a uniformly random member of ACTIONS is returned;
    otherwise the index of the largest Q-value for ``state`` is returned
    (np.argmax picks the first index on ties).
    """
    q_values = Q.setdefault(state, np.zeros(len(ACTIONS)))
    explore = random.random() < EPSILON
    if explore:
        return random.choice(ACTIONS)
    return np.argmax(q_values)

# --- SARSA Implementation ---
def train_sarsa():
    """Train a tabular action-value function with on-policy SARSA.

    Runs EPISODES episodes on the module-level ``env``. Actions are chosen
    with ``epsilon_greedy`` and every step applies

        Q[s][a] += ALPHA * (target - Q[s][a])

    where ``target = reward + GAMMA * Q[s'][a']`` for a non-terminal step and
    ``target = reward`` when the environment reports ``terminated``.

    Returns:
        dict: the learned Q-table, mapping every state in which an action was
        selected to ``np.array([q_stick, q_hit])``. Terminal observations are
        not inserted.
    """
    Q = init_Q()
    for _episode in range(EPISODES):
        state = env.reset()[0]
        action = epsilon_greedy(state, Q)
        done = False

        while not done:
            next_state, reward, terminated, truncated, _info = env.step(action)
            done = terminated or truncated

            if terminated:
                # A terminal transition has no successor, so its value is 0 and
                # the target is the reward alone. Bootstrapping here is a real
                # bug: the terminal observation may coincide with a playable
                # state (in Blackjack, presumably the unchanged hand after
                # "stick"), which would feed Q[state] back into its own target.
                next_action = None
                target = reward
            else:
                # epsilon_greedy also creates the Q entry for next_state if it
                # is missing, so no separate initialisation is needed.
                next_action = epsilon_greedy(next_state, Q)
                target = reward + GAMMA * Q[next_state][next_action]

            Q[state][action] += ALPHA * (target - Q[state][action])

            state = next_state
            action = next_action

    return Q

# --- Q-Learning Implementation ---
def train_q_learning():
    """Train a tabular action-value function with off-policy Q-learning.

    Runs EPISODES episodes on the module-level ``env``. Behaviour actions are
    chosen with ``epsilon_greedy``; every step applies

        Q[s][a] += ALPHA * (target - Q[s][a])

    where ``target = reward + GAMMA * max(Q[s'])`` for a non-terminal step and
    ``target = reward`` when the environment reports ``terminated``.

    Returns:
        dict: the learned Q-table, mapping each non-terminal state encountered
        to ``np.array([q_stick, q_hit])``.
    """
    Q = init_Q()
    for _episode in range(EPISODES):
        state = env.reset()[0]
        done = False

        while not done:
            action = epsilon_greedy(state, Q)
            next_state, reward, terminated, truncated, _info = env.step(action)
            done = terminated or truncated

            if terminated:
                # No successor value after a terminal transition: the target is
                # just the reward. Bootstrapping here is a real bug, because the
                # terminal observation may coincide with a playable state (in
                # Blackjack, presumably the unchanged hand after "stick"), so
                # max(Q[state]) would leak into its own update.
                target = reward
            else:
                # Unseen successor states start with all-zero action values.
                next_values = Q.setdefault(next_state, np.zeros(len(ACTIONS)))
                target = reward + GAMMA * np.max(next_values)

            Q[state][action] += ALPHA * (target - Q[state][action])
            state = next_state

    return Q


In [3]:
# Train one Q-table per algorithm
q_sarsa = train_sarsa()
q_qlearn = train_q_learning()

# Compare the greedy action each learned table prefers in one sample state
state = (20, 10, False)

sarsa_best = np.argmax(q_sarsa[state])
print(f"\nSARSA: Best action in state {state}: {sarsa_best}")

qlearn_best = np.argmax(q_qlearn[state])
print(f"Q-Learning: Best action in state {state}: {qlearn_best}")


  if not isinstance(terminated, (bool, np.bool8)):



SARSA: Best action in state (20, 10, False): 0
Q-Learning: Best action in state (20, 10, False): 0
