<a href="https://colab.research.google.com/github/LcLnAinIng/Rummikub-Simulation-with-Rreinforcement-Learning/blob/Blackjack/Blackjack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Setting

In [None]:
import gymnasium as gym
from gymnasium.wrappers import FlattenObservation
from gymnasium.utils.env_checker import check_env
from collections import defaultdict
import numpy as np
from tqdm import tqdm  # Progress bar
from typing import Optional
from gymnasium import spaces
import matplotlib.pyplot as plt

from __future__ import annotations

from collections import defaultdict
import seaborn as sns
from matplotlib.patches import Patch

# Single seed for the notebook's stochastic components.
seed = 1310
# Apply it to NumPy's global RNG: the agents' epsilon-greedy exploration draws
# from np.random, so without this call the constant has no effect on them.
np.random.seed(seed)

# Agent - Q-Learning with Q-table

- State space is small → we can store Q(s,a) in a table (dict of arrays)


In [None]:
class BlackjackAgent_QLearning_Qtable:
    """Tabular Q-learning agent for a Gymnasium-style Blackjack environment.

    Q-values are kept in ``self.Q``, a dict that maps each (hashable) state to
    a NumPy array holding one value per action. Rows are created lazily, filled
    with zeros, the first time a state is looked up.
    """

    def __init__(self, env, episodes=1000, eps=0.1, alpha=0.1, gamma=0.9):
        """
        Initialize the agent.
        - env: the Blackjack environment (must provide reset() and step())
        - episodes: number of training episodes
        - eps: epsilon for epsilon-greedy (exploration rate)
        - alpha: learning rate (step size for Q update)
        - gamma: discount factor
        """
        self.env = env
        self.episodes = episodes
        self.eps = eps
        self.alpha = alpha
        self.gamma = gamma

        # Prefer the environment's own action space so the table width always
        # matches the env; fall back to the two Blackjack actions otherwise.
        env_action_space = getattr(env, "action_space", None)
        if env_action_space is not None:
            self.action_space = env_action_space
        else:
            self.action_space = spaces.Discrete(2)

        # Human-readable labels for action ids 0 and 1 (not used by the
        # methods of this class).
        self.action = ["stick", "hit"]

        # Storage for Q-values (for Q-learning), can later swap to NN
        self.Q = {}

    def _q_values(self, state):
        """Return the Q-value row for `state`, creating a zero row on first visit."""
        if state not in self.Q:
            self.Q[state] = np.zeros(self.action_space.n)
        return self.Q[state]

    def get_action(self, state):
        """
        Decide which action to take given a state.
        - With probability eps: pick random action (exploration)
        - Otherwise: pick best action based on Q-values (exploitation)
        Returns: action as a plain int
        """
        q_row = self._q_values(state)

        if np.random.random() < self.eps:
            return int(np.random.choice(self.action_space.n))
        return int(np.argmax(q_row))

    def update_q(self, state, action, reward, next_state, done):
        """
        Perform the Q-learning update rule:
        Q(s,a) ← Q(s,a) + α [ r + γ max_a' Q(s',a') - Q(s,a) ]

        When `done` is True the bootstrap term is dropped, so the target is
        just the immediate reward and `next_state` is never looked up (no
        table row is created for a terminal state).
        """
        q_row = self._q_values(state)

        # Start with the immediate reward ...
        target = reward

        # ... and, if the episode continues, add the discounted value of the
        # best action available in the next state.
        if not done:
            target += self.gamma * np.max(self._q_values(next_state))

        # (target - Q(s,a)) is the error of the current estimate; move a
        # fraction `alpha` of the way toward the target.
        q_row[action] += self.alpha * (target - q_row[action])

    def train(self):
        """
        Main training loop:
        - For each episode:
            1. Reset environment
            2. Loop through steps until terminal state:
                - Choose action (epsilon-greedy)
                - Take action in env
                - Get reward + next state
                - Update Q-values

        Returns: list with the total reward collected in each episode
        (one entry per episode, not a running sum).
        """
        episode_rewards = []

        for _ in range(self.episodes):
            state, info = self.env.reset()
            done = False
            # Reset the accumulator for every episode so each list entry
            # reflects that episode alone.
            total_reward = 0

            while not done:
                action = self.get_action(state)
                next_state, reward, terminated, truncated, info = self.env.step(action)
                done = terminated or truncated
                self.update_q(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward

            episode_rewards.append(total_reward)

        return episode_rewards

    def play(self, num_games=10):
        """
        Run the agent in evaluation mode: no Q updates, greedy actions only.
        States that were never seen during training fall back to a random
        action sampled from the environment's action space.

        Prints the average reward and a win/loss/draw summary, and returns the
        list of per-game total rewards.
        """
        results = []

        for _ in range(num_games):
            state, info = self.env.reset()
            done = False
            total_reward = 0

            while not done:
                if state in self.Q:
                    action = int(np.argmax(self.Q[state]))
                else:
                    action = self.env.action_space.sample()

                next_state, reward, terminated, truncated, info = self.env.step(action)
                done = terminated or truncated
                state = next_state
                total_reward += reward

            results.append(total_reward)

        # Classify by sign rather than by exact value so that payouts other
        # than exactly +/-1 (e.g. 1.5 for a natural when the environment is
        # created with natural=True) are still counted.
        wins = sum(1 for r in results if r > 0)
        losses = sum(1 for r in results if r < 0)
        draws = sum(1 for r in results if r == 0)

        # after all games, summarize
        print(f'Average reward: {np.mean(results):.4f}')
        print(f"Wins: {wins}, Losses: {losses}, Draws: {draws}")

        return results




# Agent - Q-Learning with DQN

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

class MiniDQNAgent:
    """Minimal DQN-style agent: a small MLP stands in for the Q-table.

    The network is trained online: every environment step triggers one
    gradient step on the squared TD error. The same network produces both the
    prediction and the bootstrap target (no replay buffer, no target network).
    """

    def __init__(self, env, episodes=1000, eps=0.9, alpha=0.001, gamma=0.9):
        """
        env: Blackjack environment
        episodes: number of training episodes
        eps: epsilon for epsilon-greedy
        alpha: learning rate for optimizer
        gamma: discount factor
        """
        self.env = env
        self.episodes = episodes
        self.eps = eps
        self.gamma = gamma

        # Neural network replaces Q-table
        obs_size = 3  # (player_sum, dealer_card, usable_ace)
        action_size = env.action_space.n
        self.q_net = self.build_model(obs_size, action_size)
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=alpha)
        self.loss_fn = nn.MSELoss()

    def build_model(self, input_dim, output_dim):
        """
        Small neural net: input = state, output = Q-values for each action.
        One hidden layer of 64 ReLU units.
        """
        return nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )

    def encode_state(self, state):
        """
        Convert environment state into a float32 tensor for the network.
        Expects an indexable state: (player_sum, dealer_card, usable_ace).
        """
        return torch.tensor([state[0], state[1], int(state[2])], dtype=torch.float32)

    def get_action(self, state):
        """
        Epsilon-greedy action selection: random action with probability eps,
        otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() < self.eps:
            return self.env.action_space.sample()  # random

        with torch.no_grad():
            q_values = self.q_net(self.encode_state(state))
        return torch.argmax(q_values).item()

    def update_q(self, state, action, reward, next_state, done):
        """
        Perform one gradient descent step on the squared TD error
        (target - Q(s,a))^2, where target = r + gamma * max_a' Q(s',a') and
        the bootstrap term is dropped when `done` is True.
        """
        # Current estimate Q(s,a); this is the only part gradients flow through.
        q_value = self.q_net(self.encode_state(state))[action]

        # Always build the target as a float32 tensor: nn.MSELoss needs a
        # tensor target, and on terminal steps `reward` alone is a bare number.
        target = torch.tensor(float(reward), dtype=torch.float32)
        if not done:
            with torch.no_grad():
                next_q = self.q_net(self.encode_state(next_state))
            target = target + self.gamma * torch.max(next_q)

        loss = self.loss_fn(q_value, target)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self):
        """
        Main training loop: play `episodes` episodes with epsilon-greedy
        actions, updating the network after every step.

        Returns: list with the total reward collected in each episode.
        """
        episode_rewards = []

        for _ in range(self.episodes):
            state, info = self.env.reset()
            done = False
            total_reward = 0

            while not done:
                action = self.get_action(state)
                next_state, reward, terminated, truncated, info = self.env.step(action)
                done = terminated or truncated

                self.update_q(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward

            episode_rewards.append(total_reward)

        return episode_rewards

    def play(self, num_games=10):
        """
        Evaluate trained policy (greedy only, no updates).
        Prints the average reward and returns the per-game total rewards.
        """
        results = []
        for _ in range(num_games):
            state, info = self.env.reset()
            done = False
            total_reward = 0
            while not done:
                with torch.no_grad():
                    q_values = self.q_net(self.encode_state(state))
                action = torch.argmax(q_values).item()
                next_state, reward, terminated, truncated, info = self.env.step(action)
                done = terminated or truncated
                state = next_state
                total_reward += reward
            results.append(total_reward)

        print(f"Average reward: {np.mean(results):.2f}")
        return results


# Running the code

In [None]:
if __name__ == "__main__":

    # Create Blackjack environment
    # natural=True pays a bonus for a natural blackjack; sab=False = default rules
    env = gym.make("Blackjack-v1", natural=True, sab=False)

    # Seed the environment's RNG once up front. Later reset() calls made
    # without a seed keep drawing from this RNG, so the dealt cards are
    # repeatable from run to run.
    env.reset(seed=seed)
    # The action space has its own RNG (used by action_space.sample()).
    env.action_space.seed(seed)

    # Create agent
    agent = BlackjackAgent_QLearning_Qtable(env, episodes=50000, eps=0.1, alpha=0.1, gamma=0.9)

    # Train the agent
    rewards = agent.train()
    print("Training finished.")

    # Evaluate performance
    agent.play(num_games=50)


Training finished.
Average reward: -0.1600
Wins: 20, Losses: 28, Draws: 2


#### Intuition with Blackjack example
Suppose:

You’re in state (sum=15, dealer=10, usable_ace=False).

You choose action hit.

You get reward 0 (game continues).

Next state is (sum=18, dealer=10, usable_ace=False).

Your table says Q(next_state) = [stick: 0.5, hit: -0.3].

Then:

reward = 0

np.max(Q[next_state]) = 0.5

target = 0 + gamma * 0.5 = 0.45 (if gamma=0.9)

Suppose current Q(state, hit) = 0.1.

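Update (this worked example assumes a learning rate α = 0.9; with the α = 0.1 used in the training cell above, the result would be 0.1 + 0.1 × (0.45 − 0.1) = 0.135):

Q((15, 10, False), hit) ← 0.1 + 0.9 × (0.45 − 0.1) = 0.415

So now the agent thinks hitting on 15 vs dealer 10 is worth about 0.415 expected value.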

### VS Code Helper

In [None]:
!python --version

Python 3.12.11


In [None]:
!pip freeze > requirement.txt