<a href="https://colab.research.google.com/github/Lanaanvar/RL-Game/blob/main/RL_Game.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# Tiny GridWorld with Q-learning + pygame GUI (Gymnasium env)
# pip installs if needed:
# pip install gymnasium pygame numpy

import sys
import time
import math
import random
from typing import Optional, Tuple, Dict

import numpy as np
import pygame
import gymnasium as gym
from gymnasium import spaces

# ----------------------------
# GridWorld Env (Gymnasium API)
# ----------------------------

class TinyGridWorld(gym.Env):
    """
    5x5 deterministic GridWorld.
    S (0,0) start, G (4,4) goal, W walls.
    Actions: 0=Up, 1=Right, 2=Down, 3=Left
    Observation: Discrete(25) = row*W + col
    Reward: +1 at goal, -0.01 per step, episode ends at goal or max_steps.

    In "human" render mode every reset()/step() redraws a pygame window:
    a max-Q heatmap of the grid (when `q_values` is set) plus the agent.
    A strip below the grid is reserved as a HUD. It is painted once when the
    window is created and is not repainted by render(), so anything a caller
    blits into that strip stays on screen until the caller replaces it.
    """
    metadata = {"render_modes": ["human"]}

    # Window background and HUD panel colours (RGB).
    _BG_COLOR = (25, 25, 30)
    _HUD_COLOR = (15, 15, 18)

    def __init__(self, render_mode: Optional[str] = "human"):
        """Create the grid, the action/observation spaces and empty render state."""
        super().__init__()
        self.H = 5
        self.W = 5
        self.start = (0, 0)
        self.goal = (4, 4)
        self.walls = {(1, 2), (2, 2), (3, 1)}  # simple obstacles
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Discrete(self.H * self.W)
        self.max_steps = 60

        # Pygame render state (created lazily by _init_pygame)
        self.render_mode = render_mode
        self._window = None
        self._clock = None
        self.cell_px = 90
        self.margin = 8
        self.font = None

        # Dynamics
        self.s = None   # current state index, None until reset()
        self.steps = 0  # steps taken in the current episode

        # For value heatmap rendering (set from outside)
        self.q_values = None  # shape [H*W, 4]

    def _to_state(self, rc: Tuple[int, int]) -> int:
        """Convert a (row, col) pair to the flat state index row*W + col."""
        r, c = rc
        return r * self.W + c

    def _to_rc(self, s: int) -> Tuple[int, int]:
        """Convert a flat state index back to (row, col)."""
        return divmod(s, self.W)

    def _valid(self, rc: Tuple[int, int]) -> bool:
        """Return True when (row, col) is inside the grid and not a wall."""
        r, c = rc
        if r < 0 or c < 0 or r >= self.H or c >= self.W:
            return False
        if (r, c) in self.walls:
            return False
        return True

    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        """Put the agent back on the start cell and return (state, info)."""
        super().reset(seed=seed)
        self.s = self._to_state(self.start)
        self.steps = 0
        if self.render_mode == "human":
            self._init_pygame()
            self.render()
        return self.s, {}

    def step(self, action: int):
        """
        Apply one action and return (state, reward, terminated, truncated, info).

        A move into a wall or off the grid leaves the agent where it is.
        Any action other than 0, 1 or 2 is treated as Left.
        """
        self.steps += 1
        r, c = self._to_rc(self.s)
        if action == 0:
            nr, nc = r - 1, c
        elif action == 1:
            nr, nc = r, c + 1
        elif action == 2:
            nr, nc = r + 1, c
        else:
            nr, nc = r, c - 1

        if self._valid((nr, nc)):
            r, c = nr, nc  # move
        # else bump into wall/border and stay

        self.s = self._to_state((r, c))

        terminated = (r, c) == self.goal
        reward = 1.0 if terminated else -0.01
        truncated = self.steps >= self.max_steps

        if self.render_mode == "human":
            self.render()
        return self.s, reward, terminated, truncated, {}

    # -------------- Rendering --------------

    def _hud_top(self) -> int:
        """Y pixel coordinate where the HUD panel starts (just below the grid)."""
        return self.margin + self.H * self.cell_px + 10

    def _init_pygame(self):
        """Create the window, clock and font on first use; no-op afterwards."""
        if self._window is None:
            pygame.init()
            total_w = self.W * self.cell_px + 2 * self.margin
            total_h = self.H * self.cell_px + 2 * self.margin + 70  # extra for HUD
            self._window = pygame.display.set_mode((total_w, total_h))
            pygame.display.set_caption("Tiny GridWorld — Q-learning")
            self._clock = pygame.time.Clock()
            self.font = pygame.font.SysFont("consolas", 18)

            # Paint the background and the empty HUD panel once. render() only
            # clears the grid region, so HUD contents survive between frames.
            self._window.fill(self._BG_COLOR)
            pygame.draw.rect(
                self._window, self._HUD_COLOR,
                pygame.Rect(self.margin, self._hud_top(),
                            self.W * self.cell_px - 2, 60))

    def _draw_heat(self, surf, rect, value, vmin, vmax):
        """Fill `rect` with a colour encoding where `value` lies in [vmin, vmax]."""
        # Map value to color between blue (low) and green (high)
        if vmax <= vmin:
            t = 0.0
        else:
            t = (value - vmin) / (vmax - vmin)
            t = max(0.0, min(1.0, t))
        # simple gradient: (0,40,255) -> (0,200,0)
        r = 0
        g = int(40 + 160 * t)
        b = int(255 * (1 - t))
        pygame.draw.rect(surf, (r, g, b), rect)

    def render(self):
        """Redraw the grid heatmap and the agent, then present the frame."""
        if self._window is None:
            self._init_pygame()

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                # close() also clears the window handle so the env is left
                # in a consistent state before the process exits.
                self.close()
                sys.exit(0)

        # Clear only the region above the HUD panel; the HUD is left as-is.
        grid_area = pygame.Rect(0, 0, self._window.get_width(), self._hud_top())
        self._window.fill(self._BG_COLOR, grid_area)

        # Compute max-Q per state for heatmap
        if self.q_values is not None:
            maxq = self.q_values.max(axis=1)
            vmin, vmax = float(maxq.min()), float(maxq.max())
        else:
            maxq = np.zeros(self.H * self.W)
            vmin, vmax = 0.0, 1.0

        # Draw grid
        for rr in range(self.H):
            for cc in range(self.W):
                x = self.margin + cc * self.cell_px
                y = self.margin + rr * self.cell_px
                cell_rect = pygame.Rect(x, y, self.cell_px - 2, self.cell_px - 2)

                if (rr, cc) in self.walls:
                    pygame.draw.rect(self._window, (60, 60, 60), cell_rect)
                elif (rr, cc) == self.goal:
                    pygame.draw.rect(self._window, (20, 120, 20), cell_rect)
                else:
                    # heat
                    idx = self._to_state((rr, cc))
                    self._draw_heat(self._window, cell_rect, float(maxq[idx]), vmin, vmax)
                    pygame.draw.rect(self._window, (45, 45, 50), cell_rect, 2)

        # Draw agent (skipped if render() is called before the first reset())
        if self.s is not None:
            ar, ac = self._to_rc(self.s)
            ax = self.margin + ac * self.cell_px + self.cell_px // 2
            ay = self.margin + ar * self.cell_px + self.cell_px // 2
            pygame.draw.circle(self._window, (240, 230, 70), (ax, ay), self.cell_px // 4)

        pygame.display.flip()
        self._clock.tick(60)

    def close(self):
        """Shut pygame down and drop every handle that pygame.quit() invalidates."""
        if self._window is not None:
            pygame.quit()
            self._window = None
            self._clock = None
            self.font = None


# ----------------------------
# Q-learning Trainer + GUI loop
# ----------------------------

def epsilon_greedy(q, s, eps, rng):
    """
    Pick an action for state `s` with an epsilon-greedy rule.

    Args:
        q: 2-D array of action values indexed as q[state, action].
        s: current state index.
        eps: probability of choosing a uniformly random action.
        rng: generator providing `random()` and `integers(low, high)`
            (e.g. `np.random.default_rng()`).

    Returns:
        The chosen action as a plain Python int on both branches.
    """
    if rng.random() < eps:
        # integers() yields a NumPy scalar; cast so both branches return int.
        return int(rng.integers(0, q.shape[1]))
    return int(np.argmax(q[s]))

def run_training_gui(episodes=300,
                     alpha=0.5,
                     gamma=0.97,
                     eps_start=1.0,
                     eps_end=0.05,
                     eps_decay_episodes=250,
                     render_every=1,
                     pause_ms=0):
    """
    Train a tabular Q-learning agent on TinyGridWorld while drawing it.

    Args:
        episodes: number of training episodes.
        alpha: learning rate of the Q update.
        gamma: discount factor.
        eps_start: exploration rate at the start of the linear schedule.
        eps_end: exploration rate once the schedule has finished.
        eps_decay_episodes: episodes over which epsilon is interpolated;
            a value <= 0 means epsilon is `eps_end` from the first episode.
        render_every: draw only every N-th episode (values below 1 are
            treated as 1, i.e. draw every episode).
        pause_ms: extra delay after each drawn step, in milliseconds.

    After training, one greedy episode is played back, the window is closed
    and the average return of the last `avg_window` episodes is printed.
    Returns None.
    """
    env = TinyGridWorld(render_mode="human")
    nS = env.observation_space.n
    nA = env.action_space.n
    q = np.zeros((nS, nA), dtype=np.float32)
    env.q_values = q  # for heatmap; q is updated in place, so set it once

    rng = np.random.default_rng(123)
    returns = []
    render_every = max(1, int(render_every))

    # Linear epsilon schedule
    def eps_schedule(ep):
        if eps_decay_episodes <= 0:
            return eps_end  # avoids dividing by zero
        t = min(1.0, ep / float(eps_decay_episodes))
        return eps_start * (1 - t) + eps_end * t

    # Stats for HUD
    avg_window = 25
    last_avg = 0.0

    try:
        # Main loop
        for ep in range(1, episodes + 1):
            # The env draws from reset()/step() only while its render_mode is
            # "human", so switching it per episode throttles rendering.
            draw = (ep % render_every) == 0
            env.render_mode = "human" if draw else None

            s, _ = env.reset()
            eps = eps_schedule(ep)
            done = False
            ep_return = 0.0
            steps = 0

            while not done:
                a = epsilon_greedy(q, s, eps, rng)
                s2, r, terminated, truncated, _ = env.step(a)
                done = terminated or truncated

                # Q-learning update (TD(0)); no bootstrap from a terminal state
                best_next = np.max(q[s2])
                td_target = r + gamma * best_next * (0.0 if terminated else 1.0)
                q[s, a] += alpha * (td_target - q[s, a])

                s = s2
                ep_return += r
                steps += 1

                # Make it easy on the eyes
                if draw and pause_ms > 0:
                    pygame.time.delay(pause_ms)

            returns.append(ep_return)
            # Average over what is available so short runs do not report 0.0.
            last_avg = float(np.mean(returns[-avg_window:]))

            # Update HUD text after each episode
            if env._window is not None:
                hud_y = env.margin + env.H * env.cell_px + 10
                # Repaint the HUD region
                pygame.draw.rect(env._window, (15, 15, 18),
                                 pygame.Rect(env.margin, hud_y, env.W * env.cell_px - 2, 60))
                info_lines = [
                    f"Episode: {ep}/{episodes}",
                    f"Epsilon: {eps:.3f}",
                    f"Avg Return (last {avg_window}): {last_avg:.3f}",
                    f"Steps last ep: {steps}"
                ]
                for i, line in enumerate(info_lines):
                    txt = env.font.render(line, True, (230, 230, 230))
                    env._window.blit(txt, (env.margin + 8, hud_y + 8 + i * 16))
                pygame.display.flip()

        # Final greedy demo run to show learned policy (always drawn)
        env.render_mode = "human"
        s, _ = env.reset()
        done = False
        demo_steps = 0
        while not done and demo_steps < env.max_steps:
            # Handle window events to allow closing the window
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    return

            a = int(np.argmax(q[s]))
            s, _, term, trunc, _ = env.step(a)
            done = term or trunc
            demo_steps += 1
            pygame.time.delay(60)  # let you watch the final path
    finally:
        # Release the window on every exit path, including exceptions.
        env.close()

    print(f"Training complete. Final average return (last {avg_window}):", last_avg)


if __name__ == "__main__":
    # Fast defaults; tweak episodes up to ~800 if you want a “perfect” path every time.
    demo_settings = dict(
        episodes=300,            # quick
        alpha=0.5,               # learning rate
        gamma=0.97,              # discount
        eps_start=1.0,           # explore early
        eps_end=0.05,            # exploit later
        eps_decay_episodes=250,
        render_every=1,          # draw every episode
        pause_ms=0,              # bump to 10–30 if you want slower animation
    )
    # Launch training with the settings collected above.
    run_training_gui(**demo_settings)


Training complete. Final average return (last 25): 0.9268000000000001


In [18]:
import sys, time, math, random
import numpy as np
import pygame
import gymnasium as gym
from gymnasium import spaces
from pyvirtualdisplay import Display
import imageio

# Start a non-visible 600x600 virtual display before any pygame work
# (presumably needed in headless environments such as Colab — confirm).
display = Display(size=(600, 600), visible=0)
display.start()

class TinyGridWorld(gym.Env):
    """
    Headless 5x5 deterministic GridWorld that renders to an RGB array.

    Start is (0,0), goal is (4,4); walls block movement.
    Actions: 0=Up, 1=Right, 2=Down, anything else=Left.
    Observation: flat index row*W + col.
    Reward: 1.0 on reaching the goal, -0.01 otherwise.
    render() draws onto an off-screen pygame surface and returns the pixels.
    """
    metadata = {"render_modes": ["rgb_array"]}

    def __init__(self):
        """Set up the grid, the spaces and the off-screen drawing surface."""
        super().__init__()
        self.H = 5
        self.W = 5
        self.start = (0, 0)
        self.goal = (4, 4)
        self.walls = {(1, 2), (2, 2), (3, 1)}
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Discrete(self.H * self.W)
        self.max_steps = 60
        self.s = None         # current state index, None until reset()
        self.steps = 0        # steps taken in the current episode
        self.q_values = None  # optional [H*W, 4] table used for the heatmap
        self.cell = 100       # cell size in pixels
        self.render_mode = "rgb_array"  # matches the only mode in metadata

        pygame.init()
        self.surface = pygame.Surface((self.W * self.cell, self.H * self.cell))

    def _to_state(self, rc):
        """Convert (row, col) to the flat state index."""
        r, c = rc
        return r * self.W + c

    def _to_rc(self, s):
        """Convert a flat state index back to (row, col)."""
        return divmod(s, self.W)

    def _valid(self, rc):
        """Return True when (row, col) is on the grid and not a wall."""
        r, c = rc
        if r < 0 or c < 0 or r >= self.H or c >= self.W:
            return False
        if rc in self.walls:
            return False
        return True

    def reset(self, seed=None, options=None):
        """Put the agent back on the start cell and return (state, info)."""
        # Forward the seed to gym.Env.reset (it was silently ignored before).
        super().reset(seed=seed)
        self.s = self._to_state(self.start)
        self.steps = 0
        return self.s, {}

    def step(self, a):
        """
        Apply action `a` and return (state, reward, terminated, truncated, info).

        A move into a wall or off the grid leaves the agent in place.
        """
        self.steps += 1
        r, c = self._to_rc(self.s)
        if a == 0:
            nr, nc = r - 1, c
        elif a == 1:
            nr, nc = r, c + 1
        elif a == 2:
            nr, nc = r + 1, c
        else:
            nr, nc = r, c - 1
        if self._valid((nr, nc)):
            r, c = nr, nc
        self.s = self._to_state((r, c))
        done = (r, c) == self.goal
        reward = 1.0 if done else -0.01  # float on both branches
        truncated = self.steps >= self.max_steps
        return self.s, reward, done, truncated, {}

    def render(self):
        """Draw the current frame and return it as an array of RGB pixels."""
        self.surface.fill((30, 30, 30))
        if self.q_values is not None:
            maxq = self.q_values.max(1)
            vmin, vmax = maxq.min(), maxq.max()
        else:
            maxq = np.zeros(self.H * self.W)
            vmin, vmax = 0, 1

        for r in range(self.H):
            for c in range(self.W):
                x, y = c * self.cell, r * self.cell
                idx = self._to_state((r, c))
                if (r, c) in self.walls:
                    color = (60, 60, 60)
                elif (r, c) == self.goal:
                    color = (20, 120, 20)
                else:
                    # The small epsilon keeps the division defined when all
                    # max-Q values are equal.
                    t = (maxq[idx] - vmin) / (vmax - vmin + 1e-9)
                    color = (0, int(40 + 160 * t), int(255 * (1 - t)))
                pygame.draw.rect(self.surface, color,
                                 (x, y, self.cell - 2, self.cell - 2))
        # Agent (skipped if render() is called before the first reset())
        if self.s is not None:
            ar, ac = self._to_rc(self.s)
            center = (ac * self.cell + self.cell // 2,
                      ar * self.cell + self.cell // 2)
            pygame.draw.circle(self.surface, (240, 230, 70), center, self.cell // 4)
        # Swap the first two axes of the array3d result before returning it.
        return pygame.surfarray.array3d(self.surface).transpose(1, 0, 2)

    def close(self):
        """Shut down pygame, which __init__ initialised."""
        pygame.quit()

def train(episodes=200, out_path="learning.mp4", fps=30):
    """
    Train tabular Q-learning on TinyGridWorld and save every frame as a video.

    Args:
        episodes: number of training episodes.
        out_path: file the rendered frames are written to.
        fps: frame rate passed to the video writer.

    Epsilon starts at 1.0, is multiplied by 0.99 each episode and is floored
    at 0.05. An episode ends when the goal is reached or when the environment
    reports truncation, so no episode outlives the env's step limit.
    """
    env = TinyGridWorld()
    n_actions = env.action_space.n
    q = np.zeros((env.observation_space.n, n_actions))
    env.q_values = q  # q is updated in place, so the heatmap stays current
    gamma = 0.97
    alpha = 0.5
    eps = 1.0

    frames = []
    try:
        for _ in range(episodes):
            s, _info = env.reset()
            eps = max(0.05, eps * 0.99)
            done = False
            while not done:
                if random.random() < eps:
                    a = int(np.random.randint(n_actions))
                else:
                    a = int(np.argmax(q[s]))
                s2, r, terminated, truncated, _info = env.step(a)

                # Bootstrap from the next state unless it is terminal; a
                # truncated episode still bootstraps.
                bootstrap = 0.0 if terminated else gamma * q[s2].max()
                q[s, a] += alpha * (r + bootstrap - q[s, a])

                s = s2
                # Honour truncation as well as termination (the step limit
                # was previously ignored by this loop).
                done = terminated or truncated
                frames.append(env.render())
    finally:
        env.close()
    imageio.mimsave(out_path, frames, fps=fps)

# Run the training loop; train() writes the recorded frames to learning.mp4.
train()
print("Done. Saved video as learning.mp4")




Done. Saved video as learning.mp4


In [19]:
from IPython.display import Video
# Build an IPython Video object for the saved file with embed=True; as the
# cell's last expression it becomes the cell output.
Video("learning.mp4", embed=True)
