# Frogger sim environment

In [None]:
from coderbot_sim.frogger.widget import FroggerWidget
env = FroggerWidget()
env.render()

# This is an example of a solution that fails to reach the goal.
actions = [a for a in [0, 2, 0, 2, 0, 2] for _ in range(600)]

for t in range(len(actions)):
    action = actions[t]
    state = await env.step(action)
    # print(f"t={t:03d}", f"state={state}")
    

In [None]:
from coderbot_sim.frogger.tk import FroggerTkFrontend

env = FroggerTkFrontend()
env.render()

# This is an example of a solution that fails to reach the goal.
actions = [a for a in [0, 2, 0, 2, 0, 2] for _ in range(600)]

for t in range(len(actions)):
    action = actions[t]
    state = await env.step(action)
    # print(f"t={t:03d}", f"state={state}")

In [1]:
from coderbot_sim.frogger import FroggerEnv, ROWS, COLS
from collections import deque, namedtuple
import random
import math
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

seed = 130
random.seed(seed)

torch.manual_seed(seed)

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)


def make_obs(state):
    x, y = state["frog_pos"]  # frog_pos = (col, row)
    grid = state["grid"]
    fx = x / (COLS - 1)
    fy = y / (ROWS - 1)

    # 1.0 if car in that direction, 0.0 otherwise
    up = 1.0 if y > 0 and grid[y - 1][x] else 0.0
    down = 1.0 if y < ROWS - 1 and grid[y + 1][x] else 0.0
    left = 1.0 if x > 0 and grid[y][x - 1] else 0.0
    right = 1.0 if x < COLS - 1 and grid[y][x + 1] else 0.0

    return torch.tensor([up, down, left, right, fx, fy], device=device)


Transition = namedtuple(
    "Transition", ("state", "action", "reward", "next_state", "done")
)


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        return Transition(*zip(*batch))

    def __len__(self):
        return len(self.buffer)


class QNet(nn.Module):
    def __init__(self, obs_dim=4, hidden=128, n_actions=5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_actions),
        )

    def forward(self, x):
        return self.net(x)


def select_action(net, state_tensor, eps, n_actions=5):
    if random.random() < eps:
        return random.randrange(n_actions)
    with torch.no_grad():
        qvals = net(state_tensor.unsqueeze(0))
        return int(qvals.argmax(dim=1).item())

def evaluate_policy(env, q_net, eval_episodes=5, max_steps=500):
    q_net.eval()
    scores = []
    with torch.no_grad():
        for _ in range(eval_episodes):
            state = env.reset()
            obs = make_obs(state)
            total = 0.0
            for _ in range(max_steps):
                action = select_action(q_net, obs, eps=0.0, n_actions=5)
                r = env.step(action, dt=0.02)
                reward = float(r["score"] - state["score"])
                if r["done"]:
                    reward = -0.1
                total += reward
                state = r
                obs = make_obs(state)
                if r["done"]:
                    break
            scores.append(total)
    q_net.train()
    return np.sum(scores)


env = FroggerEnv()

num_episodes = 2000
max_steps_per_episode = 400

start_train_after = 1000
train_every = 4
target_update_every = 1000
gamma = 0.99

eps_start=1.0
eps_final=0.05
eps_decay=15000

eval_every = 250
eval_episodes = 8

n_actions = 5
obs_dim = 6

q_net = QNet(obs_dim=obs_dim, hidden=128, n_actions=n_actions).to(device)
target_net = QNet(obs_dim=obs_dim, hidden=128, n_actions=n_actions).to(device)
target_net.load_state_dict(q_net.state_dict())
target_net.eval()

optimizer = optim.Adam(q_net.parameters(), lr=1e-4)
replay = ReplayBuffer(capacity=30000)

total_steps = 0
losses = []
episode_rewards = []
pbar = tqdm(range(num_episodes))

for ep in pbar:
    state = env.reset()
    obs = make_obs(state)
    ep_reward = 0.0

    for step in range(max_steps_per_episode):
        eps = eps_final + (eps_start - eps_final) * math.exp(
            -1.0 * total_steps / eps_decay
        )
        action = select_action(q_net, obs, eps, n_actions=n_actions)

        result = env.step(action, dt=0.02)
        reward = float(result["score"] - state["score"])  # score delta
        # collision penalty
        if result["done"]:
            reward = -0.1

        next_obs = make_obs(result)
        done = result["done"]

        replay.push(obs.cpu(), action, reward, next_obs.cpu(), done)

        ep_reward += reward
        total_steps += 1
        state = result
        obs = next_obs

        # train
        if len(replay) > start_train_after and total_steps % train_every == 0:
            batch = replay.sample(128)
            state_b = torch.stack(batch.state).to(device)
            action_b = torch.tensor(batch.action, dtype=torch.int64, device=device).unsqueeze(1)
            reward_b = torch.tensor(batch.reward, device=device).unsqueeze(1)
            next_state_b = torch.stack(batch.next_state).to(device)
            done_b = torch.tensor(batch.done, dtype=torch.uint8, device=device).unsqueeze(1)

            # current Q
            q_values = q_net(state_b).gather(1, action_b)

            # target Q
            with torch.no_grad():
                # Double DQN
                next_q_online = q_net(next_state_b)
                next_actions = next_q_online.argmax(dim=1, keepdim=True)
                next_q_target = target_net(next_state_b)
                next_q_values = next_q_target.gather(1, next_actions)
                target_q = reward_b + (1 - done_b.float()) * gamma * next_q_values

            loss = nn.functional.mse_loss(q_values, target_q)
            optimizer.zero_grad()
            loss.backward()
            # gradient clipping (stability)
            torch.nn.utils.clip_grad_norm_(q_net.parameters(), 10.0)
            optimizer.step()

            losses.append(loss.item())

        # update target network
        if total_steps % target_update_every == 0:
            target_net.load_state_dict(q_net.state_dict())

        if done:
            break

    episode_rewards.append(ep_reward)
    
    pbar.set_description(
        f"Ep {ep:4d} | steps {total_steps:6d} | ep_reward {ep_reward:.3f}"
    )

    # eval
    if ep % eval_every == 0:
        eval_score = evaluate_policy(
            env, q_net, eval_episodes, max_steps_per_episode
        )
        print(f"Eval epoch {ep}: mean_score={eval_score:.3f}")



Device: cuda


  0%|          | 0/2000 [00:00<?, ?it/s]

Eval epoch 0: mean_score=0.000
Eval epoch 250: mean_score=2.629
Eval epoch 500: mean_score=6.857
Eval epoch 750: mean_score=27.771
Eval epoch 1000: mean_score=27.771
Eval epoch 1250: mean_score=26.057
Eval epoch 1500: mean_score=19.771
Eval epoch 1750: mean_score=44.343


In [2]:
from coderbot_sim.frogger.widget import FroggerWidget

env = FroggerWidget()
env.render()
state = await env.reset()

q_net.eval()

for t in range(2000):
    obs = make_obs(state)

    with torch.no_grad():
        q_values = q_net(obs)
        action = select_action(q_net, obs, eps=0.0, n_actions=5)

    state = await env.step(action)

    if state.get("done", False):
        break

<coderbot_sim.frogger.widget.FroggerWidget object at 0x000002A0888F86E0>