# Frogger sim environment

In [None]:
from coderbot_sim.frogger.widget import FroggerWidget
env = FroggerWidget()
env.render()

# This is an example of a solution that fails to reach the goal.
actions = [a for a in [0, 2, 0, 2, 0, 2] for _ in range(600)]

for t in range(len(actions)):
    action = actions[t]
    state = await env.step(action)
    # print(f"t={t:03d}", f"state={state}")
    

In [None]:
from coderbot_sim.frogger.tk import FroggerTkFrontend

env = FroggerTkFrontend()
env.render()

# This is an example of a solution that fails to reach the goal.
actions = [a for a in [0, 2, 0, 2, 0, 2] for _ in range(600)]

for t in range(len(actions)):
    action = actions[t]
    state = await env.step(action)
    # print(f"t={t:03d}", f"state={state}")

In [42]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from tqdm.auto import tqdm

from coderbot_sim.frogger import FroggerEnv, ROWS, COLS

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print("Using device:", device)


class Actor(nn.Module):
    """Policy network mapping an observation to action probabilities.

    A one-hidden-layer MLP (Linear -> ReLU -> Linear) whose output is
    passed through a softmax over the last dimension.

    Args:
        obs_dim: Size of the input observation vector.
        hidden: Width of the hidden layer.
        num_actions: Number of discrete actions (size of the output).
    """

    def __init__(self, obs_dim, hidden=128, num_actions=3):
        super().__init__()
        layers = [
            nn.Linear(obs_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, num_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return action probabilities; the last dimension sums to 1."""
        return self.net(x).softmax(dim=-1)


class Critic(nn.Module):
    """State-value network mapping an observation to a single scalar.

    A one-hidden-layer MLP (Linear -> ReLU -> Linear) with one output unit
    and no output activation.

    Args:
        obs_dim: Size of the input observation vector.
        hidden: Width of the hidden layer.
    """

    def __init__(self, obs_dim, hidden=128):
        super().__init__()
        layers = [
            nn.Linear(obs_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the value estimate; the last dimension has size 1."""
        value = self.net(x)
        return value


def make_obs(state):
    """Convert an environment state dict into a flat observation tensor.

    Reads ``state["frog_pos"]`` and ``state["grid"]``. The frog position is
    scaled by the board dimensions (``COLS`` / ``ROWS``) and placed in front
    of the flattened grid.

    Returns:
        A 1-D float32 tensor on ``device``: ``[x_norm, y_norm, *grid]``.
        NOTE(review): the grid is presumably ROWS x COLS so the length is
        ``2 + ROWS * COLS`` -- confirm against the environment.
    """
    frog_col, frog_row = state["frog_pos"]  # frog_pos = (col, row)
    scaled_pos = [frog_col / COLS, frog_row / ROWS]

    cells = torch.tensor(
        state["grid"], dtype=torch.float32, device=device
    ).flatten()
    position = torch.tensor(scaled_pos, dtype=torch.float32, device=device)

    return torch.cat([position, cells])


# --- Monte Carlo actor-critic training on the headless Frogger env ---
env = FroggerEnv()

# Observation layout produced by make_obs: 2 frog coordinates + grid cells.
obs_dim = 2 + ROWS * COLS
actor = Actor(obs_dim).to(device)
critic = Critic(obs_dim).to(device)

optimizerA = optim.Adam(actor.parameters(), lr=1e-4)
optimizerC = optim.Adam(critic.parameters(), lr=1e-3)

NUM_EPISODES = 2000
GAMMA = 0.99  # discount factor used when accumulating returns


pbar = tqdm(range(NUM_EPISODES))
for episode in pbar:
    state = env.reset()
    obs = make_obs(state)

    # Per-step records for the current episode.
    log_probs = []
    values = []
    rewards = []
    prev_score = state["score"]

    # NOTE(review): the episode only ends when the env reports "done";
    # if the env can run without ever terminating, this loop will not exit.
    done = False
    while not done:
        # Actor: sample an action from the current policy.
        probs = actor(obs)
        dist = Categorical(probs)
        action = dist.sample()
        log_probs.append(dist.log_prob(action))

        # Critic: value estimate for the state the action was taken in.
        values.append(critic(obs))

        # The sampled index is shifted by one before being sent to the env,
        # so env action 0 is never selected by the policy.
        state = env.step(int(action.item()) + 1)
        obs = make_obs(state)

        # Reward: score increase since the previous step, scaled by ROWS.
        score = state["score"]
        rewards.append((score - prev_score) * ROWS)
        prev_score = score

        done = state.get("done", False)

    # Update the progress bar once per episode: set_description forces a
    # redraw, which is wasteful when done on every environment step.
    pbar.set_description(f"Episode {episode} | Score: {score:.2f}")

    # Discounted return G_t = r_t + GAMMA * G_{t+1}, computed back to front.
    returns = []
    G = 0
    for r in reversed(rewards):
        G = r + GAMMA * G
        returns.append(G)
    returns.reverse()

    returns = torch.tensor(returns, dtype=torch.float32, device=device)
    # Flatten to shape (T,). reshape(-1) keeps the shape at (1,) for a
    # one-step episode, whereas squeeze(-1) would collapse it to a 0-dim
    # tensor that no longer matches `returns`.
    values = torch.cat(values).reshape(-1)

    # The advantage is treated as a constant in the policy-gradient term.
    advantages = returns - values.detach()

    log_probs = torch.stack(log_probs)

    actor_loss = -(log_probs * advantages).mean()
    critic_loss = nn.functional.mse_loss(values, returns)

    optimizerA.zero_grad()
    actor_loss.backward()
    optimizerA.step()

    optimizerC.zero_grad()
    critic_loss.backward()
    optimizerC.step()


Using device: cuda


  0%|          | 0/2000 [00:00<?, ?it/s]

In [61]:
import torch
from coderbot_sim.frogger.widget import FroggerWidget

env = FroggerWidget()
env.render()

state = await env.reset() 

for t in range(2000):
    obs = make_obs(state).unsqueeze(0)

    with torch.no_grad():
        probs = actor(obs)
        # action = torch.argmax(probs, dim=-1).item()
        # OR: sample for stochastic policy
        action = torch.distributions.Categorical(probs).sample().item()

    state = await env.step(action + 1)

    if state.get("done", False):
        break


FroggerWidget(car_positions=[(0, 168, 80, 24), (200, 168, 80, 24), (400, 168, 80, 24), (600, 168, 80, 24), (0,…