# Mountain car sim environment

In [None]:
%pip install coderbot_sim

In [None]:
from coderbot_sim.mountain_car.widget import MountainCarWidget

# Create the widget with manual_control=True and display it.
# NOTE(review): presumably this lets you drive the car from the widget itself
# rather than from code - confirm against the coderbot_sim documentation.
env = MountainCarWidget(manual_control=True)
env.render()

In [None]:
from coderbot_sim.mountain_car.widget import MountainCarWidget

env = MountainCarWidget()
env.render()

# This is an example of a solution that fails to reach the goal.
for t in range(200):
    action = 2 if t % 50 < 10 else 0
    state = await env.step(action)
    # print(f"t={t:03d}", f"state={state}")

state = await env.reset()

In [None]:
from coderbot_sim.mountain_car.widget import MountainCarWidget

env = MountainCarWidget()
env.render()

# This is an example of a solution that reaches the goal.
actions = [a for a in [0, 2, 0, 2, 0, 2] for _ in range(60)]

for t in range(len(actions)):
    action = actions[t]
    state = await env.step(action)
    # print(f"t={t:03d}", f"state={state}")

# We also provide a tkinter frontend

In [None]:
from coderbot_sim.mountain_car.tk import MountainCarTkFrontend

env = MountainCarTkFrontend()
env.render()

# This is an example of a solution that reaches the goal.
actions = [a for a in [0, 2, 0, 2, 0, 2] for _ in range(60)]

for t in range(len(actions)):
    action = actions[t]
    state = await env.step(action)
    # print(f"t={t:03d}", f"state={state}")

# Now try to use reinforcement learning to solve the problem!

In [None]:
import numpy as np
import random
from tqdm.auto import tqdm
from coderbot_sim.mountain_car import MountainCarEnv

# Environment instance used for training; train() below calls reset() and
# step() on it directly, without `await`.
env = MountainCarEnv()

# Resolution of the discretisation grid and size of the action set.
NUM_POS_BINS = 40
NUM_VEL_BINS = 40
NUM_ACTIONS = 3  # {0: left, 1: idle, 2: right}

# Value ranges that are mapped onto the grid.
POS_MIN = -1.2
POS_MAX = 0.6
VEL_MIN = -0.07
VEL_MAX = 0.07


def discretize_state(state):
    """Map a continuous state onto a (position bin, velocity bin) pair.

    Args:
        state: Mapping with numeric "position" and "velocity" entries.

    Returns:
        A ``(pos_bin, vel_bin)`` tuple. Each index is clipped to its grid, so
        values outside the configured ranges land in the first or last bin.
    """
    position = state["position"]
    velocity = state["velocity"]

    # Fraction of the way through each range, then scaled to a bin index.
    pos_fraction = (position - POS_MIN) / (POS_MAX - POS_MIN)
    vel_fraction = (velocity - VEL_MIN) / (VEL_MAX - VEL_MIN)

    pos_bin = np.clip(int(pos_fraction * NUM_POS_BINS), 0, NUM_POS_BINS - 1)
    vel_bin = np.clip(int(vel_fraction * NUM_VEL_BINS), 0, NUM_VEL_BINS - 1)
    return pos_bin, vel_bin


# Action-value table indexed as Q[pos_bin, vel_bin, action]; all entries start at 0.
Q = np.zeros(shape=(NUM_POS_BINS, NUM_VEL_BINS, NUM_ACTIONS))

# Q-learning settings
episodes = 1500  # number of training episodes
epsilon = 0.1  # probability of taking a random action instead of the greedy one
gamma = 0.99  # discount applied to the best next-state value
alpha = 0.1  # step size of each Q update


async def train(max_steps=400):
    """Train the global Q-table with tabular, epsilon-greedy Q-learning.

    Runs ``episodes`` episodes against the module-level ``env``. Each step
    picks an action epsilon-greedily, applies it with ``env.step`` and updates
    ``Q`` in place. An episode ends when the environment reports ``done`` or
    after ``max_steps`` steps.

    The function is declared ``async`` so the notebook can ``await`` it,
    although nothing in the body awaits.

    Args:
        max_steps: Upper bound on the number of steps in one episode.
    """
    pbar = tqdm(range(episodes), desc="Training Episodes")
    completed = False  # becomes True once any episode has reported `done`
    min_steps = float("inf")  # fewest steps taken by an episode that finished

    for _ in pbar:
        s = discretize_state(env.reset())

        for step in range(max_steps):
            # epsilon-greedy
            if random.random() < epsilon:
                action = random.randint(0, NUM_ACTIONS - 1)
            else:
                # Plain int, so both branches produce the same action type.
                action = int(np.argmax(Q[s]))

            next_state = env.step(action)
            s2 = discretize_state(next_state)

            # Fall back to the classic -1-per-step reward when the environment
            # does not supply one.
            reward = next_state.get("reward", -1)
            # NOTE(review): assumes `done` means the goal was reached - confirm
            # that the environment does not also use it for a time limit.
            done = next_state.get("done", False)

            # Q-learning update. Nothing follows a terminal transition, so the
            # bootstrap term is dropped when the episode is done; the goal bin
            # also holds non-terminal positions, so Q[s2] need not be zero.
            future = 0.0 if done else gamma * np.max(Q[s2])
            Q[s][action] += alpha * (reward + future - Q[s][action])
            s = s2

            if done:
                completed = True
                # `step` is a 0-based index; the number of steps taken is one more.
                min_steps = min(min_steps, step + 1)
                break

        if completed:
            pbar.set_postfix({"status": f"goal in, min steps: {min_steps}"})
        else:
            pbar.set_postfix({"status": "failed"})

    print("Training complete!")


# Run the training loop. Top-level `await` relies on the notebook kernel
# running cells inside an event loop.
await train()

In [None]:
from coderbot_sim.mountain_car.widget import MountainCarWidget

env = MountainCarWidget()
env.render()

state = await env.reset()
s = discretize_state(state)
total_reward = 0

for step in range(500):
    action = np.argmax(Q[s])
    next_state = await env.step(action, dt=0.01)
    s2 = discretize_state(next_state)

    reward = next_state.get("reward", -1)
    done = next_state.get("done", False)
    total_reward += reward

    # Q-learning update
    Q[s][action] += alpha * (reward + gamma * np.max(Q[s2]) - Q[s][action])
    s = s2