In [1]:
import gymnasium
import highway_env
# import finite-mdp
# from finite_mdp.envs import finite_mdp_env

%load_ext tensorboard
import sys
from tqdm.notebook import trange
# !pip install tensorboardx gym pyvirtualdisplay
# Doesn't work here because this machine is not running Linux.
# !apt-get install -y xvfb ffmpeg
# !git clone https://github.com/Farama-Foundation/HighwayEnv.git 2> /dev/null
# !git clone https://github.com/eleurent/finite-mdp.git 2> /dev/null
# Put the HighwayEnv `scripts` directory at the front of the import path.
# NOTE(review): the absolute '/content/...' location looks like where the
# commented-out `git clone` above would land on Colab, and the only import
# that relied on it (`from utils import ...` below) is commented out —
# confirm whether this line is still needed on a local machine.
sys.path.insert(0, '/content/HighwayEnv/scripts/')
# from utils import record_videos, show_videos



In [2]:
import gymnasium as gym
import random
import highway_env
from collections import defaultdict

import json
import os
from tqdm import tqdm

# Add the parent directory to the import path before importing `Metrics`;
# presumably the project-local `metrics` module lives one level above this
# notebook — verify against the repository layout.
# NOTE(review): `sys` is already imported in the first cell; this repeat is
# harmless but redundant.
import sys
sys.path.append(os.path.abspath('..'))
from metrics import Metrics

class ValueIteration():
    """Tabular policy learned from random rollouts of a finite-MDP-capable env.

    NOTE: despite its name, this class does not run the value-iteration
    algorithm. ``train`` plays episodes with uniformly random actions and, for
    every state visited, remembers the action that started the best discounted
    return seen so far. ``evaluate`` replays those remembered actions and falls
    back to a random action for states it has never seen.

    The policy table maps ``str(observation)`` to
    ``{'action': int, 'reward': float}`` and is persisted as JSON in
    ``POLICY_FILE`` (relative to the current working directory).

    Args:
        env: A Gymnasium-style environment. It must either wrap a
            ``finite_mdp`` ``FiniteMDPEnv`` or expose ``to_finite_mdp()`` on
            its unwrapped instance.
        params: Dict of options; recognised keys are ``use_metrics``
            (default ``False``), ``gamma`` (default ``0.9``) and
            ``episode_num`` (default ``100``).

    Raises:
        TypeError: If the environment cannot be turned into a finite MDP.
    """

    # Single source of truth for the file used by load_dictionary/save_policy.
    POLICY_FILE = "policies.json"

    def __init__(self, env, params):
        self.finite_mdp = self.is_finite_mdp(env)
        self.states = env.observation_space
        if self.finite_mdp:
            # The original assigned to a misspelled attribute (`mpd`), leaving
            # `self.mdp` undefined on this branch.
            self.mdp = env.unwrapped.mdp
        else:
            try:
                self.mdp = env.unwrapped.to_finite_mdp()
            except AttributeError as exc:
                # Keep the original exception type/message, but chain the cause
                # so the missing attribute is visible in the traceback.
                raise TypeError("not finite mdp") from exc

        self.env = env
        self.obs, self.info = env.reset()
        self.state_action_value = env.action_space
        self.load_dictionary()

        use_metrics = params.get("use_metrics", False)

        self.gamma = params.get("gamma", 0.9)  # Discount factor
        self.episode_num = params.get("episode_num", 100)
        self.metrics = Metrics("value_iteration", "training_results", use_metrics)

    @staticmethod
    def _state_key(observation):
        """Return the policy-table key for an observation.

        Both ``train`` and ``evaluate`` must go through this helper so that
        keys written during training can be found again during evaluation.
        NOTE(review): ``str()`` of a large numpy array is abbreviated with
        ``...``, which would make distinct observations collide — confirm the
        observation stays small enough for this to be safe.
        """
        return str(observation)

    def train(self):
        """Run ``episode_num`` random-action episodes and update the policy.

        For every step the pair (state, action) and its reward are recorded.
        When the episode ends (terminated *or* truncated) the discounted
        return-to-go of each step is computed, and a state's policy entry is
        replaced whenever that return beats the stored one. The policy is
        saved after every episode and two metrics are logged per episode.
        """
        for epoch in tqdm(range(self.episode_num), desc="Training Model"):
            done = False
            truncated = False
            total_reward = 0
            episode_count = 0
            episode_rewards = []
            trajectory = []  # (state key, action, reward) per step

            # reset() returns (observation, info); only the observation may
            # be used as the state key, otherwise the info dict leaks into it.
            self.obs, self.info = self.env.reset()
            state = self._state_key(self.obs)

            # Stop on truncation as well as termination: stepping an
            # environment after its time limit is undefined behaviour.
            while not (done or truncated):
                # Select a random action
                action = random.randrange(0, self.state_action_value.n)

                # Take the action in the environment
                next_obs, reward, done, truncated, info = self.env.step(action)
                trajectory.append((state, action, reward))

                # Accumulate discounted rewards (from the episode start)
                total_reward += reward * (self.gamma ** episode_count)
                episode_rewards.append(total_reward)

                episode_count += 1

                # Advance to the state the next action will be taken from.
                self.obs, self.info = next_obs, info
                state = self._state_key(next_obs)

            # Walk the episode backwards so each step's discounted
            # return-to-go is built from the one after it:
            #   G_t = r_t + gamma * G_{t+1}
            return_to_go = 0.0
            for visited, taken, step_reward in reversed(trajectory):
                return_to_go = step_reward + self.gamma * return_to_go
                best = self.policy.get(visited)
                if best is None or best['reward'] < return_to_go:
                    # float() keeps numpy scalar rewards JSON-serialisable.
                    self.policy[visited] = {'action': taken, 'reward': float(return_to_go)}

            self.save_policy()

            # NOTE: this is the mean of the *running* discounted totals, not
            # the episode return; kept unchanged so existing charts stay
            # comparable.
            self.metrics.add("rollout/rewards", sum(episode_rewards) / len(episode_rewards), epoch)
            self.metrics.add("rollout/episode-length", episode_count, epoch)

        self.metrics.close()

    def load_dictionary(self):
        """Load the policy table from ``POLICY_FILE`` into ``self.policy``.

        Returns:
            bool: ``True`` if a policy dict was loaded; ``False`` if the file
            is missing, unreadable or does not contain a JSON object, in which
            case ``self.policy`` is set to an empty dict.
        """
        file_path = self.POLICY_FILE
        if not os.path.exists(file_path):
            self.policy = {}  # Use a regular dictionary if file doesn't exist
            print("No existing policy found. Starting with an empty dictionary.")
            return False

        try:
            with open(file_path, 'r') as file:  # Open in text mode for JSON
                loaded = json.load(file)
        except (json.JSONDecodeError, IOError) as e:
            print(f"Error loading dictionary: {e}")
            self.policy = {}  # Use a regular dictionary as fallback
            return False

        if not isinstance(loaded, dict):
            # Valid JSON but not a table (e.g. a list): unusable as a policy.
            print(f"Error loading dictionary: expected a JSON object, got {type(loaded).__name__}")
            self.policy = {}
            return False

        self.policy = loaded
        print("Dictionary loaded successfully.")
        return True

    def save_policy(self):
        """Write ``self.policy`` to ``POLICY_FILE`` as indented JSON.

        Returns:
            bool: ``True`` on success, ``False`` if the file could not be
            written (the error is printed, not raised).
        """
        file_path = self.POLICY_FILE
        try:
            # Ensure self.policy is serializable
            if isinstance(self.policy, defaultdict):
                self.policy = dict(self.policy)  # Convert defaultdict to dict

            with open(file_path, 'w') as file:  # Open in write mode
                json.dump(self.policy, file, indent=4)  # Save with pretty printing
            return True
        except IOError as e:
            print(f"Error saving dictionary: {e}")
            return False

    def evaluate(self, env, episode_num):
        """Play ``episode_num`` episodes on ``env`` using the stored policy.

        Args:
            env: Environment to roll out on; it is reset, stepped and rendered.
            episode_num: Number of episodes to play.

        States missing from the policy table get a uniformly random action.
        """
        for _ in range(episode_num):
            state, _info = env.reset()
            done = False
            truncated = False

            # Play until the episode terminates or is truncated by the
            # environment's time limit.
            while not (done or truncated):
                entry = self.policy.get(self._state_key(state))
                if entry is None:
                    # Unseen state: fall back to a random action.
                    action = random.randrange(0, env.action_space.n)
                else:
                    # The table stores {'action', 'reward'}; only the action
                    # may be handed to env.step().
                    # TODO: metrics how many times we are used the policy
                    action = entry['action']
                state, reward, done, truncated, info = env.step(action)
                env.render()

    def is_finite_mdp(self, env):
        """Tell whether ``env`` wraps a ``finite_mdp`` ``FiniteMDPEnv``.

        Returns:
            bool: ``True`` only if the ``finite_mdp`` package is importable and
            ``env.unwrapped`` is a ``FiniteMDPEnv``; ``False`` in every other
            case (package missing, no ``unwrapped`` attribute, wrong type).
        """
        import importlib  # local import: only needed for this optional check

        try:
            module = importlib.import_module("finite_mdp.envs.finite_mdp_env")
            return isinstance(env.unwrapped, module.FiniteMDPEnv)
        except (ImportError, TypeError, AttributeError):
            return False

In [3]:
# Environment settings: three lanes, observed through the "TimeToCollision"
# observation type with a horizon of 5.
config = dict(
    lanes_count=3,
    observation=dict(type="TimeToCollision", horizon=5),
)

# Training options read by ValueIteration via params.get(...).
params = dict(
    use_metrics=True,
    episode_num=10,
    gamma=0.9,  # Discount factor
)

env = gym.make("highway-fast-v0", config=config, render_mode="rgb_array")

# `finite_mdp` holds the trained agent; the evaluation cell below reuses it.
finite_mdp = ValueIteration(env=env, params=params)
finite_mdp.train()

Dictionary loaded successfully.


Training Model: 100%|██████████| 10/10 [00:02<00:00,  3.42it/s]


In [4]:
# Replay the stored policy for 10 episodes on "highway-v0" (training above
# used "highway-fast-v0"), reusing the same `config`.
env = gym.make("highway-v0", config=config, render_mode="rgb_array")
finite_mdp.evaluate(env=env, episode_num=10)

{"(array([[[0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ]],\n\n       [[0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ]],\n\n       [[0. , 0. , 0.5, 1. , 1. ],\n        [0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ]]], dtype=float32), {'speed': 25, 'crashed': False, 'action': 0, 'rewards': {'collision_reward': 0.0, 'right_lane_reward': 0.5, 'high_speed_reward': 0.5, 'on_road_reward': 1.0}})": {'action': 1, 'reward': 6.72896866994331}, "(array([[[1. , 1. , 1. , 1. , 1. ],\n        [0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ]],\n\n       [[1. , 1. , 1. , 1. , 1. ],\n        [0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ]],\n\n       [[1. , 1. , 1. , 1. , 1. ],\n        [0. , 0. , 0.5, 1. , 1. ],\n        [0. , 0. , 0. , 0. , 0. ]]], dtype=float32), {'speed': 25, 'crashed': False, 'action': 3, 'rewards': {'collision_reward': 0.0, 'right_lane_rew

2024-12-30 13:55:39.803 python[33762:4209915] +[IMKClient subclass]: chose IMKClient_Modern
2024-12-30 13:55:39.803 python[33762:4209915] +[IMKInputSession subclass]: chose IMKInputSession_Modern


{"(array([[[0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ]],\n\n       [[0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ]],\n\n       [[0. , 0. , 0.5, 1. , 1. ],\n        [0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ]]], dtype=float32), {'speed': 25, 'crashed': False, 'action': 0, 'rewards': {'collision_reward': 0.0, 'right_lane_reward': 0.5, 'high_speed_reward': 0.5, 'on_road_reward': 1.0}})": {'action': 1, 'reward': 6.72896866994331}, "(array([[[1. , 1. , 1. , 1. , 1. ],\n        [0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ]],\n\n       [[1. , 1. , 1. , 1. , 1. ],\n        [0. , 0. , 0. , 0. , 0. ],\n        [0. , 0. , 0. , 0. , 0. ]],\n\n       [[1. , 1. , 1. , 1. , 1. ],\n        [0. , 0. , 0.5, 1. , 1. ],\n        [0. , 0. , 0. , 0. , 0. ]]], dtype=float32), {'speed': 25, 'crashed': False, 'action': 3, 'rewards': {'collision_reward': 0.0, 'right_lane_rew

KeyboardInterrupt: 

### Run TensorBoard

In [3]:
# Reload the TensorBoard notebook extension (it is first loaded in the
# opening cell of this notebook).
%reload_ext tensorboard

# Launch TensorBoard inline on localhost:6012, reading from `training_results`.
# That is the directory name passed to Metrics(...) in ValueIteration.__init__;
# presumably Metrics writes its event files there — verify in metrics.py.
%tensorboard --logdir training_results --host localhost --port 6012