In [23]:
import gym
# from gym.envs.toy_text.frozen_lake import FrozenLakeEnv
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

import frozenlake_env
import importlib
importlib.reload(frozenlake_env)
from frozenlake_env import FrozenLakeEnv
PARAM_DIR = 'params'

def create_custom_env(custom_map, is_slippery=False, render_mode=None):
    """
    Creates a custom Frozen Lake environment.
    
    :param custom_map: List of strings representing the map.
    :param is_slippery: Boolean indicating whether the surface is slippery.
    :return: Custom Frozen Lake environment.
    """
    env = FrozenLakeEnv(desc=custom_map, is_slippery=is_slippery, render_mode=render_mode)
    return env

def train_ppo_on_map(custom_map, total_timesteps=10000, is_slippery=False, pretrained_model_path=None):
    """
    Trains a PPO agent on a custom Frozen Lake map.
    
    :param custom_map: List of strings representing the map.
    :param total_timesteps: Total timesteps for training.
    :param is_slippery: Boolean indicating whether the surface is slippery.
    :return: Trained PPO model.
    """
    # Create the environment
    env = create_custom_env(custom_map, is_slippery)
    env = make_vec_env(lambda: env, n_envs=1)  # Vectorize the environment

    # Initialize the PPO model
    if pretrained_model_path is not None:
        model = PPO.load(pretrained_model_path, env)
    else:
        model = PPO("MlpPolicy", env, verbose=1)

    # Train the model
    model.learn(total_timesteps=total_timesteps)

    return model

# Example usage with the baseline map
baseline_map = [
    'SFFF',
    'FHFH',
    'FFFH',
    'HFFG'
]

# Train the PPO agent on the baseline map
trained_model = train_ppo_on_map(baseline_map, total_timesteps=50000)

# Save the model
trained_model.save(f"{PARAM_DIR}/ppo_frozenlake_baseline")


Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 7.97     |
|    ep_rew_mean     | 0.03     |
| time/              |          |
|    fps             | 583      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5.79        |
|    ep_rew_mean          | 0.04        |
| time/                   |             |
|    fps                  | 468         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.019345885 |
|    clip_fraction        | 0.397       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.37       |
|    explained_variance   | -0.008      |
|    learnin

In [24]:
import pickle
maps = pickle.load(open("maps.pkl", "rb"))
maps

{'Baseline': ['SFFF', 'FHFH', 'FFFH', 'HFFG'],
 'Map 2.1': ['SFFF', 'FHFH', 'FFFH', 'HFGF'],
 'Map 2.2': ['SFFF', 'FHFH', 'FGFH', 'HFFF'],
 'Map 2.3': ['SFFG', 'FHFH', 'FFFH', 'HFFF'],
 'Map 3.1': ['SFFF', 'HHFH', 'FHFH', 'HFFG'],
 'Map 3.2': ['SFFF', 'FHHH', 'FFFF', 'HHFG'],
 'Map 3.3': ['SFFH', 'HFFF', 'FFHH', 'HFFG'],
 'Map 4.1': ['FFFS', 'HFHF', 'FHHF', 'GFFF'],
 'Map 4.2': ['FFFG', 'HFHF', 'HHFF', 'SFFF'],
 'Map 4.3': ['FFFF', 'FHHS', 'FHFH', 'FFFG'],
 'Map 5.1': ['FFFFS', 'HFFFH', 'FFHFH', 'HFHFH', 'HGFFF'],
 'Map 5.2': ['FFFFFG', 'FFFHHF', 'FHFFFF', 'HFFHFF', 'HHFFFH', 'FSFFFF'],
 'Map 5.3': ['HFG', 'HFH', 'FFS']}

In [7]:
pretrained_model_path = "ppo_frozenlake_baseline.zip"
trained_model2 = train_ppo_on_map(maps['Map 2.1'], total_timesteps=50000, pretrained_model_path=pretrained_model_path)
trained_model2.save("ppo_frozenlake_map2.1")

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 5.01     |
|    ep_rew_mean     | 1        |
| time/              |          |
|    fps             | 540      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 5             |
|    ep_rew_mean          | 1             |
| time/                   |               |
|    fps                  | 436           |
|    iterations           | 2             |
|    time_elapsed         | 9             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00028061357 |
|    clip_fraction        | 0.00381       |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.0195       |
|    explained_variance   | 0.0965        |


In [13]:
pretrained_model_path = "ppo_frozenlake_map2.1"
trained_model2 = train_ppo_on_map(maps['Map 2.2'], total_timesteps=50000, pretrained_model_path=pretrained_model_path)
trained_model2.save("ppo_frozenlake_map2.2")
pretrained_model_path = "ppo_frozenlake_map2.2"
trained_model2 = train_ppo_on_map(maps['Map 2.3'], total_timesteps=50000, pretrained_model_path=pretrained_model_path)
trained_model2.save("ppo_frozenlake_map2.3")



---------------------------------
| rollout/           |          |
|    ep_len_mean     | 81.9     |
|    ep_rew_mean     | 0.167    |
| time/              |          |
|    fps             | 552      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 161         |
|    ep_rew_mean          | 0.15        |
| time/                   |             |
|    fps                  | 439         |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.002729432 |
|    clip_fraction        | 0.0315      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.247      |
|    explained_variance   | -0.837      |
|    learning_rate        | 0.

## Generate GIF

In [25]:
import imageio
from stable_baselines3.common.vec_env import DummyVecEnv


def generate_gif_from_policy(model, custom_map, gif_filename="ppo_frozenlake.gif", is_slippery=False):
    # Create the environment
    def _init():
        env = gym.make("FrozenLake-v1", desc=custom_map, is_slippery=is_slippery, render_mode='rgb_array')
        return env
    env = DummyVecEnv([_init])
    obs = env.reset()

    # List to hold frames
    frames = []

    # Run the agent in the environment
    for _ in range(100):  # Set a limit to prevent infinite loops
        frames.append(env.render())
        action, _ = model.predict(obs)
        obs, _, done, _ = env.step(action)
        if done:
            break

    # Create the GIF
    imageio.mimsave(gif_filename, frames, duration=0.3)
    print(f"GIF saved as {gif_filename}")

# Example usage with the baseline map and trained model
baseline_map = maps['Baseline']
map_21 = maps['Map 2.1']
map_22 = maps['Map 2.2']
map_23 = maps['Map 2.3']
map_31 = maps['Map 3.1']
map_32 = maps['Map 3.2']
map_33 = maps['Map 3.3']
map_41 = maps['Map 4.1']
map_42 = maps['Map 4.2']
map_43 = maps['Map 4.3']

# Load the trained model

# Generate the GIF
trained_model = PPO.load("ppo_frozenlake_Map 3.1")
# generate_gif_from_policy(trained_model, baseline_map, gif_filename="ppo_frozenlake_baseline.gif")
# generate_gif_from_policy(trained_model, map_21, gif_filename="ppo_frozenlake_map2.1.gif")
# generate_gif_from_policy(trained_model, map_22, gif_filename="ppo_frozenlake_map2.2.gif")
generate_gif_from_policy(trained_model, map_23, gif_filename="ppo_frozenlake_map3.1.gif")



FileNotFoundError: [Errno 2] No such file or directory: 'ppo_frozenlake_Map 3.1.zip'

In [26]:
for map_ in maps:
    trained_model = train_ppo_on_map(maps[map_], total_timesteps=50000)
    trained_model.save(f"{PARAM_DIR}/ppo_frozenlake_{map_}")

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 7.39     |
|    ep_rew_mean     | 0.01     |
| time/              |          |
|    fps             | 210      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5.75        |
|    ep_rew_mean          | 0.01        |
| time/                   |             |
|    fps                  | 161         |
|    iterations           | 2           |
|    time_elapsed         | 25          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.020369183 |
|    clip_fraction        | 0.469       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.37       |
|    explained_variance   | -0.0511     |
|    learnin

KeyboardInterrupt: 

In [37]:
import torch
def predict_action_probabilities(policy, states):
    """
    Predict action probabilities for each state using the given policy.
    
    :param policy: The policy network.
    :param states: List of states.
    :return: Action probabilities for each state.
    """
    action_probs = []
    for state in states:
        obs = torch.tensor([state], dtype=torch.float32, device=policy.device)
        action_dist = policy.policy.get_distribution(obs)
        action_probs.append(action_dist.distribution.probs.detach().cpu().numpy())
    return np.array(action_probs)

def KL_divergence(policy_net_1, policy_net_2, states):
    """
    Compute the KL divergence between two policies for a given state.
    
    :param policy_net_1: Policy network 1.
    :param policy_net_2: Policy network 2.
    :param states: State for which to compute the KL divergence.
    :return: average KL divergence between the two policies.
    """
    # Get the action probabilities for the two policies
    action_probs_1 = predict_action_probabilities(policy_net_1, states).squeeze()
    action_probs_2 = predict_action_probabilities(policy_net_2, states).squeeze()
    print(action_probs_1)
    print(action_probs_2)
    action_probs_1 = np.clip(action_probs_1, 1e-10, 1.0)
    action_probs_2 = np.clip(action_probs_2, 1e-10, 1.0)

    kl_divergence = np.sum(action_probs_1 * np.log(action_probs_1 / action_probs_2), axis=1)

    # Compute the KL divergence
    # print(kl_divergence.shape)
    return np.mean(kl_divergence)

def get_all_states(map_name):
    """
    Get all the states in the environment.
    
    :param env: Environment.
    :return: List of all states.
    """

    env = create_custom_env(map_name)
    all_states = np.arange(env.observation_space.n)
    return all_states

def get_all_states(map_name, input_st='large'):
    """
    Get all the states in the environment.
    
    :param env: Environment.
    :return: List of all states.
    """

    env = create_custom_env(map_name)
    base_ = env.state_without_agent
    all_states = []
    for i in range(len(base_)):
        if base_[i] == 0:
            n_ad = base_.copy()
            n_ad[i] = 1
            all_states.append(n_ad)
    return np.array(all_states)


In [39]:
import numpy as np
policy_1 = PPO.load("ppo_frozenlake_Map 2.1")
policy_2 = PPO.load("ppo_frozenlake_Map 2.2")
states = get_all_states(maps['Baseline'])

policies = {
    'Baseline': "ppo_frozenlake_Baseline",
    'Map 2.1':  "ppo_frozenlake_Map 2.1",
    'Map 2.2':  "ppo_frozenlake_Map 2.2",
    'Map 2.3':  "ppo_frozenlake_Map 2.3",
    'Map 3.1':  "ppo_frozenlake_Map 3.1",
    'Map 3.2':  "ppo_frozenlake_Map 3.2",
    'Map 3.3':  "ppo_frozenlake_Map 3.3",
    'Map 4.1':  "ppo_frozenlake_Map 4.1",
    'Map 4.2':  "ppo_frozenlake_Map 4.2",
    'Map 4.3':  "ppo_frozenlake_Map 4.3"
}

KL = {} #dictionary of dictionaries

for policy1 in policies:
    for policy2 in policies:
        if policy1 == policy2 or policy1 == 'Baseline' or policy2 == 'Baseline': continue
        if policy1 not in KL: KL[policy1] = {}
        policy_net_1 = PPO.load(policies[policy1])
        policy_net_2 = PPO.load(policies[policy2])
        kl_div = KL_divergence(policy_net_1, policy_net_2, states)
        KL[policy1][policy2] = kl_div
        print(f"KL Divergence between {policy1} and {policy2}: {kl_div:.4f}")


  obs = torch.tensor([state], dtype=torch.float32, device=policy.device)


[[0.48731953 0.0585381  0.4113563  0.04278607]
 [0.48967266 0.05730338 0.4109015  0.04212256]
 [0.48684686 0.05883747 0.41127428 0.04304139]
 [0.49169934 0.05641292 0.41006723 0.04182051]
 [0.48713234 0.05904303 0.41030407 0.04352045]
 [0.48631883 0.05896634 0.41179577 0.04291908]
 [0.48774248 0.05822591 0.41149148 0.04254014]
 [0.487055   0.05856357 0.41167197 0.04270952]
 [0.4876499  0.05849111 0.4109372  0.04292183]
 [0.48878703 0.05791122 0.4107541  0.04254759]
 [0.490506   0.05728731 0.40974095 0.04246583]]
[[0.1775071  0.582194   0.04082894 0.19946991]
 [0.18272805 0.56462705 0.04151898 0.21112594]
 [0.18290667 0.5718236  0.04480426 0.20046547]
 [0.17095551 0.60395604 0.0400859  0.18500246]
 [0.18100758 0.57390964 0.04272125 0.20236152]
 [0.17838351 0.57969105 0.041299   0.20062642]
 [0.18590248 0.5593604  0.04381738 0.2109197 ]
 [0.17777842 0.5845456  0.0422067  0.19546928]
 [0.17919578 0.58067834 0.04260022 0.19752571]
 [0.18402362 0.5678443  0.04454069 0.20359136]
 [0.17547396

In [52]:
m = train_ppo_on_map(maps['Map 3.1'], total_timesteps=50000)



Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4.36     |
|    ep_rew_mean     | 0        |
| time/              |          |
|    fps             | 570      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5.32        |
|    ep_rew_mean          | 0           |
| time/                   |             |
|    fps                  | 457         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.022768017 |
|    clip_fraction        | 0.25        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.37       |
|    explained_variance   | -23.9       |
|    learnin

In [54]:
generate_gif_from_policy(m, maps['Map 3.1'], gif_filename="ppo_frozenlake_map3.1.gif")
m.save("ppo_frozenlake_Map 3.1")

GIF saved as ppo_frozenlake_map3.1.gif


  if not isinstance(terminated, (bool, np.bool8)):


In [57]:
KL_divergence(PPO.load("ppo_frozenlake_Map 2.1"), m, states)

1.5754921

In [40]:
import pprint, pandas as pd

# pprint.pprint(KL)
df = pd.DataFrame(KL)
row_to_move = df.loc['Map 2.1']

df = df.drop('Map 2.1')
df = pd.concat([row_to_move.to_frame().T, df])
df

Unnamed: 0,Map 2.1,Map 2.2,Map 2.3,Map 3.1,Map 3.2,Map 3.3,Map 4.1,Map 4.2,Map 4.3
Map 2.1,,1.361318,0.368706,1.276628,1.172743,0.860992,0.664351,2.808307,0.373867
Map 2.2,1.223842,,0.639736,0.888057,2.317672,3.088597,1.658128,1.314476,0.584761
Map 2.3,0.291817,0.474352,,0.661631,0.951929,0.961718,1.437313,1.287378,0.513904
Map 3.1,1.246319,4.844479,1.963881,,4.356048,2.945076,0.93358,0.573592,1.780362
Map 3.2,3.450973,3.646537,2.978897,6.330189,,1.322003,6.87747,7.4211,4.952004
Map 3.3,3.783198,5.574817,3.468291,7.023938,1.520817,,8.26569,7.936028,6.388093
Map 4.1,1.803756,4.340181,3.165307,2.601337,5.514386,5.24191,,5.14525,1.233147
Map 4.2,4.005259,1.916686,2.895122,0.925905,6.287103,7.381369,3.511313,,2.522616
Map 4.3,0.525706,0.541773,0.59849,0.871927,2.269004,2.696992,0.383287,2.113011,


In [41]:
for policy_ in policies:
    mod = PPO.load(policies[policy_])
    generate_gif_from_policy(mod, maps[policy_], gif_filename=f"./gifs/ppo_frozenlake_{policy_}.gif")



ValueError: Error: Unexpected observation shape (1,) for Box environment, please use (16,) or (n_env, 16) for the observation shape.

In [34]:
import frozenlake_env
import importlib
importlib.reload(frozenlake_env)

from frozenlake_env import FrozenLakeEnv as FLEnv

env = FLEnv(desc=maps['Baseline'], is_slippery=False)
env.observation_space.n

env.seed

AttributeError: 'FrozenLakeEnv' object has no attribute 'seed'

In [36]:
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

env = FrozenLakeEnv(desc=maps['Baseline'], is_slippery=False)
env.seed

AttributeError: 'FrozenLakeEnv' object has no attribute 'seed'