Necessary libraries
1. gymnasium - environment
2. pygame - rendering the agent and the environment
3. numpy - for Q-table
4. huggingface_hub - for uploading the trained model to the Hugging Face Hub

In [None]:
# Installing necessary libraries
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit2/requirements-unit2.txt

Collecting gymnasium (from -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit2/requirements-unit2.txt (line 1))
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting pickle5 (from -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit2/requirements-unit2.txt (line 6))
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyyaml==6.0 (from -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit2/requirements-unit2.txt (line 7))
  Downloading PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (682 kB)
[2K     [90m━━━━━━━━━━━━━━━━

* python3-opengl - to render the 3d graphics
* ffmpeg - tool to convert and stream audio and video
* xvfb - virtual display server that renders into memory instead of onto a physical screen
* pyvirtualdisplay - it is a wrapper for xvfb

In [None]:
!sudo apt-get update
!sudo apt-get install -y python3-opengl
!apt install ffmpeg xvfb
!pip3 install pyvirtualdisplay

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (18.160.213.93)] [Co                                                                               Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
                                                                               Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Waiting for headers] [2 InRelease 14.2 kB/110 kB 13%] [Connecting to ppa.la                                                                               Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Get:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease [18.1 kB]
Hit:8 https://ppa.launchp

In [None]:
# Kill the current kernel process to force a runtime restart, so the newly installed packages are picked up
import os

os.kill(os.getpid(), 9)

In [None]:
# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7827fff45090>

In [None]:
# importing the packages
import numpy as np
import gymnasium as gym
import random
import imageio
import os
import tqdm

import pickle5 as pickle
from tqdm.notebook import tqdm

* numpy - stores and updates the Q-table
* gymnasium (imported as `gym`) - provides the environment
* random - generates random numbers (used here to pick a random seed for the replay video)
* imageio - generates the replay video
* tqdm - shows an interactive progress bar during training and evaluation
* pickle - serializes a Python object to a binary file (here the model dictionary is saved as a .pkl file so it can be stored and reused)

# Step 1 : Create an environment

[Documentation of Frozen Lake](https://gymnasium.farama.org/environments/toy_text/frozen_lake/)


In [None]:
# to generate the map
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

desc = generate_random_map(size=4)
desc

['SFHF', 'FFFF', 'FHHF', 'FFFG']

In [None]:
# Create the environment.
# NOTE: per the Gymnasium FrozenLake documentation, `map_name` is only used
# when `desc` is None, so the layout actually played here is the randomly
# generated `desc`. `map_name` is still passed because push_to_hub later reads
# it back from env.spec.kwargs.
env = gym.make(
    "FrozenLake-v1",
    desc=desc,
    map_name="4x4",
    is_slippery=False,  # False -> deterministic moves, True -> stochastic moves
    render_mode="rgb_array",  # frames are returned as arrays so a replay video can be saved
)

In [None]:
# observation space
print("Number of elements in observation space ",env.observation_space) # 4x4 (map size) = 16
print("Sample : ",env.observation_space.sample())

Number of elements in observation space  Discrete(16)
Sample :  12


In [None]:
# action space
print(env.action_space.n)
print(env.action_space.sample()) # 0 - left 1 - down 2 - right 3 - up

4
1


# Step 2 : Model Training

Algorithm:
1. Initialize the Q table
2. Define the greedy policy
    1. Epsilon greedy policy
    2. Greedy policy
3. Initialize parameters for training
4. Training loop
    1. Choose an action based on epsilon greedy policy
    2. Get reward
    3. Update Qtable with current state and action with reward
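$$Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma \max_{a'} Q(s',a') - Q(s,a) \right]$$

where $\alpha$ is the learning rate, $\gamma$ the discount factor, $r$ the reward received for taking action $a$ in state $s$, and $s'$ the resulting next state.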

In [None]:
# Initialize the Q table
# Q table contains rows(states) and columns(actions)

def initialize_Q_table():
    """Build an all-zero Q-table sized for the module-level ``env``.

    The number of rows (states) and columns (actions) is read from
    ``env.observation_space.n`` and ``env.action_space.n``; both values are
    printed before the table is returned.

    Returns:
        np.ndarray: zero-filled array of shape (n_states, n_actions).
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    print("State space : ",n_states,"Action space : ",n_actions)

    return np.zeros((n_states,n_actions))

QTable = initialize_Q_table()

State space :  16 Action space :  4


In [None]:
# Epsilon Greedy Policy - handles the exploitation/exploration tradeoff
def epsilon_greedy_policy(QTable,state,epsilon):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    A number is drawn uniformly from [0, 1). If it is greater than ``epsilon``
    the best-valued action of ``QTable[state]`` is returned (exploitation);
    otherwise an action is sampled from the module-level ``env``'s action
    space (exploration).
    """
    draw = np.random.uniform(0,1)

    if draw > epsilon:
        # exploitation: highest Q-value for this state
        return np.argmax(QTable[state][:])

    # exploration: random action
    return env.action_space.sample()

# Greedy Policy - exploitation
def greedy_policy(QTable,state):
    """Return the index of the highest-valued action in ``QTable[state]``."""
    action_values = QTable[state][:]
    return np.argmax(action_values)

In [None]:
# --- Training ---
n_training_episodes = 1_000_000  # number of episodes run by train()
learning_rate = 0.05  # step size of each Q-value update

# --- Evaluation ---
n_eval_episodes = 100  # number of greedy episodes used to score the agent

# --- Environment ---
env_id = "FrozenLake-v1"
max_steps = 99  # upper bound on steps per episode
gamma = 0.975  # discount factor; values close to 1 weight long-term reward more
eval_seed = []  # per-episode reset seeds for evaluation; empty -> unseeded resets

# --- Exploration (epsilon decays exponentially with the episode index) ---
max_epsilon = 1.00  # exploration probability at episode 0
min_epsilon = 0.05  # value epsilon decays towards
decay_rate = 0.005  # exponential decay rate per episode

def train(n_training_episodes,min_epsilon,max_epsilon,decay_rate,env,max_steps,QTable):
    """Run tabular Q-learning on ``env``, updating ``QTable`` in place.

    For each episode epsilon is annealed exponentially from ``max_epsilon``
    towards ``min_epsilon``; actions are chosen with ``epsilon_greedy_policy``
    and every transition applies the update
    ``Q[s,a] += learning_rate * (reward + gamma * max(Q[s']) - Q[s,a])``.
    ``learning_rate`` and ``gamma`` are read from module-level variables.

    :param n_training_episodes: number of episodes to run
    :param min_epsilon: value epsilon decays towards
    :param max_epsilon: epsilon used for the first episode
    :param decay_rate: exponential decay rate of epsilon per episode
    :param env: environment exposing ``reset()`` and ``step(action)``
    :param max_steps: maximum number of steps per episode
    :param QTable: table indexed as ``QTable[state][action]``; modified in place
    :return: the same ``QTable`` object that was passed in
    """
    for episode in tqdm(range(n_training_episodes)):
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate * episode)

        # start a fresh episode
        state,info = env.reset()

        for _ in range(max_steps):
            action = epsilon_greedy_policy(QTable,state,epsilon) # choose an action
            # Unpack into `truncated` (not a misspelled name) so the stop
            # condition below really sees a truncation reported by the env.
            new_state,reward,terminated,truncated,info = env.step(action)

            # temporal-difference update towards reward + discounted best next value
            td_target = reward + gamma * np.max(QTable[new_state])
            QTable[state][action] = QTable[state][action] + learning_rate * (td_target - QTable[state][action])

            if terminated or truncated:
                break

            state = new_state
    return QTable

In [None]:
QTable_frozenLake = train(n_training_episodes,min_epsilon,max_epsilon,decay_rate,env,max_steps,QTable)

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [None]:
QTable_frozenLake

array([[0.8590683 , 0.88109569, 0.83759159, 0.8590683 ],
       [0.8590683 , 0.85893622, 0.        , 0.83722924],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.88109569, 0.90368789, 0.8590683 , 0.8590683 ],
       [0.88109569, 0.        , 0.83547107, 0.83751378],
       [0.85880887, 0.        , 0.00153092, 0.        ],
       [0.07152119, 0.        , 0.        , 0.        ],
       [0.90368789, 0.92685937, 0.        , 0.88109569],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.92685937, 0.92685937, 0.950625  , 0.90368789],
       [0.92685937, 0.950625  , 0.975     , 0.        ],
       [0.950625  , 0.975     , 1.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ]])

In [None]:
# Evaluation
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
    """Roll out the greedy policy of ``Q`` and summarise the episode returns.

    :param env: environment exposing ``reset()`` and ``step(action)``
    :param max_steps: maximum number of steps per evaluation episode
    :param n_eval_episodes: number of episodes to run
    :param Q: Q-table handed to ``greedy_policy``
    :param seed: sequence of reset seeds indexed by episode number; when it is
        empty/falsy the environment is reset without a seed
    :return: tuple ``(mean, std)`` of the per-episode summed rewards
    """
    returns_per_episode = []
    for episode_idx in tqdm(range(n_eval_episodes)):
        # only pass a seed to reset() when seeds were supplied
        reset_kwargs = {"seed": seed[episode_idx]} if seed else {}
        state, info = env.reset(**reset_kwargs)

        episode_return = 0
        for _ in range(max_steps):
            action = greedy_policy(Q, state)
            next_state, reward, terminated, truncated, info = env.step(action)
            episode_return += reward

            if terminated or truncated:
                break
            state = next_state
        returns_per_episode.append(episode_return)

    return np.mean(returns_per_episode), np.std(returns_per_episode)

# Evaluate our Agent
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, QTable_frozenLake, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

  0%|          | 0/100 [00:00<?, ?it/s]

Mean_reward=1.00 +/- 0.00


# Step 3 : Uploading to huggingface hub


In [None]:
# api for colab to huggingface
from huggingface_hub import notebook_login

notebook_login()
!git config --global credential.helper store

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.repocard import metadata_eval_result, metadata_save

from pathlib import Path
import datetime
import json

In [None]:
def record_video(env, Qtable, out_directory, fps=1):
    """
    Generate a replay video of the agent following the greedy policy of Qtable.

    The environment is reset with a random seed in [0, 500], one frame is
    rendered after the reset and after every step, and the frames are written
    with ``imageio.mimsave``.

    :param env: environment exposing ``reset(seed=...)``, ``step(action)`` and ``render()``
    :param Qtable: Qtable of our agent, indexed as ``Qtable[state][action]``
    :param out_directory: output path handed to ``imageio.mimsave``
    :param fps: how many frame per seconds (with taxi-v3 and frozenlake-v1 we use 1)
    """
    terminated = False
    truncated = False
    state, info = env.reset(seed=random.randint(0, 500))
    images = [env.render()]
    # Parenthesised on purpose: `not terminated or truncated` binds as
    # `(not terminated) or truncated`, which keeps looping forever once the
    # episode has been truncated.
    while not (terminated or truncated):
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(Qtable[state][:])
        # The next state is written straight into `state` for the next iteration
        state, reward, terminated, truncated, info = env.step(action)
        images.append(env.render())
    imageio.mimsave(out_directory, [np.array(img) for img in images], fps=fps)

In [None]:
def push_to_hub(repo_id, model, env, video_fps=1, local_repo_path="hub"):
    """
    Evaluate, Generate a video and Upload a model to Hugging Face Hub.
    This method does the complete pipeline:
    - It evaluates the model
    - It generates the model card
    - It generates a replay video of the agent
    - It pushes everything to the Hub

    :param repo_id: id of the model repository from the Hugging Face Hub,
    in "<namespace>/<repo_name>" form (it is split on "/")
    :param model: dict describing the agent. Keys read here: "env_id",
    "max_steps", "n_eval_episodes", "eval_seed" and "qtable". The dict is
    modified in place: "map_name" and "slippery" may be added before pickling.
    :param env: environment used for evaluation and for the replay video;
    its ``spec.kwargs`` are inspected for "map_name" and "is_slippery"
    :param video_fps: how many frame per seconds to record our video replay
    (with taxi-v3 and frozenlake-v1 we use 1)
    :param local_repo_path: currently unused; files are written into the
    directory returned by ``snapshot_download``
    """
    _, repo_name = repo_id.split("/")

    api = HfApi()

    # Step 1: Create the repo
    repo_url = api.create_repo(
        repo_id=repo_id,
        exist_ok=True,
    )

    # Step 2: Download files
    repo_local_path = Path(snapshot_download(repo_id=repo_id))

    # Step 3: Save the model
    if env.spec.kwargs.get("map_name"):
        model["map_name"] = env.spec.kwargs.get("map_name")
        if env.spec.kwargs.get("is_slippery", "") == False:
            model["slippery"] = False

    # Pickle the model
    with open(repo_local_path / "q-learning.pkl", "wb") as f:
        pickle.dump(model, f)

    # Step 4: Evaluate the model (once) and build JSON with evaluation metrics
    mean_reward, std_reward = evaluate_agent(
        env, model["max_steps"], model["n_eval_episodes"], model["qtable"], model["eval_seed"]
    )

    evaluate_data = {
        "env_id": model["env_id"],
        "mean_reward": mean_reward,
        "n_eval_episodes": model["n_eval_episodes"],
        "eval_datetime": datetime.datetime.now().isoformat(),
    }

    # Write a JSON file called "results.json" that will contain the
    # evaluation results
    with open(repo_local_path / "results.json", "w") as outfile:
        json.dump(evaluate_data, outfile)

    # Step 5: Create the model card
    # Take the environment id from the model being pushed rather than from a
    # notebook-level global, so the card always describes `model`.
    env_id = model["env_id"]
    env_name = env_id
    if env.spec.kwargs.get("map_name"):
        env_name += "-" + env.spec.kwargs.get("map_name")

    if env.spec.kwargs.get("is_slippery", "") == False:
        env_name += "-" + "no_slippery"

    metadata = {}
    metadata["tags"] = [env_name, "q-learning", "reinforcement-learning", "custom-implementation"]

    # Add metrics (named eval_metadata to avoid shadowing the builtin `eval`)
    eval_metadata = metadata_eval_result(
        model_pretty_name=repo_name,
        task_pretty_name="reinforcement-learning",
        task_id="reinforcement-learning",
        metrics_pretty_name="mean_reward",
        metrics_id="mean_reward",
        metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
        dataset_pretty_name=env_name,
        dataset_id=env_name,
    )

    # Merges both dictionaries
    metadata = {**metadata, **eval_metadata}

    # The usage snippet is fenced so its `#` comment is not rendered as a heading.
    model_card = f"""
  # **Q-Learning** Agent playing **{env_id}**
  This is a trained model of a **Q-Learning** agent playing **{env_id}** .

  ## Usage

  ```python
  model = load_from_hub(repo_id="{repo_id}", filename="q-learning.pkl")

  # Don't forget to check if you need to add additional attributes (is_slippery=False etc)
  env = gym.make(model["env_id"])
  ```
  """

    # Keep an existing README as-is; only fall back to the generated card
    # when the repository does not have one yet.
    readme_path = repo_local_path / "README.md"
    if readme_path.exists():
        with readme_path.open("r", encoding="utf8") as f:
            readme = f.read()
    else:
        readme = model_card

    with readme_path.open("w", encoding="utf-8") as f:
        f.write(readme)

    # Save our metrics to Readme metadata
    metadata_save(readme_path, metadata)

    # Step 6: Record a video
    video_path = repo_local_path / "replay.mp4"
    record_video(env, model["qtable"], video_path, video_fps)

    # Step 7. Push everything to the Hub
    api.upload_folder(
        repo_id=repo_id,
        folder_path=repo_local_path,
        path_in_repo=".",
    )

    print("Your model is pushed to the Hub. You can view your model here: ", repo_url)

In [None]:
model = {
    "env_id": env_id,
    "max_steps": max_steps,
    "n_training_episodes": n_training_episodes,
    "n_eval_episodes": n_eval_episodes,
    "eval_seed": eval_seed,
    "learning_rate": learning_rate,
    "gamma": gamma,
    "max_epsilon": max_epsilon,
    "min_epsilon": min_epsilon,
    "decay_rate": decay_rate,
    "qtable": QTable_frozenLake,
}

In [None]:
username = "JaiSurya"  # FILL THIS
repo_name = "q-FrozenLake-v1-4x4-noSlippery"
push_to_hub(repo_id=f"{username}/{repo_name}", model=model, env=env)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

False


q-learning.pkl:   0%|          | 0.00/916 [00:00<?, ?B/s]

Your model is pushed to the Hub. You can view your model here:  https://huggingface.co/JaiSurya/q-FrozenLake-v1-4x4-noSlippery
