<a href="https://colab.research.google.com/github/Khalid-2402/big-data/blob/main/018_MC_Frozenlake.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Every-visit Monte Carlo control with a tabular Q-function (constant step-size updates) on FrozenLake

In [3]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0


In [5]:
!pip install session_info

Collecting session_info
  Downloading session_info-1.0.0.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting stdlib_list (from session_info)
  Downloading stdlib_list-0.11.0-py3-none-any.whl.metadata (3.3 kB)
Downloading stdlib_list-0.11.0-py3-none-any.whl (83 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.6/83.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: session_info
  Building wheel for session_info (setup.py) ... [?25l[?25hdone
  Created wheel for session_info: filename=session_info-1.0.0-py3-none-any.whl size=8023 sha256=2b986061047bd001a36110552b5966b4f3045944fb8f433b47ac558594ccdc13
  Stored in directory: /root/.cache/pip/wheels/6a/aa/b9/eb5d4031476ec10802795b97ccf937b9bd998d68a9b268765a
Successfully built session_info
Installing collected packages: stdlib_list, session_info
Successfully installed session_info-1.0.0 stdlib_list-0.11.0


In [6]:
import numpy as np
import gymnasium as gym
import random
import session_info

In [7]:
# --- Legacy settings ---------------------------------------------------------
# No visible cell in this notebook reads these names; they are kept only so
# that anything outside this view that still refers to them keeps working.
current_epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.001
decay_rate = 0.0001
Reward_list = []


# --- Hyperparameters used by the training and evaluation cells ---------------
# NOTE: an earlier `n_episodes = 500000` assignment used to sit at the top of
# this cell. It was a dead store (overwritten by the value below before any
# use) and has been dropped so the effective episode count is unambiguous.
n_episodes = 50000  # Number of training episodes
max_steps = 100  # Upper bound on steps taken within a single episode
gamma = 0.99  # Discount factor
alpha = 0.1  # Learning rate (constant step size of the Q update)
epsilon_max = 1.0  # Exploration rate at the first episode
epsilon_min = 0.01  # Lower bound for the exploration rate
epsilon_decay = 0.001  # Not read by the visible training loop, which anneals epsilon linearly

In [8]:
def print_policy(Q, env, cols, rows):
    """Print the greedy policy stored in ``Q`` as a grid of symbols.

    Each state ``s`` is drawn at row ``s // cols``, column ``s % cols``:

    * an arrow for the arg-max action of ``Q[s]`` (index 0..3 maps to
      left, down, right, up),
    * ``'o'`` when every action value of that state is exactly zero,
    * ``'S'``, ``'G'`` or ``'H'`` when the map description marks the tile
      with that letter (these override the arrow / ``'o'``).

    Parameters
    ----------
    Q : indexable by state, each entry holding the per-action values.
    env : object exposing ``observation_space.n`` and ``unwrapped.desc``
        (a ``rows`` x ``cols`` grid of single-byte tile codes).
    cols, rows : int
        Grid width and height.

    Returns
    -------
    None. The grid is written to standard output.
    """
    def action_to_symbol(action):
        return ['←', '↓', '→', '↑'][action]

    policy = np.zeros((rows, cols), dtype=str)
    for state in range(env.observation_space.n):
        # A state is "unlearned" only if *all* of its action values are zero.
        # Testing the sum instead would also flag rows whose positive and
        # negative values happen to cancel out.
        if not np.any(Q[state]):
            policy[state // cols, state % cols] = 'o'
        else:
            best_action = np.argmax(Q[state])
            policy[state // cols, state % cols] = action_to_symbol(best_action)

    # Mark special positions straight from the map description. The start
    # tile is looked up like goal and holes rather than assumed to be at
    # cell (0, 0), so maps with a different start location render correctly.
    desc = env.unwrapped.desc
    for i in range(rows):
        for j in range(cols):
            if desc[i][j] in b'SGH':
                policy[i, j] = desc[i][j].decode('utf-8')

    print("=== Learned Policy ===")
    print()
    for row in policy:
        print(' '.join(row))


In [9]:
# Build the 4x4 FrozenLake environment with slipping disabled
env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=False)

# Q holds one action-value per (state, action) pair; N has the same shape and
# is used by the training loop to count visits. Both start at zero.
Q = np.zeros((env.observation_space.n, env.action_space.n))
N = np.zeros_like(Q)

def epsilon_greedy_policy(state, epsilon):
    """Pick an action for ``state``, trading off exploration and exploitation.

    A uniform random number is drawn; if it falls below ``epsilon`` an action
    is sampled from ``env.action_space`` (exploration), otherwise the index of
    the largest value in ``Q[state]`` is returned (exploitation).
    """
    explore = np.random.random() < epsilon
    if not explore:
        return np.argmax(Q[state])
    return env.action_space.sample()

# Undiscounted reward collected in each training episode, in episode order
total_rewards = []

for episode in range(n_episodes):
    state, _ = env.reset()
    # Linearly anneal epsilon from epsilon_max towards epsilon_min over the run
    epsilon = max(epsilon_min, epsilon_max - (epsilon_max - epsilon_min) * (episode / n_episodes))

    # Trajectory buffers: index t holds the state seen, the action taken and
    # the reward received at step t of this episode
    episode_states = []
    episode_actions = []
    episode_rewards = []

    for step in range(max_steps):
        action = epsilon_greedy_policy(state, epsilon)

        # step() returns (observation, reward, terminated, truncated, info).
        # The episode is over when EITHER flag is set; looking only at
        # `terminated` would keep stepping an environment that has been
        # truncated (e.g. by a time limit) and needs a reset.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)

        if done:
            break

        state = next_state

    # Calculate returns and update Q-table.
    # Walking the trajectory backwards lets G accumulate the discounted return
    # from step t onwards in a single pass. Every occurrence of a
    # (state, action) pair is updated (every-visit), using the constant step
    # size alpha; N only counts the visits.
    G = 0
    for t in range(len(episode_states) - 1, -1, -1):
        state = episode_states[t]
        action = episode_actions[t]
        G = gamma * G + episode_rewards[t]

        N[state, action] += 1
        Q[state, action] += (alpha * (G - Q[state, action]))

    total_rewards.append(sum(episode_rewards))

    # Print progress
    if (episode + 1) % 1000 == 0:
        avg_reward = np.mean(total_rewards[-1000:])
        print(f"Episode: {episode + 1}, Average Reward (last 1000 episodes): {avg_reward:.2f}")

print("Training completed.")



Episode: 1000, Average Reward (last 1000 episodes): 0.01
Episode: 2000, Average Reward (last 1000 episodes): 0.02
Episode: 3000, Average Reward (last 1000 episodes): 0.02
Episode: 4000, Average Reward (last 1000 episodes): 0.03
Episode: 5000, Average Reward (last 1000 episodes): 0.05
Episode: 6000, Average Reward (last 1000 episodes): 0.03
Episode: 7000, Average Reward (last 1000 episodes): 0.05
Episode: 8000, Average Reward (last 1000 episodes): 0.05
Episode: 9000, Average Reward (last 1000 episodes): 0.06
Episode: 10000, Average Reward (last 1000 episodes): 0.08
Episode: 11000, Average Reward (last 1000 episodes): 0.09
Episode: 12000, Average Reward (last 1000 episodes): 0.10
Episode: 13000, Average Reward (last 1000 episodes): 0.10
Episode: 14000, Average Reward (last 1000 episodes): 0.14
Episode: 15000, Average Reward (last 1000 episodes): 0.11
Episode: 16000, Average Reward (last 1000 episodes): 0.16
Episode: 17000, Average Reward (last 1000 episodes): 0.16
Episode: 18000, Average

In [10]:
# Test the learned policy: run purely greedy episodes (no exploration) and
# report the mean undiscounted reward per episode.
n_test_episodes = 100
test_rewards = []

for _ in range(n_test_episodes):
    state, _ = env.reset()
    episode_reward = 0

    for _ in range(max_steps):
        action = np.argmax(Q[state])
        # step() returns (observation, reward, terminated, truncated, info);
        # either flag ends the episode, so both must be checked before
        # stepping again.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward
        if done:
            break
        state = next_state

    test_rewards.append(episode_reward)

print(f"Average reward over {n_test_episodes} test episodes: {np.mean(test_rewards):.2f}")

Average reward over 100 test episodes: 1.00


In [11]:
# Render the greedy policy on the 4x4 grid; the grid size is passed by keyword
# because print_policy takes (cols, rows) in that order.
print_policy(Q, env, cols=4, rows=4)

=== Learned Policy ===

S ← ↓ ↑
↓ H ↓ H
→ → ↓ H
H → → G


In [12]:
# Print the versions of the imported packages, the Jupyter stack and the
# interpreter (html=False requests the non-HTML report) for reproducibility.
session_info.show(html=False)

-----
gymnasium           1.0.0
numpy               1.26.4
session_info        1.0.0
-----
IPython             7.34.0
jupyter_client      6.1.12
jupyter_core        5.7.2
notebook            6.5.5
-----
Python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]
Linux-6.1.85+-x86_64-with-glibc2.35
-----
Session information updated at 2024-10-31 10:55
