In [77]:
import sys
import gymnasium as gym
import numpy as np
from collections import defaultdict
import time # to get the time

env = gym.make('MountainCar-v0',render_mode="rgb_array")

In [78]:
num_episodes = 60000
alpha = 0.15
epsilon = 0.15
nA = env.action_space.n
np.random.seed(3)
Observation = [100,100]
Q = np.random.uniform(low=0, high=1, size=(Observation + [nA]))
Q.shape

(100, 100, 3)

In [79]:
def epsilon_greedy(Q, state, nA, epsilon):
    if np.random.random() > epsilon:
        return np.argmax(Q[state])
    else:
        return np.random.choice(nA)

In [80]:
def get_discrete_state(state, n_bins=(100, 100)):
    """
    Convert the continuous state values to discrete values.

    Parameters:
        state (np.ndarray): The current state of the environment.
        n_bins (tuple): The number of bins to use for each state variable.

    Returns:
        tuple: The discrete state representation.
    """
    # Define the bounds for each state variable
    bounds = [
        [-1.2, 0.5],  # cart position
        [-0.07, 0.07],  # cart velocity
    ]
    
    # Calculate the bin width for each state variable
    bin_widths = [(bounds[i][1] - bounds[i][0]) / n_bins[i] for i in range(len(bounds))]
    # Convert each state variable to a discrete value
    discrete_state = tuple(int((state[i] - bounds[i][0]) / bin_widths[i]) for i in range(len(bounds)))
    
    # Make sure the discrete state is within the bounds of the Q-table
    for i in range(len(bounds)):
        if discrete_state[i] < 0:
            discrete_state = list(discrete_state)
            discrete_state[i] = 0
            discrete_state = tuple(discrete_state)
        elif discrete_state[i] >= n_bins[i]:
            discrete_state = list(discrete_state)
            discrete_state[i] = n_bins[i] - 1
            discrete_state = tuple(discrete_state)
    
    return discrete_state

In [81]:
# MC algorithm
total_reward = 0
total_time = 0
max_time = 0
min_time = 1e+20

# loop over episodes
for i_episode in range(1, num_episodes+1):
    episode_reward = 0

    # total = 0
    # episode = []

    t0 = time.time() #set the initial time
    state = env.reset(seed=32)[0]
    d_state = get_discrete_state(state)
    action = epsilon_greedy(Q, d_state, nA, epsilon)
    terminated, truncated = False,False
    n = 1
    while not (terminated or truncated):
        next_state, reward, terminated, truncated, info = env.step(action)
        if abs(state[1]) >= abs(next_state[1]):
            reward = -10
        if next_state[0] >= 0.5:
            reward = 50
        next_state = get_discrete_state(next_state)
        next_action = epsilon_greedy(Q, next_state, nA, epsilon)
        episode_reward += reward
        Q[d_state][action] += (1/n)*(episode_reward-Q[d_state][action])
        d_state = next_state
        action = next_action
        n += 1

    t1 = time.time() #episode has finished
    
    episode_time = t1 - t0 #episode total time
    if episode_time > max_time:
        max_time = episode_time
    if episode_time < min_time:
        min_time = episode_time
    total_time += episode_time
    total_reward += episode_reward #episode total reward
    # monitor progress
    if i_episode % 10 == 0:
        print(f"\rEpisode {i_episode}/{num_episodes}.", end="")
        sys.stdout.flush()


Episode 60000/60000.

In [82]:
mean_reward = total_reward / num_episodes
mean_time = total_time / num_episodes

print("Mean Reward: " + str(mean_reward))
print("Time Average: " + str(mean_time))
print("Biggist ep time: " + str(max_time))
print("smallest ep time: " + str(min_time))

Mean Reward: -184.07355
Time Average: 0.014679027791817983
Biggist ep time: 0.09396934509277344
smallest ep time: 0.00798487663269043


In [83]:
env = gym.make('MountainCar-v0',render_mode="rgb_array_list")

In [91]:
from gym.utils.save_video import save_video
# Evaluate learned policy
state = env.reset(seed=32)
state = state[0]
state = get_discrete_state(state)
terminated, truncated = False,False
counter = 0
while not (terminated or truncated):
    action = np.argmax(Q[state])
    next_state, reward, terminated, truncated, info = env.step(action)
    next_state = get_discrete_state(next_state)
    state = next_state
    counter += 1
    

print("number of action: " + str(counter))

save_video(
  env.render(),
  "videos",
  fps=35,
  episode_index=0
)

number of action: 194
Moviepy - Building video c:\Users\marwa\Desktop\task3\videos/rl-video-episode-0.mp4.
Moviepy - Writing video c:\Users\marwa\Desktop\task3\videos/rl-video-episode-0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready c:\Users\marwa\Desktop\task3\videos/rl-video-episode-0.mp4
