In [27]:
import gymnax
import gymnasium as gym
import jax
import numpy as np
import random
import matplotlib.pyplot as plt

# Q-Learning

In [28]:
rng = jax.random.PRNGKey(0)
rng, key_reset, key_policy, key_step = jax.random.split(rng, 4)
env = gym.make("MountainCar-v0")

In [29]:
# Get the observation space
observation_space = env.observation_space
# Print the min and max values for the observation space
print("Observation space low values:", observation_space.low)
print("Observation space high values:", observation_space.high)

Observation space low values: [-1.2  -0.07]
Observation space high values: [0.6  0.07]


In [30]:
action_space = env.action_space
print("Number of actions:", action_space.n)

Number of actions: 3


In [31]:
print(action_space)
print(observation_space)

Discrete(3)
Box([-1.2  -0.07], [0.6  0.07], (2,), float32)


In [32]:
num_bins = 100
position_bins = np.linspace(env.observation_space.low[0], env.observation_space.high[0], num_bins)
velocity_bins = np.linspace(env.observation_space.low[1], env.observation_space.high[1], num_bins)

#print(position_bins)
#print(velocity_bins)

In [33]:
# Q-table initialization
num_actions = env.action_space.n
q_table = np.zeros((num_bins, num_bins, num_actions))

In [34]:
def discretize_state(state):
    position, velocity = state
    position_bin = np.digitize(position, position_bins) - 1  
    velocity_bin = np.digitize(velocity, velocity_bins) - 1
    position_bin = np.clip(position_bin, 0, num_bins - 1)
    velocity_bin = np.clip(velocity_bin, 0, num_bins - 1)
    return position_bin, velocity_bin

In [35]:
def epsilon_greedy(state, epsilon):
    if random.uniform(0, 1) < epsilon:
        return random.choice(range(num_actions))
    else:
        position_bin, velocity_bin = state
        return np.argmax(q_values_table[position_bin, velocity_bin])

In [46]:
def train_q_learning(env, num_bins, num_episodes, num_steps, alpha, gamma, epsilon, epsilon_decay, min_epsilon):
    # Initialize Q-table
    num_actions = env.action_space.n
    q_values_table = np.zeros((num_bins, num_bins, num_actions))
    
    # Tracking performance
    episode_rewards = []
    episode_lengths = []

    for i in range(num_episodes):
        # Initialize the state by resetting the environment
        state, _ = env.reset()
    
        # Discretize the state
        discretized_state = discretize_state(state)
    
        total_reward = 0
        for t in range(num_steps):
            # Select the action using the epsilon-greedy policy
            action = epsilon_greedy(discretized_state, epsilon)
    
            # Perform the selected action and store the next state information
            next_state, reward, done, _, _ = env.step(action)
    
            # Discretize the next state
            discretized_next_state = discretize_state(next_state)
    
            # Find the action a' with the maximum Q-value in the next state
            next_action = np.argmax(q_values_table[discretized_next_state])
    
            # Update Q-value of the state-action pair using the Q-learning update rule
            q_values_table[discretized_state[0], discretized_state[1], action] += alpha * (
                reward + gamma * q_values_table[discretized_next_state[0], discretized_next_state[1], next_action] - q_values_table[discretized_state[0], discretized_state[1], action]
            )
    
            # Update current state to next state
            discretized_state = discretized_next_state
    
            # Track total reward for the episode
            total_reward += reward
    
            # If the current state is the terminal state, break
            if done:
                break
        
        # Decay epsilon
        epsilon = max(min_epsilon, epsilon * epsilon_decay)

        # Track performance
        episode_rewards.append(total_reward)
        episode_lengths.append(t)

        #if (i + 1) % 100 == 0:
        #    print(f"Episode {i + 1}: Total Reward: {total_reward}, Epsilon: {epsilon:.2f}")

    return episode_rewards, episode_lengths

In [49]:
import random

# Define ranges for hyperparameters
alpha_range = (0.01, 0.15)
gamma_range = (0.8, 0.99)
epsilon_decay_range = (0.999, 0.9999)
min_epsilon_range = (0.05, 0.1)

num_trials = 30  # Number of random samples to test
all_results = []

for _ in range(num_trials):
    # Randomly sample hyperparameters
    alpha = random.uniform(*alpha_range)
    gamma = random.uniform(*gamma_range)
    epsilon_decay = random.uniform(*epsilon_decay_range)
    min_epsilon = random.uniform(*min_epsilon_range)

    # Train with sampled hyperparameters
    episode_rewards, episode_lengths = train_q_learning(
        env, num_bins, num_episodes=5000, num_steps=400,  # Reduce episodes for faster testing
        alpha=alpha, gamma=gamma, epsilon=1.0,
        epsilon_decay=epsilon_decay, min_epsilon=min_epsilon
    )

    # Record the results
    avg_reward = sum(episode_rewards[-100:]) / 100  # Average reward over last 100 episodes
    all_results.append((avg_reward, alpha, gamma, epsilon_decay, min_epsilon))
    print("Done")

# Find the best parameters
best_result = max(all_results, key=lambda x: x[0])
print("Best Hyperparameters:", best_result[1:])
print("Best Average Reward:", best_result[0])


Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Done
Best Hyperparameters: (0.13333875294963685, 0.8609157583736841, 0.9992401308627211, 0.07413601675463205)
Best Average Reward: -311.87


In [47]:
# Q-Learning Algorithm
num_episodes = 10000
num_steps = 400  # Max steps per episode
epsilon = 1.0  # Initial epsilon (exploration probability)

alpha_values = [0.01, 0.1, 0.15]  # Learning rate
gamma_values = [0.8, 0.9, 0.99]   # Discount factor
epsilon_decay_values = [0.999, 0.9995, 0.9999]  # Epsilon decay
min_epsilon_values = [0.05, 0.1] 

all_episode_reward_list = []
all_episode_lengths_list = []

for alpha in alpha_values:
    for gamma in gamma_values:
        for epsilon_decay in epsilon_decay_values:
            for min_epsilon in min_epsilon_values:
                episode_reward_list, episode_lengths_list = train_q_learning(env,num_bins,num_episodes,num_steps,alpha,gamma,epsilon,epsilon_decay,min_epsilon)
                all_episode_reward_list.append(episode_reward_list)
                all_episode_lengths_list.append(episode_lengths_list)

KeyboardInterrupt: 

# Deep Q-Network (dqn)

# Soft Actor-Critic (sac)