### IMPORTS

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
from skimage.color import rgb2gray
from skimage.transform import resize
import random
from collections import deque

2024-07-11 14:38:16.655816: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### SETUP

In [12]:
seed = 777
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.1
epsilon_max = 1.0
epsilon_interval = epsilon_max - epsilon_min
batch_size = 32
max_steps_per_episode = 5000
target_update_freq = 5000  # More frequent updates for better monitoring
replay_buffer_size = 100000
epsilon_random_frames = 50000
epsilon_greedy_frames = 1000000.0
update_after_actions = 4
checkpoint_freq = 10  # Save model every 10 episodes

In [13]:
# Wrappers for environment

# Because Car Racing has continuous actions (steer, gas, brake) this ensures action space is noramilzed 
class NormalizeActionWrapper(gym.ActionWrapper):
    def action(self, action):
        action = np.clip(action, -1, 1)
        return action

# Sets RGB channels to grayscale (1 v 3 channels) and reseizes from 96x96 to 84x84 for faster processing
class ResizeAndGrayscaleWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super(ResizeAndGrayscaleWrapper, self).__init__(env)
        from gym.spaces import Box
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        
    def observation(self, observation):
        gray_observation = rgb2gray(observation)
        resized_observation = resize(gray_observation, (84, 84), anti_aliasing=True)
        return np.expand_dims(resized_observation, axis=-1)

def make_carracing_env(seed):
    env = gym.make("CarRacing-v2", render_mode='rgb_array')
    env = ResizeAndGrayscaleWrapper(env)
    env = NormalizeActionWrapper(env)
    env.reset(seed=seed)
    return env

# Create the environment
env = make_carracing_env(seed)

num_actions = env.action_space.shape[0]

In [14]:
# Create Model
def create_q_model():
    # Network defined by the Deepmind paper
    inputs = layers.Input(shape=(84, 84, 1))

    # Convolutions on the frames on the screen
    layer1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs)
    layer2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer1)
    layer3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer2)

    layer4 = layers.Flatten()(layer3)

    layer5 = layers.Dense(512, activation="relu")(layer4)
    action = layers.Dense(num_actions, activation="linear")(layer5)

    return keras.Model(inputs=inputs, outputs=action)


# The first model makes the predictions for Q-values which are used to make an action.
model = create_q_model()
# Build target model for the prediction of future rewards.
# The weights of target model get updated every 10000 steps thus when the loss between the Q-values is calculated the target Q-value is stable.
model_target = create_q_model()
model_target.set_weights(model.get_weights())  # Initialize target model with same weights as model



In [15]:
# Define optimizer and loss function
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
loss_function = keras.losses.Huber()

# Replay buffer
replay_buffer = deque(maxlen=replay_buffer_size)

# Training step
def train_step(states, actions, rewards, next_states, dones):
    with tf.GradientTape() as tape:
        q_values = model(states, training=True)
        
        # Get the predicted Q-values for the taken actions
        q_values_for_actions = tf.reduce_sum(actions * q_values, axis=1)

        next_q_values = model_target(next_states, training=False)
        max_next_q_values = tf.reduce_max(next_q_values, axis=1)
        target_q_values = rewards + (1.0 - dones) * gamma * max_next_q_values

        loss = loss_function(target_q_values, q_values_for_actions)

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

# Policy function for selecting actions
def epsilon_greedy_policy(state, epsilon):
    if np.random.rand() < epsilon:
        return env.action_space.sample()  # Sample continuous actions from the action space
    else:
        state = np.expand_dims(state, axis=0)
        q_values = model.predict(state)
        return q_values[0]  # Return the continuous action vector


# Training loop
num_episodes = 1000
episode_reward_history = []
running_reward = 0
frame_count = 0

for episode in range(num_episodes):
    state, _ = env.reset()
    state = np.array(state)
    episode_reward = 0

    for step in range(max_steps_per_episode):
        frame_count += 1

        # Use epsilon-greedy for exploration
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            action = env.action_space.sample()
        else:
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model(state_tensor, training=False)
            action = action_probs[0].numpy()

        # Decay probability of taking random action
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        # Apply the sampled action in our environment
        next_state, reward, done, _, _ = env.step(action)
        next_state = np.array(next_state)
        episode_reward += reward

        # Save actions and states in replay buffer
        replay_buffer.append((state, action, reward, next_state, done))
        state = next_state

        # Update every fourth frame and once batch size is over 32
        if frame_count % update_after_actions == 0 and len(replay_buffer) > batch_size:
            batch = random.sample(replay_buffer, batch_size)
            states, actions, rewards, next_states, dones = map(np.array, zip(*batch))

            train_step(states, actions, rewards, next_states, dones)

        if frame_count % target_update_freq == 0:
            model_target.set_weights(model.get_weights())
            print(f"Running reward: {running_reward:.2f} at episode {episode}, frame count {frame_count}")

        if len(replay_buffer) > replay_buffer_size:
            replay_buffer.popleft()

        if done:
            break

    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        episode_reward_history.pop(0)
    running_reward = np.mean(episode_reward_history)

    print(f"Episode {episode + 1}/{num_episodes} - Reward: {episode_reward} - Running Reward: {running_reward:.2f} - Epsilon: {epsilon:.2f}")

    # Save model every checkpoint_freq episodes
    if (episode + 1) % checkpoint_freq == 0:
        model.save(f"car_racing_model_episode_{episode + 1}.keras")
        model_target.save(f"car_racing_model_target_episode_{episode + 1}.keras")
        print(f"Checkpoint saved at episode {episode + 1}")

    if running_reward > 40:  # Condition to consider the task solved
        print(f"Solved at episode {episode}!")
        model.save(f"car_racing_model_solved.h5")
        model_target.save(f"car_racing_model_target_solved.h5")
        break

Running reward: 0.00 at episode 0, frame count 5000
Episode 1/1000 - Reward: -426.0563380281976 - Running Reward: -426.06 - Epsilon: 1.00
Running reward: -426.06 at episode 1, frame count 10000
Episode 2/1000 - Reward: -435.6913183280052 - Running Reward: -430.87 - Epsilon: 0.99
Running reward: -430.87 at episode 2, frame count 15000
Episode 3/1000 - Reward: -432.4758842444031 - Running Reward: -431.41 - Epsilon: 0.99
Running reward: -431.41 at episode 3, frame count 20000
Episode 4/1000 - Reward: -431.8181818182119 - Running Reward: -431.51 - Epsilon: 0.98
Episode 5/1000 - Reward: -420.600000000004 - Running Reward: -429.33 - Epsilon: 0.98
Running reward: -429.33 at episode 5, frame count 25000
Episode 6/1000 - Reward: -433.7349397590668 - Running Reward: -430.06 - Epsilon: 0.97
Running reward: -430.06 at episode 6, frame count 30000
Episode 7/1000 - Reward: -429.2929292929587 - Running Reward: -429.95 - Epsilon: 0.97
Running reward: -429.95 at episode 7, frame count 35000
Episode 8/1



Episode 10/1000 - Reward: -432.25806451615915 - Running Reward: -429.89 - Epsilon: 0.96
Checkpoint saved at episode 10
Running reward: -429.89 at episode 10, frame count 50000
Episode 11/1000 - Reward: -426.31578947371304 - Running Reward: -429.57 - Epsilon: 0.95
Running reward: -429.57 at episode 11, frame count 55000
Episode 12/1000 - Reward: -433.7539432176962 - Running Reward: -429.92 - Epsilon: 0.95
Running reward: -429.92 at episode 12, frame count 60000
Episode 13/1000 - Reward: -434.9845201238699 - Running Reward: -430.31 - Epsilon: 0.94
Running reward: -430.31 at episode 13, frame count 65000
Episode 14/1000 - Reward: -436.74698795183843 - Running Reward: -430.77 - Epsilon: 0.94
Running reward: -430.77 at episode 14, frame count 70000
Episode 15/1000 - Reward: -429.05405405408357 - Running Reward: -430.65 - Epsilon: 0.93
Running reward: -430.65 at episode 15, frame count 75000
Episode 16/1000 - Reward: -421.05263157897497 - Running Reward: -430.05 - Epsilon: 0.93
Running rewar

KeyboardInterrupt: 

In [5]:
# Video recording function
def record_video(env, policy, filename, episode_length=1000):
    wrapped_env = RecordVideo(env, video_folder='videos', episode_trigger=lambda x: True, name_prefix=filename)
    done = False
    state = wrapped_env.reset()
    state = np.array(state)
    for _ in range(episode_length):
        wrapped_env.render()
        action = policy(state)
        state, reward, done, _, _ = wrapped_env.step(action)
        state = np.array(state)
        if done:
            break
    wrapped_env.close()



In [6]:
# Sample policy (random actions for now, replace with your policy)
def trained_policy(state):
    state = np.expand_dims(state, axis=0)
    q_values = model.predict(state)
    return np.argmax(q_values[0])

record_video(env, trained_policy, 'last_episode')

  logger.warn(


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.