In [13]:
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, Dense, Flatten, Lambda, Add, Concatenate, Layer
from tensorflow.keras.optimizers import Adam

import gymnasium as gym
import ale_py
gym.register_envs(ale_py)

from collections import deque
import cv2
from tqdm import tqdm
import matplotlib.pyplot as plt

# Seed the Python, NumPy and TensorFlow global RNGs for reproducibility.
# Python's `random` drives the epsilon-greedy draws and replay sampling below.
# NOTE(review): the environment is never seeded in this notebook (env.reset()
# is called without a seed), so runs are presumably not fully reproducible.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [18]:
# Hyperparameters and run configuration

# --- Environment / observation ---
ENV_NAME = "ALE/Breakout-v5"
FRAMES_IN_STATE = 3       # consecutive frames stacked into one state
FRAME_SIZE = (84, 84)     # size of each frame after preprocessing

# --- Learning ---
BATCH_SIZE = 32
REPLAY_BUFFER_SIZE = 100_000
GAMMA = 0.99              # discount factor
LEARNING_RATE = 2.5e-4

# --- Exploration (epsilon-greedy) ---
EPSILON_START = 1.0
EPSILON_END = 0.1
EPSILON_DECAY_STEPS = 1_000_000

# --- Schedule (all measured in environment steps) ---
TARGET_UPDATE_FREQ = 10_000   # steps between target network updates
TRAIN_FREQ = 4                # steps between training updates
SAVE_MODEL_FREQ = 50_000      # steps between model checkpoints
TOTAL_STEPS = 5_000_000       # total steps to train for
REPLAY_START_SIZE = 50_000    # steps collected before training starts
PLOT_FREQUENCY = 10_000       # steps between progress plots

# --- Output ---
SAVE_DIR = "breakout_dqn_model"

class PreprocessAtari:
    """Preprocess raw Atari frames into small, normalized grayscale images.

    Args:
        frame_size: Target size passed to ``cv2.resize`` for each frame.
    """
    def __init__(self, frame_size=FRAME_SIZE):
        self.frame_size = frame_size
        
    def process(self, frame):
        """Convert RGB to grayscale, crop, resize and scale to floats.

        Args:
            frame: RGB image array as returned by the environment
                (assumed 8-bit, 210 rows tall -- TODO confirm for other games).

        Returns:
            A float32 array of size ``frame_size`` with pixel values divided
            by 255.
        """
        # Convert to grayscale
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        
        # Crop the frame (remove the score at the top): keep rows 34..193
        cropped = gray[34:194, :]
        
        # Resize the frame
        resized = cv2.resize(cropped, self.frame_size, interpolation=cv2.INTER_AREA)
        
        # Normalize pixel values in float32 rather than float64.  Every stacked
        # state built from these frames is kept alive by the replay buffer, so
        # float64 frames double its memory footprint for no benefit: the buffer
        # and the network both cast to float32 before using the data.
        normalized = resized.astype(np.float32) / np.float32(255.0)
        
        return normalized

class FrameStack:
    """Keeps the most recent preprocessed frames and exposes them as one state.

    Args:
        num_frames: How many frames make up a state (also the deque's maxlen).
        frame_size: Size forwarded to ``PreprocessAtari`` for every frame.
    """

    def __init__(self, num_frames=FRAMES_IN_STATE, frame_size=FRAME_SIZE):
        self.num_frames = num_frames
        self.frame_size = frame_size
        # Bounded deque: appending a new frame silently drops the oldest one.
        self.frames = deque(maxlen=num_frames)

    def reset(self, initial_frame):
        """Start a new episode: fill the whole stack with the first frame.

        Returns the stacked state built from ``num_frames`` copies of the
        preprocessed ``initial_frame``.
        """
        self.frames.clear()
        first = PreprocessAtari(self.frame_size).process(initial_frame)
        self.frames.extend(first for _ in range(self.num_frames))
        return self.get_state()

    def add_frame(self, frame):
        """Preprocess ``frame``, push it onto the stack and return the new state."""
        self.frames.append(PreprocessAtari(self.frame_size).process(frame))
        return self.get_state()

    def get_state(self):
        """Return the held frames stacked along a new last axis."""
        return np.stack(tuple(self.frames), axis=-1)

class ReplayBuffer:
    """Fixed-capacity store of transitions with uniform random sampling.

    Args:
        capacity: Maximum number of transitions kept; once full, the oldest
            transition is dropped for each new one (deque ``maxlen``).
    """

    def __init__(self, capacity=REPLAY_BUFFER_SIZE):
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def add(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
            arrays with dtypes float32, int32, float32, float32 and bool.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into five per-field columns.
        states, actions, rewards, next_states, dones = tuple(zip(*picked))
        states_arr = np.array(states, dtype=np.float32)
        actions_arr = np.array(actions, dtype=np.int32)
        rewards_arr = np.array(rewards, dtype=np.float32)
        next_states_arr = np.array(next_states, dtype=np.float32)
        dones_arr = np.array(dones, dtype=np.bool_)
        return (states_arr, actions_arr, rewards_arr, next_states_arr, dones_arr)

class DuelingLayer(Layer):
    """Keras layer that turns shared features into dueling-style Q-values.

    Two heads are applied to the same input: a value head (Dense 256 -> 1)
    and an advantage head (Dense 256 -> ``num_actions``).  The output is
    ``value + (advantage - mean(advantage))``.

    Args:
        num_actions: Number of outputs of the advantage head.
        **kwargs: Forwarded to ``Layer.__init__``.
    """

    def __init__(self, num_actions, **kwargs):
        super().__init__(**kwargs)
        self.num_actions = num_actions
        # Hidden layers of the two streams.
        self.dense_value = Dense(256, activation='relu')
        self.dense_advantage = Dense(256, activation='relu')
        # Linear output heads.
        self.value = Dense(1)
        self.advantage = Dense(num_actions)

    def call(self, inputs):
        """Compute Q-values for ``inputs`` (one row per sample)."""
        # Value stream: a single number per sample.
        state_value = self.value(self.dense_value(inputs))

        # Advantage stream: one number per action.
        action_advantage = self.advantage(self.dense_advantage(inputs))

        # Centre the advantages over the action axis before adding the value.
        centred = action_advantage - tf.reduce_mean(action_advantage, axis=1, keepdims=True)
        return state_value + centred

def build_dueling_dqn(input_shape, num_actions):
    """Assemble the convolutional dueling Q-network.

    Args:
        input_shape: Shape of one state, without the batch dimension.
        num_actions: Number of Q-values the network outputs.

    Returns:
        An uncompiled Keras ``Model`` mapping states to Q-values.
    """
    inputs = Input(shape=input_shape)

    # Convolutional trunk as (filters, kernel side, stride) triples.
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))
    x = inputs
    for filters, kernel, stride in conv_specs:
        x = Conv2D(filters, (kernel, kernel), strides=stride, activation='relu')(x)

    # Shared fully connected features feeding both dueling streams.
    features = Dense(512, activation='relu')(Flatten()(x))

    # Value/advantage heads combined into Q-values.
    q_values = DuelingLayer(num_actions)(features)

    return Model(inputs=inputs, outputs=q_values)

class DoubleDuelingDQNAgent:
    """Agent implementing Double Dueling DQN.

    Holds an online ("main") network and a target network built by
    ``build_dueling_dqn``, a replay buffer, the epsilon-greedy schedule and
    the metric lists used for plotting.

    Args:
        state_shape: Shape of a single state (no batch dimension).
        num_actions: Number of discrete actions.
    """
    def __init__(self, state_shape, num_actions):
        self.state_shape = state_shape
        self.num_actions = num_actions
        
        # Create main and target networks (target starts as a copy of main)
        self.main_network = build_dueling_dqn(state_shape, num_actions)
        self.target_network = build_dueling_dqn(state_shape, num_actions)
        self.target_network.set_weights(self.main_network.get_weights())
        
        # Compile only to attach the optimizer; the loss is computed by hand in train()
        self.main_network.compile(optimizer=Adam(learning_rate=LEARNING_RATE))
        
        # Create replay buffer
        self.replay_buffer = ReplayBuffer()
        
        # Exploration parameters: linear decay from EPSILON_START to EPSILON_END
        self.epsilon = EPSILON_START
        self.epsilon_decay = (EPSILON_START - EPSILON_END) / EPSILON_DECAY_STEPS
        self.epsilon_min = EPSILON_END
        
        # Tracking variables
        self.step_count = 0
        self.training_step = 0
        
        # Create directory for saving models (no error if it already exists)
        os.makedirs(SAVE_DIR, exist_ok=True)
            
        # Metrics
        self.rewards_history = []
        self.episode_lengths = []
        self.episode_count = 0
        self.running_reward = 0
        self.best_reward = float('-inf')
        self.losses = []
        
    def get_action(self, state, training=True):
        """Get an action using epsilon-greedy policy.

        Args:
            state: A single state array of shape ``state_shape``.
            training: When True, a random action is returned with probability
                ``self.epsilon``; when False the action is always greedy.

        Returns:
            Index of the chosen action.
        """
        if training and random.random() < self.epsilon:
            return random.randrange(self.num_actions)
        
        # Call the model directly instead of Model.predict(): predict() is meant
        # for large batched inference and is very slow for a single state.
        state_batch = np.expand_dims(np.asarray(state, dtype=np.float32), axis=0)
        q_values = self.main_network(state_batch, training=False).numpy()[0]
        return np.argmax(q_values)
    
    def update_epsilon(self):
        """Decay epsilon by one linear step, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)
        
    def train(self, batch):
        """Run one gradient step on a batch of experiences.

        Args:
            batch: Tuple ``(states, actions, rewards, next_states, dones)`` as
                returned by ``ReplayBuffer.sample``.

        Returns:
            The mean squared TD error of the batch.
        """
        states, actions, rewards, next_states, dones = batch
        states = np.asarray(states, dtype=np.float32)
        next_states = np.asarray(next_states, dtype=np.float32)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)
        
        # Double DQN: the main network chooses the next action ...
        # (direct calls avoid the per-call overhead of Model.predict on small batches)
        online_next_q = self.main_network(next_states, training=False).numpy()
        next_actions = np.argmax(online_next_q, axis=1)
        
        # ... and the target network evaluates it.
        next_q_values = self.target_network(next_states, training=False).numpy()
        bootstrap = next_q_values[np.arange(len(rewards)), next_actions]
        
        # Terminal transitions use the reward alone; others add the discounted bootstrap.
        target_q_values = np.where(
            dones, rewards, rewards + GAMMA * bootstrap
        ).astype(np.float32)
        
        # Create a mask for the actions that were taken
        masks = tf.one_hot(actions, self.num_actions)
        
        with tf.GradientTape() as tape:
            q_values = self.main_network(states)
            
            # Apply the masks to get the Q-values for the actions taken
            q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
            
            # Calculate loss
            loss = tf.reduce_mean(tf.square(target_q_values - q_action))
            
        # Backpropagation
        grads = tape.gradient(loss, self.main_network.trainable_variables)
        self.main_network.optimizer.apply_gradients(zip(grads, self.main_network.trainable_variables))
        
        return loss.numpy()
    
    def update_target_network(self):
        """Copy the main network's weights into the target network."""
        self.target_network.set_weights(self.main_network.get_weights())
        
    def save_model(self, name="model"):
        """Save the main network's weights to ``SAVE_DIR/<name>.weights.h5``."""
        self.main_network.save_weights(f"{SAVE_DIR}/{name}.weights.h5")
        
    def load_model(self, name="model"):
        """Load weights from ``SAVE_DIR/<name>.weights.h5`` into both networks."""
        self.main_network.load_weights(f"{SAVE_DIR}/{name}.weights.h5")
        self.target_network.set_weights(self.main_network.get_weights())
        
    def plot_metrics(self):
        """Plot rewards, episode lengths and (if any) losses to ``SAVE_DIR/metrics.png``."""
        plt.figure(figsize=(15, 5))
        
        plt.subplot(1, 3, 1)
        plt.plot(self.rewards_history)
        plt.title('Episode Rewards')
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        
        plt.subplot(1, 3, 2)
        plt.plot(self.episode_lengths)
        plt.title('Episode Lengths')
        plt.xlabel('Episode')
        plt.ylabel('Steps')
        
        # The loss panel is skipped until at least one training step has run
        if self.losses:
            plt.subplot(1, 3, 3)
            plt.plot(self.losses)
            plt.title('Training Loss')
            plt.xlabel('Training Step')
            plt.ylabel('Loss')
        
        plt.tight_layout()
        plt.savefig(f"{SAVE_DIR}/metrics.png")
        # Close the figure so repeated calls do not accumulate open figures
        plt.close()

def train_agent():
    """Train the agent on the Atari Breakout environment.

    Runs the interaction loop for ``TOTAL_STEPS`` environment steps, training
    every ``TRAIN_FREQ`` steps once ``REPLAY_START_SIZE`` steps have been
    collected, and periodically syncing the target network, checkpointing and
    plotting.

    Returns:
        The trained ``DoubleDuelingDQNAgent``.
    """
    env = gym.make(ENV_NAME)
    pbar = None
    
    # try/finally so the environment and progress bar are released even when
    # the run is interrupted or fails part-way (a multi-million step loop in a
    # notebook is routinely stopped by hand).
    try:
        state_shape = (*FRAME_SIZE, FRAMES_IN_STATE)
        num_actions = env.action_space.n
        
        agent = DoubleDuelingDQNAgent(state_shape, num_actions)
        frame_stack = FrameStack()
        
        # Variables for tracking
        episode_reward = 0
        episode_steps = 0
        state = frame_stack.reset(env.reset()[0])
        
        # Progress bar
        pbar = tqdm(total=TOTAL_STEPS, desc="Training")
        
        # Main training loop
        while agent.step_count < TOTAL_STEPS:
            # Get action
            action = agent.get_action(state)
            
            # Take step in environment
            next_frame, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            
            # Process next state
            next_state = frame_stack.add_frame(next_frame)
            
            # Store in replay buffer.  Only a true termination is stored as the
            # terminal flag: a truncated (time-limit) episode did not actually
            # end, so its last transition must still bootstrap from next_state.
            agent.replay_buffer.add(state, action, reward, next_state, terminated)
            
            # Update state
            state = next_state
            
            # Track episode stats
            episode_reward += reward
            episode_steps += 1
            agent.step_count += 1
            pbar.update(1)
            
            # Update epsilon
            agent.update_epsilon()
            
            # Train if replay buffer is large enough and it's time to train
            if len(agent.replay_buffer) > BATCH_SIZE and agent.step_count > REPLAY_START_SIZE and agent.step_count % TRAIN_FREQ == 0:
                batch = agent.replay_buffer.sample(BATCH_SIZE)
                loss = agent.train(batch)
                agent.losses.append(loss)
                agent.training_step += 1
                
            # Update target network periodically
            if agent.step_count % TARGET_UPDATE_FREQ == 0 and agent.step_count > REPLAY_START_SIZE:
                agent.update_target_network()
                print(f"\nUpdated target network at step {agent.step_count}")
            
            # Save model periodically
            if agent.step_count % SAVE_MODEL_FREQ == 0 and agent.step_count > REPLAY_START_SIZE:
                agent.save_model(f"model_step_{agent.step_count}")
                print(f"\nSaved model at step {agent.step_count}")
            
            # Plot metrics periodically
            if agent.step_count % PLOT_FREQUENCY == 0 and agent.step_count > REPLAY_START_SIZE:
                agent.plot_metrics()
                
            # Reset environment if episode is done (terminated or truncated)
            if done:
                # Track episode metrics
                agent.rewards_history.append(episode_reward)
                agent.episode_lengths.append(episode_steps)
                agent.episode_count += 1
                
                # Update running reward.  Seed the average with the first
                # episode only; testing "running_reward == 0" would also reset
                # the average whenever it happened to be exactly zero.
                if agent.episode_count == 1:
                    agent.running_reward = episode_reward
                else:
                    agent.running_reward = 0.05 * episode_reward + 0.95 * agent.running_reward
                    
                # Save best model
                if episode_reward > agent.best_reward:
                    agent.best_reward = episode_reward
                    agent.save_model("best_model")
                    
                # Print episode info
                print(f"\nEpisode {agent.episode_count} - Reward: {episode_reward}, Steps: {episode_steps}, Epsilon: {agent.epsilon:.4f}, Running Reward: {agent.running_reward:.2f}")
                
                # Reset episode variables
                episode_reward = 0
                episode_steps = 0
                state = frame_stack.reset(env.reset()[0])
    finally:
        # Close environment and progress bar
        env.close()
        if pbar is not None:
            pbar.close()
    
    # Final save and plot
    agent.save_model("final_model")
    agent.plot_metrics()
    
    return agent

def evaluate_agent(model_path="final_model", episodes=10, render=False):
    """Evaluate the trained agent with a purely greedy policy.

    Args:
        model_path: Name of the saved weights (as passed to
            ``DoubleDuelingDQNAgent.load_model``).
        episodes: Number of episodes to play.
        render: When True the environment is created with
            ``render_mode="human"``.

    Returns:
        List with the total reward of each episode.
    """
    env = gym.make(ENV_NAME, render_mode="human" if render else None)
    
    # try/finally so the environment (and any render window) is closed even if
    # loading the weights or stepping the environment raises.
    try:
        state_shape = (*FRAME_SIZE, FRAMES_IN_STATE)
        num_actions = env.action_space.n
        
        agent = DoubleDuelingDQNAgent(state_shape, num_actions)
        agent.load_model(model_path)
        agent.epsilon = 0.0  # No exploration during evaluation
        
        frame_stack = FrameStack()
        
        total_rewards = []
        
        for episode in range(episodes):
            state = frame_stack.reset(env.reset()[0])
            episode_reward = 0
            done = False
            
            while not done:
                # training=False makes get_action skip the epsilon branch
                action = agent.get_action(state, training=False)
                next_frame, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                state = frame_stack.add_frame(next_frame)
                episode_reward += reward
                
            total_rewards.append(episode_reward)
            print(f"Episode {episode+1} - Reward: {episode_reward}")
    finally:
        env.close()
    
    print(f"Average Reward over {episodes} episodes: {np.mean(total_rewards):.2f}")
    return total_rewards

In [None]:
# Train the agent
# train_agent() loops until agent.step_count reaches TOTAL_STEPS and returns
# the agent; checkpoints and the metrics plot are written under SAVE_DIR.
print("Starting training...")
agent = train_agent()

Starting training...


Training:   0%|          | 268/5000000 [00:00<3:29:41, 397.38it/s]


Episode 1 - Reward: 2.0, Steps: 200, Epsilon: 0.9998, Running Reward: 2.00


Training:   0%|          | 460/5000000 [00:00<2:55:18, 475.31it/s]


Episode 2 - Reward: 3.0, Steps: 231, Epsilon: 0.9996, Running Reward: 2.05


Training:   0%|          | 814/5000000 [00:02<3:57:58, 350.11it/s] 


Episode 3 - Reward: 2.0, Steps: 210, Epsilon: 0.9994, Running Reward: 2.05


Training:   0%|          | 1007/5000000 [00:02<2:34:34, 539.03it/s]


Episode 4 - Reward: 3.0, Steps: 257, Epsilon: 0.9992, Running Reward: 2.10


Training:   0%|          | 1203/5000000 [00:03<1:59:51, 695.09it/s]


Episode 5 - Reward: 3.0, Steps: 213, Epsilon: 0.9990, Running Reward: 2.14

Episode 6 - Reward: 0.0, Steps: 129, Epsilon: 0.9989, Running Reward: 2.03


Training:   0%|          | 1606/5000000 [00:03<1:31:22, 911.66it/s]


Episode 7 - Reward: 1.0, Steps: 167, Epsilon: 0.9987, Running Reward: 1.98

Episode 8 - Reward: 0.0, Steps: 129, Epsilon: 0.9986, Running Reward: 1.88


Training:   0%|          | 1799/5000000 [00:03<1:35:35, 871.49it/s]


Episode 9 - Reward: 1.0, Steps: 154, Epsilon: 0.9985, Running Reward: 1.84


Training:   0%|          | 2079/5000000 [00:04<1:33:49, 887.78it/s]


Episode 10 - Reward: 2.0, Steps: 218, Epsilon: 0.9983, Running Reward: 1.85

Episode 11 - Reward: 0.0, Steps: 128, Epsilon: 0.9982, Running Reward: 1.75


Training:   0%|          | 2392/5000000 [00:04<1:23:52, 993.09it/s]


Episode 12 - Reward: 1.0, Steps: 174, Epsilon: 0.9980, Running Reward: 1.72


Training:   0%|          | 2593/5000000 [00:04<1:24:38, 984.00it/s]


Episode 13 - Reward: 2.0, Steps: 224, Epsilon: 0.9978, Running Reward: 1.73

Episode 14 - Reward: 0.0, Steps: 128, Epsilon: 0.9977, Running Reward: 1.64


Training:   0%|          | 2884/5000000 [00:04<1:41:49, 817.87it/s]


Episode 15 - Reward: 4.0, Steps: 272, Epsilon: 0.9974, Running Reward: 1.76

Episode 16 - Reward: 3.0, Steps: 260, Epsilon: 0.9972, Running Reward: 1.82

Episode 17 - Reward: 2.0, Steps: 223, Epsilon: 0.9970, Running Reward: 1.83


Training:   0%|          | 3507/5000000 [00:05<39:20, 2116.63it/s] 


Episode 18 - Reward: 1.0, Steps: 190, Epsilon: 0.9968, Running Reward: 1.79

Episode 19 - Reward: 0.0, Steps: 132, Epsilon: 0.9967, Running Reward: 1.70


Training:   0%|          | 3898/5000000 [00:05<1:11:31, 1164.29it/s]


Episode 20 - Reward: 0.0, Steps: 144, Epsilon: 0.9966, Running Reward: 1.62

Episode 21 - Reward: 0.0, Steps: 137, Epsilon: 0.9965, Running Reward: 1.54


Training:   0%|          | 4171/5000000 [00:05<1:19:11, 1051.47it/s]


Episode 22 - Reward: 0.0, Steps: 128, Epsilon: 0.9964, Running Reward: 1.46

Episode 23 - Reward: 0.0, Steps: 148, Epsilon: 0.9962, Running Reward: 1.39


Training:   0%|          | 4506/5000000 [00:06<1:21:19, 1023.80it/s]


Episode 24 - Reward: 0.0, Steps: 139, Epsilon: 0.9961, Running Reward: 1.32

Episode 25 - Reward: 0.0, Steps: 161, Epsilon: 0.9960, Running Reward: 1.25


Training:   0%|          | 4921/5000000 [00:06<1:34:22, 882.08it/s] 


Episode 26 - Reward: 3.0, Steps: 276, Epsilon: 0.9957, Running Reward: 1.34


Training:   0%|          | 5116/5000000 [00:06<1:30:18, 921.81it/s]


Episode 27 - Reward: 2.0, Steps: 211, Epsilon: 0.9955, Running Reward: 1.37


Training:   0%|          | 5296/5000000 [00:07<1:46:57, 778.35it/s]


Episode 28 - Reward: 0.0, Steps: 144, Epsilon: 0.9954, Running Reward: 1.30

Episode 29 - Reward: 0.0, Steps: 148, Epsilon: 0.9953, Running Reward: 1.24


Training:   0%|          | 5696/5000000 [00:07<1:40:24, 829.03it/s]


Episode 30 - Reward: 3.0, Steps: 260, Epsilon: 0.9950, Running Reward: 1.33


Training:   0%|          | 5864/5000000 [00:08<2:26:00, 570.08it/s]


Episode 31 - Reward: 2.0, Steps: 247, Epsilon: 0.9948, Running Reward: 1.36


Training:   0%|          | 6035/5000000 [00:08<2:19:17, 597.51it/s]


Episode 32 - Reward: 1.0, Steps: 159, Epsilon: 0.9947, Running Reward: 1.34


Training:   0%|          | 6233/5000000 [00:08<1:48:52, 764.42it/s]


Episode 33 - Reward: 0.0, Steps: 134, Epsilon: 0.9945, Running Reward: 1.27


Training:   0%|          | 6414/5000000 [00:08<1:58:50, 700.29it/s]


Episode 34 - Reward: 2.0, Steps: 232, Epsilon: 0.9943, Running Reward: 1.31


Training:   0%|          | 6605/5000000 [00:08<1:41:39, 818.68it/s]


Episode 35 - Reward: 1.0, Steps: 175, Epsilon: 0.9942, Running Reward: 1.30

Episode 36 - Reward: 0.0, Steps: 131, Epsilon: 0.9940, Running Reward: 1.23


Training:   0%|          | 6846/5000000 [00:09<2:20:51, 590.82it/s]


Episode 37 - Reward: 1.0, Steps: 163, Epsilon: 0.9939, Running Reward: 1.22


Training:   0%|          | 7023/5000000 [00:09<1:56:54, 711.80it/s]


Episode 38 - Reward: 0.0, Steps: 146, Epsilon: 0.9938, Running Reward: 1.16

Episode 39 - Reward: 0.0, Steps: 130, Epsilon: 0.9937, Running Reward: 1.10


Training:   0%|          | 7293/5000000 [00:10<2:17:40, 604.42it/s]


Episode 40 - Reward: 1.0, Steps: 175, Epsilon: 0.9935, Running Reward: 1.10


Training:   0%|          | 7528/5000000 [00:10<2:07:52, 650.69it/s]


Episode 41 - Reward: 1.0, Steps: 158, Epsilon: 0.9934, Running Reward: 1.09


Training:   0%|          | 7601/5000000 [00:10<2:05:05, 665.15it/s]


Episode 42 - Reward: 1.0, Steps: 182, Epsilon: 0.9932, Running Reward: 1.09


Training:   0%|          | 7756/5000000 [00:10<2:32:02, 547.22it/s]


Episode 43 - Reward: 0.0, Steps: 138, Epsilon: 0.9931, Running Reward: 1.03


Training:   0%|          | 8069/5000000 [00:11<2:42:11, 512.99it/s]


Episode 44 - Reward: 3.0, Steps: 231, Epsilon: 0.9929, Running Reward: 1.13


Training:   0%|          | 8255/5000000 [00:11<2:19:59, 594.29it/s]


Episode 45 - Reward: 3.0, Steps: 254, Epsilon: 0.9926, Running Reward: 1.22


Training:   0%|          | 8504/5000000 [00:12<2:02:34, 678.73it/s]


Episode 46 - Reward: 2.0, Steps: 217, Epsilon: 0.9924, Running Reward: 1.26

Episode 47 - Reward: 0.0, Steps: 129, Epsilon: 0.9923, Running Reward: 1.20


Training:   0%|          | 8838/5000000 [00:12<2:16:05, 611.28it/s]


Episode 48 - Reward: 1.0, Steps: 180, Epsilon: 0.9922, Running Reward: 1.19


Training:   0%|          | 9010/5000000 [00:13<2:30:25, 552.98it/s]


Episode 49 - Reward: 3.0, Steps: 257, Epsilon: 0.9919, Running Reward: 1.28


Training:   0%|          | 9265/5000000 [00:13<2:26:05, 569.34it/s]


Episode 50 - Reward: 2.0, Steps: 214, Epsilon: 0.9917, Running Reward: 1.32


Training:   0%|          | 9329/5000000 [00:13<2:41:41, 514.44it/s]


Episode 51 - Reward: 0.0, Steps: 132, Epsilon: 0.9916, Running Reward: 1.25


Training:   0%|          | 9779/5000000 [00:14<3:23:07, 409.46it/s]


Episode 52 - Reward: 6.0, Steps: 387, Epsilon: 0.9913, Running Reward: 1.49


Training:   0%|          | 10113/5000000 [00:15<2:02:30, 678.82it/s]


Episode 53 - Reward: 3.0, Steps: 249, Epsilon: 0.9910, Running Reward: 1.56


Training:   0%|          | 10191/5000000 [00:15<3:07:20, 443.92it/s]


Episode 54 - Reward: 2.0, Steps: 234, Epsilon: 0.9908, Running Reward: 1.58


Training:   0%|          | 10545/5000000 [00:16<2:57:46, 467.78it/s]


Episode 55 - Reward: 3.0, Steps: 245, Epsilon: 0.9906, Running Reward: 1.66


Training:   0%|          | 10727/5000000 [00:16<2:41:51, 513.74it/s]


Episode 56 - Reward: 3.0, Steps: 235, Epsilon: 0.9904, Running Reward: 1.72


Training:   0%|          | 10896/5000000 [00:17<2:07:36, 651.60it/s]


Episode 57 - Reward: 2.0, Steps: 200, Epsilon: 0.9902, Running Reward: 1.74


Training:   0%|          | 11029/5000000 [00:17<3:14:08, 428.29it/s]


Episode 58 - Reward: 0.0, Steps: 131, Epsilon: 0.9901, Running Reward: 1.65


Training:   0%|          | 11241/5000000 [00:18<2:42:04, 512.99it/s]


Episode 59 - Reward: 0.0, Steps: 142, Epsilon: 0.9900, Running Reward: 1.57


Training:   0%|          | 11599/5000000 [00:18<1:47:16, 775.04it/s]


Episode 60 - Reward: 6.0, Steps: 389, Epsilon: 0.9896, Running Reward: 1.79


Training:   0%|          | 11681/5000000 [00:18<2:06:37, 656.60it/s]


Episode 61 - Reward: 0.0, Steps: 140, Epsilon: 0.9895, Running Reward: 1.70


Training:   0%|          | 11917/5000000 [00:19<2:29:16, 556.94it/s]


Episode 62 - Reward: 2.0, Steps: 195, Epsilon: 0.9893, Running Reward: 1.71


Training:   0%|          | 12291/5000000 [00:19<2:09:43, 640.81it/s]


Episode 63 - Reward: 3.0, Steps: 242, Epsilon: 0.9891, Running Reward: 1.78

Episode 64 - Reward: 1.0, Steps: 158, Epsilon: 0.9890, Running Reward: 1.74


Training:   0%|          | 12569/5000000 [00:20<1:48:14, 767.95it/s]


Episode 65 - Reward: 1.0, Steps: 186, Epsilon: 0.9888, Running Reward: 1.70


Training:   0%|          | 12731/5000000 [00:20<2:21:06, 589.06it/s]


Episode 66 - Reward: 2.0, Steps: 225, Epsilon: 0.9886, Running Reward: 1.72


Training:   0%|          | 12824/5000000 [00:20<2:29:16, 556.81it/s]


Episode 67 - Reward: 0.0, Steps: 132, Epsilon: 0.9885, Running Reward: 1.63


Training:   0%|          | 13135/5000000 [00:21<2:11:50, 630.45it/s]


Episode 68 - Reward: 1.0, Steps: 185, Epsilon: 0.9883, Running Reward: 1.60

Episode 69 - Reward: 0.0, Steps: 132, Epsilon: 0.9882, Running Reward: 1.52


Training:   0%|          | 13385/5000000 [00:21<2:07:50, 650.11it/s]


Episode 70 - Reward: 2.0, Steps: 206, Epsilon: 0.9880, Running Reward: 1.54


Training:   0%|          | 13663/5000000 [00:21<1:45:47, 785.52it/s]


Episode 71 - Reward: 2.0, Steps: 224, Epsilon: 0.9878, Running Reward: 1.57

Episode 72 - Reward: 0.0, Steps: 131, Epsilon: 0.9877, Running Reward: 1.49


Training:   0%|          | 13979/5000000 [00:22<2:44:26, 505.34it/s]


Episode 73 - Reward: 2.0, Steps: 201, Epsilon: 0.9875, Running Reward: 1.51


Training:   0%|          | 14252/5000000 [00:23<1:55:41, 718.28it/s]


Episode 74 - Reward: 1.0, Steps: 183, Epsilon: 0.9873, Running Reward: 1.49


Training:   0%|          | 14444/5000000 [00:23<2:06:43, 655.66it/s]


Episode 75 - Reward: 3.0, Steps: 248, Epsilon: 0.9871, Running Reward: 1.56


Training:   0%|          | 14616/5000000 [00:23<2:30:15, 552.97it/s]


Episode 76 - Reward: 3.0, Steps: 233, Epsilon: 0.9869, Running Reward: 1.64


Training:   0%|          | 14879/5000000 [00:24<2:02:57, 675.75it/s]


Episode 77 - Reward: 1.0, Steps: 172, Epsilon: 0.9867, Running Reward: 1.60

Episode 78 - Reward: 0.0, Steps: 154, Epsilon: 0.9866, Running Reward: 1.52


Training:   0%|          | 15095/5000000 [00:24<2:35:57, 532.72it/s]


Episode 79 - Reward: 1.0, Steps: 164, Epsilon: 0.9865, Running Reward: 1.50


Training:   0%|          | 15223/5000000 [00:24<2:31:47, 547.33it/s]


Episode 80 - Reward: 0.0, Steps: 127, Epsilon: 0.9863, Running Reward: 1.42


Training:   0%|          | 15466/5000000 [00:25<2:13:47, 620.93it/s]


Episode 81 - Reward: 2.0, Steps: 202, Epsilon: 0.9862, Running Reward: 1.45


Training:   0%|          | 15645/5000000 [00:25<3:34:01, 388.13it/s]


Episode 82 - Reward: 1.0, Steps: 199, Epsilon: 0.9860, Running Reward: 1.43


Training:   0%|          | 15827/5000000 [00:26<3:44:56, 369.30it/s]


Episode 83 - Reward: 2.0, Steps: 206, Epsilon: 0.9858, Running Reward: 1.46


Training:   0%|          | 16110/5000000 [00:26<2:13:37, 621.66it/s]


Episode 84 - Reward: 2.0, Steps: 193, Epsilon: 0.9856, Running Reward: 1.48


Training:   0%|          | 16303/5000000 [00:27<3:38:45, 379.71it/s]


Episode 85 - Reward: 5.0, Steps: 318, Epsilon: 0.9853, Running Reward: 1.66


Training:   0%|          | 16557/5000000 [00:27<2:33:20, 541.66it/s]


Episode 86 - Reward: 1.0, Steps: 173, Epsilon: 0.9852, Running Reward: 1.63

Episode 87 - Reward: 0.0, Steps: 134, Epsilon: 0.9851, Running Reward: 1.55


Training:   0%|          | 16821/5000000 [00:28<2:22:47, 581.61it/s]


Episode 88 - Reward: 2.0, Steps: 226, Epsilon: 0.9849, Running Reward: 1.57


Training:   0%|          | 17092/5000000 [00:28<2:51:37, 483.88it/s]


Episode 89 - Reward: 1.0, Steps: 188, Epsilon: 0.9847, Running Reward: 1.54


Training:   0%|          | 17344/5000000 [00:29<2:26:02, 568.62it/s]


Episode 90 - Reward: 3.0, Steps: 259, Epsilon: 0.9845, Running Reward: 1.61


Training:   0%|          | 17499/5000000 [00:29<2:30:02, 553.44it/s]


Episode 91 - Reward: 1.0, Steps: 164, Epsilon: 0.9843, Running Reward: 1.58


Training:   0%|          | 17745/5000000 [00:30<3:15:22, 425.00it/s]


Episode 92 - Reward: 3.0, Steps: 262, Epsilon: 0.9841, Running Reward: 1.65


Training:   0%|          | 18047/5000000 [00:30<2:51:18, 484.67it/s]


Episode 93 - Reward: 4.0, Steps: 304, Epsilon: 0.9838, Running Reward: 1.77


Training:   0%|          | 18236/5000000 [00:31<2:41:53, 512.87it/s]


Episode 94 - Reward: 2.0, Steps: 215, Epsilon: 0.9836, Running Reward: 1.78


Training:   0%|          | 18493/5000000 [00:31<2:00:23, 689.58it/s]


Episode 95 - Reward: 0.0, Steps: 146, Epsilon: 0.9835, Running Reward: 1.69

Episode 96 - Reward: 0.0, Steps: 134, Epsilon: 0.9834, Running Reward: 1.61


Training:   0%|          | 18823/5000000 [00:32<3:54:29, 354.03it/s]


Episode 97 - Reward: 5.0, Steps: 301, Epsilon: 0.9831, Running Reward: 1.78


Training:   0%|          | 19031/5000000 [00:33<4:43:38, 292.68it/s]


Episode 98 - Reward: 1.0, Steps: 168, Epsilon: 0.9829, Running Reward: 1.74


Training:   0%|          | 19105/5000000 [00:33<5:09:26, 268.27it/s]


Episode 99 - Reward: 0.0, Steps: 149, Epsilon: 0.9828, Running Reward: 1.65


Training:   0%|          | 19298/5000000 [00:34<4:57:15, 279.26it/s]


Episode 100 - Reward: 0.0, Steps: 123, Epsilon: 0.9827, Running Reward: 1.57


Training:   0%|          | 19468/5000000 [00:35<4:10:46, 331.01it/s]


Episode 101 - Reward: 2.0, Steps: 192, Epsilon: 0.9825, Running Reward: 1.59


Training:   0%|          | 19655/5000000 [00:35<5:51:09, 236.38it/s]


Episode 102 - Reward: 2.0, Steps: 201, Epsilon: 0.9823, Running Reward: 1.61


Training:   0%|          | 19843/5000000 [00:36<5:05:14, 271.92it/s]


Episode 103 - Reward: 1.0, Steps: 165, Epsilon: 0.9822, Running Reward: 1.58


Training:   0%|          | 20005/5000000 [00:37<9:50:12, 140.63it/s]


Episode 104 - Reward: 2.0, Steps: 212, Epsilon: 0.9820, Running Reward: 1.60


Training:   0%|          | 20174/5000000 [00:38<5:00:27, 276.23it/s]


Episode 105 - Reward: 0.0, Steps: 130, Epsilon: 0.9819, Running Reward: 1.52


Training:   0%|          | 20468/5000000 [00:38<2:40:37, 516.67it/s]


Episode 106 - Reward: 2.0, Steps: 235, Epsilon: 0.9817, Running Reward: 1.55


Training:   0%|          | 20526/5000000 [00:38<3:25:37, 403.62it/s]


Episode 107 - Reward: 1.0, Steps: 158, Epsilon: 0.9815, Running Reward: 1.52


Training:   0%|          | 20725/5000000 [00:39<3:24:32, 405.74it/s]


Episode 108 - Reward: 0.0, Steps: 127, Epsilon: 0.9814, Running Reward: 1.44


Training:   0%|          | 20989/5000000 [00:40<2:59:43, 461.73it/s]


Episode 109 - Reward: 2.0, Steps: 229, Epsilon: 0.9812, Running Reward: 1.47

Episode 110 - Reward: 0.0, Steps: 129, Epsilon: 0.9811, Running Reward: 1.40


Training:   0%|          | 21194/5000000 [00:40<3:18:23, 418.26it/s]


Episode 111 - Reward: 1.0, Steps: 168, Epsilon: 0.9809, Running Reward: 1.38


Training:   0%|          | 21388/5000000 [00:41<4:30:33, 306.69it/s]


Episode 112 - Reward: 2.0, Steps: 188, Epsilon: 0.9808, Running Reward: 1.41


Training:   0%|          | 21721/5000000 [00:42<3:17:08, 420.88it/s]


Episode 113 - Reward: 4.0, Steps: 262, Epsilon: 0.9805, Running Reward: 1.54


Training:   0%|          | 21907/5000000 [00:42<2:59:32, 462.10it/s]


Episode 114 - Reward: 2.0, Steps: 210, Epsilon: 0.9803, Running Reward: 1.56


Training:   0%|          | 22133/5000000 [00:43<4:51:10, 284.93it/s]


Episode 115 - Reward: 4.0, Steps: 281, Epsilon: 0.9801, Running Reward: 1.68


Training:   0%|          | 22398/5000000 [00:43<2:40:35, 516.56it/s]


Episode 116 - Reward: 2.0, Steps: 208, Epsilon: 0.9799, Running Reward: 1.70


Training:   0%|          | 22548/5000000 [00:44<4:31:40, 305.36it/s]


Episode 117 - Reward: 1.0, Steps: 181, Epsilon: 0.9797, Running Reward: 1.66


Training:   0%|          | 22752/5000000 [00:45<3:44:33, 369.42it/s]


Episode 118 - Reward: 1.0, Steps: 188, Epsilon: 0.9796, Running Reward: 1.63


Training:   0%|          | 22966/5000000 [00:45<4:57:50, 278.51it/s]


Episode 119 - Reward: 1.0, Steps: 206, Epsilon: 0.9794, Running Reward: 1.60


Training:   0%|          | 23106/5000000 [00:46<5:02:17, 274.40it/s]


Episode 120 - Reward: 0.0, Steps: 127, Epsilon: 0.9793, Running Reward: 1.52


Training:   0%|          | 23305/5000000 [00:46<3:08:48, 439.29it/s]


Episode 121 - Reward: 1.0, Steps: 156, Epsilon: 0.9791, Running Reward: 1.49


Training:   0%|          | 23516/5000000 [00:47<4:12:52, 327.99it/s]


Episode 122 - Reward: 3.0, Steps: 249, Epsilon: 0.9789, Running Reward: 1.57


Training:   0%|          | 23690/5000000 [00:47<2:38:12, 524.22it/s]


Episode 123 - Reward: 1.0, Steps: 184, Epsilon: 0.9787, Running Reward: 1.54


Training:   0%|          | 23842/5000000 [00:48<3:58:32, 347.68it/s]


Episode 124 - Reward: 1.0, Steps: 154, Epsilon: 0.9786, Running Reward: 1.51


Training:   0%|          | 23975/5000000 [00:48<4:05:51, 337.32it/s]


Episode 125 - Reward: 1.0, Steps: 168, Epsilon: 0.9785, Running Reward: 1.49


Training:   0%|          | 24241/5000000 [00:49<3:27:10, 400.28it/s]


Episode 126 - Reward: 2.0, Steps: 238, Epsilon: 0.9782, Running Reward: 1.51


Training:   0%|          | 24439/5000000 [00:49<3:11:50, 432.25it/s]


Episode 127 - Reward: 1.0, Steps: 186, Epsilon: 0.9781, Running Reward: 1.49


Training:   0%|          | 24582/5000000 [00:50<3:53:46, 354.72it/s]


Episode 128 - Reward: 1.0, Steps: 162, Epsilon: 0.9779, Running Reward: 1.46


Training:   0%|          | 24781/5000000 [00:50<3:02:38, 454.00it/s]


Episode 129 - Reward: 0.0, Steps: 125, Epsilon: 0.9778, Running Reward: 1.39


Training:   0%|          | 24839/5000000 [00:51<3:30:41, 393.55it/s]


Episode 130 - Reward: 1.0, Steps: 178, Epsilon: 0.9777, Running Reward: 1.37


Training:   1%|          | 25073/5000000 [00:51<2:45:16, 501.69it/s]


Episode 131 - Reward: 1.0, Steps: 174, Epsilon: 0.9775, Running Reward: 1.35


Training:   1%|          | 25138/5000000 [00:51<2:46:00, 499.46it/s]


Episode 132 - Reward: 0.0, Steps: 145, Epsilon: 0.9774, Running Reward: 1.28


Training:   1%|          | 25415/5000000 [00:51<1:21:02, 1023.04it/s]


Episode 133 - Reward: 1.0, Steps: 166, Epsilon: 0.9772, Running Reward: 1.27


Training:   1%|          | 25537/5000000 [00:52<2:10:05, 637.32it/s] 


Episode 134 - Reward: 1.0, Steps: 164, Epsilon: 0.9771, Running Reward: 1.26


Training:   1%|          | 25632/5000000 [00:52<3:05:48, 446.19it/s]


Episode 135 - Reward: 1.0, Steps: 179, Epsilon: 0.9769, Running Reward: 1.24


Training:   1%|          | 25898/5000000 [00:53<3:33:47, 387.77it/s]


Episode 136 - Reward: 0.0, Steps: 136, Epsilon: 0.9768, Running Reward: 1.18


Training:   1%|          | 26077/5000000 [00:54<3:27:06, 400.25it/s]


Episode 137 - Reward: 3.0, Steps: 247, Epsilon: 0.9766, Running Reward: 1.27


Training:   1%|          | 26247/5000000 [00:54<4:48:10, 287.65it/s]


Episode 138 - Reward: 1.0, Steps: 176, Epsilon: 0.9764, Running Reward: 1.26


Training:   1%|          | 26489/5000000 [00:55<3:57:37, 348.83it/s]


Episode 139 - Reward: 3.0, Steps: 228, Epsilon: 0.9762, Running Reward: 1.35


Training:   1%|          | 26805/5000000 [00:56<4:13:52, 326.48it/s]


Episode 140 - Reward: 4.0, Steps: 333, Epsilon: 0.9759, Running Reward: 1.48


Training:   1%|          | 27083/5000000 [00:57<4:10:13, 331.23it/s]


Episode 141 - Reward: 3.0, Steps: 242, Epsilon: 0.9757, Running Reward: 1.55


Training:   1%|          | 27228/5000000 [00:57<3:37:44, 380.63it/s]


Episode 142 - Reward: 1.0, Steps: 161, Epsilon: 0.9755, Running Reward: 1.53


Training:   1%|          | 27407/5000000 [00:57<3:40:28, 375.89it/s]


Episode 143 - Reward: 1.0, Steps: 187, Epsilon: 0.9754, Running Reward: 1.50


Training:   1%|          | 27832/5000000 [00:58<2:25:22, 570.06it/s]


Episode 144 - Reward: 7.0, Steps: 403, Epsilon: 0.9750, Running Reward: 1.78


Training:   1%|          | 27978/5000000 [00:59<3:23:14, 407.74it/s]


Episode 145 - Reward: 1.0, Steps: 162, Epsilon: 0.9749, Running Reward: 1.74


Training:   1%|          | 28158/5000000 [00:59<3:00:13, 459.80it/s]


Episode 146 - Reward: 1.0, Steps: 172, Epsilon: 0.9747, Running Reward: 1.70


Training:   1%|          | 28264/5000000 [00:59<2:16:16, 608.05it/s]


Episode 147 - Reward: 1.0, Steps: 158, Epsilon: 0.9746, Running Reward: 1.66


Training:   1%|          | 28530/5000000 [01:00<2:26:37, 565.08it/s]


Episode 148 - Reward: 0.0, Steps: 159, Epsilon: 0.9744, Running Reward: 1.58


Training:   1%|          | 28597/5000000 [01:00<3:42:36, 372.21it/s]


Episode 149 - Reward: 0.0, Steps: 126, Epsilon: 0.9743, Running Reward: 1.50


Training:   1%|          | 28698/5000000 [01:00<3:48:23, 362.78it/s]


Episode 150 - Reward: 0.0, Steps: 139, Epsilon: 0.9742, Running Reward: 1.43


Training:   1%|          | 28850/5000000 [01:01<3:20:10, 413.91it/s]


Episode 151 - Reward: 0.0, Steps: 135, Epsilon: 0.9741, Running Reward: 1.36


Training:   1%|          | 29045/5000000 [01:01<3:35:38, 384.19it/s]


Episode 152 - Reward: 0.0, Steps: 147, Epsilon: 0.9739, Running Reward: 1.29


Training:   1%|          | 29270/5000000 [01:02<3:11:11, 433.30it/s]


Episode 153 - Reward: 4.0, Steps: 300, Epsilon: 0.9737, Running Reward: 1.42


Training:   1%|          | 29668/5000000 [01:03<2:34:06, 537.53it/s]


Episode 154 - Reward: 3.0, Steps: 242, Epsilon: 0.9734, Running Reward: 1.50


Training:   1%|          | 29795/5000000 [01:03<3:08:58, 438.36it/s]


Episode 155 - Reward: 2.0, Steps: 211, Epsilon: 0.9733, Running Reward: 1.53


Training:   1%|          | 29925/5000000 [01:04<6:28:20, 213.31it/s]


Episode 156 - Reward: 1.0, Steps: 182, Epsilon: 0.9731, Running Reward: 1.50


Training:   1%|          | 30064/5000000 [01:05<6:11:08, 223.18it/s]


Episode 157 - Reward: 0.0, Steps: 137, Epsilon: 0.9730, Running Reward: 1.43


Training:   1%|          | 30196/5000000 [01:05<3:27:12, 399.75it/s]


Episode 158 - Reward: 0.0, Steps: 136, Epsilon: 0.9728, Running Reward: 1.35


Training:   1%|          | 30424/5000000 [01:06<6:48:13, 202.89it/s]


Episode 159 - Reward: 3.0, Steps: 224, Epsilon: 0.9726, Running Reward: 1.44


Training:   1%|          | 30603/5000000 [01:07<4:51:33, 284.08it/s]


Episode 160 - Reward: 1.0, Steps: 182, Epsilon: 0.9725, Running Reward: 1.42


Training:   1%|          | 30720/5000000 [01:07<4:11:05, 329.85it/s]


Episode 161 - Reward: 0.0, Steps: 127, Epsilon: 0.9724, Running Reward: 1.34


Training:   1%|          | 30882/5000000 [01:08<4:16:08, 323.33it/s]


Episode 162 - Reward: 0.0, Steps: 142, Epsilon: 0.9722, Running Reward: 1.28


Training:   1%|          | 30996/5000000 [01:08<4:40:42, 295.02it/s]


Episode 163 - Reward: 0.0, Steps: 125, Epsilon: 0.9721, Running Reward: 1.21


Training:   1%|          | 31150/5000000 [01:09<4:47:17, 288.27it/s]


Episode 164 - Reward: 1.0, Steps: 162, Epsilon: 0.9720, Running Reward: 1.20


Training:   1%|          | 31325/5000000 [01:09<4:11:56, 328.70it/s]


Episode 165 - Reward: 1.0, Steps: 183, Epsilon: 0.9718, Running Reward: 1.19


Training:   1%|          | 31513/5000000 [01:10<5:27:54, 252.54it/s]


Episode 166 - Reward: 1.0, Steps: 161, Epsilon: 0.9717, Running Reward: 1.18


Training:   1%|          | 31675/5000000 [01:11<5:40:58, 242.85it/s]


Episode 167 - Reward: 1.0, Steps: 176, Epsilon: 0.9715, Running Reward: 1.17


Training:   1%|          | 31911/5000000 [01:12<4:27:13, 309.85it/s]


Episode 168 - Reward: 2.0, Steps: 230, Epsilon: 0.9713, Running Reward: 1.22


Training:   1%|          | 32065/5000000 [01:12<5:52:54, 234.62it/s]


Episode 169 - Reward: 1.0, Steps: 168, Epsilon: 0.9711, Running Reward: 1.20


Training:   1%|          | 32244/5000000 [01:13<4:30:02, 306.60it/s]


Episode 170 - Reward: 0.0, Steps: 149, Epsilon: 0.9710, Running Reward: 1.14


Training:   1%|          | 32382/5000000 [01:13<3:50:10, 359.70it/s]


Episode 171 - Reward: 0.0, Steps: 141, Epsilon: 0.9709, Running Reward: 1.09


Training:   1%|          | 32498/5000000 [01:14<4:03:02, 340.64it/s]


Episode 172 - Reward: 0.0, Steps: 135, Epsilon: 0.9708, Running Reward: 1.03


Training:   1%|          | 32692/5000000 [01:14<4:31:19, 305.12it/s]


Episode 173 - Reward: 3.0, Steps: 225, Epsilon: 0.9706, Running Reward: 1.13


Training:   1%|          | 32873/5000000 [01:16<6:37:18, 208.36it/s]


Episode 174 - Reward: 1.0, Steps: 161, Epsilon: 0.9704, Running Reward: 1.12


Training:   1%|          | 33048/5000000 [01:16<6:03:09, 227.96it/s]


Episode 175 - Reward: 0.0, Steps: 142, Epsilon: 0.9703, Running Reward: 1.07


Training:   1%|          | 33262/5000000 [01:17<4:42:38, 292.88it/s]


Episode 176 - Reward: 1.0, Steps: 159, Epsilon: 0.9701, Running Reward: 1.06


Training:   1%|          | 33354/5000000 [01:18<4:44:35, 290.87it/s]


Episode 177 - Reward: 0.0, Steps: 138, Epsilon: 0.9700, Running Reward: 1.01


Training:   1%|          | 33448/5000000 [01:18<5:23:35, 255.80it/s]


Episode 178 - Reward: 0.0, Steps: 140, Epsilon: 0.9699, Running Reward: 0.96


Training:   1%|          | 33653/5000000 [01:19<4:23:16, 314.39it/s]


Episode 179 - Reward: 1.0, Steps: 176, Epsilon: 0.9697, Running Reward: 0.96


Training:   1%|          | 33832/5000000 [01:20<6:28:24, 213.10it/s]


Episode 180 - Reward: 2.0, Steps: 207, Epsilon: 0.9696, Running Reward: 1.01


Training:   1%|          | 34072/5000000 [01:21<5:30:06, 250.72it/s]


Episode 181 - Reward: 2.0, Steps: 203, Epsilon: 0.9694, Running Reward: 1.06


Training:   1%|          | 34276/5000000 [01:21<3:37:51, 379.89it/s]


Episode 182 - Reward: 2.0, Steps: 232, Epsilon: 0.9692, Running Reward: 1.11


Training:   1%|          | 34460/5000000 [01:22<3:52:10, 356.45it/s]


Episode 183 - Reward: 0.0, Steps: 139, Epsilon: 0.9690, Running Reward: 1.06


Training:   1%|          | 34620/5000000 [01:23<4:46:13, 289.13it/s]


Episode 184 - Reward: 1.0, Steps: 163, Epsilon: 0.9689, Running Reward: 1.05


Training:   1%|          | 34683/5000000 [01:23<4:13:07, 326.94it/s]


Episode 185 - Reward: 3.0, Steps: 259, Epsilon: 0.9687, Running Reward: 1.15


Training:   1%|          | 35024/5000000 [01:24<4:06:33, 335.63it/s]


Episode 186 - Reward: 0.0, Steps: 141, Epsilon: 0.9685, Running Reward: 1.09


Training:   1%|          | 35135/5000000 [01:24<4:14:25, 325.23it/s]


Episode 187 - Reward: 0.0, Steps: 137, Epsilon: 0.9684, Running Reward: 1.04


Training:   1%|          | 35277/5000000 [01:25<4:11:38, 328.83it/s]


Episode 188 - Reward: 0.0, Steps: 149, Epsilon: 0.9683, Running Reward: 0.99


Training:   1%|          | 35568/5000000 [01:25<3:56:00, 350.59it/s]


Episode 189 - Reward: 2.0, Steps: 208, Epsilon: 0.9681, Running Reward: 1.04


Training:   1%|          | 35632/5000000 [01:26<6:20:26, 217.48it/s]


Episode 190 - Reward: 0.0, Steps: 144, Epsilon: 0.9680, Running Reward: 0.98


Training:   1%|          | 35758/5000000 [01:27<6:11:32, 222.69it/s]


Episode 191 - Reward: 0.0, Steps: 130, Epsilon: 0.9678, Running Reward: 0.94


Training:   1%|          | 35937/5000000 [01:28<9:07:40, 151.06it/s]


Episode 192 - Reward: 1.0, Steps: 184, Epsilon: 0.9677, Running Reward: 0.94


Training:   1%|          | 36139/5000000 [01:29<8:02:57, 171.30it/s] 


Episode 193 - Reward: 2.0, Steps: 197, Epsilon: 0.9675, Running Reward: 0.99


Training:   1%|          | 36340/5000000 [01:30<5:32:43, 248.64it/s]


Episode 194 - Reward: 2.0, Steps: 191, Epsilon: 0.9673, Running Reward: 1.04


Training:   1%|          | 36504/5000000 [01:30<2:54:04, 475.21it/s]


Episode 195 - Reward: 0.0, Steps: 139, Epsilon: 0.9672, Running Reward: 0.99


Training:   1%|          | 36555/5000000 [01:30<2:51:37, 482.00it/s]


Episode 196 - Reward: 0.0, Steps: 142, Epsilon: 0.9671, Running Reward: 0.94


Training:   1%|          | 36789/5000000 [01:31<5:06:08, 270.20it/s]


Episode 197 - Reward: 2.0, Steps: 198, Epsilon: 0.9669, Running Reward: 0.99


Training:   1%|          | 37090/5000000 [01:32<5:49:51, 236.42it/s]


Episode 198 - Reward: 3.0, Steps: 258, Epsilon: 0.9667, Running Reward: 1.09


Training:   1%|          | 37309/5000000 [01:33<4:48:16, 286.91it/s]


Episode 199 - Reward: 2.0, Steps: 213, Epsilon: 0.9665, Running Reward: 1.14


Training:   1%|          | 37414/5000000 [01:34<7:30:55, 183.42it/s]


Episode 200 - Reward: 0.0, Steps: 135, Epsilon: 0.9663, Running Reward: 1.08


Training:   1%|          | 37579/5000000 [01:35<7:38:25, 180.42it/s]


Episode 201 - Reward: 1.0, Steps: 186, Epsilon: 0.9662, Running Reward: 1.08


Training:   1%|          | 37849/5000000 [01:35<2:54:21, 474.32it/s] 


Episode 202 - Reward: 2.0, Steps: 194, Epsilon: 0.9660, Running Reward: 1.12


Training:   1%|          | 37960/5000000 [01:36<3:52:48, 355.23it/s]


Episode 203 - Reward: 0.0, Steps: 134, Epsilon: 0.9659, Running Reward: 1.07


Training:   1%|          | 38162/5000000 [01:37<4:24:45, 312.35it/s]


Episode 204 - Reward: 1.0, Steps: 180, Epsilon: 0.9657, Running Reward: 1.06


Training:   1%|          | 38397/5000000 [01:37<4:10:04, 330.68it/s]


Episode 205 - Reward: 4.0, Steps: 282, Epsilon: 0.9655, Running Reward: 1.21


Training:   1%|          | 38657/5000000 [01:38<2:46:24, 496.89it/s]


Episode 206 - Reward: 1.0, Steps: 172, Epsilon: 0.9653, Running Reward: 1.20


Training:   1%|          | 38712/5000000 [01:38<3:54:03, 353.29it/s]


Episode 207 - Reward: 0.0, Steps: 137, Epsilon: 0.9652, Running Reward: 1.14


Training:   1%|          | 38981/5000000 [01:40<6:40:48, 206.29it/s]


Episode 208 - Reward: 4.0, Steps: 288, Epsilon: 0.9649, Running Reward: 1.28


Training:   1%|          | 39196/5000000 [01:40<4:51:57, 283.19it/s]


Episode 209 - Reward: 2.0, Steps: 209, Epsilon: 0.9647, Running Reward: 1.32


Training:   1%|          | 39520/5000000 [01:42<6:41:43, 205.80it/s]


Episode 210 - Reward: 4.0, Steps: 330, Epsilon: 0.9644, Running Reward: 1.45


Training:   1%|          | 39719/5000000 [01:43<8:29:59, 162.10it/s] 


Episode 211 - Reward: 2.0, Steps: 203, Epsilon: 0.9643, Running Reward: 1.48


Training:   1%|          | 39875/5000000 [01:44<6:28:09, 212.98it/s]


Episode 212 - Reward: 0.0, Steps: 132, Epsilon: 0.9641, Running Reward: 1.41


Training:   1%|          | 40005/5000000 [01:44<4:14:44, 324.52it/s]


Episode 213 - Reward: 1.0, Steps: 165, Epsilon: 0.9640, Running Reward: 1.39


Training:   1%|          | 40143/5000000 [01:45<8:09:18, 168.94it/s]


Episode 214 - Reward: 0.0, Steps: 138, Epsilon: 0.9639, Running Reward: 1.32


Training:   1%|          | 40480/5000000 [01:48<8:10:18, 168.58it/s] 


Episode 215 - Reward: 4.0, Steps: 331, Epsilon: 0.9636, Running Reward: 1.45


Training:   1%|          | 40671/5000000 [01:49<4:51:30, 283.54it/s] 


Episode 216 - Reward: 1.0, Steps: 176, Epsilon: 0.9634, Running Reward: 1.43


Training:   1%|          | 40870/5000000 [01:50<6:13:28, 221.30it/s]


Episode 217 - Reward: 1.0, Steps: 200, Epsilon: 0.9632, Running Reward: 1.41


Training:   1%|          | 41004/5000000 [01:50<7:11:37, 191.48it/s]


Episode 218 - Reward: 0.0, Steps: 144, Epsilon: 0.9631, Running Reward: 1.34


Training:   1%|          | 41214/5000000 [01:51<4:57:04, 278.20it/s]


Episode 219 - Reward: 1.0, Steps: 166, Epsilon: 0.9630, Running Reward: 1.32


Training:   1%|          | 41324/5000000 [01:52<5:57:13, 231.35it/s]


Episode 220 - Reward: 0.0, Steps: 139, Epsilon: 0.9628, Running Reward: 1.25


Training:   1%|          | 41585/5000000 [01:53<6:43:59, 204.56it/s]


Episode 221 - Reward: 4.0, Steps: 255, Epsilon: 0.9626, Running Reward: 1.39


Training:   1%|          | 41684/5000000 [01:54<8:36:41, 159.94it/s] 


Episode 222 - Reward: 0.0, Steps: 128, Epsilon: 0.9625, Running Reward: 1.32


Training:   1%|          | 41851/5000000 [01:55<8:00:01, 172.15it/s] 


Episode 223 - Reward: 1.0, Steps: 155, Epsilon: 0.9623, Running Reward: 1.31


Training:   1%|          | 42106/5000000 [01:55<2:00:43, 684.42it/s]


Episode 224 - Reward: 2.0, Steps: 221, Epsilon: 0.9621, Running Reward: 1.34


Training:   1%|          | 42203/5000000 [01:55<1:49:34, 754.13it/s]


Episode 225 - Reward: 0.0, Steps: 142, Epsilon: 0.9620, Running Reward: 1.27


Training:   1%|          | 42381/5000000 [01:56<3:06:48, 442.31it/s]


Episode 226 - Reward: 0.0, Steps: 124, Epsilon: 0.9619, Running Reward: 1.21


Training:   1%|          | 42503/5000000 [01:56<5:04:37, 271.23it/s]


Episode 227 - Reward: 1.0, Steps: 173, Epsilon: 0.9618, Running Reward: 1.20


Training:   1%|          | 42741/5000000 [01:57<3:51:05, 357.53it/s]


Episode 228 - Reward: 1.0, Steps: 189, Epsilon: 0.9616, Running Reward: 1.19


Training:   1%|          | 42878/5000000 [01:57<3:31:56, 389.82it/s]


Episode 229 - Reward: 1.0, Steps: 182, Epsilon: 0.9614, Running Reward: 1.18


Training:   1%|          | 43063/5000000 [01:58<4:44:39, 290.22it/s]


Episode 230 - Reward: 0.0, Steps: 152, Epsilon: 0.9613, Running Reward: 1.12


Training:   1%|          | 43195/5000000 [01:59<5:27:20, 252.38it/s]


Episode 231 - Reward: 1.0, Steps: 152, Epsilon: 0.9611, Running Reward: 1.11


Training:   1%|          | 43477/5000000 [02:00<8:13:57, 167.24it/s]


Episode 232 - Reward: 4.0, Steps: 280, Epsilon: 0.9609, Running Reward: 1.26


Training:   1%|          | 43649/5000000 [02:01<10:12:08, 134.94it/s]


Episode 233 - Reward: 1.0, Steps: 180, Epsilon: 0.9607, Running Reward: 1.25


Training:   1%|          | 43917/5000000 [02:02<5:16:54, 260.65it/s] 


Episode 234 - Reward: 3.0, Steps: 246, Epsilon: 0.9605, Running Reward: 1.33


Training:   1%|          | 44148/5000000 [02:03<3:55:14, 351.11it/s]


Episode 235 - Reward: 2.0, Steps: 188, Epsilon: 0.9603, Running Reward: 1.37


Training:   1%|          | 44331/5000000 [02:03<4:21:24, 315.96it/s]


Episode 236 - Reward: 3.0, Steps: 249, Epsilon: 0.9601, Running Reward: 1.45


Training:   1%|          | 44691/5000000 [02:05<5:18:19, 259.45it/s]


Episode 237 - Reward: 4.0, Steps: 313, Epsilon: 0.9598, Running Reward: 1.58


Training:   1%|          | 44874/5000000 [02:06<5:15:24, 261.84it/s]


Episode 238 - Reward: 1.0, Steps: 183, Epsilon: 0.9597, Running Reward: 1.55


Training:   1%|          | 45003/5000000 [02:06<6:08:36, 224.04it/s]


Episode 239 - Reward: 2.0, Steps: 196, Epsilon: 0.9595, Running Reward: 1.57


Training:   1%|          | 45235/5000000 [02:08<7:33:21, 182.15it/s]


Episode 240 - Reward: 1.0, Steps: 192, Epsilon: 0.9593, Running Reward: 1.54


Training:   1%|          | 45445/5000000 [02:09<6:23:59, 215.05it/s]


Episode 241 - Reward: 3.0, Steps: 245, Epsilon: 0.9591, Running Reward: 1.61


Training:   1%|          | 45636/5000000 [02:10<7:52:45, 174.66it/s]


Episode 242 - Reward: 1.0, Steps: 170, Epsilon: 0.9589, Running Reward: 1.58


Training:   1%|          | 45972/5000000 [02:11<2:30:19, 549.27it/s]


Episode 243 - Reward: 3.0, Steps: 229, Epsilon: 0.9587, Running Reward: 1.65

Episode 244 - Reward: 1.0, Steps: 162, Epsilon: 0.9586, Running Reward: 1.62


Training:   1%|          | 46187/5000000 [02:12<5:59:17, 229.80it/s]


Episode 245 - Reward: 1.0, Steps: 175, Epsilon: 0.9584, Running Reward: 1.59


Training:   1%|          | 46350/5000000 [02:13<8:05:01, 170.22it/s]


Episode 246 - Reward: 1.0, Steps: 161, Epsilon: 0.9583, Running Reward: 1.56


Training:   1%|          | 46505/5000000 [02:14<6:24:11, 214.89it/s] 


Episode 247 - Reward: 0.0, Steps: 145, Epsilon: 0.9582, Running Reward: 1.48


Training:   1%|          | 46665/5000000 [02:14<4:48:04, 286.58it/s]


Episode 248 - Reward: 1.0, Steps: 167, Epsilon: 0.9580, Running Reward: 1.46


Training:   1%|          | 46831/5000000 [02:15<4:08:34, 332.11it/s]


Episode 249 - Reward: 0.0, Steps: 127, Epsilon: 0.9579, Running Reward: 1.39


Training:   1%|          | 47160/5000000 [02:16<3:34:09, 385.46it/s]


Episode 250 - Reward: 4.0, Steps: 287, Epsilon: 0.9576, Running Reward: 1.52


Training:   1%|          | 47255/5000000 [02:16<3:54:29, 352.02it/s]


Episode 251 - Reward: 1.0, Steps: 164, Epsilon: 0.9575, Running Reward: 1.49


Training:   1%|          | 47520/5000000 [02:18<5:15:40, 261.47it/s]


Episode 252 - Reward: 3.0, Steps: 262, Epsilon: 0.9573, Running Reward: 1.57


Training:   1%|          | 47703/5000000 [02:18<5:07:44, 268.20it/s]


Episode 253 - Reward: 2.0, Steps: 211, Epsilon: 0.9571, Running Reward: 1.59


Training:   1%|          | 47960/5000000 [02:20<6:59:21, 196.81it/s]


Episode 254 - Reward: 2.0, Steps: 231, Epsilon: 0.9569, Running Reward: 1.61


Training:   1%|          | 48168/5000000 [02:20<3:56:30, 348.96it/s]


Episode 255 - Reward: 1.0, Steps: 194, Epsilon: 0.9567, Running Reward: 1.58


Training:   1%|          | 48407/5000000 [02:22<6:48:04, 202.24it/s]


Episode 256 - Reward: 3.0, Steps: 257, Epsilon: 0.9565, Running Reward: 1.65


Training:   1%|          | 48651/5000000 [02:23<9:07:58, 150.59it/s] 


Episode 257 - Reward: 3.0, Steps: 257, Epsilon: 0.9562, Running Reward: 1.72


Training:   1%|          | 48799/5000000 [02:24<7:10:09, 191.84it/s] 


Episode 258 - Reward: 0.0, Steps: 142, Epsilon: 0.9561, Running Reward: 1.63


Training:   1%|          | 48934/5000000 [02:25<7:15:09, 189.62it/s]


Episode 259 - Reward: 0.0, Steps: 139, Epsilon: 0.9560, Running Reward: 1.55


Training:   1%|          | 49108/5000000 [02:26<8:26:47, 162.82it/s]


Episode 260 - Reward: 1.0, Steps: 168, Epsilon: 0.9558, Running Reward: 1.52


Training:   1%|          | 49225/5000000 [02:26<5:11:44, 264.68it/s]


Episode 261 - Reward: 0.0, Steps: 130, Epsilon: 0.9557, Running Reward: 1.45


Training:   1%|          | 49514/5000000 [02:28<10:09:13, 135.43it/s]


Episode 262 - Reward: 3.0, Steps: 262, Epsilon: 0.9555, Running Reward: 1.52


Training:   1%|          | 49664/5000000 [02:29<8:40:08, 158.62it/s] 


Episode 263 - Reward: 0.0, Steps: 129, Epsilon: 0.9553, Running Reward: 1.45


Training:   1%|          | 49810/5000000 [02:30<6:43:19, 204.55it/s]


Episode 264 - Reward: 1.0, Steps: 173, Epsilon: 0.9552, Running Reward: 1.43


Training:   1%|          | 49982/5000000 [02:31<8:12:02, 167.67it/s]2025-05-21 11:49:53.079450: I external/local_xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:557] Omitted potentially buggy algorithm eng14{} for conv (f32[32,64,7,7]{3,2,1,0}, u8[0]{0}) custom-call(f32[32,64,9,9]{3,2,1,0}, f32[64,64,3,3]{3,2,1,0}, f32[64]{0}), window={size=3x3}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBiasActivationForward", backend_config={"cudnn_conv_backend_config":{"activation_mode":"kRelu","conv_result_scale":1,"leakyrelu_alpha":0,"side_input_scale":0},"force_earliest_schedule":false,"operation_queue_id":"0","wait_on_operation_queues":[]}
Training:   1%|          | 50040/5000000 [02:38<84:29:27, 16.27it/s]


Episode 265 - Reward: 3.0, Steps: 252, Epsilon: 0.9550, Running Reward: 1.50


Training:   1%|          | 50365/5000000 [03:02<86:36:40, 15.87it/s] 


Episode 266 - Reward: 4.0, Steps: 327, Epsilon: 0.9547, Running Reward: 1.63


Training:   1%|          | 50493/5000000 [03:13<107:45:48, 12.76it/s]


Episode 267 - Reward: 0.0, Steps: 128, Epsilon: 0.9546, Running Reward: 1.55


Training:   1%|          | 50661/5000000 [03:27<113:29:26, 12.11it/s]


Episode 268 - Reward: 1.0, Steps: 170, Epsilon: 0.9544, Running Reward: 1.52


Training:   1%|          | 50849/5000000 [03:43<126:51:00, 10.84it/s]


Episode 269 - Reward: 1.0, Steps: 187, Epsilon: 0.9542, Running Reward: 1.49


Training:   1%|          | 51015/5000000 [03:56<95:36:18, 14.38it/s] 


Episode 270 - Reward: 1.0, Steps: 164, Epsilon: 0.9541, Running Reward: 1.47


Training:   1%|          | 51142/5000000 [04:07<120:57:01, 11.37it/s]


Episode 271 - Reward: 0.0, Steps: 127, Epsilon: 0.9540, Running Reward: 1.40


Training:   1%|          | 51293/5000000 [04:19<95:27:54, 14.40it/s] 


Episode 272 - Reward: 1.0, Steps: 154, Epsilon: 0.9538, Running Reward: 1.38


Training:   1%|          | 51437/5000000 [04:29<111:30:45, 12.33it/s]


Episode 273 - Reward: 0.0, Steps: 145, Epsilon: 0.9537, Running Reward: 1.31


Training:   1%|          | 51613/5000000 [04:42<91:33:56, 15.01it/s] 


Episode 274 - Reward: 1.0, Steps: 176, Epsilon: 0.9535, Running Reward: 1.29


Training:   1%|          | 51825/5000000 [04:56<100:04:10, 13.74it/s]


Episode 275 - Reward: 2.0, Steps: 212, Epsilon: 0.9534, Running Reward: 1.33


Training:   1%|          | 51961/5000000 [05:06<89:24:04, 15.37it/s] 


Episode 276 - Reward: 0.0, Steps: 134, Epsilon: 0.9532, Running Reward: 1.26


Training:   1%|          | 52249/5000000 [05:27<92:47:24, 14.81it/s] 


Episode 277 - Reward: 4.0, Steps: 287, Epsilon: 0.9530, Running Reward: 1.40


Training:   1%|          | 52393/5000000 [05:37<88:37:44, 15.51it/s] 


Episode 278 - Reward: 0.0, Steps: 145, Epsilon: 0.9528, Running Reward: 1.33


Training:   1%|          | 52581/5000000 [05:50<93:26:00, 14.71it/s] 


Episode 279 - Reward: 1.0, Steps: 190, Epsilon: 0.9527, Running Reward: 1.31


Training:   1%|          | 52745/5000000 [06:02<102:57:53, 13.35it/s]


Episode 280 - Reward: 1.0, Steps: 161, Epsilon: 0.9525, Running Reward: 1.30


Training:   1%|          | 52881/5000000 [06:12<90:58:54, 15.10it/s] 


Episode 281 - Reward: 0.0, Steps: 138, Epsilon: 0.9524, Running Reward: 1.23


Training:   1%|          | 53061/5000000 [06:24<96:49:18, 14.19it/s] 


Episode 282 - Reward: 1.0, Steps: 180, Epsilon: 0.9522, Running Reward: 1.22


Training:   1%|          | 53193/5000000 [06:33<99:21:54, 13.83it/s] 


Episode 283 - Reward: 0.0, Steps: 132, Epsilon: 0.9521, Running Reward: 1.16


Training:   1%|          | 53497/5000000 [06:54<97:21:51, 14.11it/s] 


Episode 284 - Reward: 4.0, Steps: 302, Epsilon: 0.9519, Running Reward: 1.30


Training:   1%|          | 53701/5000000 [07:08<95:33:45, 14.38it/s] 


Episode 285 - Reward: 2.0, Steps: 207, Epsilon: 0.9517, Running Reward: 1.34


Training:   1%|          | 53953/5000000 [07:25<93:43:08, 14.66it/s] 


Episode 286 - Reward: 3.0, Steps: 251, Epsilon: 0.9514, Running Reward: 1.42


Training:   1%|          | 54093/5000000 [07:35<87:46:50, 15.65it/s] 


Episode 287 - Reward: 0.0, Steps: 138, Epsilon: 0.9513, Running Reward: 1.35


Training:   1%|          | 54265/5000000 [07:46<95:24:08, 14.40it/s] 


Episode 288 - Reward: 1.0, Steps: 174, Epsilon: 0.9512, Running Reward: 1.33


Training:   1%|          | 54501/5000000 [08:02<97:19:09, 14.12it/s] 


Episode 289 - Reward: 3.0, Steps: 237, Epsilon: 0.9509, Running Reward: 1.41


Training:   1%|          | 54639/5000000 [08:13<125:37:23, 10.94it/s]


Episode 290 - Reward: 0.0, Steps: 134, Epsilon: 0.9508, Running Reward: 1.34


Training:   1%|          | 54873/5000000 [08:32<97:42:52, 14.06it/s] 


Episode 291 - Reward: 3.0, Steps: 235, Epsilon: 0.9506, Running Reward: 1.43


Training:   1%|          | 55173/5000000 [08:56<96:04:15, 14.30it/s] 


Episode 292 - Reward: 4.0, Steps: 303, Epsilon: 0.9503, Running Reward: 1.55


Training:   1%|          | 55351/5000000 [09:09<106:17:44, 12.92it/s]


Episode 293 - Reward: 1.0, Steps: 175, Epsilon: 0.9502, Running Reward: 1.53


Training:   1%|          | 55561/5000000 [09:24<97:10:12, 14.13it/s] 


Episode 294 - Reward: 2.0, Steps: 212, Epsilon: 0.9500, Running Reward: 1.55


Training:   1%|          | 55693/5000000 [09:34<93:28:46, 14.69it/s] 


Episode 295 - Reward: 0.0, Steps: 133, Epsilon: 0.9499, Running Reward: 1.47


Training:   1%|          | 56017/5000000 [09:59<166:03:09,  8.27it/s]


Episode 296 - Reward: 4.0, Steps: 322, Epsilon: 0.9496, Running Reward: 1.60


Training:   1%|          | 56149/5000000 [10:11<128:07:05, 10.72it/s]


Episode 297 - Reward: 0.0, Steps: 133, Epsilon: 0.9495, Running Reward: 1.52


Training:   1%|          | 56313/5000000 [10:24<107:54:24, 12.73it/s]


Episode 298 - Reward: 1.0, Steps: 164, Epsilon: 0.9493, Running Reward: 1.49


Training:   1%|          | 56553/5000000 [10:44<111:42:00, 12.29it/s]


Episode 299 - Reward: 2.0, Steps: 239, Epsilon: 0.9491, Running Reward: 1.52


Training:   1%|          | 56649/5000000 [10:53<114:38:16, 11.98it/s]

KeyboardInterrupt: 

Training:   1%|          | 56652/5000000 [11:04<114:38:16, 11.98it/s]

In [None]:
# Evaluation step: announce it, then hand off to evaluate_agent (defined in an
# earlier cell) for 5 episodes with rendering switched on.
# NOTE(review): "best_model" is passed through as the first positional argument;
# presumably it names the checkpoint written during training -- confirm against
# evaluate_agent, and confirm such a checkpoint exists if training was interrupted.
print("\nEvaluating the trained agent...")
evaluate_agent(
    "best_model",
    episodes=5,
    render=True,
)