In [1]:
import numpy as np
import tensorflow as tf
import gymnasium as gym
from tensorflow import keras
import random
import time

# Ensure TensorFlow uses GPU if available
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Use mixed precision for faster computation on compatible hardware
# tf.keras.mixed_precision.set_global_policy('mixed_float16')

@tf.function
def compute_loss_and_gradients(model, observations, actions, weights):
    """TF function for computing loss and gradients"""
    with tf.GradientTape() as tape:
        logits = model(observations)  # Forward pass
        # Use sparse categorical crossentropy for discrete action space
        loss_values = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=False)(actions, logits)
        # Weight the losses by the advantages
        weighted_loss = tf.reduce_mean(loss_values * weights)
        
    # Compute gradients
    grads = tape.gradient(weighted_loss, model.trainable_variables)
    return weighted_loss, grads

def create_model(input_shape, n_actions):
    """Create a more powerful model architecture"""
    inputs = keras.Input(shape=input_shape)
    
    # Normalize inputs for better stability
    x = keras.layers.BatchNormalization()(inputs)
    
    # Create wider layers with residual connections
    x = keras.layers.Dense(128, activation=None)(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Activation('relu')(x)
    x = keras.layers.Dropout(0.2)(x)
    
    # Second layer
    residual = x
    x = keras.layers.Dense(128, activation=None)(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Activation('relu')(x)
    x = keras.layers.Dropout(0.2)(x)
    
    # Add residual connection
    x = keras.layers.Add()([x, residual])
    
    # Output layer with advantage baseline
    action_logits = keras.layers.Dense(n_actions)(x)
    action_probs = keras.layers.Activation('softmax', name='policy')(action_logits)
    value = keras.layers.Dense(1, name='value')(x)
    
    model = keras.Model(inputs=inputs, outputs=[action_probs, value])
    return model

def discount_rewards(rewards, discount_factor=0.99, normalize=True):
    """Calculate discounted rewards more efficiently"""
    discounted_rewards = np.zeros_like(rewards, dtype=np.float32)
    running_add = 0
    
    for t in reversed(range(len(rewards))):
        running_add = running_add * discount_factor + rewards[t]
        discounted_rewards[t] = running_add
    
    if normalize and len(discounted_rewards) > 0:
        discounted_rewards = (discounted_rewards - np.mean(discounted_rewards)) / (np.std(discounted_rewards) + 1e-8)
    
    return discounted_rewards

def compute_advantages(rewards, values, discount_factor=0.99, gae_lambda=0.95):
    """Compute Generalized Advantage Estimation for more stable learning"""
    # Ensure values has one more element than rewards (for the terminal state)
    if len(values) != len(rewards) + 1:
        # If values is the same length as rewards, append a zero for the terminal state
        if len(values) == len(rewards):
            values = np.append(values, 0)
        else:
            raise ValueError(f"Values shape {len(values)} is not compatible with rewards shape {len(rewards)}")
    
    # Now we're sure values has exactly one more element than rewards
    deltas = rewards + discount_factor * values[1:] - values[:-1]
    advantages = np.zeros_like(rewards, dtype=np.float32)
    
    # GAE calculation
    gae = 0
    for t in reversed(range(len(rewards))):
        gae = deltas[t] + discount_factor * gae_lambda * gae
        advantages[t] = gae
    
    # Normalize advantages
    if len(advantages) > 0:
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
    
    return advantages

def collect_trajectories(env, model, n_episodes):
    """Collect experience trajectories efficiently"""
    observations = []
    actions = []
    rewards = []
    values = []
    dones = []
    
    for _ in range(n_episodes):
        observation, info = env.reset()
        episode_observations = []
        episode_actions = []
        episode_rewards = []
        episode_values = []
        episode_dones = []
        
        done = False
        while not done:
            # Convert to batch of 1 and get policy and value prediction
            obs_tensor = tf.convert_to_tensor([observation], dtype=tf.float32)
            policy_probs, value = model(obs_tensor)
            
            # Sample action from policy
            action_probs = policy_probs.numpy()[0]
            action = np.random.choice(len(action_probs), p=action_probs)
            
            # Take action in environment
            next_observation, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            
            # Store step data
            episode_observations.append(observation)
            episode_actions.append(action)
            episode_rewards.append(reward)
            episode_values.append(value.numpy()[0][0])
            episode_dones.append(done)
            
            observation = next_observation
        
        # Add episode data to batch data
        observations.extend(episode_observations)
        actions.extend(episode_actions)
        rewards.extend(episode_rewards)
        values.extend(episode_values)
        dones.extend(episode_dones)
    
    return np.array(observations), np.array(actions), np.array(rewards), np.array(values), np.array(dones)

def train_lunar_lander(
    n_epochs=500,
    n_episodes_per_update=16,
    discount_factor=0.99,
    gae_lambda=0.95,
    learning_rate=3e-4,
    entropy_weight=0.01,
    value_loss_weight=0.5,
    max_grad_norm=0.5,
    target_reward=200,
    progress_bar=True
):
    """Train a Lunar Lander agent with optimized parameters"""
    # Create environment
    env = gym.make("LunarLander-v3")
    n_actions = env.action_space.n
    
    # Create model
    model = create_model(env.observation_space.shape, n_actions)
    
    # Separate optimizers for policy and value networks with optimized parameters
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=max_grad_norm)
    
    # Metrics tracking
    mean_rewards = []
    start_time = time.time()
    
    # Training loop
    for epoch in range(n_epochs):
        # Collect trajectories
        observations, actions, rewards, values, dones = collect_trajectories(env, model, n_episodes_per_update)
        
        # Calculate episode returns and mean reward
        episode_rewards = []
        start_idx = 0
        for i, done in enumerate(dones):
            if done:
                episode_rewards.append(sum(rewards[start_idx:i+1]))
                start_idx = i + 1
        
        mean_reward = np.mean(episode_rewards)
        mean_rewards.append(mean_reward)
        
        # Calculate advantages and returns
        returns = []
        start_idx = 0
        for i, done in enumerate(dones):
            if done:
                episode_rewards = rewards[start_idx:i+1]
                episode_values = values[start_idx:i+1]
                
                # Compute advantages using GAE
                advantages = compute_advantages(
                    episode_rewards, 
                    episode_values, 
                    discount_factor, 
                    gae_lambda
                )
                
                # Get value targets (Advantage + value)
                value_targets = advantages + episode_values
                returns.extend(value_targets)
                start_idx = i + 1
        
        returns = np.array(returns)
        
        # Training in batches for better stability
        batch_size = 512
        indices = np.arange(len(observations))
        np.random.shuffle(indices)
        
        for start_idx in range(0, len(observations), batch_size):
            batch_indices = indices[start_idx:start_idx + batch_size]
            
            # Get batch data
            batch_observations = tf.convert_to_tensor(observations[batch_indices], dtype=tf.float32)
            batch_actions = tf.convert_to_tensor(actions[batch_indices], dtype=tf.int32)
            batch_returns = tf.convert_to_tensor(returns[batch_indices], dtype=tf.float32)
            
            # Compute loss and apply gradients
            with tf.GradientTape() as tape:
                # Forward pass
                action_probs, value_preds = model(batch_observations)
                
                # Flatten value predictions
                value_preds = tf.squeeze(value_preds)
                
                # Compute policy loss (negative log likelihood weighted by advantages)
                action_masks = tf.one_hot(batch_actions, n_actions)
                log_probs = tf.math.log(tf.maximum(action_probs, 1e-8))
                policy_loss = -tf.reduce_mean(
                    tf.reduce_sum(log_probs * action_masks, axis=1) * batch_returns
                )
                
                # Compute value loss
                value_loss = tf.reduce_mean(tf.square(batch_returns - value_preds))
                
                # Compute entropy bonus
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(action_probs * log_probs, axis=1)
                )
                
                # Total loss
                loss = policy_loss + value_loss_weight * value_loss - entropy_weight * entropy
            
            # Get gradients and apply
            grads = tape.gradient(loss, model.trainable_variables)
            
            # Clip gradients for stability
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
            
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
        
        # Print progress
        elapsed_time = time.time() - start_time
        if progress_bar:
            print(f"\rEpoch {epoch+1}/{n_epochs} | Mean Reward: {mean_reward:.2f} | Time: {elapsed_time:.2f}s", end="")
        
        # Check if we've reached our target reward
        if mean_reward >= target_reward:
            if progress_bar:
                print(f"\nReached target reward of {target_reward} in {epoch+1} epochs!")
            break

    env.close()
    
    # Final stats
    if progress_bar:
        print(f"\nTraining completed in {time.time() - start_time:.2f} seconds")
    
    return model, mean_rewards

def evaluate_agent(model, env_name="LunarLander-v3", n_episodes=10, render=False):
    """Evaluate the trained agent"""
    env = gym.make(env_name, render_mode="rgb_array" if render else None)
    rewards = []
    
    for episode in range(n_episodes):
        observation, _ = env.reset()
        episode_reward = 0
        done = False
        
        while not done:
            obs_tensor = tf.convert_to_tensor([observation], dtype=tf.float32)
            action_probs, _ = model(obs_tensor)
            action = np.argmax(action_probs[0])  # Take greedy action during evaluation
            observation, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            done = terminated or truncated
            
            if render:
                env.render()
        
        rewards.append(episode_reward)
        print(f"Episode {episode+1}/{n_episodes}: Reward = {episode_reward:.2f}")
    
    env.close()
    print(f"Average Reward over {n_episodes} episodes: {np.mean(rewards):.2f}")
    return np.mean(rewards)

2025-05-19 15:32:24.899264: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747661544.929262  124124 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747661544.937998  124124 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-19 15:32:24.963006: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Train the agent
model, rewards = train_lunar_lander(
    n_epochs=500,
    n_episodes_per_update=8,  # Reduced for faster iterations
    learning_rate=3e-4,
    entropy_weight=0.01,
    value_loss_weight=0.5,
    gae_lambda=0.95,
    target_reward=210
)

# Save the model
model.save("lunar_lander_model.keras")

# Evaluate the agent
evaluate_agent(model, render=True)

I0000 00:00:1747661549.045084  124124 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1530 MB memory:  -> device: 0, name: Quadro P600, pci bus id: 0000:01:00.0, compute capability: 6.1


Epoch 500/500 | Mean Reward: -250.75 | Time: 6958.41ss
Training completed in 6958.41 seconds


ValueError: Invalid filepath extension for saving. Please add either a `.keras` extension for the native Keras format (recommended) or a `.h5` extension. Use `model.export(filepath)` if you want to export a SavedModel for use with TFLite/TFServing/etc. Received: filepath=lunar_lander_model.

In [3]:
# Save the model
model.save("lunar_lander_model.keras")

# Evaluate the agent
evaluate_agent(model, render=True)

Episode 1/10: Reward = -679.91
Episode 2/10: Reward = -486.24
Episode 3/10: Reward = -107.41
Episode 4/10: Reward = -172.51
Episode 5/10: Reward = -125.23
Episode 6/10: Reward = -510.38
Episode 7/10: Reward = -198.39
Episode 8/10: Reward = -114.42
Episode 9/10: Reward = -592.81
Episode 10/10: Reward = -83.31
Average Reward over 10 episodes: -307.06


-307.0609646966353