In [None]:

import os
# Set environment variable to handle OpenMP runtime conflict
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

# Rest of the imports
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Normal
import matplotlib.pyplot as plt
import time


class RunningMeanStd:
    """Running scalar mean / variance / std pooled over every value seen so far.

    Each call to :meth:`update` flattens its batch (statistics are taken over
    all elements, there is no per-feature axis) and merges the batch moments
    into the running moments with the parallel-variance combination formula.

    Attributes:
        mean: Running mean (starts at 0).
        var: Running population variance (starts at 1).
        std: Square root of ``var``.
        epsilon: Initial pseudo-count.
        count: Number of values merged so far, plus ``epsilon``.
    """

    def __init__(self, epsilon=1e-4):
        self.mean = 0
        self.std = 1
        self.var = 1
        self.epsilon = epsilon
        # A tiny non-zero starting count keeps the first merge free of a
        # division by zero while giving the initial mean/var almost no weight.
        self.count = self.epsilon

    def update(self, x):
        """Merge a batch of values into the running statistics.

        Args:
            x: Scalar, sequence or array of any shape. Empty input is ignored.
        """
        x = np.asarray(x)
        # np.mean / np.var below pool over *all* elements, so the sample count
        # must be the element count. len(x) would only count rows of a 2-D
        # batch and raises for scalars.
        batch_count = x.size
        if batch_count == 0:
            # Nothing to merge; np.mean([]) would turn the statistics into NaN.
            return

        batch_mean = np.mean(x)
        batch_var = np.var(x)
        total_count = self.count + batch_count

        delta = batch_mean - self.mean
        self.mean = self.mean + delta * batch_count / total_count
        # Combine the two sums of squared deviations, plus a correction term
        # for the distance between the old mean and the batch mean.
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + np.square(delta) * self.count * batch_count / total_count
        self.var = m2 / total_count
        self.std = np.sqrt(self.var)
        self.count = total_count



# Actor and Critic Network definitions remain the same
class Actor(nn.Module):
    """Gaussian policy network: maps a state to a per-action mean and std.

    A shared trunk (state_dim -> 512 -> 256 -> 128) feeds two heads. Both
    heads end in ``Tanh``, so the mean lies in [-1, 1] and the log-std lies
    in [-1, 1] before it is exponentiated.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        # Trunk: three Linear -> LayerNorm -> ReLU stages; every stage except
        # the last is followed by Dropout(0.1).
        widths = (state_dim, 512, 256, 128)
        final_stage = len(widths) - 2
        trunk = []
        for stage, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            trunk += [nn.Linear(fan_in, fan_out), nn.LayerNorm(fan_out), nn.ReLU()]
            if stage != final_stage:
                trunk.append(nn.Dropout(0.1))
        self.net = nn.Sequential(*trunk)

        # Head producing the action mean.
        self.mean_layer = nn.Sequential(nn.Linear(128, action_dim), nn.Tanh())
        # Head producing the (tanh-bounded) log standard deviation.
        self.log_std_layer = nn.Sequential(nn.Linear(128, action_dim), nn.Tanh())

    def forward(self, state):
        """Return ``(mean, std)`` for ``state``; ``std`` is ``exp(log_std)``, hence positive."""
        hidden = self.net(state)
        action_mean = self.mean_layer(hidden)
        action_std = self.log_std_layer(hidden).exp()
        return action_mean, action_std

class Critic(nn.Module):
    """State-value network: maps a state to a single scalar value estimate.

    Uses the same hidden layout as the actor trunk (state_dim -> 512 -> 256
    -> 128), followed by a linear layer with one output.
    """

    def __init__(self, state_dim):
        super().__init__()
        modules = []
        fan_in = state_dim
        # (hidden width, whether a Dropout(0.1) follows the stage)
        for fan_out, use_dropout in ((512, True), (256, True), (128, False)):
            modules.extend((nn.Linear(fan_in, fan_out), nn.LayerNorm(fan_out), nn.ReLU()))
            if use_dropout:
                modules.append(nn.Dropout(0.1))
            fan_in = fan_out
        # Scalar value output.
        modules.append(nn.Linear(fan_in, 1))
        self.net = nn.Sequential(*modules)

    def forward(self, state):
        """Return the value estimate for ``state`` (last dimension has size 1)."""
        value = self.net(state)
        return value



class PPO:
    """PPO agent with a clipped surrogate objective and separate actor/critic networks.

    The agent owns the two networks, their Adam optimizers, running
    normalizers for states and rewards, and the bookkeeping needed to save
    and restore a training run.
    """

    def __init__(self, state_dim, action_dim):
        """Build networks, optimizers and hyperparameters.

        Args:
            state_dim: Size of the observation vector.
            action_dim: Size of the action vector.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.actor = Actor(state_dim, action_dim).to(self.device)
        self.critic = Critic(state_dim).to(self.device)

        # Conservative learning rates.
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=5e-4)

        # PPO hyperparameters.
        self.gamma = 0.99
        self.gae_lambda = 0.95
        self.clip_ratio = 0.1
        self.max_grad_norm = 0.3

        # Entropy bonus coefficient; decayed in update() on each new best reward.
        self.initial_entropy_coef = 0.01
        self.entropy_coef = self.initial_entropy_coef
        self.min_entropy_coef = 0.001
        self.entropy_decay = 0.995

        # Running statistics used for normalization.
        self.reward_normalizer = RunningMeanStd()
        self.state_normalizer = RunningMeanStd()
        self.advantage_normalizer = RunningMeanStd()

        # Rollout length cap and mini-batch size.
        self.max_memory_size = 2048
        self.batch_size = 64

        # Training tracking.
        self.best_reward = float('-inf')
        self.training_step = 0

    def _to_tensor(self, data):
        """Convert array-like ``data`` to a float32 tensor on ``self.device``."""
        if isinstance(data, torch.Tensor):
            return data.to(device=self.device, dtype=torch.float32)
        return torch.as_tensor(np.asarray(data, dtype=np.float32), device=self.device)

    def normalize_state(self, state, update=True):
        """Normalize a state (or a batch of states) with the running statistics.

        Args:
            state: ndarray, or any array-like such as a list of per-step
                observations. Tensors are returned unchanged.
            update: When True (default) the running statistics are first
                updated with ``state``; pass False to normalize with the
                statistics as they currently stand.

        Returns:
            The normalized ndarray, or ``state`` itself if it is a tensor.
        """
        if isinstance(state, torch.Tensor):
            return state
        # Lists of observations must be normalized too: the rollout buffer is a
        # list, and letting it through untouched would train the networks on
        # raw states while get_action() feeds them normalized ones.
        state = np.asarray(state, dtype=np.float32)
        if update:
            self.state_normalizer.update(state)
        return (state - self.state_normalizer.mean) / (self.state_normalizer.std + 1e-8)

    def normalize_reward(self, reward):
        """Update the running reward statistics with ``reward`` and return it normalized."""
        self.reward_normalizer.update(np.array([reward]))
        return (reward - self.reward_normalizer.mean) / (self.reward_normalizer.std + 1e-8)

    def get_action(self, state):
        """Sample an action for ``state``.

        Returns:
            tuple: ``(action, value, log_prob)`` where ``action`` is a NumPy
            array clamped to [-1, 1], ``value`` is the critic estimate as a
            float and ``log_prob`` is the summed log-probability as a float.
        """
        with torch.no_grad():
            state = self._to_tensor(self.normalize_state(state))
            mean, std = self.actor(state)
            dist = Normal(mean, std)
            action = dist.sample()
            action = torch.clamp(action, -1.0, 1.0)
            # NOTE(review): the log-probability is evaluated at the clamped
            # action; update() evaluates the same stored action, so the two
            # sides of the probability ratio stay consistent.
            log_prob = dist.log_prob(action).sum(dim=-1)
            value = self.critic(state)
            return action.cpu().numpy(), value.item(), log_prob.item()

    def update(self, memory, episode_reward):
        """Run the PPO optimization over one collected rollout.

        Args:
            memory: Dict with the keys ``'states'``, ``'actions'``,
                ``'log_probs'``, ``'rewards'``, ``'returns'`` and
                ``'advantages'``, each a per-step sequence of equal length.
            episode_reward: Total reward of the rollout; a new best value
                triggers one decay step of the entropy coefficient.

        Returns:
            dict: ``'policy_loss'``, ``'value_loss'`` and ``'entropy'`` of the
            last mini-batch processed.

        Raises:
            ValueError: If the rollout contains no steps.
        """
        # Normalize the stored raw states exactly like get_action() does, but
        # without feeding them into the running statistics a second time.
        states = self._to_tensor(self.normalize_state(memory['states'], update=False))
        actions = self._to_tensor(memory['actions'])
        old_log_probs = self._to_tensor(memory['log_probs'])
        returns = self._to_tensor(memory['returns'])
        advantages = self._to_tensor(memory['advantages'])

        n_samples = states.size(0)
        if n_samples == 0:
            raise ValueError("update() called with an empty memory")

        # Keep the running reward statistics (saved in checkpoints) current.
        # The normalized rewards themselves are not used by the losses below.
        self.reward_normalizer.update(np.asarray(memory['rewards'], dtype=np.float64))

        # Normalize advantages. std() of a single element is NaN, so a
        # one-step rollout keeps its raw advantage.
        if n_samples > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        n_epochs = 4  # Multiple passes over the same data

        for _ in range(n_epochs):
            indices = np.arange(n_samples)
            np.random.shuffle(indices)

            for start_idx in range(0, n_samples, self.batch_size):
                idx = indices[start_idx:start_idx + self.batch_size]
                batch_states = states[idx]
                batch_actions = actions[idx]
                batch_old_log_probs = old_log_probs[idx]
                batch_returns = returns[idx]
                batch_advantages = advantages[idx]

                # Current policy distribution.
                means, stds = self.actor(batch_states)
                dist = Normal(means, stds)
                new_log_probs = dist.log_prob(batch_actions).sum(dim=-1)
                entropy = dist.entropy().mean()

                # Clipped surrogate objective.
                ratios = torch.exp(new_log_probs - batch_old_log_probs)
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1 - self.clip_ratio, 1 + self.clip_ratio) * batch_advantages
                policy_loss = -torch.min(surr1, surr2).mean()

                # Value loss. Squeeze only the trailing axis so a mini-batch
                # of one sample keeps its batch dimension.
                current_values = self.critic(batch_states).squeeze(-1)
                value_loss = 0.5 * ((current_values - batch_returns) ** 2).mean()

                # Entropy bonus for exploration.
                entropy_loss = -self.entropy_coef * entropy

                total_loss = policy_loss + value_loss + entropy_loss

                self.actor_optimizer.zero_grad()
                self.critic_optimizer.zero_grad()
                total_loss.backward()

                # Clip gradients of each network separately.
                torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
                torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)

                self.actor_optimizer.step()
                self.critic_optimizer.step()

        # Decay the entropy coefficient only when a new best reward is reached.
        if episode_reward > self.best_reward:
            self.best_reward = episode_reward
            self.entropy_coef = max(self.entropy_coef * self.entropy_decay, self.min_entropy_coef)

        self.training_step += 1

        return {
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'entropy': entropy.item()
        }

    @staticmethod
    def _normalizer_state(normalizer):
        """Snapshot a normalizer as plain Python floats."""
        return {
            'mean': float(normalizer.mean),
            'std': float(normalizer.std),
            'var': float(normalizer.var),
            'count': float(normalizer.count)
        }

    def save(self, path):
        """Save networks, optimizers, normalizer statistics and training counters to ``path``.

        Scalars are stored as builtin floats rather than NumPy scalars so the
        checkpoint contains only tensors and primitive types.
        """
        torch.save({
            'actor_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'actor_optimizer_state_dict': self.actor_optimizer.state_dict(),
            'critic_optimizer_state_dict': self.critic_optimizer.state_dict(),
            'best_reward': float(self.best_reward),
            'training_step': self.training_step,
            'entropy_coef': self.entropy_coef,
            'reward_normalizer_state': self._normalizer_state(self.reward_normalizer),
            'state_normalizer_state': self._normalizer_state(self.state_normalizer)
        }, path)

    def load(self, path):
        """Restore everything written by :meth:`save` from ``path``."""
        # map_location lets a checkpoint written on a GPU machine be restored
        # on a CPU-only one (and vice versa).
        checkpoint = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor_state_dict'])
        self.critic.load_state_dict(checkpoint['critic_state_dict'])
        self.actor_optimizer.load_state_dict(checkpoint['actor_optimizer_state_dict'])
        self.critic_optimizer.load_state_dict(checkpoint['critic_optimizer_state_dict'])
        self.best_reward = checkpoint['best_reward']
        self.training_step = checkpoint['training_step']
        self.entropy_coef = checkpoint['entropy_coef']

        # Restore normalizer statistics.
        for normalizer, key in ((self.reward_normalizer, 'reward_normalizer_state'),
                                (self.state_normalizer, 'state_normalizer_state')):
            saved = checkpoint[key]
            normalizer.mean = saved['mean']
            normalizer.std = saved['std']
            normalizer.var = saved['var']
            normalizer.count = saved['count']


def plot_training_results(rewards):
    """
    Draw per-episode rewards plus their 100-episode moving average.

    The figure is written to 'training_results.png' and then shown. Any
    exception raised while plotting is reported on stdout instead of being
    propagated, and all figures are closed on exit.
    """
    window = 100
    try:
        series = np.array(rewards, dtype=np.float32)

        # Start from a clean slate, then open the figure.
        plt.close('all')
        plt.figure(figsize=(10, 6))

        # Raw rewards: thin, semi-transparent line.
        plt.plot(series, alpha=0.3, color='blue', label='Episode Reward', linewidth=1.0)

        # Moving average: only drawn once at least one full window exists.
        n_windows = len(series) - window + 1
        if n_windows >= 1:
            smoothed = np.array(
                [np.mean(series[start:start + window]) for start in range(n_windows)],
                dtype=np.float64,
            )
            # Each average is plotted at the index of its window's last episode.
            episodes = range(window - 1, len(series))
            plt.plot(episodes, smoothed,
                     color='red',
                     label='100-Episode Moving Average',
                     linewidth=3.0)

        plt.title('Training Progress', fontsize=14, pad=15)
        plt.xlabel('Episode', fontsize=12)
        plt.ylabel('Reward', fontsize=12)
        plt.legend(fontsize=10)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()

        # High DPI keeps the lines crisp in the saved image.
        plt.savefig('training_results.png', dpi=300, bbox_inches='tight')
        plt.show()

    except Exception as e:
        print(f"Error in plotting: {e}")

    finally:
        # Release every figure regardless of the outcome.
        plt.close('all')



def train(total_episodes=10000, eval_frequency=100, render_every=20, show_gui=False):
    """
    Train a PPO agent on BipedalWalker-v3, performing one policy update per episode.

    Each episode is collected with the current policy (capped at
    ``agent.max_memory_size`` steps), turned into returns/advantages with
    ``compute_returns`` and passed to ``agent.update``. Every
    ``eval_frequency`` episodes the policy is evaluated and the best model is
    saved to 'best_model.pth'; every 100 episodes a checkpoint and a progress
    plot are written. On exit (including Ctrl+C) the final model and plot are
    saved and a summary is printed.

    Args:
        total_episodes (int): Maximum number of episodes to train
        eval_frequency (int): How often to run evaluation episodes
        render_every (int): How often to render training episodes
        show_gui (bool): Whether to show the GUI during training

    Returns:
        tuple: ``(agent, training_info)`` where ``training_info`` is a dict of
        per-episode lists (rewards, losses, entropy, learning rate) and
        evaluation results.
    """
    # Create environments (a separate one is used for evaluation)
    env = gym.make('BipedalWalker-v3')
    eval_env = gym.make('BipedalWalker-v3')

    # Initialize agent
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    agent = PPO(state_dim, action_dim)

    # Training variables
    best_reward = float('-inf')      # best running-average training reward
    best_avg_reward = float('-inf')  # best evaluation reward
    episode_rewards = []
    display_env = None
    # Tracked explicitly so the summary in `finally` works even when the loop
    # body never runs (e.g. total_episodes=0).
    episodes_started = 0

    # Early stopping variables
    patience = 200000
    patience_counter = 0
    min_improvement = 1.0  # Minimum improvement to reset patience

    # Progress tracking
    training_info = {
        'episode_rewards': [],
        'eval_rewards': [],
        'eval_episodes': [],
        'actor_losses': [],
        'critic_losses': [],
        'entropy_values': [],
        'learning_rates': []
    }

    try:
        for episode in range(total_episodes):
            episodes_started = episode + 1

            # Per-episode rollout buffer
            memory = {
                'states': [],
                'actions': [],
                'rewards': [],
                'dones': [],
                'log_probs': [],
                'values': []
            }

            state, _ = env.reset()
            episode_reward = 0
            steps = 0
            render_this_episode = show_gui and episode % render_every == 0

            # (Re)create the display environment for rendered episodes
            if render_this_episode:
                if display_env is not None:
                    display_env.close()
                display_env = gym.make('BipedalWalker-v3', render_mode='human')
                display_state, _ = display_env.reset()

            # Episode loop
            while True:
                action, value, log_prob = agent.get_action(state)

                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated

                # Store transition. Only a true termination is recorded as
                # "done": a time-limit truncation is not a terminal state, so
                # its value must still be bootstrapped from the critic.
                memory['states'].append(state)
                memory['actions'].append(action)
                memory['rewards'].append(reward)
                memory['dones'].append(terminated)
                memory['log_probs'].append(log_prob)
                memory['values'].append(value)

                episode_reward += reward
                state = next_state
                steps += 1

                # Step the display copy. When it finishes, stop rendering for
                # the rest of this episode instead of cutting the training
                # episode short.
                if render_this_episode and display_env is not None:
                    display_action, _, _ = agent.get_action(display_state)
                    display_state, _, display_terminated, display_truncated, _ = display_env.step(display_action)
                    if display_terminated or display_truncated:
                        display_env.close()
                        display_env = None

                if done:
                    break

                # Rollout length cap: stop collecting, bootstrap below
                if len(memory['states']) >= agent.max_memory_size:
                    break

            # Bootstrap value for the state after the last stored step: zero
            # only after a true termination, the critic estimate otherwise.
            final_value = 0.0 if terminated else agent.get_action(state)[1]
            returns, advantages = compute_returns(
                memory['rewards'],
                memory['dones'],
                memory['values'],
                final_value,
                agent.gamma,
                agent.gae_lambda
            )

            memory['returns'] = returns
            memory['advantages'] = advantages

            loss_info = agent.update(memory, episode_reward)

            # Store episode results
            episode_rewards.append(episode_reward)
            training_info['episode_rewards'].append(episode_reward)
            training_info['actor_losses'].append(loss_info['policy_loss'])
            training_info['critic_losses'].append(loss_info['value_loss'])
            training_info['entropy_values'].append(loss_info['entropy'])
            training_info['learning_rates'].append(agent.actor_optimizer.param_groups[0]['lr'])

            # Average over the last (up to) 100 episodes
            avg_reward = np.mean(episode_rewards[-100:])

            # Run evaluation episodes
            if episode % eval_frequency == 0:
                eval_reward = evaluate_policy(agent, eval_env, episodes=5)
                training_info['eval_rewards'].append(eval_reward)
                training_info['eval_episodes'].append(episode)

                # Keep the best-evaluating model
                if eval_reward > best_avg_reward:
                    best_avg_reward = eval_reward
                    agent.save('best_model.pth')
                    print(f"New best model saved with eval reward: {eval_reward:.2f}")

            # Early stopping check
            if avg_reward > best_reward + min_improvement:
                best_reward = avg_reward
                patience_counter = 0
                print(f"New best reward: {best_reward:.2f}")
            else:
                patience_counter += 1

            # Print progress
            print(f"Episode {episode + 1}")
            print(f"Reward: {episode_reward:.2f}")
            print(f"Average Reward (last 100): {avg_reward:.2f}")
            print(f"Steps: {steps}")
            print(f"Learning Rate: {agent.actor_optimizer.param_groups[0]['lr']:.2e}")
            print(f"Entropy Coefficient: {agent.entropy_coef:.2e}")
            print("-" * 50)

            # Save training state periodically
            if episode % 100 == 0:
                agent.save(f'checkpoint_episode_{episode}.pth')
                plot_training_progress(training_info, save_path=f'training_progress_episode_{episode}.png')

            # Early stopping
            if patience_counter >= patience:
                print("Early stopping triggered!")
                break

            # Success criterion
            if avg_reward >= 300:  # You can adjust this threshold
                print("Environment solved!")
                break

    except KeyboardInterrupt:
        print("\nTraining interrupted by user")

    finally:
        # Cleanup
        env.close()
        eval_env.close()
        if display_env is not None:
            display_env.close()

        # Save final model and progress
        agent.save('final_model.pth')
        plot_training_progress(training_info, save_path='final_training_progress.png')

        # Print final statistics
        print("\nTraining Summary:")
        print(f"Total Episodes: {episodes_started}")
        print(f"Best Average Reward: {best_reward:.2f}")
        print(f"Best Evaluation Reward: {best_avg_reward:.2f}")
        print(f"Final Learning Rate: {agent.actor_optimizer.param_groups[0]['lr']:.2e}")
        print(f"Final Entropy Coefficient: {agent.entropy_coef:.2e}")

    return agent, training_info


def evaluate_policy(agent, env, episodes=5):
    """
    Evaluate the policy without exploration.

    The actor's mean action (clamped to [-1, 1]) is used at every step, and
    the actor is switched to eval mode for the duration of the call so that
    its Dropout layers do not inject randomness into the evaluation. The
    actor's previous train/eval mode is restored afterwards.

    Args:
        agent: Object exposing ``actor``, ``device`` and ``normalize_state``.
        env: Environment with the Gymnasium ``reset``/``step`` API.
        episodes (int): Number of evaluation episodes to run.

    Returns:
        Mean total reward over the evaluation episodes.
    """
    eval_rewards = []

    was_training = agent.actor.training
    agent.actor.eval()
    try:
        for _ in range(episodes):
            state, _ = env.reset()
            total_reward = 0
            done = False

            while not done:
                with torch.no_grad():
                    # Use mean action (no sampling)
                    state_tensor = torch.FloatTensor(agent.normalize_state(state)).to(agent.device)
                    mean, _ = agent.actor(state_tensor)
                    action = torch.clamp(mean, -1.0, 1.0).cpu().numpy()

                state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                done = terminated or truncated

            eval_rewards.append(total_reward)
    finally:
        # Hand the actor back in whatever mode the caller had it in.
        agent.actor.train(was_training)

    return np.mean(eval_rewards)

def plot_training_progress(info, save_path='training_progress.png'):
    """
    Plot a 2x2 overview of training progress and save it to ``save_path``.

    Panels: training rewards with a moving average, evaluation rewards,
    actor/critic losses, and the learning rate.

    Args:
        info (dict): Must contain the lists 'episode_rewards', 'eval_rewards',
            'eval_episodes', 'actor_losses', 'critic_losses' and
            'learning_rates'.
        save_path (str): Destination image file.
    """
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

    # This function is called repeatedly during training, so the figure is
    # always closed, even when plotting or saving raises.
    try:
        # Plot rewards
        rewards = info['episode_rewards']
        ax1.plot(rewards, alpha=0.3, color='blue', label='Episode Reward')
        # The window shrinks to the number of episodes while fewer than 100 exist.
        window_size = min(100, len(rewards))
        if window_size > 0:
            moving_avg = np.convolve(rewards,
                                     np.ones(window_size) / window_size,
                                     mode='valid')
            ax1.plot(range(window_size - 1, len(rewards)),
                     moving_avg, color='red', label='Moving Average')
        ax1.set_title('Training Rewards')
        ax1.set_xlabel('Episode')
        ax1.set_ylabel('Reward')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Plot evaluation rewards
        if info['eval_rewards']:
            ax2.plot(
                info['eval_episodes'],
                info['eval_rewards'],
                marker='o',
                label='Eval Reward'
            )
            # Only draw a legend when there is a labelled line to describe;
            # an empty legend just triggers a matplotlib warning.
            ax2.legend()
        ax2.set_title('Evaluation Rewards')
        ax2.set_xlabel('Episode')
        ax2.set_ylabel('Reward')
        ax2.grid(True, alpha=0.3)

        # Plot losses
        ax3.plot(info['actor_losses'], label='Actor Loss', alpha=0.7)
        ax3.plot(info['critic_losses'], label='Critic Loss', alpha=0.7)
        ax3.set_title('Losses')
        ax3.set_xlabel('Episode')
        ax3.set_ylabel('Loss')
        ax3.legend()
        ax3.grid(True, alpha=0.3)

        # Plot learning rate
        ax4.plot(info['learning_rates'], label='Learning Rate', color='green')
        ax4.set_title('Learning Rate')
        ax4.set_xlabel('Episode')
        ax4.set_ylabel('Learning Rate')
        ax4.grid(True, alpha=0.3)

        fig.tight_layout()
        fig.savefig(save_path)
    finally:
        # Close this specific figure rather than whatever is "current".
        plt.close(fig)


def compute_returns(rewards, dones, values, next_value, gamma, gae_lambda):
    """Compute GAE advantages and the matching returns for one rollout.

    Walking the rollout backwards, for each step ``t``:

        delta_t = r_t + gamma * V_{t+1} * (1 - done_t) - V_t
        A_t     = delta_t + gamma * gae_lambda * (1 - done_t) * A_{t+1}
        R_t     = A_t + V_t

    Args:
        rewards: Per-step rewards.
        dones: Per-step flags; a truthy flag stops bootstrapping across that step.
        values: Per-step value estimates (list, tuple or array), at least as
            long as ``rewards``.
        next_value: Value estimate appended after ``values``; it is the
            bootstrap for the last step when ``values`` has the same length
            as ``rewards``.
        gamma: Discount factor.
        gae_lambda: GAE smoothing factor.

    Returns:
        tuple: ``(returns, advantages)``, two lists with one entry per step.
    """
    n_steps = len(rewards)
    # list(...) keeps this a concatenation for tuples and ndarrays too; with a
    # bare ndarray, `values + [next_value]` would add element-wise instead.
    values = list(values) + [next_value]

    # Fill pre-sized lists by index instead of insert(0, ...), which shifts
    # the whole list on every step.
    returns = [0.0] * n_steps
    advantages = [0.0] * n_steps
    gae = 0.0
    for step in reversed(range(n_steps)):
        not_done = 1.0 - float(dones[step])
        delta = rewards[step] + gamma * values[step + 1] * not_done - values[step]
        gae = delta + gamma * gae_lambda * not_done * gae
        advantages[step] = gae
        returns[step] = gae + values[step]
    return returns, advantages

if __name__ == "__main__":
    # Seed the NumPy and PyTorch global random number generators.
    np.random.seed(42)
    torch.manual_seed(42)

    # Launch training without the GUI.
    print("Starting training...")
    agent, training_info = train(
        total_episodes=10000,
        render_every=100,
        show_gui=False,
    )

Starting training...
New best model saved with eval reward: -101.28
New best reward: -128.29
Episode 1
Reward: -128.29
Average Reward (last 100): -128.29
Steps: 143
Learning Rate: 1.00e-04
Entropy Coefficient: 9.95e-03
--------------------------------------------------
Episode 2
Reward: -143.70
Average Reward (last 100): -135.99
Steps: 434
Learning Rate: 1.00e-04
Entropy Coefficient: 9.95e-03
--------------------------------------------------
New best reward: -124.99
Episode 3
Reward: -102.98
Average Reward (last 100): -124.99
Steps: 57
Learning Rate: 1.00e-04
Entropy Coefficient: 9.90e-03
--------------------------------------------------
New best reward: -119.17
Episode 4
Reward: -101.70
Average Reward (last 100): -119.17
Steps: 66
Learning Rate: 1.00e-04
Entropy Coefficient: 9.85e-03
--------------------------------------------------
New best reward: -114.88
Episode 5
Reward: -97.73
Average Reward (last 100): -114.88
Steps: 89
Learning Rate: 1.00e-04
Entropy Coefficient: 9.80e-03
--