In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from collections import deque
import time
import seaborn as sns
from tqdm import tqdm
from matplotlib.patches import Patch
import os
import pygame

In [2]:
# Set device for training (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class REINFORCE(nn.Module):
    """Monte-Carlo policy-gradient (REINFORCE) agent for discrete action spaces.

    The policy is a small MLP (state_dim -> 16 -> 32 -> n_actions) with Tanh
    activations whose final Softmax layer outputs action probabilities.

    Parameters
    ----------
    env : object
        Environment-like object; only ``env.observation_space.shape[0]`` and
        ``env.action_space.n`` are read.
    lr : float
        Learning rate handed to the Adam optimizer.
    device : torch.device
        Device that inputs are moved to in ``forward`` and on which the return
        tensor is created in ``update_policy``. The network itself is NOT moved
        here; callers do ``REINFORCE(env).to(device)``.
    """

    def __init__(self, env, lr=0.005, device=device):
        super().__init__()

        # Device used for inputs and for tensors created by this class
        self.device = device

        # State and action space dimensions
        self.state_space_dim = env.observation_space.shape[0]
        self.action_space_dim = env.action_space.n

        # Indices of the possible actions
        self.actions = torch.arange(self.action_space_dim).to(device)

        # Learning rate
        self.lr = lr

        # Policy network: maps a state to a probability for every action
        self.net = nn.Sequential(
            nn.Linear(self.state_space_dim, 16, bias=True),
            nn.Tanh(),
            nn.Linear(16, 32, bias=True),
            nn.Tanh(),
            nn.Linear(32, self.action_space_dim, bias=True),
            nn.Softmax(dim=-1),
        )

        # Optimizer over the policy parameters
        self.optimizer = optim.Adam(self.net.parameters(), lr=self.lr)
        # Optional step decay (x0.9 every 100 scheduler steps).
        # NOTE: nothing in this class calls ``self.scheduler.step()``; a caller
        # that wants the decay has to step it once per episode.
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=0.9)

    def forward(self, x):
        """Return the action probabilities for ``x`` (moved to ``self.device``)."""
        return self.net(x.to(self.device))

    def act(self, state):
        '''
        Sample an action from the current stochastic policy.

        state: tensor holding a single state with a leading batch dimension
               of 1 (the sampled index is read back with ``.item()``).

        Returns the sampled action index as a Python int.
        '''
        # Only the sampled index leaves this method, so no autograd graph is needed.
        with torch.no_grad():
            action_probs = self.forward(state)

        # torch.multinomial draws one index according to the given probabilities
        return torch.multinomial(action_probs, 1).item()

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return [G_0, ..., G_{T-1}] as floats, with G_t = r_t + gamma * G_{t+1}."""
        returns = []
        running = 0.0
        # Walk the episode backwards so every return reuses the one after it
        for reward in reversed([float(r) for r in rewards]):
            running = reward + gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    def update_policy(self, rewards, log_probs, gamma):
        '''
        Run one REINFORCE update from a finished episode.

        rewards:   sequence of per-step rewards of the episode
        log_probs: list of log-probability tensors (one per step, attached to
                   the autograd graph) of the actions that were taken
        gamma:     discount factor

        The loss is ``-sum_t log_prob_t * G_t`` where ``G_t`` is the discounted
        return from step t onwards, standardised over the episode. Gradients
        are clipped to a global norm of 1.0 before the optimizer step.

        Returns the loss as a float (0.0 for an empty episode, in which case
        no update is performed).
        '''
        # Nothing to learn from an empty episode (torch.stack would also fail)
        if len(rewards) == 0:
            return 0.0

        # Discounted return-to-go for every timestep: G_t = sum_k gamma^(k-t) * r_k
        discounted_rewards = torch.tensor(
            self._discounted_returns(rewards, gamma),
            dtype=torch.float32,
            device=self.device,
        )

        # Standardise the returns (subtract mean, divide by std) to reduce variance.
        # The sample std of a single value is NaN, so one-step episodes keep
        # their raw return instead of poisoning the weights with NaNs.
        if discounted_rewards.numel() > 1:
            discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)

        # One log-probability per timestep; flatten so a stray trailing
        # dimension cannot broadcast against the returns.
        log_probs = torch.stack(log_probs).reshape(-1)

        # Policy-gradient loss: minimising -log pi(a_t|s_t) * G_t raises the
        # probability of actions followed by above-average returns.
        policy_gradient_loss = -log_probs * discounted_rewards

        # Sum over all timesteps of the episode
        loss = policy_gradient_loss.sum()

        # PyTorch accumulates gradients, so clear them before backpropagation
        self.optimizer.zero_grad()

        # Backpropagate the loss
        loss.backward()

        # Rescale gradients whose global norm exceeds 1 to avoid huge updates
        torch.nn.utils.clip_grad_norm_(self.net.parameters(), 1.0)

        # Update the network parameters
        self.optimizer.step()

        # Return the loss as a float for logging purposes
        return loss.item()

    def save_model(self, filename):
        """Save the module's ``state_dict`` to ``filename``."""
        torch.save(self.state_dict(), filename)

    def load_model(self, filename, device='cuda'):
        """Load a ``state_dict`` written by ``save_model`` from ``filename``.

        ``device`` is forwarded to ``torch.load`` as ``map_location``. When a
        CUDA device is requested (the default) but CUDA is not available, the
        checkpoint is mapped to the CPU instead of raising.
        """
        wants_cuda = isinstance(device, (str, torch.device)) and torch.device(device).type == 'cuda'
        if wants_cuda and not torch.cuda.is_available():
            device = 'cpu'
        self.load_state_dict(torch.load(filename, map_location=device))


In [3]:
#Auxiliary functions

# Turn an environment observation into a network-ready tensor
def preprocess_state(state):
    """Convert a NumPy observation into a float tensor with a leading batch axis.

    The array is wrapped with ``torch.from_numpy``, cast to float, given an
    extra dimension at position 0 and moved to the module-level ``device``.
    """
    as_tensor = torch.from_numpy(state)
    batched = as_tensor.float().unsqueeze(0)
    return batched.to(device)

# Function to clear the video directory
def clear_video_directory(video_dir):
    """Delete every ``.mp4`` file located directly inside ``video_dir``.

    Files with other extensions and anything in sub-directories are left
    alone. A ``video_dir`` that does not exist is treated as already empty,
    so the call is a no-op instead of raising ``FileNotFoundError``.
    """
    if not os.path.isdir(video_dir):
        return

    for entry in os.listdir(video_dir):
        path = os.path.join(video_dir, entry)
        # Only regular files are removed; os.remove would fail on a directory
        # that merely happens to be named "*.mp4".
        if entry.endswith('.mp4') and os.path.isfile(path):
            os.remove(path)
        


In [4]:

import wandb
import os

# Train the REINFORCE agent on LunarLander, tracking metrics (and occasional
# videos) in Weights & Biases.

# Initialize Weights & Biases
wandb.init(project="reinforce_lunar_lander", entity="ai42")

# HYPERPARAMETERS
lr = 5e-3            # learning rate
gamma = 0.99         # discount factor
max_episodes = 5000  # max number of episodes to learn from

# Initialize environment and agent (the lr hyperparameter above is actually used)
env = gym.make('LunarLander-v2', continuous=False, render_mode='rgb_array')
reinforce_agent = REINFORCE(env, lr=lr).to(device)

best_score = -np.inf           # best 100-episode average so far (drives checkpointing)
best_episode_reward = -np.inf  # best single-episode return so far (drives video recording)

# Video directory: override with the LUNAR_LANDER_VIDEO_DIR environment
# variable; otherwise the original location is used.
video_dir = os.environ.get(
    'LUNAR_LANDER_VIDEO_DIR',
    '/home/ndelafuente/Desktop/Learn2Earn_RL/LunarLander/Reinforce_Lunar_Lander/Videos',
)
os.makedirs(video_dir, exist_ok=True)

# Set up lists to hold results
all_rewards = []
avg_rewards = []
losses = []

# Whether the upcoming episode is recorded. The first episode always is;
# afterwards an episode is recorded when the previous one set a new record.
record_video = True

try:
    for episode in range(max_episodes):

        # The recording wrapper must be in place BEFORE reset(): RecordVideo
        # starts capturing in reset(), so wrapping afterwards records nothing.
        episode_env = env
        if record_video:
            episode_env = gym.wrappers.RecordVideo(env, video_folder=video_dir, episode_trigger=lambda episode_id: True)

        state, info = episode_env.reset()
        state = preprocess_state(state)

        log_probs = []
        rewards = []
        total_reward = 0

        # Run episode (env.spec.max_episode_steps is 1000 by default)
        for t in range(env.spec.max_episode_steps):

            # One forward pass provides both the sampling distribution and the
            # differentiable log-probability of the sampled action.
            action_probs = reinforce_agent(state)
            action = torch.multinomial(action_probs, 1).item()
            log_prob = torch.log(action_probs[0, action])

            # Take step in environment
            next_state, reward, terminated, truncated, info = episode_env.step(action)

            # Store what the policy update needs
            log_probs.append(log_prob)
            rewards.append(reward)

            # Update the total reward
            total_reward += reward

            # Preprocess the next state and move on
            state = preprocess_state(next_state)

            # Stop when the lander crashed/landed or the time limit was hit
            if terminated or truncated:
                break

        # Update the policy from the finished episode
        loss = reinforce_agent.update_policy(rewards, log_probs, gamma)
        losses.append(loss)

        # Track the episode return and the running 100-episode average
        all_rewards.append(total_reward)
        avg_reward = np.mean(all_rewards[-100:])
        avg_rewards.append(avg_reward)

        # Save the model if the running average has improved
        if avg_reward > best_score:
            reinforce_agent.save_model('best_model.pth')
            best_score = avg_reward

        # Log everything for this episode in a single wandb step
        metrics = {"Episode Reward": total_reward, "Average Reward": avg_reward, "Loss": loss, "Episode": episode}
        if record_video:
            video_paths = [
                os.path.join(video_dir, f)
                for f in os.listdir(video_dir)
                if f.endswith('.mp4')
            ]
            if video_paths:
                # os.listdir order is arbitrary; pick the newest file explicitly
                newest_video = max(video_paths, key=os.path.getmtime)
                metrics["episode_video"] = wandb.Video(newest_video, fps=4, format="mp4")
        wandb.log(metrics)

        # Clear videos directory
        clear_video_directory(video_dir)

        # Record the next episode only if this one was the best so far
        # (compare the episode RETURN, not the last step's reward).
        record_video = total_reward > best_episode_reward
        best_episode_reward = max(best_episode_reward, total_reward)

        # Print progress
        if episode % 100 == 0:
            print(f"Episode {episode}, Average Reward: {avg_reward}, Loss: {loss}")
finally:
    # Close environment and Weights & Biases even if training is interrupted
    env.close()
    wandb.finish()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mneildlf[0m ([33mai42[0m). Use [1m`wandb login --relogin`[0m to force relogin


  logger.warn(


Moviepy - Building video /home/ndelafuente/Desktop/Learn2Earn_RL/LunarLander/Reinforce_Lunar_Lander/Videos/rl-video-episode-0.mp4.
Moviepy - Writing video /home/ndelafuente/Desktop/Learn2Earn_RL/LunarLander/Reinforce_Lunar_Lander/Videos/rl-video-episode-0.mp4



                                                   

Moviepy - Done !
Moviepy - video ready /home/ndelafuente/Desktop/Learn2Earn_RL/LunarLander/Reinforce_Lunar_Lander/Videos/rl-video-episode-0.mp4




Episode 0, Average Reward: -31.517590767269127, Loss: 0.37171459197998047
Episode 100, Average Reward: -156.41221147461295, Loss: -0.28967952728271484
Episode 200, Average Reward: -144.5927366673507, Loss: -5.400518894195557
