In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from collections import deque
import time
import seaborn as sns
from tqdm import tqdm
from matplotlib.patches import Patch
import os
import pygame

In [2]:
# Training device: the CUDA GPU when PyTorch can see one, the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Define the QNet class
class QNet(nn.Module):
    """Softmax policy network trained with a REINFORCE-style policy-gradient update.

    Despite the name, the network does not output Q-values: its last layer is a
    softmax, so ``forward`` returns one probability per discrete action, ``act``
    samples an action from that distribution, and ``update_policy`` minimises
    ``-log_prob * normalised_return`` summed over one episode.

    Args:
        env: Environment exposing ``observation_space.shape[0]`` (state size)
            and ``action_space.n`` (number of discrete actions).
        lr: Learning rate of the internal Adam optimizer.
        device: Device for the network and its inputs. ``None`` (the default)
            selects CUDA when available and the CPU otherwise.
    """

    def __init__(self, env, lr=0.005, device=None):
        # Zero-argument super() does not look up the global name ``QNet``,
        # which notebook code may rebind to an instance.
        super().__init__()

        # Resolve the device here instead of binding a notebook global as the
        # default argument, so the class does not depend on cell execution order.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        # Get state and action space dimensions
        self.state_space_dim = env.observation_space.shape[0]
        self.action_space_dim = env.action_space.n

        # Indices of the possible actions, stored on the training device
        self.actions = torch.arange(self.action_space_dim, device=self.device)

        # Set learning rate
        self.lr = lr

        # Policy network: state -> action probabilities.
        # It is moved to ``self.device`` right away because ``forward`` moves
        # its input there; otherwise the caller had to remember ``.to(device)``.
        self.net = nn.Sequential(
            nn.Linear(self.state_space_dim, 16, bias=True),
            nn.Tanh(),
            nn.Linear(16, 32, bias=True),
            nn.Tanh(),
            nn.Linear(32, self.action_space_dim, bias=True),
            nn.Softmax(dim=-1),
        ).to(self.device)

        # Define optimizer
        self.optimizer = optim.Adam(self.net.parameters(), lr=self.lr)

    def forward(self, x):
        """Return the action probabilities for state(s) ``x``.

        Args:
            x: A tensor, or any array-like (e.g. a NumPy observation) that is
                converted to a float32 tensor. Tensors are used as given.

        Returns:
            Tensor of probabilities that sums to 1 along the last dimension.
        """
        if not torch.is_tensor(x):
            x = torch.as_tensor(x, dtype=torch.float32)
        return self.net(x.to(self.device))

    def act(self, state):
        """Sample one action index from the policy's distribution for ``state``.

        The action is drawn with ``torch.multinomial`` from the softmax output
        (stochastic sampling, not epsilon-greedy).

        Returns:
            The sampled action as a Python ``int``.
        """
        # Only the sampled index leaves this method, so no autograd graph is needed.
        with torch.no_grad():
            action_probs = self.forward(state)

        return torch.multinomial(action_probs, 1).item()

    def update_policy(self, rewards, log_probs, gamma):
        """Run one policy-gradient step from a finished episode.

        Args:
            rewards: Per-step rewards of the episode, in time order.
            log_probs: Per-step log-probabilities of the taken actions; they
                must still be attached to the autograd graph.
            gamma: Discount factor applied to future rewards.

        Returns:
            The episode loss as a Python ``float``. An empty episode performs
            no update and returns ``0.0``.
        """
        # Nothing to learn from an empty episode (torch.stack would raise on it).
        if len(rewards) == 0 or len(log_probs) == 0:
            return 0.0

        # Discounted return of every step, accumulated from the last step
        # backwards; appending and reversing once keeps this linear in the
        # episode length.
        discounted_rewards = []
        running_return = 0.0
        for reward in reversed(rewards):
            running_return = reward + gamma * running_return
            discounted_rewards.append(running_return)
        discounted_rewards.reverse()

        # Convert discounted rewards to tensor and pass to device
        discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32).to(self.device)

        # Normalize discounted rewards. The sample standard deviation of a
        # single value is NaN, which would poison the loss and every weight, so
        # a one-step episode is only mean-centred (its advantage becomes 0).
        centred = discounted_rewards - discounted_rewards.mean()
        if discounted_rewards.numel() > 1:
            discounted_rewards = centred / (discounted_rewards.std() + 1e-9)
        else:
            discounted_rewards = centred

        # Per-step loss is -log_prob * normalised return; the episode loss is
        # their sum. The product is taken step by step so each log-prob keeps
        # its own shape.
        loss = torch.stack(
            [-log_prob * ret for log_prob, ret in zip(log_probs, discounted_rewards)]
        ).sum()

        # Reset gradients, backpropagate and take the optimization step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()  # Return the loss as a float for logging

    def save_model(self, filename):
        """Save the module's ``state_dict`` to ``filename`` (path or file-like)."""
        torch.save(self.state_dict(), filename)

    def load_model(self, filename, device=None):
        """Load a ``state_dict`` saved by ``save_model`` into this module.

        Args:
            filename: Path or file-like object to read from.
            device: ``map_location`` for the stored tensors. ``None`` (the
                default) uses this model's own device, so loading also works
                on machines without CUDA.
        """
        map_location = self.device if device is None else device
        # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
        self.load_state_dict(torch.load(filename, map_location=map_location))

In [None]:
# HYPERPARAMETERS
# These must be plain scalars: a trailing comma after the value would turn it
# into a one-element tuple (e.g. `gamma = 0.99,` gives `(0.99,)`).
lr = 0.005
gamma = 0.99
batch_size = 8
max_episodes = 5000

# Discrete-action Lunar Lander environment
# NOTE(review): newer Gymnasium releases appear to register this environment as
# 'LunarLander-v3' - confirm against the installed version.
env = gym.make('LunarLander-v2', continuous=False)

# NOTE(review): this rebinds the name `QNet` from the class to an instance, so
# this cell can only be re-run after re-running the class-definition cell. The
# name is kept because other cells may refer to the model as `QNet`.
QNet = QNet(env, lr=lr).to(device)

# Stand-alone Adam optimizer over the model's parameters, using the shared learning rate
optimizer = optim.Adam(QNet.parameters(), lr=lr)
