In [1]:
# Install Gymnasium into the environment the running kernel uses.
# NOTE(review): the LunarLander environment created later presumably also needs
# the Box2D extra (`gymnasium[box2d]`) — confirm it is available in this kernel.
%pip install gymnasium

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from collections import deque
import time
import seaborn as sns
from tqdm import tqdm
from matplotlib.patches import Patch
import os
import pygame

In [None]:
# Pick the compute device once: CUDA when PyTorch reports a usable GPU, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Define the QNet class
class QNet(nn.Module):
    """Small fully connected Q-network with an epsilon-greedy action selector.

    The network maps a state vector to one value per discrete action. The
    instance also owns an Adam optimizer over its own parameters.
    """

    def __init__(self, env, lr=0.005, device=device):
        """Build the network and its optimizer.

        Args:
            env: Environment exposing ``observation_space.shape`` and
                ``action_space.n``; used only to size the layers.
            lr: Learning rate given to the Adam optimizer.
            device: Device that inputs are moved to in ``forward`` and on
                which ``self.actions`` is stored. The layers themselves are
                not moved here; call ``.to(device)`` on the instance.
        """
        # Zero-argument super() does not look up the global name `QNet`, so
        # construction keeps working even if that name is rebound later.
        super().__init__()

        self.device = device

        # Layer sizes come from the environment's spaces
        self.state_space_dim = env.observation_space.shape[0]
        self.action_space_dim = env.action_space.n

        # Tensor of all action indices, kept for callers that use it
        self.actions = torch.arange(self.action_space_dim).to(device)

        self.lr = lr

        # No softmax on the output layer: Q-values are unbounded return
        # estimates, and squashing them into probabilities would make the
        # regression targets unreachable. The greedy argmax is unaffected, and
        # softmax has no parameters, so saved state_dict keys are unchanged.
        self.net = nn.Sequential(
            nn.Linear(self.state_space_dim, 16, bias=True),
            nn.Tanh(),
            nn.Linear(16, 32, bias=True),
            nn.Tanh(),
            nn.Linear(32, self.action_space_dim, bias=True),
        )

        self.optimizer = optim.Adam(self.net.parameters(), lr=self.lr)

    def forward(self, x):
        """Return one Q-value per action for the given state(s).

        Args:
            x: A state or batch of states, as a tensor or anything
                ``torch.as_tensor`` accepts (for example a NumPy array).

        Returns:
            torch.Tensor: Output of the network; its last dimension has
            ``action_space_dim`` entries.
        """
        # as_tensor returns the input itself when it is already a float32
        # tensor on self.device, and otherwise converts it like `.to(...)`.
        x = torch.as_tensor(x, dtype=torch.float32, device=self.device)
        return self.net(x)

    def get_action(self, state, epsilon=0.0):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: A single state, forwarded to ``forward``.
            epsilon: Probability of picking a uniformly random action.

        Returns:
            int: Index of the chosen action (a plain Python int in both
            branches).
        """
        if random.random() < epsilon:
            # Explore. randrange yields a Python int; indexing the
            # `self.actions` tensor would yield a 0-dim tensor instead,
            # which does not match the greedy branch's return type.
            return random.randrange(self.action_space_dim)

        # Exploit: no gradients are needed just to pick an action
        with torch.no_grad():
            q_values = self.forward(state)
            return torch.argmax(q_values).item()

    def save_model(self, filename):
        """Write the model's ``state_dict`` to ``filename``."""
        torch.save(self.state_dict(), filename)

    def load_model(self, filename, device=None):
        """Load a ``state_dict`` previously written by ``save_model``.

        Args:
            filename: Path of the checkpoint file.
            device: ``map_location`` passed to ``torch.load``. When ``None``
                (the default) ``self.device`` is used, so loading also works
                on machines without CUDA. The loaded values are copied into
                the existing parameters either way.
        """
        if device is None:
            device = self.device
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        self.load_state_dict(torch.load(filename, map_location=device))

In [None]:
# HYPERPARAMETERS
# Each value is a plain number. A trailing comma (e.g. `lr = 0.005,`) would
# bind a 1-tuple instead and break any later arithmetic on these names.
lr = 0.005
gamma = 0.99
batch_size = 8
max_episodes = 5000

epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 0.995

# NOTE(review): newer Gymnasium releases appear to register this environment
# under a different version id — confirm 'LunarLander-v2' exists in the
# installed version.
env = gym.make('LunarLander-v2', continuous=False)

# NOTE: this rebinds the name `QNet` from the class to the model instance (the
# name is kept so later cells that use it still work). The class is no longer
# reachable under that name afterwards, so re-running this cell requires
# re-running the class-definition cell first.
QNet = QNet(env, lr=lr).to(device)

# Separate Adam optimizer over all of the model's parameters, sharing `lr`
optimizer = optim.Adam(QNet.parameters(), lr=lr)