In [1]:
import requests
import json
import gymnasium as gym
import stable_baselines3 as stb
import torch 
from torch import nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

In [6]:
# define the NN for the Q-learning agent
class DQN(nn.Module):
    def __init__(self,input_dim,output_dim):
        super(DQN,self).__init__()
        self.fc1 = nn.Linear(input_dim,128)
        self.fc2 = nn.Linear(128,128)
        self.fc3 = nn.Linear(128,output_dim)
        
    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x) # outputs Q-values
    
    
env = gym.make("LunarLander-v2", render_mode="human")
    
# Parameters
input_dim = env.observation_space.shape[0]  # 8 state inputs
output_dim = env.action_space.n  # 4 discrete actions
learning_rate = 0.001
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
batch_size = 64
memory_size = 100000
episodes = 1000
target_update = 10    
     
     

# Initialize Q-network and target network
policy_net = DQN(input_dim, output_dim)
target_net = DQN(input_dim, output_dim)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()



# Optimizer and loss function
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

memory = deque(maxlen=memory_size)




# Function to select an action based on epsilon-greedy policy
def select_action(state, epsilon):
    if random.random() < epsilon:
        return env.action_space.sample()  # Random action (exploration)
    else:
        with torch.no_grad():
            state = torch.FloatTensor(state).unsqueeze(0)
            q_values = policy_net(state)
            return q_values.argmax().item()  # Best action (exploitation)
        
        
# Store experiences in replay memory
def store_experience(memory, experience):
    memory.append(experience)

# Sample a random batch from replay memory
def sample_batch(memory, batch_size):
    return random.sample(memory, batch_size)

# Update the policy network
def optimize_model():
    if len(memory) < batch_size:
        return
    batch = sample_batch(memory, batch_size)
    state_batch = torch.FloatTensor([exp[0] for exp in batch])
    action_batch = torch.LongTensor([exp[1] for exp in batch]).unsqueeze(1)
    reward_batch = torch.FloatTensor([exp[2] for exp in batch])
    next_state_batch = torch.FloatTensor([exp[3] for exp in batch])
    done_batch = torch.FloatTensor([exp[4] for exp in batch])

    # Compute Q(s, a)
    q_values = policy_net(state_batch).gather(1, action_batch)

    # Compute Q(s', a') from the target network (max Q-value for next state)
    next_q_values = target_net(next_state_batch).max(1)[0]
    next_q_values = next_q_values * (1 - done_batch)  # Zero out if the episode is done
    expected_q_values = reward_batch + (gamma * next_q_values)

    # Compute the loss
    loss = loss_fn(q_values.squeeze(), expected_q_values.detach())

    # Optimize the network
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Main training loop
for episode in range(episodes):
    state, _ = env.reset()
    total_reward = 0

    for t in range(1000):  # Maximum steps per episode
        action = select_action(state, epsilon)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        total_reward += reward

        # Store the experience in replay memory
        store_experience(memory, (state, action, reward, next_state, done))

        state = next_state

        # Optimize the model
        optimize_model()

        if done:
            break

    # Decay epsilon (exploration)
    epsilon = max(epsilon_min, epsilon_decay * epsilon)

    # Update the target network
    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f"Episode {episode + 1}, Total Reward: {total_reward}")

# Close the environment
env.close()

  state_batch = torch.FloatTensor([exp[0] for exp in batch])


Episode 1, Total Reward: -280.8753187456331
Episode 2, Total Reward: -213.04787505672826
Episode 3, Total Reward: -185.73228888754053
Episode 4, Total Reward: -95.74447776198635
Episode 5, Total Reward: -200.75514298349992
Episode 6, Total Reward: -160.01263329974586
Episode 7, Total Reward: -137.9937803653483
Episode 8, Total Reward: -114.28593634645749
Episode 9, Total Reward: -163.99764781665442
Episode 10, Total Reward: -89.43146424364397
Episode 11, Total Reward: -120.72758590662622
Episode 12, Total Reward: -207.3516905235723
Episode 13, Total Reward: -301.62450856618807
Episode 14, Total Reward: -90.41218028976866
Episode 15, Total Reward: -110.27022758258401
Episode 16, Total Reward: -121.57356595850742
Episode 17, Total Reward: -20.71741146816575
Episode 18, Total Reward: -302.60374614490036
Episode 19, Total Reward: -59.68022407489589
Episode 20, Total Reward: -128.1451201042813
Episode 21, Total Reward: -121.98241617557072
Episode 22, Total Reward: -227.6087757175111
Episode

KeyboardInterrupt: 