# Train a DQN model on a single EC2 instance

In [None]:
'''
This code is for demonstration purposes.
You will not connect to the Ray cluster.
You will train a DQN model on a single node.
'''
import random
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from collections import deque, namedtuple
import time
import gym

In [None]:
'''
DO NOT MODIFY THIS CELL.
We use DQN as our RL training algorithm
See https://huggingface.co/learn/deep-rl-course/en/unit3/deep-q-algorithm to get more information about DQN.
'''
class QNetwork(nn.Module):
    """Three-layer fully connected network producing one value per action."""

    def __init__(self, state_size, action_size, seed, fc1_unit=64,
                 fc2_unit=64):
        """
        Seed torch and create the three linear layers.
        Params
        =======
            state_size (int): Number of input features (dimension of a state)
            action_size (int): Number of outputs (one value per action)
            seed (int): Value passed to ``torch.manual_seed``
            fc1_unit (int): Width of the first hidden layer
            fc2_unit (int): Width of the second hidden layer
        """
        super().__init__()
        # The seed is set before any layer is created, so the layers below
        # are always built in the same order from the same seed.
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_unit)
        self.fc2 = nn.Linear(fc1_unit, fc2_unit)
        self.fc3 = nn.Linear(fc2_unit, action_size)

    def forward(self, x):
        """
        Map a batch of states to action values.

        The two hidden layers are followed by ReLU; the output layer is
        linear (no activation).
        """
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

In [None]:
'''
DO NOT MODIFY THIS CELL.
The ReplayBuffer is an essential component in Deep Q-Networks (DQN)
It stores the Agent's experiences at each time step, comprised of the state, action, reward, next state, and done flag, in a memory buffer.
This allows for the experiences to be randomly sampled later to train the neural network.
'''
class ReplayBuffer:
    """Bounded store of experience tuples with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Create an empty buffer and seed Python's ``random`` module.
        Params
        ======
            action_size (int): dimension of each action (stored, not used here)
            buffer_size (int): maximum number of experiences kept; once full,
                appending drops the oldest entry (``deque(maxlen=...)``)
            batch_size (int): number of experiences returned by ``sample``
            seed (int): value passed to ``random.seed``
        """
        # NOTE: this seeds the module-level ``random`` generator, which is
        # shared with every other user of ``random`` in the process.
        random.seed(seed)
        self.action_size = action_size
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )

    def add(self, state, action, reward, next_state, done):
        """Wrap one transition in an ``Experience`` and append it to memory."""
        self.memory.append(
            self.experience(state, action, reward, next_state, done)
        )

    def sample(self):
        """Draw ``batch_size`` experiences at random and return them as tensors.

        Returns a tuple ``(states, actions, rewards, next_states, dones)``.
        Each element is built by vertically stacking the matching field of
        the drawn experiences; ``actions`` is cast to long, the others to
        float (``dones`` goes through uint8 first).
        """
        drawn = random.sample(self.memory, k=self.batch_size)
        batch = [exp for exp in drawn if exp is not None]

        def column(field):
            # One row per experience for the requested field.
            return np.vstack([getattr(exp, field) for exp in batch])

        states = torch.from_numpy(column("state")).float()
        actions = torch.from_numpy(column("action")).long()
        rewards = torch.from_numpy(column("reward")).float()
        next_states = torch.from_numpy(column("next_state")).float()
        dones = torch.from_numpy(column("done").astype(np.uint8)).float()

        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Return how many experiences are currently stored."""
        return len(self.memory)

In [None]:
'''
The Agent is interacting with the environment to learn an optimal policy for decision-making.
'''
class Agent:
    """DQN agent: stores experience, trains a Q-network and evaluates it."""

    def __init__(self, state_size, action_size, seed):
        """Build the networks, optimizer, replay memory and test environment.

        Reads the module-level settings LR, BUFFER_SIZE, BATCH_SIZE and
        TEST_SEED, which must be defined before an Agent is constructed.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): number of available actions
            seed (int): random seed for ``random``, the networks and the buffer
        """
        random.seed(seed)
        self.state_size = state_size
        self.action_size = action_size

        # Two Q-networks built from the same seed: the local one is trained
        # by the optimizer, the target one only changes through soft_update.
        self.qnetwork_local = QNetwork(state_size, action_size, seed)
        self.qnetwork_target = QNetwork(state_size, action_size, seed)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Experience replay storage
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Step counter, wrapped modulo UPDATE_EVERY in step()
        self.t_step = 0

        # Separate, separately seeded environment used only by play_game()
        self.test_env = gym.make('LunarLander-v2')
        self.test_env.seed(TEST_SEED)

    def step(self, state, action, reward, next_state, done):
        """Record one transition and train on every UPDATE_EVERY-th call.

        Training only happens once the memory holds more than BATCH_SIZE
        experiences.
        """
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return
        if len(self.memory) > BATCH_SIZE:
            batch = self.memory.sample()
            self.learn(batch, GAMMA)

    def act(self, state, eps=0.):
        """Choose an action for ``state`` with an epsilon-greedy rule.

        Params
        ======
            state (numpy.ndarray): current state (converted with
                ``torch.from_numpy``)
            eps (float): epsilon; when ``random.random() > eps`` the greedy
                action is returned, otherwise a uniformly random one
        """
        state_batch = torch.from_numpy(state).float().unsqueeze(0)

        # Evaluate without tracking gradients, then restore training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            q_values = self.qnetwork_local(state_batch)
        self.qnetwork_local.train()

        exploit = random.random() > eps
        if not exploit:
            return random.choice(np.arange(self.action_size))
        return np.argmax(q_values.detach().cpu().numpy())

    def learn(self, experiences, gamma):
        """Run one gradient step on a batch, then soft-update the target net.

        Params
        ======
            experiences (Tuple[torch.Tensor]): (states, actions, rewards,
                next_states, dones) as returned by ``ReplayBuffer.sample``
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Target: r + gamma * max_a' Q_target(s', a'), with the bootstrap
        # term zeroed where done == 1. No gradient flows into the target net.
        target_q_all = self.qnetwork_target(next_states).detach()
        best_next_q = target_q_all.max(dim=1, keepdim=True)[0]
        td_target = rewards + (gamma * best_next_q * (1 - dones))

        # Prediction: Q_local(s, a) for the action that was actually taken
        predicted_q = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(predicted_q, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Move the target network slightly towards the local network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Blend local weights into the target model in place.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): source of the weights
            target_model (PyTorch model): model whose weights are overwritten
            tau (float): interpolation parameter
        """
        param_pairs = zip(target_model.parameters(), local_model.parameters())
        for dst, src in param_pairs:
            blended = tau * src.data + (1.0 - tau) * dst.data
            dst.data.copy_(blended)

    def play_game(self, max_timesteps, num_games=10):
        """Play ``num_games`` greedy episodes on the test env; return mean reward.

        Each episode runs for at most ``max_timesteps`` steps and always
        takes the action with the highest value from the local Q-network.
        """
        reward_over_games = 0
        for _game in range(num_games):
            observation = self.test_env.reset()
            episode_reward = 0
            for _tick in range(max_timesteps):
                obs_batch = torch.from_numpy(observation).float().unsqueeze(0)

                with torch.no_grad():
                    q_values = self.qnetwork_local(obs_batch)
                greedy_action = torch.argmax(q_values).item()

                observation, reward, done, _ = self.test_env.step(greedy_action)
                episode_reward += reward
                if done:
                    break
            reward_over_games += episode_reward

        return reward_over_games / num_games

In [None]:
'''
This code iteratively trains an Agent over a specified number of episodes,
adjusting the exploration-exploitation balance via the epsilon decay.
It tracks the Agent's performance, evaluating and reporting the average score periodically.
Training concludes either when the maximum number of episodes is reached or the agent consistently achieves a predefined average score,
suggesting the environment is solved.
'''
def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Run the epsilon-greedy DQN training loop.

    Uses the module-level ``env`` and ``agent`` objects and appends the
    wall-clock duration of every training episode to the module-level
    ``time_train`` list. Nothing is returned.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): initial epsilon for epsilon-greedy action selection
        eps_end (float): lower bound for epsilon
        eps_decay (float): factor epsilon is multiplied by after each episode
    """
    eps = eps_start
    average_score = 0

    for i_episode in range(1, n_episodes + 1):
        time_start = time.time()

        # One training episode
        state = env.reset()
        for _tick in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _info = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # Decay epsilon, but never below eps_end
        eps = max(eps_end, eps_decay * eps)

        time_train.append(time.time() - time_start)
        print(f'Episode {i_episode} finished.')

        # Evaluate every 10th episode, and every episode once the last
        # evaluation reached at least 90.
        evaluate_now = i_episode % 10 == 0 or average_score >= 90
        if evaluate_now:
            average_score = agent.play_game(200, num_games=10)
            print(f'Episode {i_episode}\tAverage Score: {average_score:.2f}')

        # Stop as soon as the evaluation score reaches 100
        if average_score >= 100:
            print(f'Environment solved in {i_episode} episodes!')
            break

In [None]:
'''
The driver code to run the training. 
'''

# set random seed; DO NOT CHANGE
SEED = 0  # seed for train
TEST_SEED = 42  # seed for test

# hyper parameters (read as module-level globals by Agent)
BUFFER_SIZE = int(1e5)  # replay buffer size
BATCH_SIZE = 64  # minibatch size
GAMMA = 0.99  # discount factor
TAU = 1e-3  # for soft update of target parameters
LR = 5e-4  # learning rate
UPDATE_EVERY = 4  # how often to update the network


# initialize environment for training
env = gym.make('LunarLander-v2')
env.seed(SEED)

# initialize agent
agent = Agent(state_size=8, action_size=4, seed=SEED)

# record training time; dqn() appends one duration per episode
time_train = []
start_time = time.time()

# run the training session
dqn()

# Take a single end timestamp and derive the total from it, so the recorded
# end time and the reported total always agree.
end_time = time.time()
total_time = end_time - start_time

# total_time includes the periodic evaluation games; time_train does not
print(f"Total time: {total_time:.2f} seconds")
print(f"Total Training Time: {sum(time_train)} seconds")
print(f"Average time per episode: {np.mean(time_train)} seconds")