## **Deep Deterministic Policy Gradient (DDPG)**

An actor-critic algorithm for continuous action spaces that uses deterministic policies and experience replay.


**Import**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym


**Define Actor and Critic Networks**

In [None]:
class Actor(nn.Module):
    """Deterministic policy network mapping a state to a bounded action.

    The network is a three-layer MLP (400 and 300 hidden units, ReLU
    activations) whose final ``Tanh`` output is multiplied by ``max_action``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        """Build the MLP.

        Args:
            state_dim: Size of the input (state) feature vector.
            action_dim: Size of the output (action) vector.
            max_action: Factor applied to the ``Tanh`` output in ``forward``.
        """
        super().__init__()
        self.max_action = max_action
        hidden_sizes = (400, 300)
        layers = [
            nn.Linear(state_dim, hidden_sizes[0]),
            nn.ReLU(),
            nn.Linear(hidden_sizes[0], hidden_sizes[1]),
            nn.ReLU(),
            nn.Linear(hidden_sizes[1], action_dim),
            nn.Tanh(),  # squashes the raw output into [-1, 1]
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, state):
        """Return the action for ``state``: ``max_action * tanh(mlp(state))``."""
        squashed = self.model(state)
        return self.max_action * squashed


**Initialize Networks and Optimizers**

In [None]:
# Online (trained) networks. The dimensions are hard-coded to a 3-dimensional
# state and a 1-dimensional action bounded by max_action=1.
# NOTE(review): `Critic` is not defined in the visible part of this notebook;
# it must be defined before this cell runs.
actor = Actor(state_dim=3, action_dim=1, max_action=1)
critic = Critic(state_dim=3, action_dim=1)

# Target networks used to compute the bootstrapped TD target.
actor_target = Actor(state_dim=3, action_dim=1, max_action=1)
critic_target = Critic(state_dim=3, action_dim=1)

# Start each target as an exact copy of its online network. Without this the
# targets begin from unrelated random weights and the first TD targets are
# computed by networks that have nothing to do with the ones being trained.
actor_target.load_state_dict(actor.state_dict())
critic_target.load_state_dict(critic.state_dict())

# Only the online networks are optimised; the targets are moved by hand.
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-3)
critic_optimizer = optim.Adam(critic.parameters(), lr=1e-3)


**Define Replay Buffer**

In [None]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of ``(state, action, reward, next_state, done)``.

    Once ``capacity`` transitions are stored, each new transition overwrites
    the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer that holds at most ``capacity`` transitions."""
        self.capacity = capacity
        self.buffer = []
        # Index of the slot the next ``push`` writes to.
        self.position = 0

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest entry when full."""
        if len(self.buffer) < self.capacity:
            # Grow the list until capacity is reached; the slot is filled below.
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return a uniformly sampled batch of distinct transitions.

        Args:
            batch_size: Number of transitions to draw (without replacement).

        Returns:
            A 5-tuple of ``float32`` tensors
            ``(state, action, reward, next_state, done)``, each with the batch
            as its leading dimension.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``np.random.choice``).
        """
        # Sample *indices*: np.random.choice cannot take the list of
        # heterogeneous transition tuples itself as the population.
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, next_states, dones = zip(
            *(self.buffer[i] for i in indices)
        )

        def to_tensor(column):
            # Stack in NumPy first; building a tensor directly from a tuple
            # of arrays is slow.
            return torch.as_tensor(np.asarray(column), dtype=torch.float32)

        return (
            to_tensor(states),
            to_tensor(actions),
            to_tensor(rewards),
            to_tensor(next_states),
            to_tensor(dones),
        )


**Training Loop**

In [None]:
def _soft_update(target_net, source_net, tau):
    """Polyak-average ``source_net`` into ``target_net``: t <- (1 - tau) * t + tau * s."""
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.mul_(1.0 - tau).add_(param, alpha=tau)


def train(total_steps=1000000, buffer_capacity=1000000, batch_size=100,
          warmup_steps=1000, gamma=0.99, tau=0.005, exploration_noise=0.1):
    """Run the DDPG interaction and learning loop.

    Uses the module-level ``env``, ``actor``, ``critic``, ``actor_target``,
    ``critic_target``, ``actor_optimizer`` and ``critic_optimizer``.
    NOTE(review): ``env`` is not created in the visible part of this notebook,
    and the loop assumes the classic Gym API (``reset()`` returns the
    observation, ``step()`` returns a 4-tuple) -- confirm against the
    installed gym version.

    Args:
        total_steps: Number of environment steps to run.
        buffer_capacity: Maximum number of transitions kept for replay.
        batch_size: Number of transitions sampled per gradient update.
        warmup_steps: Updates start once the buffer holds more than this many
            transitions.
        gamma: Discount factor used in the TD target.
        tau: Interpolation factor for the soft target-network update.
        exploration_noise: Standard deviation of the Gaussian action noise,
            as a fraction of ``actor.max_action``; ``0`` acts greedily.
    """
    replay_buffer = ReplayBuffer(buffer_capacity)
    loss_fn = nn.MSELoss()  # built once instead of on every update
    max_action = actor.max_action

    state = env.reset()
    episode_reward = 0  # running return of the current episode (not reported)
    for _ in range(total_steps):
        # Acting needs no gradients.
        with torch.no_grad():
            action = actor(torch.tensor(state, dtype=torch.float32)).numpy()
        if exploration_noise > 0:
            # A deterministic policy never explores on its own; perturb the
            # action and keep it inside the range the actor can produce.
            noise = np.random.normal(0.0, exploration_noise * max_action, size=action.shape)
            action = np.clip(action + noise, -max_action, max_action).astype(action.dtype)

        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if len(replay_buffer.buffer) > warmup_steps:
            batch = replay_buffer.sample(batch_size)
            state_batch, action_batch, reward_batch, next_state_batch, done_batch = batch

            # Update Critic
            # The TD target is a fixed regression target: no gradient may
            # flow through the target networks.
            with torch.no_grad():
                next_q = critic_target(next_state_batch, actor_target(next_state_batch))
                # reward/done come back as (batch,); align them with the
                # critic output so the sum cannot broadcast to (batch, batch).
                # Assumes the critic returns one Q-value per sample.
                reward_batch = reward_batch.reshape_as(next_q)
                done_batch = done_batch.reshape_as(next_q)
                target_q = reward_batch + (1 - done_batch) * gamma * next_q
            current_q = critic(state_batch, action_batch)
            critic_loss = loss_fn(current_q, target_q)
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # Update Actor: ascend the critic's value of the actor's actions.
            actor_loss = -critic(state_batch, actor(state_batch)).mean()
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # Update Target Networks
            _soft_update(actor_target, actor, tau)
            _soft_update(critic_target, critic, tau)

        if done:
            state = env.reset()
            episode_reward = 0
