In [8]:
import gymnasium as gym
import matplotlib
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

#from ddpg import DDPGAgent

ModuleNotFoundError: No module named 'tqdm'

In [4]:
BATCH_SIZE = 256
LR = 0.001
TAU = 0.001
GAMMA = 0.99
EPOCHS = 100
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

env = gym.make('Pendulum-v1', g=9.81)

In [5]:
class DDPGCritic(nn.Module):
    def __init__(self, num_state, num_action, n_hidden_action=16, n_hidden_state=32, n_hidden=256):
        super().__init__()
        self.state_net  = nn.Linear(in_features=num_state, out_features=n_hidden_state)
        self.action_net = nn.Linear(in_features=num_action, out_features=n_hidden_action)
        self.linear1 = nn.Linear(in_features=(n_hidden_state + n_hidden_action), out_features=n_hidden)
        self.linear2 = nn.Linear(in_features=n_hidden, out_features=n_hidden)
        self.linear3 = nn.Linear(in_features=n_hidden, out_features=1)
        self.relu    = nn.ReLU()
        
    def forward(self, state, action):
        s_embedding = self.relu(self.state_net(state))
        a_embedding = self.relu(self.action_net(action))
        embedding = torch.cat([s_embedding, a_embedding], dim=-1)
        out = self.relu(self.linear1(embedding))
        out = self.relu(self.linear2(out))
        out = self.linear3(out)
        return out
    
    
class DDPGActor(nn.Module):
    def __init__(self, num_state, n_hidden=256, bound:tuple = None):
        super().__init__()
        self.linear1 = nn.Linear(in_features=num_state, out_features=n_hidden)
        self.linear2 = nn.Linear(in_features=n_hidden, out_features=n_hidden)
        self.linear3 = nn.Linear(in_features=n_hidden, out_features=1)
        self.relu    = nn.ReLU()
        self.tanh    = nn.Tanh()
        self.bound   = bound
        
    def forward(self, state):
        out = self.relu(self.linear1(state))
        out = self.relu(self.linear2(out))
        out = self.tanh(self.linear3(out))
        if self.bound:
            out = torch.clip(out, min=self.bound[0], max=self.bound[1])
        return out

In [50]:
from utils import ReplayMemory, Transition
import random, math
from itertools import count
import tqdm

class DDPGAgent(torch.nn.Module):
    def __init__(self, env, batch_size, tau, gamma, device, critic_lr=1e-4, actor_lr=1e-3, memory_size=10e6):
        super().__init__()
        self.env = env
        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.device = device
        
        self.num_state = env.observation_space.shape[0]
        self.num_action= env.action_space.shape[0]
        self.lower_bound, self.upper_bound = env.action_space.low[0], env.action_space.high[0]
        
        self.critic        = DDPGCritic(self.num_state, self.num_action)
        self.target_critic = DDPGCritic(self.num_state, self.num_action)
        self.target_critic.load_state_dict(self.critic.state_dict())
        
        self.actor        = DDPGActor(self.num_state, bound=(self.lower_bound, self.upper_bound))
        self.target_actor = DDPGActor(self.num_state, bound=(self.lower_bound, self.upper_bound))
        self.target_actor.load_state_dict(self.actor.state_dict())
        
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.actor_optimizer  = torch.optim.Adam(self.actor.parameters(),  lr=actor_lr)
        
        self.memory = ReplayMemory(memory_size)
        self.steps_done = 0
        self.critic_loss_history = []
        self.actor_loss_history = []
        self.critic_criterion = nn.MSELoss()
            
    def optimize_model(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch=Transition(*zip(*transitions))

        next_state, state, action, reward = list(map(torch.cat, [batch.next_state, batch.state, batch.action, batch.reward]))
        
        with torch.no_grad():
            target_q = (self.target_critic(next_state, self.target_actor(next_state)) * self.gamma) + reward

        self.critic_optimizer.zero_grad()
        critic_loss = self.critic_criterion(self.critic(state, action), target_q)
        critic_loss.backward()
        self.critic_optimizer.step()
        self.critic_loss_history.append(critic_loss)

        self.actor_optimizer.zero_grad()
        actor_loss = -self.critic(state, self.actor(state)).mean()
        actor_loss.backward()
        self.actor_optimizer.step()
        print(actor_loss)
        self.actor_loss_history.append(actor_loss)
        
        
    def train(self, episodes):
        try: 
            assert torch.cuda.is_available()
            num_episodes = episodes
        except:
            print("CUDA Unavailable")
            num_episodes = 50
            
        for _ in tqdm.tqdm(range(num_episodes), ncols=100):
            state, _ = self.env.reset()
            state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
            for _ in count():
                action = self.actor(state)
                observation, reward, terminated, truncated, _ = self.env.step([action.item()])
                done = terminated or truncated
                if terminated:
                    next_state = None
                else:
                    next_state = torch.tensor(observation, dtype=torch.float32, device=self.device).unsqueeze(0)
                self.memory.push(state, action, next_state, torch.tensor(reward, dtype=torch.float32).unsqueeze(0).unsqueeze(0))
                state = next_state
                
                self.optimize_model()
                
                self.target_actor.load_state_dict(self.soft_update(self.actor, self.target_actor))
                self.target_critic.load_state_dict(self.soft_update(self.critic, self.target_critic))
                
                if done:
                    break

    def soft_update(self, network, target_network):
        network_dict = network.state_dict()
        target_dict = target_network.state_dict()
        for key in network_dict:
            target_dict[key] = (1-self.tau) * target_dict[key] + self.tau * network_dict[key]
        return target_dict
                
    def savefig(self, root=None):
        durations_t = torch.tensor(self.episode_durations, dtype=torch.float)
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())
        plt.title('Result')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.savefig(root if root is None else 'ddpg.png')
        plt.show()
        
            
    def plot_duration(self, show_result=False):
        plt.figure(1)
        durations_t = torch.tensor(self.episode_durations, dtype=torch.float)
        if show_result:
            plt.title("Result")
        else:
            plt.clf()
            plt.title("Training")
            plt.xlabel('Episode')
        plt.ylabel('Duration')
        plt.plot(durations_t.numpy())
        if len(durations_t) >= 100:
            means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
            means = torch.cat((torch.zeros(99), means))
            plt.plot(means.numpy())
        plt.pause(0.001)  # pause a bit so that plots are updated
        if not show_result:
            display.display(plt.gcf())
            display.clear_output(wait=True)
        else:
            display.display(plt.gcf())
        

In [51]:
agent = DDPGAgent(env=env, batch_size=BATCH_SIZE, gamma=GAMMA, tau=TAU, device=device, memory_size=10000)
agent.train(EPOCHS)

CUDA Unavailable


  2%|█▎                                                              | 1/50 [00:00<00:37,  1.30it/s]

tensor(0.1548, grad_fn=<NegBackward0>)





RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [73]:
import numpy as np
np.clip([0.2], 0, 3)[0]

0.2

In [35]:
agent.critic_loss_history

[]