In [40]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import gym
import numpy as np
import copy
import random
from model import Actor
from model import Critic
from buffer import Buffer
from noise import OUStrategy
from collections import deque

In [41]:
GAMMA = 0.99
TAU = 1e-3
BUF_SIZE = 4096
BATCH_SIZE = 256
LR = 1e-4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [44]:
class DDPG:

    def __init__(self, state_dim, action_dim):
        self.critic = Critic(state_dim, action_dim).to(device)
        self.target_c = copy.deepcopy(self.critic)

        self.actor = Actor(state_dim).to(device)
        self.target_a = copy.deepcopy(self.actor)

        self.optimizer_c = optim.Adam(self.critic.parameters(), lr=LR)
        self.optimizer_a = optim.Adam(self.actor.parameters(), lr=LR)

    def act(self, state):
        state = torch.from_numpy(np.array(state)).float().to(device)
        return self.actor.forward(state).detach().squeeze(0).cpu().numpy()

    def update(self, batch):
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.from_numpy(np.array(states)).float().to(device)
        actions = torch.from_numpy(np.array(actions)).float().to(device)
        rewards = torch.from_numpy(np.array(rewards)).float().to(device).unsqueeze(1)
        next_states = torch.from_numpy(np.array(next_states)).float().to(device)
        dones = torch.from_numpy(np.array(dones)).to(device)

        Q_current = self.critic(states, actions)
        Q_next = self.target_c(next_states, self.target_a(next_states).detach())
        y = (rewards + GAMMA * Q_next).detach()

        ##################Update critic#######################
        loss_c = F.mse_loss(y, Q_current)
        self.optimizer_c.zero_grad()
        loss_c.backward()
        self.optimizer_c.step()

        ##################Update actor#######################
        loss_a = -self.critic.forward(states, self.actor(states)).mean()
        self.optimizer_a.zero_grad()
        loss_a.backward()
        self.optimizer_a.step()

        ##################Update targets#######################
        for target_pr, pr in zip(self.target_a.parameters(), self.actor.parameters()):
            target_pr.data.copy_(TAU * pr.data + (1 - TAU) * target_pr.data)

        for target_pr, pr in zip(self.target_c.parameters(), self.critic.parameters()):
            target_pr.data.copy_(TAU * pr.data + (1 - TAU) * target_pr.data)
        
    def testing(self, num_repeat=100):
        env_ = gym.make('MountainCarContinuous-v0')
        rews = np.zeros(shape=(num_repeat, ))
        for k in range(num_repeat):
            state = env_.reset()
            done = False
            while not done:
                action = self.act(state)
                state, reward, done, _ = env_.step([action])
                rews[k] += reward
        return np.mean(rews)

In [47]:
episodes = 70

seed = 22
env = gym.make('MountainCarContinuous-v0')
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

agent = DDPG(2, 1)
buf = Buffer(BUF_SIZE)
noise = OUStrategy(env.action_space, min_sigma=1e-4)
updates_noise = 0
for episode in range(episodes):
    state = env.reset()
    episode_reward = 0
    done = False
    total_reward = 0
    while not done:
        action = agent.act(state)
        action = noise.get_action_from_raw_action(action, updates_noise)
        updates_noise += 1
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        buf.add((state, action, reward, next_state, done))
        if len(buf) >= BATCH_SIZE:
            agent.update(buf.sample(BATCH_SIZE))
        state = next_state
    print(f"I did {episode}th episode. Result: {total_reward}, sigma = {noise.sigma}")
    if not episode % 10:
        print(f'test_mean_reward = {agent.testing()}')

I did 0th episode. Result: 85.29541299223042, sigma = 0.298776408
test_mean_reward = -0.613522488802396
I did 1th episode. Result: 70.23170211316557, sigma = 0.296560147
I did 2th episode. Result: 80.06586350641675, sigma = 0.294889704
I did 3th episode. Result: 81.10407377311334, sigma = 0.29389403599999997
I did 4th episode. Result: 88.75603232836036, sigma = 0.293177275
I did 5th episode. Result: 80.88380839077469, sigma = 0.291554816
I did 6th episode. Result: 91.04907534163837, sigma = 0.29096101399999996
I did 7th episode. Result: -26.53283269950482, sigma = 0.287965013
I did 8th episode. Result: 87.6861493942902, sigma = 0.286783407
I did 9th episode. Result: 82.83763496669746, sigma = 0.285703767
I did 10th episode. Result: 88.04199659479687, sigma = 0.284396203
test_mean_reward = -0.061572509417478515
I did 11th episode. Result: 93.5441474924562, sigma = 0.283901368
I did 12th episode. Result: 86.66400893856255, sigma = 0.28280073499999997
I did 13th episode. Result: 87.292951