In [1]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import gym
import numpy as np
import copy
import random
from model import Actor
from model import Critic
from buffer import Buffer
from noise import OUStrategy
from collections import deque

In [2]:
GAMMA = 0.99
TAU = 1e-3
BUF_SIZE = 4096
BATCH_SIZE = 256
LR = 1e-4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [10]:
class DDPG:

    def __init__(self, state_dim, action_dim):
        self.critic = Critic(state_dim, action_dim).to(device)
        self.target_c = copy.deepcopy(self.critic)

        self.actor = Actor(state_dim).to(device)
        self.target_a = copy.deepcopy(self.actor)

        self.optimizer_c = optim.Adam(self.critic.parameters(), lr=LR)
        self.optimizer_a = optim.Adam(self.actor.parameters(), lr=LR)

    def act(self, state):
        state = torch.from_numpy(np.array(state)).float().to(device)
        return self.actor.forward(state).detach().squeeze(0).cpu().numpy()

    def update(self, batch):
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.from_numpy(np.array(states)).float().to(device)
        actions = torch.from_numpy(np.array(actions)).float().to(device)
        rewards = torch.from_numpy(np.array(rewards)).float().to(device).unsqueeze(1)
        next_states = torch.from_numpy(np.array(next_states)).float().to(device)
        dones = torch.from_numpy(np.array(dones)).to(device)

        Q_current = self.critic(states, actions)
        Q_next = self.target_c(next_states, self.target_a(next_states).detach())
        y = (rewards + GAMMA * Q_next).detach()

        ##################Update critic#######################
        loss_c = F.mse_loss(y, Q_current)
        self.optimizer_c.zero_grad()
        loss_c.backward()
        self.optimizer_c.step()

        ##################Update actor#######################
        loss_a = -self.critic.forward(states, self.actor(states)).mean()
        self.optimizer_a.zero_grad()
        loss_a.backward()
        self.optimizer_a.step()

        ##################Update targets#######################
        for target_pr, pr in zip(self.target_a.parameters(), self.actor.parameters()):
            target_pr.data.copy_(TAU * pr.data + (1 - TAU) * target_pr.data)

        for target_pr, pr in zip(self.target_c.parameters(), self.critic.parameters()):
            target_pr.data.copy_(TAU * pr.data + (1 - TAU) * target_pr.data)

In [12]:
episodes = 150

seed = 22
env = gym.make('MountainCarContinuous-v0')
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

agent = DDPG(2, 1)
buf = Buffer(BUF_SIZE)
noise = OUStrategy(env.action_space, min_sigma=1e-4)
updates_noise = 0
results = deque(maxlen=100)
for episode in range(episodes):
    state = env.reset()
    episode_reward = 0
    done = False
    total_reward = 0
    while not done:
        action = agent.act(state)
        action = noise.get_action_from_raw_action(action, updates_noise)
        updates_noise += 1
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        buf.add((state, action, reward, next_state, done))
        if len(buf) >= BATCH_SIZE:
            agent.update(buf.sample(BATCH_SIZE))
        state = next_state
    results.append(total_reward)
    print(f"I did {episode}th episode. Result: {total_reward}, sigma = {noise.sigma}, mean reward = {np.mean(results)}")

I did 0th episode. Result: 85.29541299223042, sigma = 0.298776408, mean reward = 85.29541299223042
I did 1th episode. Result: 70.23170211316557, sigma = 0.296560147, mean reward = 77.76355755269799
I did 2th episode. Result: 80.06586350641675, sigma = 0.294889704, mean reward = 78.53099287060424
I did 3th episode. Result: 81.10407377311334, sigma = 0.29389403599999997, mean reward = 79.17426309623151
I did 4th episode. Result: 88.75603232836036, sigma = 0.293177275, mean reward = 81.09061694265729
I did 5th episode. Result: 80.88380839077469, sigma = 0.291554816, mean reward = 81.05614885067685
I did 6th episode. Result: 91.04907534163837, sigma = 0.29096101399999996, mean reward = 82.48370977795707
I did 7th episode. Result: -26.53283269950482, sigma = 0.287965013, mean reward = 68.85664196827433
I did 8th episode. Result: 87.6861493942902, sigma = 0.286783407, mean reward = 70.94880946005387
I did 9th episode. Result: 82.83763496669746, sigma = 0.285703767, mean reward = 72.137692010

I did 81th episode. Result: 94.22029680692164, sigma = 0.225435863, mean reward = 85.03092822018458
I did 82th episode. Result: 88.80848316447174, sigma = 0.224731098, mean reward = 85.07644093035671
I did 83th episode. Result: 87.36167345930845, sigma = 0.22400534, mean reward = 85.1036460795109
I did 84th episode. Result: 85.96176065469734, sigma = 0.223159622, mean reward = 85.11374154510133
I did 85th episode. Result: 92.91332165249264, sigma = 0.222880715, mean reward = 85.20443433704773
I did 86th episode. Result: 95.65993403049043, sigma = 0.222610805, mean reward = 85.32461249444363
I did 87th episode. Result: 94.57774545061937, sigma = 0.222313904, mean reward = 85.42976173258198
I did 88th episode. Result: 93.01902816179802, sigma = 0.222026, mean reward = 85.51503438909002
I did 89th episode. Result: 91.27755201764911, sigma = 0.22155815599999998, mean reward = 85.57906236274069
I did 90th episode. Result: 91.49628519236082, sigma = 0.221078316, mean reward = 85.644086789439