In [3]:
import gym

# Create the 'Hopper-v2' gym environment; the later cells sample actions from
# it and step it to collect transitions.
env = gym.make('Hopper-v2')



In [26]:
# Smoke-test the environment: draw one random action from the action space,
# reset the environment, and take a single step with that action.
action = env.action_space.sample()
env.reset()
# Last expression of the cell, so the notebook displays what step() returns
# (unpacked further down as next_state, reward, done, info).
env.step(action)

(array([ 1.24943720e+00, -2.87405197e-03, -9.19906749e-03, -1.23329904e-03,
        -1.87178906e-04, -1.78119501e-01, -6.09777862e-02, -1.66516963e+00,
        -1.33327390e+00, -1.13110405e+00, -1.30295591e-01]),
 0.9102182663706267,
 False,
 {})

In [24]:
import torch
from torch import nn
import torch.nn.functional as F

class WNet(nn.Module):
    """Fully connected network ``input_dim -> hidden_units... -> output_dim``.

    Every linear layer, *including the final output layer*, is followed by
    ``gate``.

    NOTE(review): because the gate is also applied to the output layer, the
    default ``F.relu`` means the network cannot produce negative values --
    confirm this is intended for the quantity being regressed.

    Args:
        input_dim: Size of the last dimension of the input.
        output_dim: Size of the last dimension of the output.
        hidden_units: Widths of the hidden layers, in order. May be empty, in
            which case the network is a single gated linear layer.
        gate: Activation callable applied after every linear layer.

    Attributes:
        fc_list: ``nn.ModuleList`` holding the linear layers in application
            order.
        gate: The activation callable.
        device: Device selected at construction time (CUDA when available,
            otherwise CPU); the module is moved there in ``__init__``.
    """

    def __init__(self, input_dim, output_dim, hidden_units=(400, 300), gate=F.relu):
        super().__init__()
        # Consecutive pairs of this list are the (in, out) sizes of each
        # layer. Building from one list also handles an empty `hidden_units`,
        # where indexing `hidden_units[-1]` would raise IndexError.
        layer_dims = [input_dim, *hidden_units, output_dim]
        self.fc_list = nn.ModuleList([
            nn.Linear(in_dim, out_dim)
            for in_dim, out_dim in zip(layer_dims[:-1], layer_dims[1:])
        ])
        self.gate = gate
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.to(self.device)

    def forward(self, x):
        """Run ``x`` through every layer, applying ``gate`` after each one.

        Args:
            x: Tensor, NumPy array or nested sequence whose last dimension is
                ``input_dim``. It is converted to the dtype and device of the
                first layer's weights before use.

        Returns:
            Tensor whose last dimension is ``output_dim``, located on the
            same device as the network's parameters.
        """
        first_weight = self.fc_list[0].weight
        # `Tensor.to` is not in-place, so the converted tensor must be
        # rebound. Using the parameters' own dtype/device keeps this correct
        # even if the module is moved or cast after construction.
        x = torch.as_tensor(x, dtype=first_weight.dtype, device=first_weight.device)
        for layer in self.fc_list:
            x = self.gate(layer(x))

        return x

In [33]:
# Construct the replay buffer.
import sys
# Put the parent directory on the import path before importing `deep_rl`.
# NOTE(review): presumably the `deep_rl` package lives one directory up -- confirm.
sys.path.append('../')
# NOTE(review): the wildcard import is presumably what provides `UniformReplay`
# here (and `tensor` / `np` in the training cell) -- confirm against deep_rl.
from deep_rl import *

# Buffer configured with memory_size=1e6 and batch_size=256. The training cell
# further down rebinds `replay` to a new buffer with batch_size=64.
replay = UniformReplay(memory_size=int(1e6), batch_size=256)

In [29]:
from torch.optim import Adam
# Network taking an 11-dimensional input and producing a single output,
# with WNet's default hidden layers and activation.
net = WNet(11, 1)
# Adam with its default hyperparameters over all of the network's parameters.
opt = Adam(list(net.parameters()))

In [43]:
# Collect transitions with a uniformly random policy and, once the warm-up
# period is over, take one gradient step per environment step that regresses
# the network output for a state onto the reward stored with that transition.
from torch.optim import Adam
# construct the replay buffer
import sys
import numpy as np
# Put the parent directory on the import path before importing `deep_rl`.
sys.path.append('../')
# NOTE(review): the wildcard import is presumably what provides `UniformReplay`
# and `tensor` -- confirm against deep_rl.
from deep_rl import *

replay = UniformReplay(memory_size=int(1e6), batch_size=64)

loss_vec = []              # one Python float per gradient step
steps = int(1e6)           # total environment steps
warm_up = int(1e4)         # steps of pure data collection before training
log_interval = int(1e4)    # print the loss every this many steps


net = WNet(11, 1)
opt = Adam(list(net.parameters()))

# Initial reset, done once up front instead of re-checking `i == 0` each step.
state = env.reset()

for i in range(steps):
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)

    replay.feed(dict(
            state=[state],
            action=[action],
            reward=[reward],
            next_state=[next_state],
            mask=1-np.asarray([done], dtype=np.int32),
        ))

    # Start a fresh episode when the current one has ended.
    state = env.reset() if done else next_state

    if i > warm_up:
        transitions = replay.sample()
        # Move the batch to whatever device the network lives on rather than
        # hard-coding CUDA, so the cell also runs on CPU-only machines.
        states = tensor(transitions.state).to(net.device)
        rewards = tensor(transitions.reward).unsqueeze(-1).to(net.device)
        # Not used by the reward regression below; kept so these names stay
        # defined for any later cell that may read them.
        actions = tensor(transitions.action)
        next_states = tensor(transitions.next_state)
        mask = tensor(transitions.mask).unsqueeze(-1)

        phi_dot_w = net(states)
        # Half mean-squared error between the prediction and the reward.
        loss = (phi_dot_w - rewards).pow(2).mul(0.5).mean()
        net.zero_grad()
        loss.backward()
        opt.step()

        # Store a plain float: appending the tensor itself would keep an
        # autograd-attached (possibly GPU) tensor alive for every step.
        loss_value = loss.item()
        loss_vec.append(loss_value)

        if i % log_interval == 0:
            print(f'Steps : {i},  Loss : {loss_value}')
        

Steps : 20000,  Loss : 0.0010396852158010006
Steps : 30000,  Loss : 0.0026800506748259068
Steps : 40000,  Loss : 0.0010459991171956062
Steps : 50000,  Loss : 0.002869775053113699
Steps : 60000,  Loss : 0.0013931540306657553
Steps : 70000,  Loss : 0.0008314452134072781
Steps : 80000,  Loss : 0.001108307158574462
Steps : 90000,  Loss : 0.001379550900310278
Steps : 100000,  Loss : 0.0064567821100354195
Steps : 110000,  Loss : 0.004761221818625927
Steps : 120000,  Loss : 0.0008333222940564156
Steps : 130000,  Loss : 0.0007928359555080533
Steps : 140000,  Loss : 0.0010601794347167015
Steps : 150000,  Loss : 0.0008834320469759405
Steps : 160000,  Loss : 0.002309125615283847
Steps : 170000,  Loss : 0.0023637961130589247
Steps : 180000,  Loss : 0.002173013286665082
Steps : 190000,  Loss : 0.0020743415225297213
Steps : 200000,  Loss : 0.0018699580105021596
Steps : 210000,  Loss : 0.0017723074415698647
Steps : 220000,  Loss : 0.002121815225109458
Steps : 230000,  Loss : 0.004485963378101587
Step