In [1]:
import os
from typing import Dict, List, Tuple

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output

In [2]:
class Network(nn.Module):
    def __init__(self, input_dim: int, output_dim: int) -> None:
        super().__init__()

        self.layers = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x: torch.Tensor) -> torch.tensor:
        return self.layers(x)

In [3]:
env = gym.make("CartPole-v1", max_episode_steps=200, render_mode="rgb_array")
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [4]:
env.action_space

Discrete(2)

In [10]:
obs_dim = env.observation_space.shape[0]

In [19]:
np.random.choice(1000, 10, replace=False)

array([733, 884, 961, 258, 737, 515, 565, 157, 283, 931])

In [24]:
a = np.array([[1,0],[2,0],[3,0]])

In [25]:
a[[0,2]]

array([[1, 0],
       [3, 0]])

In [67]:
#  ReplayBuffer(obs_dim, memory_size, batch_size)
memory_size = 1000
batch_size = 32
obs_buf = np.zeros([memory_size, obs_dim], dtype=np.float32) # (1000, 4)
next_obs_buf = np.zeros([memory_size, obs_dim], dtype=np.float32) # (1000, 4)
acts_buf = np.zeros([memory_size], dtype=np.float32) # (1000)
reward_buf = np.zeros([memory_size], dtype=np.float32)
done_buf = np.zeros([memory_size], dtype=np.float32)
max_size = memory_size
ptr = 0
size = 0

In [66]:
obs_buf[ptr]
next_obs_buf[ptr]
acts_buf[ptr]
reward_buf[ptr]
done_buf[ptr]
ptr = (ptr+1) % batch_size

size = min(size+1, max_size)

In [None]:
def sample_batch(total_size, batch_size):
    idxs = np.random.choice(total_size, batch_size, replace = False)
    obs = obs_buf[idxs]
    next_obs = obs_buf[idxs]
    acts = acts_buf[idxs]
    rews = reward_buf[idxs]
    done = done_buf[idxs]

    return dict(obs, next_obs, acts, rews, done)

In [None]:
def select_action(self, state: np.ndarray):

    if self.epsilon > np.random.random():
        cur_action = self.env.action_space.sample()
    else:
        cur_action = self.dqn(torch.tensor(state).to(self.device)).argmax()
        cur_action = cur_action.detach().cpu().numpy()
    
    if not self.is_test:
        self.transition = [state, cur_action]

    return cur_action

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [30]:
action_dim = 2

In [31]:
dqn = Network(obs_dim, action_dim).to(device)

In [33]:
dqn.state_dict()

OrderedDict([('layers.0.weight',
              tensor([[-0.4184, -0.0120, -0.3271, -0.3870],
                      [-0.2199, -0.4720, -0.1129, -0.2971],
                      [ 0.3034,  0.3587, -0.1767, -0.3518],
                      [-0.4461, -0.0095, -0.0333,  0.3036],
                      [ 0.2715,  0.0498,  0.2841,  0.2165],
                      [-0.3455, -0.3539, -0.1807, -0.3753],
                      [-0.2220,  0.1836, -0.2093,  0.0080],
                      [ 0.0045, -0.4624,  0.3227, -0.1238],
                      [-0.2241, -0.0072,  0.0693, -0.4616],
                      [ 0.2722, -0.2485, -0.3032, -0.4520],
                      [-0.4138,  0.4903,  0.1565,  0.2835],
                      [-0.2071, -0.4861,  0.2236,  0.1208],
                      [-0.2571,  0.2422,  0.3296,  0.2087],
                      [ 0.3261,  0.1997, -0.4248, -0.0871],
                      [ 0.1127,  0.3033,  0.1792, -0.0927],
                      [-0.2899,  0.0779,  0.1746, -0.0665],
       

In [34]:
target_dqn = Network(4,2).to(device)

In [35]:
target_dqn.state_dict()

OrderedDict([('layers.0.weight',
              tensor([[-1.6414e-01,  4.9155e-01, -2.2300e-01,  5.6139e-02],
                      [-2.8100e-01, -4.9909e-01,  1.8309e-01,  4.5254e-01],
                      [ 3.6658e-01, -4.8942e-01,  3.9300e-01,  4.8208e-01],
                      [-1.9271e-01,  4.5149e-01,  2.4398e-01,  4.1273e-01],
                      [-1.9908e-01, -3.8869e-01,  3.4082e-01, -4.5730e-01],
                      [ 4.9417e-01, -2.2093e-01, -4.2019e-01,  1.9404e-01],
                      [-1.2816e-01, -1.9174e-01, -4.7431e-01, -3.4796e-01],
                      [-1.9817e-03, -3.5265e-01, -4.8137e-02,  6.8204e-02],
                      [-2.4143e-01,  1.6056e-01, -3.0735e-01, -1.3117e-01],
                      [-2.1297e-01,  7.5299e-02, -2.0012e-01,  3.5245e-01],
                      [-4.7856e-01, -8.2361e-02, -4.4949e-01, -2.9486e-01],
                      [ 1.3504e-01, -3.2623e-01,  2.2601e-01, -3.5069e-01],
                      [ 2.0544e-01, -3.2201e-01,  4.502

In [37]:
target_dqn.load_state_dict(dqn.state_dict())

<All keys matched successfully>

In [38]:
target_dqn.state_dict()

OrderedDict([('layers.0.weight',
              tensor([[-0.4184, -0.0120, -0.3271, -0.3870],
                      [-0.2199, -0.4720, -0.1129, -0.2971],
                      [ 0.3034,  0.3587, -0.1767, -0.3518],
                      [-0.4461, -0.0095, -0.0333,  0.3036],
                      [ 0.2715,  0.0498,  0.2841,  0.2165],
                      [-0.3455, -0.3539, -0.1807, -0.3753],
                      [-0.2220,  0.1836, -0.2093,  0.0080],
                      [ 0.0045, -0.4624,  0.3227, -0.1238],
                      [-0.2241, -0.0072,  0.0693, -0.4616],
                      [ 0.2722, -0.2485, -0.3032, -0.4520],
                      [-0.4138,  0.4903,  0.1565,  0.2835],
                      [-0.2071, -0.4861,  0.2236,  0.1208],
                      [-0.2571,  0.2422,  0.3296,  0.2087],
                      [ 0.3261,  0.1997, -0.4248, -0.0871],
                      [ 0.1127,  0.3033,  0.1792, -0.0927],
                      [-0.2899,  0.0779,  0.1746, -0.0665],
       

In [39]:
target_dqn.eval()

Network(
  (layers): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=2, bias=True)
  )
)

In [40]:
optimizer = optim.Adam(dqn.parameters())
transition = list()
is_test = False

In [44]:
test_state = env.reset()[0]

In [47]:
# return q value for each action
cur_actions = dqn(torch.tensor(test_state).to(device))
cur_actions

tensor([ 0.0099, -0.0013], grad_fn=<AddBackward0>)

In [69]:
cur_actions.gather(1, 0)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/v1/6w9qlgx10wv239bq41hlsstc0000gn/T/ipykernel_46868/3305181383.py", line 1, in <module>
    cur_actions.gather(1, 0)
AttributeError: 'numpy.ndarray' object has no attribute 'gather'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2057, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/opt/homebrew/lib/python3.9/site-packages/IPython/core/ultratb.py", line 1118, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/opt/homebrew/lib/python3.9/site-packages/IPython/core/ultratb.py", line 1012, in structured_traceback
    return VerboseTB.structured_traceback(
  File "/opt/homebrew

In [53]:
cur_actions = cur_actions.argmax().detach().cpu().numpy()

In [54]:
transition = [test_state, cur_actions]
transition

[array([-1.5709920e-05, -1.3862245e-02, -4.1380551e-02,  4.6462081e-02],
       dtype=float32),
 array(0)]

In [56]:
next_state, reward, terminated, truncated, _ = env.step(cur_actions)
next_state, reward, terminated, truncated, _

(array([-0.0044603 , -0.4028905 , -0.03393516,  0.60546416], dtype=float32),
 1.0,
 False,
 False,
 {})

In [57]:
transition += [reward, next_state, terminated]
transition

[array([-1.5709920e-05, -1.3862245e-02, -4.1380551e-02,  4.6462081e-02],
       dtype=float32),
 array(0),
 1.0,
 array([-0.0044603 , -0.4028905 , -0.03393516,  0.60546416], dtype=float32),
 False]

In [68]:
# obs_buf[ptr]
# next_obs_buf[ptr]
# acts_buf[ptr]
# reward_buf[ptr]
# done_buf[ptr]
obs_buf[0] = transition[0]
next_obs_buf[0] = transition[3]
acts_buf[0] = transition[1]
reward_buf[0] = transition[2]
done_buf[0] = transition[4]

In [70]:
[[1.1,2.1],[3.1,3.3],[1.2,1.4]]

[[1.1, 2.1], [3.1, 3.3], [1.2, 1.4]]

In [84]:
action = torch.LongTensor(np.array([[1],[0],[1]]).reshape(-1,1))

In [103]:
torch.LongTensor(np.array([[1],[0],[1]]))

tensor([[1],
        [0],
        [1]])

In [104]:
torch.tensor([[1.1,2.1],[3.1,3.3],[1.2,1.4]])

tensor([[1.1000, 2.1000],
        [3.1000, 3.3000],
        [1.2000, 1.4000]])

In [105]:
torch.tensor([[1.1,2.1],[3.1,3.3],[1.2,1.4]]).gather(1, action)

tensor([[2.1000],
        [3.1000],
        [1.4000]])

In [102]:
torch.tensor([[1.1,2.1],[3.1,3.3],[1.2,1.4]])

tensor([[1.1000, 2.1000],
        [3.1000, 3.3000],
        [1.2000, 1.4000]])

In [116]:
np.random.random()

0.7191010772176967