In [1]:
import wrappers
from dqn_model import DQN

import gymnasium as gym

import argparse
import time
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.tensorboard import SummaryWriter

2024-07-11 22:22:28.487228: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
device = torch.device(
    "cuda:0" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

NVIDIA GeForce RTX 3060
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [4]:
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
MEAN_BOUND = 19.0

GAMMA = 0.99 # for bellman equation
BATCH_SIZE = 64 # size that will be batched from replay buffer 
REPLAY_SIZE = 11_000 # max capacity of replay buffer
REPLAY_START_SIZE = 10_000 # num of frames we wait befor start training to populate replay buffer
LEARNING_RATE = 1e-4 # lr used in optimizer
SYNC_TARGET_FRAMES = 1_000 # frequency of models syncronization

EPSILON_DECAY_LAST_FRAME = 200_000
EPSILON_START = 1.0 # start exploration parameter value
EPSILON_FINAL = 0.01 # final exploration parameter value


In [6]:
Experience = collections.namedtuple('Experience', field_names=['state',
                                                               'action',
                                                               'reward',
                                                               'done',
                                                               'new_state'])

In [7]:
class ExperienceBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)

        states, actions, rewards, dones, next_states = \
            zip(*[self.buffer[idx] for idx in indices])
        
        return np.array(states), np.array(actions), \
                np.array(rewards, dtype=np.float32), \
                np.array(dones, dtype=bool), \
                np.array(next_states)

In [8]:
class Agent:
    def __init__(self, env, exp_buffer, dev=device):
        self.env = env
        self.exp_buffer = exp_buffer
        self.device = dev
        self._reset()
    
    def _reset(self):
        self.state, _ = self.env.reset()
        self.total_reward = 0.0

    @torch.no_grad()
    def play_step(self, net, epsilon=0.0):
        done_reward = None

        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            state_a = np.array([self.state], copy=False)
            state_v = torch.tensor(state_a).to(self.device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        new_state, reward, terminated, truncated, info = self.env.step(action)
        done = terminated or truncated
        self.total_reward += reward
        exp = Experience(self.state, action, reward, done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

In [9]:
tt = torch.tensor(
    [
        [1,2,3],
        [5,6,7]
    ]
)
print(tt.max(1).values)

tensor([3, 7])


In [10]:
def calc_loss(batch, net, target_net, device=device):
    states, actions, rewards, dones, next_states = batch
    states_v = torch.tensor(np.array(states, copy=False)).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(dones).to(device)
    next_states_v = torch.tensor(np.array(next_states, copy=False)).to(device)

    # As we get [
    #               [q11, q12, q13, q14],
    #               [q21, q22, q23, q24]
    #           ]
    # where q_(batch_sample)_(number_of_action) 
    # as output from network and gather is collecting elements of tensor for 
    # given axis and take element according to given index
    # 
    # As we have 2d net out and 1d action list, we extend dimension of action
    # through the last axis to get [a1, a2, a3] -> [[a1], [a2], [a3], ...]. 
    # Now having such index we can correctly take needed elements from net out.
    # But out of this operation is  [[q1x], [q2x], [q3x], ...], we need to 
    # reduce last dimension of this tensor, so we use reverse of unsqueeze func(squeeze)
    # to the last(-1) dim of tensor. :)
    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    

    # tt = torch.tensor(
    #     [
    #         [1,2,3],
    #         [5,6,7]
    #     ]
    # )
    # print(tt.max(1))
    # ======================
    # torch.return_types.max(
    #     values=tensor([3, 7]),
    #     indices=tensor([2, 2]))

    next_state_values = target_net(next_states_v).max(1).values

    # final step, so we add only reward
    next_state_values[done_mask] = 0.0
    next_state_values = next_state_values.detach()
    
    expected_state_action_values = next_state_values * GAMMA + rewards_v
    
    return nn.MSELoss()(state_action_values, expected_state_action_values)
    

In [11]:
env = gym.make(DEFAULT_ENV_NAME)
env = wrappers.wrap_env(env)
net = DQN(env.observation_space.shape, env.action_space.n).to(device)
tgt_net = DQN(env.observation_space.shape, env.action_space.n).to(device)

writer = SummaryWriter(comment="-" + DEFAULT_ENV_NAME)
print(net)

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


DQN(
  (conv_layer): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
    (6): Flatten(start_dim=1, end_dim=-1)
  )
  (dense): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)


In [12]:
buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

In [13]:
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []
frame_idx = 0
ts_frame = 0
ts = time.time()
best_m_reward = None

In [14]:
while True:
    frame_idx += 1
    epsilon = max(EPSILON_FINAL, EPSILON_START -
                    frame_idx / EPSILON_DECAY_LAST_FRAME)

    reward = agent.play_step(net, epsilon)
    if reward is not None:
        total_rewards.append(reward)
        speed = (frame_idx - ts_frame) / (time.time() - ts)
        ts_frame = frame_idx
        ts = time.time()
        m_reward = np.mean(total_rewards[-100:])
        print("%d: done %d games, reward %.3f, "
                "eps %.2f, speed %.2f f/s" % (
            frame_idx, len(total_rewards), m_reward, epsilon,
            speed
        ))
        writer.add_scalar("epsilon", epsilon, frame_idx)
        writer.add_scalar("speed", speed, frame_idx)
        writer.add_scalar("reward_100", m_reward, frame_idx)
        writer.add_scalar("reward", reward, frame_idx)
        if best_m_reward is None or best_m_reward < m_reward:
            torch.save(net.state_dict(), DEFAULT_ENV_NAME +
                        "-best_%.0f.dat" % m_reward)
            if best_m_reward is not None:
                print("Best reward updated %.3f -> %.3f" % (
                    best_m_reward, m_reward))
            best_m_reward = m_reward
        if m_reward > MEAN_BOUND:
            print("Solved in %d frames!" % frame_idx)
            break

    if len(buffer) < REPLAY_START_SIZE:
        continue

    if frame_idx % SYNC_TARGET_FRAMES == 0:
        tgt_net.load_state_dict(net.state_dict())

    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = calc_loss(batch, net, tgt_net, device=device)
    loss_t.backward()
    optimizer.step()
writer.close()

1199: done 1 games, reward -20.000, eps 0.99, speed 160.19 f/s
2342: done 2 games, reward -19.500, eps 0.99, speed 1084.47 f/s
Best reward updated -20.000 -> -19.500
3268: done 3 games, reward -20.000, eps 0.98, speed 1098.23 f/s
4503: done 4 games, reward -19.500, eps 0.98, speed 1099.63 f/s
5346: done 5 games, reward -19.800, eps 0.97, speed 1095.25 f/s
6538: done 6 games, reward -19.667, eps 0.97, speed 1104.00 f/s
7480: done 7 games, reward -19.857, eps 0.96, speed 1087.30 f/s
8552: done 8 games, reward -19.875, eps 0.96, speed 1096.72 f/s
9342: done 9 games, reward -20.000, eps 0.95, speed 1059.09 f/s
10240: done 10 games, reward -20.000, eps 0.95, speed 254.60 f/s
11220: done 11 games, reward -19.909, eps 0.94, speed 111.57 f/s
12010: done 12 games, reward -20.000, eps 0.94, speed 110.86 f/s
13093: done 13 games, reward -20.000, eps 0.93, speed 111.76 f/s
14084: done 14 games, reward -20.000, eps 0.93, speed 110.60 f/s
14846: done 15 games, reward -20.067, eps 0.93, speed 110.76 

In [5]:
env = gym.make(DEFAULT_ENV_NAME, render_mode='human')
env = wrappers.wrap_env(env)
net = DQN(env.observation_space.shape,
    env.action_space.n).to(device)

state = torch.load('PongNoFrameskip-v4-best_19.dat', map_location=lambda stg,_: stg)
net.load_state_dict(state)
state, _ = env.reset()
total_reward = 0.0
c = collections.Counter()
while True:
    start_ts = time.time()
    state_v = torch.tensor(np.array([state], copy=False))
    q_vals = net(state_v.to(device)).cpu().data.numpy()[0]
    print(q_vals)
    action = np.argmax(q_vals)
    c[action] += 1
    state, reward, terminated, truncated, _ = env.step(action)
    total_reward += reward
    if terminated or truncated:
        break
    print("Total reward: %.2f" % total_reward)
    print("Action counts:", c)
env.close()

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


[1.4211023 1.3704684 1.4227457 1.4174623 1.3425744 1.3661815]
Total reward: 0.00
Action counts: Counter({2: 1})
[1.4048976 1.4015887 1.4458507 1.421081  1.4255883 1.3823009]
Total reward: 0.00
Action counts: Counter({2: 2})
[1.4297353 1.42418   1.418351  1.4811763 1.4058511 1.4501446]
Total reward: 0.00
Action counts: Counter({2: 2, 3: 1})
[1.4648919 1.4785863 1.5010842 1.4588428 1.4302855 1.4190791]
Total reward: 0.00
Action counts: Counter({2: 3, 3: 1})
[1.5709363 1.5084101 1.4693091 1.5135931 1.4102865 1.4774141]
Total reward: 0.00
Action counts: Counter({2: 3, 3: 1, 0: 1})
[1.5194988 1.4382467 1.4146986 1.4288013 1.3884304 1.4065276]
Total reward: 0.00
Action counts: Counter({2: 3, 0: 2, 3: 1})
[1.4416405 1.4322675 1.4063925 1.4703335 1.3424435 1.4292159]
Total reward: 0.00
Action counts: Counter({2: 3, 3: 2, 0: 2})
[1.4462404 1.428262  1.4105765 1.4619288 1.3734533 1.4580679]
Total reward: 0.00
Action counts: Counter({2: 3, 3: 3, 0: 2})
[1.4847684 1.4103986 1.4487268 1.3791623 1.4

In [20]:
while True:
    start_ts = time.time()
    state_v = torch.tensor(np.array([state], copy=False))
    q_vals = net(state_v.to(device)).cpu().data.numpy()[0]
    print(q_vals)
    action = np.argmax(q_vals)
    c[action] += 1
    state, reward, terminated, truncated, _ = env.step(action)
    total_reward += reward
    if terminated or truncated:
        break
    print("Total reward: %.2f" % total_reward)
    print("Action counts:", c)
env.close()

[1.4211023 1.3704684 1.4227457 1.4174623 1.3425744 1.3661815]
Total reward: 0.00
Action counts: Counter({2: 1})
[1.4048976 1.4015887 1.4458507 1.421081  1.4255883 1.3823009]
Total reward: 0.00
Action counts: Counter({2: 2})
[1.4297353 1.42418   1.418351  1.4811763 1.4058511 1.4501446]
Total reward: 0.00
Action counts: Counter({2: 2, 3: 1})
[1.4648919 1.4785863 1.5010842 1.4588428 1.4302855 1.4190791]
Total reward: 0.00
Action counts: Counter({2: 3, 3: 1})
[1.5709363 1.5084101 1.4693091 1.5135931 1.4102865 1.4774141]
Total reward: 0.00
Action counts: Counter({2: 3, 3: 1, 0: 1})
[1.5194988 1.4382467 1.4146986 1.4288013 1.3884304 1.4065276]
Total reward: 0.00
Action counts: Counter({2: 3, 0: 2, 3: 1})
[1.4416405 1.4322675 1.4063925 1.4703335 1.3424435 1.4292159]
Total reward: 0.00
Action counts: Counter({2: 3, 3: 2, 0: 2})
[1.4462404 1.428262  1.4105765 1.4619288 1.3734533 1.4580679]
Total reward: 0.00
Action counts: Counter({2: 3, 3: 3, 0: 2})
[1.4847684 1.4103986 1.4487268 1.3791623 1.4

In [7]:
env.env.close()

: 