In [42]:
import os
import numpy as np
import torch
import gymnasium as gym
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
class SerializedBuffer:
    def __init__(self, path, device="mps"):
        '''Path is the file location to the serialized buffer'''
        tmp = torch.load(path)
        self.buffer_size = self._n = tmp['state'].size(0)
        self.device = device
        self.states = tmp['state'].clone().to(self.device)
        self.actions = tmp['action'].clone().to(self.device)
        self.rewards = tmp['reward'].clone().to(self.device)
        self.dones = tmp['done'].clone().to(self.device)
        self.next_states = tmp['next_state'].clone().to(self.device)

    def sample(self, bath_size):
        idx = np.random.randint(low = 0, high = self._n, size = bath_size)
        return (
            self.states[idx],
            self.actions[idx],
            self.rewards[idx],
            self.dones[idx],
            self.next_states[idx]
        )

class Buffer(SerializedBuffer):

    def __init__(self, buffer_size, state_shape, action_shape, device):
        self._n = 0
        self._p = 0
        self.buffer_size = buffer_size
        self.device = device

        self.states = torch.empty(
            (buffer_size, *state_shape), dtype=torch.float, device=device)
        self.actions = torch.empty(
            (buffer_size, *action_shape), dtype=torch.float, device=device)
        self.rewards = torch.empty(
            (buffer_size, 1), dtype=torch.float, device=device)
        self.dones = torch.empty(
            (buffer_size, 1), dtype=torch.float, device=device)
        self.next_states = torch.empty(
            (buffer_size, *state_shape), dtype=torch.float, device=device)

    def append(self, state, action, reward, done, next_state):
        self.states[self._p].copy_(torch.from_numpy(state))
        self.actions[self._p].copy_(torch.from_numpy(action))
        self.rewards[self._p] = float(reward)
        self.dones[self._p] = float(done)
        self.next_states[self._p].copy_(torch.from_numpy(next_state))

        self._p = (self._p + 1) % self.buffer_size
        self._n = min(self._n + 1, self.buffer_size)

    def save(self, path):
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))

        torch.save({
            'state': self.states.clone().cpu(),
            'action': self.actions.clone().cpu(),
            'reward': self.rewards.clone().cpu(),
            'done': self.dones.clone().cpu(),
            'next_state': self.next_states.clone().cpu(),
        }, path)




In [4]:
env = gym.make("Taxi-v3")

In [8]:
gym.logger.set_level(40)

In [17]:
def make_env(env_id):
    return gym.make(env_id)

'''normalization is used to train the neural network better
but there is ome problem with this wrapper need to look more 
deep to fix it. moving on for now.
'''
# class NormalizedEnv(gym.Wrapper):

#     def __init__(self, env):
#         gym.Wrapper.__init__(self, env)
#         self._max_episode_steps = env._max_episode_steps

#         self.scale = env.action_space.n
#         self.action_space.n /= self.scale
#         self.action_space.low /= self.scale

#     def step(self, action):
#         return self.env.step(action * self.scale)

'normalization is used to train the neural network better\nbut there is ome problem with this wrapper need to look more \ndeep to fix it. moving on for now.\n'

In [19]:
algo = torch.load("../works_ig_3000.pth", map_location="cpu")

  algo = torch.load("../works_ig_3000.pth", map_location="cpu")


In [33]:
from tqdm import tqdm
import numpy as np
import torch


def soft_update(target, source, tau):
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.mul_(1.0 - tau)
        t.data.add_(tau * s.data)


def disable_gradient(network):
    for param in network.parameters():
        param.requires_grad = False


def add_random_noise(action, std):
    action += np.random.randn(*action.shape) * std
    return action.clip(-1.0, 1.0)


def collect_demo(env, algo, buffer_size, device, std, p_rand, seed=0):
    # env.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    buffer = Buffer(
        buffer_size=buffer_size,
        state_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        device=device
    )

    total_return = 0.0
    num_episodes = 0

    state,_ = env.reset()
    t = 0
    episode_return = 0.0

    for _ in tqdm(range(1, buffer_size + 1)):
        t += 1

        if np.random.rand() < p_rand:
            action = env.action_space.sample()
        else:
            print(state)
            action = algo(state)
            action = add_random_noise(action, std)

        next_state, reward, done, _ = env.step(action)
        mask = False if t == env._max_episode_steps else done
        buffer.append(state, action, reward, mask, next_state)
        episode_return += reward

        if done:
            num_episodes += 1
            total_return += episode_return
            state = env.reset()
            t = 0
            episode_return = 0.0

        state = next_state

    print(f'Mean return of the expert is {total_return / num_episodes}')
    return buffer


In [43]:
class DQN(nn.Module):

    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)

In [45]:
algo = DQN(500, 6)
algo.load_state_dict(torch.load("../works_ig_3000.pth", map_location="cpu"))

  algo.load_state_dict(torch.load("../works_ig_3000.pth", map_location="cpu"))


<All keys matched successfully>

In [46]:
buffer_size = 10000
device = "mps"
std = 0.1
p_rand = 0.3
seed = 420
env_id = "Taxi-v3"


In [47]:
buffer = collect_demo(
        env=env,
        algo=algo,
        buffer_size=buffer_size,
        device=device,
        std=std,
        p_rand=p_rand,
        seed=seed
)
buffer.save(os.path.join(
    'buffers',
    env_id,
    f'size{buffer_size}_std{std}_prand{p_rand}.pth'
))

  0%|          | 0/10000 [00:00<?, ?it/s]

469





TypeError: linear(): argument 'input' (position 1) must be Tensor, not int

In [39]:
algo

OrderedDict([('layer1.weight',
              tensor([[ 0.0139, -0.0546, -0.0773,  ..., -0.0972,  0.0198, -0.1513],
                      [ 0.0155, -0.0044, -0.0116,  ..., -0.0708,  0.0200, -0.1557],
                      [ 0.0070,  0.0014,  0.0011,  ..., -0.1007,  0.0269, -0.1578],
                      ...,
                      [-0.0047, -0.1089, -0.1138,  ..., -0.0831,  0.0488, -0.1288],
                      [ 0.0056, -0.0413, -0.0788,  ..., -0.0944,  0.0317, -0.1185],
                      [-0.0028, -0.0912, -0.1148,  ..., -0.0731,  0.0034,  0.0667]])),
             ('layer1.bias',
              tensor([-0.0939, -0.0793, -0.0906,  0.0472, -0.0955, -0.0807, -0.1067, -0.0922,
                       0.0311, -0.0746,  0.4107, -0.0213, -0.0802, -0.0773,  0.2199,  0.2363,
                       0.0493, -0.0834,  0.1591,  0.0537, -0.0853, -0.0834, -0.0969, -0.0997,
                      -0.0089, -0.0876,  0.0939, -0.0954, -0.0939, -0.0995, -0.0844, -0.0543,
                       0.0715,