In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import mario_env

In [3]:
import gym

In [4]:
from stable_baselines3.common.env_checker import check_env

In [5]:
# If the environment don't follow the interface, an error will be thrown


In [6]:
from stable_baselines3 import DQN, PPO, A2C, DDPG, TD3, SAC
from stable_baselines3.common.cmd_util import make_vec_env
# wrap it

In [87]:
import torch
import torch.nn as nn


# Necessary for my KFAC implementation.
class AddBias(nn.Module):
    def __init__(self, bias):
        super(AddBias, self).__init__()
        self._bias = nn.Parameter(bias.unsqueeze(1))

    def forward(self, x):
        if x.dim() == 2:
            bias = self._bias.t().view(1, -1)
        else:
            bias = self._bias.t().view(1, -1, 1, 1)

        return x + bias


def init(module, weight_init, bias_init, gain=1):
    weight_init(module.weight.data, gain=gain)
    bias_init(module.bias.data)
    return module


# https://github.com/openai/baselines/blob/master/baselines/common/tf_util.py#L87
def init_normc_(weight, gain=1):
    weight.normal_(0, 1)
    weight *= gain / torch.sqrt(weight.pow(2).sum(1, keepdim=True))

In [91]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


"""
Modify standard PyTorch distributions so they are compatible with this code.
"""

FixedCategorical = torch.distributions.Categorical

old_sample = FixedCategorical.sample
FixedCategorical.sample = lambda self: old_sample(self).unsqueeze(-1)

log_prob_cat = FixedCategorical.log_prob
FixedCategorical.log_probs = lambda self, actions: log_prob_cat(self, actions.squeeze(-1)).unsqueeze(-1)

FixedCategorical.mode = lambda self: self.probs.argmax(dim=1, keepdim=True)

FixedNormal = torch.distributions.Normal
log_prob_normal = FixedNormal.log_prob
FixedNormal.log_probs = lambda self, actions: log_prob_normal(self, actions).sum(-1, keepdim=True)

entropy = FixedNormal.entropy
FixedNormal.entropy = lambda self: entropy(self).sum(-1)

FixedNormal.mode = lambda self: self.mean


class Categorical(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(Categorical, self).__init__()

        init_ = lambda m: init(m,
              nn.init.orthogonal_,
              lambda x: nn.init.constant_(x, 0),
              gain=0.01)

        self.linear = init_(nn.Linear(num_inputs, num_outputs))

    def forward(self, x):
        x = self.linear(x)
        return FixedCategorical(logits=x)


class DiagGaussian(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(DiagGaussian, self).__init__()

        init_ = lambda m: init(m,
              init_normc_,
              lambda x: nn.init.constant_(x, 0))

        self.fc_mean = init_(nn.Linear(num_inputs, num_outputs))
        self.logstd = AddBias(torch.zeros(num_outputs))

    def forward(self, x):
        action_mean = self.fc_mean(x)

        #  An ugly hack for my KFAC implementation.
        zeros = torch.zeros(action_mean.size())
        if x.is_cuda:
            zeros = zeros.cuda()

        action_logstd = self.logstd(zeros)
        return FixedNormal(action_mean, action_logstd.exp())

In [93]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.size(0), -1)


class Policy(nn.Module):
    def __init__(self, obs_shape, action_space, recurrent_policy=None):
        super(Policy, self).__init__()
        if len(obs_shape) == 3:
            self.base = CNNBase(obs_shape[0], recurrent_policy=None)
        elif len(obs_shape) == 1:
            assert not recurrent_policy, \
                "Recurrent policy is not implemented for the MLP controller"
            self.base = MLPBase(obs_shape[0])
        else:
            raise NotImplementedError

        if action_space.__class__.__name__ == "Discrete":
            num_outputs = action_space.n
            self.dist = Categorical(self.base.output_size, num_outputs)
        elif action_space.__class__.__name__ == "Box":
            num_outputs = action_space.shape[0]
            self.dist = DiagGaussian(self.base.output_size, num_outputs)
        else:
            raise NotImplementedError

        self.state_size = self.base.state_size

    def forward(self, inputs, states, masks):
        raise NotImplementedError

    def act(self, inputs, states, masks, deterministic=False):
        value, actor_features, states = self.base(inputs, states, masks)
        dist = self.dist(actor_features)

        if deterministic:
            action = dist.mode()
        else:
            action = dist.sample()

        action_log_probs = dist.log_probs(action)
        dist_entropy = dist.entropy().mean()

        return value, action, action_log_probs, states

    def get_value(self, inputs, states, masks):
        value, _, _ = self.base(inputs, states, masks)
        return value

    def evaluate_actions(self, inputs, states, masks, action):
        value, actor_features, states = self.base(inputs, states, masks)
        dist = self.dist(actor_features)

        action_log_probs = dist.log_probs(action)
        dist_entropy = dist.entropy().mean()

        return value, action_log_probs, dist_entropy, states


class CNNBase(nn.Module):
    def __init__(self, num_inputs, use_gru):
        super(CNNBase, self).__init__()

        init_ = lambda m: init(m,
                      nn.init.orthogonal_,
                      lambda x: nn.init.constant_(x, 0),
                      nn.init.calculate_gain('relu'))

        self.main = nn.Sequential(
            init_(nn.Conv2d(num_inputs, 32, 8, stride=4)),
            nn.ReLU(),
            init_(nn.Conv2d(32, 64, 4, stride=2)),
            nn.ReLU(),
            init_(nn.Conv2d(64, 32, 3, stride=1)),
            nn.ReLU(),
            Flatten(),
            init_(nn.Linear(32 * 7 * 7, 512)),
            nn.ReLU()
        )

        if use_gru:
            self.gru = nn.GRUCell(512, 512)
            nn.init.orthogonal_(self.gru.weight_ih.data)
            nn.init.orthogonal_(self.gru.weight_hh.data)
            self.gru.bias_ih.data.fill_(0)
            self.gru.bias_hh.data.fill_(0)

        init_ = lambda m: init(m,
          nn.init.orthogonal_,
          lambda x: nn.init.constant_(x, 0))

        self.critic_linear = init_(nn.Linear(512, 1))

        self.train()

    @property
    def state_size(self):
        if hasattr(self, 'gru'):
            return 512
        else:
            return 1

    @property
    def output_size(self):
        return 512

    def forward(self, inputs, states, masks):
        x = self.main(inputs / 255.0)

        if hasattr(self, 'gru'):
            if inputs.size(0) == states.size(0):
                x = states = self.gru(x, states * masks)
            else:
                x = x.view(-1, states.size(0), x.size(1))
                masks = masks.view(-1, states.size(0), 1)
                outputs = []
                for i in range(x.size(0)):
                    hx = states = self.gru(x[i], states * masks[i])
                    outputs.append(hx)
                x = torch.cat(outputs, 0)

        return self.critic_linear(x), x, states


class MLPBase(nn.Module):
    def __init__(self, num_inputs):
        super(MLPBase, self).__init__()

        init_ = lambda m: init(m,
              init_normc_,
              lambda x: nn.init.constant_(x, 0))

        self.actor = nn.Sequential(
            init_(nn.Linear(num_inputs, 64)),
            nn.Tanh(),
            init_(nn.Linear(64, 64)),
            nn.Tanh()
        )

        self.critic = nn.Sequential(
            init_(nn.Linear(num_inputs, 64)),
            nn.Tanh(),
            init_(nn.Linear(64, 64)),
            nn.Tanh()
        )

        self.critic_linear = init_(nn.Linear(64, 1))

        self.train()

    @property
    def state_size(self):
        return 1

    @property
    def output_size(self):
        return 64

    def forward(self, inputs, states, masks):
        hidden_critic = self.critic(inputs)
        hidden_actor = self.actor(inputs)

        return self.critic_linear(hidden_critic), hidden_actor, states

In [99]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



# TODO: In order to make this code faster:
# 1) Implement _extract_patches as a single cuda kernel
# 2) Compute QR decomposition in a separate process
# 3) Actually make a general KFAC optimizer so it fits PyTorch


def _extract_patches(x, kernel_size, stride, padding):
    if padding[0] + padding[1] > 0:
        x = F.pad(x, (padding[1], padding[1], padding[0],
                      padding[0])).data  # Actually check dims
    x = x.unfold(2, kernel_size[0], stride[0])
    x = x.unfold(3, kernel_size[1], stride[1])
    x = x.transpose_(1, 2).transpose_(2, 3).contiguous()
    x = x.view(
        x.size(0), x.size(1), x.size(2),
        x.size(3) * x.size(4) * x.size(5))
    return x


def compute_cov_a(a, classname, layer_info, fast_cnn):
    batch_size = a.size(0)

    if classname == 'Conv2d':
        if fast_cnn:
            a = _extract_patches(a, *layer_info)
            a = a.view(a.size(0), -1, a.size(-1))
            a = a.mean(1)
        else:
            a = _extract_patches(a, *layer_info)
            a = a.view(-1, a.size(-1)).div_(a.size(1)).div_(a.size(2))
    elif classname == 'AddBias':
        is_cuda = a.is_cuda
        a = torch.ones(a.size(0), 1)
        if is_cuda:
            a = a.cuda()

    return a.t() @ (a / batch_size)


def compute_cov_g(g, classname, layer_info, fast_cnn):
    batch_size = g.size(0)

    if classname == 'Conv2d':
        if fast_cnn:
            g = g.view(g.size(0), g.size(1), -1)
            g = g.sum(-1)
        else:
            g = g.transpose(1, 2).transpose(2, 3).contiguous()
            g = g.view(-1, g.size(-1)).mul_(g.size(1)).mul_(g.size(2))
    elif classname == 'AddBias':
        g = g.view(g.size(0), g.size(1), -1)
        g = g.sum(-1)

    g_ = g * batch_size
    return g_.t() @ (g_ / g.size(0))


def update_running_stat(aa, m_aa, momentum):
    # Do the trick to keep aa unchanged and not create any additional tensors
    m_aa *= momentum / (1 - momentum)
    m_aa += aa
    m_aa *= (1 - momentum)


class SplitBias(nn.Module):
    def __init__(self, module):
        super(SplitBias, self).__init__()
        self.module = module
        self.add_bias = AddBias(module.bias.data)
        self.module.bias = None

    def forward(self, input):
        x = self.module(input)
        x = self.add_bias(x)
        return x


class KFACOptimizer(optim.Optimizer):
    def __init__(self,
                 model,
                 lr=0.25,
                 momentum=0.9,
                 stat_decay=0.99,
                 kl_clip=0.001,
                 damping=1e-2,
                 weight_decay=0,
                 fast_cnn=False,
                 Ts=1,
                 Tf=10):
        defaults = dict()

        def split_bias(module):
            for mname, child in module.named_children():
                if hasattr(child, 'bias') and child.bias is not None:
                    module._modules[mname] = SplitBias(child)
                else:
                    split_bias(child)

        split_bias(model)

        super(KFACOptimizer, self).__init__(model.parameters(), defaults)

        self.known_modules = {'Linear', 'Conv2d', 'AddBias'}

        self.modules = []
        self.grad_outputs = {}

        self.model = model
        self._prepare_model()

        self.steps = 0

        self.m_aa, self.m_gg = {}, {}
        self.Q_a, self.Q_g = {}, {}
        self.d_a, self.d_g = {}, {}

        self.momentum = momentum
        self.stat_decay = stat_decay

        self.lr = lr
        self.kl_clip = kl_clip
        self.damping = damping
        self.weight_decay = weight_decay

        self.fast_cnn = fast_cnn

        self.Ts = Ts
        self.Tf = Tf

        self.optim = optim.SGD(
            model.parameters(),
            lr=self.lr * (1 - self.momentum),
            momentum=self.momentum)

    def _save_input(self, module, input):
        if torch.is_grad_enabled() and self.steps % self.Ts == 0:
            classname = module.__class__.__name__
            layer_info = None
            if classname == 'Conv2d':
                layer_info = (module.kernel_size, module.stride,
                              module.padding)

            aa = compute_cov_a(input[0].data, classname, layer_info,
                               self.fast_cnn)

            # Initialize buffers
            if self.steps == 0:
                self.m_aa[module] = aa.clone()

            update_running_stat(aa, self.m_aa[module], self.stat_decay)

    def _save_grad_output(self, module, grad_input, grad_output):
        if self.acc_stats:
            classname = module.__class__.__name__
            layer_info = None
            if classname == 'Conv2d':
                layer_info = (module.kernel_size, module.stride,
                              module.padding)

            gg = compute_cov_g(grad_output[0].data, classname, layer_info,
                               self.fast_cnn)

            # Initialize buffers
            if self.steps == 0:
                self.m_gg[module] = gg.clone()

            update_running_stat(gg, self.m_gg[module], self.stat_decay)

    def _prepare_model(self):
        for module in self.model.modules():
            classname = module.__class__.__name__
            if classname in self.known_modules:
                assert not ((classname in ['Linear', 'Conv2d']) and module.bias is not None), \
                                    "You must have a bias as a separate layer"

                self.modules.append(module)
                module.register_forward_pre_hook(self._save_input)
                module.register_backward_hook(self._save_grad_output)

    def step(self):
        # Add weight decay
        if self.weight_decay > 0:
            for p in self.model.parameters():
                p.grad.data.add_(self.weight_decay, p.data)

        updates = {}
        for i, m in enumerate(self.modules):
            assert len(list(m.parameters())
                       ) == 1, "Can handle only one parameter at the moment"
            classname = m.__class__.__name__
            p = next(m.parameters())

            la = self.damping + self.weight_decay

            if self.steps % self.Tf == 0:
                # My asynchronous implementation exists, I will add it later.
                # Experimenting with different ways to this in PyTorch.
                self.d_a[m], self.Q_a[m] = torch.symeig(
                    self.m_aa[m], eigenvectors=True)
                self.d_g[m], self.Q_g[m] = torch.symeig(
                    self.m_gg[m], eigenvectors=True)

                self.d_a[m].mul_((self.d_a[m] > 1e-6).float())
                self.d_g[m].mul_((self.d_g[m] > 1e-6).float())

            if classname == 'Conv2d':
                p_grad_mat = p.grad.data.view(p.grad.data.size(0), -1)
            else:
                p_grad_mat = p.grad.data

            v1 = self.Q_g[m].t() @ p_grad_mat @ self.Q_a[m]
            v2 = v1 / (
                self.d_g[m].unsqueeze(1) * self.d_a[m].unsqueeze(0) + la)
            v = self.Q_g[m] @ v2 @ self.Q_a[m].t()

            v = v.view(p.grad.data.size())
            updates[p] = v

        vg_sum = 0
        for p in self.model.parameters():
            v = updates[p]
            vg_sum += (v * p.grad.data * self.lr * self.lr).sum()

        nu = min(1, math.sqrt(self.kl_clip / vg_sum))

        for p in self.model.parameters():
            v = updates[p]
            p.grad.data.copy_(v)
            p.grad.data.mul_(nu)

        self.optim.step()
        self.steps += 1

In [101]:
import torch
import torch.nn as nn
import torch.optim as optim



class A2C_ACKTR(object):
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):

        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(
                actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    def update(self, rollouts):
        obs_shape = rollouts.observations.size()[2:]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()

        values, action_log_probs, dist_entropy, states = self.actor_critic.evaluate_actions(
            rollouts.observations[:-1].view(-1, *obs_shape),
            rollouts.states[0].view(-1, self.actor_critic.state_size),
            rollouts.masks[:-1].view(-1, 1),
            rollouts.actions.view(-1, action_shape))

        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(advantages.detach() * action_log_probs).mean()

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Sampled fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        self.optimizer.zero_grad()
        (value_loss * self.value_loss_coef + action_loss -
         dist_entropy * self.entropy_coef).backward()

        if self.acktr == False:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()

        return value_loss.item(), action_loss.item(), dist_entropy.item()

In [105]:
import torch
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler


class RolloutStorage(object):
    def __init__(self, num_steps, num_processes, obs_shape, action_space, state_size):
        self.observations = torch.zeros(num_steps + 1, num_processes, *obs_shape)
        self.states = torch.zeros(num_steps + 1, num_processes, state_size)
        self.rewards = torch.zeros(num_steps, num_processes, 1)
        self.value_preds = torch.zeros(num_steps + 1, num_processes, 1)
        self.returns = torch.zeros(num_steps + 1, num_processes, 1)
        self.action_log_probs = torch.zeros(num_steps, num_processes, 1)
        if action_space.__class__.__name__ == 'Discrete':
            action_shape = 1
        else:
            action_shape = action_space.shape[0]
        self.actions = torch.zeros(num_steps, num_processes, action_shape)
        if action_space.__class__.__name__ == 'Discrete':
            self.actions = self.actions.long()
        self.masks = torch.ones(num_steps + 1, num_processes, 1)

        self.num_steps = num_steps
        self.step = 0

    def cuda(self):
        self.observations = self.observations.cuda()
        self.states = self.states.cuda()
        self.rewards = self.rewards.cuda()
        self.value_preds = self.value_preds.cuda()
        self.returns = self.returns.cuda()
        self.action_log_probs = self.action_log_probs.cuda()
        self.actions = self.actions.cuda()
        self.masks = self.masks.cuda()

    def insert(self, current_obs, state, action, action_log_prob, value_pred, reward, mask):
        self.observations[self.step + 1].copy_(current_obs)
        self.states[self.step + 1].copy_(state)
        self.actions[self.step].copy_(action)
        self.action_log_probs[self.step].copy_(action_log_prob)
        self.value_preds[self.step].copy_(value_pred)
        self.rewards[self.step].copy_(reward)
        self.masks[self.step + 1].copy_(mask)

        self.step = (self.step + 1) % self.num_steps

    def after_update(self):
        self.observations[0].copy_(self.observations[-1])
        self.states[0].copy_(self.states[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value, use_gae, gamma, tau):
        if use_gae:
            self.value_preds[-1] = next_value
            gae = 0
            for step in reversed(range(self.rewards.size(0))):
                delta = self.rewards[step] + gamma * self.value_preds[step + 1] * self.masks[step + 1] - self.value_preds[step]
                gae = delta + gamma * tau * self.masks[step + 1] * gae
                self.returns[step] = gae + self.value_preds[step]
        else:
            self.returns[-1] = next_value
            for step in reversed(range(self.rewards.size(0))):
                self.returns[step] = self.returns[step + 1] * \
                    gamma * self.masks[step + 1] + self.rewards[step]


    def feed_forward_generator(self, advantages, num_mini_batch):
        num_steps, num_processes = self.rewards.size()[0:2]
        batch_size = num_processes * num_steps
        mini_batch_size = batch_size // num_mini_batch
        sampler = BatchSampler(SubsetRandomSampler(range(batch_size)), mini_batch_size, drop_last=False)
        for indices in sampler:
            observations_batch = self.observations[:-1].view(-1,
                                        *self.observations.size()[2:])[indices]
            states_batch = self.states[:-1].view(-1, self.states.size(-1))[indices]
            actions_batch = self.actions.view(-1, self.actions.size(-1))[indices]
            return_batch = self.returns[:-1].view(-1, 1)[indices]
            masks_batch = self.masks[:-1].view(-1, 1)[indices]
            old_action_log_probs_batch = self.action_log_probs.view(-1, 1)[indices]
            adv_targ = advantages.view(-1, 1)[indices]

            yield observations_batch, states_batch, actions_batch, \
                return_batch, masks_batch, old_action_log_probs_batch, adv_targ

    def recurrent_generator(self, advantages, num_mini_batch):
        num_processes = self.rewards.size(1)
        num_envs_per_batch = num_processes // num_mini_batch
        perm = torch.randperm(num_processes)
        for start_ind in range(0, num_processes, num_envs_per_batch):
            observations_batch = []
            states_batch = []
            actions_batch = []
            return_batch = []
            masks_batch = []
            old_action_log_probs_batch = []
            adv_targ = []

            for offset in range(num_envs_per_batch):
                ind = perm[start_ind + offset]
                observations_batch.append(self.observations[:-1, ind])
                states_batch.append(self.states[0:1, ind])
                actions_batch.append(self.actions[:, ind])
                return_batch.append(self.returns[:-1, ind])
                masks_batch.append(self.masks[:-1, ind])
                old_action_log_probs_batch.append(self.action_log_probs[:, ind])
                adv_targ.append(advantages[:, ind])

            observations_batch = torch.cat(observations_batch, 0)
            states_batch = torch.cat(states_batch, 0)
            actions_batch = torch.cat(actions_batch, 0)
            return_batch = torch.cat(return_batch, 0)
            masks_batch = torch.cat(masks_batch, 0)
            old_action_log_probs_batch = torch.cat(old_action_log_probs_batch, 0)
            adv_targ = torch.cat(adv_targ, 0)

            yield observations_batch, states_batch, actions_batch, \
                return_batch, masks_batch, old_action_log_probs_batch, adv_targ

In [106]:
actor_critic = Policy((17*11,), env.action_space)

In [107]:
agent = A2C_ACKTR(actor_critic, 1,
                               0.5, lr=0.1,
                               eps=1e-1, alpha=0.9,
                               max_grad_norm=False)

In [108]:
rollouts = RolloutStorage(100, 1, (17*11,), env.action_space, actor_critic.state_size)

In [112]:
import time

start = time.time()
for j in range(100):
    for step in range(100):
        # Sample actions
        with torch.no_grad():
            value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step],
                    rollouts.states[step],
                    rollouts.masks[step])
        cpu_actions = action.squeeze(1).cpu().numpy()

        # Obser reward and next obs
        obs, reward, done, info = env.step(cpu_actions)
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
        episode_rewards += reward

        # If done then clean the history of observations.
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
        final_rewards *= masks
        final_rewards += (1 - masks) * episode_rewards
        episode_rewards *= masks

        if args.cuda:
            masks = masks.cuda()

        if current_obs.dim() == 4:
            current_obs *= masks.unsqueeze(2).unsqueeze(2)
        else:
            current_obs *= masks

#         update_current_obs(obs)
        rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks)

    with torch.no_grad():
        next_value = actor_critic.get_value(rollouts.observations[-1],
                                            rollouts.states[-1],
                                            rollouts.masks[-1]).detach()

    rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

    value_loss, action_loss, dist_entropy = agent.update(rollouts)

    rollouts.after_update()

    if j % args.save_interval == 0 and args.save_dir != "":
        save_path = os.path.join(args.save_dir, args.algo)
        try:
            os.makedirs(save_path)
        except OSError:
            pass

        # A really ugly way to save a model to CPU
        save_model = actor_critic
        if args.cuda:
            save_model = copy.deepcopy(actor_critic).cpu()

        save_model = [save_model,
                        hasattr(envs, 'ob_rms') and envs.ob_rms or None]

        torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

    if j % args.log_interval == 0:
        end = time.time()
        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
            format(j, total_num_steps,
                   int(total_num_steps / (end - start)),
                   final_rewards.mean(),
                   final_rewards.median(),
                   final_rewards.min(),
                   final_rewards.max(), dist_entropy,
                   value_loss, action_loss))

RecursionError: maximum recursion depth exceeded

In [123]:
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

In [130]:
import numpy as np
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.9 * np.ones(n_actions))
env = gym.make("MarioEnv-v0")
model = TD3('MlpPolicy', env, verbose=1, action_noise=action_noise, train_freq=1, learning_rate=0.1, policy_kwargs=dict(net_arch=[400, 300]), batch_size=1).learn(10000)

Using cpu device
Wrapping the env in a DummyVecEnv.
called reset




[Errno 17] File exists
/Users/jackboynton/Library/Application Support/Dolphin/Pipes/p3
buffering p3 input fifo...
[Errno 17] File exists
/Users/jackboynton/Library/Application Support/Dolphin/Pipes/p4
buffering p3 input fifo...
sent reload state
1.0
1.0
1.0
1.0
1.0
called reset
sent reload state
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
called reset
sent reload state
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
called reset
sent reload state
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
speed 0.0
1.0
called reset
sent reload state


BrokenPipeError: [Errno 32] Broken pipe

In [None]:
import struct, binascii

a = b'\xdf\x01'[:1]
struct.unpack("B", a)

In [15]:
a = b'\xdf'[:1]

In [16]:
a

b'\xdf'

In [17]:
b'\xdf\x01'[:1]


b'\xdf'

In [19]:
binascii.hexlify(a)

b'df'

TypeError: 'bytes' object cannot be interpreted as an integer

b'\xdf\xc8'

In [None]:
import gym
import torch
import torch.nn as nn
import numpy as np
from collections import deque
import random
from itertools import count
import torch.nn.functional as F
from tensorboardX import SummaryWriter
import mario_env

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class QNetwork(nn.Module):
    def __init__(self):
        super(QNetwork, self).__init__()

        self.fc1 = nn.Linear(187, 64*4)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.fc_value = nn.Linear(64*4, 256*4)
        self.fc_adv = nn.Linear(64*4, 256*4)

        self.value = nn.Linear(256*4, 1)
        self.adv = nn.Linear(1024, 17)

    def forward(self, state):
        state = torch.flatten(state)
        y = self.relu(self.fc1(state))
        
        value = self.relu(self.fc_value(y))
        adv = self.relu(self.fc_adv(y))

        value = self.relu(self.value(value))
        adv = self.relu(self.adv(adv))

        Q = self.sigmoid(value + adv)
        return Q

    def select_action(self, state):
        state = torch.flatten(state)
        with torch.no_grad():
            Q = self.forward(state)
            action_index = Q
        return action_index


class Memory(object):
    def __init__(self, memory_size: int) -> None:
        self.memory_size = memory_size
        self.buffer = deque(maxlen=self.memory_size)

    def add(self, experience) -> None:
        self.buffer.append(experience)

    def size(self):
        return len(self.buffer)

    def sample(self, batch_size: int, continuous: bool = True):
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)
        if continuous:
            rand = random.randint(0, len(self.buffer) - batch_size)
            return [self.buffer[i] for i in range(rand, rand + batch_size)]
        else:
            indexes = np.random.choice(np.arange(len(self.buffer)), size=batch_size, replace=False)
            return [self.buffer[i] for i in indexes]

    def clear(self):
        self.buffer.clear()


env = gym.make('MarioEnv-v0')
n_state = env.observation_space.shape[0]
n_action = 17

onlineQNetwork = QNetwork().to(device)
targetQNetwork = QNetwork().to(device)
targetQNetwork.load_state_dict(onlineQNetwork.state_dict())

optimizer = torch.optim.Adam(onlineQNetwork.parameters(), lr=1e-2)

GAMMA = 0.99
EXPLORE = 20000
INITIAL_EPSILON = 0.1
FINAL_EPSILON = 0.0001
REPLAY_MEMORY = 50000
BATCH = 16

UPDATE_STEPS = 4

memory_replay = Memory(REPLAY_MEMORY)

epsilon = INITIAL_EPSILON
learn_steps = 0
writer = SummaryWriter('logs/ddqn')
begin_learn = False

episode_reward = 0

# onlineQNetwork.load_state_dict(torch.load('ddqn-policy.para'))
for epoch in count():

    state = env.reset()
    episode_reward = 0
    for time_steps in range(200):
        p = random.random()
        if p < epsilon:
            action = [random.randint(0, 1) for _ in range(17)] 
        else:
            tensor_state = torch.FloatTensor(state).unsqueeze(0).to(device)
            action = onlineQNetwork.select_action(tensor_state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        memory_replay.add((state, next_state, action, reward, done))
        if memory_replay.size() > 5:
            if begin_learn is False:
                print('learn begin!')
                begin_learn = True
            learn_steps += 1
            if learn_steps % UPDATE_STEPS == 0:
                targetQNetwork.load_state_dict(onlineQNetwork.state_dict())
            batch = memory_replay.sample(5, False)
            
            batch_state, batch_next_state, batch_action, batch_reward, batch_done = zip(*batch)
            print(batch_action.shape)
            batch_state = torch.FloatTensor(batch_state).to(device)
            batch_next_state = torch.FloatTensor(batch_next_state).to(device)
            batch_action = torch.FloatTensor(batch_action).unsqueeze(1).to(device)
            batch_reward = torch.FloatTensor(batch_reward).unsqueeze(1).to(device)
            batch_done = torch.FloatTensor(batch_done).unsqueeze(1).to(device)

            with torch.no_grad():
                onlineQ_next = onlineQNetwork(batch_next_state)
                targetQ_next = targetQNetwork(batch_next_state)
                online_max_action = torch.argmax(onlineQ_next, dim=1, keepdim=True)
                y = batch_reward + (1 - batch_done) * GAMMA * targetQ_next.gather(1, online_max_action.long())

            loss = F.mse_loss(onlineQNetwork(batch_state).gather(1, batch_action.long()), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            writer.add_scalar('loss', loss.item(), global_step=learn_steps)

            if epsilon > FINAL_EPSILON:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

                
        if done:
            break
        state = next_state

    writer.add_scalar('episode reward', episode_reward, global_step=epoch)
    if epoch % 10 == 0:
        torch.save(onlineQNetwork.state_dict(), 'ddqn-policy.para')
        print('Ep {}\tMoving average score: {:.2f}\t'.format(epoch, episode_reward))




called reset
[Errno 17] File exists
/Users/jackboynton/Library/Application Support/Dolphin/Pipes/p3
buffering p3 input fifo...
[Errno 17] File exists
/Users/jackboynton/Library/Application Support/Dolphin/Pipes/p4
buffering p3 input fifo...
sent reload state
called reset
[Errno 17] File exists
/Users/jackboynton/Library/Application Support/Dolphin/Pipes/p3
buffering p3 input fifo...


In [2]:
import pickle
with open("centertraj.pkl", "rb") as f:
    d = pickle.load(f)

In [3]:
trajx = [x[-1] for x in d if x[1] == "xpos"]
trajy = [x[-1] for x in d if x[1] == "ypos"]
trajz = [x[-1] for x in d if x[1] == "zpos"]

In [5]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

x_scal, y_scal, z_scal = MinMaxScaler(), MinMaxScaler(), MinMaxScaler()

x_scal = x_scal.fit(np.array(trajx).reshape(-1, 1))
y_scal = y_scal.fit(np.array(trajy).reshape(-1, 1))
z_scal = z_scal.fit(np.array(trajz).reshape(-1, 1))



In [6]:
import joblib
joblib.dump(x_scal, 'xscaler.gz')
joblib.dump(y_scal, 'yscaler.gz')
joblib.dump(z_scal, 'zscaler.gz')

['zscaler.gz']

In [None]:


my_scaler = joblib.load('scaler.gz')

In [21]:
m = min([len(trajx), len(trajy), len(trajz)])

In [23]:
trajx = trajx[:m]
trajy = trajy[:m]
trajz = trajz[:m]

In [24]:
import numpy as np

In [25]:
import pickle
with open("player_data.pkl", "rb") as f:
    da = pickle.load(f)

In [27]:
trajxx = [x[-1] for x in da if x[1] == "xpos"]
trajyy = [x[-1] for x in da if x[1] == "ypos"]
trajzz = [x[-1] for x in da if x[1] == "zpos"]

In [28]:
ma = min([len(trajxx), len(trajyy), len(trajzz)])

In [29]:
trajxx = trajxx[:m]
trajyy = trajyy[:m]
trajzz = trajzz[:m]

In [43]:


def dirr(trajx, trajy, trajz, trajxx, trajyy, trajzz, dd = 1):
    
for (p, pp), (s, ss) in zip(list(zip(list(zip(trajxx[0::dd], trajyy[0::dd], trajzz[0::dd])), list(zip(trajxx[dd::dd], trajyy[dd::dd], trajzz[dd::dd])))), list(zip(list(zip(trajx[0::dd], trajy[0::dd], trajz[0::dd])), list(zip(trajx[dd::dd], trajy[dd::dd], trajz[dd::dd]))))):
    v_curr = (pp[0]-p[0], pp[1] - p[1], pp[2]-p[2])
    v_expe = (ss[0]-s[0], ss[1] - s[1], ss[2]-s[2])
    print(np.array(v_curr) @ np.array(v_expe))
    
    
    
    

1133.6501794047654
2384.7161886803806
1254.0401016287506
1305.6652372758836
2746.7744315452874
1415.5688238479197
1444.6295337919146
2947.988361954689
1504.133692028001
3100.858219832182
1597.962252125144
3261.185566753149
1663.827190425247
1697.8560490552336
3500.256709381938
1803.7970437668264
1840.6180363409221
3756.43410333246
1916.4304004833102
3951.093328502029
2036.2199012767524
2077.872013270855
4240.5918735247105
2163.8300720751286
4466.059766834602
2309.5637998729944
2360.790909178555
4822.999005060643
2461.158890692517
5074.076165555045
2615.0951406285167
2668.53078421019
5446.169399030507
2778.6906266622245
5728.572066754103
2952.2136333324015
3012.4958276748657
6148.048194356263
3136.794318906963
6466.88978137821
3332.754770057276
3400.8153281714767
6940.276356194168
3541.002830181271
7300.2450863569975
3762.1952761057764
3838.985590785742
7834.672211598605
3997.2414968572557
8241.065694134682
4247.088305406272
4333.695003597066
8844.340251088142
4512.512576118112
9303.066

12724.783007986844
6353.951871305704
6345.313044857234
12668.454865574837
6322.143656589091
12598.49486348033
6268.224649384618
6252.483310341835
12494.740772318095
6242.504060011357
12470.638015963137
6236.623551201075
6249.857323095202
12536.442918691784
6282.435304250568
12569.4258036986
6284.955428905785
6285.797432482243
12570.449240002781
6280.881480157375
12545.823713459074
6264.950204566121
6259.808768514544
12510.179592132568
6250.26343492046
12483.718374572694
6233.1251868754625
6227.90270633623
12445.207946356386
6217.3014548793435
6211.9988740980625
12408.2483246997
6196.171435363591
12381.740639947355
6185.575545165688
6180.358747866005
12344.815892845392
6164.457654468715
12318.454105556011
6153.90398453176
6148.582479085773
12281.398601066321
6132.786041624844
12255.215173587203
6122.420324027538
6117.119542479515
12217.937867626548
6100.8400012254715
12190.388691157103
6090.113128494471
6085.663834873587
12157.003656566143
6069.571812108159
12127.461507290602
6058.42165

In [35]:
list(zip(list(zip(trajxx[0::5], trajyy[0::5], trajzz[0::5])), list(zip(trajxx[5::5], trajyy[5::5], trajzz[5::5]))))

[((-19071.37890625, 547.4615478515625, 21660.38671875),
  (-19080.84375, 545.1007690429688, 21759.927734375)),
 ((-19080.84375, 545.1007690429688, 21759.927734375),
  (-19091.50390625, 542.4258422851562, 21872.30078125)),
 ((-19091.50390625, 542.4258422851562, 21872.30078125),
  (-19103.603515625, 539.3914184570312, 21999.1484375)),
 ((-19103.603515625, 539.3914184570312, 21999.1484375),
  (-19117.33984375, 536.017333984375, 22142.3359375)),
 ((-19117.33984375, 536.017333984375, 22142.3359375),
  (-19132.21484375, 532.2312622070312, 22304.0390625)),
 ((-19132.21484375, 532.2312622070312, 22304.0390625),
  (-19133.921875, 528.1104125976562, 22487.400390625)),
 ((-19133.921875, 528.1104125976562, 22487.400390625),
  (-19132.4765625, 525.4210205078125, 22694.46875)),
 ((-19132.4765625, 525.4210205078125, 22694.46875),
  (-19130.849609375, 522.8514404296875, 22928.224609375)),
 ((-19130.849609375, 522.8514404296875, 22928.224609375),
  (-19129.001953125, 519.9747314453125, 23192.10546875))

In [9]:
import mario_env
env = gym.make("MarioEnv-v0")

In [11]:
env.reset()

try:
    while 1:
        env.step([0] * 17)
except KeyboardInterrupt:
    env.save_traj()
    

called reset
sent reload state
speed 46918.39735722964
221465.2679867281 False


TypeError: 'int' object is not subscriptable

In [26]:
!ls -oh

total 10968
-rw-r--r--   1 jackboynton   1.8M Apr 29 12:25 Untitled.ipynb
-rw-r--r--   1 jackboynton   372K May  2 15:53 Untitled1.ipynb
-rw-r--r--   1 jackboynton    13K May  1 18:16 Untitled2.ipynb
drwxr-xr-x  12 jackboynton   384B Apr 29 12:46 [1m[36m__pycache__[m[m
-rw-r--r--   1 jackboynton   3.6K May  1 21:35 buffers.py
-rw-r--r--   1 jackboynton   390K Apr 28 22:46 centertraj.pkl
-rw-r--r--   1 jackboynton   4.8K Apr 28 22:47 controller_data.pkl
drwxr-xr-x  11 jackboynton   352B Apr 22 16:23 [1m[36mdata[m[m
-rw-r--r--   1 jackboynton   1.3K May  1 21:35 data_structures.py
-rw-r--r--   1 jackboynton   2.3M May  1 18:35 ddqn-policy.para
-rw-r--r--   1 jackboynton   293B Apr 22 20:00 deb.py
drwxr-xr-x  26 jackboynton   832B May  1 14:45 [1m[36mdolphin[m[m
-rw-r--r--   1 jackboynton   402B Apr 29 12:32 gen_expert_traj.py
-rw-r--r--   1 jackboynton    85K May  2 15:49 left_traj.npy
-rw-r--r--   1 jackboynton   5.4K Apr 28 21:48 main.py
drwxr-xr-x   7 jackb

In [50]:
import numpy as np
a = np.load("trymax.npy")

In [51]:
a

array([['zpos', '25525.787109375'],
       ['ypos', '516.6240234375'],
       ['xpos', '-18017.79296875'],
       ...,
       ['zpos', '25749.92578125'],
       ['ypos', '497.08209228515625'],
       ['xpos', '-20115.0']], dtype='<U19')

In [60]:
xpos = [float(x[1]) for x in a if x[0] == "xpos"]
ypos = [float(x[1]) for x in a if x[0] == "ypos"]
zpos = [float(x[1]) for x in a if x[0] == "zpos"]

In [61]:
import matplotlib.pylab as plt
import matplotlib
%matplotlib notebook
import pandas as pd
df = pd.DataFrame({'col_x':xpos[:m], 'col_y':ypos[:m], 'col_z':zpos[:m]
})
m = min([len(x) for x in [xpos, ypos, zpos]])

In [66]:
import plotly.express as px

fig = px.scatter_3d(df, x='col_z', y='col_x', z='col_y',
                    title="3D Scatter Plot")
fig.show()

In [67]:
print(min(xpos), max(xpos))
print(min(ypos), max(ypos))
print(min(zpos), max(zpos))

-27077.66796875 22791.869140625
496.66680908203125 4287.50537109375
-14364.5126953125 54478.3515625


In [68]:
xpos = np.array(xpos)
ypos = np.array(ypos)
zpos = np.array(zpos)

In [73]:



xpos = (xpos - xpos.min()) / (xpos.max() - xpos.min())

1.0