In [1]:
from NeuralShield.Utils import loader
import numpy as np
import stable_baselines as sb
import tensorflow as tf

# Record the library versions this notebook was executed against.
print(f"stable_baseline version: {sb.__version__}")
print(f"tensorflow version: {tf.__version__}")

stable_baseline version: 2.9.0a0
tensorflow version: 1.15.0


## Load pretrained policy
The pretrained policy comes from the [model zoo](https://github.com/araffin/rl-baselines-zoo), whose agents are trained with [stable-baselines](https://stable-baselines.readthedocs.io/en/master/).


In [2]:
# Ask the project loader for the pretrained agent, identified by environment id
# and algorithm name.
env_id, algo = "AntBulletEnv-v0", "ppo2"
pi = loader.get_original_policy(env_id, algo)

# Bare expression so the notebook displays the loaded object.
pi

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fb128202588>

## Get parameters and backward propagation

In [14]:
# Grab the policy's parameters as a mapping keyed by variable name, then list
# the names so the layer layout can be read off.
params = pi.get_parameters()
print(list(params))

['model/pi_fc0/w:0', 'model/pi_fc0/b:0', 'model/vf_fc0/w:0', 'model/vf_fc0/b:0', 'model/vf/w:0', 'model/vf/b:0', 'model/pi/w:0', 'model/pi/b:0', 'model/pi/logstd:0', 'model/q/w:0', 'model/q/b:0']


In [18]:
# Export the policy's TensorFlow graph to ./cp_graph and run one prediction on an
# all-zero observation (28 entries, the size this notebook uses for the policy).
# The writer is used as a context manager so that it is closed even when
# `predict` raises; previously an exception would skip `writer.close()`.
with tf.summary.FileWriter("./cp_graph", pi.sess.graph) as writer:
    pi.predict(np.zeros(28, dtype=np.float32))

### The output distribution is a diagonal Gaussian

In [4]:
# Display the probability-distribution object attached to the policy's acting
# model, to see which distribution class it uses.
pi.act_model.proba_distribution

<stable_baselines.common.distributions.DiagGaussianProbabilityDistribution at 0x7fb05dbdf208>

### Actor neural network

In [5]:
import torch as th
from torch import nn

In [19]:
class ActorNetwork(nn.Module):
    """PyTorch re-implementation of a fully connected actor with tanh hidden units.

    The network is ``Linear -> Tanh -> ... -> Linear``: every layer except the
    last is followed by ``nn.Tanh``; the final layer's output is returned as is.

    Args:
        weights: Sequence of 2-D arrays, one per layer, each laid out as
            ``(in_features, out_features)``. They are transposed on load because
            ``nn.Linear`` stores its weight as ``(out_features, in_features)``.
        biases: Sequence of bias arrays, one per layer, in the same order as
            ``weights``.
        logstd: Optional log-standard-deviation values. Stored unchanged on
            ``self.logstd``; ``forward`` does not use them.
        action_scale: Stored unchanged on ``self.action_scale``; ``forward``
            does not apply it.

    Raises:
        ValueError: If no layers are given, or if ``weights`` and ``biases``
            have different lengths.
    """

    def __init__(self, weights, biases, logstd=None, action_scale=1.0):
        super(ActorNetwork, self).__init__()

        weights = list(weights)
        biases = list(biases)
        if not weights:
            raise ValueError("ActorNetwork requires at least one layer of weights")
        if len(weights) != len(biases):
            # zip() would silently drop the unmatched layers, so fail loudly.
            raise ValueError(
                "weights and biases must have the same length, got "
                f"{len(weights)} and {len(biases)}"
            )

        layers = []
        for index, (w, b) in enumerate(zip(weights, biases)):
            if index > 0:
                # Activation sits between consecutive linear layers only, so the
                # last layer stays linear.
                layers.append(nn.Tanh())
            in_features, out_features = w.shape
            layer = nn.Linear(in_features, out_features)
            # Overwrite the random initialisation with the supplied values,
            # keeping the dtype nn.Linear chose for its parameters.
            layer.weight.data = th.tensor(w.T, dtype=layer.weight.data.dtype)
            layer.bias.data = th.tensor(b, dtype=layer.bias.data.dtype)
            layers.append(layer)

        self.actor = nn.Sequential(*layers)
        self.logstd = logstd
        # Previously this argument was accepted and then discarded.
        self.action_scale = action_scale

    def forward(self, x):
        """Return the output of the final linear layer for input ``x``."""
        return self.actor(x)

In [21]:
# Collect the policy-head layers from `params` and rebuild them as a PyTorch
# ActorNetwork. Layers are appended in the iteration order of `params`, so that
# order determines the order of the layers in the rebuilt network.
weights = []
biases = []
logstd = None

for k in params:
    scopes = k.split("/")
    if len(scopes) < 2:
        # No enclosing scope to inspect; cannot be a policy-head variable.
        continue
    if "pi" not in scopes[-2]:
        continue
    # Compare the variable's base name exactly ("w:0" -> "w"). Substring tests
    # such as `"w" in name` would also match unrelated names containing that letter.
    var_name = scopes[-1].split(":")[0]
    if var_name == "w":
        weights.append(params[k])
    elif var_name == "b":
        biases.append(params[k])
    elif var_name == "logstd":
        logstd = params[k]

# Hand the collected logstd to the network instead of dropping it.
actor_net = ActorNetwork(weights, biases, logstd=logstd)

# Evaluate on an all-zero observation whose length is the input width of the
# first layer, rather than a hard-coded size.
actor_net(th.from_numpy(np.zeros(weights[0].shape[0], dtype=np.float32)))

tensor([ 0.5848,  0.7180, -0.0198, -0.1675, -0.2798, -0.3186, -0.4818,  0.7939],
       grad_fn=<AddBackward0>)

In [8]:
# Reference output from the original stable-baselines policy on the same
# all-zero observation, for comparison with the ActorNetwork result.
# deterministic=True presumably selects the distribution mean rather than a
# sample; confirm against the stable-baselines docs.
pi.predict(np.zeros(28, dtype=np.float32), deterministic=True)

(array([ 0.5847716 ,  0.7179805 , -0.01980111, -0.16752051, -0.2798085 ,
        -0.31864387, -0.4818455 ,  0.7939162 ], dtype=float32), None)