In [None]:
'''PPO reward scaling: streaming computation of the scaling factor'''
'''https://arxiv.org/pdf/2005.12729.pdf'''

'''
R = 0
RS = []
while not done:
    s, r, ... = env.step(action)
    R = R * GAMMA + r
    RS.append(R)

    scaled_reward = r / np.std(RS)
'''

In [1]:
import numpy as np


class RewardScaling:
    """Streaming estimate of the standard deviation of discounted returns.

    Each reward passed to ``update_statistics`` is folded into a running
    discounted return ``R = R * gamma + reward``. Running means of ``R`` and
    ``R ** 2`` are maintained so that the population standard deviation of
    every ``R`` seen so far can be read in constant time from
    ``get_norm_factor``, without keeping the history of returns.
    """

    def __init__(self, gamma):
        """Store the discount factor ``gamma`` and zero all statistics."""
        self.gamma = gamma
        self.init_statistics()

    def init_statistics(self):
        """Reset the sample count, the discounted return and both running means."""
        self.iter = 0
        self.R = 0
        self.E_x_squared = 0
        self.E_x = 0

    def update_mean(self, E_x, x):
        """Return the running mean ``E_x`` after folding in the new sample ``x``.

        ``self.iter`` is used as the sample count *including* ``x``, so it
        must already have been incremented (i.e. be >= 1) before this call.
        """
        E_x *= (1 - (1 / self.iter))
        E_x += (x / self.iter)
        return E_x

    def update_statistics(self, reward):
        """Fold one ``reward`` into the discounted return and its running moments."""
        self.iter += 1

        # tracking E(x), E(x ** 2) for V(x)
        # V(x) = E(x ** 2) - E(x) ** 2

        # R = R * gamma + r
        self.R *= self.gamma
        self.R += reward

        # E(x ** 2)
        self.E_x_squared = self.update_mean(self.E_x_squared, self.R ** 2)

        # E(x)
        self.E_x = self.update_mean(self.E_x, self.R)

    # for scaling rewards: scaled_reward = reward / get_norm_factor()
    def get_norm_factor(self):
        """Return the population standard deviation of the discounted returns.

        The result is 0 before any update and after a single update; callers
        dividing by it have to handle that case themselves.
        """
        # V(x) = E(x ** 2) - E(x) ** 2
        V_x = self.E_x_squared - self.E_x ** 2

        # The difference of two nearly equal running means can come out
        # slightly negative through floating-point round-off; clamp it so the
        # square root yields 0 instead of NaN.
        V_x = np.maximum(V_x, 0.0)

        # SD(x) = V(x) ** 0.5
        return np.sqrt(V_x)

In [2]:
# Benchmark configuration: how many synthetic rewards to feed in, the discount
# factor, and the streaming estimator that the next cell exercises.
max_count, gamma = 10000, 0.95
rs = RewardScaling(gamma=gamma)

In [3]:
from time import time

# Start from clean statistics so that re-running this cell reproduces the same
# numbers instead of continuing from whatever an earlier run left in `rs`.
rs.init_statistics()

# Time the streaming estimator: one constant-time update and one read of the
# normalisation factor per synthetic reward 0, 1, ..., max_count - 1.
start = time()
list_std = []
for reward in range(0, max_count):
    rs.update_statistics(reward)
    list_std.append(rs.get_norm_factor())

end = time()
print('time:', end - start, 'value:', list_std[-1])

time: 0.02803778648376465 value: 57733.716668208566


In [4]:
# Baseline for comparison: keep every discounted return and recompute np.std
# over the whole history at each step, as in the pseudo-code at the top.
start = time()

R = 0
RS = []
list_std = []
for reward in range(max_count):
    # R = R * gamma + r
    R *= gamma
    R += reward
    RS.append(R)
    list_std.append(np.std(RS))

end = time()
print('time:', end - start, 'value:', list_std[-1])

time: 2.606989622116089 value: 57733.7166682084
