In [None]:
import numpy as np
import torch

def lambdaReturns(rewards, values, gamma=0.99, lambda_=0.95):
    n = len(rewards)
    returns = torch.zeros(n)
    bootstrap = values[-1]
    for t in reversed(range(n)):
        returns[t] = rewards[t] + gamma*((1 - lambda_)*values[t + 1] + lambda_*bootstrap)
        bootstrap = returns[t]
    return returns

rewards = [1, 0, 0, 1, 0, 2]
values = [0.5, 0.6, 0.4, 0.7, 0.9, 0.8, 0.0]

gamma = 0.99
lambda_ = 0.95
lmbdaReturns = lambdaReturns(rewards, values, gamma, lambda_)

print("Rewards:", rewards)
print("Values:", values)
print("Lambda-returns:", lmbdaReturns)

Rewards: [1, 0, 0, 1, 0, 2]
Values: [0.5, 0.6, 0.4, 0.7, 0.9, 0.8, 0.0]
Lambda-returns: tensor([3.4506, 2.5741, 2.7159, 2.8509, 1.9206, 2.0000])


In [11]:
import torch
x = torch.randn((4, 255, 3))
torch.stack((x, x), dim=1).shape

torch.Size([4, 2, 255, 3])

In [25]:
rewards = torch.tensor([1, 0, 0, 1, 0, 2])
values = torch.tensor([0.5, 0.6, 0.4, 0.7, 0.9, 0.8])
rewards = rewards[:-1]
next_values = values[1:]
last = next_values[-1]
horizonLength = 6
gamma = 0.99
inputs = rewards + gamma * next_values * (1 - lambda_)

outputs = []
# single step
for index in reversed(range(horizonLength - 1)):
    last = inputs[index] + gamma * lambda_ * last
    outputs.append(last)
returns = torch.stack(list(reversed(outputs)))
print("Lambda-returns:", returns)

Lambda-returns: tensor([2.5676, 1.6352, 1.7176, 1.7894, 0.7920])


In [24]:
import numpy as np
import torch

def lambdaReturns(rewards, values, gamma=0.99, lambda_=0.95):
    horizonLength = len(rewards)
    bootstrap = values[-1]
    returns = torch.zeros(horizonLength - 1)
    for t in reversed(range(horizonLength - 1)):
        returns[t] = rewards[t] + gamma*((1 - lambda_)*values[t + 1] + lambda_*bootstrap)
        bootstrap = returns[t]
    return returns

rewards = [1, 0, 0, 1, 0, 2]
values = [0.5, 0.6, 0.4, 0.7, 0.9, 0.8]

gamma = 0.99
lambda_ = 0.95
lmbdaReturns = lambdaReturns(rewards, values, gamma, lambda_)

print("Rewards:", rewards)
print("Values:", values)
print("Lambda-returns:", lmbdaReturns)

Rewards: [1, 0, 0, 1, 0, 2]
Values: [0.5, 0.6, 0.4, 0.7, 0.9, 0.8]
Lambda-returns: tensor([2.5676, 1.6352, 1.7176, 1.7894, 0.7920])


In [20]:
import torch

def lambdaReturns(rewards, nextValues, lambda_=0.95, gamma=0.99):
    td_targets = rewards + gamma * nextValues
    returns = torch.zeros_like(rewards)
    bootstrap = td_targets[-1]  # Initialize with last TD target
    for t in reversed(range(len(rewards))):
        returns[t] = (1 - lambda_) * td_targets[t] + lambda_ * bootstrap
        bootstrap = rewards[t] + gamma * returns[t]
    return returns

# Example usage
rewards = torch.tensor([1., 0., 0., 1., 0., 2.])
values = torch.tensor([0.5, 0.6, 0.4, 0.7, 0.9, 0.8])

# Use all but last reward, and all values
lambda_returns = lambdaReturns(rewards[:-1], values[1:])
print("Lambda returns:", lambda_returns)

Lambda returns: tensor([1.6676, 1.6884, 1.7741, 0.8394, 0.7920])


In [23]:
rewards = torch.tensor([1., 0., 0., 1., 0., 2.])
values = torch.tensor([0.5, 0.6, 0.4, 0.7, 0.9, 0.8])
horizonLength = len(rewards)
returns = torch.zeros_like(rewards[:-1])

bootstrap = values[-1]
for i in reversed(range(len(returns))):
    returns[i] = rewards[i] + gamma * ((1 - lambda_)*values[i] + lambda_*bootstrap)
    bootstrap = returns[i]

print(f"returns: {returns}")

returns: tensor([2.5545, 1.6265, 1.6978, 1.7842, 0.7970])


In [None]:
def lambdaValues(rewards, values, gamma=0.997, lambda_=0.95):
    # 1 less reward than values, last value is the bootstrap
    # I GET IT NOW, USUALLY THEY HAVE 1 FEWER RETURN THAN VALUES BECAUSE THE BOOTSTRAP IS USELESS FOR LOSS CALC AS IT WOULD BE DELTA BETWEEN V_T and V_T
    returns = torch.zeros_like(values)
    bootstrap = values[-1]
    returns[-1] = bootstrap
    for i in reversed(range(len(rewards))):
        returns[i] = rewards[i] + gamma * ((1 - lambda_)*values[i] + lambda_*bootstrap)
        bootstrap = returns[i]
    return returns

rewards = torch.tensor([1., 0., 0., 1., 0.,])
values = torch.tensor([0.5, 0.6, 0.4, 0.7, 0.9, 0.8])
lambdaReturns = lambdaValues(rewards, values)
lambdaReturns

tensor([2.5964, 1.6591, 1.7201, 1.7951, 0.8026, 0.8000])

In [31]:
def lambdaValues(rewards, values, gamma=0.997, lambda_=0.95):
    # 1 less reward than values, last value is the bootstrap
    returns = torch.zeros_like(values)
    bootstrap = values[-1]
    print(f"Initial bootstrap (V_T): {bootstrap.item()}")
    returns[-1] = bootstrap
    print(f"Initial returns[-1] set to bootstrap: {returns[-1].item()}")

    for i in reversed(range(len(rewards))):
        print(f"\nStep i={i}:")
        print(f"rewards[i]: {rewards[i]}, values[i]: {values[i]}, bootstrap: {bootstrap}")
        update = rewards[i] + gamma * ((1 - lambda_) * values[i] + lambda_ * bootstrap)
        print(f"  Update: {update.item()}")
        returns[i] = update
        bootstrap = returns[i]
        print(f"  Returns[i]: {returns[i].item()}")
        print(f"  Updated bootstrap: {bootstrap.item()}")

rewards = torch.tensor([1., 0., 0., 1., 0.,])
values = torch.tensor([0.5, 0.6, 0.4, 0.7, 0.9, 0.8])
lambdaReturns = lambdaValues(rewards, values)
lambdaReturns

Initial bootstrap (V_T): 0.800000011920929
Initial returns[-1] set to bootstrap: 0.800000011920929

Step i=4:
rewards[i]: 0.0, values[i]: 0.8999999761581421, bootstrap: 0.800000011920929
  Update: 0.8025850057601929
  Returns[i]: 0.8025850057601929
  Updated bootstrap: 0.8025850057601929

Step i=3:
rewards[i]: 1.0, values[i]: 0.699999988079071, bootstrap: 0.8025850057601929
  Update: 1.7950633764266968
  Returns[i]: 1.7950633764266968
  Updated bootstrap: 1.7950633764266968

Step i=2:
rewards[i]: 0.0, values[i]: 0.4000000059604645, bootstrap: 1.7950633764266968
  Update: 1.7201342582702637
  Returns[i]: 1.7201342582702637
  Updated bootstrap: 1.7201342582702637

Step i=1:
rewards[i]: 0.0, values[i]: 0.6000000238418579, bootstrap: 1.7201342582702637
  Update: 1.659135103225708
  Returns[i]: 1.659135103225708
  Updated bootstrap: 1.659135103225708

Step i=0:
rewards[i]: 1.0, values[i]: 0.5, bootstrap: 1.659135103225708
  Update: 2.596374750137329
  Returns[i]: 2.596374750137329
  Updated

In [8]:
import torch

def compute_lambda_values(
    rewards,
    values,
    gamma = 0.997,
    lmbda = 0.95,
):
    vals = [values[-1:]]
    interm = rewards + gamma * values * (1 - lmbda)
    for t in reversed(range(len(values))):
        vals.append(interm[t] + gamma * lmbda * vals[-1])
    ret = torch.cat(list(reversed(vals))[:-1])
    return ret

rewards = torch.tensor([1., 0., 0., 1., 0.,])
values = torch.tensor([0.6, 0.4, 0.7, 0.9, 0.8])

compute_lambda_values(rewards, values)

tensor([2.6098, 1.6680, 1.7401, 1.8003, 0.7976])

In [9]:
def lambdaValues(rewards, values, gamma=0.997, lambda_=0.95):
    returns = torch.zeros_like(values)
    bootstrap = values[-1]
    returns[-1] = bootstrap
    for i in reversed(range(len(rewards))):
        returns[i] = rewards[i] + gamma * ((1 - lambda_)*values[i] + lambda_*bootstrap)
        bootstrap = returns[i]
    return returns

rewards = torch.tensor([1., 0., 0., 1., 0.,])
values = torch.tensor([0.5, 0.6, 0.4, 0.7, 0.9, 0.8])

lambdaValues(rewards, values)

tensor([2.5964, 1.6591, 1.7201, 1.7951, 0.8026, 0.8000])

In [None]:
import torch

class RewardEMA:
    """running mean and std"""

    def __init__(self, device, alpha=1e-2):
        self.device = device
        self.alpha = alpha
        self.range = torch.tensor([0.05, 0.95], device=device)

    def __call__(self, x, ema_vals):
        flat_x = torch.flatten(x.detach())
        x_quantile = torch.quantile(input=flat_x, q=self.range)
        # this should be in-place operation
        ema_vals[:] = self.alpha * x_quantile + (1 - self.alpha) * ema_vals
        scale = torch.clip(ema_vals[1] - ema_vals[0], min=1.0)
        offset = ema_vals[0]
        return offset.detach(), scale.detach()

# Test the RewardEMA class
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize RewardEMA instance
reward_ema = RewardEMA(device=device, alpha=0.1)

# Initialize EMA values (e.g., starting with some range)
ema_vals = torch.tensor([0.0, 1.0], device=device)

# Simulate some reward data
rewards = [
    torch.randn(10, device=device) * 2 + 5,  # Batch 1
    torch.randn(10, device=device) * 1 + 7,  # Batch 2
    torch.randn(10, device=device) * 3 + 4,  # Batch 3
]

# Test RewardEMA on each batch
for i, batch in enumerate(rewards, 1):
    offset, scale = reward_ema(batch, ema_vals)
    print(f"Batch {i}:")
    print(f"  Rewards: {batch}")
    print(f"  Updated EMA Values: {ema_vals}")
    print(f"  Offset: {offset}, Scale: {scale}")
    print()


Batch 1:
  Rewards: tensor([5.6989, 3.9511, 4.8963, 7.0852, 3.7286, 8.6404, 4.8623, 8.1160, 5.1036,
        0.9738], device='cuda:0')
  Updated EMA Values: tensor([0.2213, 1.7404], device='cuda:0')
  Offset: 0.22134535014629364, Scale: 1.5190964937210083

Batch 2:
  Rewards: tensor([5.6851, 5.4815, 6.5169, 5.9742, 5.8991, 6.4466, 5.5185, 5.9447, 8.4563,
        7.2663], device='cuda:0')
  Updated EMA Values: tensor([0.7490, 2.3585], device='cuda:0')
  Offset: 0.7490271925926208, Scale: 1.6094470024108887

Batch 3:
  Rewards: tensor([4.8100, 1.1124, 8.8829, 4.9382, 0.5479, 1.6851, 7.1290, 1.7753, 5.9428,
        6.0926], device='cuda:0')
  Updated EMA Values: tensor([0.7543, 2.9320], device='cuda:0')
  Offset: 0.7543126344680786, Scale: 2.1776814460754395



In [7]:
import torch
torch.set_printoptions(sci_mode=False)

# Your log-probabilities tensor
logprobs = torch.tensor([44.7902, 33.0753, 45.2147, 45.6813, 37.9168, 42.2072, 
                         44.5198, 39.6648, 43.7184, 33.4006, 39.2761, 43.4411, 
                         44.0044, 39.6464, 37.1331])

# Convert log-probabilities to probabilities using softmax
probabilities = torch.exp(logprobs)

# Print probabilities
print(probabilities)

tensor([28322812601091227648., 231429933891584., 43300628275493076992., 69045759010565783552., 29312823230201856., 2139703778599763968.,
        21612387583658033152., 168346156609306624., 9697492445868064768., 320402916114432., 114128362070343680., 7349037863519911936.,
        12908250017415823360., 165277196317884416., 13387510772137984.])
