In [1]:
import retro
import torch
from torch import nn
from copy import deepcopy

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Interesting sources:
- https://github.com/liuruoze/HierNet-SC2/blob/396646056dbe5f8f20e43e0ef35e59db09e907c0/algo/ppo.py#L180 - Ruo Ze Liu's PPO2 implementation (in Tensorflow)
- https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/ppo2/ppo2.py#L185 - Original code from Stable Balines
- https://medium.com/analytics-vidhya/coding-ppo-from-scratch-with-pytorch-part-1-4-613dfc1b14c8 - Wonderful tutorial by Eric Yang Yu (Pytorch), this implementation is for **CONTINUOUS** action spaces, which are most of Gym's Environments (Robots joints)
- https://spinningup.openai.com/en/latest/spinningup/spinningup.html - Spinning Up, by OpenAI. Interesting hyperlinks
- https://www.alexirpan.com/2018/02/14/rl-hard.html - Deep Reinforcement Learning Doesn't Work Yet. Problem is... I can't refuse the challenge of GANs or RLs...
- https://lilianweng.github.io/posts/2018-02-19-rl-overview/ - Lilian Weng, OpeanAI Research Leader, with her glorious blogs which are basically equivalent to a review paper.

**POSSIBLE ISSUES:** Usually, PPO is applied with 2 different networks, policy and value function. Considering Deep-RL instability, it's possible that the idea of uniting both in the same network, though appealing, may be troublesome.

"Also, what we know about good CNN design from supervised learning land doesn’t seem to apply to reinforcement learning land, because you’re mostly bottlenecked by credit assignment / supervision bitrate, not by a lack of a powerful representation. Your ResNets, batchnorms, or very deep networks have no power here." - Hacker News comment from Andrej Karpathy, back when he was at OpenAI (extracted from Alex Irpan's blog)

# PPO(2)

In [21]:
class SubjectActor(nn.Module):

    '''
    Simple Neural Network for testing
    '''

    def __init__(self, mode='exploration'):

        super(SubjectActor, self).__init__()

        self.mode = mode

        # Input = 3x200x256

        self.neuron1 = nn.Linear(3*200*256, 128)
        self.neuron2 = nn.Linear(128, 128)
        self.neuron3 = nn.Linear(128, 12)

        self.relu = nn.ReLU()

        # Our environment is MultiBinary, so we'll use Sigmoid.
        # MultiBinary Environment: For each button, determine if pressed or not.

        self.sigmoid = nn.Sigmoid()

    def forward(self, obs):

        x = obs.contiguous().view(obs.size(0), -1)

        x = self.neuron1(x)
        x = self.relu(x)
        x = self.neuron2(x)
        x = self.relu(x)

        if self.mode == 'exploration': # Just collecting data
            # Random actions preferred for diversity of states
            # Alternative (and fancy) method: torch.distributions.Bernoulli()

            fake_input = torch.randn_like(x, device=device)
            actions = self.neuron3(fake_input)

            del fake_input

        else:

            actions = self.neuron3(x)

        actions = self.sigmoid(actions)

        del x

        return actions

In [4]:
class SubjectCritic(nn.Module):

    '''
    Simple Neural Network for testing
    '''

    def __init__(self):

        super(SubjectCritic, self).__init__()

        # Input = 3x200x256

        self.neuron1 = nn.Linear(3*200*256, 128)
        self.neuron2 = nn.Linear(128, 128)
        self.neuron3 = nn.Linear(128, 1)

        self.relu = nn.ReLU()

    def forward(self, obs):

        x = obs.contiguous().view(obs.size(0), -1)

        x = self.neuron1(x)
        x = self.relu(x)
        x = self.neuron2(x)
        x = self.relu(x)

        expected_reward = self.neuron3(x)

        del x

        return expected_reward

In [52]:
# Here, we'll use Value Function V(s) --> Depends only on state (s)
# By default, both models are separate...and they're unstable enough.

policy = SubjectActor().to(device)
value_function = SubjectCritic().to(device)

In [10]:
'''
Unfortunately, RL algorithms tend to be too sentimental,
so we'll spend quite some time adjusting hyperparameters

Some parameters used in Gym's environments, which are too simple:
https://github.com/araffin/rl-baselines-zoo/blob/master/hyperparams/ppo2.yml

For complex games, like we want for Hakisa, we should stick to something
close to the parameters used by Ruo Ze Liu for HierNet, in StarCraft 2:

https://github.com/liuruoze/HierNet-SC2/blob/396646056dbe5f8f20e43e0ef35e59db09e907c0/param.py

"Even Ignoring Generalization Issues, The Final Results Can be Unstable and Hard to Reproduce" - Alex Irpan

"[Supervised learning] wants to work. Even if you screw something up you'll usually get something non-random back.
RL must be forced to work.
If you screw something up or don't tune something well enough you're exceedingly likely to get a policy that is even worse than random.
And even if it's all well tuned you'll get a bad policy 30% of the time, just because." - Andrej Karpathy
'''

gamma = 0.99 # Gamma for the Discount Rewards.
lamb = 0.95 # Lambda for Generalized Advantage Estimation. Together with gamma, basically a "weight" for Exponential Moving Average
BATCH_SIZE = 16 # In reality, all tensors are batch 1, so we'll use gradient accumulation to simulate multiple batches.
EPOCHS = 10
lr = 2e-4
value_weight = 1.0
entropy_weight = 1e-2
target_KLD = 0.05 # Early-Stopping parameter to avoid suboptimal policy in PPO2.

In [53]:
# In reality, the AI will only play the game to acquire data
# The magic really happens during her training, which is done offline.

'''
"We can call our data collected in each rollout a batch"
- Yu, Eric. Coding PPO From Scratch With PyTorch (2/4)
https://medium.com/@eyyu/coding-ppo-from-scratch-with-pytorch-part-2-4-f9d8b8aa938a

"A trajectory is a sequence of states and actions in the world. [...]
Trajectories are also frequently called episodes or rollouts."
- OpenAI's Spinning Up: https://spinningup.openai.com/en/latest/spinningup/rl_intro.html
'''

# Creating lists to store data.
# Using a separate cell in case of storing multiple episodes (playthroughs)
# TO CONSIDER: Using pickle or torch to save and load playthroughs

states = []
actions = []
log_probs = []
values = []
rewards = []
rewards_to_go = []

In [54]:
# Exploration Phase - Let her learn how the environment is.

env = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis", state="ChunLiVsBlanka.1star")
obs = env.reset()
obs = torch.from_numpy(obs)
obs = obs/255
obs = obs.permute(2, 1, 0).unsqueeze(0).float().to(device)
steps = 0

while steps < 1000:
    env.render()

    # Collecting State --> Must be done at the beginning

    states.append(obs.cpu())

    with torch.no_grad():

        log_prob = policy(obs)
        value = value_function(obs)

    action = log_prob.squeeze(0)

    # MultiBinary Environment --> Only 0.0 or 1.0 accepted
    bin = []
    for x in action:
        if x > 0.5:
            bin.append(1.)
        else:
            bin.append(0.)
    
    action = bin # This is the actual action
    log_prob = torch.log(torch.clamp(log_prob, 1e-10, 1.0)) # The probability distribution. Clipping to avoid NaN

    del bin

    obs, reward, end, info = env.step(action)
    obs = torch.from_numpy(obs)
    obs = obs/255
    obs = obs.permute(2, 1, 0).unsqueeze(0).float().to(device)
    #reward = torch.tensor(reward, device=device)

    reward = (info['health']**(1+info['matches_won'])) - (info['enemy_health']**(1+info['enemy_matches_won']))
    reward = torch.tensor(reward, device=device)
    reward = -(10.0/(torch.exp(reward) + 1.0)) + 5.0 # Normalizing to -5 to +5 (sigmoid function)

    # Collecting variables for the previous (collected) state
    
    actions.append(action)
    log_probs.append(log_prob.cpu())
    values.append(value.cpu())
    rewards.append(reward.cpu())

    steps += 1

env.render(close=True)
env.close()

In [55]:
# Processing colected data

states = torch.cat(states, 0)
log_probs = torch.cat(log_probs, 0)

discounted_reward = 0 # The final rewards provide greater impact on the algorithm
for r_t in reversed(rewards):
    
    discounted_reward = r_t + discounted_reward * gamma
    rewards_to_go.insert(0, discounted_reward)

rewards_to_go = torch.tensor(rewards_to_go)


# Calculating Generalized Advantage Estimation

'''
"One kind of return is the finite-horizon undiscounted return, which is just the sum of rewards obtained in a fixed window of steps [...]
Another kind of return is the infinite-horizon discounted return, which is the sum of all rewards ever obtained by the agent,
but discounted by how far off in the future they're obtained.
[...]
Why would we ever want a discount factor, though? Don't we just want to get all rewards?
We do, but the discount factor is both intuitively appealing and mathematically convenient.
On an intuitive level: cash now is better than cash later."
- OpenAI's Spinning Up: https://spinningup.openai.com/en/latest/spinningup/rl_intro.html
'''

deltas = []
final_delta = rewards[-1] - values[-1]
deltas.append(final_delta)

for t in reversed(range(len(rewards)-1)):
    
    delta = rewards[t] + gamma * values[t+1] - values[t] # Future returns multiplied by gamma -> Uncertainty
    deltas.insert(0, delta)

gae = deepcopy(deltas)

for t in reversed(range(len(gae)-1)):
    gae[t] = gae[t] + gamma * lamb * gae[t+1]

gae = torch.tensor(gae, dtype=torch.float)

# Normalizing GAE
# https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/ppo/ppo.py - Line 221

gae = (gae - gae.mean())/(gae.std() + 1e-10)

In [56]:
# CONSOLIDATION PHASE - https://en.wikipedia.org/wiki/Memory_consolidation
# She remembers what she saw, and learns from it.

policy.mode = 'consolidation'

# In RL, an Epoch consists of an entire episode + training from that episode,
# while a Batch consists of an episode. We won't use this terminology here as it's confusing.
# https://spinningup.openai.com/en/latest/spinningup/rl_intro3.html


policy_optm = torch.optim.Adam(policy.parameters(), lr=lr)
value_optm = torch.optim.Adam(value_function.parameters(), lr=lr)
value_criterion = torch.nn.MSELoss()

for epoch in range(EPOCHS):

    steps = 0
    batches = torch.randperm(len(states))

    for batch in range(0, len(batches)-1):

        obs = states[batch].to(device).unsqueeze(0)
        previous_log_prob = log_probs[batch].to(device).unsqueeze(0) # For Surrogate Loss
        reward = rewards[batch].to(device).unsqueeze(-1).unsqueeze(-1)
        advantage = gae[batch].to(device)
        reward_to_go = rewards_to_go[batch].to(device).unsqueeze(-1).unsqueeze(-1)

        current_log_prob = policy(obs) # Not in log yet
        # Calculating Entropy - Used to avoid deterministic behavior and it's a possible replace/complement to epsilon-greedy.
        entropy = (current_log_prob * torch.log(torch.clamp(current_log_prob, 1e-10, 1.0))).sum()
        entropy = -entropy.mean()
        current_log_prob = torch.log(torch.clamp(current_log_prob, 1e-10, 1.0))

        value = value_function(obs)

        # Calculating Surrogate Loss

        ratio = torch.exp(current_log_prob - previous_log_prob)
        clipped_ratio = torch.clamp(ratio, min=0.8, max=1.2)

        surrogate_loss = -torch.minimum((ratio * advantage), (clipped_ratio * advantage))
        surrogate_loss = surrogate_loss.mean()

        # Calculating KL Divergence between policies -> Used for early stopping

        kld = (ratio - 1) - (current_log_prob - previous_log_prob)
        kld = kld.mean()

        # Calculating Value Loss

        value_loss = value_criterion(value, reward_to_go)

        # Total Loss (plus entropy discount)

        total_loss = surrogate_loss + (value_loss * value_weight) - (entropy * entropy_weight)

        total_loss.backward()

        # Using Gradient Accumulation to avoid using batch size greater than 1 -> Lower computation cost

        if steps % BATCH_SIZE == 0:

            policy_optm.step()
            value_optm.step()
            policy.zero_grad()
            value_function.zero_grad()

        steps += 1

        # If KLD between policies goes beyond threshold --> early stopping

        if kld.item() > target_KLD:

            break

        if steps % 100 == 0:

            print(f"{epoch}/{EPOCHS}")
            print(f"Current step: {steps}")
            print(f"Current Loss: {total_loss.item()}")
            #print(f"Surrogate Loss: {surrogate_loss.item()}\tValue Loss: {value_loss.item()}\tEntropy: {entropy.item()}\nAdvantage: {advantage[-10:]}")
            print(f"Surrogate Loss: {surrogate_loss.item()}\tValue Loss: {value_loss.item()}\tEntropy: {entropy.item()}")
            print(f"Predicted Reward: {value.item()}\tCurrent Reward: {reward_to_go.item()}")
            print(f"KL DIVERGENCE: {kld.item()}")
            print(f"Policy Gradients: {policy.neuron1.weight.grad.mean().item()}")
            print(f"Reward Gradients: {value_function.neuron1.weight.grad.mean().item()}\n\n")

print(steps)
print(kld.item())
print(target_KLD)

0/10
Current step: 100
Current Loss: 17.62795066833496
Surrogate Loss: 1.0507831573486328	Value Loss: 16.58133888244629	Entropy: 4.170949935913086
Predicted Reward: 0.11062917113304138	Current Reward: 4.182648181915283
KL DIVERGENCE: 4.569689622258011e-07
Policy Gradients: 0.00018324678239878267
Reward Gradients: -0.009541415609419346


0/10
Current step: 200
Current Loss: 128.87820434570312
Surrogate Loss: 1.0656965970993042	Value Loss: 127.81668090820312	Entropy: 4.172884941101074
Predicted Reward: 0.12138014286756516	Current Reward: 11.426983833312988
KL DIVERGENCE: 1.1076530199716217e-06
Policy Gradients: -8.278109453385696e-05
Reward Gradients: -0.14590764045715332


0/10
Current step: 300
Current Loss: 25965.0234375
Surrogate Loss: 2.528183937072754	Value Loss: 25962.5	Entropy: 4.175563812255859
Predicted Reward: 0.11860451847314835	Current Reward: 161.2474365234375
KL DIVERGENCE: 3.159046173095703e-06
Policy Gradients: 0.0001701192813925445
Reward Gradients: -1.3525042533874512


In [42]:
# New cell to adjust parameters and restart training

policy = SubjectActor().to(device)
value_function = SubjectCritic().to(device)

gamma = 0.99 # Gamma for the Discount Rewards.
lamb = 0.95 # Lambda for Generalized Advantage Estimation. Together with gamma, basically a "weight" for Exponential Moving Average
BATCH_SIZE = 16 # In reality, all tensors are batch 1, so we'll use gradient accumulation to simulate multiple batches.
EPOCHS = 10
lr = 2e-8
value_weight = 1.0
entropy_weight = 1e-3
target_KLD = 0.05 # Early-Stopping parameter to avoid suboptimal policy in PPO2.

In [35]:
print(steps)
print(kld.item())
print(target_KLD)

1
2.1845510005950928
0.05


In [58]:
# Gameplay Mode

env = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis", state="ChunLiVsBlanka.1star")
obs = env.reset()
obs = torch.from_numpy(obs)
obs = obs/255
obs = obs.permute(2, 1, 0).unsqueeze(0).float().to(device)
steps = 0

# If you'd like to save and train even more

'''states = []
actions = []
rewards = []
deltas = []'''

while steps < 1000:
    env.render()

    with torch.no_grad():

        log_prob = policy(obs)
        value = value_function(obs)

    action = log_prob.squeeze(0)

    # MultiBinary Environment --> Only 0.0 or 1.0 accepted
    bin = []
    for x in action:
        if x > 0.5:
            bin.append(1.)
        else:
            bin.append(0.)

    action = bin

    obs, reward, end, info = env.step(action)
    obs = torch.from_numpy(obs)
    obs = obs/255
    obs = obs.permute(2, 1, 0).unsqueeze(0).float().to(device)
    #reward = torch.tensor(reward, device=device)

    reward = (info['health']**(1+info['matches_won'])) - (info['enemy_health']**(1+info['enemy_matches_won']))
    reward = torch.tensor(reward, device=device)
    reward = -(10.0/(torch.exp(reward) + 1.0)) + 5.0 # Normalizing to -5 to +5 (sigmoid function)

    '''states.append(obs.cpu())
    actions.append(action.cpu())
    rewards.append(reward.cpu())
    deltas.append(delta.cpu())'''

    steps += 1

env.render(close=True)
env.close()

In [10]:
env.render(close=True)
env.close()