# The REINFORCE method is based on the cross-entropy method, but it uses Q-values and scales the gradient of each step by how good the reward is:
1. Initialize the network weights randomly.
2. Play N full episodes and store their (s, a, r, s') transitions.
3. For every step t of every episode k, calculate the discounted total reward of the subsequent steps:
   $Q_{k,t} = \sum_{i \ge 0} \gamma^{i} \, r_{t+i}$   (where $\gamma$ is GAMMA)
4. Calculate the loss function over all transitions:
   $\mathcal{L} = -\sum_{k,t} Q_{k,t} \, \log \pi(s_{k,t}, a_{k,t})$
5. Perform an SGD update of the weights to minimize the loss.
6. Repeat from step 2 until convergence.

In [1]:
import gym
import ptan
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Discount factor for future rewards; used by calc_qvals and passed to the experience source.
GAMMA = 0.99
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.01
# Number of complete episodes to collect before each optimization step.
EPISODES_TO_TRAIN = 4

In [3]:
class PGN(nn.Module):
    """Policy-gradient network: a single 128-unit ReLU hidden layer.

    ``forward`` returns raw, unnormalized action scores (logits), not
    probabilities. No softmax is applied inside the network on purpose: the
    training code feeds the logits to ``F.log_softmax``, which computes
    log-probabilities in a numerically more stable way than taking the log of
    a softmax output.

    Args:
        input_size: number of features in one observation.
        n_actions: number of discrete actions (one logit per action).
    """

    def __init__(self, input_size, n_actions):
        super().__init__()
        # Layers are created in network order so that parameter
        # initialization consumes the RNG in the same sequence.
        hidden_layer = nn.Linear(input_size, 128)
        output_layer = nn.Linear(128, n_actions)
        self.net = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

    def forward(self, x):
        """Return the action logits for the batch of observations ``x``."""
        logits = self.net(x)
        return logits
    
def calc_qvals(rewards, gamma=None):
    """Return the discounted return for every step of one full episode.

    For step ``t`` the result is ``r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} + ...``.
    The rewards are walked from the last step backwards so that each value is
    obtained from the one after it: ``Q_t = r_t + gamma * Q_{t+1}``.

    Args:
        rewards: sequence of per-step rewards of a single episode, in
            chronological order.
        gamma: discount factor. When ``None`` (the default) the module-level
            ``GAMMA`` constant is used, which is the original behavior.

    Returns:
        A list of floats with the same length as ``rewards``; element ``t`` is
        the discounted return starting at step ``t``. An empty input yields an
        empty list.
    """
    if gamma is None:
        gamma = GAMMA

    discounted = []
    running_return = 0.0
    for step_reward in reversed(rewards):
        # Everything accumulated so far lies one step further in the future,
        # so it is discounted once more before the current reward is added.
        running_return = step_reward + gamma * running_return
        discounted.append(running_return)
    # Values were collected last-step-first; restore chronological order.
    discounted.reverse()
    return discounted

In [4]:
if __name__ == "__main__":
    # REINFORCE training on CartPole: collect EPISODES_TO_TRAIN full episodes,
    # weight the log-probability of every taken action by its discounted
    # return, and take one optimizer step on the resulting loss.
    env = gym.make("CartPole-v0")
    writer = SummaryWriter(comment="-cartpole-reinforce")

    # try/finally guarantees the TensorBoard writer is flushed and the
    # environment released even if training is interrupted or raises.
    try:
        net = PGN(env.observation_space.shape[0], env.action_space.n)
        print(net)

        # The agent samples actions from the policy instead of always taking
        # the highest-scoring one. The network outputs logits, so
        # apply_softmax=True asks the agent to turn them into probabilities
        # first. float32_preprocessor converts the observations into the
        # float32 input the network expects.
        agent = ptan.agent.PolicyAgent(net, preprocessor=ptan.agent.float32_preprocessor, apply_softmax=True)
        exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA)

        optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

        # Reporting state: total reward of every finished episode and their count.
        total_rewards = []
        done_episodes = 0

        # Training batch state: episodes completed in the current batch, the
        # per-step rewards of the episode in progress, and the accumulated
        # states / actions / discounted returns of the batch.
        batch_episodes = 0
        cur_rewards = []
        batch_states, batch_actions, batch_qvals = [], [], []

        for step_idx, exp in enumerate(exp_source):
            batch_states.append(exp.state)
            batch_actions.append(int(exp.action))
            cur_rewards.append(exp.reward)

            # last_state is None on the final transition of an episode; only
            # then can the per-step rewards be turned into discounted returns.
            if exp.last_state is None:
                batch_qvals.extend(calc_qvals(cur_rewards))
                cur_rewards.clear()
                batch_episodes += 1

            # Report progress whenever the experience source has a finished
            # episode's total reward available.
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                done_episodes += 1
                reward = new_rewards[0]
                total_rewards.append(reward)
                mean_rewards = float(np.mean(total_rewards[-100:]))
                print("%d: reward: %6.2f, mean_100: %6.2f, episodes: %d"\
                      %(step_idx, reward, mean_rewards, done_episodes))
                writer.add_scalar("reward", reward, step_idx)
                writer.add_scalar("reward_100", mean_rewards, step_idx)
                writer.add_scalar("episodes", done_episodes, step_idx)
                # Stop once the mean reward over the last 100 episodes exceeds 195.
                if mean_rewards > 195:
                    print("Solved in %d steps and %d episodes!" %(step_idx, done_episodes))
                    break

            # Keep collecting until the batch holds enough complete episodes.
            if batch_episodes < EPISODES_TO_TRAIN:
                continue

            optimizer.zero_grad()
            # Stack the observations into one float32 array first: building a
            # tensor directly from a Python list of arrays converts element by
            # element and is much slower.
            states_v = torch.as_tensor(np.asarray(batch_states, dtype=np.float32))
            batch_actions_t = torch.LongTensor(batch_actions)
            batch_qvals_v = torch.FloatTensor(batch_qvals)

            # Policy-gradient loss: pick the log-probability of each action
            # that was actually taken, scale it by that step's discounted
            # return, and negate the mean so that minimizing the loss
            # increases the probability of high-return actions.
            logits_v = net(states_v)
            log_prob_v = F.log_softmax(logits_v, dim=1)
            log_prob_actions_v = batch_qvals_v * log_prob_v[range(len(batch_states)), batch_actions_t]
            loss_v = -log_prob_actions_v.mean()

            loss_v.backward()
            optimizer.step()

            # Start a fresh batch for the next update.
            batch_episodes = 0
            batch_states.clear()
            batch_actions.clear()
            batch_qvals.clear()
    finally:
        writer.close()
        env.close()

PGN(
  (net): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
  )
)
59: reward:  58.00, mean_100:  58.00, episodes: 1
89: reward:  29.00, mean_100:  43.50, episodes: 2
100: reward:  10.00, mean_100:  32.33, episodes: 3
134: reward:  33.00, mean_100:  32.50, episodes: 4
151: reward:  16.00, mean_100:  29.20, episodes: 5
185: reward:  33.00, mean_100:  29.83, episodes: 6
221: reward:  35.00, mean_100:  30.57, episodes: 7
250: reward:  28.00, mean_100:  30.25, episodes: 8
269: reward:  18.00, mean_100:  28.89, episodes: 9
341: reward:  71.00, mean_100:  33.10, episodes: 10
382: reward:  40.00, mean_100:  33.73, episodes: 11
410: reward:  27.00, mean_100:  33.17, episodes: 12
442: reward:  31.00, mean_100:  33.00, episodes: 13
470: reward:  27.00, mean_100:  32.57, episodes: 14
493: reward:  22.00, mean_100:  31.87, episodes: 15
514: reward:  20.00, mean_100:  31.12, episodes: 16
550: reward

11373: reward: 121.00, mean_100:  86.44, episodes: 157
11484: reward: 110.00, mean_100:  87.27, episodes: 158
11520: reward:  35.00, mean_100:  86.87, episodes: 159
11640: reward: 119.00, mean_100:  87.60, episodes: 160
11747: reward: 106.00, mean_100:  87.57, episodes: 161
11771: reward:  23.00, mean_100:  87.36, episodes: 162
11897: reward: 125.00, mean_100:  87.83, episodes: 163
12036: reward: 138.00, mean_100:  88.90, episodes: 164
12163: reward: 126.00, mean_100:  89.71, episodes: 165
12329: reward: 165.00, mean_100:  90.98, episodes: 166
12440: reward: 110.00, mean_100:  91.28, episodes: 167
12465: reward:  24.00, mean_100:  90.87, episodes: 168
12631: reward: 165.00, mean_100:  91.61, episodes: 169
12775: reward: 143.00, mean_100:  91.84, episodes: 170
12824: reward:  48.00, mean_100:  91.56, episodes: 171
12973: reward: 148.00, mean_100:  91.62, episodes: 172
13104: reward: 130.00, mean_100:  92.57, episodes: 173
13296: reward: 191.00, mean_100:  93.96, episodes: 174
13424: rew

36268: reward: 200.00, mean_100: 190.58, episodes: 306
36469: reward: 200.00, mean_100: 191.04, episodes: 307
36670: reward: 200.00, mean_100: 191.94, episodes: 308
36871: reward: 200.00, mean_100: 192.66, episodes: 309
37072: reward: 200.00, mean_100: 193.45, episodes: 310
37273: reward: 200.00, mean_100: 194.37, episodes: 311
37474: reward: 200.00, mean_100: 194.82, episodes: 312
37675: reward: 200.00, mean_100: 194.99, episodes: 313
37876: reward: 200.00, mean_100: 195.79, episodes: 314
Solved in 37876 steps and 314 episodes!
