In [19]:
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.optim as optim

In [20]:
import collections
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'last_state'])

In [21]:
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
    !bash ../xvfb start
    os.environ['DISPLAY'] = ':1'

In [22]:
env = gym.make("CartPole-v0").env
env.reset()

array([ 0.04385597, -0.00612533,  0.04623462, -0.02609207])

In [23]:
class PGN(nn.Module):
    def __init__(self, observation_space, n_actions):
        super(PGN, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(observation_space.shape[0], 200),
            nn.ReLU(),
            nn.Linear(200, 100),
            nn.ReLU(),
            nn.Linear(100, n_actions),
            nn.ReLU()
        )

    def forward(self, x):
        return self.model(x)

In [264]:
class Agent:
    def __init__(self, env):
        self.env = env
        self.state = self.env.reset()
        self.next_state = None
        self.total_reward_sum = []
        self.total_reward_list = []        

    def _predict_probs(self, states, net):
        state = np.array(states).reshape(1, states.size)
        state_v = torch.tensor(state, dtype=torch.float)
        q_vals = net(state_v)
        softmax = nn.Softmax(dim=1)
        predict_prob = softmax(q_vals)

        return predict_prob.data.numpy()

    def play_step(self, net, step_size, device="cpu"):
        assert step_size >= 1, "Too less step_size"
        print("Current state",self.state)
        reward_list = []
        next_state_list = []
        current_state = self.state  
        cumulated_reward = None
        first_step = True

        for i in range(step_size):
           # action_prob = self._predict_probs(self.state, net)[0]
           # action = np.random.choice(len(action_prob), p=action_prob)
            action = 1           
            
            new_state, reward, done, _ = self.env.step(action)
            next_state_list.append(new_state)
            print(next_state_list)
            if first_step is True:
                current_action = action
                current_state = self.state

                if done is True:
                    self.env.reset()

            self.state = new_state
            reward_list.append(reward)
            self.total_reward_list.append(reward)
            
            if done is True:
                self.total_reward_sum.append(sum(self.total_reward_list))                    
                new_state = None  
                cumulated_reward = self._get_cumulated_reward(reward_list)
                break

        if cumulated_reward is None:
            cumulated_reward = self._get_cumulated_reward(reward_list)

        exp = Experience(current_state, current_action, cumulated_reward, new_state)           
        self.state = next_state_list.pop(0)
        print("Next state", self.state)
        return exp

    def _get_cumulated_reward(self, rewards_list, gamma=1.0):
        cumulated_reward = 0
        for reward in rewards_list:
            cumulated_reward *= gamma
            cumulated_reward += reward

        return cumulated_reward

    def pop_total_rewards(self):
        if not self.total_reward_sum:
            #print("Empty")
            return self.total_reward_sum
        else:
            #print("Not empty")
            total_reward_sum = self.total_reward_sum
            self.total_reward_sum = []
            self.total_reward_list = []            
            return total_reward_sum

In [265]:
class ToyEnv(gym.Env):
    def __init__(self):
        super(ToyEnv, self).__init__()
        self.observation_space = gym.spaces.Discrete(n=5)
        self.action_space = gym.spaces.Discrete(n=3)
        self.step_index = 0

    def reset(self):
        self.step_index = 0
        return self.step_index

    def step(self, action):
        is_done = self.step_index == 10
        if is_done:
            return self.step_index % self.observation_space.n, 0.0, is_done, {}
        
        self.step_index += 1
        return self.step_index % self.observation_space.n, float(action), self.step_index == 10, {}


In [266]:
#test_env = gym.make("CartPole-v0").env
test_env = ToyEnv()
test_env.reset()

net = PGN(env.observation_space, env.action_space.n)
agent = Agent(test_env)

In [267]:
for idx in range(10):
    #print(agent.pop_total_rewards())
    exp = agent.play_step(net, step_size=2)
    print(exp)

Current state 0
[1]
[1, 2]
Next state 1
Experience(state=1, action=1, reward=2.0, last_state=2)
Current state 1
[3]
[3, 4]
Next state 3
Experience(state=3, action=1, reward=2.0, last_state=4)
Current state 3
[0]
[0, 1]
Next state 0
Experience(state=0, action=1, reward=2.0, last_state=1)
Current state 0
[2]
[2, 3]
Next state 2
Experience(state=2, action=1, reward=2.0, last_state=3)
Current state 2
[4]
[4, 0]
Next state 4
Experience(state=4, action=1, reward=2.0, last_state=None)
Current state 4
[1]
[1, 2]
Next state 1
Experience(state=1, action=1, reward=2.0, last_state=2)
Current state 1
[3]
[3, 4]
Next state 3
Experience(state=3, action=1, reward=2.0, last_state=4)
Current state 3
[0]
[0, 1]
Next state 0
Experience(state=0, action=1, reward=2.0, last_state=1)
Current state 0
[2]
[2, 3]
Next state 2
Experience(state=2, action=1, reward=2.0, last_state=3)
Current state 2
[4]
[4, 0]
Next state 4
Experience(state=4, action=1, reward=2.0, last_state=None)


In [231]:
GAMMA = 0.99
LEARNING_RATE = 0.001
ENTROPY_BETA = 0.01
BATCH_SIZE = 8

In [83]:
env = gym.make("CartPole-v0").env
env.reset()

array([ 0.04370282, -0.02918187, -0.04839809,  0.03518697])

In [84]:
net = PGN(env.observation_space, env.action_space.n)
agent = Agent(env)
done_episodes = 0
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
reward_sum = 0
total_rewards = []
batch_states, batch_actions, batch_scales = [], [], []
frame_idx = 0

In [85]:
while True:              
    exp = agent.play_step(net, step_size=1)    
    reward_sum += exp.reward
    baseline = reward_sum / (frame_idx + 1)
    frame_idx += 1

    batch_states.append(exp.state)
    batch_actions.append(int(exp.action))
    batch_scales.append(exp.reward - baseline)

    new_rewards = agent.pop_total_rewards()    
    if new_rewards:
        done_episodes += 1
        reward = new_rewards[0]
        total_rewards.append(reward)
        mean_reward = float(np.mean(total_rewards[-100:]))

        if done_episodes % 100 == 0:
            print("mean reward : ", mean_reward)

        if mean_reward > 200:
            print("WIN!")
            break

    if len(batch_states) < BATCH_SIZE:
        continue


    states_v = torch.tensor(batch_states, dtype=torch.float)
    batch_actions_t = torch.tensor(batch_actions, dtype=torch.long)
    batch_scale_v = torch.tensor(batch_scales, dtype=torch.float)

    optimizer.zero_grad()
    logits_v = net(states_v)
    log_softmax = nn.LogSoftmax(dim=1)
    log_prob_v = log_softmax(logits_v)
    log_prob_actions_v = batch_scale_v * log_prob_v[range(len(batch_states)), batch_actions_t]
    # log_prob_actions_v = log_prob_v[range(BATCH_SIZE), batch_actions_t]
    loss_policy_v = -log_prob_actions_v.mean()

    softmax = nn.Softmax(dim=1)
    prob_v = softmax(logits_v)
    entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
    entropy_loss_v = -ENTROPY_BETA * entropy_v
    # * loss + (-entropy)        
    loss_v = loss_policy_v + entropy_loss_v
    loss_v.backward()
    optimizer.step()


    batch_states.clear()
    batch_actions.clear()
    batch_scales.clear()

mean reward :  21.83
mean reward :  22.95
mean reward :  21.15
mean reward :  21.76
mean reward :  20.8
mean reward :  21.06
mean reward :  20.57
mean reward :  21.3
mean reward :  20.71
mean reward :  24.2
mean reward :  24.04
mean reward :  20.94
mean reward :  22.13
mean reward :  24.29
mean reward :  23.47
mean reward :  23.22
mean reward :  21.99
mean reward :  21.33
mean reward :  23.12
mean reward :  22.37
mean reward :  23.42
mean reward :  21.29
mean reward :  22.15
mean reward :  22.42
mean reward :  24.78
mean reward :  24.4
mean reward :  22.09
mean reward :  24.51
mean reward :  23.56
mean reward :  19.73
mean reward :  22.82
mean reward :  21.24
mean reward :  22.36
mean reward :  23.55
mean reward :  22.95
mean reward :  22.68
mean reward :  22.44
mean reward :  21.8
mean reward :  21.41
mean reward :  22.8
mean reward :  21.43
mean reward :  20.55
mean reward :  23.66
mean reward :  23.38
mean reward :  20.01
mean reward :  24.72
mean reward :  21.8
mean reward :  22.5


KeyboardInterrupt: 

In [None]:
# record sessions
import gym.wrappers
env = gym.wrappers.Monitor(gym.make("CartPole-v0"), directory="videos", force=True)

states, actions, rewards = agent.play_step(net)
                           
sessions = [agent.play_step(net) for _ in range(5)]
env.close()

In [46]:
# show video
from IPython.display import HTML
import os

video_names = list(
    filter(lambda s: s.endswith(".mp4"), os.listdir("./videos/")))


HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[0]))  # this may or may not be the _last_ video. Try other indices

0


IndexError: list index out of range