In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.distributions import Categorical

import gym

In [2]:
from util import DotDict
from memory import Memory

In [176]:
# an Env should record the episode rewards etc.

class Env:
    def __init__(self, name, stat):
        self._name = name
        self._env = gym.make(name)
        self._stat = stat
        self._state = torch.FloatTensor(self._env.reset())
    
    @property
    def name(self):
        return self._name
    
    @property
    def state(self):
        return self._state
    
    def step(self, action):
        next_state, reward, done, info = self._env.step(action.item())
        self._stat.rewards[-1] += reward
        if done:
            print(self._stat.frame, self._stat.rewards[-1])
            self._stat.rewards.append(0)
        self._state = torch.FloatTensor(self._env.reset()) if done else torch.FloatTensor(next_state)
        return torch.FloatTensor(next_state), torch.tensor(reward), torch.tensor(done), info
    

In [168]:
class Base:
    def __init__(self, config):
        self._config = DotDict(**config)
        self._stat = DotDict(**{
            'frame': 0,
            'episode': 0,
            'rewards': [0]
        })
        env_name = self._config.env
        self._config.env = Env(env_name, self._stat)
    
    @property
    def env(self):
        return self._config.env
    
    @property
    def nn(self):
        return self._config.nn
    
    @property
    def optim(self):
        return self._config.optim
    
    @property
    def scheduler(self):
        return self._config.scheduler
    
    @property
    def stat(self):
        return self._stat
    
    @property
    def frame(self):
        return self._stat.frame
    
    @property
    def episode(self):
        return self._stat.episode
    
    def run(self, n_frames, from_frame=0):
        self._stat.frame = from_frame
        
        while self.frame <= n_frames:
            for _ in range(self._config.step_length):
                self.step()
                self._stat.frame += 1
            self.learn()
            
            if len(self._stat.rewards) >= 100 and sum(self._stat.rewards) / len(self._stat.rewards) >= 195:
                print('Solved')
                break

In [169]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.linear1 = nn.Linear(4, 20)
        self.linear2 = nn.Linear(20, 20)
        self.linear3 = nn.Linear(20, 20)
        self.linear4 = nn.Linear(20, 2)
    
    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        return F.softmax(self.linear4(x), dim=-1)

In [170]:
net = Net()
optimizer = optim.Adam(net.parameters())

In [171]:
config = {
    'env': 'CartPole-v0',
    'nn': net,
    'optim': optimizer,
    'scheduler': None,
    'step_length': 20,
    'gamma': 0.9
}

In [172]:
class Reinforce(Base):
    def __init__(self, config):
        super(Reinforce, self).__init__(config)
        self._memory = Memory(
            fields=('log_prob', 'reward', 'done'),
            cap=config['step_length'])
    
    def step(self):
        state = self.env.state
        policy = self.nn(state)
        probs = Categorical(policy)
        action = probs.sample()
        log_prob = probs.log_prob(action)
        next_state, reward, done, info = self.env.step(action)
        self._memory.append([log_prob, reward, done.float()])
    
    def learn(self):
        log_probs, rewards, dones = self._memory.flush()
        gamma = self._config.gamma
        eps = torch.finfo()
        
        gain = 0
        exp_rewards = []
        for i in reversed(range(rewards.size(0))):
            reward, done = rewards[i], dones[i]
            gain = reward + gamma * gain * (1 - done)
            exp_rewards.append(gain)
        
        exp_rewards.reverse()
        exp_rewards = torch.stack(exp_rewards)
        eps = torch.finfo(exp_rewards.dtype).eps
        exp_rewards = (exp_rewards - exp_rewards.mean()) / (exp_rewards.std() + eps)

        loss = -(log_probs * exp_rewards).sum()
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

In [173]:
agent = Reinforce(config)

In [175]:
agent.run(10000)

3 18.0
28 25.0
48 20.0
59 11.0
121 62.0
148 27.0
211 63.0
220 9.0
237 17.0
278 41.0
298 20.0
319 21.0
333 14.0
354 21.0
374 20.0
389 15.0
402 13.0
418 16.0
443 25.0
480 37.0
496 16.0
512 16.0
522 10.0
532 10.0
556 24.0
579 23.0
589 10.0
598 9.0
634 36.0
671 37.0
686 15.0
699 13.0
740 41.0
762 22.0
793 31.0
803 10.0
827 24.0
844 17.0
884 40.0
896 12.0
916 20.0
960 44.0
979 19.0
995 16.0
1027 32.0
1048 21.0
1072 24.0
1086 14.0
1104 18.0
1123 19.0
1134 11.0
1153 19.0
1168 15.0
1180 12.0
1194 14.0
1216 22.0
1228 12.0
1258 30.0
1267 9.0
1279 12.0
1293 14.0
1319 26.0
1422 103.0
1440 18.0
1454 14.0
1473 19.0
1494 21.0
1508 14.0
1528 20.0
1548 20.0
1600 52.0
1619 19.0
1633 14.0
1652 19.0
1675 23.0
1687 12.0
1715 28.0
1738 23.0
1785 47.0
1801 16.0
1817 16.0
1870 53.0
1890 20.0
1946 56.0
1961 15.0
1985 24.0
2023 38.0
2037 14.0
2048 11.0
2062 14.0
2075 13.0
2091 16.0
2115 24.0
2139 24.0
2150 11.0
2163 13.0
2177 14.0
2190 13.0
2212 22.0
2243 31.0
2264 21.0
2293 29.0
2304 11.0
2321 17.0
2342 21.0
2

In [178]:
"""
  utils
"""
import copy

import torch
import torch.nn as nn

import numpy as np

const = DotDict(
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    MACHINE_EPS = np.finfo(np.float32).eps.item()
)

def astensor(obj, to_type=None):
  """
    to_type selected from 'int', 'long', 'float', 'double', and 'byte'

    not responsible for handling type cast error
  """
  if to_type is None:
    return torch.tensor(obj).to(const.DEVICE) if not isinstance(obj, torch.Tensor) else obj
  elif to_type == 'int':
    return torch.IntTensor(obj).to(const.DEVICE) if not isinstance(obj, torch.Tensor) else obj.int()
  elif to_type == 'long':
    return torch.LongTensor(obj).to(const.DEVICE) if not isinstance(obj, torch.Tensor) else obj.long()
  elif to_type == 'float':
    return torch.FloatTensor(obj).to(const.DEVICE) if not isinstance(obj, torch.Tensor) else obj.float()
  elif to_type == 'double':
    return torch.DoubleTensor(obj).to(const.DEVICE) if not isinstance(obj, torch.Tensor) else obj.double()
  elif to_type == 'byte':
    return torch.ByteTensor(obj).to(const.DEVICE) if not isinstance(obj, torch.Tensor) else obj.byte()
  else:
    raise TypeError(
      '''
        to_type can only be one of 
        'int', 'long', 'float', 'double', 'byte'
        or leave empty.
      '''
      )

def copynet(network):
  """
    a deepcopy of a network 
  """
  if not isinstance(network, nn.Module):
    raise TypeError('Input must be a instance of {}, but got {}'.format(nn.Module, network.__class__.__name__))

  return copy.deepcopy(network)


def flatten(network, input_size):
  """
    return the number of features after applying `network`structure on a network with
    shape `input_size`
  """
  if not isinstance(network, nn.Module):
    network = nn.Sequential(
        *network
    )

  with torch.no_grad():
    num_features = network(torch.rand((1, *input_size))).view(1, -1).size(1)

  return num_features


In [182]:
class PolicyNet(object):
    """
      REINFORCE algorithm implementation
    """

    def __init__(self, net, obs_shape, n_act, target=False):
        self._net = net
        self._obs_shape, self._n_act = obs_shape, n_act
        if target:
            self._tnet = copynet(self._net)
            self._tnet.load_state_dict(self._net.state_dict())
        else:
            self._tnet = None

        self.logprobs = []
        self.rewards = []
        self.dones = []

    def act(self, obs, act_mask=None):
        act_mask = astensor(act_mask,
                            'float') if act_mask is not None else astensor(
            [1] * self._n_act, 'float')

        probs = self.forward(obs) * act_mask.unsqueeze(0) + const.MACHINE_EPS
        distribution = Categorical(probs)
        action = distribution.sample()
        self.logprobs.append(distribution.log_prob(action))

        return action.cpu().item()

    def forward(self, obs):
        if not isinstance(obs, torch.Tensor):
            obs = astensor(obs, 'float')
        obs.reshape(obs.size(0), *self._obs_shape)
        if self._tnet:
            return F.softmax(self._tnet(obs), dim=1)
        return F.softmax(self._net(obs), dim=1)

    def learn(self, optimizer=None, gamma=0.99):
        assert self.logprobs and self.rewards and self.dones
        assert len(self.logprobs) == len(self.rewards) == len(self.dones)

        if optimizer is None:
            optimizer = optim.Adam(self.parameters())

        discounted_rewards = self._discount(gamma)
        loss = self._loss(discounted_rewards)
        self._learn(optimizer, loss)

    def parameters(self):
        return self._net.parameters()

    def update_tnet(self, soft_tau=1e-2):
        if self._tnet:
            for target_param, param in zip(self._tnet.parameters(),
                                           self._net.parameters()):
                target_param.data.copy_(target_param.data * (1 - soft_tau) +
                                        param.data * soft_tau)
        else:
            print(
                'You did not initialize target network, please check your network structure.'
            )

    def _discount(self, gamma, init_gain=0):
        gain = init_gain
        discounted_rewards = []
        while self.rewards:
            reward, done = self.rewards.pop(), self.dones.pop()
            gain = reward + gamma * gain * (1 - done)
            discounted_rewards.append(gain)

        with torch.no_grad():
            discounted_rewards = astensor(discounted_rewards)
            discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (
                    discounted_rewards.std() + const.MACHINE_EPS)

        return discounted_rewards

    def _loss(self, rewards):
        loss = []
        for reward in rewards:
            logprob = self.logprobs.pop()
            loss.append(-logprob * reward)
            loss = torch.cat(loss).sum()
        return loss

    @staticmethod
    def _learn(optimizer, loss):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    def run(self, n_frame):
        env = gym.make('CartPole-v0')
        state = env.reset()
        cur_frame = 0
        while cur_frame < n_frame:
            action = self.act(state)
            next_state, reward, done, _ = env.step()
            self.rewards.append(reward)
            self.dones.append(done)
            if done:
                state = env.reset()
            else:

SyntaxError: unexpected EOF while parsing (<ipython-input-182-d57423bcf697>, line 104)

In [180]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.linear1 = nn.Linear(4, 20)
        self.linear2 = nn.Linear(20, 20)
        self.linear3 = nn.Linear(20, 20)
        self.linear4 = nn.Linear(20, 2)
    
    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        return F.softmax(self.linear4(x), dim=-1)

In [181]:
policynet = PolicyNet(Net(), (4,), 2)