In [212]:
#Import required libraries

import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [213]:
# Set constants for training
seed = 543          # RNG seed, applied to both PyTorch and the environment
log_interval = 10   # logging interval, in episodes
gamma = 0.99        # discount factor used when computing returns

env = gym.make('CartPole-v1')
# Seed the environment's RNG on the first reset; seeding only PyTorch leaves
# the initial states different on every kernel restart. Later reset() calls
# without a seed keep drawing from this seeded generator.
env.reset(seed=seed)
state_shape = env.observation_space.shape[0]   # length of the observation vector
no_of_actions = env.action_space.n             # number of discrete actions
print(state_shape)
print(no_of_actions)
torch.manual_seed(seed)


4
2


<torch._C.Generator at 0x228276c5d50>

In [214]:
class Policy(nn.Module):
    """
    Policy network with one hidden layer and a single actor head.

    Maps an observation vector to a probability distribution over the
    discrete actions. The instance also carries three plain Python lists
    used as per-episode buffers; the training code fills them during a
    rollout and empties them after each update.
    """
    def __init__(self, state_shape=4, action_size=2):
        super().__init__()
        # hidden layer: observation -> 128 features
        self.affine1 = nn.Linear(state_shape, 128)
        # actor head: hidden features -> one score per action
        self.action_head = nn.Linear(128, action_size)
        # per-episode buffers (log-probs of taken actions, rewards, states)
        self.saved_actions, self.episode_rewards, self.episode_states = [], [], []

    def forward(self, x):
        """
        Return the action probabilities for observation tensor ``x``.

        The softmax is taken over the last dimension, so the entries along
        that dimension sum to one.
        """
        hidden = F.relu(self.affine1(x))
        scores = self.action_head(hidden)
        return F.softmax(scores, dim=-1)


In [215]:
# Scalar constants used by the training code
max_len = 10000  # upper bound for the per-episode step loop
eps = float(np.finfo(np.float32).eps)  # float32 machine epsilon as a Python float

# Policy network sized from the environment, optimised with Adam
model = Policy(state_shape=state_shape, action_size=no_of_actions)
optimizer = optim.Adam(model.parameters(), lr=3e-2)

In [216]:

# def select_action(state):
#     state = torch.from_numpy(state).float()
#     probs = model(state)
#     # create a categorical distribution over the list of probabilities of actions
#     m = Categorical(probs)
#     action = m.sample()
#     model.saved_actions.append(m.log_prob(action))
#     # the action to take (left or right)
#     return action.item()


# def finish_episode(Returns,state_number):
#     policy_losses = [] # list to save actor (policy) loss
#     # calculate the true value using rewards returned from the environment  
#     R = torch.tensor(Returns)
#     log_prob = model.saved_actions[state_number]
#     policy_losses.append(-torch.mul(R,log_prob))
#     print(policy_losses)
#     optimizer.zero_grad()
#     # sum up all the values of policy_losses and value_losses
#     torch.autograd.set_detect_anomaly(True)
#     log_prob.backward()
#     optimizer.step()
    


# def train():
#     running_reward = 10
#     # run infinitely many episodes
#     for i_episode in range(2000):
#         # reset environment and episode reward
#         state, _ = env.reset()
#         ep_reward = 0
#         # for each episode, only run 9999 steps so that we don't
#         # infinite loop while learning
#         for t in range(1, max_len):
#             # select action from policy
#             action = select_action(state)
#             state, reward, done, _, _ = env.step(action)
#             model.episode_states.append(state)
#             model.episode_rewards.append(reward)
#             ep_reward += reward
#             if done:
#                 break
            
#         for i in range(len(model.saved_actions)):
#             R = 0
#             for r in model.episode_rewards[i::-1]:
#                 # calculate the discounted value
#                 R = r + gamma * R
#             finish_episode(Returns=R, state_number= i)
#         del model.saved_actions[:]
#         del model.episode_rewards[:]
#         del model.episode_states[:]
            
#         running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

#         print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
#                   i_episode, ep_reward, running_reward))
#         # check if we have "solved" the cart pole problem
#         if running_reward > env.spec.reward_threshold:
#             print("Solved! Running reward is now {} and "
#                   "the last episode runs to {} time steps!".format(running_reward, t))
#             break

Correct Monte Carlo (MC) REINFORCE with no baseline

In [217]:
# def select_action(state):
#     state = torch.from_numpy(state).float()
#     probs = model(state)
#     m = Categorical(probs)
#     action = m.sample()
#     model.saved_actions.append(m.log_prob(action))
#     return action.item()


# def finish_episode():
#     policy_losses = []
#     returns = []
#     # calculate the returns for each time step
#     G = 0
#     for r in model.episode_rewards[::-1]:
#         G = gamma * G + r
#         returns.insert(0, G)
#     returns = torch.tensor(returns)
#     returns = (returns - returns.mean()) / (returns.std() + eps)
#     # calculate the loss and update the model
#     for log_prob, R in zip(model.saved_actions, returns):
#         policy_losses.append(-log_prob * R)
#     optimizer.zero_grad()
#     loss = torch.stack(policy_losses).sum()
#     loss.backward()
#     optimizer.step()
#     # clear the saved actions and rewards for the next episode
#     del model.saved_actions[:]
#     del model.episode_rewards[:]
#     del model.episode_states[:]


# def train():
#     running_reward = 10
#     for i_episode in range(2000):
#         state,_ = env.reset()
#         ep_reward = 0
#         for t in range(1, max_len):
#             action = select_action(state)
#             state, reward, done, _, _ = env.step(action)
#             model.episode_states.append(state)
#             model.episode_rewards.append(reward)
#             ep_reward += reward
#             if done:
#                 break
#         finish_episode()
#         running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
#         print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
#             i_episode, ep_reward, running_reward))
#         if running_reward > env.spec.reward_threshold:
#             print("Solved! Running reward is now {} and "
#                   "the last episode runs to {} time steps!".format(running_reward, t))
#             break

REINFORCE with baseline (returns are standardized per episode)

In [None]:
def select_action(state):
    """
    Sample an action for ``state`` from the global ``model``.

    The NumPy observation is converted to a float tensor and fed through
    ``model``; an action is drawn from the resulting categorical
    distribution, and the log-probability of that action is appended to
    ``model.saved_actions`` for the later policy-gradient update.

    Returns the sampled action as a plain Python number (``.item()``).
    """
    observation = torch.from_numpy(state).float()
    policy_dist = Categorical(model(observation))
    sampled = policy_dist.sample()
    # keep log pi(a|s); finish_episode builds the loss from these entries
    model.saved_actions.append(policy_dist.log_prob(sampled))
    return sampled.item()


def finish_episode():
    """
    Apply one REINFORCE update using the episode buffered on the global ``model``.

    Reads ``model.episode_rewards`` and ``model.saved_actions`` (the
    log-probabilities appended by ``select_action``), computes the discounted
    return of every step with the global ``gamma``, standardizes the returns,
    and takes one ``optimizer`` step on ``sum_t -log_prob_t * return_t``.

    The three episode buffers on ``model`` are always emptied before this
    function returns, even when the update raises, so a failed call cannot
    leak steps into the next episode. An episode with no recorded actions or
    rewards is a no-op (apart from clearing the buffers).
    """
    try:
        if not model.saved_actions or not model.episode_rewards:
            # Nothing to learn from; torch.stack([]) below would raise.
            return

        # G_t = r_t + gamma * G_{t+1}: accumulate back to front, then flip once.
        # append + a single reverse avoids the quadratic cost of insert(0, ...).
        returns = []
        G = 0.0
        for r in reversed(model.episode_rewards):
            G = gamma * G + r
            returns.append(G)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)

        # Standardize the returns. The sample std of a single value is NaN,
        # which would poison the loss and the weights, so a one-step episode
        # uses a spread of 0 (its centred return is then simply 0).
        spread = returns.std() if returns.numel() > 1 else 0.0
        returns = (returns - returns.mean()) / (spread + eps)

        # calculate the loss and update the model
        policy_losses = [-log_prob * R
                         for log_prob, R in zip(model.saved_actions, returns)]
        optimizer.zero_grad()
        loss = torch.stack(policy_losses).sum()
        loss.backward()
        optimizer.step()
    finally:
        # clear the saved actions, rewards and states for the next episode
        del model.saved_actions[:]
        del model.episode_rewards[:]
        del model.episode_states[:]


def train(num_episodes=2000):
    """
    Train the global ``model`` on the global ``env`` with REINFORCE.

    Runs up to ``num_episodes`` episodes. Each episode samples actions with
    ``select_action`` for fewer than ``max_len`` steps, records the rewards on
    ``model``, and then calls ``finish_episode`` for one policy update. An
    exponential moving average of the episode reward is tracked, and training
    stops early once it exceeds ``env.spec.reward_threshold``.

    Args:
        num_episodes: maximum number of episodes to run (default 2000).
    """
    running_reward = 10
    for i_episode in range(num_episodes):
        state, _ = env.reset()
        ep_reward = 0
        for t in range(1, max_len):
            action = select_action(state)
            state, reward, terminated, truncated, _ = env.step(action)
            # note: this buffers the observation returned by the step
            model.episode_states.append(state)
            model.episode_rewards.append(reward)
            ep_reward += reward
            # An episode ends on termination OR truncation (e.g. a time
            # limit); stepping on after either signal is outside the env API.
            if terminated or truncated:
                break
        finish_episode()
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        # log every `log_interval` episodes instead of flooding the output
        if i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break

In [218]:
# Run training; train() stops early once the running reward exceeds
# env.spec.reward_threshold.
train()

[tensor(0.6867, grad_fn=<NegBackward0>)]
[tensor(1.3261, grad_fn=<NegBackward0>)]


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [128, 2]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!