# ECE 276C HW3 P1
Mingwei Xu A53270271

In [1]:
import gym
import numpy as np
from matplotlib import pyplot as plt

import torch
from torch import nn, optim
from torch.distributions import Categorical

## Define Network

In [2]:
class PolicyNetwork(nn.Module):
    """
    Policy Network using MLP
    """
    def __init__(self, env):
        """
        :param env: object, gym environment
        """
        super(PolicyNetwork, self).__init__()
        # get state space and action space dimension
        self.state_space_n = env.observation_space.shape[0]
        self.action_space_n = env.action_space.n

        # define layers
        self.l1 = nn.Linear(self.state_space_n, 64)
        self.dropout = nn.Dropout(p=0.6)
        self.l2 = nn.Linear(64, self.action_space_n)

    def forward(self, x):
        """
        Feed forward
        
        :param x: np array, state
        :return: tensor, softmax probability of action
        """
        # build neural network
        network = nn.Sequential(
            self.l1,
            self.dropout,
            self.l2,
            nn.Softmax(dim=-1))
        return network(torch.FloatTensor(x))

## Setup

In [3]:
env = gym.make('CartPole-v1')
state = env.reset()
policy_network = PolicyNetwork(env)
policy_network.forward(state)   # TODO: test
policy_network.parameters()

<generator object Module.parameters at 0x1112689a8>

## Choose Action

In [4]:
def choose_action(policy_network, state):
    """
    Choose action according to policy on given state

    :param policy_network: object, policy network
    :param state: np array, state
    :returns: int, action; tensor, log probability
    """
    probs = policy_network.forward(state)
    m = Categorical(probs)
    action = m.sample()
    log_prob = m.log_prob(action)
    return action.item(), log_prob

In [5]:
choose_action(policy_network, state)    # TODO: test

(0, tensor(-0.7439, grad_fn=<SqueezeBackward1>))

## Question 1

In [6]:
def reinforce(env, policy_network, batch_size=500, num_episodes=200, lr=0.01, gamma=0.99):
    """
    Policy gradient training using reinforce method

    :param env: object, gym environment
    :param policy_network: object, policy network
    :param batch_size: int, batch size
    :param num_episodes: int, number of episodes
    :param lr: float, learning rate
    :param gamma: float (0~1), discount factor
    """
    # TODO: setup place holders
    batch_loss_sum = 0  # sum of batch loss over episodes

    # define optimizer
    optimizer = optim.Adam(policy_network.parameters(), 
                           lr=lr)

    # train
    for i in range(num_episodes):
        # setup placeholders
        batch_states = []
        batch_actions = []
        batch_rewards = []
        log_prob_list = []

        # reset environment
        state = env.reset()
        done = False

        for j in range(batch_size):
            # step
            action, log_prob = choose_action(policy_network, state)
            state_next, reward, done, _ = env.step(action)

            # store data
            batch_states.append(state)
            batch_actions.append(action)
            batch_rewards.append(reward)
            log_prob_list.append(log_prob)

            # move on
            if done:
                state = env.reset()
            else:
                state = state_next

        # finish batch
        batch_log_prob_sum = sum(log_prob_list)     # log likelihood sum, this is tensor
        batch_discounted_return = sum(torch.FloatTensor(np.array(batch_rewards) *
                                                        np.array([gamma ** t for t in range(1, len(batch_rewards) + 1)])))  # G(t), this is tensor
        batch_loss = batch_discounted_return * batch_log_prob_sum   # calculate batch loss
        # batch_loss_sum += batch_loss    # update batch loss sum over episodes
        # loss = batch_loss_sum / (i + 1)
        loss = batch_loss
        print('Episode {} loss: {}, average reward: {}'.format(i, loss.item(), np.mean(batch_rewards)))

        # update policy
        loss.backward(retain_graph=True)
        optimizer.zero_grad()
        optimizer.step()
        

    return

In [7]:
reinforce(env, policy_network, batch_size=500, num_episodes=200, gamma=0.99)

Episode 0 loss: -32888.05859375, average reward: 1.0
Episode 1 loss: -32674.974609375, average reward: 1.0
Episode 2 loss: -32832.49609375, average reward: 1.0
Episode 3 loss: -33039.40234375, average reward: 1.0
Episode 4 loss: -33060.96875, average reward: 1.0
Episode 5 loss: -33032.546875, average reward: 1.0
Episode 6 loss: -33053.53125, average reward: 1.0
Episode 7 loss: -33065.30859375, average reward: 1.0
Episode 8 loss: -33031.04296875, average reward: 1.0
Episode 9 loss: -33104.64453125, average reward: 1.0
Episode 10 loss: -33085.28515625, average reward: 1.0
Episode 11 loss: -33068.640625, average reward: 1.0
Episode 12 loss: -33035.86328125, average reward: 1.0
Episode 13 loss: -33041.13671875, average reward: 1.0
Episode 14 loss: -33056.3203125, average reward: 1.0
Episode 15 loss: -33114.8046875, average reward: 1.0
Episode 16 loss: -33082.13671875, average reward: 1.0
Episode 17 loss: -33075.76171875, average reward: 1.0
Episode 18 loss: -33095.6640625, average reward: 

KeyboardInterrupt: 