# ECE 276C HW3 P1
Mingwei Xu A53270271

In [53]:
import logging
import gym
import numpy as np
from matplotlib import pyplot as plt

import torch
from torch import nn
from torch.distributions import Categorical

## Define Network

In [54]:
class PolicyNetwork(nn.Module):
    """
    Policy Network using MLP

    Maps a state to a softmax probability distribution over the discrete
    actions. The layers are created once in ``__init__`` so the weights
    persist across calls to ``forward`` and are exposed via ``parameters()``
    for an optimizer.
    """
    def __init__(self, env):
        """
        :param env: object, gym environment; must provide
                    ``observation_space.shape`` and ``action_space.n``
        """
        super().__init__()

        # get state space and action space dimension
        self.state_space_n = env.observation_space.shape[0]
        self.action_space_n = env.action_space.n

        # Build the network once. Constructing it inside forward() would
        # re-initialise the weights randomly on every call, so nothing
        # could ever be learned.
        # NOTE(review): there is no nonlinearity between the two linear
        # layers -- confirm this is intended.
        self.network = nn.Sequential(
            nn.Linear(self.state_space_n, 64),
            nn.Dropout(p=0.6),
            nn.Linear(64, self.action_space_n),
            nn.Softmax(dim=-1))

    def forward(self, x):
        """
        Feed forward

        Dropout is active while the module is in training mode (the
        default); call ``eval()`` for deterministic outputs.

        :param x: np array, state
        :return: tensor, softmax probability of action
        """
        # as_tensor converts arrays/lists (of any float width) to float32
        return self.network(torch.as_tensor(x, dtype=torch.float32))

## Setup

In [55]:
# create the CartPole-v1 environment and reset it to get the starting state
env = gym.make('CartPole-v1')
state = env.reset()
# build the policy network from the environment's state / action dimensions
policy_network = PolicyNetwork(env)
# sanity check: run one forward pass on the starting state
policy_network.forward(state)   # TODO: test

tensor([0.5326, 0.4674], grad_fn=<SoftmaxBackward>)

## Choose Action

In [59]:
def choose_action(policy_network, state):
    """
    Sample an action from the policy's distribution for the given state

    :param policy_network: object, policy network providing ``forward(state)``
                           that returns action probabilities
    :param state: np array, state
    :returns: int, sampled action; tensor, log probability of that action
    """
    # categorical distribution over actions built from the policy output
    action_dist = Categorical(policy_network.forward(state))
    sampled = action_dist.sample()
    return sampled.item(), action_dist.log_prob(sampled)

In [60]:
# sanity check: sample one action and its log probability for the current state
choose_action(policy_network, state)    # TODO: test

(1, tensor(-0.6499, grad_fn=<SqueezeBackward1>))

## Question 1

In [57]:
def reinforce(env, policy_network, batch_size=500, num_episodes=200, gamma=0.99):
    """
    Policy gradient training using reinforce method (work in progress)

    Currently this only collects one batch of ``batch_size`` environment
    steps, prints the visited states, chosen actions and rewards, and then
    returns. The policy is not updated yet.

    :param env: object, gym environment
    :param policy_network: object, policy network
    :param batch_size: int, number of environment steps collected per batch
    :param num_episodes: int, number of episodes
    :param gamma: float (0~1), discount factor (not used yet)
    """
    # TODO: setup place holders

    for _episode in range(num_episodes):
        # per-batch rollout buffers
        states, actions, rewards, log_probs = [], [], [], []

        # start from a fresh episode
        obs = env.reset()

        for _step in range(batch_size):
            # sample an action from the policy and advance the environment
            action, log_prob = choose_action(policy_network, obs)
            next_obs, reward, done, _info = env.step(action)

            # record the transition (state is the one the action was taken in)
            states.append(obs)
            actions.append(action)
            rewards.append(reward)
            log_probs.append(log_prob)

            # begin a new episode when the current one terminates
            obs = env.reset() if done else next_obs

        print(states)
        print(actions)
        print(rewards)
        # TODO: update policy
        # NOTE(review): this return sits inside the episode loop, so only the
        # first batch is ever collected -- presumably a placeholder until the
        # policy update is written; confirm.
        return

In [58]:
# trial run with a small batch; reinforce currently returns after printing the first batch
reinforce(env, policy_network, batch_size=20, num_episodes=20, gamma=0.99)

