# ECE 276C HW3 P2
Mingwei Xu A53270271

In [1]:
import pickle
import gym
import pybulletgym.envs
import numpy as np
from matplotlib import pyplot as plt

import torch
from torch import nn, optim
from torch.distributions import MultivariateNormal

## Setup CUDA

In [2]:
# Select the compute device: prefer CUDA when torch reports it as available,
# otherwise fall back to the CPU. When running on CUDA, also report the name
# of GPU 0.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('Using device :', device)
    print(torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('Using device :', device)

Using device : cuda
GeForce GTX 1080


## Define Network

In [3]:
class PolicyNetwork(nn.Module):
    """
    Gaussian policy network using an MLP

    The MLP maps a state to a tanh-bounded vector that callers use as the mean
    of the action distribution. The learnable parameter ``sigma`` (square,
    action_dim x action_dim) holds the raw covariance values; turning it into
    a valid covariance matrix is left to the caller.
    """
    def __init__(self, env):
        """
        :param env: object, gym environment (only ``observation_space.shape``
                    and ``action_space.shape`` are read)
        """
        super(PolicyNetwork, self).__init__()
        # get state space and action space dimension
        # The reported observation size is reduced by one; should be 8
        # (TODO: bug in env showing wrong observation space?)
        self.state_space_n = env.observation_space.shape[0] - 1
        self.action_space_n = env.action_space.shape[0]   # should be 2

        # define layers
        self.l1 = nn.Linear(self.state_space_n, 128)
        self.l2 = nn.Linear(128, 64)
        self.l3 = nn.Linear(64, self.action_space_n)

        # initialize cov matrix (with grad fn) as identity, sized from the
        # action space instead of a hard-coded 2
        self.sigma = nn.Parameter(torch.eye(self.action_space_n))

    def forward(self, x):
        """
        Feed forward

        :param x: np array (or list / tensor), state
        :return: tensor, tanh-bounded output in [-1, 1] used as action mean
        """
        # TODO: take sigma as input
        # Place the input on whatever device the layers live on, so the module
        # does not depend on a notebook-level global.
        x = torch.as_tensor(x, dtype=torch.float32, device=self.l1.weight.device)

        # Apply the layers directly instead of rebuilding a Sequential
        # container on every call.
        hidden = torch.tanh(self.l1(x))
        hidden = torch.tanh(self.l2(hidden))
        return torch.tanh(self.l3(hidden))

## Choose Action

In [4]:
def choose_action(policy_network, state):
    """
    Choose action according to policy on given state

    The action is sampled from a multivariate normal whose mean is the policy
    network output and whose covariance is a diagonal matrix built from the
    diagonal of ``policy_network.sigma``.

    :param policy_network: object, policy network (callable on a state, with a
                           square ``sigma`` tensor attribute)
    :param state: np array, state
    :returns: list (len = action dimension), action; tensor with grad fn, log probability
    """
    mean = policy_network(state)   # mean from policy network output

    # Build a covariance that is positive definite by construction: take the
    # absolute value of the diagonal entries only and add a small floor.
    # Applying abs() + 1e-3 to every element would also shift the
    # off-diagonals and yields a singular matrix once the diagonal shrinks
    # to zero.
    variances = torch.abs(torch.diagonal(policy_network.sigma)) + 1e-3
    cov = torch.diag(variances)

    m = MultivariateNormal(mean, covariance_matrix=cov)
    action = m.sample()
    log_prob = m.log_prob(action)

    return action.tolist(), log_prob

## Question 2

In [5]:
def reinforce(env, policy_network, batch_size=500, num_episodes=200, lr=0.01, gamma=0.99):
    """
    Policy gradient training using reinforce method

    Each iteration collects ``batch_size`` environment steps. Whenever a
    trajectory ends (or the batch runs out of steps, in which case the
    truncated trajectory is treated like a finished one), its discounted
    return G = sum_t gamma^t * r_t (t starting at 0) is multiplied by the sum
    of its action log probabilities. The loss is the negative mean of these
    products over the trajectories in the batch.

    :param env: object, gym environment
    :param policy_network: object, policy network
    :param batch_size: int, batch size (number of environment steps per update), must be >= 1
    :param num_episodes: int, number of episodes (policy updates)
    :param lr: float, learning rate
    :param gamma: float (0~1), discount factor
    :return: list of average reward on each episode
    :raises ValueError: if batch_size is smaller than 1
    """
    if batch_size < 1:
        # an empty batch has no trajectory to average over
        raise ValueError('batch_size must be at least 1')

    # setup place holders
    average_reward_list = []  # average undiscounted trajectory reward per episode

    # define optimizer
    optimizer = optim.Adam(policy_network.parameters(), lr=lr)

    # train
    for i in range(num_episodes):
        # setup placeholders for each batch
        batch_loss_sum = 0
        batch_traj_counter = 0
        batch_rewards = []

        # setup placeholders for each trajectory
        traj_rewards = []
        traj_log_prob_sum = 0

        # reset environment
        state = env.reset()

        # batch
        for step in range(batch_size):
            # exploration
            action, log_prob = choose_action(policy_network, state)
            state_next, reward, done, _ = env.step(action)
            # env.render()

            # store data
            traj_rewards.append(reward)
            traj_log_prob_sum += log_prob

            # move on
            if done or (step == batch_size - 1):
                # trajectory or batch finished, update trajectory
                # Discount exponents start at 0 (gamma^0 for the first reward),
                # matching the reward-to-go convention used in
                # reinforce_with_baseline.
                rewards = torch.tensor(traj_rewards, dtype=torch.float32, device=log_prob.device)
                discounts = gamma ** torch.arange(len(traj_rewards), dtype=torch.float32,
                                                  device=log_prob.device)
                traj_discounted_return = torch.sum(rewards * discounts)  # G(tau)
                batch_loss_sum += traj_discounted_return * traj_log_prob_sum

                # reset state
                batch_rewards.append(np.sum(traj_rewards))
                batch_traj_counter += 1

                traj_rewards = []
                traj_log_prob_sum = 0

                state = env.reset()
            else:
                state = state_next

        # finish batch
        average_batch_reward = np.mean(batch_rewards)
        average_reward_list.append(average_batch_reward)
        # negate: the optimizer minimizes, while the objective is maximized
        loss = - batch_loss_sum / batch_traj_counter

        print('Episode [{}/{}] loss: {:.2f}, average reward: {:.2f}, trajectory num: {}'.format(i + 1, num_episodes,
                               loss.item(), average_batch_reward, batch_traj_counter))

        # update policy
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return average_reward_list

In [6]:
def reinforce_with_baseline(env, policy_network, batch_size=500, num_episodes=200, lr=0.01, gamma=0.99, enable_baseline=False):
    """
    Policy gradient training using modified reinforce method with baseline

    Each iteration collects ``batch_size`` environment steps. For every
    trajectory (a trajectory cut off by the end of the batch is treated like a
    finished one) the reward-to-go G_t = sum_{t' >= t} gamma^(t' - t) * r_t'
    weights the log probability of the action taken at step t. With
    ``enable_baseline`` the mean of the trajectory's reward-to-go values is
    subtracted first. The loss is the negative sum of these weighted log
    probabilities, averaged over the trajectories in the batch.

    :param env: object, gym environment
    :param policy_network: object, policy network
    :param batch_size: int, batch size (number of environment steps per update), must be >= 1
    :param num_episodes: int, number of episodes (policy updates)
    :param lr: float, learning rate
    :param gamma: float (0~1), discount factor
    :param enable_baseline: bool, flag to enable baseline, defaults to False
    :return: list of average reward on each episode, list of average trajectory length on each episode
    :raises ValueError: if batch_size is smaller than 1
    """
    if batch_size < 1:
        # an empty batch has no trajectory to average over
        raise ValueError('batch_size must be at least 1')

    # setup place holders
    average_reward_list = []  # average undiscounted trajectory reward per episode
    average_step_list = []    # average trajectory length per episode

    # define optimizer
    optimizer = optim.Adam(policy_network.parameters(), lr=lr)

    # train
    for i in range(num_episodes):
        # setup placeholders for each batch
        batch_loss_sum = 0
        batch_traj_counter = 0
        batch_rewards = []
        batch_traj_steps = []

        # setup placeholders for each trajectory
        traj_rewards = []
        traj_log_prob_list = []
        traj_step_counter = 0

        # reset environment
        state = env.reset()

        # batch
        for step in range(batch_size):
            # exploration
            action, log_prob = choose_action(policy_network, state)
            state_next, reward, done, _ = env.step(action)
            traj_step_counter += 1
            # env.render()

            # store data
            traj_rewards.append(reward)
            traj_log_prob_list.append(log_prob)

            # move on
            if done or (step == batch_size - 1):
                # trajectory or batch finished, update trajectory
                # Reward-to-go in one backward pass: G_t = r_t + gamma * G_{t+1}.
                # This is O(T) instead of re-summing the tail for every t.
                running_return = 0.0
                discounted_return_list = []
                for step_reward in reversed(traj_rewards):
                    running_return = step_reward + gamma * running_return
                    discounted_return_list.append(running_return)
                discounted_return_list.reverse()
                discounted_return_list = np.asarray(discounted_return_list, dtype=np.float64)

                if enable_baseline:
                    # subtract the mean reward-to-go of this trajectory
                    discounted_return_list = discounted_return_list - np.mean(discounted_return_list)

                # Stack the log probabilities so the gradient graph is kept,
                # then weight them by the (constant) returns in one product.
                log_probs = torch.stack(traj_log_prob_list)
                returns = torch.as_tensor(discounted_return_list, dtype=log_probs.dtype,
                                          device=log_probs.device)
                batch_loss_sum += torch.sum(log_probs * returns)

                # reset state
                batch_rewards.append(np.sum(traj_rewards))
                batch_traj_counter += 1
                batch_traj_steps.append(traj_step_counter)

                traj_step_counter = 0
                traj_rewards = []
                traj_log_prob_list = []

                state = env.reset()
            else:
                state = state_next

        # finish batch
        average_batch_reward = np.mean(batch_rewards)
        average_reward_list.append(average_batch_reward)
        average_step_list.append(np.mean(batch_traj_steps))
        # negate: the optimizer minimizes, while the objective is maximized
        loss = - batch_loss_sum / batch_traj_counter    # TODO

        print('TODO: sigma: ', policy_network.sigma)    # TODO
        print('Episode [{}/{}] loss: {:.2f}, average reward: {:.2f}, trajectory num: {}'.format(i + 1, num_episodes,
                               loss.item(), average_batch_reward, batch_traj_counter))

        # update policy
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return average_reward_list, average_step_list

### Training

In [None]:
# Create the reacher environment with rand_init disabled
env = gym.make('modified_gym_env:ReacherPyBulletEnv-v1', rand_init=False)
# env.render()

# TODO: test -- print the initial observation as a sanity check
state = env.reset()
print(state)

# Build the policy network, move it to the selected device, then train it
# using REINFORCE with the baseline enabled
policy_network = PolicyNetwork(env)
policy_network = policy_network.to(device)
average_reward_list, average_step_list = reinforce_with_baseline(
    env=env,
    policy_network=policy_network,
    batch_size=2000,
    num_episodes=500,
    lr=0.01,
    gamma=0.9,
    enable_baseline=True,
)

current_dir=/home/cvv5233/anaconda3/lib/python3.7/site-packages/pybullet_envs/bullet
options= 
[ 0.3928371   0.3928371  -0.68091764  0.26561381  0.5         0.
  0.08333333  0.        ]
TODO: sigma:  Parameter containing:
tensor([[1., 0.],
        [0., 1.]], device='cuda:0', requires_grad=True)
Episode [1/500] loss: -26.79, average reward: -51.07, trajectory num: 15
TODO: sigma:  Parameter containing:
tensor([[0.9900, 0.0000],
        [0.0000, 0.9900]], device='cuda:0', requires_grad=True)
Episode [2/500] loss: -24.32, average reward: -52.08, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[0.9801, 0.0000],
        [0.0000, 0.9800]], device='cuda:0', requires_grad=True)
Episode [3/500] loss: -25.98, average reward: -57.94, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[0.9702, 0.0000],
        [0.0000, 0.9700]], device='cuda:0', requires_grad=True)
Episode [4/500] loss: -23.49, average reward: -59.72, trajectory num: 14
TODO: sigma:  Parameter containing

TODO: sigma:  Parameter containing:
tensor([[0.5958, 0.0000],
        [0.0000, 0.5950]], device='cuda:0', requires_grad=True)
Episode [42/500] loss: -12.11, average reward: -37.85, trajectory num: 15
TODO: sigma:  Parameter containing:
tensor([[0.5858, 0.0000],
        [0.0000, 0.5852]], device='cuda:0', requires_grad=True)
Episode [43/500] loss: -15.19, average reward: -40.85, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[0.5758, 0.0000],
        [0.0000, 0.5755]], device='cuda:0', requires_grad=True)
Episode [44/500] loss: -15.26, average reward: -38.84, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[0.5657, 0.0000],
        [0.0000, 0.5656]], device='cuda:0', requires_grad=True)
Episode [45/500] loss: -15.24, average reward: -34.56, trajectory num: 15
TODO: sigma:  Parameter containing:
tensor([[0.5557, 0.0000],
        [0.0000, 0.5558]], device='cuda:0', requires_grad=True)
Episode [46/500] loss: -14.58, average reward: -37.12, trajectory num: 15


TODO: sigma:  Parameter containing:
tensor([[0.1794, 0.0000],
        [0.0000, 0.1744]], device='cuda:0', requires_grad=True)
Episode [84/500] loss: -7.20, average reward: -27.06, trajectory num: 15
TODO: sigma:  Parameter containing:
tensor([[0.1686, 0.0000],
        [0.0000, 0.1633]], device='cuda:0', requires_grad=True)
Episode [85/500] loss: -1.74, average reward: -25.87, trajectory num: 16
TODO: sigma:  Parameter containing:
tensor([[0.1586, 0.0000],
        [0.0000, 0.1527]], device='cuda:0', requires_grad=True)
Episode [86/500] loss: -2.32, average reward: -28.89, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[0.1488, 0.0000],
        [0.0000, 0.1428]], device='cuda:0', requires_grad=True)
Episode [87/500] loss: -4.94, average reward: -23.05, trajectory num: 17
TODO: sigma:  Parameter containing:
tensor([[0.1388, 0.0000],
        [0.0000, 0.1325]], device='cuda:0', requires_grad=True)
Episode [88/500] loss: -5.75, average reward: -23.17, trajectory num: 17
TODO:

TODO: sigma:  Parameter containing:
tensor([[ 0.0152,  0.0000],
        [ 0.0000, -0.0219]], device='cuda:0', requires_grad=True)
Episode [125/500] loss: -1.76, average reward: -26.37, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[ 0.0119,  0.0000],
        [ 0.0000, -0.0160]], device='cuda:0', requires_grad=True)
Episode [126/500] loss: -0.28, average reward: -25.04, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[ 0.0089,  0.0000],
        [ 0.0000, -0.0101]], device='cuda:0', requires_grad=True)
Episode [127/500] loss: 0.69, average reward: -24.22, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[ 0.0086,  0.0000],
        [ 0.0000, -0.0047]], device='cuda:0', requires_grad=True)
Episode [128/500] loss: -0.38, average reward: -22.94, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[ 0.0066,  0.0000],
        [ 0.0000, -0.0001]], device='cuda:0', requires_grad=True)
Episode [129/500] loss: 1.09, average reward: -10.36, t

TODO: sigma:  Parameter containing:
tensor([[ 0.0282,  0.0000],
        [ 0.0000, -0.0239]], device='cuda:0', requires_grad=True)
Episode [166/500] loss: -0.99, average reward: -9.93, trajectory num: 19
TODO: sigma:  Parameter containing:
tensor([[ 0.0257,  0.0000],
        [ 0.0000, -0.0223]], device='cuda:0', requires_grad=True)
Episode [167/500] loss: -2.06, average reward: -15.20, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[ 0.0223,  0.0000],
        [ 0.0000, -0.0199]], device='cuda:0', requires_grad=True)
Episode [168/500] loss: -1.39, average reward: -15.74, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[ 0.0189,  0.0000],
        [ 0.0000, -0.0170]], device='cuda:0', requires_grad=True)
Episode [169/500] loss: -1.24, average reward: -14.94, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[ 0.0144,  0.0000],
        [ 0.0000, -0.0140]], device='cuda:0', requires_grad=True)
Episode [170/500] loss: -0.22, average reward: -9.04, t

TODO: sigma:  Parameter containing:
tensor([[-0.0042,  0.0000],
        [ 0.0000,  0.0540]], device='cuda:0', requires_grad=True)
Episode [207/500] loss: -0.82, average reward: -25.66, trajectory num: 23
TODO: sigma:  Parameter containing:
tensor([[-0.0051,  0.0000],
        [ 0.0000,  0.0529]], device='cuda:0', requires_grad=True)
Episode [208/500] loss: -0.27, average reward: -44.27, trajectory num: 15
TODO: sigma:  Parameter containing:
tensor([[-0.0073,  0.0000],
        [ 0.0000,  0.0517]], device='cuda:0', requires_grad=True)
Episode [209/500] loss: -2.62, average reward: -47.19, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[-0.0086,  0.0000],
        [ 0.0000,  0.0500]], device='cuda:0', requires_grad=True)
Episode [210/500] loss: -2.21, average reward: -44.61, trajectory num: 15
TODO: sigma:  Parameter containing:
tensor([[-0.0051,  0.0000],
        [ 0.0000,  0.0486]], device='cuda:0', requires_grad=True)
Episode [211/500] loss: 2.59, average reward: -37.51, 

TODO: sigma:  Parameter containing:
tensor([[-0.0455,  0.0000],
        [ 0.0000, -0.0388]], device='cuda:0', requires_grad=True)
Episode [248/500] loss: -1.45, average reward: -37.53, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[-0.0439,  0.0000],
        [ 0.0000, -0.0406]], device='cuda:0', requires_grad=True)
Episode [249/500] loss: -0.48, average reward: -37.80, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[-0.0419,  0.0000],
        [ 0.0000, -0.0426]], device='cuda:0', requires_grad=True)
Episode [250/500] loss: -1.02, average reward: -37.59, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[-0.0400,  0.0000],
        [ 0.0000, -0.0442]], device='cuda:0', requires_grad=True)
Episode [251/500] loss: -1.94, average reward: -38.09, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[-0.0378,  0.0000],
        [ 0.0000, -0.0454]], device='cuda:0', requires_grad=True)
Episode [252/500] loss: -0.36, average reward: -37.39,

TODO: sigma:  Parameter containing:
tensor([[ 0.1028,  0.0000],
        [ 0.0000, -0.0201]], device='cuda:0', requires_grad=True)
Episode [289/500] loss: -0.66, average reward: -37.92, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[ 0.1047,  0.0000],
        [ 0.0000, -0.0211]], device='cuda:0', requires_grad=True)
Episode [290/500] loss: -2.61, average reward: -37.88, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[ 0.1060,  0.0000],
        [ 0.0000, -0.0219]], device='cuda:0', requires_grad=True)
Episode [291/500] loss: -2.18, average reward: -38.69, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[ 0.1070,  0.0000],
        [ 0.0000, -0.0229]], device='cuda:0', requires_grad=True)
Episode [292/500] loss: 1.76, average reward: -38.61, trajectory num: 14
TODO: sigma:  Parameter containing:
tensor([[ 0.1076,  0.0000],
        [ 0.0000, -0.0253]], device='cuda:0', requires_grad=True)
Episode [293/500] loss: -1.06, average reward: -38.16, 

### Plot

In [None]:
%matplotlib inline
plt.figure()
plt.plot(average_reward_list)
plt.xlabel('iteration')
plt.ylabel('average reward')
plt.title('Question 2 Average Rewards at Each Iteration')
plt.savefig('Question_2.png')
plt.show()

plt.figure()
plt.plot(average_step_list)
plt.xlabel('iteration')
plt.ylabel('average step')
plt.title('Question 2 Average Steps at Each Iteration')
plt.savefig('Question_2.png')
plt.show()

### Save Policy

In [None]:
# Pickle the whole policy network object (not just its weights) to disk
with open('p2_policy.pkl', mode='wb') as policy_file:
    pickle.dump(policy_network, policy_file)

## Test Policy

In [None]:
# load policy
# NOTE: unpickling executes arbitrary code; only load files you created yourself
# with open('p2_policy.pkl', 'rb') as pickle_file:
#     policy_network = pickle.load(pickle_file)

# test policy
env = gym.make('modified_gym_env:ReacherPyBulletEnv-v1', rand_init=False)
# env.render()
state = env.reset()
done = False
steps = 0

while not done:
    # Evaluate deterministically: act with the network output (the mean of
    # the action distribution) instead of sampling; no gradients are needed.
    with torch.no_grad():
        action = policy_network(state).tolist()
    # Feed the new observation back into the policy on the next step
    # (previously the policy kept acting on the initial state).
    state, reward, done, _ = env.step(action)
    steps += 1

print('Finished in {} steps'.format(steps))