In [1]:
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

In [2]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

<h2>Use CUDA</h2>

In [3]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

<h2>Create Environments</h2>

In [4]:
from common.multiprocessing_env import SubprocVecEnv

# Number of environment factories built below and handed to SubprocVecEnv.
num_envs = 2
# Environment id passed to gym.make, both inside make_env and for the single `env`.
env_name = "SpaceInvaders-v4"

def make_env():
    """Build a zero-argument factory for the environment named by ``env_name``.

    Nothing is constructed here; the environment is only created when the
    returned callable is invoked.
    """
    def _thunk():
        # `env_name` is looked up when the factory is called, not when it is built.
        return gym.make(env_name)

    return _thunk

# One factory per worker; SubprocVecEnv receives the list of factories.
env_fns = [make_env() for _ in range(num_envs)]
envs = SubprocVecEnv(env_fns)

# A separate, single environment used to read the observation/action spaces
# (and rendered/reset directly in later cells).
env = gym.make(env_name)

# Observation shape of the single environment.
frame_size = env.observation_space.shape
# Last axis of the observation shape is used as the network's input channel count.
num_inputs = frame_size[2]
# Read the number of discrete actions from the environment instead of
# hard-coding 6, so the network head follows `env_name` if it is changed.
num_outputs = env.action_space.n




Process Process-1:
Process Process-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/luke/Documents/RL/common/multiprocessing_env.py", line 11, in worker
    cmd, data = remote.recv()
  File "/home/luke/Documents/RL/common/multiprocessing_env.py", line 11, in worker
    cmd, data = remote.recv()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.5/m

In [6]:
# Display the observation shape read from env.observation_space.
frame_size

(210, 160, 3)

<h2>Neural Network</h2>

In [7]:
def init_weights(m):
    """Initialise ``nn.Linear`` layers in place; leave every other module untouched.

    Written to be passed to ``nn.Module.apply``, which calls it once for each
    sub-module.

    Args:
        m: Any module. Only instances of ``nn.Linear`` are modified.

    Linear weights are drawn from a normal distribution with mean 0 and
    standard deviation 0.1. The bias, when the layer has one, is filled with
    the constant 0.1.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.normal_(m.weight, mean=0., std=0.1)
    # A layer built with bias=False has `bias is None`; skip it instead of crashing.
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.1)
        

class PPO(nn.Module):
    """Convolutional network: four Conv2d+BatchNorm+ReLU stages and one linear head.

    The same class is used for any output width; it simply maps a batch of
    image tensors to ``num_outputs`` values per sample.

    Args:
        num_inputs: Number of channels of the input images (in_channels of the
            first convolution).
        num_outputs: Width of the final linear layer.
        std: Accepted for interface compatibility; not used by this class.
        input_hw: ``(height, width)`` of the input images. It is only used to
            size the linear head. The default ``(210, 160)`` yields the
            2112 input features the head was originally hard-coded with.

    Raises:
        ValueError: If ``input_hw`` is too small to survive the four
            un-padded convolutions.
    """

    def __init__(self, num_inputs, num_outputs, std=0.0, input_hw=(210, 160)):
        super(PPO, self).__init__()

        # Use the caller's channel count rather than a fixed 3.
        self.conv1 = nn.Conv2d(num_inputs, 16, kernel_size=5, stride=3)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)
        self.conv4 = nn.Conv2d(32, 32, kernel_size=5, stride=1)
        self.bn4 = nn.BatchNorm2d(32)
        self.head = nn.Linear(self._flat_features(input_hw), num_outputs)

        self.apply(init_weights)

    def _flat_features(self, input_hw):
        """Return the flattened feature count after conv4 for images of size ``input_hw``."""
        height, width = input_hw
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            # No padding or dilation is configured, so out = floor((in - k) / s) + 1.
            height = (height - conv.kernel_size[0]) // conv.stride[0] + 1
            width = (width - conv.kernel_size[1]) // conv.stride[1] + 1
            if height <= 0 or width <= 0:
                raise ValueError(
                    "input_hw={} is too small for the convolution stack".format(
                        tuple(input_hw)
                    )
                )
        return self.conv4.out_channels * height * width

    def forward(self, x):
        """Map a batch of images ``(N, num_inputs, H, W)`` to ``(N, num_outputs)``."""
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        # Flatten everything except the batch axis before the linear head.
        x = self.head(x.view(x.size(0), -1))
        return x

In [8]:

# Hyper params:
hidden_size      = 256   # NOTE(review): not used by the PPO class as defined in this notebook
lr               = 3e-4  # learning rate passed to Adam below
num_steps        = 20
mini_batch_size  = 5
ppo_epochs       = 4
threshold_reward = -200

# Two independent networks of the same architecture: the actor emits one value
# per action, the critic emits a single value.
Actor = PPO(num_inputs, num_outputs).to(device)
Critic = PPO(num_inputs, 1).to(device)

# Hand BOTH networks' parameters to the optimizer. Passing only
# Actor.parameters() would leave the Critic's weights frozen at their
# initial values no matter what loss is back-propagated.
optimizer = optim.Adam(
    list(Actor.parameters()) + list(Critic.parameters()),
    lr=lr,
)

In [21]:
# Reset every worker, move the channel axis from last to second position
# (N, H, W, C) -> (N, C, H, W), and load the result as a float tensor on `device`.
state = torch.FloatTensor(np.transpose(envs.reset(), (0, 3, 1, 2))).to(device)
state.size()

torch.Size([2, 3, 210, 160])

In [16]:
# Rollout buffers and running totals; the lists are created here but not
# appended to in this cell.
log_probs = []
values    = []
states    = []
actions   = []
masks     = []
entropy = 0
done = 0
rewards = 0

# A single environment step. These statements used to be indented beneath a
# commented-out `while not done:` header, which Python rejects with
# "IndentationError: unexpected indent"; they now run at top level.
# NOTE(review): `state` is not advanced to `next_state` here.
mu = Actor(state)
value = Critic(state)
std = 0
log_std = nn.Parameter(torch.ones(1, num_outputs) * std)  # all zeros
std   = log_std.exp().expand_as(mu)                        # exp(0) == 1 everywhere
dist  = Normal(mu, std)

action = dist.sample()
# For every row, the index of the largest sampled component is the discrete
# action sent to the environments.
action_val, action_indx = action.max(1)
next_state, reward, done, _ = envs.step(action_indx.cpu().numpy())
rewards += reward
log_prob = dist.log_prob(action)
entropy += dist.entropy().mean()

# Renders the separate single `env`, not the vectorised `envs`.
env.render()

In [24]:
# Draw one random action from the vectorised environments' action space and display it.
envs.action_space.sample()

2

In [15]:
# Reset the single environment; the returned observation is shown as the cell output.
env.reset()

array([[[ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0],
        ...,
        [ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0]],

       [[ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0],
        ...,
        [ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0]],

       [[ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0],
        ...,
        [ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0]],

       ...,

       [[80, 89, 22],
        [80, 89, 22],
        [80, 89, 22],
        ...,
        [80, 89, 22],
        [80, 89, 22],
        [80, 89, 22]],

       [[80, 89, 22],
        [80, 89, 22],
        [80, 89, 22],
        ...,
        [80, 89, 22],
        [80, 89, 22],
        [80, 89, 22]],

       [[80, 89, 22],
        [80, 89, 22],
        [80, 89, 22],
        ...,
        [80, 89, 22],
        [80, 89, 22],
        [80, 89, 22]]], dtype=uint8)

In [11]:
envs.reset()
# One independently sampled random action per environment, then step them all.
action = [envs.action_space.sample() for _ in range(num_envs)]
envs.step(action)

(array([[[[ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0,  0],
          ...,
          [ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0,  0]],
 
         [[ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0,  0],
          ...,
          [ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0,  0]],
 
         [[ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0,  0],
          ...,
          [ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0,  0]],
 
         ...,
 
         [[80, 89, 22],
          [80, 89, 22],
          [80, 89, 22],
          ...,
          [80, 89, 22],
          [80, 89, 22],
          [80, 89, 22]],
 
         [[80, 89, 22],
          [80, 89, 22],
          [80, 89, 22],
          ...,
          [80, 89, 22],
          [80, 89, 22],
          [80, 89, 22]],
 
         [[80, 89, 22],
          [80, 89, 22],
          [80, 89, 22],
          ...,
          [80, 89, 22],
          [80, 89, 22],
          [80, 89, 22]]],
 
 
        [

In [7]:
# Display the current list of sampled actions (one entry per environment).
action

[4, 5]

In [8]:
# Step the vectorised environments with the actions in `action` and display what step returns.
envs.step(action)

(array([[[[ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0,  0],
          ...,
          [ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0,  0]],
 
         [[ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0,  0],
          ...,
          [ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0,  0]],
 
         [[ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0,  0],
          ...,
          [ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0,  0]],
 
         ...,
 
         [[80, 89, 22],
          [80, 89, 22],
          [80, 89, 22],
          ...,
          [80, 89, 22],
          [80, 89, 22],
          [80, 89, 22]],
 
         [[80, 89, 22],
          [80, 89, 22],
          [80, 89, 22],
          ...,
          [80, 89, 22],
          [80, 89, 22],
          [80, 89, 22]],
 
         [[80, 89, 22],
          [80, 89, 22],
          [80, 89, 22],
          ...,
          [80, 89, 22],
          [80, 89, 22],
          [80, 89, 22]]],
 
 
        [