In [1]:
# As usual, a bit of setup
from __future__ import print_function
import time
import numpy as np
import matplotlib.pyplot as plt
import platform
import gym
import pickle as pickle
import pympler

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.distributions import Categorical
import gc

# For advanced computer vision
import cv2

# For memory tracking
from pympler import summary
from pympler import muppy

cpu_dtype = torch.FloatTensor
gpu_dtype = torch.cuda.FloatTensor

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

print("Python version: ", platform.python_version())

Python version:  3.6.4


In [63]:
# The function approximator of the Policy is a 2 layer NN. 
# - The policy takes in the state of Pong, which is a resampled 40x40 image
# - The action is the softmax output (Left or Right)
# - there are 200 hidden units in the NN
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(6400, 200)
        self.affine2 = nn.Linear(200, 2)  
        
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        x = F.relu(self.affine1(x))
        action_scores = self.affine2(x)
        return action_scores


# Debug Underperformance - (1) Change downsampling
def prepro80(I):
    """ 
    prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector 
    """
    I = I[35:195] # crop
    I = I[::2,::2,0] # downsample by factor of 2
    I[I == 144] = 0 # erase background (background type 1)
    I[I == 109] = 0 # erase background (background type 2)
    I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    return I.astype(np.float).ravel()

rewards = []
log_probs = []
prev_x = None
env = gym.make("Pong-v0")

# Call an instance of the Policy and select ADAM as update rule
policy = Policy().cuda()

state = env.reset()

for i in range(3):
    # Downsample 210x160x3 frame into 6400 (80x80) 1D float vector
    cur_x = prepro80(state)
    state = cur_x - prev_x if prev_x is not None else np.zeros(6400)
    prev_x = cur_x

    state = torch.from_numpy(state).float().unsqueeze(0)

    logit = policy(Variable(state.type(gpu_dtype), requires_grad=False))
    print("Logit: ", logit)
    prob = F.softmax(logit, dim=1)
    print("Prob: ", prob)
    log_prob = F.log_softmax(logit, dim=1)
    print("Log_prob: ", log_prob)
       
    action = prob.multinomial().data
    print("Action: ", action)
    state, reward, done, _ = env.step(action)
    rewards.append(reward)
    
    log_prob_action = log_prob.gather(1, Variable(action))
    print("Log_prob: ", log_prob_action)
    log_probs.append(log_prob_action)


print("Rewards: ", rewards)
print("Log Probs: ", log_probs)



Logit:  Variable containing:
1.00000e-02 *
 -3.5490  3.8195
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

Prob:  Variable containing:
 0.4816  0.5184
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

Log_prob:  Variable containing:
-0.7307 -0.6570
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

Action:  
 0
[torch.cuda.LongTensor of size 1x1 (GPU 0)]

Log_prob:  Variable containing:
-0.7307
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

Logit:  Variable containing:
1.00000e-02 *
 -3.5821  4.2976
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

Prob:  Variable containing:
 0.4803  0.5197
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

Log_prob:  Variable containing:
-0.7333 -0.6545
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

Action:  
 1
[torch.cuda.LongTensor of size 1x1 (GPU 0)]

Log_prob:  Variable containing:
-0.6545
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

Logit:  Variable containing:
1.00000e-02 *
 -3.5490  3.8195
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

Prob:  Variable containing

Variable containing:
1.00000e-02 *
  1.9248 -3.7285
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

Variable containing:
 0.5141  0.4859
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

Variable containing:
-0.6653 -0.7218
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

Action: 
 0
[torch.cuda.LongTensor of size 1x1 (GPU 0)]

[0.0, 0.0, 0.0]
Variable containing:
-0.6653
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

[Variable containing:
-0.7218
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]
, Variable containing:
-0.7219
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]
, Variable containing:
-0.6653
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]
]


In [53]:
t = torch.Tensor([1,2])
print(t)
t.gather(1,action)


 1
 2
[torch.FloatTensor of size 2]



TypeError: gather received an invalid combination of arguments - got ([32;1mint[0m, [31;1mtorch.cuda.LongTensor[0m), but expected (int dim, torch.LongTensor index)