# REINFORCE: Monte Carlo Policy Gradient

In [1]:
""" Monte-Carlo Policy Gradient """

from __future__ import print_function

import os

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable

import pygame
import Maze_Solver as maze_solver
from Maze_Solver import MazeSolver, MazeSolverEnv
import Maze_Generator as maze_generator

pygame 2.1.2 (SDL 2.0.18, Python 3.9.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Use the GPU through CUDA when it is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
class reinforce(nn.Module):
    """Convolutional softmax policy trained with REINFORCE (Monte-Carlo policy gradient).

    The network is three 3x3, stride-1 convolutions (each followed by ReLU), one
    linear layer and a softmax over ``outputs`` actions. ``conv1`` fixes the number
    of input channels to 4, so inputs must have shape ``(batch, 4, h, w)``.
    """

    def __init__(self, h, w, outputs):
        """Build the network.

        Args:
            h: Height of the input observation.
            w: Width of the input observation.
            outputs: Number of discrete actions (size of the softmax output).
        """
        super(reinforce, self).__init__()
        self.conv1 = nn.Conv2d(4, 16, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=3, stride=1)

        # NOTE: the batch-norm layers below are never used in forward(). They are
        # kept only so the module's parameter list / state_dict keys stay the same
        # as before (module assignment order does not affect lookup by key).
        self.bn1 = nn.BatchNorm2d(16)
        self.bn2 = nn.BatchNorm2d(32)
        self.bn3 = nn.BatchNorm2d(32)
        self.bn4 = nn.BatchNorm1d(4)

        # Small constant added inside torch.log so that log(0) never yields
        # -inf / NaN when an action probability underflows to zero.
        self.ups = 1e-7

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size=3, stride=1):
            return (size - (kernel_size - 1) - 1) // stride + 1

        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32

        self.fc = nn.Linear(linear_input_size, outputs)
        self.head = nn.Softmax(dim=1)

    @staticmethod
    def _as_input(state):
        """Convert an observation (array-like or tensor) to a float32 tensor."""
        return torch.as_tensor(state, dtype=torch.float32)

    def forward(self, x):
        """Return the action probabilities for a batch of observations.

        Args:
            x: Float tensor of shape ``(batch, 4, h, w)``.

        Returns:
            Tensor of shape ``(batch, outputs)``; each row sums to 1.
        """
        # Follow the module's own parameters instead of a notebook-level global,
        # so the model works wherever it has been moved with .to(...).
        x = x.to(self.fc.weight.device)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.fc(x.view(x.size(0), -1))

        return self.head(x)

    def get_action(self, state):
        """Sample one action from the current policy.

        Args:
            state: A single observation that already carries the batch
                dimension, i.e. shape ``(1, 4, h, w)``.

        Returns:
            0-dim integer tensor holding the sampled action index.
        """
        # Sampling needs no gradient; the graph is rebuilt in pi() for the update.
        with torch.no_grad():
            probs = self.forward(self._as_input(state))
        probs = torch.squeeze(probs, 0)

        # Draw exactly one sample. Requesting more samples without replacement
        # fails when the policy has fewer actions than samples, and violates
        # torch.multinomial's precondition once some probability reaches zero.
        action = probs.multinomial(num_samples=1)
        return action[0]

    def pi(self, s, a):
        """Return pi(a | s), the probability of action ``a`` in state ``s``.

        The result stays attached to the autograd graph so that it can be
        differentiated in ``update_weight``.
        """
        probs = self.forward(self._as_input(s))
        probs = torch.squeeze(probs, 0)
        return probs[a]

    def update_weight(self, states, actions, rewards, optimizer, gamma=None):
        """Run the REINFORCE update for one finished episode.

        The three sequences are walked backwards and paired from their ends, so
        ``rewards`` may carry extra leading entries (e.g. a placeholder for t = 0);
        those are ignored. One optimizer step is taken per time step.

        Args:
            states: Observations s_0 .. s_{T-1}.
            actions: Actions a_0 .. a_{T-1} taken in those states.
            rewards: Rewards whose last element is the reward following the last
                action.
            optimizer: Optimizer over this module's parameters.
            gamma: Discount factor. When omitted, the module-level ``GAMMA``
                constant is used.
        """
        if gamma is None:
            gamma = GAMMA

        G = 0.0
        # for each step of the episode t = T - 1, ..., 0
        # r_tt represents r_{t+1}
        for s_t, a_t, r_tt in zip(states[::-1], actions[::-1], rewards[::-1]):
            # float() keeps G a plain Python number even for numpy/tensor rewards.
            G = float(r_tt) + gamma * G
            loss = -G * torch.log(self.pi(s_t, a_t) + self.ups)

            # update policy parameter \theta
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


In [4]:
import numpy as np

# Scratch check of np.argwhere: it yields one [row, col] index pair for every
# element of `a` that equals 3.
a = np.array([[1, 2, 3], [4, 1, 3]])

for i in np.argwhere(a == 3):
    # One item per line: the index pair, its row, its column, then a separator.
    print(i, i[0], i[1], "=========================", sep="\n")

[0 2]
0
2
[1 2]
1
2


In [None]:
# ---- Hyper-parameters -------------------------------------------------------
MAX_EPISODES = 100000    # number of episodes to train for
MAX_TIMESTEPS = 100000   # hard cap on the length of a single episode

ALPHA = 3e-5             # Adam learning rate
GAMMA = 0.99             # discount factor used by reinforce.update_weight

MODEL_PATH = './saved_models/model.pt'
LOG_EVERY = 100          # print progress every LOG_EVERY episodes

# Create the output directory before training so that a missing folder cannot
# make torch.save fail after the whole run has finished.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

env = MazeSolverEnv()

# for exploring starts and randomized goal
env.reset(exploring_starts = True, random_goal = True)

num_actions = env.num_action
num_obs = env.num_obs

agent = reinforce(num_obs[0], num_obs[1], num_actions).to(device)
# The weights are optimised below, so the module belongs in training mode.
agent.train()

optimizer = optim.Adam(agent.parameters(), lr=ALPHA)

try:
    for i_episode in range(MAX_EPISODES):
        state = env.init_obs
        done = False

        # Trajectory of the current episode.
        states = []
        actions = []
        rewards = [0]   # no reward at t = 0

        for timesteps in range(MAX_TIMESTEPS):
            action = agent.get_action(state)
            states.append(state)
            actions.append(action)

            state, reward, done, _ = env.step(action.tolist())

            rewards.append(reward)

            if done:
                if (i_episode+1) % LOG_EVERY == 0:
                    print("Episode {} timesteps {}".format(i_episode + 1, timesteps + 1))
                break

        # Monte-Carlo update: performed once the episode has ended (or was cut
        # off by MAX_TIMESTEPS).
        agent.update_weight(states, actions, rewards, optimizer)

        if (i_episode+1) % LOG_EVERY == 0:
            print("Update {} finished".format(i_episode + 1))

        # New random start and goal for the next episode.
        env.reset(exploring_starts = True, random_goal = True)
except KeyboardInterrupt:
    # Interrupting a long run from the notebook should not throw away the
    # progress made so far: fall through to the save below.
    print("Training interrupted; saving the current model.")
finally:
    # Always release the environment, even when training stops early.
    env.close()

# Store the model in inference mode, as it was saved before.
agent.eval()
torch.save(agent, MODEL_PATH)

Episode 100 timesteps 2
Update 100 finished
Episode 200 timesteps 1
Update 200 finished
Episode 300 timesteps 5
Update 300 finished
Episode 400 timesteps 4
Update 400 finished
Episode 500 timesteps 1
Update 500 finished
Episode 600 timesteps 2
Update 600 finished
Episode 700 timesteps 1
Update 700 finished
Episode 800 timesteps 3
Update 800 finished
Episode 900 timesteps 1
Update 900 finished
Episode 1000 timesteps 2
Update 1000 finished
Episode 1100 timesteps 5
Update 1100 finished
Episode 1200 timesteps 2
Update 1200 finished
Episode 1300 timesteps 2
Update 1300 finished
Episode 1400 timesteps 1
Update 1400 finished
Episode 1500 timesteps 1
Update 1500 finished
Episode 1600 timesteps 3
Update 1600 finished
Episode 1700 timesteps 1
Update 1700 finished
Episode 1800 timesteps 5
Update 1800 finished
Episode 1900 timesteps 2
Update 1900 finished
Episode 2000 timesteps 1
Update 2000 finished
Episode 2100 timesteps 2
Update 2100 finished
Episode 2200 timesteps 3
Update 2200 finished
Episod