# REINFORCE: Monte Carlo Policy Gradient

In [1]:
""" Monte-Carlo Policy Gradient """

from __future__ import print_function

import numpy as np
import random
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable

import pygame
import Maze_Solver as maze_solver
from Maze_Solver import MazeSolver, MazeSolverEnv
import Maze_Generator as maze_generator

pygame 2.1.2 (SDL 2.0.18, Python 3.9.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# cuda가 설치되어 있다면 cuda를 사용하고 아니라면 cpu만을 사용한다.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
class reinforce(nn.Module):
    
    def __init__(self, inputs, outputs):
        super(reinforce, self).__init__()
        self.fc1 = nn.Linear(inputs, 256)
        self.fc2 = nn.Linear(256, outputs)
        self.head = nn.Softmax(dim=0)
        
        self.ups=1e-7

    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.fc1(x))
        x = self.head(self.fc2(x))
        
        return x
    
    def get_action(self, state):
        state = torch.tensor(state)
        #state = torch.unsqueeze(state, 0)
        probs = self.forward(state)
        probs = torch.squeeze(probs, 0)
        action = probs.multinomial(num_samples=1)
        action = action.data
        
        action = action[0].to(device)
        return action
    
    def epsilon_greedy_action(self, state, epsilon = 0.1):
        state = torch.tensor(state)
        #state = torch.unsqueeze(state, 0)
        probs = self.forward(state)
        probs = torch.squeeze(probs, 0)
        
        if random.random() > epsilon:
            action = torch.tensor([torch.argmax(probs)])
        else:
            action = torch.rand(probs.shape).multinomial(num_samples=1)\
        
        action = action.data
        action = action[0].to(device)
        return action
        
    def pi(self, s, a):
        s = torch.Tensor(s)
        probs = self.forward(s)
        probs = torch.squeeze(probs, 0)
        return probs[a]

    def update_weight(self, states, actions, rewards, optimizer):
        G = torch.Tensor([0])
        # for each step of the episode t = T - 1, ..., 0
        # r_tt represents r_{t+1}
        
        for s_t, a_t, r_tt in zip(states[::-1], actions[::-1], rewards[::-1]):
            G = torch.Tensor([r_tt]) + GAMMA * G
            loss = (-1.0) * G * torch.log(self.pi(s_t, a_t) + self.ups)
            # update policy parameter \theta
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


In [4]:
MAX_EPISODES = 3000
MAX_TIMESTEPS = 1000

ALPHA = 3e-5
GAMMA = 0.99

#env = MazeSolverEnv()
env = gym.make('CartPole-v0')

#num_actions = env.num_action
num_actions =  env.action_space.n
num_states = env.observation_space.shape[0]

agent = reinforce(num_states, num_actions).to(device)
agent.eval()

optimizer = optim.Adam(agent.parameters(), lr=ALPHA)

returns = []

for i_episode in range(MAX_EPISODES):
    #state = env.init_obs
    state = env.reset()
    
    done = False

    states = []
    actions = []
    rewards = [0]   # no reward at t = 0
    
    #while not done:
    for timestep in range(MAX_TIMESTEPS):
        action = agent.epsilon_greedy_action(state)
        states.append(state)
        actions.append(action)
        
        state, reward, done, _ = env.step(action.tolist())
        
        rewards.append(reward)

        if done or (timestep+1) == MAX_TIMESTEPS:
            #print("Episode {} finished after {} timesteps".format(i_episode, timesteps+1))
            break
            
    returns.append(rewards)
    
    if (i_episode+1) % 100 == 0:
        print("Episode {} return: {}".format( i_episode + 1, sum(rewards)))
    
    agent.update_weight(states, actions, rewards, optimizer)
        
    if (i_episode + 1) % 500 == 0:
        torch.save(agent, './saved_models/model' + str(i_episode + 1) + '.pt')
    
    #env.reset_player(exploring_starts = False)

env.close()


Episode 100 return: 9.0
Episode 200 return: 9.0
Episode 300 return: 10.0
Episode 400 return: 13.0
Episode 500 return: 10.0
Episode 600 return: 11.0
Episode 700 return: 9.0
Episode 800 return: 10.0
Episode 900 return: 9.0
Episode 1000 return: 10.0
Episode 1100 return: 9.0
Episode 1200 return: 9.0
Episode 1300 return: 10.0
Episode 1400 return: 11.0
Episode 1500 return: 10.0
Episode 1600 return: 10.0
Episode 1700 return: 10.0
Episode 1800 return: 10.0
Episode 1900 return: 11.0
Episode 2000 return: 10.0
Episode 2100 return: 9.0
Episode 2200 return: 10.0
Episode 2300 return: 8.0
Episode 2400 return: 8.0
Episode 2500 return: 10.0
Episode 2600 return: 10.0
Episode 2700 return: 10.0
Episode 2800 return: 10.0
Episode 2900 return: 14.0
Episode 3000 return: 10.0
