# First RL Gym Project

## 1. Imports

In [7]:
import torch as tr
import torch.nn as nn
from torch.optim.lr_scheduler import MultiStepLR
import matplotlib.pyplot as plt
import numpy as np
from environments import BoxEnvironment1 as env
from environment_utils import Box
from agents import SACAgent
from agent_utils import update_target_agent, ReplayBuffer
from log_utils import RLLogger
from plot_utils import RLPlotter, plot_normalized_mexican_hat_potential

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = tr.device('cuda' if tr.cuda.is_available() else 'cpu')
# Anomaly detection is a debugging aid: it makes backward() report the forward
# op that produced a NaN, at the cost of noticeably slower training.
# NOTE(review): consider switching it off for long runs.
tr.autograd.set_detect_anomaly(True)
# Newly created floating point tensors default to float32.
# (tr.set_default_tensor_type is deprecated since PyTorch 2.1; this is the
# supported equivalent for the CPU FloatTensor default.)
tr.set_default_dtype(tr.float32)
# NOTE(review): no random seed is set in the cells visible here, so runs are
# not reproducible unless seeding happens elsewhere.

## 2. Hyperparameters

In [8]:
# ================= Training =================
# -- Replay memory
memory_size = 3000               # third argument handed to ReplayBuffer
memory_batch_size = 512          # transitions drawn per agent update

# -- Duration of training
runs = 1
n_episodes = 50                  # episodes per simulation() call
n_steps = 256                    # maximum steps per episode

# -- Optimisation
agent_batch_size = 128           # number of states simulated in parallel
learning_rate = 0.005            # shared by the actor and both critic optimizers
learning_rate_decay = 0.8
entropy_coef = 0.2               # weight of the log-probability (entropy) term

# -- Bellman equation
future_discount = 0.99

# -- Target model: refresh cadence, counted in episode steps
target_model_update = 1

# -- Critic loss
loss_function = nn.MSELoss()

# ================= Environment =================
# -- Size of the environment box
env_width = 2
env_height = 2
space = Box(env_width, env_height)

# -- Size and center of the goal box (one center row per parallel state)
goal_width = 0.3
goal_height = 0.3
goal_center = np.tile([0.5, 0], (agent_batch_size, 1))
goal = Box(goal_width, goal_height, goal_center)

# -- Time step size
dt = 0.04

# -- Noise
noise_characteristic_length = 10

# -- Maximum of the potential
U0 = 0.5

# ================= Agent =================
state_dim = 4
hidden_dims = [8, 8]
act_dim = 1
act_positive = True
act_scaling = 2 * np.pi

# ================= Other =================
plt.rcParams.update({'font.size': 13, 'figure.dpi': 150})
total_time = []
update_state_time = []

## 3. Simulation

In [9]:
# Environment and replay memory shared by every episode.
environment = env(space, goal)
memory = ReplayBuffer(state_dim, act_dim, memory_size, agent_batch_size)

# Two identically configured agents: the first one is trained, the second one
# serves as the target network holder. Built in the order (agent, target_agent).
agent, target_agent = (
    SACAgent(state_dim, act_dim, hidden_dims, act_scaling, act_positive).float().to(device)
    for _ in range(2)
)

# Logging / plotting helpers.
logger = RLLogger()
plotter = RLPlotter(logger, goal)

# One Adam optimizer per trainable network, attached to the agent as
# `<network>_optimizer` so that update() can find them.
for net_name in ('actor', 'critic1', 'critic2'):
    net = getattr(agent, net_name)
    setattr(agent, net_name + '_optimizer', tr.optim.Adam(net.parameters(), lr=learning_rate))

# The target agent is never optimised directly, so exclude it from autograd.
for frozen_param in target_agent.parameters():
    frozen_param.requires_grad = False

In [10]:
def update(agent, target_agent, memory_batch):
    agent.critic1_optimizer.zero_grad()
    agent.critic2_optimizer.zero_grad()

    state_now = memory_batch['state_now'].reshape(-1, state_dim)
    state_next = memory_batch['state_next'].reshape(-1, state_dim)
    action_now = memory_batch['action_now'].reshape(-1, act_dim)
    reward = memory_batch['reward'].reshape(-1)
    done = memory_batch['done'].reshape(-1)
    
    # Compute Prediction
    Q1_now = agent.critic1(state_now, action_now)
    Q2_now = agent.critic2(state_now, action_now)

    # Compute Target
    with tr.no_grad():        
        action_next, log_prob_next = agent.actor(state_next)
        
        Q1_next = target_agent.critic1(state_next, action_next)
        Q2_next = target_agent.critic2(state_next, action_next)
        Q_next = tr.min(Q1_next, Q2_next)
        target = reward + future_discount*(Q_next - entropy_coef*log_prob_next)
    # Compute Loss
    loss = loss_function(Q1_now, target) + loss_function(Q2_now, target)
    
    # Update
    loss.backward()
    agent.critic1_optimizer.step()
    agent.critic2_optimizer.step()
    
    agent.actor_optimizer.zero_grad()
    for p in agent.critic1.parameters():
        p.requires_grad = False
    for p in agent.critic2.parameters():
        p.requires_grad = False
    
    action_now_new, log_prob_now_new = agent.actor(state_now)
    Q1_now_new = agent.critic1(state_now, action_now_new)
    Q2_now_new = agent.critic2(state_now, action_now_new)
    Q_now_new = tr.min(Q1_now_new, Q2_now_new)
    loss_actor = (entropy_coef*log_prob_now_new - Q_now_new).mean()
    loss_actor.backward()
    agent.actor_optimizer.step()

    for p in agent.critic1.parameters():
        p.requires_grad = True
    for p in agent.critic2.parameters():
        p.requires_grad = True

    return loss, loss_actor

In [11]:
def episode():
    """Simulate one episode and train the agent after every step.

    Each step logs the state, lets the actor pick an action, advances the
    environment, runs one agent update (once the replay memory holds at least
    ``memory_batch_size`` entries) and stores the transition. The loop stops
    early as soon as any entry of ``environment.goal_check()`` is truthy.

    Returns
    -------
    int
        Index of the last step that was executed.
    """
    environment.init_state(agent_batch_size, state_dim)
    for current_step in range(n_steps):
        # Log state
        logger.save_state(environment.state)
        # Beginning state
        # NOTE(review): this keeps a reference, not a copy. If environment.step
        # mutates its state array in place, state_now and state_next alias the
        # same data by the time they are stored; confirm against the environment.
        state_now = environment.state
        # Action: pure inference, so no autograd graph is built or kept alive
        # by the tensor that later goes into the replay memory.
        with tr.no_grad():
            action_now, _ = agent.actor(tr.as_tensor(environment.state, device=device, dtype=tr.float))
        action_np = action_now.detach().cpu().numpy()
        # Next state
        reward = environment.step(action_np, U0, dt, noise_characteristic_length)
        state_next = environment.state
        # Done
        done = environment.goal_check()
        # Log action
        logger.save_action(action_np)

        # NOTE(review): this placeholder is what gets stored with the
        # transition; the critic loss computed below is never written to it.
        loss = 0
        # Sample from memory
        if memory.size >= memory_batch_size:
            memory_batch = memory.sample_batch(memory_batch_size)
            # Update Agent
            loss_critic, loss_actor = update(agent, target_agent, memory_batch)
            logger.save_loss_critic(loss_critic.item())
            logger.save_loss_actor(loss_actor.item())
            # Refresh the target agent once learning is running. The previous
            # gate compared the per-episode step index with the buffer
            # capacity and therefore never fired when n_steps < memory_size.
            if current_step % target_model_update == 0:
                update_target_agent(agent, target_agent)

        # Store in memory
        memory.store(state_now, action_now, reward, state_next, loss, done)

        # Stop as soon as at least one parallel state has reached the goal.
        if any(done):
            print('Goal reached')
            logger.save_state(environment.state)
            break

    return current_step

In [12]:
def simulation():
    """Train for ``n_episodes`` episodes, logging and plotting each one.

    ``update_target_agent`` is called once up front (presumably to start the
    target agent from the online agent's weights). Every episode's return
    value is then handed to the logger, the plot is refreshed and a progress
    line is printed.
    """
    update_target_agent(agent, target_agent)
    for episode_index in range(n_episodes):
        last_step = episode()
        logger.save_episode(last_step)
        plotter.plot_last_episode()
        print('Episode', episode_index, ' finished!')

# Reset the plotter, then run the full training loop. This is the long-running
# call of the notebook; the recorded output below ends in a KeyboardInterrupt
# after five episodes.
plotter.clear_plots()
simulation()

Episode 0  finished!
Episode 1  finished!
Episode 2  finished!
Episode 3  finished!
Episode 4  finished!


KeyboardInterrupt: 