# Concentration Gradient Project

## 1. Imports

In [15]:
import torch as tr
import torch.nn as nn
from torch.optim.lr_scheduler import MultiStepLR
import matplotlib.pyplot as plt
import numpy as np
from environments import BoxEnvironment1 as env
from environment_utils import Box, Circle2D
from agents import SACAgent
from agent_utils import update_target_agent, ReplayBuffer
from log_utils import RLLogger
from plot_utils import RLPlotter, make_animation

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = tr.device('cuda' if tr.cuda.is_available() else 'cpu')
# Anomaly detection is a debugging aid: backward() raises as soon as a NaN
# gradient appears, at the cost of noticeably slower training. Switch it off
# once the model trains cleanly.
tr.autograd.set_detect_anomaly(True)
# `set_default_tensor_type` is deprecated (PyTorch >= 2.1); `set_default_dtype`
# is the supported way to make float32 the default floating-point dtype.
tr.set_default_dtype(tr.float32)

## 2. Hyperparameters

In [16]:
# ---------------- Training -------------------
# Replay memory: capacity handed to ReplayBuffer and number of entries drawn
# per update.
memory_size = 4096
memory_batch_size = 512
# Duration of training
runs = 1
n_episodes = 50
n_steps = 256
# Training parameters
agent_batch_size = 256
learning_rate_actor = 0.0001
learning_rate_critic = 0.0001
# Learning-rate schedule: decay roughly eight times over the run.
# The first milestone must NOT be 0: MultiStepLR applies a milestone as soon as
# its epoch counter reaches it, and the counter is already 0 at construction,
# so a milestone at 0 would silently halve the configured learning rates
# before the first update. The interval is clamped to 1 so that runs with
# fewer than 8 episodes do not produce a zero step.
lr_decay_interval = max(1, n_episodes // 8)
milestones = list(range(lr_decay_interval, n_episodes, lr_decay_interval))
# (sic) the misspelled name is kept because other cells reference it.
learing_rate_decay = 0.5

# Entropy regularisation weight and its multiplicative decay (1 = no decay)
entropy_coef = 0.002
entropy_coef_decay = 1
# Bellman equation
future_discount = 0.99
# Target model: update period (in steps) and Polyak coefficient passed to
# update_target_agent
target_model_update = 1
polyak_tau = 0.995
# Loss function used for both critics
loss_function = nn.MSELoss()

# ---------------- Environment  ----------------
# Width and height of the environment box
env_width, env_height = 2, 2
space = Box(env_width, env_height)

# Radius of the circular goal; its centre is chosen inside the episode functions
goal_radius = 0.1
# Time step size handed to `environment.step`
dt = 0.06
# Noise parameter handed to `environment.step`
noise_characteristic_length = 2
# Maximum of the potential
c0 = 0.5

# ---------------- Agent ----------------------
# Constructor arguments for SACAgent: state and action dimensionality plus the
# hidden layer specification (presumably layer widths -- confirm in `agents`).
state_dim, act_dim = 4, 1
hidden_dims = [32, 32]
# Action options forwarded to SACAgent; the scale is one full turn in radians.
act_positive = True
act_scaling = 2 * np.pi

# ---------------- Other ----------------------
# Matplotlib defaults applied to every figure created afterwards
plt.rcParams.update({'font.size': 13, 'figure.dpi': 150})
# Timing accumulators (nothing in the cells shown here appends to them)
total_time, update_state_time = [], []

## 3. Simulation

In [17]:
# Environment and replay memory
environment = env(space)
memory = ReplayBuffer(state_dim, act_dim, memory_size, agent_batch_size)

# The learner and its target copy are built from identical constructor arguments.
agent_args = (state_dim, act_dim, hidden_dims, act_scaling, act_positive)
agent = SACAgent(*agent_args).float().to(device)
target_agent = SACAgent(*agent_args).float().to(device)

# Separate logger/plotter pairs for training and for evaluation episodes
logger = RLLogger()
plotter = RLPlotter(logger, 'logs')
testLogger = RLLogger()
testPlotter = RLPlotter(testLogger, 'test_logs', test=True)

# One Adam optimiser per network, attached to the agent so `update` can find them
agent.actor_optimizer = tr.optim.Adam(agent.actor.parameters(), lr=learning_rate_actor)
agent.critic1_optimizer = tr.optim.Adam(agent.critic1.parameters(), lr=learning_rate_critic)
agent.critic2_optimizer = tr.optim.Adam(agent.critic2.parameters(), lr=learning_rate_critic)

# All three optimisers follow the same multi-step decay schedule.
schedule = {'milestones': milestones, 'gamma': learing_rate_decay}
scheduler_actor = MultiStepLR(agent.actor_optimizer, **schedule)
scheduler_critic1 = MultiStepLR(agent.critic1_optimizer, **schedule)
scheduler_critic2 = MultiStepLR(agent.critic2_optimizer, **schedule)

# Exclude the target agent's parameters from autograd.
for target_param in target_agent.parameters():
    target_param.requires_grad = False

In [18]:
def update(agent, target_agent, memory_batch):
    """Run one SAC optimisation step on a sampled batch.

    The twin critics are first regressed onto a shared soft Bellman target
    built from the target critics. The actor is then updated to maximise the
    entropy-regularised minimum of the two critics while they are frozen.

    Args:
        agent: Object exposing `actor`, `critic1`, `critic2` and the matching
            `actor_optimizer`, `critic1_optimizer`, `critic2_optimizer`.
        target_agent: Object exposing the target `critic1` and `critic2`.
        memory_batch: Mapping with the keys 'state_now', 'state_next',
            'action_now', 'reward' and 'done'.

    Returns:
        Tuple `(loss_critic, loss_actor)` of scalar loss tensors.
    """
    # Flatten whatever leading dimensions the buffer returns into one batch axis.
    state_now = memory_batch['state_now'].reshape(-1, state_dim)
    state_next = memory_batch['state_next'].reshape(-1, state_dim)
    action_now = memory_batch['action_now'].reshape(-1, act_dim)
    reward = memory_batch['reward'].reshape(-1)
    # NOTE(review): `done` is read but never enters the Bellman target below, so
    # the critics bootstrap through transitions flagged as done. Confirm this is
    # intended; the usual form multiplies the bootstrap term by (1 - done).
    done = memory_batch['done'].reshape(-1)

    # ---------------- Critic step ----------------
    agent.critic1_optimizer.zero_grad()
    agent.critic2_optimizer.zero_grad()

    # Compute Prediction
    Q1_now_critic = agent.critic1(state_now, action_now)
    Q2_now_critic = agent.critic2(state_now, action_now)

    # Compute Target (no gradient flows into the actor or the target critics)
    with tr.no_grad():
        action_next_critic, log_prob_next_critic = agent.actor(state_next)

        Q1_next_critic = target_agent.critic1(state_next, action_next_critic)
        Q2_next_critic = target_agent.critic2(state_next, action_next_critic)
        Q_next_critic = tr.min(Q1_next_critic, Q2_next_critic)
        target_critic = reward + future_discount*(Q_next_critic - entropy_coef*log_prob_next_critic)

    # Compute Loss
    loss_critic = loss_function(Q1_now_critic, target_critic) + loss_function(Q2_now_critic, target_critic)

    # Update
    loss_critic.backward()
    agent.critic1_optimizer.step()
    agent.critic2_optimizer.step()

    # ---------------- Actor step -----------------
    agent.actor_optimizer.zero_grad()

    # Freeze the critics so the actor loss does not accumulate gradients in them.
    critic_params = list(agent.critic1.parameters()) + list(agent.critic2.parameters())
    for p in critic_params:
        p.requires_grad = False
    try:
        action_now_actor, log_prob_now_actor = agent.actor(state_now)
        Q1_now_actor = agent.critic1(state_now, action_now_actor)
        Q2_now_actor = agent.critic2(state_now, action_now_actor)
        Q_now_actor = tr.min(Q1_now_actor, Q2_now_actor)
        loss_actor = (entropy_coef*log_prob_now_actor - Q_now_actor).mean()
        loss_actor.backward()
        agent.actor_optimizer.step()
    finally:
        # Always unfreeze: if the actor step raises (e.g. anomaly detection
        # trips on a NaN), the critics must not stay frozen for later calls.
        for p in critic_params:
            p.requires_grad = True

    return loss_critic, loss_actor

In [19]:
def episode():
    """Run one training episode and learn from replayed experience.

    `agent_batch_size` agents are stepped in parallel for `n_steps` steps.
    Actions are uniformly random until the replay memory holds
    `memory_batch_size` entries; after that they come from the actor, and each
    step performs one gradient update followed by a Polyak update of the
    target agent.

    Returns:
        Tuple `(current_step, goal_bool)`: the index of the last executed step
        and whether `goal_check()` reported the goal for any agent during the
        episode.
    """
    # Goal location is fixed for now; use `space.sample()` to randomise it.
    sample = np.array([0.5,0])
    goal_center = np.tile(sample,(agent_batch_size,1))
    goal = Circle2D(goal_radius, goal_center)

    environment.init_env(agent_batch_size, state_dim, goal, c0, random_start = False)
    plotter.update_goal(goal)
    goal_bool = False
    for current_step in range(n_steps):
        # Log state
        logger.save_state(environment.state)
        # Beginning state
        state_now = environment.state.copy()
        # Action
        if memory.size < memory_batch_size:
            # Warm-up: explore uniformly over [0, 2*pi) until learning can start.
            action_now = 2*tr.pi*tr.rand(agent_batch_size, act_dim, device=device, dtype=tr.float)
        else:
            # Rollout actions are never back-propagated through, so skip
            # building an autograd graph for them.
            with tr.no_grad():
                action_now, _ = agent.actor(tr.as_tensor(environment.state, device=device, dtype=tr.float))
        # Next state
        reward = environment.step(action_now.detach().cpu().numpy(), c0, dt, noise_characteristic_length)
        state_next = environment.state.copy()
        # Done
        done = environment.goal_check()
        # Log action and reward
        logger.save_action(action_now.detach().cpu().numpy())
        logger.save_reward(reward)

        # NOTE(review): `loss` is always stored as 0; the losses computed below
        # are only logged. Confirm whether the buffer is meant to receive them.
        loss = 0
        if memory.size >= memory_batch_size:
            # Sample from memory
            memory_batch = memory.sample_batch(memory_batch_size)

            # Update Agent
            loss_critic, loss_actor = update(agent, target_agent, memory_batch)
            logger.save_loss_critic(loss_critic.item())
            logger.save_loss_actor(loss_actor.item())

            # Let the target agent track the learner. The previous gate
            # (`current_step > memory_size`) could never be true because
            # current_step < n_steps, so the target was never updated.
            if current_step % target_model_update == 0:
                update_target_agent(agent, target_agent, polyak_tau)

        # Store in memory
        memory.store(state_now, action_now, reward, state_next, loss, done)

        if max(environment.goal_check()):
            goal_bool = True

    return current_step, goal_bool

In [20]:
def test_episode():
    # Initialize Goal at Random Location
    # sample = space.sample()
    sample = np.array([0.5,0])
    goal_center = np.tile(sample,(1,1))
    goal = Circle2D(goal_radius, goal_center)

    environment.init_env(1, state_dim, goal, c0, random_start = False)
    testPlotter.update_goal(goal)
    testLogger.save_state(environment.state)
    for current_step in range(n_steps):
      
        # Action
        action_now = agent.act(tr.as_tensor(environment.state, device=device, dtype=tr.float), deterministic=True)
        environment.step(action_now, c0, dt, noise_characteristic_length, test = True)

        # Log Action and State
        testLogger.save_action(action_now)
        testLogger.save_state(environment.state)
            
    return current_step

In [21]:
def simulation():
    """Train for `n_episodes` episodes, evaluating after every one.

    The target agent is first synchronised with the learner. Each iteration
    then runs a training episode, logs and plots it, runs and plots an
    evaluation episode, and steps the learning-rate schedulers once the replay
    memory holds more than `memory_batch_size` entries. Whenever a training
    episode reaches the goal, the global `entropy_coef` is multiplied by
    `entropy_coef_decay`.
    """
    global entropy_coef

    update_target_agent(agent, target_agent)
    lr_schedulers = (scheduler_actor, scheduler_critic1, scheduler_critic2)

    for episode_index in range(n_episodes):
        # Training episode
        steps_taken, reached_goal = episode()
        if reached_goal:
            print('Goal reached!')
            entropy_coef *= entropy_coef_decay
        logger.save_episode(steps_taken)
        plotter.plot_last_episode()

        # Evaluation episode
        testLogger.save_episode(test_episode())
        testPlotter.plot_last_episode()
        print('Episode', episode_index, ' finished!')

        # Decay the learning rates only once there is enough data to learn from.
        if memory.size > memory_batch_size:
            for lr_scheduler in lr_schedulers:
                lr_scheduler.step()
        
# Clear both plot directories (presumably removing output from earlier runs),
# then start training.
for stale_plotter, plot_dir in ((plotter, 'logs'), (testPlotter, 'test_logs')):
    stale_plotter.clear_plots(plot_dir)

simulation()

Goal reached!


Episode 0  finished!
Goal reached!
Episode 1  finished!
Goal reached!
Episode 2  finished!
Goal reached!
Episode 3  finished!
Goal reached!
Episode 4  finished!
Episode 5  finished!
Episode 6  finished!
Episode 7  finished!
Episode 8  finished!


## 4. Animation

In [None]:
# Build one animation from the training episode plots and one from the
# evaluation episode plots. The second argument (3) is passed through as-is;
# its meaning is defined by `make_animation` in plot_utils.
for frames_dir in ('logs/episode_paths', 'test_logs/episode_paths'):
    make_animation(frames_dir, 3)

Moviepy - Building video logs/episode_paths_animation.mp4.
Moviepy - Writing video logs/episode_paths_animation.mp4



                                                            

Moviepy - Done !
Moviepy - video ready logs/episode_paths_animation.mp4
Moviepy - Building video test_logs/episode_paths_animation.mp4.
Moviepy - Writing video test_logs/episode_paths_animation.mp4



                                                            

Moviepy - Done !
Moviepy - video ready test_logs/episode_paths_animation.mp4
