# First RL Gym Project

## 1. Imports

In [5]:
import torch as tr
import torch.nn as nn
from torch.optim.lr_scheduler import MultiStepLR
import matplotlib.pyplot as plt
import numpy as np
from environments import BoxEnvironment1 as env
from environment_utils import Box
from agents import SACAgent
from agent_utils import update_target_agent, ReplayBuffer


device = tr.device('cuda' if tr.cuda.is_available() else 'cpu')
tr.autograd.set_detect_anomaly(True)
tr.set_default_tensor_type(tr.FloatTensor)

## 2. Hyperparameters

In [6]:
# ---------------- Training -------------------
    # Duration of training
runs = 1
n_episodes = 20
n_steps = 200
    # Training parameters
agent_batch_size = 1
learning_rate = 0.001
learning_rate_decay = 0.8
kl_beta = 0.001
    # Bellman equation
future_discount = 0.8
    # Update Target Model
target_model_update = n_episodes//10

# ---------------- Environment  ----------------
    # Environment box size
env_width = 2
env_height = 2
space = Box(env_width, env_height)
    # Goal box size and center
goal_width = 0.2
goal_height = 0.2
goal_center = np.tile([0.5,0],(agent_batch_size,1))
goal = Box(goal_width, goal_height, goal_center)
    # Time step size
dt = 0.0375
    # Noise
noise_characteristic_length = 1
    # Maximum of potential
U0 = 0.4

# ---------------- Agent ----------------------
state_dim = 5
hidden_dims = [16,16]
act_dim = 1
act_positive = True
act_scaling = 2*np.pi
memory_size = 64
memory_batch_size = 8 

# ---------------- Other ----------------------
plt.rcParams.update({'font.size': 13})
plt.rcParams.update({'figure.dpi': 150})
total_time = []
update_state_time = []

### 3. Simulation

In [7]:
def episode(environment, agent, target_agent, memory):    
    environment.init_state(agent_batch_size)
    for current_step in range(n_steps):
        # Beginning state
        state_now = environment.state
        # Action
        action_now, _ = agent.actor(tr.as_tensor(environment.state, device=device, dtype=tr.float))
        # Q value now
        critic1_now = agent.critic1(tr.as_tensor(environment.state, device=device, dtype=tr.float), action_now)
        critic2_now = agent.critic2(tr.as_tensor(environment.state, device=device, dtype=tr.float), action_now)
        critic_now = tr.min(critic1_now, critic2_now)
        # Next state
        environment.step(action_now.detach().cpu().numpy(), U0, dt)
        state_next = environment.state
        # Reward
        reward = environment.reward()
        # Done
        done = environment.goal_check()
        # Store in memory
        memory.store(state_now, action_now, reward, state_next, done)
        # Sample from memory
        memory_batch = memory.sample_batch(memory_batch_size)
        raise Exception(memory_batch['observation'].shape)
        memory_state = memory_batch['observation']

        # Update agent


        if max(environment.goal_check()): 
            print('Goal reached')
            break


In [8]:
def simulation():
    environment = env(space, goal)
    memory = ReplayBuffer(state_dim, act_dim, memory_size, agent_batch_size)
    agent = SACAgent(state_dim, act_dim, hidden_dims, act_scaling, act_positive).float().to(device)
    target_agent = SACAgent(state_dim, act_dim, hidden_dims, act_scaling, act_positive).float().to(device)
    update_target_agent(agent, target_agent)
    for ep in range(n_episodes):
        episode(environment, agent, target_agent, memory)
        print('Episode',ep,' finished!')

simulation()

Exception: torch.Size([8, 1, 5])