# T4

### Avances

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [22]:
import torch
import torch.nn as nn
import numpy as np
from buffer import Buffer
import gym       
import torch.nn.functional as F
from torch.optim import AdamW

# 1. Parametrización del modelo

In [794]:
env = gym.make('Pendulum-v1')
env = gym.make('CartPole-v1')
#env = gym.make('CartPole-v1')

dim_states = env.observation_space.shape[0]
continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

In [795]:
class Model(nn.Module):
    
    def __init__(self, dim_states, dim_actions, continuous_control):
        super(Model, self).__init__()
        
        self._fc1 = nn.Sequential(
        nn.Linear(dim_states+1, 64),
        nn.ReLU(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, dim_states)
    )
       
    def forward(self, state, action):

        if len(state.shape)>1:

            concat_o_a=np.concatenate((state,action.reshape(-1,1)),axis=1)
            input=torch.from_numpy(concat_o_a).float()
            #print(input)
            output=self._fc1(input)
        
        else:
            
            action=np.array(action if continuous_control else [action])
            #print(action)
            #print(state)
            concat_o_a=np.concatenate((state,action))
            input=torch.from_numpy(concat_o_a).float()
            #print(input)
            output=self._fc1(input)

        return output

In [796]:
Model_transitions= Model(dim_states, dim_actions,continuous_control)
Model_transitions

Model(
  (_fc1): Sequential(
    (0): Linear(in_features=5, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=4, bias=True)
  )
)

In [797]:
o_t=env.reset()
o_t

array([-0.03871325,  0.01166419, -0.02463445,  0.0213343 ], dtype=float32)

In [798]:
o_t.shape

(4,)

In [799]:
len(o_t.shape)

1

In [800]:
a_t=env.action_space.sample()
a_t

1

In [801]:
Model_transitions(o_t,a_t)

tensor([ 0.0029, -0.1297,  0.0632,  0.1675], grad_fn=<AddBackward0>)

# 2. Muestreo de experiencias y entrenamiento

In [802]:
# 2.1
class Buffer:

    def __init__(self, dim_states, dim_actions, max_size, sample_size):

        assert sample_size < max_size, "Sample size cannot be greater than buffer size"
        
        self._buffer_idx     = 0
        self._exps_stored    = 0
        self._buffer_size    = max_size
        self._sample_size    = sample_size

        self._s_t_array      = np.zeros((max_size, dim_states))
        self._a_t_array      = np.zeros((max_size))
        self._s_t1_array     = np.zeros((max_size, dim_states))


    def store_transition(self, s_t, a_t, s_t1):
        
        # Add transition to the buffer
        self._s_t_array[self._buffer_idx]=s_t   
        self._a_t_array[self._buffer_idx]=a_t  
        self._s_t1_array[self._buffer_idx]=s_t1 

        # Aumento de indice y reinicio de indice si superamos capacidad
        self._buffer_idx = (self._buffer_idx + 1) % self._buffer_size
        self._exps_stored += 1
        pass

    
    def get_batches(self):
        
        assert self._exps_stored + 1 > self._sample_size, "Not enough samples has been stored to start sampling"

        # Get all the data contained in the buffer as batches 
        batches_s_t = [self._s_t_array[i:i+self._sample_size] for i in range(0, len(self._s_t_array), self._sample_size)]
        batches_a_t = [self._a_t_array[i:i+self._sample_size] for i in range(0, len(self._a_t_array), self._sample_size)]
        batches_s_t1 = [self._s_t1_array[i:i+self._sample_size] for i in range(0, len(self._s_t1_array), self._sample_size)]

        return [batches_s_t,batches_a_t,batches_s_t1]
            

In [809]:
env = gym.make('CartPole-v1')
env = gym.make('Pendulum-v1')
dim_states = env.observation_space.shape[0]
continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

In [810]:
# Inicialización de memory Buffer
max_size=7
sample_size=2
memory=Buffer(dim_states, dim_actions, max_size, sample_size)

In [811]:
# Simulación de 6 transiciones
s_t=env.reset()

for i in range(6):
   
    a_t=np.random.randint(2)
    a_t= np.array([np.random.random()]).astype("float32")

    s_t1, r_t, done_t, _ = env.step(a_t)
    print(s_t1, r_t, done_t)

    # Guardar
    memory.store_transition(s_t, a_t, s_t1)

    s_t = s_t1

[-0.52470106 -0.85128653  0.13863386] -4.586741365925634 False
[-0.54451203 -0.838753   -0.46886647] -4.509774062135058 False
[-0.5893075  -0.80790883 -1.0878834 ] -4.629895028682279 False
[-0.65552187 -0.7551762  -1.6934413 ] -4.962740140969189 False
[-0.7363409 -0.6766107 -2.2554572] -5.511061698994816 False
[-0.818928  -0.5738963 -2.6378846] -6.26193082052339 False


In [815]:
# 2.2 # 2.3
class RSPlanner:
    
    def __init__(self, dim_states, dim_actions, continuous_control, model, planning_horizon, nb_trajectories, reward_function):
        self._dim_states = dim_states
        self._dim_actions = dim_actions
        self._continuous_control = continuous_control

        self._model = model

        self._planning_horizon = planning_horizon
        self._nb_trajectories = nb_trajectories
        self._reward_function = reward_function

        
    def generate_plan(self, observation):
        # Generate a sequence of random actions
        if self._continuous_control:
            random_actions = None
        else:
            random_actions = None
        
        # Construct initial observation 
        o_t = None

        rewards = torch.zeros((self._nb_trajectories, ))
        for i in range(self._planning_horizon):
            # Get a_t
            if self._continuous_control:
                a_t = None
            else:
                a_t = None

            # Predict next observation using the model

            # Compute reward (use reward_function)
            
            o_t = o_t1

        # Return the best sequence of actions
        return None




class MBRLAgent:

    def __init__(self, dim_states, dim_actions, continuous_control, model_lr, buffer_size, batch_size, 
                       planning_horizon, nb_trajectories, reward_function):

        self._dim_states = dim_states
        self._dim_actions = dim_actions

        self._continuous_control = continuous_control

        self._model_lr = model_lr

        self._model = Model(self._dim_states, self._dim_actions, self._continuous_control)

        # Adam optimizer
        self._model_optimizer = AdamW(self._model.parameters(), lr=self._model_lr)

        self._buffer = Buffer(self._dim_states, self._dim_actions, buffer_size, batch_size)
        
        self._planner = RSPlanner(self._dim_states, self._dim_actions, self._continuous_control, 
                                  self._model, planning_horizon, nb_trajectories, reward_function)


    def select_action(self, observation, random=False):

        if random:
            # Return random action
            if self._continuous_control:

                return np.array([np.random.random()]).astype("float32")

            else:
                return np.random.randint(2)
            
            
        # Generate plan
        plan = None

        # Return the first action of the plan
        if self._continuous_control:
            return None
        
        return None


    def store_transition(self, s_t, a_t, s_t1):
        pass


    def update_model(self):
        
        batches = self._buffer.get_batches()
        
        for batch in batches:
            s_t=batch[0]
            a_t=batch[1]
            s_t1=batch[2]
            pred=self._model
            # Use the batches to train the model
            # loss: avg((s_t1 - model(s_t, a_t))^2)
            pass
        

In [813]:
Model_transitions= Model(dim_states, dim_actions,continuous_control)
Model_transitions

Model(
  (_fc1): Sequential(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=3, bias=True)
  )
)

In [826]:
optimizer=AdamW(Model_transitions.parameters(), lr=0.001)
s_t,a_t,s_t1=memory.get_batches()

for x,y,z in zip(s_t,a_t,s_t1):
    print("-------")
    #print(x)
    #print(y)
    #print(Model_transitions(x,y))
    #loss=((Model_transitions(x,y)-torch.tensor(z))**2).mean()
    #print(((Model_transitions(x,y)-torch.tensor(z))**2).mean())
    # Backpropagation
    Model_transitions.zero_grad()
    #print(Model_transitions(x,y).view(-1,1).squeeze())
    #print(torch.tensor(z).view(-1,1))
    loss=F.mse_loss(Model_transitions(x,y).float(), torch.tensor(z).float())
    loss.backward()
    optimizer.step()
    #print("-------------")
    #break

-------
-------
-------
-------
