In [4]:
def get_expected_rewards(rewards, gamma=0.9):
    G = np.zeros_like(rewards, dtype=float)
    G[-1] = rewards[-1]
    for idx in range(-2, -len(rewards)-1, -1):
        G[idx] = rewards[idx] + gamma * G[idx+1]
    return G


def to_one_hot(y_tensor, ndims):
    y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)
    y_one_hot = torch.zeros(
        y_tensor.size()[0], ndims).scatter_(1, y_tensor, 1)
    return y_one_hot

In [5]:
import torch.nn as nn

class CostNN(nn.Module):
    def __init__(
        self, 
        state_dim,
        hidden_dim1 = 128, 
        out_features = 1, 
    ):
        super(CostNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim1),
            nn.ReLU(),
            nn.Linear(hidden_dim1, out_features)
        )
    def forward(self, x):
        return self.net(x)    

In [6]:
import numpy as np
import torch
import torch.nn.functional as F
import tensorflow as tf


class PG(nn.Module):
    def __init__(self, state_shape, n_actions):
        super().__init__()
        self.state_shape = state_shape
        self.n_actions = n_actions

        # Policy takes input initial conditions and outputs planning route, open-loop control
        self.model = nn.Sequential(
            nn.LSTM(input_size = 10, hidden_size = 4, num_layers = 1, dropout = 0.2)
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), 1e-3)
    
    def predict_probs(self, states):
        states = torch.FloatTensor(states)
        logits, (h_T, c_T) = self.model(states)
        probs = F.softmax(logits, dim = -1).detach().numpy()
        return probs
    
    # Run agent in environment to create sample trajectories by generator
    # The environment model is a seq2seq model
    def generate_session(self, sysmodel, t_max=1000):
        states, actions, traj_probs, rewards = [], [], [], []
        states_init = np.zeros((500,1, 10)) + 24
        actions_probs_policy = self.predict_probs(states_init)
        actions = []
        actions_probs_policy = np.squeeze(actions_probs_policy)       
        for prob in actions_probs_policy:                  
            actions.append(np.random.choice(self.n_actions,  p = prob))
 
        # Apply dict to go from 1,2,3,4 as action to [0,2400,0] [0,2400,1000]  [2300,0,0]  [2300,0,1000] 
        my_dict = {0:[0,2400,0], 1:[0,2400,1000], 2:[2300,0,0], 3:[2300,0,1000]}
        actions_array = np.zeros((500,3))  
        actions_array = map(my_dict.get, actions)

        actions_array = np.array(list(actions_array))
        actions_array_sysmodel = np.concatenate([np.arange(actions_array.shape[0])[:,None]+1,actions_array], axis=1) 
        actions_array_sysmodel = tf.expand_dims(actions_array_sysmodel, axis = 0, name=None)

        states = sysmodel(actions_array_sysmodel)

        return states, actions_array, actions_probs_policy

    def _get_cumulative_rewards(self, rewards, gamma=0.99):
        G = np.zeros_like(rewards, dtype = float)
        G[-1] = rewards[-1]
        for idx in range(-2, -len(rewards)-1, -1):
            G[idx] = rewards[idx] + gamma * G[idx+1]
        return G

    def _to_one_hot(self, y_tensor, ndims):
        y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)
        y_one_hot = torch.zeros(
            y_tensor.size()[0], ndims).scatter_(1, y_tensor, 1)
        return y_one_hot

In [7]:
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
import myModelinTF

from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F

# SEEDS
# seed = 18095048
# random.seed(seed)
# np.random.seed(seed)
# torch.manual_seed(seed)

# ENV SETUP
sysmodel = myModelinTF.load_model()
n_actions = 4
state = np.zeros((500,10)) + 25
state_shape = state.shape

# LOADING EXPERT/DEMO SAMPLES
trajs_demo = np.load('dataset.npy', allow_pickle=True)

# INITILIZING POLICY AND REWARD FUNCTION
policy = PG(state_shape, n_actions)
cost_f = CostNN(state_shape[0] + 1) # states (8) + action, trajectory
policy_optimizer = torch.optim.Adam(policy.parameters(), 1e-2, weight_decay=1e-4)
cost_optimizer = torch.optim.Adam(cost_f.parameters(), 1e-2, weight_decay=1e-4)

mean_rewards = []
mean_costs = []
mean_loss_rew = []
EPISODES_TO_PLAY = 1
REWARD_FUNCTION_UPDATE = 10
DEMO_BATCH = 200
sample_trajs = []

D_demo, D_samp = np.array([]), np.array([])

# CONVERTS TRAJ LIST TO STEP LIST
def preprocess_traj(traj_list, step_list, is_Demo = False):
    step_list = step_list.tolist()
    for traj in traj_list:
        states = np.array(traj[0])
        if is_Demo:
            probs = np.ones((500,states.shape[0]))
            states = np.transpose(traj[0])
            actions = traj[1]
        else:           
            states = np.squeeze(traj[0])
            actions = np.array(traj[1]) 
            probs = traj[2]

        x = np.concatenate((states, actions, probs), axis=1) 
        step_list.extend(x)
    return np.array(step_list)

D_demo = preprocess_traj(trajs_demo, D_demo, is_Demo=True)
return_list, sum_of_cost_list = [], []


for i in range(100000):

    trajs_samp = [policy.generate_session(sysmodel) for _ in range(EPISODES_TO_PLAY)]

    D_samp = preprocess_traj(trajs_samp, D_samp)

    # UPDATING REWARD FUNCTION (TAKES IN D_samp, D_demo)
    loss_rew = []

    #REWARD_FUNCTION_UPDATE = round(REWARD_FUNCTION_UPDATE * 1/((i+1)))+900
    for _ in range(REWARD_FUNCTION_UPDATE):

        selected_samp = np.random.choice(len(D_samp), DEMO_BATCH)
        selected_demo = np.random.choice(len(D_demo), DEMO_BATCH)

        D_s_samp = D_samp[selected_samp]
        D_s_demo = D_demo[selected_demo]

        #D̂ samp ← D̂ demo ∪ D̂ samp
        D_s_samp_demo = np.concatenate((D_s_demo, D_s_samp), axis = 0)
        #D_s_samp = np.add(D_s_demo, D_s_samp) 

        states, actions, probs = D_s_samp_demo[:,:-2], D_s_samp_demo[:,-2], D_s_samp_demo[:,-1]
        states_demo, actions_demo = D_s_demo[:,:-2], D_s_demo[:,-2]

        # Reducing from float64 to float32 for making computaton faster
        states = torch.tensor(states, dtype=torch.float32)
        probs = torch.tensor(probs, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.float32)
        states_demo = torch.tensor(states_demo, dtype=torch.float32)
        actions_demo = torch.tensor(actions_demo, dtype=torch.float32)

        # TOADD
        costs = cost_f(torch.cat((states, actions.reshape(-1, 1)), dim=-1))
        costs_demo = cost_f(torch.cat((states_demo, actions_demo.reshape(-1, 1)), dim=-1))

        # LOSS CALCULATION FOR COST FUNCTION
        loss_cost = torch.mean(costs_demo) - \
                torch.log(torch.mean(torch.exp(costs)/(probs+1e-7))) 
        # UPDATING THE COST FUNCTION
        cost_optimizer.zero_grad()
        loss_cost.backward()
        cost_optimizer.step()

        loss_rew.append(loss_cost.detach())

    for traj_samp in trajs_samp:
        states_samp, actions_samp = traj_samp
        
        states_samp = torch.tensor(states_samp, dtype=torch.float32)
        actions_samp = torch.tensor(actions_samp, dtype=torch.float32)

        logits_samp = policy(states_samp) # forward pass
        probs_samp = nn.functional.softmax(logits_samp, -1) # get estimated actions for states
        log_probs_samp = nn.functional.log_softmax(logits_samp, -1)
            
        costs_samp = cost_f(torch.cat((states_samp, actions_samp.reshape(-1, 1)), dim=-1)).detach().numpy() # current cost function
        cumulative_returns_np = np.array(get_expected_rewards(-costs_samp, 0.9)) # recursively get expected discounted rewards (rewards given by the current cost function)
        cumulative_returns = torch.tensor(cumulative_returns_np, dtype=torch.float32)

        log_probs_for_actions = torch.sum(
            log_probs_samp * to_one_hot(actions_samp, 3), dim=1)
    
        entropy = -torch.mean(torch.sum(probs_samp*log_probs_samp), dim = -1 )
        #loss = -torch.mean(log_probs_for_actions*cumulative_returns - entropy*1e-2) # loss for the policy (isnt it the cost function output?)
        #average reward baseline
        cumulative_returns = (cumulative_returns - torch.mean(cumulative_returns))
        #whitening baseline
        #cumulative_returns = (cumulative_returns - torch.mean(cumulative_returns))/ (torch.std(cumulative_returns))
        loss_policy = -log_probs_for_actions*cumulative_returns
        # UPDATING THE POLICY NETWORK
        policy_optimizer.zero_grad()
        loss_policy.sum().backward()
        policy_optimizer.step()

        #returns = sum(rewards)
        sum_of_cost = np.sum(costs_samp)
        return_list.append(returns)
        sum_of_cost_list.append(sum_of_cost)

    #mean_rewards.append(np.mean(return_list))
    mean_costs.append(np.mean(sum_of_cost_list))
    mean_loss_rew.append(np.mean(loss_rew))

    # PLOTTING PERFORMANCE
    if i % 10 == 0:
        # clear_output(True)
        print(f"mean reward:{np.mean(return_list)} loss: {loss_cost}")

        plt.figure(figsize=[16, 12])
        # plt.subplot(2, 2, 1)
        # plt.title(f"Mean reward per {EPISODES_TO_PLAY} games")
        # plt.plot(mean_rewards)
        # plt.grid()

        plt.subplot(2, 2, 1)
        plt.title(f"Mean cost per {EPISODES_TO_PLAY} games")
        plt.plot(mean_costs)
        plt.grid()

        plt.subplot(2, 2, 3)
        plt.title(f"Mean loss per {REWARD_FUNCTION_UPDATE} batches")
        plt.plot(mean_loss_rew)
        plt.grid()

        # plt.show()
        plt.savefig('GCL_learning_curve.png')
        plt.close()

    if np.mean(return_list) > 500:
        break



ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 23 and the array at index 1 has size 17