In [None]:
import numpy as np
from numpy.random import choice, randint, rand, uniform

import pdb

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.utils.random import erdos_renyi_graph
from torch_geometric.utils import to_dense_adj, to_networkx, to_undirected

import networkx as nx
import gym
from gym.spaces import Box, MultiDiscrete

from collections import namedtuple
from copy import copy, deepcopy
from typing import Optional
from enum import Enum, IntEnum

import sys
sys.path.append('/home/victorialena/rlkit')

import rlkit
from path_collector import MdpPathCollector
from any_replay_buffer import anyReplayBuffer
# from policies import *

### env

In [None]:
# Immutable environment configuration. Every field has a default, so a bare
# `sysconfig()` is valid. `maxX`/`maxY` bound the grid and `goal_reward` scales
# the reward; `arange` is not read by the code in this notebook.
sysconfig = namedtuple(
    "sysconfig",
    "arange maxX maxY goal_reward",
    defaults=(1., 10., 10., 1.),
)

In [None]:
class action(IntEnum):
    """Discrete moves available to the drone.

    The integer values (0..3) double as the keys of `a2vecmap`, which holds the
    displacement vector applied for each move.
    """
    up, down, right, left = range(4)

In [None]:
# Lookup table: action index -> (dx, dy) displacement tensor.
# Row order is 0 = up, 1 = down, 2 = right, 3 = left.
a2vecmap = dict(enumerate(torch.Tensor([
    [0., 1.],
    [0., -1.],
    [1., 0.],
    [-1., 0.],
])))

In [None]:
class droneDeliveryProbe(gym.Env):
    """
    ### Description
    Probe environment: a single drone moves on a bounded 2-D grid and is
    rewarded while it sits on the origin cell.

    ### Action Space
    One of the four moves defined by `action` (up, down, right, left). The move
    is turned into a unit displacement through `a2vecmap`.

    ### State Space
    A 2-element tensor with the drone's (x, y) position. It is sampled from
    `MultiDiscrete([maxX, maxY])` on reset and clamped to `[0, max - 1]` on
    every step.

    ### Rewards
    `goal_reward` when the step starts at the origin (L1 norm of the state is
    zero), otherwise 0. The reward is computed before the move is applied.

    ### Starting State
    Randomly initialized position drawn from the state space.

    ### Episode Termination
    `done` is True when the step starts at the origin.

    ### Arguments
    No additional arguments are currently supported.
    """

    def __init__(self):
        self.config = sysconfig()

        self.aspace = MultiDiscrete(len(action))
        self.sspace = MultiDiscrete([self.config.maxX, self.config.maxY]) # dependency on arange
        self.state = None

    def get_distances(self):
        """Return the L1 norm of the current state as a 1-element tensor."""
        return self.state.norm(p=1, keepdim=True)

    def get_size(self):
        """Return the grid extent `[maxX, maxY]` as a tensor."""
        return torch.Tensor([self.config.maxX, self.config.maxY])

    def reward(self, a):
        """Return `goal_reward` (as a tensor) if the current state is the origin, else 0.

        `a` is accepted for interface symmetry with `step` but is not used.
        """
        dis = self.get_distances()
        return (dis == 0).sum() * self.config.goal_reward

    def step(self, a):
        """Apply action `a` and return `(next_state, reward, done, info)`.

        Reward and `done` are evaluated on the state *before* the move; the
        move is then applied and clamped to the grid.
        """
        err_msg = f"{a!r} ({type(a)}) is not a valid action."
        assert self.aspace.contains(a), err_msg

        a = a2vecmap[a]
        reward = self.reward(a)
        done = bool((self.get_distances() == 0).all())

        self.state = (self.state + a).clamp(min=torch.zeros(2), max=self.get_size() - 1)
        # Hand out a copy so callers cannot mutate the internal state.
        return deepcopy(self.state), reward.item(), done, {}

    def reset(self, seed: Optional[int] = None):
        """Sample a fresh random state and return a copy of it.

        NOTE(review): `sspace.sample()` draws from the space object, so seeding
        the env via `super().reset` may not by itself make the sampled state
        reproducible -- confirm against the installed gym version.
        """
        if seed is not None:
            super().reset(seed=seed)

        self.state = torch.Tensor(self.sspace.sample())
        return deepcopy(self.state)

    def render(self):
        pass

    def seed(self, n: int):
        """Seed the environment with `n`."""
        # Use the argument; the previous code passed the unrelated global `seed`.
        super().reset(seed=n)

In [None]:
# Fix the global PyTorch and NumPy seeds for the rest of the notebook.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

In [None]:
# Build the probe environment and draw an initial state from it.
env = droneDeliveryProbe()
x = env.reset()

### Model

In [None]:
# Use the index of the active CUDA device when a GPU is visible, else the CPU.
if torch.cuda.is_available():
    device = torch.cuda.current_device()
else:
    device = 'cpu'

In [None]:
from collections import OrderedDict

from torch.nn import Linear, ReLU
import torch.nn.functional as F

class droneDeliveryModel(nn.Module):
    """Two-layer MLP mapping a state vector to one score per action.

    Args:
        c_in: number of input features.
        c_out: number of outputs.
        c_hidden: width of the hidden layer.
        dp_rate_linear: accepted for interface compatibility; currently unused.
        **kwargs: accepted and ignored.
    """

    def __init__(self, c_in, c_out, c_hidden=64, dp_rate_linear=0.5, **kwargs):

        super().__init__()

        self.model = nn.Sequential(OrderedDict([
            ('fc1', nn.Linear(c_in, c_hidden)),
            ('relu1', nn.ReLU()),
            ('fc2', nn.Linear(c_hidden, c_out)),
            # Explicit dim: normalise over the output features. For 1-D and
            # 2-D inputs this matches what the implicit-dim form selected,
            # without the deprecation warning.
            ('act', nn.Softmax(dim=-1))
        ]))
        # NOTE(review): the softmax bounds outputs to (0, 1); if these are used
        # as Q-values with bootstrapped targets, a linear head may fit better.

        # Device that inputs are moved to in `forward`; kept in sync by `to`.
        self._device = 'cpu'

    def forward(self, x):
        """Move `x` to the model's device and run it through the MLP."""
        return self.model(x.to(self._device))

    def to(self, device):
        """Move the module to `device`, remember it for `forward`, and return self.

        Returning self keeps the usual `model = Model(...).to(device)` idiom working.
        """
        super().to(device)
        self._device = device
        return self

In [None]:
# Network sizes: a 2-element position goes in, one output per action comes out.
in_channels = 2
out_channels = len(action)

In [None]:
# Online Q-network and the target network used for bootstrapped targets.
qf = droneDeliveryModel(in_channels, out_channels, 16)
target_qf = droneDeliveryModel(in_channels, out_channels, 16)
# Start the target from the online weights. Without this, every target computed
# before the first sync comes from an unrelated random initialisation.
target_qf.load_state_dict(qf.state_dict())
qf.to(device)
target_qf.to(device)

### RL 

In [None]:
from rlkit.policies.base import Policy
# from policies import *

In [None]:
class sysRolloutPolicy(nn.Module, Policy):
    """Hand-written baseline that randomly steps left or down, weighted by the observation."""

    def __init__(self):
        super().__init__()

    def get_action(self, obs):
        """Sample `action.left` or `action.down`.

        The sampling weights are `obs / obs.sum()` (first entry -> left, second
        entry -> down). When the entries sum to zero the choice is uniform.
        `obs` is expected to be a 2-element tensor, since the weights must match
        the two candidate actions.

        Returns:
            (action, info) where info is an empty dict.
        """
        total = obs.sum()
        if total != 0:
            weights = obs.numpy() / total.item()
        else:
            weights = None
        candidates = [action.left, action.down]
        return np.random.choice(candidates, p=weights), {}

class argmaxDiscretePolicy(nn.Module, Policy):
    """Greedy policy: returns the index of the largest Q-network output.

    Args:
        qf: callable mapping an observation to per-action scores.
        dim: axis passed to `argmax`.
    """

    def __init__(self, qf, dim=0):
        super().__init__()
        self.dim = dim
        self.qf = qf

    def get_action(self, obs):
        """Return `(argmax of qf(obs) along self.dim, {})`."""
        scores = self.qf(obs).cpu().detach().numpy()
        best = scores.argmax(self.dim)
        return best, {}

# redundant code - clean this up
class epsilonGreedyPolicy(nn.Module, Policy):
    """Epsilon-greedy policy over a Q-network.

    Args:
        qf: callable mapping an observation to per-action scores.
        space: action space; its `sample()` supplies the random action.
        eps: probability of taking a random action.
        dim: axis passed to `argmax` for the greedy action.
    """

    def __init__(self, qf, space, eps=0.1, dim=0):
        super().__init__()
        self.qf, self.aspace = qf, space
        self.eps, self.dim = eps, dim

    def get_action(self, obs):
        """Return `(action, {})`: random with probability `eps`, greedy otherwise."""
        explore = rand() < self.eps
        if not explore:
            greedy = self.qf(obs).cpu().detach().numpy().argmax(self.dim)
            return greedy, {}
        return self.aspace.sample(), {}

In [None]:
# Baseline 1: greedy policy over the Q-network as it stands at this point in the notebook.
example_policy = argmaxDiscretePolicy(qf)
path_collector = MdpPathCollector(env, example_policy)
paths = path_collector.collect_new_paths(250, 40, False)

# Mean return per trajectory, then mean reward per step. Only the per-step
# value is kept in `expected_random` for the plot further down.
traj_returns = [np.sum(p['rewards']) for p in paths]
expected_random = np.mean(traj_returns)
print("Expected reward (per traj):", expected_random)
step_rewards = np.hstack([p['rewards'] for p in paths])
expected_random = step_rewards.mean()
print("Expected reward (per step):", expected_random, '\n')

In [None]:
# Baseline 2: the hand-written left/down heuristic.
example_policy = sysRolloutPolicy()
path_collector = MdpPathCollector(env, example_policy)
paths = path_collector.collect_new_paths(250, 40, False)
expected_heuristic = np.mean([np.sum(p['rewards']) for p in paths])
print("Expected reward (per traj):", expected_heuristic)
# Only the per-step value is kept in `expected_heuristic` for the plot further down.
expected_heuristic = np.hstack([p['rewards'] for p in paths]).mean()
print("Expected reward (per step):", expected_heuristic, '\n')

# Dump one randomly chosen trajectory for inspection. The index is bounded by
# the number of collected paths; a fixed upper bound of 100 could run past the
# end of `paths` and raise IndexError.
idx = np.random.randint(len(paths))
for s, a, r, t in zip(paths[idx]['observations'], paths[idx]['actions'],
                      paths[idx]['rewards'], paths[idx]['terminals']):
    print(s)
    print(a)
    print(r)
    print(t)

In [None]:
# Loss between predicted and target Q-values.
qf_criterion = nn.MSELoss()

# Greedy policy for evaluation; epsilon-greedy (eps=0.1) over the same network for exploration.
eval_policy = argmaxDiscretePolicy(qf)
expl_policy = epsilonGreedyPolicy(
    qf,
    env.aspace,
    eps=0.1,
)

In [None]:
from torch.optim import Adam

# One collector per policy so exploration and evaluation rollouts stay separate.
expl_path_collector = MdpPathCollector(env, expl_policy)
eval_path_collector = MdpPathCollector(env, eval_policy)

# Replay storage (capacity 10k) and the optimizer for the online Q-network only.
replay_buffer = anyReplayBuffer(10_000, prioritized=True)
optimizer = Adam(qf.parameters(), lr=5e-3)

#### train

In [None]:
# --- hyper-parameters ---
n_epoch = 50          # outer iterations: evaluate, collect, train, sync target
n_iter = 100          # gradient steps per epoch
batch_size = 128*4    # replay batch size (also the first argument of the eval collection)
max_len = 40
n_samples = 256*4
gamma = 0.95          # discount factor applied to the bootstrapped target

# --- logs (read by the plotting cell) ---
loss = []
avg_r_train = []
avg_r_test = []

for i in range(n_epoch):
    # Evaluate the greedy policy before this epoch's updates.
    qf.train(False)
    paths = eval_path_collector.collect_new_paths(batch_size, max_len, False)
    avg_r_test.append(np.hstack([p['rewards'] for p in paths]).mean())

    # Collect exploration data and add it to the replay buffer.
    paths = expl_path_collector.collect_new_paths(n_samples, max_len, False)
    replay_buffer.add_paths(paths)

    qf.train(True)
    for _ in range(n_iter):
        batch = replay_buffer.random_batch(batch_size)
        # Force every per-sample scalar into an explicit (B, 1) column. If one
        # of them arrived as (B,) and another as (B, 1), the target arithmetic
        # below would silently broadcast to (B, B).
        # NOTE(review): assumes the buffer returns one scalar reward / terminal
        # / action per sample -- confirm against anyReplayBuffer.
        rewards = torch.Tensor(batch['rewards']).reshape(-1, 1)
        terminals = torch.Tensor(batch['terminals']).reshape(-1, 1)
        actions = torch.Tensor(batch['actions']).reshape(-1, 1).to(torch.int64)

        obs = batch['observations']
        next_obs = batch['next_observations']

        # Targets are constants for the optimisation step: no autograd graph
        # through the target network.
        with torch.no_grad():
            X = torch.stack(next_obs)
            out = target_qf(X).cpu()

            target_q_values = out.max(-1, keepdim=True).values
            y_target = rewards + (1. - terminals) * gamma * target_q_values

        X = torch.stack(obs)
        out = qf(X).cpu()

        # Select the Q-value of the action actually taken.
        y_pred = out.gather(-1, actions)
        qf_loss = qf_criterion(y_pred, y_target)

        loss.append(qf_loss.item())
        avg_r_train.append(rewards.mean().item())

        optimizer.zero_grad()
        qf_loss.backward()
        optimizer.step()

    # Hard-sync the target network once per epoch.
    target_qf.load_state_dict(deepcopy(qf.state_dict()))
    print("iter ", i+1, " -> loss: ", np.mean(loss[-n_iter:]),
          ", rewards: (train) ", np.mean(avg_r_train[-n_iter:]),
          ", (test) ", avg_r_test[-1])

In [None]:
import matplotlib.pyplot as plt

In [None]:
# x-axis: one point per gradient step over the whole run.
n_steps_total = n_iter * n_epoch
steps = np.arange(n_steps_total)

# Flat reference lines for the two baselines measured before training.
plt.plot(steps, [expected_random] * n_steps_total, label="do nothing", color='lightgray')
plt.plot(steps, [expected_heuristic] * n_steps_total, label="move to closest", color='darkgray')

# Training reward is logged every gradient step; test reward once per epoch.
plt.plot(steps, avg_r_train, label="avg R (train)")
eval_steps = np.arange(n_iter, n_steps_total + 1, step=n_iter)
plt.plot(eval_steps, avg_r_test, label="avg R (test)")
plt.legend()

plt.savefig('training_log.png', dpi=300)

#### Options to make it better
1. curriculum learning
2. higher eps