In [1]:
import numpy as np
from numpy.random import choice, randint, rand, uniform

import pdb

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.utils.random import erdos_renyi_graph
from torch_geometric.utils import to_dense_adj, to_networkx, to_undirected

import networkx as nx
import gym
from gym.spaces import Box, MultiDiscrete

from collections import namedtuple
from copy import copy, deepcopy
from typing import Optional
from enum import Enum, IntEnum
import sys
sys.path.append('/home/victorialena/rlkit')

import rlkit
from path_collector import MdpPathCollector

from any_replay_buffer import anyReplayBuffer
# from policies import *

### env

In [2]:
# Environment configuration (all fields have defaults):
#   arange      - grid step; sampled coordinates are snapped down to a multiple of it
#   maxX, maxY  - grid extent along x and y
#   goal_reward - reward granted while the drone sits on the goal
sysconfig = namedtuple(
    "sysconfig",
    field_names=['arange', 'maxX', 'maxY', 'goal_reward'],
    defaults=[1., 3., 3., 1.],
)

In [3]:
# Discrete action ids. The index order (up, down, right, left) is the one
# `a2vecmap` uses to look up the matching displacement vector.
actions = namedtuple(
    "actions",
    field_names=['up', 'down', 'right', 'left'],
    defaults=tuple(np.int64(idx) for idx in range(4)),
)

In [5]:
# Instance holding the default action ids; `len(action)` is the number of actions.
action = actions()

In [6]:
# Action id -> 2-D displacement (x, y): 0 = up, 1 = down, 2 = right, 3 = left.
a2vecmap = dict(enumerate(torch.Tensor([
    [0., 1.],
    [0, -1.],
    [1., 0.],
    [-1., 0],
])))

In [7]:
class droneDeliveryProbe(gym.Env):
    """
    ### Description
    Single-drone probe environment on a 2-D grid. The state is a two-node graph:
    node 0 is the drone and node 1 is the goal.
    
    ### Action Space
    One discrete action per step: up, down, right or left (ids as in `actions`,
    displacements as in `a2vecmap`). The resulting position is clamped to the grid.
    
    ### State Space
    A `Data` object whose `x` is a 2x2 tensor (row 0: drone position, row 1: goal
    position) and whose `edge_index` links the two nodes in both directions.
    
    ### Rewards
    `config.goal_reward` when the drone is on the goal, otherwise 0. The reward and the
    terminal flag are evaluated on the state *before* the action is applied.
    
    ### Starting State
    Randomly initialized positions, snapped down to multiples of `config.arange`.
    
    ### Episode Termination
    When the drone has reached the goal.
    
    ### Arguments
    No additional arguments are currently supported.
    """

    def __init__(self):
        self.config = sysconfig()
        
        self.aspace = MultiDiscrete(len(action))
        # Both rows (drone, goal) share the same bounds: x in [0, maxX], y in [0, maxY].
        self.sspace = Box(low=np.zeros((2,2)), 
                          high=np.hstack([np.ones((2,1))*self.config.maxX, np.ones((2,1))*self.config.maxY]),
                          dtype=np.float32)
        self.state = None

    def _seed_rngs(self, seed):
        """Seed the env RNG and the RNGs owned by the action/state spaces.

        Start states are drawn with `self.sspace.sample()`, which uses the space's own
        generator, so seeding only the base env would leave `reset` non-reproducible.
        """
        super().reset(seed=seed)
        self.sspace.seed(seed)
        self.aspace.seed(seed)
        
    def get_distances(self):
        """Return the L1 distance between drone and goal as a 1-element tensor."""
        return (self.state.x[1]-self.state.x[0]).norm(p=1, keepdim=True)
    
    def get_size(self):
        """Return the grid extent `[maxX, maxY]` as a float tensor."""
        return torch.Tensor([self.config.maxX, self.config.maxY])
        
    def reward(self, a):
        """Return `goal_reward` (as a tensor) if the drone is on the goal, else 0.

        `a` is accepted for interface symmetry but is not used.
        """
        dis = self.get_distances()        
        return (dis==0).sum() * self.config.goal_reward 
                        
    def step(self, a):
        """Apply action id `a` and return `(state, reward, done, info)`.

        Reward and `done` are computed from the current state first; only then is the
        drone moved (clamped to `[0, size - 1]` on each axis). The returned state is a
        deep copy, so callers cannot mutate the environment through it.
        """
        err_msg = f"{a!r} ({type(a)}) is not a valid action."
        assert self.aspace.contains(a), err_msg
                
        a = a2vecmap[a]
        reward = self.reward(a)
        done = all(self.get_distances() == 0)
        
        self.state.x[0] = (self.state.x[0]+a).clamp(min=torch.zeros(2), max=self.get_size()-1)        
        return deepcopy(self.state), reward.item(), done, {}

    def reset(self, seed: Optional[int] = None):
        """Sample a new start state and return a deep copy of it.

        If `seed` is given, all RNGs used by the environment are re-seeded first.
        """
        if seed is not None:
            self._seed_rngs(seed)
            
        x = torch.Tensor(self.sspace.sample())
        # Snap the continuous sample down onto the grid.
        x -= x % self.config.arange
        
        # eye(2) == [[1, 0], [0, 1]]: edges 1 -> 0 and 0 -> 1.
        edge_index = torch.eye(2, dtype=torch.int64)
        self.state = Data(x=x, edge_index=edge_index)
        
        return deepcopy(self.state)

    def render(self):
        """Rendering is not implemented."""
        pass
    
    def seed(self, n: int):
        """Seed the environment with `n` (previously the global `seed` was used by mistake)."""
        self._seed_rngs(n)

In [8]:
# Fix the global PyTorch and NumPy RNG seeds for this notebook.
# `np.random.seed` stays last so the cell keeps producing no output.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

In [9]:
# Build the environment and draw an initial state.
# NOTE(review): the start state comes from `env.sspace.sample()`; presumably the space
# uses its own RNG, so the global seeds set earlier may not make this draw
# reproducible - confirm against the gym version in use.
env = droneDeliveryProbe()
x = env.reset()

### Model

In [10]:
# Use the current CUDA device index when a GPU is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.cuda.current_device()
else:
    device = 'cpu'

In [11]:
from collections import OrderedDict

from torch.nn import Linear, ReLU, Softmax
import torch.nn.functional as F
from torch_geometric.nn import Sequential, GCNConv, SAGEConv

class droneDeliveryModel(nn.Module):
    """Graph network producing one score per action for node 0 of the input graph.

    The network is a single `SAGEConv` layer, a ReLU, a linear head and a softmax
    over the last dimension. It is applied to every node, and `forward` returns the
    row belonging to node 0 only.

    Args:
        c_in: number of input features per node.
        c_out: number of outputs per node.
        c_hidden: width of the hidden layer.
        maxXY: optional tensor used to rescale node features (`x / maxXY - 0.5`)
            before they enter the network; `None` disables the rescaling.
        **kwargs: accepted and ignored.
    """
    
    def __init__(self, c_in, c_out, c_hidden=64, maxXY = None, **kwargs):
        
        super().__init__()
        
        # NOTE(review): the softmax forces the outputs to sum to 1. The notebook trains
        # this model as a Q-function with an MSE loss on TD targets - confirm that
        # bounded, normalized outputs are really intended.
        self.model = Sequential('x, edge_index', [
            (SAGEConv(c_in, c_hidden), 'x, edge_index -> x'),
            ReLU(inplace=True),
            Linear(c_hidden, c_out),
            nn.Softmax(dim=-1)
        ])

        self._device = 'cpu'
        self.maxXY = maxXY        

    def forward(self, x):
        """Return the network output for node 0 of the graph `x`.

        `x` must expose `x.x` (node features) and `x.edge_index`; both are moved to
        the device recorded by `to()` before the forward pass.
        """
        X = x.x
        # `is not None` avoids relying on how a tensor compares against None.
        if self.maxXY is not None:
            X = X.div(self.maxXY) - 0.5
        return self.model(X.to(self._device), x.edge_index.to(self._device))[0]
    
    def to(self, device):
        """Move the module to `device`, remember it for `forward`, and return `self`.

        Returning `self` matches `nn.Module.to`, so `model = Model(...).to(device)` works.
        """
        super().to(device)
        self._device = device
        return self

In [12]:
# Two input features per node (x, y position); one output per available action.
in_channels, out_channels = 2, len(action)

In [13]:
# Online Q-network.
qf = droneDeliveryModel(in_channels, out_channels, 16, maxXY = env.get_size())
qf.to(device)

# Target network: same architecture, started as an exact copy of `qf` so that the
# first TD targets do not come from an unrelated random initialization.
target_qf = droneDeliveryModel(in_channels, out_channels, 16, maxXY = env.get_size())
target_qf.load_state_dict(qf.state_dict())
target_qf.to(device)

### RL 

In [14]:
# Value passed as the second argument of `collect_new_paths` in the cells below.
max_len = 10

In [15]:
from rlkit.policies.base import Policy
# from policies import *

def pick(x):
    """Sample an action id from `action` with probabilities proportional to `x`.

    Args:
        x: sequence of non-negative weights (booleans are fine), one per action.

    Returns:
        One element of `action`. If all weights are zero the draw is uniform.
    """
    # Explicit array conversion: dividing a plain Python list would raise TypeError.
    weights = np.asarray(x, dtype=float)
    total = weights.sum()
    return np.random.choice(action, p=weights / total if total != 0 else None)

class sysRolloutPolicy(nn.Module, Policy):
    """Heuristic policy that steps the drone (row 0 of `obs.x`) towards the goal (row 1).

    Every action that reduces the remaining offset along its axis gets equal weight;
    the final choice is delegated to `pick`.
    """

    def __init__(self):
        super().__init__()

    def get_action(self, obs):
        """Return `(action_id, {})` for the observation `obs`."""
        dx, dy = (obs.x[1] - obs.x[0]).numpy()
        # Weight order follows the action ids: up, down, right, left.
        preferences = [dy > 0, dy < 0, dx > 0, dx < 0]
        return pick(preferences), {}
    
class argmaxDiscretePolicy(nn.Module, Policy):
    """Greedy policy: picks the index of the largest output of `qf`.

    Args:
        qf: callable mapping an observation to a tensor of per-action scores.
        dim: axis along which the argmax is taken.
    """

    def __init__(self, qf, dim=0):
        super().__init__()
        self.dim = dim
        self.qf = qf

    def get_action(self, obs):
        """Return `(argmax action, {})` for the observation `obs`."""
        scores = self.qf(obs).cpu().detach().numpy()
        return np.argmax(scores, axis=self.dim), {}

# redundant code - clean this up
class epsilonGreedyPolicy(nn.Module, Policy):
    """Epsilon-greedy policy over the outputs of `qf`.

    Args:
        qf: callable mapping an observation to a tensor of per-action scores.
        space: action space; `space.sample()` supplies the exploratory action.
        eps: probability of taking an exploratory action.
        dim: axis along which the greedy argmax is taken.
    """

    def __init__(self, qf, space, eps=0.1, dim=0):
        super().__init__()
        self.qf = qf
        self.aspace = space

        self.dim = dim
        self.eps = eps

    def get_action(self, obs):
        """Return `(action, {})`: a sampled action with probability `eps`, else the greedy one."""
        explore = rand() < self.eps
        if not explore:
            scores = self.qf(obs).cpu().detach().numpy()
            return scores.argmax(self.dim), {}
        return self.aspace.sample(), {}

In [18]:
# Baseline: roll out the greedy policy of the Q-network as it is at this point.
example_policy = argmaxDiscretePolicy(qf)
path_collector = MdpPathCollector(env, example_policy)
paths = path_collector.collect_new_paths(100, max_len, False)

# Mean total reward per trajectory (printed only; overwritten just below).
expected_random = np.mean([np.sum(path['rewards']) for path in paths])
print("Expected reward (per traj):", expected_random)
# Mean reward per step, pooled over all trajectories; this is the value that is kept.
expected_random = np.mean(np.hstack([path['rewards'] for path in paths]))
print("Expected reward (per step):", expected_random, '\n')

# Print one randomly chosen trajectory: state, action, reward and terminal flag per step.
idx = np.random.randint(100)
for s, a, r, t in zip(*(paths[idx][key]
                        for key in ('observations', 'actions', 'rewards', 'terminals'))):
    print(s.x, a, r, t, sep='\n')

Expected reward (per traj): 0.22
Expected reward (per step): 0.026731470230862697 

tensor([[0., 1.],
        [0., 2.]])
1
0.0
[False]
tensor([[0., 0.],
        [0., 2.]])
2
0.0
[False]
tensor([[1., 0.],
        [0., 2.]])
2
0.0
[False]
tensor([[2., 0.],
        [0., 2.]])
2
0.0
[False]
tensor([[2., 0.],
        [0., 2.]])
2
0.0
[False]
tensor([[2., 0.],
        [0., 2.]])
2
0.0
[False]
tensor([[2., 0.],
        [0., 2.]])
2
0.0
[False]
tensor([[2., 0.],
        [0., 2.]])
2
0.0
[False]
tensor([[2., 0.],
        [0., 2.]])
2
0.0
[False]
tensor([[2., 0.],
        [0., 2.]])
2
0.0
[False]


In [19]:
# Baseline: roll out the hand-written "move towards the goal" heuristic.
example_policy = sysRolloutPolicy()
path_collector = MdpPathCollector(env, example_policy)
paths = path_collector.collect_new_paths(100, max_len, False)

# Mean total reward per trajectory (printed only; overwritten just below).
expected_heuristic = np.mean([np.sum(path['rewards']) for path in paths])
print("Expected reward (per traj):", expected_heuristic)
# Mean reward per step, pooled over all trajectories; this is the value that is kept.
expected_heuristic = np.mean(np.hstack([path['rewards'] for path in paths]))
print("Expected reward (per step):", expected_heuristic, '\n')

# Print one randomly chosen trajectory: state, action, reward and terminal flag per step.
idx = np.random.randint(100)
for s, a, r, t in zip(*(paths[idx][key]
                        for key in ('observations', 'actions', 'rewards', 'terminals'))):
    print(s.x, a, r, t, sep='\n')

Expected reward (per traj): 1.0
Expected reward (per step): 0.34965034965034963 

tensor([[1., 1.],
        [0., 0.]])
1
0.0
[False]
tensor([[1., 0.],
        [0., 0.]])
3
0.0
[False]
tensor([[0., 0.],
        [0., 0.]])
2
1.0
[ True]


In [20]:
# Loss between predicted and target Q-values.
qf_criterion = nn.MSELoss()
# Evaluation acts greedily on `qf`; exploration data comes from the heuristic policy.
eval_policy = argmaxDiscretePolicy(qf)
expl_policy = sysRolloutPolicy()
# expl_policy = epsilonGreedyPolicy(qf, env.aspace, eps=0.1)

In [21]:
from torch.optim import Adam

# Separate collectors for exploration and evaluation roll-outs on the same env.
expl_path_collector = MdpPathCollector(env, expl_policy) 
eval_path_collector = MdpPathCollector(env, eval_policy)
replay_buffer = anyReplayBuffer(10000, prioritized=True)
# Only the parameters of `qf` are handed to the optimizer.
optimizer = Adam(qf.parameters(), lr=5E-3)

#### train

In [22]:
n_epoch = 50
n_iter = 10
batch_size = 128
n_samples = 256
discount = 0.95  # discount factor of the TD target

loss = []
avg_r_train = []
avg_r_test = []

for i in range(n_epoch):
    # Evaluate the greedy policy *before* this epoch's gradient updates.
    qf.train(False)
    paths = eval_path_collector.collect_new_paths(batch_size, max_len, False)
    avg_r_test.append(np.hstack([p['rewards'] for p in paths]).mean())
    
    # Collect fresh exploration data and store it in the replay buffer.
    paths = expl_path_collector.collect_new_paths(n_samples, max_len, False)
    replay_buffer.add_paths(paths)
    
    qf.train(True)    
    for _ in range(n_iter):
        batch = replay_buffer.random_batch(batch_size)
        rewards = torch.Tensor(batch['rewards']).unsqueeze(-1)
        terminals = torch.Tensor(batch['terminals'])
        # Named `batch_actions` so the module-level `actions` namedtuple is not overwritten.
        batch_actions = torch.Tensor(batch['actions'])

        obs = batch['observations']
        next_obs = batch['next_observations']
        
        # The TD target is a constant w.r.t. the optimized parameters: no autograd needed.
        with torch.no_grad():
            next_out = torch.stack([target_qf(o) for o in next_obs], dim=0).cpu()
            target_q_values = next_out.max(-1, keepdim=True).values
            y_target = rewards + (1. - terminals) * discount * target_q_values

        out = torch.stack([qf(o) for o in obs], dim=0).cpu()
               
        # Select the prediction belonging to the action that was actually taken.
        actions_one_hot = F.one_hot(batch_actions.to(torch.int64), len(action))
        y_pred = torch.sum(out * actions_one_hot, dim=-1, keepdim=True)
        qf_loss = qf_criterion(y_pred, y_target)
        
        loss.append(qf_loss.item())
        avg_r_train.append(rewards.mean().item())
        
        optimizer.zero_grad() 
        qf_loss.backward()
        optimizer.step()
    
    # Sync the target network once per epoch (`load_state_dict` copies the values).
    target_qf.load_state_dict(qf.state_dict())
    print("iter ", i+1, " -> loss: ", np.mean(loss[-n_iter:]),
          ", rewards: (train) ", np.mean(avg_r_train[-n_iter:]),
          ", (test) ", avg_r_test[-1])

iter  1  -> loss:  0.34079586863517763 , rewards: (train)  0.59921875 , (test)  0.024390243902439025
iter  2  -> loss:  0.348622715473175 , rewards: (train)  0.6171875 , (test)  0.015985790408525755
iter  3  -> loss:  0.3441340297460556 , rewards: (train)  0.6140625 , (test)  0.02117863720073665
iter  4  -> loss:  0.33875506818294526 , rewards: (train)  0.60078125 , (test)  0.025689819219790674
iter  5  -> loss:  0.32619392275810244 , rewards: (train)  0.58125 , (test)  0.03290129611166501
iter  6  -> loss:  0.31986173391342165 , rewards: (train)  0.57265625 , (test)  0.03406813627254509
iter  7  -> loss:  0.3468332141637802 , rewards: (train)  0.615625 , (test)  0.02554399243140965
iter  8  -> loss:  0.33547801673412325 , rewards: (train)  0.59453125 , (test)  0.018867924528301886
iter  9  -> loss:  0.3315077215433121 , rewards: (train)  0.58984375 , (test)  0.017070979335130278
iter  10  -> loss:  0.3359712898731232 , rewards: (train)  0.60078125 , (test)  0.019004524886877826
iter  

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

In [None]:
total_steps = n_iter * n_epoch
# One evaluation per *started* epoch, taken before that epoch's updates, i.e. after
# `epoch * n_iter` gradient steps. Using len(avg_r_test) keeps the plot valid when
# training was interrupted early.
eval_steps = np.arange(len(avg_r_test)) * n_iter

fig, ax = plt.subplots()
ax.plot(np.arange(total_steps), [expected_random]*total_steps, label = "random", color='lightgray')
ax.plot(np.arange(total_steps), [expected_heuristic]*total_steps, label = "move to closest",  color='darkgray')
ax.plot(eval_steps, avg_r_test, label = "avg R (test)")

ax.set_xlabel("gradient steps")
ax.set_ylabel("average reward per step")
ax.legend()

fig.savefig('training_log.png', dpi=300)

#### Options to make it better
1. curriculum learning
2. higher eps (more exploration in the epsilon-greedy policy)