In [1]:
import numpy as np
from numpy.random import choice, randint, rand, uniform

import pdb

import torch
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.utils.random import erdos_renyi_graph
from torch_geometric.utils import to_dense_adj, to_networkx, to_undirected

import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

import networkx as nx
import gym
from gym.spaces import Box, MultiDiscrete

from collections import namedtuple
from copy import copy, deepcopy
from typing import Optional
from enum import Enum, IntEnum
import sys
sys.path.append('/home/victorialena/rlkit')

import rlkit
from path_collector import MdpPathCollector

from any_replay_buffer import anyReplayBuffer

In [2]:
import time
start_time = time.time()

### env

In [3]:
device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

In [4]:
sysconfig = namedtuple("sysconfig",
                       ['maxX', 'maxY', 'goal_reward'], 
                       defaults=[2, 2, 1.])

In [5]:
actions = namedtuple("actions", 
                    ['right', 'left', 'up', 'down'], 
                    defaults=[np.int64(0), np.int64(1), np.int64(2), np.int64(3)])
action = actions()
a2vecmap = torch.Tensor([[1., 0.],
                         [-1, 0.],
                         [0., 1.],
                         [0, -1.]]).to(device)

In [6]:
class droneDeliveryProbe(gym.Env):
    """
    ### Description
    
    ### Action Space
    Each agent in the scene can move R or L or not at all.
    
    ### State Space
    The state is defined as an arbitrary input array of positions of n drones, appended by the location
    of the goal region.
    
    ### Rewards
    The reward of +1 is given to the system for each drone reaching the goal region.
    
    ### Starting State
    Randomly initilized input array.
    
    ### Episode Termination
    When all drones have reached the goal region.
    
    ### Arguments
    No additional arguments are currently supported.
    """

    def __init__(self, device = 'cpu'):
        self.config = sysconfig()
        
        self.aspace = MultiDiscrete(len(action))
        self.sspace = MultiDiscrete([self.config.maxX, self.config.maxY])
        self.state = None
        
        self._device = device
        
    def get_distances(self):
        return (self.state[2:]-self.state[:2]).norm(p=1)
    
    def get_size(self):
        return torch.Tensor([self.config.maxX, self.config.maxY])
        
    def reward(self, a):
        return (self.state[:2] == self.state[2:]).all().float().item()
                        
    def step(self, a):
        err_msg = f"{a!r} ({type(a)}) is not a valid action."
        assert self.aspace.contains(a), err_msg
        
        reward = self.reward(a)
        a = a2vecmap[a]
        done = (self.state[:2] == self.state[2:]).all().item()
    
        self.state[:2] = (self.state[:2]+a).clamp(min=0, max=1)
        
        return deepcopy(self.state), deepcopy(reward), deepcopy(done), {}

    def reset(self, seed: Optional[int] = None):
        if not seed == None:
            super().reset(seed=seed)
          
        self.state = torch.Tensor([self.sspace.sample() for _ in range(2)]).flatten().to(device)
        return deepcopy(self.state)

    def render(self):
        pass
    
    def seed(self, n: int):
        super().seed(n)
        self.aspace.seed(n)
        self.sspace.seed(n)
        
    def to(self, device):
        self._device = device
        if self.state:
            self.state = self.state.to(device)

In [7]:
env = droneDeliveryProbe(device)
x = env.reset()

### Model

In [8]:
from collections import OrderedDict

from torch.nn import Linear, ReLU, Softmax
import torch.nn.functional as F
        
class droneDeliveryModel(nn.Module):
    
    def __init__(self, c_in, c_out, c_hidden=64, **kwargs):
        
        super().__init__()
        
        self.model = nn.Sequential(OrderedDict([
            ('lin1', Linear(c_in, c_hidden)),
            ('relu1', ReLU()),
            ('lin2', Linear(c_hidden, c_out)),
#             nn.Softmax(dim=-1) # no freaking softmax
        ]))

        self._device = 'cpu'  

    def forward(self, x):
        return self.model(x)
    
    def to(self, device):
        super().to(device)
        self._device = device

In [9]:
in_channels, out_channels = 4, len(action)

### RL 

In [10]:
max_len = env.config.maxX + env.config.maxY - 1

In [11]:
from rlkit.policies.base import Policy

pick = lambda x: np.random.choice(action, p=x/sum(x) if sum(x) != 0 else None)

class sysRolloutPolicy(nn.Module, Policy):
    def __init__(self):
        super().__init__()

    def get_action(self, obs):
        dis = (obs[2:] - obs[:2]).cpu().numpy()
        p = np.array([dis[0] > 0, dis[0] < 0, dis[1] > 0, dis[1] < 0], dtype=int)
        return pick(p), {}
    
class argmaxDiscretePolicy(nn.Module, Policy):
    def __init__(self, qf, dim=0):
        super().__init__()
        self.qf = qf
        self.dim = dim

    def get_action(self, obs):
        q_values = self.qf(obs)
        return q_values.cpu().detach().numpy().argmax(self.dim), {}

# redundant code - clean this up
class epsilonGreedyPolicy(nn.Module, Policy):
    def __init__(self, qf, space, eps=0.1, dim=0):
        super().__init__()
        self.qf = qf
        self.aspace = space
        
        self.eps = eps
        self.dim = dim

    def get_action(self, obs):
        if rand() < self.eps:
            return self.aspace.sample(), {}
        q_values = self.qf(obs)
        return q_values.cpu().detach().numpy().argmax(self.dim), {}

In [12]:
def mean_reward_per_traj(paths):
    return np.mean([np.sum(p['rewards']) for p in paths])

def mean_reward(paths):
    return np.hstack([p['rewards'] for p in paths]).mean()

In [13]:
example_policy = sysRolloutPolicy() 
path_collector = MdpPathCollector(env, example_policy)
paths = path_collector.collect_new_paths(100, max_len, False)
expected_heuristic = mean_reward_per_traj(paths)
print("Expected reward (per traj):", expected_heuristic)
expected_heuristic = mean_reward(paths)
print("Expected reward (per step):", expected_heuristic, '\n')

idx = np.random.randint(100)
for s, a, r, t in zip(paths[idx]['observations'], paths[idx]['actions'], 
                      paths[idx]['rewards'], paths[idx]['terminals']):
    print(s)
    print(a)
    print(r)
    print(t)

Expected reward (per traj): 1.0
Expected reward (per step): 0.49504950495049505 

tensor([0., 0., 1., 1.], device='cuda:0')
0
0.0
False
tensor([1., 0., 1., 1.], device='cuda:0')
2
0.0
False
tensor([1., 1., 1., 1.], device='cuda:0')
2
1.0
True


In [14]:
qf = droneDeliveryModel(in_channels, out_channels, 8)
qf.to(device)

example_policy = argmaxDiscretePolicy(qf) 
path_collector = MdpPathCollector(env, example_policy)
paths = path_collector.collect_new_paths(100, max_len, False)
expected_random =  mean_reward_per_traj(paths)
print("Expected reward (per traj):", expected_random)
expected_random = mean_reward(paths)
print("Expected reward (per step):", expected_random, '\n')

idx = np.random.randint(100)
for s, a, r, t in zip(paths[idx]['observations'], paths[idx]['actions'], 
                      paths[idx]['rewards'], paths[idx]['terminals']):
    print(s)
    print(a)
    print(r)
    print(t)

Expected reward (per traj): 0.34
Expected reward (per step): 0.14166666666666666 

tensor([0., 1., 0., 0.], device='cuda:0')
0
0.0
False
tensor([1., 1., 0., 0.], device='cuda:0')
0
0.0
False
tensor([1., 1., 0., 0.], device='cuda:0')
0
0.0
False


#### train

In [15]:
RANDOM_SEED = 0

torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
env.seed(RANDOM_SEED)

In [16]:
qf = droneDeliveryModel(in_channels, out_channels, 16)
qf.to(device)

target_qf = droneDeliveryModel(in_channels, out_channels, 16)
target_qf.to(device)

In [17]:
qf_criterion = nn.MSELoss()
eval_policy = argmaxDiscretePolicy(qf)
# expl_policy = sysRolloutPolicy()
expl_policy = epsilonGreedyPolicy(qf, env.aspace, eps=0.5)

In [18]:
expl_path_collector = MdpPathCollector(env, expl_policy) 
eval_path_collector = MdpPathCollector(env, eval_policy)
replay_buffer = anyReplayBuffer(128, prioritized=True)
optimizer = Adam(qf.parameters(), lr=5E-3)

In [19]:
n_epoch = 20
n_iter = 100
batch_size = 128
n_samples = 256

loss = []
avg_r_train = []
avg_r_test = []

for i in range(n_epoch):
    qf.train(False)
    paths = eval_path_collector.collect_new_paths(batch_size, max_len, False)
    avg_r_test.append(mean_reward_per_traj(paths))
    
    paths = expl_path_collector.collect_new_paths(n_samples, max_len, False)
    replay_buffer.add_paths(paths)
    
    qf.train(True)    
    for _ in range(n_iter):
        batch = replay_buffer.random_batch(batch_size)
        rewards = torch.Tensor(batch['rewards']).unsqueeze(-1).to(device)
        terminals = torch.Tensor(batch['terminals']).unsqueeze(-1).to(device)
        actions = torch.Tensor(batch['actions']).to(device)

        obs = batch['observations']
        next_obs = batch['next_observations']
                
        X = torch.stack(next_obs)
        out = target_qf(X)
        
        target_q_values = out.max(-1, keepdim=True).values
        y_target = rewards + (1. - terminals) * 0.90 * target_q_values
        
        X = torch.stack(obs)
        out = qf(X)
               
        actions_one_hot = F.one_hot(actions.to(torch.int64), len(action))
        y_pred = torch.sum(out * actions_one_hot, dim=-1, keepdim=True)
        qf_loss = qf_criterion(y_pred, y_target)
        
        loss.append(qf_loss.item())
        avg_r_train.append(rewards.mean().item())
        
        optimizer.zero_grad() 
        qf_loss.backward()
        optimizer.step()
        
#     print(qf.state_dict().items())
    
    target_qf.load_state_dict(deepcopy(qf.state_dict()))
    print("iter ", i+1, " -> loss: ", np.mean(loss[-n_iter:]),
          ", rewards: (train) ", np.mean(avg_r_train[-n_iter:]),
          ", (test) ", avg_r_test[-1])

iter  1  -> loss:  0.1511211445555091 , rewards: (train)  0.401796875 , (test)  0.4375
iter  2  -> loss:  0.019223500452935697 , rewards: (train)  0.303984375 , (test)  0.2109375
iter  3  -> loss:  0.004559190467698499 , rewards: (train)  0.434375 , (test)  0.453125
iter  4  -> loss:  0.005707259718328714 , rewards: (train)  0.473515625 , (test)  0.546875
iter  5  -> loss:  0.001630101787741296 , rewards: (train)  0.606640625 , (test)  0.890625
iter  6  -> loss:  0.0004854440772760427 , rewards: (train)  0.609765625 , (test)  1.0
iter  7  -> loss:  0.0004920277573546627 , rewards: (train)  0.579765625 , (test)  0.65625
iter  8  -> loss:  0.00016274868827167665 , rewards: (train)  0.565078125 , (test)  1.0
iter  9  -> loss:  0.0003416326496517286 , rewards: (train)  0.4934375 , (test)  0.8828125
iter  10  -> loss:  0.00014817242554272526 , rewards: (train)  0.643046875 , (test)  1.0
iter  11  -> loss:  8.460754726911545e-05 , rewards: (train)  0.65734375 , (test)  1.0
iter  12  -> loss:

In [20]:
print("This eval took me ", time.time() - start_time, " seconds. Thanks for waiting :)")

This eval took me  9.360820770263672  seconds. Thanks for waiting :)


In [21]:
qf.train(False);

In [22]:
qf(torch.Tensor([0, 0, 0, 0]).to(device))

tensor([0.9875, 0.9990, 1.0198, 1.0001], device='cuda:0',
       grad_fn=<AddBackward0>)

```python 
import matplotlib.pyplot as plt

plt.plot(np.arange(n_iter*n_epoch), [expected_random]*(n_iter*n_epoch), label = "random", color='lightgray')
plt.plot(np.arange(n_iter*n_epoch), [expected_heuristic]*(n_iter*n_epoch), label = "move to closest",  color='darkgray')

# plt.plot(np.arange(n_iter*n_epoch), avg_r_train, label = "avg R (train)")
plt.plot(np.arange(n_iter, n_iter*n_epoch+1, step=n_iter), avg_r_test, label = "avg R (test)") 
plt.legend()

plt.savefig('training_log.png', dpi=300)
```

#### options ot make it better
1. curriculum learning
2. higher eps
3. normalize input
4. better exploration strategy
5. one hot encode input (does not scale well)