In [None]:
import numpy as np
import scipy
import sys
import torch
from collections import deque
from functools import reduce

# Report the interpreter and library versions this notebook is running against.
print(
    "===VERSIONS===",
    f"Python: {sys.version}",
    f"numpy: {np.__version__}",
    f"PyTorch: {torch.__version__}",
    f"Scipy: {scipy.__version__}",
    sep="\n",
)

In [None]:
import torch.nn.functional as F


class BasicNN(torch.nn.Module):
    """Three-layer fully connected network producing one scalar per input.

    The input layer takes ``2 * ndim`` values, both hidden layers have
    ``round(8 * ndim / 3)`` units, and the output layer has a single unit.
    All weight matrices start at zero while the biases keep PyTorch's default
    initialisation, so a freshly built network returns the output layer's
    bias for every input. Parameters are converted to float64.
    """

    def __init__(self, ndim):
        """Build the layers for an ``ndim``-dimensional problem."""
        super().__init__()
        hidden_units = round(8 * ndim / 3)  # heuristic width, not tuned
        self.fc1 = torch.nn.Linear(2 * ndim, hidden_units)
        self.fc2 = torch.nn.Linear(hidden_units, hidden_units)
        self.fc3 = torch.nn.Linear(hidden_units, 1)

        # Zero every weight matrix; biases are left as initialised above.
        for layer in (self.fc1, self.fc2, self.fc3):
            layer.weight.data.fill_(0)

        # Switch all parameters to double precision.
        self.double()

    def forward(self, x):
        """Return the network output for input tensor ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [None]:
def idenfeatures(states, actions):
    """Identity feature map: the state values followed by the action values.

    Both arguments may be any iterable of numbers; the result is a single
    flat NumPy array.
    """
    combined = list(states)
    combined.extend(actions)
    return np.array(combined)

# Feature function used by the rest of the notebook.
get_features = idenfeatures

In [None]:
import time
from scipy.stats import gamma


class GammaCrediter:
    """Spread a delayed reward over recently visited feature vectors.

    Each recorded feature vector is timestamped. When a reward arrives, the
    delay between an event and the reward is modelled by a gamma distribution
    (shape ``alpha``, scale ``beta``, shifted by ``delay`` seconds), and each
    vector is weighted by the probability mass that falls in the interval
    during which it was the most recent entry.
    """

    def __init__(self, ndims):
        """Create an empty history. ``ndims`` is accepted but not used."""
        self._history = []
        # From TAMER
        self.alpha = 2.0
        self.beta = 0.28
        self.delay = 0.20 # seconds

    def add_index(self, feature_vec):
        """Record ``feature_vec`` together with the current wall-clock time."""
        self._history.append((feature_vec, time.time()))

    def credit(self):
        """Return the credit-weighted sum of the recent feature vectors.

        Entries older than the 99.9th percentile of the delay distribution are
        dropped, and the remaining history is left sorted most-recent-first.
        Returns the integer ``0`` when no entries remain.
        """
        # Read the clock once so pruning and weighting use the same instant.
        now = time.time()
        horizon = gamma.ppf(0.999, self.alpha, self.delay, self.beta)
        self._history = sorted(
            (entry for entry in self._history if now - entry[1] < horizon),
            key=lambda entry: entry[1],
            reverse=True,
        )

        total = 0
        previous_cdf = 0.0
        for feature_vec, stamp in self._history:
            # The CDF must be evaluated at the entry's age (seconds before
            # now), not at its absolute timestamp: a raw timestamp is so large
            # that every CDF is 1 and only the newest entry receives credit.
            current_cdf = gamma.cdf(now - stamp, self.alpha, self.delay, self.beta)
            total = total + (current_cdf - previous_cdf) * feature_vec
            previous_cdf = current_cdf
        return total

In [None]:
# Scurto used alpha = 0.002
class NeuralSGDAgent:
    """Epsilon-greedy agent that learns a scalar value for (state, action) pairs.

    The state is a vector of ``ndims`` values that must stay inside ``[0, 1]``.
    The action set is one positive and one negative move of size ``step``
    along each dimension. Values are estimated by a ``BasicNN`` trained with
    SGD on a mean-squared-error loss; individual dimensions can be switched
    off so that no action moves along them.
    """

    def __init__(self, ndims, step, epsilon=0.1, alpha=0.02, gamma=0.50, crediter=GammaCrediter):
        """Set up the network, optimiser, action set and credit assigner.

        Args:
            ndims: number of state dimensions.
            step: size of one move along a dimension.
            epsilon: probability of taking a random action.
            alpha: SGD learning rate.
            gamma: discount applied to the bootstrapped next value (this
                parameter shadows any module-level ``gamma`` inside this
                method only).
            crediter: class called with ``ndims`` to build the credit assigner.
        """
        self._ndims = ndims
        self._step = step
        self.crediter = crediter(self._ndims)
        self.state = np.zeros(self._ndims)
        self._net = BasicNN(self._ndims)
        self._criterion = torch.nn.MSELoss()
        self._optimizer = torch.optim.SGD(self._net.parameters(), lr=alpha)
        moves = np.eye(self._ndims) * self._step
        # Rows 0..ndims-1 are the +step moves, the remaining rows the -step moves.
        self._actions = np.concatenate((moves, -moves))
        self._exclude_dims = set()
        self._rng = np.random.default_rng()
        self._epsilon = epsilon
        # NOTE(review): the original comments disagree on whether Scurto et al.
        # 2021 used 0.02 or 0.002 - confirm the intended default.
        self._alpha = alpha
        self._gamma = gamma

    def set_state(self, state, action=None):
        """Move to ``state`` and record the (state, action) features for crediting.

        ``state`` may be any sequence of numbers; it is stored as a float64
        array so later arithmetic and bounds checks behave uniformly. When
        ``action`` is omitted a zero action is recorded.
        """
        if action is None:
            action = np.zeros(self._ndims)
        self.state = np.asarray(state, dtype=np.float64)
        self.crediter.add_index(get_features(self.state, action))

    def check_bounds(self, state):
        """Return whether every component of ``state`` lies in ``[0, 1]``."""
        return ((state >= 0) & (state <= 1)).all(0)

    def get_value(self, state, action):
        """Return the network's value estimate for ``(state, action)`` as a float."""
        # Only the number is needed here, so skip building an autograd graph.
        with torch.no_grad():
            return self._net(torch.from_numpy(get_features(state, action))).item()

    def select_action(self):
        """Return a highest-valued in-bounds action, or ``None`` if there is none.

        Ties (values that are ``np.isclose``) are broken uniformly at random.
        """
        max_actions = []
        invs = []
        # np.NINF was removed in NumPy 2.0; -np.inf works on every version.
        max_value = -np.inf
        for action in self.included_actions():
            next_state = self.state + action
            if self.check_bounds(next_state):
                value = self.get_value(next_state, action)
                if np.isclose(max_value, value):
                    max_actions.append(action)
                elif value > max_value:
                    max_value = value
                    max_actions = [action]
            else:
                invs.append(action)
        if len(invs) > 0:
            print(f"Invalid actions {invs}")
        print(f"Maximum value of {max_value}")
        if len(max_actions) > 0:
            return max_actions[self._rng.integers(len(max_actions))]
        else:
            print("No valid actions!")
            return None

    def select_epsilon_greedy_action(self):
        """With probability ``epsilon`` return a random valid action, else the greedy one.

        Returns ``None`` when no included action keeps the state in bounds.
        """
        if self._rng.random() < self._epsilon:
            # Filter to in-bounds moves before sampling. Rejection sampling
            # never terminates when every included action leaves [0, 1].
            candidates = [
                action for action in self.included_actions()
                if self.check_bounds(self.state + action)
            ]
            if len(candidates) > 0:
                action = candidates[self._rng.integers(len(candidates))]
                print(f"Taking random action {action}")
                return action
            print("No valid actions!")
            return None
        return self.select_action()

    def apply_action(self, action):
        """Apply ``action`` to the current state.

        Raises:
            ValueError: if the resulting state would leave ``[0, 1]``.
        """
        next_state = self.state + action
        if not self.check_bounds(next_state):
            raise ValueError(f"Tried to transition to an invalid state {next_state}.")
        self.set_state(next_state, action)

    def _sgd_step(self, features, target):
        """Run one SGD update pulling the prediction for ``features`` towards ``target``."""
        self._optimizer.zero_grad()
        prediction = self._net(torch.from_numpy(np.asarray(features, dtype=np.float64)))
        # Build the target as float64 explicitly: an integer reward would
        # otherwise produce an int64 tensor that does not match the network.
        expected = torch.tensor([float(target)], dtype=torch.float64)
        error = self._criterion(prediction, expected)
        error.backward()
        self._optimizer.step()

    def reward_and_bootstrap(self, reward):
        """Update the network from ``reward`` plus the discounted best next value.

        The update is applied at the credit-weighted features returned by the
        crediter. Nothing is updated when the crediter has no recent history.
        """
        credit_features = self.crediter.credit()
        if np.ndim(credit_features) == 0:
            # An empty history yields a bare scalar, which cannot be fed to the net.
            print("No recent history to credit; skipping update.")
            return
        action = self.select_action()
        if action is not None:
            next_state = self.state + action
            exp_gain = reward + self._gamma * self.get_value(next_state, action)
        else:
            exp_gain = reward
        self._sgd_step(credit_features, exp_gain)

    def process_zone_reward(self, reward):
        """Apply ``-reward`` to the averaged features of moves leading away from here.

        For every action, including those along disabled dimensions, up to
        ``ZONE_STEPS`` successive moves are taken while the state stays in
        bounds, and the network is trained towards ``-reward`` at the mean of
        the visited (state, action) features.
        """
        # Positive reward - apply towards this point - negative reward - apply away from this point
        # Directions include those disabled so we properly encode the zone here
        ZONE_STEPS = 3 # Arbitrarily chosen, TODO scale to match number of divisions in a dimension
        n_features = len(get_features(np.zeros(self._ndims), np.zeros(self._ndims)))
        for action in self._actions:
            features = np.zeros(n_features)
            steps_taken = 0
            for step in range(1, ZONE_STEPS + 1):
                state = self.state + step * action
                if not self.check_bounds(state):
                    break
                features += get_features(state, action)
                steps_taken += 1
            if steps_taken == 0:
                # Even the first move is out of bounds: there is nothing to
                # learn from, and training on an all-zero vector would bias
                # an unrelated input.
                continue
            # Average over the moves actually taken so a walk cut short at the
            # boundary still yields a true mean rather than a shrunken vector.
            features /= steps_taken
            # Use negative reward since we are moving away
            self._sgd_step(features, -reward)

    def update_activation(self, dimension, activation):
        """Enable (truthy ``activation``) or disable moves along ``dimension``."""
        if activation:
            self._exclude_dims.discard(dimension)
        else:
            self._exclude_dims.add(dimension)

    def included_actions(self):
        """Return the actions that leave every excluded (0-indexed) dimension untouched."""
        return np.array([
            act for act in self._actions
            if all(act[dim] == 0 for dim in self._exclude_dims)
        ])
        

In [None]:
from pythonosc.dispatcher import Dispatcher
from pythonosc.osc_server import ThreadingOSCUDPServer
from pythonosc.udp_client import SimpleUDPClient
from threading import Thread
import time

# Manual/autonomous flag; presumably meant to be updated by auto_switch_handler.
# Nothing in the visible code reads it.
manualMode = True
# Registry of live agents, keyed by the element identifier carried in OSC messages.
agents = {}

ip = "127.0.0.1" # localhost
# UDP port the OSC server listens on.
port = 8080
# UDP port that outgoing OSC messages are sent to.
destPort = 8081

# Client the step handler uses to report an agent's new state.
client = SimpleUDPClient(ip, destPort)

def default_handler(address, *args):
    """Log any OSC message that has no dedicated handler."""
    line = f"DEFAULT {address}: {args}"
    print(line)

def auto_switch_handler(address, state, *args):
    """Store the mode sent by the UI in the module-level ``manualMode`` flag.

    Args:
        address: OSC address the message arrived on (unused).
        state: new value for ``manualMode``.
        *args: any further OSC arguments (ignored).
    """
    # Without this declaration the assignment below only creates a local
    # variable and the module-level flag never changes.
    global manualMode
    # NOTE(review): this is mapped to "/uistate/setAutonomous" yet stores the
    # value as "manual" - confirm the polarity against the sender.
    print(f"Is Manual {state}")
    manualMode = state

def manual_set(address, element, *args):
    """Set the state of agent ``element`` to the values carried in ``args``.

    Unknown identifiers are reported and ignored instead of raising
    ``KeyError``. The values are converted to a float64 array so the agent
    stores the same kind of object it builds for itself.
    """
    # Currently assume only one element - TODO revise later
    agent = agents.get(element)
    if agent is None:
        print(f"No agent with identifier {element}!")
        return
    agent.set_state(np.asarray(args, dtype=np.float64))
    print(f"{element}: {agent.state}")

def step(address, element):
    """Let agent ``element`` pick and apply one action, then report its new state.

    The new state is sent to "/controller/agentSet" as ``[element, *state]``.
    Nothing is sent when the agent is unknown or has no action available.
    """
    agent = agents.get(element)
    if agent is None:
        print(f"No agent with identifier {element}!")
        return
    action = agent.select_epsilon_greedy_action()
    if action is None:
        print(f"{element}: All actions excluded! Doing nothing.")
        return
    print(f"{element}: Taking action {action}")
    agent.apply_action(action)
    client.send_message("/controller/agentSet", [element, *agent.state])

def reward(address, element, reward):
    """Pass a scalar ``reward`` to agent ``element`` for a learning update.

    Unknown identifiers are reported and ignored instead of raising
    ``KeyError``.
    """
    # Currently assuming only one element - TODO revise later
    agent = agents.get(element)
    if agent is None:
        print(f"No agent with identifier {element}!")
        return
    agent.reward_and_bootstrap(reward)

def zone_reward(address, element, reward):
    """Pass a zone ``reward`` to agent ``element``.

    The agent spreads it over the states reachable along each axis. Unknown
    identifiers are reported and ignored instead of raising ``KeyError``.
    """
    agent = agents.get(element)
    if agent is None:
        print(f"No agent with identifier {element}!")
        return
    agent.process_zone_reward(reward)
    

def activate(address, element, dimension, activation):
    """Enable or disable movement along ``dimension`` for agent ``element``.

    Unknown identifiers are reported and ignored instead of raising
    ``KeyError``. After the change the agent's set of excluded dimensions is
    printed.
    """
    agent = agents.get(element)
    if agent is None:
        print(f"No agent with identifier {element}!")
        return
    print(f"{element}: Setting dimension {dimension} to {activation}")
    agent.update_activation(dimension, activation)
    print(f"{agent._exclude_dims}")

def init(address, element, ndims, step):
    """Create a fresh agent for ``element``, replacing any existing one.

    Logs whether the identifier was new, then stores a ``NeuralSGDAgent``
    built from ``ndims`` and ``step`` in the agent registry.
    """
    message = (
        f"Replacing agent {element} with fresh. {ndims} dimensions, initial step {step} (norm)"
        if element in agents
        else f"New agent {element} with {ndims} dimensions, initial step {step} (norm)"
    )
    print(message)
    # A LinearSGDAgent(ndims, step) was used here previously.
    agents[element] = NeuralSGDAgent(ndims, step)

def delete(address, element):
    """Remove agent ``element`` from the registry, or report that it is unknown."""
    if element not in agents:
        print(f"No agent with identifier {element}!")
        return
    print(f"Deleting agent {element} ({agents[element]._ndims} dimensions)")
    del agents[element]


# Route each OSC address to its handler; anything unmatched is logged by
# default_handler.
dispatcher = Dispatcher()
dispatcher.set_default_handler(default_handler)
dispatcher.map("/uistate/setAutonomous", auto_switch_handler)
dispatcher.map("/controller/manualSet", manual_set)
dispatcher.map("/controller/step", step)
dispatcher.map("/controller/reward", reward)
dispatcher.map("/controller/activate", activate)
dispatcher.map("/controller/init", init)
dispatcher.map("/controller/zone_reward", zone_reward)
# The delete handler was defined but never registered, so agents could not be
# removed over OSC.
# NOTE(review): the address is assumed by analogy with the other
# /controller/* routes - confirm it matches what the sender emits.
dispatcher.map("/controller/delete", delete)

# Address the OSC server binds to (same values as assigned earlier in this cell).
ip = "127.0.0.1" # localhost
port = 8080

# Serve OSC messages until a "/quit" message arrives. The context manager
# closes the server when the block exits.
with ThreadingOSCUDPServer((ip, port), dispatcher) as server:
    def quit_func(address, *args):
        """Handle "/quit": stop the serve loop and close the server."""
        print("Quit!")
        server.shutdown()
        server.server_close()
    # Registered here rather than with the other routes because the handler
    # needs the live server object.
    dispatcher.map("/quit", quit_func)
    # Run the serve loop on its own thread, then block this cell until that
    # thread finishes.
    thread = Thread(target=server.serve_forever)
    thread.start()
    thread.join()
print("And we're out!")