In [1]:
import numpy as np
import sys
import torch
from collections import deque
from functools import reduce

# Record the interpreter and library versions this notebook was executed with.
print("===VERSIONS===")
print(f"Python: {sys.version}")
print(f"numpy: {np.__version__}")
print(f"PyTorch: {torch.__version__}")

===VERSIONS===
Python: 3.11.2 (main, Mar 13 2023, 12:18:29) [GCC 12.2.0]
numpy: 1.26.1
PyTorch: 2.1.0+cu121


In [2]:
import torch.nn.functional as F


class BasicNN(torch.nn.Module):
    """Small fully connected network: two ReLU hidden layers and one scalar output.

    Every weight matrix starts at zero (biases keep PyTorch's default
    initialisation) and the module is cast to float64 via ``double()``.

    Args:
        in_features: Length of the input feature vector. Defaults to 8,
            the size that was previously hard-coded.
        hidden: Width of both hidden layers. Defaults to 10.
    """

    def __init__(self, in_features=8, hidden=10):
        super().__init__()
        self.fc1 = torch.nn.Linear(in_features, hidden)
        self.fc2 = torch.nn.Linear(hidden, hidden)
        self.fc3 = torch.nn.Linear(hidden, 1)

        # Zero the weights through the init API rather than poking `.data`.
        for layer in (self.fc1, self.fc2, self.fc3):
            torch.nn.init.zeros_(layer.weight)

        # Use float64 parameters throughout.
        self.double()

    def forward(self, x):
        """Map a float64 tensor whose last dimension is ``in_features`` to one value."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [3]:
def features1(k, mu, al, ah):
    """Return a bias term, the four inputs, and all pairwise products of distinct inputs."""
    raw = (k, mu, al, ah)
    # Pair order: (k,mu), (k,al), (k,ah), (mu,al), (mu,ah), (al,ah).
    cross = [raw[i] * raw[j] for i in range(4) for j in range(i + 1, 4)]
    return np.array([1, *raw, *cross])

def features2(k, mu, al, ah):
    """Return a bias term, the four inputs, and the square of each input (no cross terms)."""
    raw = (k, mu, al, ah)
    squares = [value ** 2 for value in raw]
    return np.array([1, *raw, *squares])

def features3(k, mu, al, ah, a1, a2, a3, a4):
    """Return bias, the four inputs, each input times its paired ``a*`` term, and each input squared."""
    raw = (k, mu, al, ah)
    paired = (a1, a2, a3, a4)
    interactions = [value * delta for value, delta in zip(raw, paired)]
    squares = [value ** 2 for value in raw]
    return np.array([1, *raw, *interactions, *squares])

def idenfeatures(k, mu, al, ah, a1, a2, a3, a4):
    """Identity feature map: return the eight arguments, in order, as one array."""
    leading = (k, mu, al, ah)
    trailing = (a1, a2, a3, a4)
    return np.array(leading + trailing)

# Active feature map: the Crediter classes and both SGD agents call this alias.
get_features = idenfeatures

In [4]:
class Crediter:
    """Sliding window of recent feature vectors, blended into one credited vector.

    The window is a ``deque`` with a fixed ``maxlen``. It is pre-filled with
    the feature vector of an all-zero input, so ``credit()`` always sees a
    full window. Index 0 is the oldest entry and index -1 the newest.

    Args:
        ndims: Number of state dimensions; ``get_features`` is called with
            ``2 * ndims`` zeros to build the placeholder entries.
        max_length: Window size. ``credit()`` uses a fixed 8-entry weight
            vector, and ``zip`` stops at the shorter sequence, so any other
            size either ignores some weights or some history entries.
    """

    def __init__(self, ndims, max_length=8):
        self._history = deque(maxlen=max_length)
        self._a = 0.175
        self._b = 0.1
        self._c = 0.05
        zero_features = get_features(*np.zeros(ndims * 2))
        for _ in range(max_length):
            self._history.append(zero_features)

    def add_index(self, feature_vec):
        """Push the newest feature vector, discarding the oldest one when full.

        A bounded deque already evicts from the opposite end on ``append``.
        The previous explicit ``pop()`` removed the newest entry instead, so
        the window never advanced past its zero pre-fill.
        """
        self._history.append(feature_vec)

    def credit(self):
        """Return the weighted sum of the stored feature vectors (oldest first)."""
        return sum([w * v for w, v in zip(
            np.array([self._c, self._b, self._a, self._a, self._a, self._a, self._b, self._c]),
            self._history
        )])

class Crediter2(Crediter):
    """Four-entry variant of Crediter with its own weight vector (0.5, 0.25, 0.15, 0.05)."""

    def __init__(self, ndims):
        # Sets everything up itself; the parent constructor is not called.
        self._history = deque(maxlen=4)
        self._a = 0.5
        self._b = 0.25
        self._c = 0.15
        self._d = 0.05
        placeholder = get_features(*np.zeros(ndims * 2))
        slots = self._history.maxlen
        while len(self._history) < slots:
            self._history.append(placeholder)

    def credit(self):
        """Return the weighted sum of the four stored feature vectors, in deque order."""
        weights = np.array([self._a, self._b, self._c, self._d])
        blended = 0
        for weight, entry in zip(weights, self._history):
            blended = blended + weight * entry
        return blended

class Crediter3(Crediter):
    """Two-entry variant of Crediter that blends its window with weights 0.75 and 0.25."""

    def __init__(self, ndims):
        # Sets everything up itself; the parent constructor is not called.
        self._history = deque(maxlen=2)
        placeholder = get_features(*np.zeros(ndims * 2))
        slots = self._history.maxlen
        while len(self._history) < slots:
            self._history.append(placeholder)

    def credit(self):
        """Return the weighted sum of the two stored feature vectors, in deque order."""
        weights = np.array([0.75, 0.25])
        blended = 0
        for weight, entry in zip(weights, self._history):
            blended = blended + weight * entry
        return blended

In [5]:
class RandomAgent:
    """Baseline agent that moves through the unit hypercube by random axis-aligned steps.

    Args:
        ndims: Number of state dimensions. When omitted, the module-level
            ``N_DIMS`` is used, as before. That name is not defined in the
            visible part of this notebook, so pass it explicitly on a
            fresh kernel.
        nsteps: Number of steps spanning [0, 1] on each axis (step size is
            ``1 / nsteps``). When omitted, the module-level ``N_STEPS`` is used.
    """

    def __init__(self, ndims=None, nsteps=None):
        if ndims is None:
            ndims = N_DIMS
        if nsteps is None:
            nsteps = N_STEPS
        self.state = np.zeros(ndims)
        a = np.eye(ndims) / nsteps
        # One +step and one -step action per dimension.
        self._actions = np.concatenate((a, -a))
        self._rng = np.random.default_rng()
        print(self.state)

    @staticmethod
    def _in_bounds(state):
        """Return True when every coordinate of ``state`` lies in [0, 1]."""
        return bool(((state >= 0) & (state <= 1)).all())

    # Values should be normalized to 0-1 space for each
    def set_state(self, k, mu, al, ah):
        """Overwrite the current state with the four given coordinates."""
        self.state = np.array([k, mu, al, ah])

    def select_action(self):
        """Pick uniformly among the actions that keep the state inside [0, 1].

        Returns:
            The chosen action vector, or ``None`` when no action is valid.
            Previously that case retried forever.
        """
        valid = [act for act in self._actions if self._in_bounds(self.state + act)]
        if not valid:
            return None
        return valid[self._rng.integers(len(valid))]

    def apply_action(self, action):
        """Move by ``action``.

        Raises:
            Exception: if the resulting state would leave [0, 1] on any axis.
        """
        next_state = self.state + action
        if self._in_bounds(next_state):
            self.state = next_state
        else:
            raise Exception(f"Tried to transition to invalid state {next_state}.")

In [6]:
class LinearSGDAgent:
    """Epsilon-greedy agent whose value estimate is a linear function of ``get_features``.

    The state lives in the unit hypercube and each action moves one
    dimension by ``+step`` or ``-step``. Rewards are applied to the blended
    feature vector produced by the crediter rather than only to the latest
    features.

    Args:
        ndims: Number of state dimensions.
        step: Size of one move along a dimension.
        epsilon: Probability of taking a random (exploratory) action.
        alpha: SGD learning rate.
        gamma: Discount applied to the bootstrapped next value.
        crediter: Class, called with ``ndims``, that provides
            ``add_index(features)`` and ``credit()``.
    """

    def __init__(self, ndims, step, epsilon=0.1, alpha=0.002, gamma=0.50, crediter=Crediter3):
        self._ndims = ndims
        self._step = step
        self.crediter = crediter(self._ndims)
        self.state = np.zeros(self._ndims)
        # One weight per feature; the feature count is probed with a zero input.
        self._weights = np.zeros(len(get_features(*np.zeros(self._ndims*2))))
        a = np.eye(self._ndims) * self._step
        # One +step and one -step action per dimension.
        self._actions = np.concatenate((a, -a))
        self._exclude_dims = set()
        self._rng = np.random.default_rng()
        self._epsilon = epsilon
        self._alpha = alpha # taken from scurto et al 2021
        self._gamma = gamma

    def set_state(self, state, action=None):
        """Set the current state and log its features (with ``action``) in the crediter.

        ``action`` defaults to the zero vector, i.e. "no move".
        """
        if action is None:
            action = np.zeros(self._ndims)
        self.state = state
        self.crediter.add_index(get_features(*self.state, *action))

    def check_bounds(self, state):
        """Return True when every coordinate of ``state`` lies in [0, 1]."""
        return ((state >= 0) & (state <= 1)).all(0)

    def get_value(self, state, action):
        """Return the linear value estimate for arriving in ``state`` via ``action``."""
        return np.dot(self._weights, get_features(*state, *action))

    def select_action(self):
        """Return a greedy action among the included, in-bounds actions.

        Values that ``np.isclose`` considers tied with the current maximum
        are collected and one is drawn at random. Returns ``None`` when no
        action is valid. Prints diagnostics as a side effect.
        """
        max_actions = []
        invs = []
        # -np.inf instead of np.NINF: the alias was removed in NumPy 2.0.
        max_value = -np.inf
        for action in self.included_actions():
            next_state = self.state + action
            if self.check_bounds(next_state):
                value = self.get_value(next_state, action)
                if np.isclose(max_value, value):
                    max_actions.append(action)
                elif value > max_value:
                    max_value = value
                    max_actions = [action]
            else:
                invs.append(action)
        if len(invs) > 0:
            print(f"Invalid actions {invs}")
        print(f"Maximum value of {max_value}")
        if len(max_actions) > 0:
            return max_actions[self._rng.integers(len(max_actions))]
        else:
            print("No valid actions!")
            return None

    def select_epsilon_greedy_action(self):
        """With probability epsilon return a random valid action, otherwise the greedy one.

        Returns ``None`` when no included action keeps the state in bounds.
        """
        if self._rng.random() < self._epsilon:
            # Random action. Filter to in-bounds moves first so the draw
            # cannot retry forever when every included action is invalid.
            valid = [act for act in self.included_actions()
                     if self.check_bounds(self.state + act)]
            if len(valid) > 0:
                action = valid[self._rng.integers(len(valid))]
                print(f"Taking random action {action}")
                return action
            else:
                print("No valid actions!")
                return None
        else:
            return self.select_action()

    def apply_action(self, action):
        """Move by ``action`` and record the transition.

        Raises:
            Exception: if the resulting state would leave [0, 1] on any axis.
        """
        next_state = self.state + action
        if self.check_bounds(next_state):
            self.set_state(next_state, action)
        else:
            raise Exception(f"Tried to transition to an invalid state {next_state}.")

    def reward_and_bootstrap(self, reward):
        """Apply one SGD update of the weights from ``reward``.

        The target is ``reward + gamma * value(next greedy move)``, or just
        ``reward`` when no move is available. The prediction is the dot
        product of the weights with the crediter's blended feature vector.
        Calling ``select_action`` here also prints its diagnostics.
        """
        credit_features = self.crediter.credit()
        action = self.select_action()
        if action is not None:
            next_state = self.state + action
            error = reward + self._gamma * self.get_value(next_state, action) - np.dot(self._weights, credit_features)
        else:
            error = reward - np.dot(self._weights, credit_features)
        print(f"Error - {error}")
        self._weights = self._weights + self._alpha * error * credit_features

    def update_activation(self, dimension, activation):
        """Enable (truthy ``activation``) or disable moves along ``dimension``."""
        if activation:
            self._exclude_dims.discard(dimension)
        else:
            self._exclude_dims.add(dimension)

    def included_actions(self):
        """Return the actions that leave every excluded (0-indexed) dimension untouched."""
        return np.array([act for act in self._actions
                         if all(act[dim] == 0 for dim in self._exclude_dims)])
        

In [7]:
class NeuralSGDAgent:
    """Epsilon-greedy agent whose value estimate is a ``BasicNN`` trained by SGD.

    The state lives in the unit hypercube and each action moves one
    dimension by ``+step`` or ``-step``. Rewards are regressed onto the
    blended feature vector produced by the crediter.

    ``BasicNN()`` is built with its default input width. As defined in this
    notebook that is 8, so ``get_features`` has to yield 8 values
    (NOTE(review): i.e. ``ndims == 4`` with the current feature map; confirm).

    Args:
        ndims: Number of state dimensions.
        step: Size of one move along a dimension.
        epsilon: Probability of taking a random (exploratory) action.
        alpha: Learning rate handed to ``torch.optim.SGD``.
        gamma: Discount applied to the bootstrapped next value.
        crediter: Class, called with ``ndims``, that provides
            ``add_index(features)`` and ``credit()``.
    """

    def __init__(self, ndims, step, epsilon=0.1, alpha=0.002, gamma=0.50, crediter=Crediter3):
        self._ndims = ndims
        self._step = step
        self.crediter = crediter(self._ndims)
        self.state = np.zeros(self._ndims)
        self._net = BasicNN()
        self._criterion = torch.nn.MSELoss()
        self._optimizer = torch.optim.SGD(self._net.parameters(), lr=alpha)
        a = np.eye(self._ndims) * self._step
        # One +step and one -step action per dimension.
        self._actions = np.concatenate((a, -a))
        self._exclude_dims = set()
        self._rng = np.random.default_rng()
        self._epsilon = epsilon
        self._alpha = alpha # taken from scurto et al 2021
        self._gamma = gamma

    def set_state(self, state, action=None):
        """Set the current state and log its features (with ``action``) in the crediter.

        ``action`` defaults to the zero vector, i.e. "no move".
        """
        if action is None:
            action = np.zeros(self._ndims)
        self.state = state
        self.crediter.add_index(get_features(*self.state, *action))

    def check_bounds(self, state):
        """Return True when every coordinate of ``state`` lies in [0, 1]."""
        return ((state >= 0) & (state <= 1)).all(0)

    def get_value(self, state, action):
        """Return the network's value estimate for arriving in ``state`` via ``action``.

        Evaluated under ``torch.no_grad()``: only a Python float is
        returned, so no autograd graph is needed.
        """
        with torch.no_grad():
            return self._net(torch.from_numpy(get_features(*state, *action))).item()

    def select_action(self):
        """Return a greedy action among the included, in-bounds actions.

        Values that ``np.isclose`` considers tied with the current maximum
        are collected and one is drawn at random. Returns ``None`` when no
        action is valid. Prints diagnostics as a side effect.
        """
        max_actions = []
        invs = []
        # -np.inf instead of np.NINF: the alias was removed in NumPy 2.0.
        max_value = -np.inf
        for action in self.included_actions():
            next_state = self.state + action
            if self.check_bounds(next_state):
                value = self.get_value(next_state, action)
                if np.isclose(max_value, value):
                    max_actions.append(action)
                elif value > max_value:
                    max_value = value
                    max_actions = [action]
            else:
                invs.append(action)
        if len(invs) > 0:
            print(f"Invalid actions {invs}")
        print(f"Maximum value of {max_value}")
        if len(max_actions) > 0:
            return max_actions[self._rng.integers(len(max_actions))]
        else:
            print("No valid actions!")
            return None

    def select_epsilon_greedy_action(self):
        """With probability epsilon return a random valid action, otherwise the greedy one.

        Returns ``None`` when no included action keeps the state in bounds.
        """
        if self._rng.random() < self._epsilon:
            # Random action. Filter to in-bounds moves first so the draw
            # cannot retry forever when every included action is invalid.
            valid = [act for act in self.included_actions()
                     if self.check_bounds(self.state + act)]
            if len(valid) > 0:
                action = valid[self._rng.integers(len(valid))]
                print(f"Taking random action {action}")
                return action
            else:
                print("No valid actions!")
                return None
        else:
            return self.select_action()

    def apply_action(self, action):
        """Move by ``action`` and record the transition.

        Raises:
            Exception: if the resulting state would leave [0, 1] on any axis.
        """
        next_state = self.state + action
        if self.check_bounds(next_state):
            self.set_state(next_state, action)
        else:
            raise Exception(f"Tried to transition to an invalid state {next_state}.")

    def reward_and_bootstrap(self, reward):
        """Take one SGD step fitting the network to a bootstrapped target.

        The target is ``reward + gamma * value(next greedy move)``, or just
        ``reward`` when no move is available. The network input is the
        crediter's blended feature vector and the loss is MSE. Calling
        ``select_action`` here also prints its diagnostics.
        """
        credit_features = self.crediter.credit()
        action = self.select_action()
        if action is not None:
            next_state = self.state + action
            exp_gain = reward + self._gamma * self.get_value(next_state, action)
        else:
            exp_gain = reward
        self._optimizer.zero_grad()
        error = self._criterion(self._net(torch.from_numpy(credit_features)), torch.from_numpy(np.array([exp_gain])))
        error.backward()
        self._optimizer.step()

    def update_activation(self, dimension, activation):
        """Enable (truthy ``activation``) or disable moves along ``dimension``."""
        if activation:
            self._exclude_dims.discard(dimension)
        else:
            self._exclude_dims.add(dimension)

    def included_actions(self):
        """Return the actions that leave every excluded (0-indexed) dimension untouched."""
        return np.array([act for act in self._actions
                         if all(act[dim] == 0 for dim in self._exclude_dims)])
        

In [8]:
from pythonosc.dispatcher import Dispatcher
from pythonosc.osc_server import ThreadingOSCUDPServer
from pythonosc.udp_client import SimpleUDPClient
from threading import Thread
import time

# Manual/autonomous mode flag; nothing in the visible code reads it.
manualMode = True
# Live agents, keyed by the element identifier received in OSC messages.
agents = {}

ip = "127.0.0.1" # localhost
port = 8080  # port the OSC server in this cell listens on
destPort = 8081  # port outgoing OSC messages are sent to

# Outgoing OSC client; step() uses it to report an agent's new state.
client = SimpleUDPClient(ip, destPort)

def default_handler(address, *args):
    """Fallback OSC handler: log the address and arguments of the message."""
    message = f"DEFAULT {address}: {args}"
    print(message)

def auto_switch_handler(address, state, *args):
    """OSC handler that stores the received mode flag in the module-level ``manualMode``.

    Without the ``global`` declaration the assignment only created a local
    variable, so the module-level flag never changed.

    NOTE(review): this is mapped to "/uistate/setAutonomous" but logs and
    stores the value as "manual"; confirm the intended polarity.
    """
    global manualMode
    print(f"Is Manual {state}")
    manualMode = state

def manual_set(address, element, *args):
    """OSC handler: pass the remaining message arguments to the agent as its new state."""
    # Currently assume only one element - TODO revise later
    agent = agents[element]
    agent.set_state(args)
    print(f"{element}: {agent.state}")

def step(address, element):
    """OSC handler: let the agent pick an epsilon-greedy action, apply it, and report the new state."""
    agent = agents[element]
    action = agent.select_epsilon_greedy_action()
    if action is None:
        print(f"{element}: All actions excluded! Doing nothing.")
        return
    print(f"{element}: Taking action {action}")
    agent.apply_action(action)
    # Send the element id followed by the updated state coordinates.
    client.send_message("/controller/agentSet", [element, *agent.state])

def reward(address, element, reward):
    """OSC handler: forward the reward value to the agent's ``reward_and_bootstrap``."""
    # Currently assuming only one element - TODO revise later
    target = agents[element]
    target.reward_and_bootstrap(reward)

def activate(address, element, dimension, activation):
    """OSC handler: switch one dimension of an agent on or off and log the excluded set."""
    print(f"{element}: Setting dimension {dimension} to {activation}")
    agent = agents[element]
    agent.update_activation(dimension, activation)
    print(f"{agent._exclude_dims}")

def init(address, element, ndims, step):
    """OSC handler: create a fresh NeuralSGDAgent for ``element``, replacing any existing one."""
    already_known = element in agents
    if not already_known:
        print(f"New agent {element} with {ndims} dimensions, initial step {step} (norm)")
    else:
        print(f"Replacing agent {element} with fresh. {ndims} dimensions, initial step {step} (norm)")
    # LinearSGDAgent(ndims, step) is the alternative agent type with the same constructor arguments.
    agents[element] = NeuralSGDAgent(ndims, step)

def delete(address, element):
    """OSC handler: remove the agent registered under ``element``, or log that none exists."""
    if element not in agents:
        print(f"No agent with identifier {element}!")
        return
    print(f"Deleting agent {element} ({agents[element]._ndims} dimensions)")
    del agents[element]


# Route incoming OSC addresses to the handlers defined above; anything
# unmapped falls through to default_handler.
# NOTE(review): the delete() handler defined above is never mapped to an
# address here, so it cannot be triggered over OSC - confirm whether a
# mapping is missing.
dispatcher = Dispatcher()
dispatcher.set_default_handler(default_handler)
dispatcher.map("/uistate/setAutonomous", auto_switch_handler)
dispatcher.map("/controller/manualSet", manual_set)
dispatcher.map("/controller/step", step)
dispatcher.map("/controller/reward", reward)
dispatcher.map("/controller/activate", activate)
dispatcher.map("/controller/init", init)

# NOTE: ip and port repeat the values already assigned earlier in this cell.
ip = "127.0.0.1" # localhost
port = 8080

# Run the OSC server until a "/quit" message arrives; this cell blocks
# for as long as the server thread is alive.
with ThreadingOSCUDPServer((ip, port), dispatcher) as server:
    def quit_func(address, *args):
        # Handler for "/quit": shut the server down and close it.
        print("Quit!")
        server.shutdown()
        server.server_close()
    dispatcher.map("/quit", quit_func)
    thread = Thread(target=server.serve_forever)
    thread.start()
    # Wait here until the serving thread finishes.
    thread.join()
print("And we're out!")

New agent 0 with 4 dimensions, initial step 0.05000000074505806 (norm)
New agent 1 with 4 dimensions, initial step 0.05000000074505806 (norm)
New agent 2 with 4 dimensions, initial step 0.05000000074505806 (norm)
New agent 3 with 4 dimensions, initial step 0.05000000074505806 (norm)
New agent 4 with 4 dimensions, initial step 0.05000000074505806 (norm)
Is Manual False
Stepping
Invalid actions [array([-0.05, -0.  , -0.  , -0.  ]), array([-0.  , -0.05, -0.  , -0.  ]), array([-0.  , -0.  , -0.05, -0.  ]), array([-0.  , -0.  , -0.  , -0.05])]
Maximum value of 0.000403888727305457
0: Taking action [0.05 0.   0.   0.  ]
Elapsed: 0.15859460830688477
Stepping
Invalid actions [array([-0.  , -0.05, -0.  , -0.  ]), array([-0.  , -0.  , -0.05, -0.  ]), array([-0.  , -0.  , -0.  , -0.05])]
Maximum value of 0.000403888727305457
0: Taking action [0.   0.   0.   0.05]
Elapsed: 0.004158496856689453
Stepping
Invalid actions [array([-0.  , -0.05, -0.  , -0.  ]), array([-0.  , -0.  , -0.05, -0.  ])]
Maxim

In [None]:
# Quick check: evaluate the active feature map on eight zero inputs.
get_features(0, 0, 0, 0, 0, 0, 0, 0)