Access-Control Queuing Task
---
This is a decision task involving access control to a set of 10 servers. Customers of four different priorities arrive at a single queue. If given access to a server, the customers pay a reward of `1, 2, 4, or 8` to the server, depending on their priority, with higher priority customers paying more. In each time step, the customer at the head of the queue is either accepted (assigned to one of the servers) or rejected (removed from the queue, with a reward of zero). In either case, on the next time step the next customer in the queue is considered. The queue never empties, and the priorities of the customers in the queue are equally randomly distributed. Of course a customer cannot be served if there is no free server; the customer is always rejected in this case. Each busy server becomes free with probability `p = 0.06` on each time step.

The task is to decide on each step whether to accept or reject the next customer, on the basis of his priority and the number of free servers, so as `to maximize long-term reward without discounting`.

- State(num_servers, priority)
- Action(1, 0)
- Reward(1, 2, 4, 8)
---
<img src="differential_sarsa.png" width="600" style="float:left">

In [15]:
import numpy as np
import matplotlib.pyplot as plt
from TileCoding import *

In [107]:
class ValueFunction:

    def __init__(self, alpha=0.01, numOfTilings=8, maxSize=2048):
        self.maxSize = maxSize
        self.numOfTilings = numOfTilings

        # divide step size equally to each tiling
        self.alpha = alpha / numOfTilings  # learning rate for each tile

        self.hashTable = IHT(maxSize)

        # weight for each tile
        self.weights = np.zeros(maxSize)

        # position and velocity needs scaling to satisfy the tile software
        self.serverScale = self.numOfTilings / 10.0  # 10 servers
        self.priorityScale = self.numOfTilings / 3.0  # 4 kinds of priorities

    # get indices of active tiles for given state and action
    def getActiveTiles(self, n_server, priority, action):
        activeTiles = tiles(self.hashTable, self.numOfTilings,
                            [self.serverScale * n_server, self.priorityScale * priority],
                            [action])
#         print("active tiles for {}-{}-{}: {}".format(n_server, priority, action, activeTiles))
        return activeTiles

    # estimate the value of given state and action
    def value(self, state, action):
        n_server, priority = state
        activeTiles = self.getActiveTiles(n_server, priority, action)
        return np.sum(self.weights[activeTiles])

    # learn with given state, action and target
    def update(self, state, action, delta):
        n_server, priority = state
        activeTiles = self.getActiveTiles(n_server, priority, action)
        
        delta *= self.alpha
        for activeTile in activeTiles:
            self.weights[activeTile] += delta

In [128]:
class ServerAcess:
    def __init__(self, exp_rate=0.3, lr=0.1, beta=0.01, alpha=0.1):
        self.n_server = 10
        self.free_prob = 0.06
        self.priorities = range(4)
        self.actions = [0, 1]  # 0: reject; 1: accept
        self.state = (0, 0)  # (num_servers, priority)
        
        self.exp_rate = exp_rate
        self.lr = lr
        self.beta = beta
        self.alpha = alpha
        
    def numFreeServers(self):
        n = 0
        n_free_server = self.state[0]
        n_busy_server = self.n_server - n_free_server
        for _ in range(n_busy_server):
            if np.random.uniform(0, 1) <= 0.06:
                n += 1
        n_free_server += n
        self.state = (n_free_server, self.state[1])
        return n_free_server
    
    def chooseAction(self, valueFunc):
        n_free_server = self.numFreeServers()
        if n_free_server == 0:
            return 0
        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            values = {}
            for a in self.actions:
                v = valueFunc.value(self.state, a)
                values[a] = v
            action = np.random.choice([k for k, v in values.items() if v == max(values.values())])
        return action
    
    def nxtState(self, action):
        if action == 1:
            n_free_server = self.state[0] - 1
        else:
            n_free_server = self.state[0]
        priority = np.random.choice(self.priorities)
        self.state = (n_free_server, priority)
        return self.state
    
    def giveReward(self, action):
        # recieve a reward by taking the action
        if action == 1:
            priority = self.state[1]
            return np.power(2, priority)
        return 0
    
    def run(self, valueFunc, steps=100, debug=False):
        # updating average reward estimation along the way
        avg_reward = 0
        self.state = (10, np.random.choice(self.priorities))
        cur_state = self.state
        cur_action = self.chooseAction(valueFunc)  # n free server is also updated
        
        for i in range(1, steps+1):
            new_state = self.nxtState(cur_action)
            reward = self.giveReward(cur_action)
            new_action = self.chooseAction(valueFunc)
            
            if debug:
                print("state {} action {} reward {}".format(cur_state, cur_action, reward))
            if i % 100 == 0:
                print("avg reward", avg_reward)
            
            delta = reward - avg_reward + valueFunc.value(new_state, new_action) - valueFunc.value(cur_state, cur_action)
            avg_reward += self.beta*delta
            valueFunc.update(cur_state, cur_action, delta)
            
            cur_state = new_state
            cur_action = new_action

In [129]:
sa = ServerAcess(exp_rate=0.3)
vf = ValueFunction()
sa.run(vf, 10000)

avg reward 1.5975187316559318
avg reward 1.6585342328650614
avg reward 2.146286133002369
avg reward 2.2345296447966914
avg reward 2.307376400110741
avg reward 2.33458527314113
avg reward 2.1484245225969985
avg reward 2.056870004069434
avg reward 1.8754632866539522
avg reward 2.1620200966293437
avg reward 2.1896951922987737
avg reward 2.064330362125512
avg reward 2.0210457645199384
avg reward 2.736161480285944
avg reward 2.248572927767053
avg reward 2.1549760915288205
avg reward 1.972836550717075
avg reward 2.088612991814314
avg reward 1.8999321836617107
avg reward 2.7972683898551196
avg reward 2.2446498346531216
avg reward 2.001107153311131
avg reward 2.339506217820562
avg reward 2.3465854807406434
avg reward 2.2121599660268556
avg reward 2.218589129069436
avg reward 1.906188072156379
avg reward 2.120589323431796
avg reward 1.950137188038818
avg reward 2.0725176994050662
avg reward 2.001991874966852
avg reward 1.9441617438921628
avg reward 1.813855755663795
avg reward 1.854000897494626

In [124]:
sa.priorities

range(0, 4)

In [125]:
vf.value((5, 0), 1)

-136.2673710484311

In [127]:
for i in range(10):
    for p in sa.priorities:
        print(vf.value((i, p), 1))

-180.43639873727125
-186.55059492232317
-170.7317294590074
-191.9341015957678
-249.440088457333
-262.26774123831
-276.13521246568337
-271.09317078494473
-244.40676503113906
-255.17447592225824
-266.9907104047817
-257.3464581073585
-224.5008775860189
-232.20797817386293
-242.62273403351082
-229.1089428196809
-203.77030308993642
-200.66739673012185
-208.02195276052765
-210.69476199508972
-136.2673710484311
-137.51740091561922
-127.2655842373168
-132.40392859512596
-89.55114811602252
-98.92788825379122
-90.29760894659339
-96.07806759756815
-42.39981832431722
-44.912533069018494
-39.576598436646655
-40.14960104061273
-12.28714227785672
-7.071582622081973
-6.886601623479247
-13.299398099061136
-2.226606685711278
-2.500815229764973
-0.9772666804718585
-4.4205185504847595
