In [33]:
%load_ext autoreload
%autoreload 2

import os
import torch as th
import numpy as np
from scripts.utils.utils import load_yaml, make_dir, save_json, load_json
from scripts.pruning_models.model import calculate_reward_transition_matrices_new, calculate_q_matrix_avpruning, calculate_traces

# --- Experiment configuration ---------------------------------------------
run = '1000000'  # name of the data run; selects the sub-folder of ../data
n_nodes = 6
n_steps = 8
n_actions = 2

# --- Input data -------------------------------------------------------------
# Both splits live next to each other in the run's "selected" folder.
selected_folder = f'../data/{run}/selected'
test_file, train_file = (
    os.path.join(selected_folder, file_name)
    for file_name in ('test.json', 'train.json')
)

test_networks = load_json(test_file)
train_networks = load_json(train_file)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
import random
from itertools import groupby

# Reward values a link may carry; a reward's position in this list is its id.
rewards = [-100, -20, 20, 140]
n_rewards = len(rewards)
reward_id_map = {r: i for i, r in enumerate(rewards)}


def node_links(actions, **kwargs):
    """Group a network's links by their source node.

    Parameters
    ----------
    actions : iterable of dict
        Link records. Each record must provide the keys 'sourceId',
        'targetId' and 'reward'. Records do not have to be ordered by
        'sourceId'.
    **kwargs
        Ignored. Present so that a whole network dict can be splatted in,
        as in ``node_links(**network)``.

    Returns
    -------
    dict
        Maps every source node id (in order of first appearance) to the list
        of its outgoing links, sorted by ascending reward. Each link is a
        dict with the keys 'targetId', 'reward' and 'rewardId', where
        'rewardId' is the index of the reward in ``rewards``.

    Raises
    ------
    KeyError
        If a record lacks a required key or carries a reward that is not
        listed in ``rewards``.
    """
    grouped = {}
    for link in actions:
        # Accumulate per source node instead of using itertools.groupby:
        # groupby only merges *consecutive* records, so links of a node that
        # are not adjacent in `actions` would overwrite each other.
        grouped.setdefault(link['sourceId'], []).append({
            'targetId': link['targetId'],
            'reward': link['reward'],
            'rewardId': reward_id_map[link['reward']],
        })

    # Stable sort: links with equal reward keep their input order.
    for outgoing in grouped.values():
        outgoing.sort(key=lambda l: l['reward'])
    return grouped


class NetworkEnvironments:
    """Fixed-horizon environment that walks over one of several networks.

    Every episode is played on a network drawn at random in ``reset``. An
    action is the index of an outgoing link of the current node; taking it
    yields the link's reward and moves to the link's target node. After
    ``n_steps`` moves the episode is finished.
    """

    def __init__(self, networks, n_steps):
        """Pre-process the networks and start the first episode.

        networks: iterable of dicts that provide 'starting_node' and
            'max_reward' and are passed as keyword arguments to the
            module-level ``node_links`` helper.
        n_steps: number of moves per episode.
        """
        prepared = []
        for spec in networks:
            prepared.append({
                'node_links': node_links(**spec),
                'starting_node': spec['starting_node'],
                'max_reward': spec['max_reward'],
            })
        self.networks = prepared
        self.n_steps = n_steps
        self.reset()

    def step(self, action):
        """Follow the ``action``-th link of the current node.

        Returns ``(observation, reward, done, info)``. The observation is
        ``None`` once the episode has ended; ``info`` carries the
        'max_reward' of the current network. The reward is the raw link
        reward.
        """
        assert not self.done, 'Environment is done already.'
        link = self.node_links[self.node][action]
        reward = link['reward']
        self.node = link['targetId']
        self.step_count += 1

        if self.step_count < self.n_steps:
            observation = self.observe(self.step_count, self.node_links[self.node])
        else:
            # Horizon reached: there is no next state to observe.
            self.done = True
            observation = None

        info = {'max_reward': self.max_reward}
        return observation, reward, self.done, info

    @staticmethod
    def observe(step_count, node_links):
        """Build the observation tuple: the step index followed by the
        'rewardId' of each link in ``node_links``, in list order."""
        return (step_count,) + tuple(link['rewardId'] for link in node_links)

    def reset(self):
        """Draw a random network, rewind to its starting node and return the
        first observation."""
        chosen = random.choice(self.networks)
        self.node_links = chosen['node_links']
        self.node = chosen['starting_node']
        self.max_reward = chosen['max_reward']
        self.step_count = 0
        self.done = False
        return self.observe(self.step_count, self.node_links[self.node])


In [35]:
# Training environment. The episode length is taken from n_steps (instead of
# a literal 8) because the same constant sizes the step axis of the Q-table.
env = NetworkEnvironments(train_networks, n_steps)

In [36]:
# Tabular Q-values, indexed by the observation tuple followed by the action:
# (step, rewardId of link 0, ..., rewardId of link n_actions - 1, action).
# Assumes every node offers exactly n_actions outgoing links.
Q_mr = np.zeros((n_steps, *((n_rewards,) * n_actions), n_actions))  # m, r

def select_action(q, epsilon):
    """Sample an action index epsilon-greedily from a vector of Q-values.

    Parameters
    ----------
    q : array-like, 1-D
        Q-value of every available action.
    epsilon : float
        Probability mass that is spread uniformly over all actions; the
        remaining ``1 - epsilon`` goes to the greedy action(s).

    Returns
    -------
    int
        Index of the sampled action, drawn with ``np.random.choice``.

    Raises
    ------
    ValueError
        If ``q`` is empty.
    """
    q = np.asarray(q, dtype=float)
    # Greedy distribution: all mass on the maximal entries, shared equally
    # between ties. The previous "above the mean" mask only summed to one for
    # two actions, which made np.random.choice fail for longer vectors.
    greedy = (q == q.max()).astype(float)
    greedy /= greedy.sum()
    p = greedy * (1 - epsilon) + epsilon / len(q)
    return np.random.choice(len(p), p=p)

n_epochs = 20000
alpha = 0.01    # learning rate of the Q-value update
epsilon = 0.1   # exploration share handed to select_action
seed = 0

# The environment draws networks with `random` and select_action samples with
# `np.random`; seed both so that a re-run of this cell reproduces the results.
random.seed(seed)
np.random.seed(seed)

all_reward = []  # total reward collected in each episode
all_regret = []  # max_reward of the episode's network minus collected reward

for i in range(n_epochs):
    done = False
    obs = env.reset()  # m, r
    epoch_reward = 0
    while not done:
        q = Q_mr[obs]
        action = select_action(q, epsilon)
        next_obs, reward, done, info = env.step(action)

        # Undiscounted one-step Q-learning target. After the final move
        # there is no successor state, so nothing is bootstrapped.
        next_max = 0 if done else np.max(Q_mr[next_obs])

        prev_value = q[action]
        Q_mr[obs][action] = (1 - alpha) * prev_value + alpha * (reward + next_max)

        obs = next_obs
        epoch_reward += reward

    all_reward.append(epoch_reward)
    all_regret.append(info['max_reward'] - epoch_reward)

In [37]:
# Link table of the network chosen by the most recent env.reset():
# node id -> outgoing links, sorted by ascending reward.
env.node_links


{0: [{'targetId': 3, 'reward': -20, 'rewardId': 1},
  {'targetId': 1, 'reward': 20, 'rewardId': 2}],
 3: [{'targetId': 4, 'reward': 20, 'rewardId': 2},
  {'targetId': 0, 'reward': 140, 'rewardId': 3}],
 1: [{'targetId': 4, 'reward': -100, 'rewardId': 0},
  {'targetId': 2, 'reward': -20, 'rewardId': 1}],
 4: [{'targetId': 2, 'reward': -100, 'rewardId': 0},
  {'targetId': 5, 'reward': 140, 'rewardId': 3}],
 2: [{'targetId': 5, 'reward': -20, 'rewardId': 1},
  {'targetId': 3, 'reward': 140, 'rewardId': 3}],
 5: [{'targetId': 2, 'reward': -100, 'rewardId': 0},
  {'targetId': 1, 'reward': 20, 'rewardId': 2}]}

In [38]:
# higher - lower reward: each node's links are sorted by ascending reward, so
# action 1 follows the higher-reward link and action 0 the lower-reward one.
# Positive entries mean the higher-reward link has the larger Q-value.
Q_diff = Q_mr[..., 1] - Q_mr[..., 0]

In [49]:
# Q-value gap at step 1 when the first link has rewardId 0, listed for every
# rewardId of the second link.
Q_diff[1][0]

array([  0.        ,  98.41597751, 194.40843383, 370.6675063 ])

In [40]:
# Regret (network's max_reward minus collected reward) of the last 20 episodes.
all_regret[-20:]

[240.0,
 40.0,
 40.0,
 40.0,
 40.0,
 40.0,
 40.0,
 160.0,
 40.0,
 40.0,
 200.0,
 240.0,
 40.0,
 200.0,
 40.0,
 40.0,
 120.0,
 40.0,
 200.0,
 40.0]