In [7]:
%load_ext autoreload
%autoreload 2

import os
import torch as th
import numpy as np
from scripts.utils.utils import load_yaml, make_dir, save_json, load_json
from scripts.pruning_models.model import calculate_reward_transition_matrices_new, calculate_q_matrix_avpruning, calculate_traces

run = '1000000'
n_nodes = 6
n_steps = 8
n_actions = 2


selected_folder =  f'../data/{run}/selected'

test_file = os.path.join(selected_folder, 'test.json')
train_file = os.path.join(selected_folder, 'train.json')

test_networks = load_json(test_file)
train_networks = load_json(train_file)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
rewards = [-100, -20, 20, 140]
n_rewards = len(rewards)
reward_id_map = {r: i for i,r in enumerate(rewards)}
rm = np.vectorize(lambda r: reward_id_map[int(r)])


Q_mr_all = np.zeros((len(train_networks), n_steps, n_rewards)) # m, r
O_mr_all = np.zeros((len(train_networks), n_steps, n_rewards)) # m, r

for n, network in enumerate(train_networks):
    # Indices:
    #     n: network [0..n_network]
    #     p: participant [0..n_participants]
    #     s: source node of action [0..n_nodes]
    #     t: target node of action [0..n_nodes]
    #     a: action [0,1]
    #     m: move / step within path [0..n_steps]
    #     f: starting node of the network [0..n_nodes]
    #     r: reward [0..n_rewards]

    T, R, L = calculate_reward_transition_matrices_new(network, n_nodes)

    RID = rm(R)
    nb_classes = 6
    RID = np.eye(n_rewards)[RID.reshape(-1)].reshape(n_nodes, n_actions, n_rewards)

    Q = calculate_q_matrix_avpruning(R, T, n_steps, gamma_g=0.0, gamma_s=0.0)

    OM = np.zeros((n_steps, n_nodes)) # m, s
    starting_node = network['starting_node']
    OM[0, starting_node] = 1
    for m in range(1, n_steps):
        OM[m] = np.einsum('s,sta->t',OM[m-1], T)

    q_mr = np.einsum('ms,msa,sar->mr',OM, np.flip(Q, 0), RID) # m,r
    o_mr = np.einsum('ms,sar->mr',OM, RID) # m,r

    Q_mr_all[n] = q_mr
    O_mr_all[n] = o_mr


In [15]:
Q_mr_all[0] / O_mr_all[0]

  Q_mr_all[0] / O_mr_all[0]


array([[          nan,  480.        ,           nan,  520.        ],
       [ 380.        ,  340.        ,  500.        ,           nan],
       [ 320.        ,  320.        ,  480.        ,  400.        ],
       [ 193.33333333,  233.33333333,  320.        ,  433.33333333],
       [ 177.14285714,  184.        ,  320.        ,  269.09090909],
       [  46.66666667,  111.2       ,  182.85714286,  278.46153846],
       [  10.        ,   51.42857143,  160.        ,  138.46153846],
       [-100.        ,  -20.        ,   20.        ,  140.        ]])

In [14]:
train_networks[0]['max_reward']

520.0

In [6]:
q_mr / o_mr

  q_mr / o_mr


array([[ 600.        ,  560.        ,           nan,           nan],
       [          nan,           nan,  520.        ,  640.        ],
       [ 320.        ,  400.        ,  440.        ,  560.        ],
       [ 180.        ,  420.        ,  300.        ,  420.        ],
       [ 140.        ,  280.        ,  280.        ,  360.        ],
       [  52.        ,  140.        ,  188.88888889,  274.54545455],
       [  -9.23076923,  120.        ,  138.82352941,  160.        ],
       [-100.        ,  -20.        ,   20.        ,  140.        ]])

In [129]:
Q_l_mr = Q_mr / O_mr
Q_l_mr = np.nan_to_num(Q_l_mr, -1000)

In [130]:
Q_fake = Q_l_mr.copy()
Q_fake[1,0] = 1000
Q_fake

array([[ 520.71428571,  500.        ,  410.        ,    0.        ],
       [1000.        ,  404.48979592,  487.64705882,  608.75      ],
       [ 287.30769231,  363.89380531,  405.30973451,  486.77966102],
       [ 187.80487805,  278.51528384,  329.91596639,  437.4863388 ],
       [ 142.00445434,  213.47826087,  264.8       ,  320.        ],
       [  42.9004329 ,  114.13636364,  167.31092437,  281.49758454],
       [ -14.87348735,   44.54244763,   99.17480999,  177.02479339],
       [-100.        ,  -20.        ,   20.        ,  140.        ]])

In [139]:
all_regret = []

this_Q = Q_l_mr


for network in test_networks:
   #  R: s, a
    T, R, L = calculate_reward_transition_matrices_new(network, n_nodes)
    Q = calculate_q_matrix_avpruning(R, T, n_steps, gamma_g=0.0, gamma_s=0.0)

    RID = rm(R)
    nb_classes = 6
    RID = np.eye(n_rewards)[RID.reshape(-1)].reshape(n_nodes, n_actions, n_rewards)

    OM = np.zeros((n_steps+1, n_nodes)) # m, s
    starting_node = network['starting_node']
    OM[0, starting_node] = 1
    P = np.zeros((n_steps, n_nodes, n_actions)) # m, s, a
    exp_reward = 0
    for m in range(0, n_steps):
        q = np.einsum('sar,r->sa',RID, this_Q[m]) # s,a
        P[m] = np.heaviside(q-q.mean(1, keepdims=True), 0.5) # s,a
        OM[m+1] = np.einsum('s,sa,sta->t',OM[m], P[m], T)

    reward = np.einsum('ms,msa,sa->',OM[:-1], P, R)
    max_reward = Q[-1,starting_node].max()
    regret = reward - max_reward
    all_regret.append(regret)

print(all_regret)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [140]:
Q_l_mr

array([[ 520.71428571,  500.        ,  410.        ,    0.        ],
       [ 350.23255814,  404.48979592,  487.64705882,  608.75      ],
       [ 287.30769231,  363.89380531,  405.30973451,  486.77966102],
       [ 187.80487805,  278.51528384,  329.91596639,  437.4863388 ],
       [ 142.00445434,  213.47826087,  264.8       ,  320.        ],
       [  42.9004329 ,  114.13636364,  167.31092437,  281.49758454],
       [ -14.87348735,   44.54244763,   99.17480999,  177.02479339],
       [-100.        ,  -20.        ,   20.        ,  140.        ]])