In [3]:
%load_ext autoreload
%autoreload 2

import os
import torch as th
import pandas as pd
import numpy as np
from scripts.utils.utils import load_yaml, make_dir, save_json, load_json
from scripts.pruning_models.model import calculate_reward_transition_matrices_new, calculate_q_matrix_avpruning, calculate_traces

run = 'v2_100'
n_nodes = 10
n_steps = 8
n_actions = 2


selected_folder =  f'../data/{run}/selected'

test_file = os.path.join(selected_folder, 'test.json')
train_file = os.path.join(selected_folder, 'train.json')


q_table_folder =  f'../data/{run}/q_table'
make_dir(q_table_folder)

q_table_file = os.path.join(q_table_folder, 'algorithm_1.json')

test_networks = load_json(test_file)
train_networks = load_json(train_file)
train_networks = [n for n in train_networks if len(n['actions']) == 20]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
rewards = [-100, -20, 0, 20, 140]
n_rewards = len(rewards)
reward_id_map = {r: i for i,r in enumerate(rewards)}
rm = np.vectorize(lambda r: reward_id_map[int(r)])


Q_mr_all = np.zeros((len(train_networks), n_steps, n_rewards)) # m, r
O_mr_all = np.zeros((len(train_networks), n_steps, n_rewards)) # m, r

for n, network in enumerate(train_networks):
    # Indices:
    #     n: network [0..n_network]
    #     p: participant [0..n_participants]
    #     s: source node of action [0..n_nodes]
    #     t: target node of action [0..n_nodes]
    #     a: action [0,1]
    #     m: move / step within path [0..n_steps]
    #     f: starting node of the network [0..n_nodes]
    #     r: reward [0..n_rewards]

    T, R, L = calculate_reward_transition_matrices_new(network, n_nodes)

    RID = rm(R)
    RID = np.eye(n_rewards)[RID.reshape(-1)].reshape(n_nodes, n_actions, n_rewards)

    Q = calculate_q_matrix_avpruning(R, T, n_steps, gamma_g=0.0, gamma_s=0.0)

    OM = np.zeros((n_steps, n_nodes)) # m, s
    starting_node = network['starting_node']
    OM[0, starting_node] = 1
    for m in range(1, n_steps):
        OM[m] = np.einsum('s,sta->t',OM[m-1], T)

    q_mr = np.einsum('ms,msa,sar->mr',OM, np.flip(Q, 0), RID) # m,r
    o_mr = np.einsum('ms,sar->mr',OM, RID) # m,r

    Q_mr_all[n] = q_mr
    O_mr_all[n] = o_mr


In [5]:
Q_l_mr = Q_mr_all.sum(0) / O_mr_all.sum(0)
Q_l_mr = np.nan_to_num(Q_l_mr, -1000)

  """Entry point for launching an IPython kernel.


In [10]:
Q_l_mr

array([[ -587.08333333,    -2.27272727,    44.09090909,    36.2962963 ,
            0.        ],
       [ -634.92957746,   -88.65979381,  -354.47619048,  -213.27102804,
            0.        ],
       [ -693.7254902 ,  -173.29411765,  -472.56198347,  -333.98373984,
            0.        ],
       [ -612.        ,  -172.50773994,  -461.68618267,  -423.95918367,
        -1000.        ],
       [ -598.02030457,  -182.77070064,  -460.33039648,  -459.16919959,
        -1000.        ],
       [ -437.93103448,  -171.21096725,  -475.59380379,  -477.62623634,
        -1000.        ],
       [  -88.88888889,  -188.36943196,  -346.32752215,  -364.18118467,
        -1000.        ],
       [ -100.        ,   -20.        ,     0.        ,    20.        ,
          140.        ]])

In [6]:
df = pd.DataFrame(Q_l_mr, columns=rewards)
df.to_json(q_table_file, orient='split')

In [7]:
# Q_fake = Q_l_mr.copy()
# Q_fake[1,0] = 1000
# Q_fake

In [8]:
all_regret = []

this_Q = Q_l_mr


for network in test_networks:
   #  R: s, a
    T, R, L = calculate_reward_transition_matrices_new(network, n_nodes)
    Q = calculate_q_matrix_avpruning(R, T, n_steps, gamma_g=0.0, gamma_s=0.0)

    RID = rm(R)
    nb_classes = 6
    RID = np.eye(n_rewards)[RID.reshape(-1)].reshape(n_nodes, n_actions, n_rewards)

    OM = np.zeros((n_steps+1, n_nodes)) # m, s
    starting_node = network['starting_node']
    OM[0, starting_node] = 1
    P = np.zeros((n_steps, n_nodes, n_actions)) # m, s, a
    exp_reward = 0
    for m in range(0, n_steps):
        q = np.einsum('sar,r->sa',RID, this_Q[m]) # s,a
        P[m] = np.heaviside(q-q.mean(1, keepdims=True), 0.5) # s,a
        OM[m+1] = np.einsum('s,sa,sta->t',OM[m], P[m], T)

    reward = np.einsum('ms,msa,sa->',OM[:-1], P, R)
    max_reward = Q[-1,starting_node].max()
    regret = reward - max_reward
    all_regret.append(regret)

print(all_regret)

[-60.0, -179.375, -180.0, -340.0, -80.0, -160.0, -95.0, -260.0, -260.0, -160.0, -160.0, -95.78125, -140.0, -160.0, -228.125, -98.4375, -147.5, -141.25, -155.0, -160.0]


In [9]:
Q_l_mr

array([[ -587.08333333,    -2.27272727,    44.09090909,    36.2962963 ,
            0.        ],
       [ -634.92957746,   -88.65979381,  -354.47619048,  -213.27102804,
            0.        ],
       [ -693.7254902 ,  -173.29411765,  -472.56198347,  -333.98373984,
            0.        ],
       [ -612.        ,  -172.50773994,  -461.68618267,  -423.95918367,
        -1000.        ],
       [ -598.02030457,  -182.77070064,  -460.33039648,  -459.16919959,
        -1000.        ],
       [ -437.93103448,  -171.21096725,  -475.59380379,  -477.62623634,
        -1000.        ],
       [  -88.88888889,  -188.36943196,  -346.32752215,  -364.18118467,
        -1000.        ],
       [ -100.        ,   -20.        ,     0.        ,    20.        ,
          140.        ]])