## Policy Playground  
This is code for loading and testing saved policies.  
Before use, please edit the paths to the environment folders in the first code cell.  
For further instructions, please see the "Policies" section below.

In [None]:
# Directories containing the environment modules (edit before use)
import sys

for env_dir in (
    '/home/jovyan/Masterarbeit/reinforce-one/Environments',
    '/home/jovyan/Masterarbeit/reinforce-one/Environments/Variations',
):
    # Each directory is inserted at index 1, so the later entry ends up
    # ahead of the earlier one while sys.path[0] stays in front.
    sys.path.insert(1, env_dir)

In [None]:
import tensorflow as tf
import numpy as np

import tensorflow_probability as tfp
from tf_agents.environments import tf_py_environment
from tf_agents.policies import scripted_py_policy

import matplotlib
import matplotlib.pyplot as plt
# Exported form of the notebook magic `%matplotlib inline`; only available
# when running under IPython/Jupyter (get_ipython is not defined otherwise).
get_ipython().run_line_magic('matplotlib', 'inline')
# Use a larger default font size for every figure created below.
matplotlib.rcParams.update({'font.size': 17})
# Optional: uncomment to switch matplotlib's interactive mode off.
#plt.ioff()

#from RNN_Env_P2 import Env_P2_N
from EE0 import EE0
from EE0_5 import EE0_5
from EE1 import EE1
from EE0_NT import EE0_NT
from EE0_A import EE0_A
from EE1_A import EE1_A

### Policies

The policies that are loaded by default are the best policies for each configuration in every environment.  
The folder they are saved in is named after the iteration where they were evaluated and the average return achieved.  
To test these policies, complete the following steps:  
- Set "test_pol" to any of the loaded policies.
- Set "py_env" (and the env parameters) to the environment variant "test_pol" was trained in.
- Set "act_plots" to False if no plots are needed.
- Run all cells.  

Results can be viewed in the last cell.  
Note that the plots and the cost calculation are not correct for P1_A unless the plotting and testing functions are edited accordingly.  
Also, for a correct cost calculation in EE0_NT, please set both test cost parameters ("test_costs_env" and "test_org_env") to 0.  

In [None]:
# --- Environment parameters -------------------------------------------------
max_episode_length = 1000
num_herds = 2
total_population = 300
# Both test cost terms enter the test cost formula in test_policy;
# set both of them to 0 when evaluating in EE0_NT.
test_costs_env = 0.01
test_org_env = 1.0
cull_cost = 1 / num_herds

# Number of episodes the selected policy is evaluated over
num_episodes = 1000

# --- Saved policies ---------------------------------------------------------
# Folder names encode the evaluation iteration and the average return.
_load_policy = tf.compat.v2.saved_model.load

# Best policies in EE0
P1 = _load_policy('Runs/RDDPG/R19/126000_-5.508191')
P2 = _load_policy('Runs/APPO/R15/28900_-5.7956944')
P3 = _load_policy('Runs/ADDPG/R20/89000_-6.7822328')
P4 = _load_policy('Runs/RPPO/R19/24000_-7.843697')

# Best policies in EE0_NT
P1_NT = _load_policy('Runs/Variants/EE0_NT/R2/88000_-3.4844654')
ADDPG_NT_P1 = _load_policy('Runs/Variants/EE0_NT/ANN_DDPG/R15/326900')
RDPG_NT_P1 = _load_policy('Runs/Variants/EE0_NT/RNN_DDPG/R1/49000_-3.3300657')

# Best policy of ANN PPO in EE0_A
P1_A = _load_policy('Runs/Variants/EE0_A/R3/69000_-4.8970075')

# Best policy of ANN PPO in EE1
P1_EE1 = _load_policy('Runs/Variants/EE1/R2/33000_-14.067305')

# Best policy of ANN PPO in EE0_5
P1_EE0_5 = _load_policy('Runs/Variants/EE0_5/14000_-7.7829237')

# --- Evaluation setup -------------------------------------------------------
# Policy to be tested
test_pol = P1

# Whether test_policy should draw the action plots
act_plots = True

# Environment the policy was trained in, wrapped for use with TF policies
py_env = EE0(
    num_herds=num_herds,
    total_population=total_population,
    fix_episode_length=True,
    average_episode_length=200,
)
eval_env = tf_py_environment.TFPyEnvironment(py_env)

In [None]:
def plot_actions_and_states(action_list, inf_list, tests_list):
    """Plot the recorded test actions and the test results/culls of one episode.

    Two figures are shown:

    * (A) the test action of each herd at every time step;
    * (B) the per-herd values from ``inf_list`` (labelled "Positive Tests"),
      with a marker at every step where the herd's cull entry equals 1.
      Marker size grows with the herd's ``tests_list`` value at that step and,
      if that value is positive, it is printed next to the marker.

    Only two herds are supported: the function reads the module-level
    ``num_herds`` and returns without plotting if it is not 2.

    Args:
        action_list: Per-step sequences laid out as
            ``[test_h1, test_h2, cull_h1, cull_h2]``.
        inf_list: Per-step pairs ``[herd 1, herd 2]`` plotted in figure (B).
        tests_list: Per-step pairs ``[herd 1, herd 2]`` used for marker size
            and annotation in figure (B).

    Returns:
        None. The figures are shown and closed.
    """
    if num_herds != 2:
        print('Plotting only works for 2 herds.')
        return None
    if len(action_list) == 0:
        # Nothing was recorded; max() below would fail on empty data.
        print('No recorded steps to plot.')
        return None

    # Integer step indices, so that the curves share their x positions with
    # the cull markers (which are drawn at x = step index).
    steps = np.arange(len(action_list))

    n_tests_h1, n_tests_h2, replace_h1, replace_h2 = [], [], [], []
    inf_h1, inf_h2, tests_h1, tests_h2 = [], [], [], []
    for action, inf, tested in zip(action_list, inf_list, tests_list):
        n_tests_h1.append(action[0])
        n_tests_h2.append(action[1])
        # For EE1_A (original author's note) the raw test actions are bucketed
        # into three levels instead. To enable, replace the two appends above
        # with:
        #     n_tests_h1.append(0 if action[0] < 1/3 else 0.5 if action[0] < 2/3 else 1)
        #     n_tests_h2.append(0 if action[1] < 1/3 else 0.5 if action[1] < 2/3 else 1)
        replace_h1.append(action[2])
        replace_h2.append(action[3])
        inf_h1.append(inf[0])
        inf_h2.append(inf[1])
        tests_h1.append(tested[0])
        tests_h2.append(tested[1])

    # Figure (A): test actions over time
    plt.figure(figsize=(20, 10))
    plt.title('(A) Tests over Time')
    plt.xlabel('Time Step')
    plt.ylabel('Percentage of Herd')
    plt.ylim(-0.01, 1.01)
    plt.plot(steps, n_tests_h1, color='fuchsia', label='Number of Tests Herd 1',
             marker='', linestyle='-', alpha=0.7)
    plt.plot(steps, n_tests_h2, color='mediumblue', label='Number of Tests Herd 2',
             marker='', linestyle='-', alpha=0.7)
    plt.legend()
    plt.show()
    plt.close()

    # Figure (B): test results with cull markers
    plt.figure(figsize=(20, 10))
    plt.title('(B) Correlation of Testresults and Culls')
    plt.xlabel('Time Step')
    plt.ylabel('Percentage of Tests')
    ymax_p3 = min(1.1, max(max(inf_h1) + 0.01, max(inf_h2) + 0.01))
    plt.ylim(-0.01, ymax_p3 + 0.01)
    plt.plot(steps, inf_h1, color='fuchsia', label='Positive Tests Herd 1',
             marker='', linestyle='-', alpha=0.7)
    plt.plot(steps, inf_h2, color='mediumblue', label='Positive Tests Herd 2',
             marker='', linestyle='-', alpha=0.7)

    herds = (
        (replace_h1, inf_h1, tests_h1, 'limegreen', 'Culls Herd 1'),
        (replace_h2, inf_h2, tests_h2, 'yellow', 'Culls Herd 2'),
    )
    labelled = set()
    for step in range(len(replace_h1)):
        for culled, positive, tested, colour, label in herds:
            if culled[step] != 1:
                continue
            # Only the first marker of each herd carries a label, so the
            # legend shows a single entry per herd.
            legend_kwargs = {}
            if label not in labelled:
                legend_kwargs['label'] = label
                labelled.add(label)
            plt.scatter(x=step, y=positive[step], s=20 + tested[step] * 300,
                        c=colour, marker='o', edgecolors='black',
                        **legend_kwargs)
            # Each herd is annotated based on its OWN tested value.
            if tested[step] > 0.:
                plt.annotate(str(round(tested[step], 2)),
                             (step + 3, positive[step]))
    plt.legend()
    plt.show()
    plt.close()
    return None

In [None]:
def test_policy(env, policy, num_episodes=50, num_herds=2, create_plot=False):
    """Roll out ``policy`` in ``env`` and report return and action statistics.

    Besides the arguments, the function reads the module-level names
    ``total_population``, ``cull_cost``, ``test_org_env`` and
    ``test_costs_env`` to recompute cull and test costs from the actions.

    The actions and observations of episode ``num_episodes / 2`` (truncated to
    an integer) are recorded and, if ``create_plot`` is set, passed to
    ``plot_actions_and_states``.

    Args:
        env: Environment to evaluate in. Anything that is not a
            ``TFPyEnvironment`` makes the function return ``None``.
        policy: Policy exposing ``get_initial_state`` and ``action``.
        num_episodes: Number of episodes to run.
        num_herds: Number of herds. The action vector is read as
            ``num_herds`` test entries followed by ``num_herds`` cull entries.
        create_plot: Whether to plot the recorded episode.

    Returns:
        ``None`` if ``env`` is not a ``TFPyEnvironment``; otherwise the tuple
        ``(avg_return, avg_culls, avg_tests, variance, stddev,
        avg_cull_costs, avg_test_costs)``, where variance and stddev refer to
        the episode returns and all averages are taken over episodes.

    Raises:
        ValueError: If the first dimension of the action spec equals
            ``num_herds`` (no cull actions), or if ``policy`` is a
            ``ScriptedPyPolicy``.
    """
    if not isinstance(env, tf_py_environment.TFPyEnvironment):
        return None
    if env.action_spec().shape[0] == num_herds:
        raise ValueError('Only for environments with tests and culls.')
    # Checked once up front instead of at the start of every episode.
    if isinstance(policy, scripted_py_policy.ScriptedPyPolicy):
        raise ValueError('Only for agent policies.')

    returns_arr = []
    culls_arr = []
    tests_arr = []
    cull_costs_arr = []
    test_costs_arr = []
    # Data of the single recorded episode (used for plotting)
    actions = []
    infectious = []
    perc_tested = []

    hpop = total_population / num_herds
    recorded_episode = np.int32(num_episodes / 2)

    for episode in range(num_episodes):
        time_step = env.reset()
        policy_state = policy.get_initial_state(env.batch_size)
        episode_return = 0.0
        ep_tests = 0
        ep_culls = 0
        ep_tc = 0
        ep_cc = 0
        record = episode == recorded_episode

        while not time_step.is_last():
            action_step = policy.action(time_step, policy_state)
            # First (only) batch entry of the action
            action = action_step.action[0]

            # Count culls and accumulate cull costs (cull entries >= 0.5)
            for j in range(num_herds, num_herds * 2):
                if action[j] >= 0.5:
                    ep_cc += -cull_cost
                    ep_culls += 1

            # Count (step, herd) pairs with a test and accumulate test costs.
            # For EE1_A (original author's note) the action is bucketed into
            # three levels instead; replace the body of this loop with:
            #     if action[k] < 1/3:
            #         pass
            #     elif action[k] < 2/3:
            #         n_tested = max(0, min(hpop, np.rint(0.5 * hpop)))
            #         ep_tc += -(test_org_env + (n_tested * test_costs_env)) / total_population
            #         ep_tests += 1
            #     else:
            #         n_tested = max(0, min(hpop, np.rint(1 * hpop)))
            #         ep_tc += -(test_org_env + (n_tested * test_costs_env)) / total_population
            #         ep_tests += 1
            for k in range(num_herds):
                if (action[k] * hpop) >= 0.5:
                    ep_tests += 1
                    n_tested = max(0, min(hpop, np.rint(action[k] * hpop)))
                    ep_tc += -(test_org_env + (n_tested * test_costs_env)) / total_population

            # Save actions and observations of the recorded episode
            if record:
                step_actions = np.zeros(np.size(action), np.float32)
                # Test entries are stored raw for every herd (not only the
                # first two); cull entries are binarised at 0.5.
                for k in range(num_herds):
                    step_actions[k] = action[k]
                for c in range(num_herds, num_herds * 2):
                    step_actions[c] = 1. if action[c] >= 0.5 else 0.
                actions.append(step_actions)

                # The observation is used in place of the true state.
                # Assumes 4 observation entries per herd, as in Env_TSLC
                # (otherwise the stride would be 3) - TODO confirm per env.
                state = time_step.observation[0]
                inf_percentages = np.zeros(num_herds, np.float32)
                perc_test = np.zeros(num_herds, np.float32)
                for d in range(num_herds):
                    inf_percentages[d] = state[(d * 4) + 2]
                    perc_test[d] = state[(d * 4) + 1]
                infectious.append(inf_percentages)
                perc_tested.append(perc_test)

            policy_state = action_step.state
            time_step = env.step(action_step.action)
            episode_return += time_step.reward

        returns_arr.append(episode_return)
        culls_arr.append(ep_culls)
        tests_arr.append(ep_tests)
        cull_costs_arr.append(ep_cc)
        test_costs_arr.append(ep_tc)

    if create_plot:
        plot_actions_and_states(actions, infectious, perc_tested)

    avg_return = np.average(returns_arr)
    variance = np.var(returns_arr)
    stddev = np.std(returns_arr)
    avg_culls = np.average(culls_arr)
    avg_tests = np.average(tests_arr)
    avg_cull_costs = np.average(cull_costs_arr)
    avg_test_costs = np.average(test_costs_arr)
    return avg_return, avg_culls, avg_tests, variance, stddev, avg_cull_costs, avg_test_costs

In [None]:
 def plot_n_actions(action_list, environment, num_herds):
     """Plot the test and cull actions of every herd over time.

     Only meant for environments with two actions per herd and step: each
     entry of ``action_list`` is read as ``num_herds`` test values followed
     by ``num_herds`` cull values. ``environment`` is accepted but not used.

     Returns the created figure, with tests on the left axes and culls on
     the right axes.
     """
     palette = ['b', 'g', 'r', 'y', 'k']
     n_steps = len(action_list)
     time_axis = np.linspace(0, n_steps, num=n_steps)

     fig, (ax_tests, ax_culls) = plt.subplots(1, 2, figsize=(15, 7))
     fig.suptitle('Actions over time')

     ax_tests.set_title('Tests over Time')
     ax_tests.set_xlabel('Time steps')
     ax_tests.set_ylabel('Number of Tests')
     ax_tests.set_ylim(-0.1, 1.1)

     ax_culls.set_title('Culls over Time')
     ax_culls.set_xlabel('Time')
     ax_culls.set_ylabel('Positive tests')
     ax_culls.set_ylim(-0.1, 1.1)

     # One row per step, one column per herd
     herd_ids = range(num_herds)
     tests = np.array([[step[h] for h in herd_ids] for step in action_list])
     replace = np.array(
         [[step[h + num_herds] for h in herd_ids] for step in action_list]
     )

     for herd in herd_ids:
         herd_label = 'Herd ' + str(herd + 1)
         # Colours repeat once there are more herds than palette entries
         herd_colour = palette[herd % len(palette)]
         ax_tests.plot(time_axis, tests[:, herd], color=herd_colour,
                       label=herd_label, marker='', linestyle='-', alpha=0.7)
         ax_culls.plot(time_axis, replace[:, herd], color=herd_colour,
                       label=herd_label, marker='', linestyle='-', alpha=0.7)

     ax_tests.legend()
     ax_culls.legend()
     return fig

In [None]:
# Evaluate the selected policy and report the statistics
results = test_policy(
    eval_env,
    test_pol,
    num_episodes=num_episodes,
    num_herds=num_herds,
    create_plot=act_plots,
)
ret, cull, tests, var, std, cc, tc = results

for label, value in (
    ('Average Return: ', ret),
    ('Variance: ', var),
    ('Standard Deviation: ', std),
    ('Average Culls: ', cull),
    ('Average Cull Costs: ', cc),
    ('Average Tests: ', tests),
    ('Average Test Costs: ', tc),
):
    print(label, value)