In [None]:
import os
import random
import numpy as np

from dynamic_programming.policy_iteration import PolicyIteration
from dynamic_programming.mdp_model import MDPModel
from dynamic_programming.policy import DPPolicy
from envs.env_creator import env_creator
from base_rl.eval_policy import EvalDiscreteStatePolicy
from envs.plot import plot_industrial_benchmark_trajectories
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Notebook-wide display settings: default matplotlib figure size.
plt.rcParams.update({"figure.figsize": [20, 12]})
# NOTE(review): fixed_digits is not referenced in the visible cells — confirm it is still needed.
fixed_digits = 6


In [None]:
# Parameters: everything a reader might want to tune lives in this cell.

# Model names; each one is used to build the model, dataset, MDP and policy paths.
model_names = ['model_aeq-20bits3']
# Directory under which all paths of the experiment are built.
root_path = 'tmp'
# Multiplied by the number of training episodes to obtain each dataset size.
steps_per_episode = 1_000
# One experiment configuration is created per entry (and per model).
training_episodes = [10, 100, 1_000, 10_000]
# Forwarded to PolicyIteration.train as total_epochs / eval_epochs.
total_epochs = 10
eval_epochs = 500

## Load Data and MDP Model

In [None]:
from experiments.offline_experiment_configs import OfflineDiscreteRLExperimentConfig


# Runtime settings read by the MDP-construction, training and evaluation cells.
device = 'cpu'
reward_type = 'state'

# One experiment configuration per (model, number of training episodes) pair.
experiment_configs = []
for model_name in model_names:
    # These two paths depend only on the model, so build them once per model.
    model_path = os.path.join(root_path, 'state_quantization', model_name)
    dataset_path = os.path.join(root_path, 'offline_rl_trajectories', model_name, 'rl_dataset.npy')

    for training_episode in training_episodes:
        # MDP and policy artefacts are stored per model *and* per episode count.
        episodes_dir = f'{training_episode}'
        experiment_configs.append(
            OfflineDiscreteRLExperimentConfig(
                model_name=model_name,
                model_path=model_path,
                dataset_path=dataset_path,
                mdp_path=os.path.join(root_path, 'policy_iteration', 'mdp', model_name,
                                      episodes_dir, 'mdp_model.pkl'),
                policy_path=os.path.join(root_path, 'policy_iteration', model_name,
                                         episodes_dir, 'policy.pkl'),
                # Dataset size in steps: episodes times steps per episode.
                dataset_size=training_episode * steps_per_episode,
            )
        )

In [None]:
from dynamic_programming.mdp_model import create_mdp_models


# Call create_mdp_models once per experiment configuration, reading from the
# configuration's dataset path and handing over its MDP path as the save target.
for config in experiment_configs:
    create_mdp_models(
        load_path=config.dataset_path,
        mdp_save_path=config.mdp_path,
        reward_function_type=reward_type,
        device=device,
        dataset_size=config.dataset_size,
    )

## Train

In [None]:
def train_policy_iteration(mdp_path, policy_save_path, gamma=0.995):
    """Run policy iteration on a saved MDP model and save the resulting policy.

    Args:
        mdp_path: Location handed to ``MDPModel.load``.
        policy_save_path: Location handed to ``DPPolicy.save``.
        gamma: Value forwarded to ``PolicyIteration`` as ``gamma``. Defaults to
            0.995, the value that used to be hard-coded in this function.

    Note:
        Reads the notebook-level ``reward_type``, ``total_epochs`` and
        ``eval_epochs`` settings, so those cells must have been run first.
    """
    mdp_model = MDPModel.load(mdp_path)

    solver = PolicyIteration(
        reward_function=mdp_model.reward_function,
        transition_model=mdp_model.transition_model,
        gamma=gamma,
        sa_reward=reward_type,
    )
    solver.train(total_epochs=total_epochs, eval_epochs=eval_epochs)

    # Wrap the solver's policy table together with the MDP's index mappings.
    trained_policy = DPPolicy(
        policy_table=solver.policy,
        state_to_index=mdp_model.state_to_index,
        index_to_action=mdp_model.index_to_actions,
    )
    trained_policy.save(policy_save_path)


# Train and save one policy per experiment configuration.
for config in experiment_configs:
    train_policy_iteration(config.mdp_path, config.policy_path)

In [None]:
from benchmarks.policy_benchmarks import PolicyBenchmarks

# Build one evaluator per experiment configuration.
# steps_per_episode is taken from the parameters cell rather than being
# re-assigned here, so the evaluation cannot silently drift from the value that
# was used to size the datasets.
evaluators = []
for config in experiment_configs:
    print(config.policy_path)
    eval_policy = config.get_saved_policy()
    env_kwargs = {
        'steps_per_episode': steps_per_episode,
        'device': device,
        'model_path': config.model_path,
    }
    evaluator = EvalDiscreteStatePolicy(
        policy=eval_policy,
        env_creator=env_creator,
        env_kwargs=env_kwargs,
        # Tag combines the model name with the dataset size of this configuration.
        tag=f'{config.model_name}/{config.dataset_size}',
    )
    evaluators.append(evaluator)

# Run the benchmark over all evaluators.
policy_benchmarks = PolicyBenchmarks(evaluators=evaluators, epochs=10)
policy_benchmarks.benchmark()

In [None]:
# Plot the 'info' entry of the first evaluation trajectory of the last evaluator.
last_evaluator = policy_benchmarks.evaluators[-1]
first_trajectory = last_evaluator.eval_trajectories[0]
plot_industrial_benchmark_trajectories(first_trajectory['info'])

In [None]:
# Collect the benchmark metrics into a DataFrame and display it transposed.
df = pd.DataFrame(data=policy_benchmarks.benchmark_metrics)
df.transpose()

In [None]:

plt.rcParams["figure.figsize"] = [20, 12]

# Column labels as strings, converted once instead of once per model.
column_labels = df.columns.values.astype(str)

# One bar chart per model, restricted to the columns whose label contains the model name.
for model_name in model_names:
    # np.char is the public namespace for these string routines; the previous
    # np.core.defchararray path goes through NumPy's private core package,
    # which is deprecated in NumPy 2.x.
    column_mask = np.char.find(column_labels, model_name) >= 0
    fdf = df.loc[:, column_mask]
    ax = fdf.plot.bar(title=model_name)

    # Annotate every bar with its value.
    for container in ax.containers:
        ax.bar_label(container)
    plt.show()