# Assign jobs directly using actor critic reinforcement learning

# Prepare programming environment

## Load necessary packages

In [None]:
# Import packages
import sys

from tensorforce.environments import Environment
from datetime import datetime
from time import mktime
from tensorforce.agents import Agent
import pandas as pd
import pickle

sys.path.append("..")
from src.simulation import assign_priority_edd, assign_priority_mdd, assign_priority_spt, assign_priority_srpt, assign_priority_lpt, assign_priority_cr, assign_priority_ds, assign_priority_fifo, select_machine_winq
from src.models import AssignEnvironment, hyperparameter_tuning_assign_ac, run_agent

## Get required data

In [None]:
# Read the inputs for the simulation from disk.
# Rows with product type id 2 are dropped from the product type table,
# and machines whose product_type_id is 2 are dropped from the machine table.
product_types_df = pd.read_csv("../data/external/product_types.csv").loc[
    lambda frame: frame.id != 2
]
# The order data is stored as a pickle; only unpickle files from a trusted source.
with open(r"../data/interim/sim_data.pickle", "rb") as output_file:
    orders_df = pickle.load(output_file)
machines_df = pd.read_csv("../data/external/machine.csv").loc[
    lambda frame: frame.product_type_id != 2
]

## Define constants

In [None]:
# Define constants for simulation
# Collection of the priority rule functions imported from src.simulation.
priority_rules = [
    assign_priority_edd,
    assign_priority_spt,
    assign_priority_srpt,
    assign_priority_fifo,
    assign_priority_cr,
    assign_priority_mdd,
    assign_priority_lpt,
    assign_priority_ds,
]
# Simulation start: 14 Nov 2022, 05:00 interpreted in the local time zone of the
# machine running the notebook. mktime returns seconds since the epoch, so the
# factor 1000 turns the value into milliseconds.
SIMULATION_START = 1000 * mktime(datetime(year=2022, month=11, day=14, hour=5).timetuple())

# Scenario grids handed to hyperparameter_tuning_assign_ac and run_agent in the
# (currently commented-out) tuning, training and evaluation calls.
due_date_range_list = [
    (3, 10),
    (5, 14),
    (7, 21),
]
number_orders_start_list = [80, 90, 100, 110]
average_count_new_orders_list = [80, 90, 100, 110]
worker_list = [40, 50, 60, 70]

# Seeds: the first list is used by the training call, the second by the evaluation call.
random_states = [7, 42, 66, 97, 108]
random_states_evaluation = [100, 101, 102]

# Prepare training

In [None]:
# Wrap the project's AssignEnvironment in a Tensorforce Environment.
# It is configured with assign_priority_edd as priority rule,
# select_machine_winq as allocation rule and a fixed random state of 42.
environment = Environment.create(
    environment=AssignEnvironment(
        product_types_df,
        machines_df,
        orders_df,
        priority_rule=assign_priority_edd,
        simulation_start=SIMULATION_START,
        allocation_rule=select_machine_winq,
        random_state=42,
    )
)

# Hyperparameter tuning

In [None]:
# Parameter grid for the actor-critic hyperparameter search.
# Each key maps to the list of candidate values to try for that setting;
# a None entry means the corresponding processing step is switched off.
params_ac = {
    "batch_size": [5, 10, 20, 30, 50, 100],
    "update_frequency": [0.25, 0.5, 1.0],
    "horizon": [5, 10, 20, 30, 50, 100],
    "discount": [0.9, 0.7, 0.5, 0.01],
    "return_processing": [
        {"type": "exponential_normalization", "decay": 0.9},
        None,
    ],
    "reward_processing": [
        {"type": "exponential_normalization", "decay": 0.9},
        None,
    ],
    "state_preprocessing": [
        {"type": "exponential_normalization", "decay": 0.9},
        None,
    ],
    "target_update_weight": [0.7, 1.0],
    "l2_regularization": [0.01, 0.0],
    "likelihood_ratio_clipping": [0.1, 0.2, 0.3],
    "entropy_regularization": [0.0, 0.01],
    "critic_optimizer": [0.5, 0.9, 1.0],
}

In [None]:
# Execute hyperparameter search
# tuning_res = hyperparameter_tuning_assign_ac(environment, params_ac, 1, due_date_range_list,
#                                               number_orders_start_list, average_count_new_orders_list,
#                                               worker_list, [42], 3)

In [None]:
# Save results of hyperparameter search
# tuning_res.to_csv("../data/processed/hyperparameter_search/results_hyperparameter_assign_actor-critic.csv", index = False)

In [None]:
# Read the previously saved hyperparameter search results from CSV.
# The search itself and the save step are commented out in the cells above,
# so this file has to exist already.
tuning_res = pd.read_csv("../data/processed/hyperparameter_search/results_hyperparameter_assign_actor-critic.csv")

In [None]:
# Show best performing combinations first: order the tuning results by mean
# training reward, then by mean evaluation reward, both descending.
tuning_res.sort_values(
    by=["reward_training_mean", "reward_evaluation_mean"],
    ascending=False,
)

In [None]:
# Analyse tuning results: average the training and evaluation rewards for every
# combination of batch_size, update_frequency, horizon and discount, ordered by
# ascending mean evaluation reward (highest values appear last).
# The two reward columns are selected with a list; selecting them with a bare
# tuple of names was deprecated and raises a ValueError in pandas >= 2.0.
tuning_res.groupby(["batch_size", "update_frequency", "horizon", "discount"])[
    ["reward_training_mean", "reward_evaluation_mean"]
].mean().sort_values("reward_evaluation_mean")

# Define and train final model

In [None]:
# Define the actor-critic agent with the settings chosen from the hyperparameter results.
# The summarizer writes its output to summaries/assign/actor_critic/, the same
# directory the TensorBoard cell further down points at.
agent = Agent.create(
    agent="ac",
    environment=environment,
    memory=11000,
    max_episode_timesteps=1000,
    batch_size=50,
    summarizer=dict(
        directory="summaries/assign/actor_critic/",
        summaries=[
            "action-value",
            "entropy",
            "graph",
            "kl-divergence",
            "loss",
            "parameters",
            "reward",
            "update-norm",
            "updates",
            "variables",
        ],
    ),
    update_frequency=0.5,
    learning_rate=0.001,
    horizon=30,
    discount=0.9,
    return_processing=None,
    reward_processing=None,
    state_preprocessing=None,
    entropy_regularization=0.01,
    critic_optimizer=0.5,
    exploration=0.2,
    l2_regularization=0.01,
)

In [None]:
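# Run agent to train it (NOTE: this comment originally said 100 episodes, but the call below passes episodes = 25 — confirm which is intended)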
# Duration ~ 26 hours
# rewards_list = run_agent(agent, environment, due_date_range_list,number_orders_start_list,
#                         average_count_new_orders_list, worker_list, random_states, episodes = 25, evaluate = False)

In [None]:
# Save models
# agent.save(directory="../models/assign_actor_critic")

# Evaluate model

In [None]:
# %load_ext tensorboard

In [None]:
# %tensorboard --logdir summaries/assign/actor_critic/

In [None]:
# Run agent for evaluation
# Duration ~ 22 minutes
# rewards_list_evaluation = run_agent(agent, environment, due_date_range_list,number_orders_start_list,
#                                    average_count_new_orders_list, worker_list, random_states_evaluation,
#                                    episodes = 1, evaluate = True)

In [None]:
# Save results
# rewards_list_evaluation.to_csv("../data/processed/evaluation/simulation_results_validation_assign_actor_critic.csv", index = False)

In [None]:
# Load the previously saved evaluation results from CSV.
# The evaluation run and the save step are commented out in the cells above,
# so this file has to exist already.
rewards_list_evaluation = pd.read_csv("../data/processed/evaluation/simulation_results_validation_assign_actor_critic.csv")

In [None]:
# Calculate the mean reward by summing all rewards and dividing by the number of days (17280).
# NOTE(review): 17280 is presumably the total number of simulated days over all
# evaluation runs (3 * 4 * 4 * 4 scenario combinations * 3 evaluation seeds = 576
# runs, 17280 / 576 = 30 days each) — confirm against run_agent.
rewards_list_evaluation["reward"].sum() / 17280