# Assign jobs directly with proximal policy optimization

# Prepare programming environment

## Load necessary packages

In [1]:
# Import packages
import sys
from tqdm.notebook import tqdm

from tensorforce.environments import Environment
from datetime import datetime
from time import mktime
from tensorforce.agents import Agent
import pandas as pd
import pickle

sys.path.append("..")
from src.simulation import assign_priority_edd, assign_priority_mdd, assign_priority_spt, assign_priority_srpt, assign_priority_lpt, assign_priority_cr, assign_priority_ds, assign_priority_fifo, select_machine_winq
from src.models import AssignEnvironment, hyperparameter_tuning_assign_ppo, run_agent



## Get required data

In [2]:
# Load data required for simulation
# Rows belonging to product type id 2 are dropped from both the product-type
# table and the machine table.
# NOTE(review): the reason for excluding type 2 is not documented here - confirm.
product_types_df = pd.read_csv("../data/external/product_types.csv").loc[
    lambda frame: frame["id"] != 2
]
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(r"../data/interim/sim_data.pickle", "rb") as sim_data_file:
    orders_df = pickle.load(sim_data_file)
machines_df = pd.read_csv("../data/external/machine.csv").loc[
    lambda frame: frame["product_type_id"] != 2
]

## Define constants

In [3]:
# Define constants for simulation
# Priority-assignment rules imported from src.simulation, collected in one list.
priority_rules = [
    assign_priority_edd,
    assign_priority_spt,
    assign_priority_srpt,
    assign_priority_fifo,
    assign_priority_cr,
    assign_priority_mdd,
    assign_priority_lpt,
    assign_priority_ds,
]
# Simulation start (14 Nov 2022, 05:00) expressed as epoch milliseconds.
# NOTE(review): mktime interprets the time tuple in the machine's local
# timezone, so the resulting value depends on where the notebook is executed.
_simulation_start_local = datetime(2022, 11, 14, 5, 0, 0)
SIMULATION_START = mktime(_simulation_start_local.timetuple()) * 1000

# Scenario grid passed to the tuning / training / evaluation runs.
due_date_range_list = [(3, 10), (5, 14), (7, 21)]
number_orders_start_list = list(range(80, 111, 10))       # 80, 90, 100, 110
average_count_new_orders_list = list(range(80, 111, 10))  # 80, 90, 100, 110
worker_list = list(range(40, 71, 10))                     # 40, 50, 60, 70

# Seeds: one set passed to the training call, a separate set to the evaluation call.
random_states = [7, 42, 66, 97, 108]
random_states_evaluation = [100, 101, 102]

# Prepare training

In [4]:
# Wrap the project's AssignEnvironment in a Tensorforce environment.
# The wrapped environment is built with the EDD priority rule, the
# select_machine_winq allocation rule and a fixed random state of 42.
environment = Environment.create(
    environment=AssignEnvironment(
        product_types_df,
        machines_df,
        orders_df,
        priority_rule=assign_priority_edd,
        simulation_start=SIMULATION_START,
        allocation_rule=select_machine_winq,
        random_state=42,
    )
)

# Hyperparameter tuning

In [5]:
# Parameter grid for ppo learning
# Every key lists the candidate values for one hyperparameter; the grid is the
# argument given to hyperparameter_tuning_assign_ppo in the search cell.
def _exp_norm():
    """Return a fresh exponential-normalization spec with decay 0.9."""
    return {"type": "exponential_normalization", "decay": 0.9}


params_ppo = dict(
    batch_size=[5, 10, 20, 30, 50, 100],
    update_frequency=[0.25, 0.5, 1.0],
    discount=[0.9, 0.7, 0.5, 0.01],
    # For the three processing options, None is the alternative to normalization.
    return_processing=[_exp_norm(), None],
    reward_processing=[_exp_norm(), None],
    state_preprocessing=[_exp_norm(), None],
    l2_regularization=[0.01, 0.0],
    likelihood_ratio_clipping=[0.1, 0.2, 0.3],
    entropy_regularization=[0.0, 0.01],
)

In [6]:
# Execute hyperparameter search (disabled; its saved results are read back from CSV further down)
# tuning_res = hyperparameter_tuning_assign_ppo(environment, params_ppo, 1, due_date_range_list,
#                                               number_orders_start_list, average_count_new_orders_list,
#                                               worker_list, [42], 3)

In [7]:
# Save results of hyperparameter search
# tuning_res.to_csv("../data/processed/hyperparameter_search/results_hyperparameter_assign_proximal_policy_optimization.csv", index = False)

In [8]:
# Load the stored hyperparameter-search results from CSV
tuning_res = pd.read_csv(
    filepath_or_buffer="../data/processed/hyperparameter_search/results_hyperparameter_assign_proximal_policy_optimization.csv"
)

In [9]:
# Rank combinations by mean training reward, then by mean evaluation reward,
# highest first, so the best performing combination is the top row.
tuning_res.sort_values(
    by=["reward_training_mean", "reward_evaluation_mean"],
    ascending=False,
)

Unnamed: 0,batch_size,update_frequency,discount,return_processing,state_preprocessing,reward_processing,l2_regularization,entropy_regularization,likelihood_ratio_clipping,reward_training_mean,reward_training_var,reward_evaluation_mean,reward_evaluation_var,reward_df
63,5,0.50,0.01,"{'type': 'exponential_normalization', 'decay':...",,,0.00,0.00,0.3,589.612709,5.034348e+06,554.154325,4.198803e+06,episode day reward due_date_ran...
29,5,0.25,0.70,,,"{'type': 'exponential_normalization', 'decay':...",0.01,0.00,0.3,581.753780,4.971397e+06,546.616113,4.488806e+06,episode day reward due_date_ran...
54,5,0.50,0.90,"{'type': 'exponential_normalization', 'decay':...",,,0.00,0.01,0.2,558.318754,3.501242e+06,537.786121,3.714055e+06,episode day reward due_date_rang...
97,10,0.50,0.01,"{'type': 'exponential_normalization', 'decay':...",,,0.00,0.00,0.3,541.876892,3.763843e+06,559.496317,4.013382e+06,episode day reward due_date_rang...
88,5,1.00,0.70,"{'type': 'exponential_normalization', 'decay':...",,,0.00,0.01,0.2,522.886177,3.439398e+06,503.081598,3.647436e+06,episode day reward due_date_ran...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,20,0.50,0.01,"{'type': 'exponential_normalization', 'decay':...",,"{'type': 'exponential_normalization', 'decay':...",0.01,0.00,0.3,52.286353,1.858489e+07,2.782588,1.887667e+07,episode day reward due_date_rang...
14,50,0.50,0.50,,"{'type': 'exponential_normalization', 'decay':...",,0.00,0.00,0.1,37.569176,1.896357e+07,57.479926,1.329789e+07,episode day reward due_date_rang...
8,30,1.00,0.01,,"{'type': 'exponential_normalization', 'decay':...","{'type': 'exponential_normalization', 'decay':...",0.00,0.00,0.3,-10.527091,1.890367e+07,-691.392437,5.563301e+07,episode day reward due_date_ran...
5,10,0.50,0.01,"{'type': 'exponential_normalization', 'decay':...","{'type': 'exponential_normalization', 'decay':...","{'type': 'exponential_normalization', 'decay':...",0.01,0.00,0.1,-38.490446,2.072603e+07,-116.878301,2.276274e+07,episode day reward due_date_ran...


# Define and train final model

In [10]:
# Define agent based on hyperparameter results
# NOTE(review): the top-ranked row of the tuning table shown above lists
# update_frequency=0.50, discount=0.01 and entropy_regularization=0.00, whereas
# this agent uses 0.25, 0.9 and 0.01 - confirm the deviation is intentional.
# learning_rate and exploration are not part of the parameter grid.
agent = Agent.create(
    agent='ppo',
    environment=environment,
    max_episode_timesteps=1000,
    # Memory and update schedule
    memory=10000,
    batch_size=5,
    update_frequency=0.25,
    # Optimisation and objective settings
    learning_rate=0.001,
    discount=0.9,
    likelihood_ratio_clipping=0.3,
    entropy_regularization=0.01,
    l2_regularization=0.0,
    exploration=0.2,
    # Processing of returns, rewards and states
    return_processing={'type': 'exponential_normalization', 'decay': 0.9},
    reward_processing=None,
    state_preprocessing=None,
    # Summaries written to the directory used by the TensorBoard cell
    summarizer=dict(
        directory='summaries/assign/ppo',
        summaries=[
            "action-value",
            "entropy",
            "graph",
            "kl-divergence",
            "loss",
            "parameters",
            "reward",
            "update-norm",
            "updates",
            "variables",
        ],
    ),
)



In [11]:
# Run agent to train it (NOTE: the call below passes episodes = 40, not 100 - confirm which is intended)
# Duration ~ 29 hours
# rewards_list = run_agent(agent, environment, due_date_range_list,number_orders_start_list,                         average_count_new_orders_list, worker_list, random_states, episodes = 40, evaluate = False)

In [12]:
# Save models
# agent.save(directory="../models/assign_ppo")

# Evaluate model

In [13]:
# %load_ext tensorboard

In [14]:
# %tensorboard --logdir summaries/assign/ppo

Launching TensorBoard...

In [15]:
# Run agent for evaluation
# Duration ~ 20 minutes
# rewards_list_evaluation = run_agent(agent, environment, due_date_range_list,number_orders_start_list, average_count_new_orders_list, worker_list, random_states_evaluation, episodes = 1, evaluate = True)

In [16]:
# Save results
# rewards_list_evaluation.to_csv("../data/processed/evaluation/simulation_results_validation_assign_ppo.csv", index = False)

In [17]:
# Load the stored evaluation results from CSV
rewards_list_evaluation = pd.read_csv(
    filepath_or_buffer="../data/processed/evaluation/simulation_results_validation_assign_ppo.csv"
)

In [18]:
# Calculate the mean reward per day: sum of all rewards divided by the number of days.
# 17280 = 3 * 4 * 4 * 4 * 3 * 30, i.e. the sizes of the due-date, start-order,
# new-order, worker and evaluation-seed lists multiplied together (576) times 30.
# NOTE(review): presumably 30 simulated days per run - confirm against run_agent.
n_evaluation_days = 17280
rewards_list_evaluation["reward"].sum() / n_evaluation_days

9689.112223796816