# Assign jobs directly with deep Q-learning

# Prepare programming environment

## Load necessary packages

In [1]:
# Import packages
import sys
from tensorforce.environments import Environment
from datetime import datetime
from time import mktime
from tensorforce.agents import Agent
import pandas as pd
import pickle

sys.path.append("..")
from src.simulation import assign_priority_edd, assign_priority_mdd, assign_priority_spt, assign_priority_srpt, assign_priority_lpt, assign_priority_cr, assign_priority_ds, assign_priority_fifo, select_machine_winq
from src.models import AssignEnvironment, hyperparameter_tuning_assign_deepq, run_agent



## Get required data

In [2]:
# Read the inputs of the simulation: product types, pickled order data and machines.
# Rows that belong to product type 2 are dropped from both CSV tables.
EXCLUDED_PRODUCT_TYPE_ID = 2

product_types_df = pd.read_csv("../data/external/product_types.csv")
keep_product_type = product_types_df["id"] != EXCLUDED_PRODUCT_TYPE_ID
product_types_df = product_types_df[keep_product_type]

# NOTE: unpickling can execute arbitrary code, so only load trusted files here.
with open(r"../data/interim/sim_data.pickle", "rb") as sim_data_file:
    orders_df = pickle.load(sim_data_file)

machines_df = pd.read_csv("../data/external/machine.csv")
keep_machine = machines_df["product_type_id"] != EXCLUDED_PRODUCT_TYPE_ID
machines_df = machines_df[keep_machine]

## Define constants

In [3]:
# Priority rules imported from src.simulation, collected in one list
priority_rules = [
    assign_priority_edd,
    assign_priority_spt,
    assign_priority_srpt,
    assign_priority_fifo,
    assign_priority_cr,
    assign_priority_mdd,
    assign_priority_lpt,
    assign_priority_ds,
]
# Simulation start (2022-11-14 05:00) as a timestamp in milliseconds.
# NOTE: mktime interprets the naive datetime in the local time zone of the machine,
# so the numeric value depends on where the notebook is executed.
SIMULATION_START = 1000 * mktime(datetime(year=2022, month=11, day=14, hour=5).timetuple())

# Scenario parameters that are handed to run_agent / the hyperparameter search
due_date_range_list = [
    (3, 10),
    (5, 14),
    (7, 21),
]
number_orders_start_list = list(range(80, 120, 10))
average_count_new_orders_list = list(range(80, 120, 10))
worker_list = list(range(40, 80, 10))

# Seeds for the training runs and, separately, for the evaluation runs
random_states = [7, 42, 66, 97, 108]
random_states_evaluation = list(range(100, 103))

# Prepare training

In [4]:
# Wrap the assignment simulation in a Tensorforce environment for learning
assign_environment = AssignEnvironment(
    product_types_df,
    machines_df,
    orders_df,
    priority_rule=assign_priority_edd,
    simulation_start=SIMULATION_START,
    allocation_rule=select_machine_winq,
    random_state=42,
)
environment = Environment.create(environment=assign_environment)

# Hyperparameter tuning

In [5]:
# Parameter grid for the deep Q-learning hyperparameter search.
# Every key holds the list of candidate values for that agent setting;
# None in the *_processing entries means that no processing is configured.
params_q = dict(
    batch_size=[5, 10, 20, 30, 50, 100],
    update_frequency=[0.25, 0.5, 1.0],
    horizon=[5, 10, 20, 30, 50, 100],
    discount=[0.9, 0.7, 0.5, 0.01],
    return_processing=[{"type": "exponential_normalization", "decay": 0.9}, None],
    reward_processing=[{"type": "exponential_normalization", "decay": 0.9}, None],
    state_preprocessing=[{"type": "exponential_normalization", "decay": 0.9}, None],
    target_update_weight=[0.7, 1.0],
    l2_regularization=[0.01, 0.0],
)

In [6]:
# Execute hyperparameter search
# tuning_res = hyperparameter_tuning_assign_deepq(environment, params_q, 1, due_date_range_list,
#                                               number_orders_start_list, average_count_new_orders_list,
#                                               worker_list, [42], 3)

In [7]:
# Save results of hyperparameter search
# tuning_res.to_csv("../data/processed/hyperparameter_search/results_hyperparameter_select_deep_q_learning.csv", index = False)

In [8]:
# Load the stored results of the hyperparameter search instead of re-running it.
# NOTE(review): the file name contains "select" although this notebook trains the
# "assign" model — confirm that this is the intended results file.
tuning_results_path = "../data/processed/hyperparameter_search/results_hyperparameter_select_deep_q_learning.csv"
tuning_res = pd.read_csv(tuning_results_path)

In [9]:
# Show the combinations ranked from best to worst: highest mean training reward
# first, ties broken by the mean evaluation reward
ranking_columns = ["reward_training_mean", "reward_evaluation_mean"]
tuning_res.sort_values(by=ranking_columns, ascending=False)

Unnamed: 0,batch_size,update_frequency,horizon,discount,return_processing,state_preprocessing,reward_processing,target_update_weight,l2_regularization,reward_training_mean,reward_training_var,reward_evaluation_mean,reward_evaluation_var,reward_df
26,5,0.5,30,0.90,,,,0.7,0.01,8176.041143,2.227054e+07,6119.041612,2.477160e+07,episode day reward due_date_range...
63,5,0.5,30,0.90,,,,0.7,0.00,8167.411602,2.234050e+07,6127.736844,2.438002e+07,episode day reward due_date_range...
38,30,0.5,5,0.90,,,,1.0,0.01,8165.938390,2.214403e+07,6174.388806,2.537082e+07,episode day reward due_date_range...
161,5,1.0,30,0.50,,,,0.7,0.01,8153.888962,2.247067e+07,6099.959301,2.564236e+07,episode day reward due_date_rang...
179,30,0.5,5,0.90,,,,0.7,0.00,8121.865734,2.231287e+07,6109.413542,2.601128e+07,episode day reward due_date_rang...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,5,0.5,30,0.90,,"{'type': 'exponential_normalization', 'decay':...",,0.7,0.00,6989.503386,2.536119e+07,5213.493603,2.947091e+07,episode day reward due_date_range...
34,5,1.0,5,0.01,,"{'type': 'exponential_normalization', 'decay':...",,1.0,0.00,6950.360795,2.569203e+07,5275.553131,2.868677e+07,episode day reward due_date_range...
152,30,1.0,30,0.01,,"{'type': 'exponential_normalization', 'decay':...",,0.7,0.01,6863.783064,2.574048e+07,5131.338146,2.876212e+07,episode day reward due_date_range...
18,5,0.5,30,0.01,,"{'type': 'exponential_normalization', 'decay':...",,1.0,0.01,6821.040351,2.530350e+07,5065.431576,2.920767e+07,episode day reward due_date_range...


# Define and train final model

In [10]:
# Define the DQN agent that is trained as the final model.
# NOTE(review): batch_size=20, update_frequency=0.25, discount=0.5 and
# l2_regularization=0.1 differ from the top-ranked row of the tuning results shown
# in this notebook (batch_size=5, update_frequency=0.5, discount=0.9,
# l2_regularization=0.01), and 0.1 is not a value of the params_q grid —
# confirm which configuration is intended before retraining.
summarizer_config = {
    "directory": 'summaries/assign/deepq',
    "summaries": [
        "action-value",
        "entropy",
        "graph",
        "kl-divergence",
        "loss",
        "parameters",
        "reward",
        "update-norm",
        "updates",
        "variables",
    ],
}
agent = Agent.create(
    agent='dqn',
    environment=environment,
    memory=200,
    batch_size=20,
    summarizer=summarizer_config,
    update_frequency=0.25,
    learning_rate=0.001,
    horizon=30,
    discount=0.5,
    return_processing=None,
    reward_processing=None,
    state_preprocessing=None,
    target_update_weight=0.7,
    exploration=0.2,
    l2_regularization=0.1,
)



In [11]:
# Run agent to train it (the call below is configured with episodes = 40)
# Duration ~ 35 hours
# rewards_list = run_agent(agent, environment, due_date_range_list,number_orders_start_list,
#                         average_count_new_orders_list, worker_list, random_states, episodes = 40, evaluate = False)

In [12]:
# Save models
# agent.save(directory="../models/assign_deep_q")

# Evaluate model

In [13]:
# %load_ext tensorboard

In [14]:
# Analysis of the model
# Removed as the file has a size of over 100 GB
# %tensorboard --logdir summaries/assign/deepq

In [15]:
# Run agent for evaluation
# Duration ~ 20 minutes
# rewards_list_evaluation = run_agent(agent, environment, due_date_range_list,number_orders_start_list,
#                                    average_count_new_orders_list, worker_list, random_states_evaluation, episodes = 1, evaluate = True)

In [16]:
# Save results
# rewards_list_evaluation.to_csv("../data/processed/evaluation/simulation_results_validation_assign_deepq.csv", index = False)

In [17]:
# Load the stored evaluation results instead of re-running the evaluation
evaluation_results_path = "../data/processed/evaluation/simulation_results_validation_assign_deepq.csv"
rewards_list_evaluation = pd.read_csv(evaluation_results_path)

In [18]:
# Calculate the mean reward per day: sum all rewards and divide by the number of days.
# NOTE(review): 17280 presumably equals 3 * 4 * 4 * 4 * 3 evaluated scenarios
# times 30 simulated days each — confirm against the evaluation run.
number_of_days = 17280
total_reward = rewards_list_evaluation["reward"].sum()
total_reward / number_of_days

9693.44302001655