In [52]:
import JSSP
import gym
import time
import numpy as np
import random

# Environment Initialization

In [53]:
def create_env(instance_path):
    """Build the ``JSSP-v0`` gym environment for one instance file.

    Prints the instance path together with the environment's observation and
    action spaces, then hands the environment back to the caller.

    Args:
        instance_path: Forwarded to ``gym.make`` as the ``instance_path``
            keyword argument.

    Returns:
        The environment object produced by ``gym.make``.
    """
    jssp_env = gym.make("JSSP-v0", instance_path=instance_path)
    print("Environment Created for: ", instance_path)
    print("Observation space: \n", jssp_env.observation_space)
    print("Action space: \n", jssp_env.action_space)
    return jssp_env

In [None]:
# Instantiate one JSSP-v0 environment per instance file; every call also
# prints that environment's observation and action spaces.
env1 = create_env(instance_path="instance1.txt")
env3 = create_env(instance_path="instance3.txt")
env4 = create_env(instance_path="instance4.txt")

# Baselines

## 1. Random Sampling

In [62]:
def random_sampling(env, episodes=50):
    """Roll out uniformly random episodes and print the best one found.

    Every episode resets ``env`` and keeps stepping with
    ``env.action_space.sample()`` until the environment reports ``done``.
    For each sampled action other than the last index of the action space,
    the matching entry of ``env.legal_allocation_list`` and the current
    ``env.time`` are recorded (both are read *before* the step is taken).
    The per-episode totals, the best episode and its recorded allocations
    are printed. On ties the most recent episode wins.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)``, an ``action_space`` with
            ``sample()`` and ``n``, plus ``legal_allocation_list`` and
            ``time`` attributes.
        episodes: Number of random episodes to run. Defaults to 50, the
            value that used to be hard-coded.

    Returns:
        None. Results are only printed.
    """
    # Start from -inf rather than a fixed sentinel so that the first episode
    # always becomes the incumbent, however negative its total reward is.
    best_score = float("-inf")
    best_episode = -1
    best_allocations = []
    best_times = []

    for episode in range(1, episodes + 1):
        env.reset()
        done = False
        score = 0
        allocations = []
        times = []
        while not done:
            action = env.action_space.sample()
            # The last action index is not logged as an allocation
            # (presumably a "no allocation" action -- confirm against the env).
            if action != env.action_space.n - 1:
                allocations.append(env.legal_allocation_list[action])
                times.append(env.time)
            _, reward, done, _ = env.step(action)
            score += reward
        print('Episode:{} Total_reward:{}'.format(episode, score))
        if score >= best_score:
            best_score = score
            best_episode = episode
            best_allocations = allocations
            best_times = times

    if best_episode == -1:
        # No episode was run (episodes <= 0); there is nothing to report.
        print('No episodes were run')
        return

    print('From {}th Episode best policy has reward {}'.format(best_episode, best_score))
    for chosen_time, allocation in zip(best_times, best_allocations):
        print('The allocation chose at time {} is {}'.format(chosen_time, allocation))

# Run the random-sampling baseline on env1 (the "instance1.txt" environment).
random_sampling(env1)

Episode:1 Total_reward:-66
Episode:2 Total_reward:-65
Episode:3 Total_reward:-76
Episode:4 Total_reward:-59
Episode:5 Total_reward:-75
Episode:6 Total_reward:-75
Episode:7 Total_reward:-65
Episode:8 Total_reward:-59
Episode:9 Total_reward:-60
Episode:10 Total_reward:-64
Episode:11 Total_reward:-60
Episode:12 Total_reward:-74
Episode:13 Total_reward:-50
Episode:14 Total_reward:-68
Episode:15 Total_reward:-68
Episode:16 Total_reward:-49
Episode:17 Total_reward:-75
Episode:18 Total_reward:-66
Episode:19 Total_reward:-65
Episode:20 Total_reward:-60
Episode:21 Total_reward:-78
Episode:22 Total_reward:-61
Episode:23 Total_reward:-65
Episode:24 Total_reward:-79
Episode:25 Total_reward:-67
Episode:26 Total_reward:-47
Episode:27 Total_reward:-60
Episode:28 Total_reward:-74
Episode:29 Total_reward:-50
Episode:30 Total_reward:-74
Episode:31 Total_reward:-50
Episode:32 Total_reward:-66
Episode:33 Total_reward:-76
Episode:34 Total_reward:-67
Episode:35 Total_reward:-65
Episode:36 Total_reward:-74
E

## 2. Q-Learning

In [None]:
def update(Q, s_key, s_next_key, action, r, eta, gma):
    """Apply one Q-learning backup to a nested-dict Q-table, in place.

    ``Q`` maps a state key to a ``{action: value}`` dict. Missing entries for
    ``s_key`` / ``action`` are created with value 0. The best value of the
    next state counts as 0 when that state is unknown or has no actions yet;
    the next state itself is never inserted into ``Q``.

    Args:
        Q: The Q-table to modify.
        s_key: Hashable key of the state the action was taken in.
        s_next_key: Hashable key of the resulting state.
        action: The action that was taken.
        r: Reward received for the transition.
        eta: Learning rate.
        gma: Discount factor.

    Returns:
        The same ``Q`` object, after the update.
    """
    action_values = Q.setdefault(s_key, {})

    next_values = Q.get(s_next_key)
    best_next = max(next_values.values()) if next_values else 0

    current = action_values.get(action, 0)
    td_target = r + gma * best_next
    action_values[action] = current + eta * (td_target - current)

    return Q

In [None]:
def q_learning(env, epis, eta=.628, gamma=.9, epsilon=.1):
    """Train a dictionary-backed Q-table with epsilon-greedy Q-learning.

    Each observation is turned into a hashable key by concatenating
    ``state[1]`` and ``state[0]`` into a tuple. While a state has no recorded
    action values, or with probability ``epsilon``, a random action is drawn
    from ``env.action_space``; otherwise the recorded action with the highest
    value is taken. Every transition is backed up through ``update``.
    Per-episode rewards, their mean and the final table are printed.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)`` and ``action_space.sample()``.
            Observations must be indexable as ``state[0]`` / ``state[1]``
            with parts that ``np.concatenate`` can join.
        epis: Number of training episodes.
        eta: Learning rate (default .628, the previously hard-coded value).
        gamma: Discount factor (default .9, previously hard-coded).
        epsilon: Exploration probability (default .1, previously hard-coded).

    Returns:
        tuple: ``(Q, rev_list)`` -- the learned ``{state_key: {action: value}}``
        table and the list of total rewards per episode. Earlier versions
        returned nothing, which left the results unreachable for plotting.
    """
    def state_key(state):
        # Flatten the two observation parts into one hashable tuple.
        return tuple(np.concatenate((state[1], state[0])))

    Q = {}
    rev_list = []  # total reward of each episode
    for i in range(epis):
        s_key = state_key(env.reset())
        rAll = 0
        d = False
        while not d:
            known_values = Q.get(s_key)
            # Explore when nothing is known about this state yet, or with
            # probability epsilon; otherwise act greedily on recorded values.
            if not known_values or random.uniform(0, 1) < epsilon:
                a = env.action_space.sample()
            else:
                a = max(known_values, key=known_values.get)
            s_next, r, d, _ = env.step(a)
            s_next_key = state_key(s_next)
            Q = update(Q, s_key, s_next_key, a, r, eta, gamma)
            rAll += r
            s_key = s_next_key
        rev_list.append(rAll)
        print("Episode: " + str(i) + " has reward " + str(rAll))

    # The printed figure is the mean reward per episode; guard against
    # epis == 0, which used to raise ZeroDivisionError.
    mean_reward = sum(rev_list) / len(rev_list) if rev_list else 0.0
    print("Reward Sum on all episodes " + str(mean_reward))
    print("Final Values Q-Table")
    print(Q)
    return Q, rev_list

In [None]:
import matplotlib.pyplot as plt
# NOTE(review): rev_list must already exist at notebook level when this cell
# runs; q_learning() only builds a local list of that name -- confirm which
# cell is expected to define it.
plt.plot(rev_list)
plt.xlabel("Episode")
plt.ylabel("Total reward")
# plt.show has to be *called*; the bare attribute reference did nothing.
plt.show()

In [None]:
plt.plot(rev_list)
# plt.show has to be *called*; the bare attribute reference did nothing.
plt.show()

In [None]:
# 1. Load Environment and Q-table structure
env_name = "JSSP-v0"
env = gym.make(env_name, instance_path = "instance1.txt")
# Q is a nested dict {state_key: {action: value}} -- the layout update()
# maintains. It cannot be indexed like a 2-D array (Q[s,:] / Q[s,a]), which
# is what this cell used to do and why it failed on the first step.
Q = {}
# 2. Parameters of Q-learning
eta = .628
gma = .9
epis = 50
rev_list = [] # rewards per episode calculate
# 3. Q-learning Algorithm
for i in range(epis):
    # Reset environment
    s = env.reset()
    rAll = 0
    d = False
    # The Q-Table learning algorithm
    while not d:
        # env.render()
        # Hashable key for the observation, built the same way q_learning() does.
        s_key = tuple(np.concatenate((s[1], s[0])))
        # Dense row of action values for this state; unseen actions count as 0.
        q_row = np.array([Q.get(s_key, {}).get(act, 0.0) for act in range(env.action_space.n)])
        # Choose action from Q table, with Gaussian noise that decays over episodes
        a = int(np.argmax(q_row + np.random.randn(env.action_space.n)*(1./(i+1))))
        #Get new state & reward from environment
        s1,r,d,_ = env.step(a)
        #Update Q-Table with new knowledge
        s1_key = tuple(np.concatenate((s1[1], s1[0])))
        Q = update(Q, s_key, s1_key, a, r, eta, gma)
        rAll += r
        s = s1
    rev_list.append(rAll)
    print("Episode: " + str(i) + " has reward " + str(rAll))
    # env.render()
# Code will stop at d == True, and render one state before it
print("Reward Sum on all episodes " + str(sum(rev_list)/epis))
print("Final Values Q-Table")
print(Q)

# Baselines

## 1. Random Sampling

In [None]:
# Random-sampling baseline run inline on the notebook-level `env`.
episodes = 50
# Start from -inf so the first episode always sets the running maximum,
# however negative its total reward is.
max_score = float("-inf")
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        #env.render()
        # get_legal_action() is neither defined nor imported in the visible
        # notebook, so sample the action space directly, as random_sampling() does.
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Total_reward:{}'.format(episode, score))
    max_score = max(max_score, score)
print('From {} Episodes best policy has reward {}'.format(episodes, max_score))

## 2. Q-Learning

In [None]:
# 1. Load Environment and Q-table structure
env_name = "JSSP-v0"
env = gym.make(env_name, instance_path = "instance1.txt")
# Q is a nested dict {state_key: {action: value}} -- the layout update()
# maintains. It cannot be indexed like a 2-D array (Q[s,:] / Q[s,a]), which
# is what this cell used to do and why it failed on the first step.
Q = {}
# 2. Parameters of Q-learning
eta = .628
gma = .9
epis = 50
rev_list = [] # rewards per episode calculate
# 3. Q-learning Algorithm
for i in range(epis):
    # Reset environment
    s = env.reset()
    rAll = 0
    d = False
    # The Q-Table learning algorithm
    while not d:
        # env.render()
        # Hashable key for the observation, built the same way q_learning() does.
        s_key = tuple(np.concatenate((s[1], s[0])))
        # Dense row of action values for this state; unseen actions count as 0.
        q_row = np.array([Q.get(s_key, {}).get(act, 0.0) for act in range(env.action_space.n)])
        # Choose action from Q table, with Gaussian noise that decays over episodes
        a = int(np.argmax(q_row + np.random.randn(env.action_space.n)*(1./(i+1))))
        #Get new state & reward from environment
        s1,r,d,_ = env.step(a)
        #Update Q-Table with new knowledge
        s1_key = tuple(np.concatenate((s1[1], s1[0])))
        Q = update(Q, s_key, s1_key, a, r, eta, gma)
        rAll += r
        s = s1
    rev_list.append(rAll)
    print("Episode: " + str(i) + " has reward " + str(rAll))
    # env.render()
# Code will stop at d == True, and render one state before it
print("Reward Sum on all episodes " + str(sum(rev_list)/epis))
print("Final Values Q-Table")
print(Q)