# Smart factory (2x2)
## Import and initialization

In [285]:
# Python imports
import itertools

# 3rd party imports
import mdptoolbox
import numpy as np
import pandas as pd

In [286]:
dataset = "Copy of SAKI Exercise 4 warehousetraining2x2.txt"
df = pd.read_csv(dataset, names=["action", "item"], delimiter="\t")
print(df.head())

    action   item
0    store    red
1    store    red
2    store    red
3    store  white
4  restore    red


In [310]:
item_counts_store = df[df["action"] == "store"].value_counts()
item_counts_restore = df[df["action"] == "restore"].value_counts()
probabilites_store_restore = {
    "store": (item_counts_store / df.count()["item"])["store"], 
    "restore": (item_counts_restore / df.count()["item"])["restore"]
}

print(f"Store sum {item_counts_store.sum()}")
print(f"Restore sum {item_counts_restore.sum()}", '\n')
print(probabilites_store_restore)

Store sum 4089
Restore sum 4088 

{'store': item
red      0.252415
white    0.125963
blue     0.121683
dtype: float64, 'restore': item
red      0.252415
white    0.125841
blue     0.121683
dtype: float64}


In [288]:
rows, cols = 2, 2
num_cells = rows * cols

items = ["red", "blue", "white"]
num_items = len(items)

goals = ["store", "restore"]
num_goals = len(goals)

warehouse_cells = ["empty"] + list(items)
num_warehouse_cells = len(warehouse_cells)

num_states = num_goals * num_items * num_warehouse_cells**(num_cells)

print(f"Grid: {rows}x{cols}")
print(f"Items:{items}")
print(f"Warehouse cells: {warehouse_cells}")
print(f"Number of states: {num_states}")

Grid: 2x2
Items:['red', 'blue', 'white']
Warehouse cells: ['empty', 'red', 'blue', 'white']
Number of states: 1536


In [289]:
actions = []
for goal in goals:
    for item in items:
        actions.append((goal, item))
print(f"Actions: {actions}\n")

states_actions = []
indices = list(itertools.product(warehouse_cells, repeat=rows * cols))
for action in actions:
    for state in indices:
        states_actions.append((state, action))
print(f"States, Actions: {states_actions[:10]}")

Actions: [('store', 'red'), ('store', 'blue'), ('store', 'white'), ('restore', 'red'), ('restore', 'blue'), ('restore', 'white')]

States, Actions: [(('empty', 'empty', 'empty', 'empty'), ('store', 'red')), (('empty', 'empty', 'empty', 'red'), ('store', 'red')), (('empty', 'empty', 'empty', 'blue'), ('store', 'red')), (('empty', 'empty', 'empty', 'white'), ('store', 'red')), (('empty', 'empty', 'red', 'empty'), ('store', 'red')), (('empty', 'empty', 'red', 'red'), ('store', 'red')), (('empty', 'empty', 'red', 'blue'), ('store', 'red')), (('empty', 'empty', 'red', 'white'), ('store', 'red')), (('empty', 'empty', 'blue', 'empty'), ('store', 'red')), (('empty', 'empty', 'blue', 'red'), ('store', 'red'))]


In [290]:
def get_probabilities():
    probabilities = {}
    for goal, item in actions:
        probabilities[(goal, item)] = probabilites_store_restore[goal][item]
        
    return probabilities

probabilities = get_probabilities()
print(probabilities)

{('store', 'red'): 0.25241531123884065, ('store', 'blue'): 0.12168276874159227, ('store', 'white'): 0.12596306713953773, ('restore', 'red'): 0.25241531123884065, ('restore', 'blue'): 0.12168276874159227, ('restore', 'white'): 0.12584077289959644}


## Create transition probability and reward matrix

In [291]:
def create_tpm():
    tpm_full = np.zeros((num_cells, num_states, num_states))

    for i, tpm in enumerate(tpm_full):
        for j, (state, (goal, item)) in enumerate(states_actions):
            store_proposal = list(state)
            store_proposal[i] = item
            store_proposal = tuple(store_proposal)

            restore_proposal = list(state)
            restore_proposal[i] = "empty"
            restore_proposal = tuple(restore_proposal)

            for k, (state_next, action_next) in enumerate(states_actions):
                if goal == "store":
                    if state[i] == "empty":
                        if store_proposal == state_next:
                            # Set probability for storing the proposed item 
                            tpm[j, k] = probabilities[action_next]
                    elif state == state_next:
                        # Set probablity for not changing anything 
                        tpm[j, k] = probabilities[action_next]
                else:
                    if state[i] in items:
                        # Set probability for restoring the proposed item
                        if restore_proposal == state_next:
                            tpm[j, k] = probabilities[action_next]
                    elif state == state_next:
                        # Set probablity for not changing anything 
                        tpm[j, k] = probabilities[action_next]

    return tpm_full

tpm_full = create_tpm()
print(tpm_full)

[[[0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  ...
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]]

 [[0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  ...
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]]

 [[0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.      

In [292]:
def create_reward():
    reward_full = np.zeros((num_states, num_cells))
    
    for j, (state, (goal, item)) in enumerate(states_actions):
        for i in range(num_cells):
            if goal == "store":
                if state[i] == "empty":
                    # Each store should be penalized a little bit so that the robot prefers items near the entry
                    reward_full[j, i] = -0.04
                else:
                    # Storing if state[i] is not empty should be penalized hard to avoid it
                    reward_full[j, i] = -1
            else:
                if state[i] == item:
                    # Each restore should be penalized a little bit so that the robot prefers items near the entry
                    reward_full[j, i] = -0.04
                else:
                    # Restoring if state[i] is empty should be penalized hard to avoid it
                    reward_full[j, i] = -1
                    
    return reward_full

reward_full = create_reward()
print(reward_full)

[[-0.04 -0.04 -0.04 -0.04]
 [-0.04 -0.04 -0.04 -1.  ]
 [-0.04 -0.04 -0.04 -1.  ]
 ...
 [-0.04 -0.04 -0.04 -1.  ]
 [-0.04 -0.04 -0.04 -1.  ]
 [-0.04 -0.04 -0.04 -0.04]]


## Training

In [317]:
discount = 0.9
mdpresultValue = mdptoolbox.mdp.ValueIteration(tpm_full,reward_full, discount, max_iter=100)
mdpresultPolicy = mdptoolbox.mdp.PolicyIteration(tpm_full,reward_full, discount, max_iter=100)

mdpresultValue.run()
mdpresultPolicy.run()

print(mdpresultValue.policy, '\n')
print(mdpresultPolicy.policy)

(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

## Evaluation

In [324]:
def get_policy_index_by_state_action(state_action):
    for i, pair in enumerate(states_actions):
        if pair == state_action:
            return i
    raise ValueError

def run_mdp_factory(policy, silent=True):
    
    warehouse = ["empty", "empty", "empty", "empty"]
    traveled_distance = 0
    num_wrong_store_predictions = 0
    num_wrong_restore_predictions = 0
    
    if not silent:
        print(f"Warehouse: {warehouse}")
    
    for action, item in zip(df["action"], df["item"]):
        if i == 10:
            break
        state_action = (tuple(warehouse), (action, item))
        policy_index = get_policy_index_by_state_action(state_action)

        warehouse_index = policy[policy_index] 
        
        if action == "store":
            if warehouse[warehouse_index] != "empty":
                num_wrong_store_predictions += 1
            else:
                warehouse[warehouse_index] = item
        else:
            if warehouse[warehouse_index] != item:
                num_wrong_restore_predictions += 1
            else:
                warehouse[warehouse_index] = "empty"
        
        # Assume 1 unit between two warehouse cells
        if warehouse_index == 0:
            traveled_distance += 1
        elif warehouse_index == 3:
            traveled_distance += 3
        else:
            traveled_distance += 2
        
        
        if not silent:
            print(f"Action: {action}, Item: {item}")
            print(f"Warehouse: {warehouse}")

    return traveled_distance, num_wrong_store_predictions + num_wrong_restore_predictions

traveled_distance_value, num_wrong_predictions_value = run_mdp_factory(mdpresultValue.policy)
traveled_distance_policy, num_wrong_predictions_policy = run_mdp_factory(mdpresultPolicy.policy)

In [325]:
# Example warehouse allocation, only 10 actions shown here
run_mdp_factory(mdpresultValue.policy, False)

Warehouse: ['empty', 'empty', 'empty', 'empty']
Action: store, Item: red
Warehouse: ['red', 'empty', 'empty', 'empty']
Action: store, Item: red
Warehouse: ['red', 'red', 'empty', 'empty']
Action: store, Item: red
Warehouse: ['red', 'red', 'red', 'empty']
Action: store, Item: white
Warehouse: ['red', 'red', 'red', 'white']
Action: restore, Item: red
Warehouse: ['empty', 'red', 'red', 'white']
Action: restore, Item: red
Warehouse: ['empty', 'empty', 'red', 'white']
Action: restore, Item: red
Warehouse: ['empty', 'empty', 'empty', 'white']
Action: restore, Item: white
Warehouse: ['empty', 'empty', 'empty', 'empty']
Action: store, Item: blue
Warehouse: ['blue', 'empty', 'empty', 'empty']
Action: store, Item: blue
Warehouse: ['blue', 'blue', 'empty', 'empty']


(19, 0)

In [321]:
def run_greedy_factory(silent=True):
    warehouse = ["empty", "empty", "empty", "empty"]
    traveled_distance = 0

    if not silent:
        print(f"Warehouse: {warehouse}")

    for action, item in zip(df["action"], df["item"]):
        warehouse_index = 0

        if action == "store":
            while warehouse[warehouse_index] != "empty":
                warehouse_index += 1
            warehouse[warehouse_index] = item
        else:
            while warehouse[warehouse_index] != item:
                warehouse_index += 1
            warehouse[warehouse_index] = "empty"

        # Assume 1 unit between two warehouse cells
        if warehouse_index == 0:
            traveled_distance += 1
        elif warehouse_index == 3:
            traveled_distance += 3
        else:
            traveled_distance += 2

        if not silent:
            print(f"Action: {action}, Item: {item}")
            print(f"Warehouse: {warehouse}")

    return traveled_distance

traveled_distance_greedy = run_greedy_factory()

In [322]:
print(f"Traveled distance with value iteration: {traveled_distance_value}, with {num_wrong_predictions_value} wrong predictions")
print(f"Traveled distance with policy iteration: {traveled_distance_policy} with {num_wrong_predictions_policy} wrong predictions")
print(f"Traveled distance with greedy search: {traveled_distance_greedy}")

Traveled distance with value iteration: 14401, with 0 wrong predictions
Traveled distance with policy iteration: 16398 with 0 wrong predictions
Traveled distance with greedy search: 14401
