In [108]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [109]:
pip install pymdptoolbox



In [486]:
import os
import numpy as np
import pandas as pd
import itertools
import mdptoolbox
import scipy.fftpack
import mdptoolbox
import mdptoolbox.example
import warnings
warnings.filterwarnings("ignore")
!pwd

/content/gdrive/My Drive/Data


In [487]:
os.chdir("/content/gdrive/MyDrive/Data" )
!pwd

/content/gdrive/MyDrive/Data


In [535]:
training_txt_file = pd.read_csv('./Copy of SAKI Exercise 3 warehousetraining2x2.txt', sep='\t')
training_txt_file.columns = ["Action", "Item"]
training_txt_file["Count"] = 1

test_txt_file = pd.read_csv('./Copy of SAKI Exercise 3 warehouseorder2x2.txt', sep='\t')
test_txt_file.columns = ["Action", "Item"]
test_txt_file["Count"] = 1

len(training_txt_file)


8176

In [536]:

ACTIONS = 6
FIELD = 4
ITEMS = 4

ITEMS_IN_FIELD = ITEMS ** FIELD
STATES = ITEMS_IN_FIELD * ACTIONS
print(STATES)
FIELD_COLUMNS = ["(1,1)-1", "(1,2)-2", 
                     "(2,1)-2", "(2,2)-3"]

REWARD = {0: 4, 1: 2, 2: 2, 3: 1}

ACTION_ITEMS = {
    "store-blue": 0,
    "store-red": 1,
    "store-white": 2,
    "restore-blue": 3,
    "restore-red": 4,
    "restore-white": 5 
}

1536


In [537]:
def create_states_as_data_frame():
    all_states = []

    for x in itertools.product([0, 1, 2, 3], repeat=FIELD):
        all_states.append(x)

    all_states = np.array(all_states)
    states_as_data_frame = pd.DataFrame(data=all_states, columns=FIELD_COLUMNS)

    states_as_data_frame["empty"] = states_as_data_frame.apply(lambda x: list(x)[:FIELD].count(0), axis=1)
    states_as_data_frame["blue"] = states_as_data_frame.apply(lambda x: list(x)[:FIELD].count(1), axis=1)
    states_as_data_frame["red"] = states_as_data_frame.apply(lambda x: list(x)[:FIELD].count(2), axis=1)
    states_as_data_frame["white"] = states_as_data_frame.apply(lambda x: list(x)[:FIELD].count(3), axis=1)
    states_as_data_frame["occupied"] = states_as_data_frame.apply(lambda x: FIELD - (list(x)[:FIELD].count(0)), axis=1)
    
    return states_as_data_frame

In [538]:
def find_next_states(state, action, item, FIELD_COLUMNS):
    count = state[item] + 1
    

    if item == "blue":
        new_states = states_df[(states_df["blue"] == count) & (states_df["red"] == state["red"]) & (states_df["white"] == state["white"])]
        filtered_new_states = new_states[((new_states[FIELD_COLUMNS] == state[FIELD_COLUMNS]).sum(axis=1) == (FIELD - 1))]
    if item == "red":
        new_states = states_df[(states_df["blue"] == state["blue"]) & (states_df["red"] == count) & (states_df["white"] == state["white"])]
        filtered_new_states = new_states[((new_states[FIELD_COLUMNS] == state[FIELD_COLUMNS]).sum(axis=1) == (FIELD - 1))]
    if item == "white":
        new_states = states_df[(states_df["blue"] == state["blue"]) & (states_df["red"] == state["red"]) & (states_df["white"] == count)]
        filtered_new_states = new_states[((new_states[FIELD_COLUMNS] == state[FIELD_COLUMNS]).sum(axis=1) == (FIELD - 1))]
    

    indices = np.argmin(np.array(filtered_new_states[FIELD_COLUMNS]) == state[FIELD_COLUMNS].values, axis=1)
    
    dists = []
    
    for dist in indices:
        dists.append(int(FIELD_COLUMNS[dist][-1]))
    
    filtered_new_states["distance"] = dists
    
    
    return filtered_new_states

In [539]:
def calculate_tpm(dists, action, item):
    assert len(dists) > 0
    dists = 4 - dists
    
    summed = np.sum(dists)
    tpm = np.round(dists / summed, 1)
    
    if np.sum(tpm) != 1.0:

        tpm[np.argmax(tpm)] += (1.0 - np.sum(tpm))
        
    tpm = tpm.astype(float)

    return tpm

In [540]:
def tpms_matrix():
    tpm_full = np.zeros((FIELD, STATES, STATES))
    
    for i, val in enumerate(FIELD_COLUMNS):
        state_row = 0

        for j, val in enumerate(ACTION_ITEMS):
            item = val.split("-")[1]
            action = val.split("-")[0]
            states_df = create_states_as_data_frame()

            for j in range(len(states_df)):
                current_state = states_df.iloc[j]
                next_states_df = find_next_states(current_state, action, item, FIELD_COLUMNS)     

                if len(next_states_df) > 0:
                    idx = np.array(next_states_df.index)
                    dists = np.array(next_states_df["distance"])

                    tpms = calculate_tpm(dists, action, item)

                    tpm_full[i, state_row, idx] = tpms

                else:
                    tpm_full[i, state_row, state_row] = 1

                state_row += 1
        
    return tpm_full

In [541]:
def reward_matrix():
    R = np.zeros((STATES, FIELD))
    ids = [0, 1, 2, 3]
    state_row = 0

    for i, val in enumerate(ACTION_ITEMS):
        item = val.split("-")[1]
        action = val.split("-")[0]
        states_df = create_states_as_data_frame()

        for j in range(len(states_df)):
            current_state = states_df.iloc[j]
            next_states_df = find_next_states(current_state, action, item, FIELD_COLUMNS)            
            fields = next_states_df[FIELD_COLUMNS]
            
            for k, field_name in enumerate(fields.columns):
                state = current_state[field_name]
                
                if action == "store":
                    if state == 0:
                        R[state_row, k] = REWARD[k]
                    else:
                        R[state_row, k] = -10
                        
                if action == "restore":
                    if state != 0:
                        R[state_row, k] = REWARD[k] 
                    else:
                        R[state_row, k] = -10
                                     
            state_row += 1
        
    return R

In [542]:
states_as_data_frame = create_states_as_data_frame()
R = reward_matrix()
tpm_full = tpms_matrix()

mdpresultPolicy = mdptoolbox.mdp.PolicyIteration(tpm_full, R, 0.95, max_iter=1000)
mdpresultValue = mdptoolbox.mdp.ValueIteration(tpm_full, R, 0.95, max_iter=1000)

mdpresultPolicy.run()
mdpresultValue.run()

print('PolicyIteration:')
print(mdpresultPolicy.policy)
print(mdpresultPolicy.V)
print(mdpresultPolicy.iter)
print()
print('ValueIteration:')
print(mdpresultValue.policy)
print(mdpresultValue.V)
print(mdpresultValue.iter)

PolicyIteration:
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 2, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1

In [555]:
def evaluate_mdp(mdppolicy):
    dist = [1, 2, 2, 3]
    initial_field = [0, 0, 0, 0]
    all_fields = states_df[FIELD_COLUMNS]
    count_distance_policy = 0
    count_distance_random_walk = 0

    colors = {"blue": 1, "red": 2, "white": 3}


    for i in range(len(test_data)):
        data = test_data.iloc[i]
        action = data["Action"]
        item = data["Item"]
        action_item = action + "-" + item

        action_row = ACTION_ITEMS[action_item]

        current_field = all_fields[(all_fields == initial_field).sum(axis=1) == 4]
        current_idx = current_field.index[0] + (256 * action_row)

        policy = mdppolicy[current_idx]   

        if action == "store": 
            initial_field[policy] = colors[item]

        if action == "restore":
            initial_field[policy] = 0
            
        count_distance_policy += dist[policy]
        
    print(count_distance_policy*2)

In [562]:
def evaluate_greedy():
    dist = [1, 2, 2, 3]
    initial_field = [0, 0, 0, 0]
    all_fields = states_df[FIELD_COLUMNS]
    count_distance_random_walk = 0
    colors = {"blue": 1, "red": 2, "white": 3}

    for i in range(len(test_data)):
        data = test_data.iloc[i]
        action = data["Action"]
        item = data["Item"]
        action_item = action + "-" + item

        action_row = ACTION_ITEMS[action_item]
        current_field = all_fields[(all_fields == initial_field).sum(axis=1) == 4]    

        if action == "store": 
            zeros = np.argwhere(np.array(current_field) == 0)[:,1]

            if(zeros.size!=0):
              r = zeros[0]
            initial_field[r] = colors[item]

        if action == "restore":
            cur_item = np.argwhere(np.array(current_field) == colors[item])[:,1]

            if(cur_item.size!=0):
              r = cur_item[0]
            initial_field[r] = 0
            
        count_distance_random_walk += dist[r]

    return count_distance_random_walk*2

In [563]:
test_data = test_txt_file
result = []

for i in range(50):
    result.append(evaluate_greedy())

In [564]:
print("Policy evalutaion:")
evaluate_mdp(mdpresultPolicy.policy)


Policy evalutaion:
206


In [565]:
print("Value evalutaion:")
evaluate_mdp(mdpresultValue.policy)

Value evalutaion:
206


In [567]:
print("Greedy distance:")
print(np.mean(result))

Greedy distance:
216.0
