# value iteration 

This entire notebook was used during development and only contains snippets of useful code. Reading it will offer very little insight. It won't be commented; I have only left it here because it is useful for taking apart certain functions which appear in other parts of this repository. 

In [17]:
import numpy as np 
from typing import Callable

In [None]:
# Grid dimensions: number of cells along the first and second coordinate.
len_x, len_y = 10, 10

# Every cell of the grid as an (i, j) pair; the second coordinate varies fastest.
states = [(row, col) for row in range(len_x) for col in range(len_y)]

# Transition table, filled in by a later cell.
probabilities = {}

# The four corner cells, in the order (0, 0), (0, max), (max, 0), (max, max).
corners = [(x, y) for x in (0, len_x - 1) for y in (0, len_y - 1)]

# Special cells and the value attached to each (paired by position).
reward_states = [(8, 7), (7, 2), (3, 4), (3, 7)]
reward_values = [10, 3, -5, -10]

# Moves expressed as (delta on first coordinate, delta on second coordinate).
actions = [
    (1, 0),   # right
    (-1, 0),  # left
    (0, 1),   # up
    (0, -1),  # down
]

In [3]:
def populate_probabilities_rewards(state, actions):
    """Build a transition table that sends every action to the corners.

    Args:
        state: Unused; accepted so the signature matches the other
            ``populate_probabilities_*`` builders.
        actions: Iterable of actions; only their positions are used as keys.

    Returns:
        dict: ``{action_index: {corner: 0.25}}`` with one entry per cell in
        the module-level ``corners`` list.
    """
    return {
        action_index: {corner: 0.25 for corner in corners}
        for action_index, _ in enumerate(actions)
    }

def populate_probabilities_usual(state, actions):
    """Build the transition table for a cell with no bounds checking.

    For every intended action, the move that matches it gets probability 0.7
    and every other move gets 0.1. Destinations are computed by adding the
    move to ``state`` component-wise, without clamping to the grid.

    Args:
        state: Indexable pair giving the current cell.
        actions: Sequence of indexable pairs (the possible moves).

    Returns:
        dict: ``{action_index: {destination_cell: probability}}``.
    """
    return {
        action_index: {
            (state[0] + step[0], state[1] + step[1]): (0.7 if wanted == step else 0.1)
            for step in actions
        }
        for action_index, wanted in enumerate(actions)
    }

def populate_probabilities_edge(state, actions, grid_shape=None):
    """Build the transition table for a cell on the border of the grid.

    For every intended action, the move that matches it carries probability
    0.7 and every other move carries 0.1. A move that would leave the grid
    keeps the agent in ``state``. When several moves end in the same cell
    (e.g. two out-of-bounds moves at a corner), their probabilities are
    summed so that each action's distribution still adds up to 1.

    Args:
        state: Tuple ``(i, j)`` giving the current cell. It is used as a
            dictionary key, so it must be hashable.
        actions: Sequence of indexable pairs (the possible moves).
        grid_shape: Optional ``(size_first_axis, size_second_axis)``. When
            omitted, the module-level ``len_x`` and ``len_y`` are used.

    Returns:
        dict: ``{action_index: {destination_cell: probability}}``.
    """
    if grid_shape is None:
        grid_shape = (len_x, len_y)
    n_rows, n_cols = grid_shape

    sub_dirs = {}
    for index, intended_move in enumerate(actions):
        outcomes = {}

        for actual_move in actions:
            new_row = state[0] + actual_move[0]
            new_col = state[1] + actual_move[1]

            if 0 <= new_row < n_rows and 0 <= new_col < n_cols:
                new_state = (new_row, new_col)
            else:
                # Out of bounds: remain in the same cell.
                new_state = state

            value = 0.7 if intended_move == actual_move else 0.1
            # Accumulate rather than overwrite: at a corner two different
            # moves both map to `state`, and overwriting would drop mass.
            outcomes[new_state] = outcomes.get(new_state, 0.0) + value

        sub_dirs[index] = outcomes

    return sub_dirs

In [4]:
def populate_positive_rewards(state, actions, reward_value):
    """Build a reward table that pays ``reward_value`` on reaching any corner.

    Args:
        state: Unused; accepted so the signature matches the other builders.
        actions: Iterable of actions; only their positions are used as keys.
        reward_value: Value stored for every corner destination.

    Returns:
        dict: ``{action_index: {corner: reward_value}}`` with one entry per
        cell in the module-level ``corners`` list.
    """
    return {
        action_index: {corner: reward_value for corner in corners}
        for action_index, _ in enumerate(actions)
    }

def populate_negative_rewards(state, actions, reward_value):
    """Build a reward table that assigns ``reward_value`` to every neighbour.

    Destinations are computed by adding each move to ``state`` component-wise,
    without clamping to the grid.

    Args:
        state: Indexable pair giving the current cell.
        actions: Sequence of indexable pairs (the possible moves).
        reward_value: Value stored for every destination cell.

    Returns:
        dict: ``{action_index: {destination_cell: reward_value}}``.
    """
    return {
        action_index: {
            (state[0] + step[0], state[1] + step[1]): reward_value
            for step in actions
        }
        for action_index, _ in enumerate(actions)
    }

def populate_edge_penalties(state, actions):
    """Build a reward table that penalises moves which leave the grid.

    A move that would leave the grid (bounds taken from the module-level
    ``len_x`` and ``len_y``) keeps the agent in ``state``. Any destination
    equal to ``state`` is given -1; every other destination is given 0.

    Args:
        state: Tuple ``(i, j)`` giving the current cell. It is used as a
            dictionary key, so it must be hashable.
        actions: Sequence of indexable pairs (the possible moves).

    Returns:
        dict: ``{action_index: {destination_cell: penalty}}``.
    """
    penalties = {}

    for action_index, _ in enumerate(actions):
        per_destination = {}

        for step in actions:
            row = state[0] + step[0]
            col = state[1] + step[1]

            # Stay put whenever the candidate cell falls outside the grid.
            inside = 0 <= row < len_x and 0 <= col < len_y
            destination = (row, col) if inside else state

            per_destination[destination] = -1 if destination == state else 0

        penalties[action_index] = per_destination

    return penalties

In [6]:
# Fill the transition table, choosing one builder per cell:
#   - positive-reward cell -> teleportation step (spread over the corners)
#   - strictly interior cell -> usual sub directories
#   - anything else -> we must be at an edge
for state in states:
    i, j = state
    probabilities[state] = (
        populate_probabilities_rewards
        if state in reward_states and reward_values[reward_states.index(state)] > 0
        else populate_probabilities_usual
        if 0 < i < len_x - 1 and 0 < j < len_y - 1
        else populate_probabilities_edge
    )(state, actions)

In [8]:
# Fill the reward table. Cells that are neither reward cells nor on the border
# get no entry.
rewards = {}
for state in states:
    i, j = state

    if state in reward_states:
        if reward_values[reward_states.index(state)] > 0:
            # we are about to teleport
            rewards[state] = populate_positive_rewards(
                state, actions, reward_values[reward_states.index(state)]
            )
            continue
        if reward_values[reward_states.index(state)] < 0:
            # we get a reward but do not teleport
            rewards[state] = populate_negative_rewards(
                state, actions, reward_values[reward_states.index(state)]
            )
            continue

    # Reached for ordinary cells and for reward cells whose value is exactly 0.
    if i in (0, len_x - 1) or j in (0, len_y - 1):
        rewards[state] = populate_edge_penalties(state, actions)

In [10]:
from MDP import GenericMDP
import numpy as np
# Build the solver for the gridworld problem from the tables defined above.
# NOTE(review): 0.9 and 200 are passed positionally; presumably the discount
# factor and an iteration limit — confirm against GenericMDP's signature.
mdp_solver = GenericMDP(
    states,
    actions,
    probabilities,
    rewards,
    0.9,
    200,
    len_x=len_x,
    len_y=len_y,
    reward_list=reward_states,
    reward_values=reward_values,
    problem_type='gridworld',
)

In [7]:
# Hand-entered transition model for ex 9.27.
# Resulting layout: probabilities[state][action_index][next_state] -> float.
states = ['Healthy', 'Sick']
actions = ['Relax', 'Party']
probabilities = {}

for s in states:
    per_action = {}
    for index, action in enumerate(actions):
        if len(states) == 2:
            # Two states: ask only for the first one and derive the complement.
            answer = input(f'if in state = {s} and you take action = {action}, what is the probability of entering state = {states[0]}')
            p_first = float(answer.strip())
            print(f'Defaulting to probability = {(1-p_first):.3f} for transition {s} to {states[1]} following action {action}')
            next_state_probs = {states[0]: p_first, states[1]: (1-p_first)}
        else:
            # General case: ask for every next state explicitly.
            next_state_probs = {}
            for s_prime in states:
                answer = input(f'if in state = {s} and you take action = {action}, what is the probability of entering state = {s_prime}')
                next_state_probs[s_prime] = float(answer.strip())
        per_action[index] = next_state_probs
    probabilities[s] = per_action


Defaulting to probability = 0.050 for transition Healthy to Sick following action Relax
Defaulting to probability = 0.300 for transition Healthy to Sick following action Party
Defaulting to probability = 0.500 for transition Sick to Sick following action Relax
Defaulting to probability = 0.900 for transition Sick to Sick following action Party


In [8]:
# Next, collect the reward for each (state, action) pair interactively.
# Resulting layout: rewards[state][action_index][next_state] -> float.
rewards = {}
# NOTE(review): this guard tests len(actions), but the body below indexes
# states[0] and states[1] — confirm whether len(states) was intended. When the
# guard is false, the code visible here leaves `rewards` empty.
if len(actions) == 2:
    for s in states: 
        temp = {}  # action index -> {next_state: reward}
        for index, action in enumerate(actions):
            temp2 = {}  # next_state -> reward for this (state, action)
            reward = input(f'if in state = {s} and you take action = {action}, what is the reward given?')
            
            # The same value is stored under both next states, so the reward
            # does not depend on which state is entered.
            temp2[states[0]] = float(reward)
            temp2[states[1]] = float(reward)
            temp[index] = temp2
        rewards[s] = temp