In [1]:
import os
os.chdir("..")
from typing import Mapping,Iterable, Iterator, Tuple, TypeVar,Optional,Callable,Dict
from itertools import islice
from rl.distribution import Choose
from rl.distribution import Categorical
from rl.prediction_utils import fmrp_episodes_stream
from rl.markov_process import (MarkovRewardProcess,
                               FiniteMarkovRewardProcess, TransitionStep)
from rl.function_approx import learning_rate_schedule
from rl.returns import returns


In [2]:
from dataclasses import dataclass
from typing import Tuple, Dict
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.markov_decision_process import FinitePolicy, StateActionMapping
from rl.markov_process import FiniteMarkovProcess, FiniteMarkovRewardProcess
from rl.distribution import Categorical, Constant
from scipy.stats import poisson


@dataclass(frozen=True)
class InventoryState:
    '''
    Immutable (hashable) inventory state made of two integer counts:
    the units on hand and the units on order.
    '''
    on_hand: int
    on_order: int

    def inventory_position(self) -> int:
        '''Return the sum of the on-hand and on-order counts.'''
        position: int = self.on_hand
        position += self.on_order
        return position


# Alias for the state -> action mapping type of the inventory MDPs below,
# with InventoryState as the state type and an integer order quantity as the
# action type. It is the declared return type of
# get_action_transition_reward_map.
InvOrderMapping = StateActionMapping[InventoryState, int]


class SimpleInventoryMDPCap(FiniteMarkovDecisionProcess[InventoryState, int]):
    '''
    Finite MDP for a capacity-constrained inventory.

    States are InventoryState(on_hand, on_order) pairs whose inventory
    position never exceeds `capacity`; actions are integer order quantities
    that keep the inventory position within `capacity`; demand is Poisson
    with mean `poisson_lambda`.
    '''

    def __init__(
        self,
        capacity: int,
        poisson_lambda: float,
        holding_cost: float,
        stockout_cost: float
    ):
        '''
        Record the model parameters, build the Poisson demand distribution
        and pass the complete transition/reward map to the base class.
        '''
        # The demand distribution has to exist before the map is built,
        # because get_action_transition_reward_map reads it.
        self.poisson_distr = poisson(poisson_lambda)

        self.capacity: int = capacity
        self.poisson_lambda: float = poisson_lambda
        self.holding_cost: float = holding_cost
        self.stockout_cost: float = stockout_cost

        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> InvOrderMapping:
        '''
        Build {state: {order: Categorical({(next_state, reward): prob})}}
        for every state with on_hand + on_order <= capacity and every order
        quantity that keeps the inventory position within capacity.
        '''
        demand = self.poisson_distr
        transition_map: Dict[
            InventoryState,
            Dict[int, Categorical[Tuple[InventoryState, float]]]
        ] = {}

        for on_hand in range(self.capacity + 1):
            for on_order in range(self.capacity + 1 - on_hand):
                current: InventoryState = InventoryState(on_hand, on_order)
                position: int = current.inventory_position()
                # Holding cost is charged on the units currently on hand.
                holding_reward: float = -self.holding_cost * on_hand
                per_order: Dict[
                    int, Categorical[Tuple[InventoryState, float]]
                ] = {}

                for qty in range(self.capacity - position + 1):
                    outcomes: Dict[Tuple[InventoryState, float], float] = {}

                    # Demand strictly below the inventory position: stock
                    # remains and only the holding cost is incurred.
                    for demand_units in range(position):
                        remaining = InventoryState(
                            position - demand_units, qty
                        )
                        outcomes[(remaining, holding_reward)] = \
                            demand.pmf(demand_units)

                    # Demand at or above the inventory position: on-hand
                    # drops to zero. The bracketed term is the closed form
                    # of E[max(D - position, 0)] for Poisson demand D.
                    stockout_prob: float = 1 - demand.cdf(position - 1)
                    expected_shortfall = (
                        stockout_prob * (self.poisson_lambda - position) +
                        position * demand.pmf(position)
                    )
                    stockout_reward: float = holding_reward - \
                        self.stockout_cost * expected_shortfall
                    outcomes[(InventoryState(0, qty), stockout_reward)] = \
                        stockout_prob

                    per_order[qty] = Categorical(outcomes)

                transition_map[current] = per_order
        return transition_map


if __name__ == '__main__':
    # Demo driver: build a small inventory MDP, evaluate the
    # "order up to capacity" policy, then solve the MDP with policy
    # iteration and value iteration.
    from pprint import pprint

    # Model parameters for the demo.
    user_capacity = 2
    user_poisson_lambda = 1.0
    user_holding_cost = 1.0
    user_stockout_cost = 10.0

    user_gamma = 0.9

    si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
        SimpleInventoryMDPCap(
            capacity=user_capacity,
            poisson_lambda=user_poisson_lambda,
            holding_cost=user_holding_cost,
            stockout_cost=user_stockout_cost
        )

    print("MDP Transition Map", "------------------", sep="\n")
    print(si_mdp)

    # Deterministic policy: always order enough to bring the inventory
    # position back up to capacity.
    fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
        InventoryState(alpha, beta): Constant(user_capacity - (alpha + beta))
        for alpha in range(user_capacity + 1)
        for beta in range(user_capacity + 1 - alpha)
    })

    print("Policy Map", "----------", sep="\n")
    print(fdp)

    implied_mrp: FiniteMarkovRewardProcess[InventoryState] =\
        si_mdp.apply_finite_policy(fdp)

    print("Implied MP Transition Map", "--------------", sep="\n")
    print(FiniteMarkovProcess(implied_mrp.transition_map))

    print("Implied MRP Transition Reward Map", "---------------------",
          sep="\n")
    print(implied_mrp)

    print("Implied MP Stationary Distribution", "-----------------------",
          sep="\n")
    implied_mrp.display_stationary_distribution()
    print()

    print("Implied MRP Reward Function", "---------------", sep="\n")
    implied_mrp.display_reward_function()
    print()

    print("Implied MRP Value Function", "--------------", sep="\n")
    implied_mrp.display_value_function(gamma=user_gamma)
    print()

    from rl.dynamic_programming import (
        evaluate_mrp_result,
        policy_iteration_result,
        value_iteration_result
    )

    print("Implied MRP Policy Evaluation Value Function", "--------------",
          sep="\n")
    pprint(evaluate_mrp_result(implied_mrp, gamma=user_gamma))
    print()

    print("MDP Policy Iteration Optimal Value Function and Optimal Policy",
          "--------------", sep="\n")
    opt_vf_pi, opt_policy_pi = policy_iteration_result(
        si_mdp,
        gamma=user_gamma
    )
    pprint(opt_vf_pi)
    print(opt_policy_pi)
    print()

    print("MDP Value Iteration Optimal Value Function and Optimal Policy",
          "--------------", sep="\n")
    opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
    pprint(opt_vf_vi)
    print(opt_policy_vi)
    print()

MDP Transition Map
------------------
From State InventoryState(on_hand=0, on_order=0):
  With Action 0:
    To [State InventoryState(on_hand=0, on_order=0) and Reward -10.000] with Probability 1.000
  With Action 1:
    To [State InventoryState(on_hand=0, on_order=1) and Reward -10.000] with Probability 1.000
  With Action 2:
    To [State InventoryState(on_hand=0, on_order=2) and Reward -10.000] with Probability 1.000
From State InventoryState(on_hand=0, on_order=1):
  With Action 0:
    To [State InventoryState(on_hand=1, on_order=0) and Reward -0.000] with Probability 0.368
    To [State InventoryState(on_hand=0, on_order=0) and Reward -3.679] with Probability 0.632
  With Action 1:
    To [State InventoryState(on_hand=1, on_order=1) and Reward -0.000] with Probability 0.368
    To [State InventoryState(on_hand=0, on_order=1) and Reward -3.679] with Probability 0.632
From State InventoryState(on_hand=0, on_order=2):
  With Action 0:
    To [State InventoryState(on_hand=2, on_order=

# p1 p2 p3

In [6]:
from typing import Tuple, Sequence, Set, Mapping, Dict, Callable, Optional
from dataclasses import dataclass
from operator import itemgetter
from rl.distribution import Categorical, Choose, Constant
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.markov_decision_process import StateActionMapping
from rl.markov_decision_process import FinitePolicy
from rl.dynamic_programming import value_iteration_result, V

class SimpleInventoryMDPCap_RL(FiniteMarkovDecisionProcess[InventoryState, int]):

    def __init__(
        self,
        capacity: int,
        poisson_lambda: float,
        holding_cost: float,
        stockout_cost: float
    ):
        '''
        Record the model parameters, build the Poisson demand distribution
        with mean `poisson_lambda`, and pass the complete transition/reward
        map to the base-class constructor.
        '''
        # Built first: get_action_transition_reward_map reads this attribute.
        self.poisson_distr = poisson(poisson_lambda)

        self.capacity: int = capacity
        self.poisson_lambda: float = poisson_lambda
        self.holding_cost: float = holding_cost
        self.stockout_cost: float = stockout_cost

        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> InvOrderMapping:
        '''
        Build {state: {order: Categorical({(next_state, reward): prob})}}
        for every state with on_hand + on_order <= capacity and every order
        quantity that keeps the inventory position within capacity.
        '''
        demand = self.poisson_distr
        transition_map: Dict[
            InventoryState,
            Dict[int, Categorical[Tuple[InventoryState, float]]]
        ] = {}

        for on_hand in range(self.capacity + 1):
            for on_order in range(self.capacity + 1 - on_hand):
                current: InventoryState = InventoryState(on_hand, on_order)
                position: int = current.inventory_position()
                # Holding cost is charged on the units currently on hand.
                holding_reward: float = -self.holding_cost * on_hand
                per_order: Dict[
                    int, Categorical[Tuple[InventoryState, float]]
                ] = {}

                for qty in range(self.capacity - position + 1):
                    outcomes: Dict[Tuple[InventoryState, float], float] = {}

                    # Demand strictly below the inventory position: stock
                    # remains and only the holding cost is incurred.
                    for demand_units in range(position):
                        remaining = InventoryState(
                            position - demand_units, qty
                        )
                        outcomes[(remaining, holding_reward)] = \
                            demand.pmf(demand_units)

                    # Demand at or above the inventory position: on-hand
                    # drops to zero. The bracketed term is the closed form
                    # of E[max(D - position, 0)] for Poisson demand D.
                    stockout_prob: float = 1 - demand.cdf(position - 1)
                    expected_shortfall = (
                        stockout_prob * (self.poisson_lambda - position) +
                        position * demand.pmf(position)
                    )
                    stockout_reward: float = holding_reward - \
                        self.stockout_cost * expected_shortfall
                    outcomes[(InventoryState(0, qty), stockout_reward)] = \
                        stockout_prob

                    per_order[qty] = Categorical(outcomes)

                transition_map[current] = per_order
        return transition_map


    @staticmethod
    def epsilon_greedy_action(
        nt_state: InventoryState,
        q: Mapping[InventoryState, Mapping[int, float]],
        epsilon: float
    ) -> int:
        '''
        Given a non-terminal state, a Q-Value Function (in the form of a
        {state: {action: Expected Return}} dictionary) and epsilon, return
        an action sampled from the probability distribution implied by an
        epsilon-greedy policy that is derived from the Q-Value Function.

        Every action available in `nt_state` gets probability
        epsilon / (number of actions); the greedy action (the first action
        with the highest Q-value) additionally gets 1 - epsilon.
        '''
        action_values: Mapping[int, float] = q[nt_state]
        # Actions here are integer order quantities.
        greedy_action: int = max(action_values.items(), key=itemgetter(1))[0]
        explore_prob: float = epsilon / len(action_values)
        return Categorical(
            {a: explore_prob + (1 - epsilon if a == greedy_action else 0.)
             for a in action_values}
        ).sample()

    def get_states_actions_dict(self) -> Mapping[InventoryState, Optional[Set[int]]]:
        '''
        Map every state to its set of available actions, or to None when
        the state is terminal.

        Non-terminal states are inserted first, followed by the terminal
        states.
        '''
        # NOTE(review): get_actions_and_next_states, get_all_nt_states and
        # terminals are not defined in the visible part of this class;
        # presumably they are expected from the base class -- confirm they
        # exist there.
        states_actions: Dict[InventoryState, Optional[Set[int]]] = {}
        for nt_state in self.get_all_nt_states():
            states_actions[nt_state] = {
                action
                for action, _ in self.get_actions_and_next_states(nt_state)
            }
        for terminal_state in self.terminals:
            states_actions[terminal_state] = None
        return states_actions

    def get_sarsa_vf_and_policy(
        self,
        states_actions_dict: Mapping[InventoryState, Optional[Set[int]]],
        sample_func: Callable[[InventoryState, int], Tuple[InventoryState, float]],
        episodes: int = 10000,
        step_size: float = 0.01,
        gamma: float = 1.0,
        max_steps: Optional[int] = None
    ) -> Tuple[V[InventoryState], FinitePolicy[InventoryState, int]]:
        '''
        Tabular SARSA control with a decaying epsilon-greedy policy
        (epsilon = 1 / (episode number + 1)).

        states_actions_dict maps each state to its set of available order
        quantities (None for a terminal state).
        sample_func is a function with two inputs: state and action,
        and with output as a sampled pair of (next_state, reward).
        episodes is the number of episodes, each started from a uniformly
        sampled non-terminal state.
        step_size is the constant learning rate of the Q-value update.
        gamma is the discount factor applied to the next Q-value; the
        default of 1.0 reproduces the undiscounted update.
        max_steps optionally caps the number of transitions per episode.
        With the default None an episode only ends when a terminal state is
        reached, so a cap is needed when the process never terminates.

        Returns the greedy value function {state: max_a Q(state, a)} and
        the corresponding deterministic greedy policy.
        '''
        q: Dict[InventoryState, Dict[int, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items() if actions is not None}
        nt_states = {s for s in q}
        uniform_states: Choose[InventoryState] = Choose(nt_states)
        for episode_num in range(episodes):
            epsilon: float = 1.0 / (episode_num + 1)
            state: InventoryState = uniform_states.sample()

            # The start state is non-terminal, so an epsilon-greedy action
            # is always available.
            action: int = self.epsilon_greedy_action(
                nt_state=state, q=q, epsilon=epsilon
            )
            steps: int = 0
            while max_steps is None or steps < max_steps:
                # Sample the next state and reward pair.
                next_state, reward = sample_func(state, action)
                steps += 1
                # NOTE(review): `terminals` is not defined in the visible
                # part of this class -- confirm the base class provides it.
                if next_state in self.terminals:
                    # Terminal next state: its Q-value is 0, so the target
                    # is the reward alone, and the episode ends.
                    q[state][action] += step_size * (reward - q[state][action])
                    break
                # On-policy target: the action actually taken next.
                next_action: int = self.epsilon_greedy_action(
                    nt_state=next_state, q=q, epsilon=epsilon
                )
                q[state][action] += step_size * (
                    reward + gamma * q[next_state][next_action]
                    - q[state][action]
                )
                state = next_state
                action = next_action
        vf_dict: V[InventoryState] = {s: max(d.values()) for s, d in q.items()}
        policy: FinitePolicy[InventoryState, int] = FinitePolicy(
            {s: Constant(max(d.items(), key=itemgetter(1))[0])
             for s, d in q.items()}
        )
        return (vf_dict, policy)

    def get_q_learning_vf_and_policy(
        self,
        states_actions_dict: Mapping[InventoryState, Optional[Set[int]]],
        sample_func: Callable[[InventoryState, int], Tuple[InventoryState, float]],
        episodes: int = 10000,
        step_size: float = 0.01,
        epsilon: float = 0.1,
        gamma: float = 1.0,
        max_steps: Optional[int] = None
    ) -> Tuple[V[InventoryState], FinitePolicy[InventoryState, int]]:
        '''
        Tabular Q-learning control with a fixed epsilon-greedy behaviour
        policy.

        states_actions_dict maps each state to its set of available order
        quantities (None for a terminal state).
        sample_func is a function with two inputs: state and action,
        and with output as a sampled pair of (next_state, reward).
        episodes is the number of episodes, each started from a uniformly
        sampled non-terminal state.
        step_size is the constant learning rate of the Q-value update.
        epsilon is the exploration probability of the behaviour policy,
        applied at every step of every episode.
        gamma is the discount factor applied to the bootstrapped value; the
        default of 1.0 reproduces the undiscounted update.
        max_steps optionally caps the number of transitions per episode.
        With the default None an episode only ends when a terminal state is
        reached, so a cap is needed when the process never terminates.

        Returns the greedy value function {state: max_a Q(state, a)} and
        the corresponding deterministic greedy policy.
        '''
        q: Dict[InventoryState, Dict[int, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items() if actions is not None}
        nt_states = {s for s in q}
        uniform_states: Choose[InventoryState] = Choose(nt_states)
        for _ in range(episodes):
            state: InventoryState = uniform_states.sample()
            steps: int = 0
            while max_steps is None or steps < max_steps:
                # Behaviour policy: epsilon-greedy in the *current* state on
                # every step. Reusing the greedy target action as the next
                # behaviour action would switch exploration off after the
                # first step of each episode.
                action: int = self.epsilon_greedy_action(
                    nt_state=state, q=q, epsilon=epsilon
                )
                # Sample the next state and reward pair.
                next_state, reward = sample_func(state, action)
                steps += 1
                # NOTE(review): `terminals` is not defined in the visible
                # part of this class -- confirm the base class provides it.
                if next_state in self.terminals:
                    # Terminal next state: its value is 0, so the target is
                    # the reward alone, and the episode ends.
                    q[state][action] += step_size * (reward - q[state][action])
                    break
                # Off-policy target: best Q-value available in next_state.
                best_next: float = max(q[next_state].values())
                q[state][action] += step_size * (
                    reward + gamma * best_next - q[state][action]
                )
                state = next_state

        vf_dict: V[InventoryState] = {s: max(d.values()) for s, d in q.items()}
        policy: FinitePolicy[InventoryState, int] = FinitePolicy(
            {s: Constant(max(d.items(), key=itemgetter(1))[0])
             for s, d in q.items()}
        )
        return (vf_dict, policy)

  