In [3]:
from dataclasses import dataclass
from typing import Tuple, Dict, Generator,Callable,Optional
import numpy as np
import matplotlib.pyplot as plt
import itertools
import os
os.chdir("..")
from rl.markov_process import MarkovRewardProcess,FiniteMarkovProcess,MarkovProcess
from rl.markov_process import FiniteMarkovRewardProcess
from rl.markov_process import Transition,TransitionStep,ReturnStep
from rl.markov_process import RewardTransition
from rl.distribution import Constant,Categorical,SampledDistribution,Distribution
from rl.gen_utils.common_funcs import get_logistic_func

# p1 p2

### We incorporate tabular MC and TD in a single class, implementing them as two methods: mc_tabular and td_tabular

In [2]:
from typing import Tuple, Sequence, Set, Mapping, Dict, Callable, Optional
from dataclasses import dataclass
from operator import itemgetter
from rl.distribution import Categorical, Choose, Constant
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.markov_decision_process import StateActionMapping
from rl.markov_decision_process import FinitePolicy
from rl.dynamic_programming import value_iteration_result, V


@dataclass(frozen=True)
class WindyGrid:
    '''
    Grid world with random vertical wind in each column.

    Cells are (row, column) pairs. A move takes the agent to an adjacent
    valid cell, after which the wind of the destination column may push it
    one row down (row - 1) or one row up (row + 1). Every step has a reward
    of -1; being blown against a block or the grid boundary additionally
    costs bump_cost.

    NOTE: this class relies on the module-level names Cell, CellSet, Move,
    WindSpec and possible_moves, which must be defined before the class is.
    '''

    rows: int  # number of grid rows
    columns: int  # number of grid columns
    blocks: CellSet  # coordinates of block cells
    terminals: CellSet  # coordinates of goal cells
    wind: WindSpec  # spec of vertical random wind for the columns
    bump_cost: float  # cost of bumping against block or boundary

    def validate_spec(self) -> bool:
        '''
        returns True if and only if the grid specification is consistent:
        at least 2 rows and 2 columns, all blocks inside the grid, at least
        one terminal, all terminals inside the grid and not on a block,
        one wind pair per column with 0 <= p1, p2 and p1 + p2 <= 1,
        and a strictly positive bump cost.
        '''
        def in_grid(r: int, c: int) -> bool:
            return 0 <= r < self.rows and 0 <= c < self.columns

        checks = [
            self.rows >= 2,
            self.columns >= 2,
            all(in_grid(r, c) for r, c in self.blocks),
            len(self.terminals) >= 1,
            all(in_grid(r, c) and (r, c) not in self.blocks
                for r, c in self.terminals),
            len(self.wind) == self.columns,
            all(0. <= p1 <= 1. and 0. <= p2 <= 1. and p1 + p2 <= 1.
                for p1, p2 in self.wind),
            self.bump_cost > 0.
        ]
        return all(checks)

    def print_wind_and_bumps(self) -> None:
        '''
        prints the (down, up) wind probabilities of every column followed
        by the bump cost
        '''
        for i, (d, u) in enumerate(self.wind):
            print(f"Column {i:d}: Down Prob = {d:.2f}, Up Prob = {u:.2f}")
        print(f"Bump Cost = {self.bump_cost:.2f}")
        print()

    @staticmethod
    def add_tuples(a: Cell, b: Cell) -> Cell:
        '''
        returns the component-wise sum of two (row, column) pairs
        '''
        return a[0] + b[0], a[1] + b[1]

    def is_valid_state(self, cell: Cell) -> bool:
        '''
        checks if a cell is a valid state of the MDP, i.e., it lies inside
        the grid and is not a block (terminal cells are valid states)
        '''
        return 0 <= cell[0] < self.rows and 0 <= cell[1] < self.columns \
            and cell not in self.blocks

    def get_all_nt_states(self) -> CellSet:
        '''
        returns all the non-terminal states, i.e., every grid cell that is
        neither a block nor a terminal
        '''
        # Built once (not once per cell); also works when blocks/terminals
        # are frozensets or other iterables of cells.
        excluded = set(self.blocks).union(self.terminals)
        return {(i, j) for i in range(self.rows) for j in range(self.columns)
                if (i, j) not in excluded}

    def get_actions_and_next_states(self, nt_state: Cell) \
            -> Set[Tuple[Move, Cell]]:
        '''
        given a non-terminal state, returns the set of all possible
        (action, next_state) pairs. Moves that would leave the grid or
        enter a block are excluded.
        '''
        candidates = ((a, WindyGrid.add_tuples(nt_state, a))
                      for a in possible_moves)
        return {(a, s) for a, s in candidates if self.is_valid_state(s)}

    def get_transition_probabilities(self, nt_state: Cell) \
            -> Mapping[Move, Categorical[Tuple[Cell, float]]]:
        '''
        given a non-terminal state, return a dictionary whose
        keys are the valid actions (moves) from the given state
        and the corresponding values are the associated probabilities
        (following that move) of the (next_state, reward) pairs.
        The probabilities are determined from the wind probabilities
        of the column one is in after the move. Note that if one moves
        to a goal cell (terminal state), then one ends up in that
        goal cell with 100% probability (i.e., no wind exposure in a
        goal cell).
        If the wind pushes towards an invalid cell (block or outside the
        grid), one stays in the cell moved to and the reward is
        -1 - bump_cost instead of -1.
        '''
        d: Dict[Move, Categorical[Tuple[Cell, float]]] = {}
        for a, (r, c) in self.get_actions_and_next_states(nt_state):
            if (r, c) in self.terminals:
                d[a] = Categorical({((r, c), -1.): 1.})
                continue
            down_prob, up_prob = self.wind[c]
            outcomes: Dict[Tuple[Cell, float], float] = \
                {((r, c), -1.): 1. - down_prob - up_prob}
            bump_prob = 0.
            blocked = False
            # wind blows down to row r - 1 and up to row r + 1
            for blown_to, prob in (((r - 1, c), down_prob),
                                   ((r + 1, c), up_prob)):
                if self.is_valid_state(blown_to):
                    outcomes[(blown_to, -1.)] = prob
                else:
                    blocked = True
                    bump_prob += prob
            # a bump outcome exists only if some wind direction is blocked
            if blocked:
                outcomes[((r, c), -1. - self.bump_cost)] = bump_prob
            d[a] = Categorical(outcomes)
        return d

    def get_finite_mdp(self) -> FiniteMarkovDecisionProcess[Cell, Move]:
        '''
        returns the FiniteMarkovDecision object for this windy grid problem
        (terminal states are mapped to None)
        '''
        d1: StateActionMapping[Cell, Move] = \
            {s: self.get_transition_probabilities(s) for s in
             self.get_all_nt_states()}
        d2: StateActionMapping[Cell, Move] = {s: None for s in self.terminals}
        return FiniteMarkovDecisionProcess({**d1, **d2})

    def get_vi_vf_and_policy(self) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
        '''
        Performs the Value Iteration DP algorithm returning the
        Optimal Value Function (as a V[Cell]) and the Optimal Policy
        (as a FinitePolicy[Cell, Move])
        '''
        return value_iteration_result(self.get_finite_mdp(), gamma=1.)

    @staticmethod
    def epsilon_greedy_action(
        nt_state: Cell,
        q: Mapping[Cell, Mapping[Move, float]],
        epsilon: float
    ) -> Move:
        '''
        given a non-terminal state, a Q-Value Function (in the form of a
        {state: {action: Expected Return}} dictionary) and epsilon, return
        an action sampled from the probability distribution implied by an
        epsilon-greedy policy that is derived from the Q-Value Function.
        '''
        action_values: Mapping[Move, float] = q[nt_state]
        greedy_action: Move = max(action_values.items(), key=itemgetter(1))[0]
        # every action gets epsilon / |A|; the greedy one gets 1 - epsilon more
        return Categorical(
            {a: epsilon / len(action_values) +
             (1 - epsilon if a == greedy_action else 0.)
             for a in action_values}
        ).sample()

    def get_states_actions_dict(self) -> Mapping[Cell, Optional[Set[Move]]]:
        '''
        Returns a dictionary whose keys are the states and the corresponding
        values are the set of actions for the state (if the key is a
        non-terminal state) or is None if the state is a terminal state.
        '''
        d1: Mapping[Cell, Optional[Set[Move]]] = \
            {s: {a for a, _ in self.get_actions_and_next_states(s)}
             for s in self.get_all_nt_states()}
        d2: Mapping[Cell, Optional[Set[Move]]] = \
            {s: None for s in self.terminals}
        return {**d1, **d2}

    def get_sarsa_vf_and_policy(
        self,
        states_actions_dict: Mapping[Cell, Optional[Set[Move]]],
        sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
        episodes: int = 10000,
        step_size: float = 0.01
    ) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
        '''
        states_actions_dict gives us the set of possible moves from
        a non-block cell.
        sample_func is a function with two inputs: state and action,
        and with output as a sampled pair of (next_state, reward).

        Runs tabular SARSA (no discounting) for the given number of
        episodes. Each episode starts from a uniformly sampled non-terminal
        state and uses an epsilon-greedy policy with
        epsilon = 1 / (episode number + 1). Returns the value function
        (max over actions of Q) and the greedy policy derived from Q.
        '''
        q: Dict[Cell, Dict[Move, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items() if actions is not None}
        nt_states: CellSet = {s for s in q}
        uniform_states: Choose[Cell] = Choose(nt_states)
        for episode_num in range(episodes):
            epsilon: float = 1.0 / (episode_num + 1)
            state: Cell = uniform_states.sample()
            # the start state is non-terminal, so an action always exists
            action: Move = self.epsilon_greedy_action(
                nt_state=state, q=q, epsilon=epsilon
            )
            while True:
                next_state, reward = sample_func(state, action)
                if next_state in self.terminals:
                    # Q of a terminal state is 0: the target is the reward
                    q[state][action] += step_size * \
                        (reward - q[state][action])
                    break
                # on-policy: bootstrap from the action that will be taken
                next_action: Move = self.epsilon_greedy_action(
                    nt_state=next_state, q=q, epsilon=epsilon
                )
                q[state][action] += step_size * \
                    (reward + q[next_state][next_action] - q[state][action])
                state, action = next_state, next_action
        vf_dict: V[Cell] = {s: max(d.values()) for s, d in q.items()}
        policy: FinitePolicy[Cell, Move] = FinitePolicy(
            {s: Constant(max(d.items(), key=itemgetter(1))[0])
             for s, d in q.items()}
        )
        return (vf_dict, policy)

    def get_q_learning_vf_and_policy(
        self,
        states_actions_dict: Mapping[Cell, Optional[Set[Move]]],
        sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
        episodes: int = 10000,
        step_size: float = 0.01,
        epsilon: float = 0.1
    ) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
        '''
        states_actions_dict gives us the set of possible moves from
        a non-block cell.
        sample_func is a function with two inputs: state and action,
        and with output as a sampled pair of (next_state, reward).

        Runs tabular Q-learning (no discounting) for the given number of
        episodes. Each episode starts from a uniformly sampled non-terminal
        state. At every step the behaviour action is sampled from the
        epsilon-greedy policy, while the update bootstraps from the
        maximum Q-Value of the next state. Returns the value function
        (max over actions of Q) and the greedy policy derived from Q.
        '''
        q: Dict[Cell, Dict[Move, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items() if actions is not None}
        nt_states: CellSet = {s for s in q}
        uniform_states: Choose[Cell] = Choose(nt_states)
        for _ in range(episodes):
            state: Cell = uniform_states.sample()
            while True:
                # off-policy: the behaviour action is re-sampled
                # epsilon-greedily at every step, not only the first one
                action: Move = self.epsilon_greedy_action(
                    nt_state=state, q=q, epsilon=epsilon
                )
                next_state, reward = sample_func(state, action)
                if next_state in self.terminals:
                    # Q of a terminal state is 0: the target is the reward
                    q[state][action] += step_size * \
                        (reward - q[state][action])
                    break
                # the target uses the greedy value of the next state
                best_next: float = max(q[next_state].values())
                q[state][action] += step_size * \
                    (reward + best_next - q[state][action])
                state = next_state
        vf_dict: V[Cell] = {s: max(d.values()) for s, d in q.items()}
        policy: FinitePolicy[Cell, Move] = FinitePolicy(
            {s: Constant(max(d.items(), key=itemgetter(1))[0])
             for s, d in q.items()}
        )
        return (vf_dict, policy)

    def print_vf_and_policy(
        self,
        vf_dict: V[Cell],
        policy: FinitePolicy[Cell, Move]
    ) -> None:
        '''
        prints two grids with the highest row index at the top: first the
        negated values of vf_dict (0.00 for terminals, XXXXX for blocks),
        then the policy's move symbol from possible_moves for every
        non-terminal state (T for terminals, X for blocks).
        '''
        display = "%5.2f"
        display1 = "%5d"
        # later entries win: terminals override vf_dict, blocks override both
        vf_full_dict = {
            **{s: display % -v for s, v in vf_dict.items()},
            **{s: display % 0.0 for s in self.terminals},
            **{s: 'X' * 5 for s in self.blocks}
        }
        print("   " + " ".join([display1 % j for j in range(self.columns)]))
        for i in range(self.rows - 1, -1, -1):
            print("%2d " % i + " ".join(vf_full_dict[(i, j)]
                                        for j in range(self.columns)))
        print()
        pol_full_dict = {
            **{s: possible_moves[policy.act(s).value]
               for s in self.get_all_nt_states()},
            **{s: 'T' for s in self.terminals},
            **{s: 'X' for s in self.blocks}
        }
        print("   " + " ".join(["%2d" % j for j in range(self.columns)]))
        for i in range(self.rows - 1, -1, -1):
            print("%2d  " % i + "  ".join(pol_full_dict[(i, j)]
                                          for j in range(self.columns)))
        print()



ModuleNotFoundError: No module named 'rl.distribution'

# p3



# p4

