# Ex8.8 Trajectory Sampling Exercise

## 0. Problem Definitions

* undiscounted episodic tasks
* 2 actions from each state, each resulting in one of $b$ states, all equally likely
* different random selection of $b$ states for each state-action pair
* for each transition there's a $0.1$ chance of entering a terminal state
* expected reward for each transition sampled from $\mathcal{N}(0,1)$

## 1. Environment

In [2]:
import numpy as np
import random as rd
import matplotlib.pyplot as plt

In [4]:
class Environment:
    '''Randomly generated task with a fixed branching factor.

    Each (state, action) pair owns ``b`` successor state indices that are
    drawn once, at construction time, and stored in ``self.trans``.

    Args:
        n:   number of states.
        act: number of actions available in every state.
        b:   branching factor (successors per state-action pair).
    '''
    def __init__(self, n, act, b):
        self.states  = n
        # The original assigned the undefined name `a` here, which raised
        # NameError on every construction; the parameter is called `act`.
        self.actions = act
        self.branch  = b

        self.build_transition_graph()

    def build_transition_graph(self):
        '''Build the successor table for every state-action pair.

        ``self.trans[s, a]`` holds ``branch`` state indices, each sampled
        independently and uniformly from ``range(states)`` (so a pair may
        list the same successor more than once).
        '''
        self.trans = np.random.randint(
            self.states, size=(self.states, self.actions, self.branch))

    def get_reward(self):
        '''Return a reward drawn from the standard normal distribution.

        A fresh sample is drawn on every call.
        '''
        return np.random.normal()

    def is_done(self):
        '''Return True with probability 0.1, marking the move as terminal.'''
        return np.random.rand() < 0.1

## 2. One Step Tabular Planning

In [None]:
class TabularPlanning:
    '''One-step tabular Q-planning over a randomly generated environment.

    Keeps an action-value table ``Q`` of shape ``(states, actions)`` and
    updates it with sampled one-step Q-learning backups.

    Args:
        environment: object exposing ``states``, ``actions``, ``trans``,
            ``get_reward()`` and ``is_done()``.
        alpha: step size of each update.
        gamma: discount factor applied to the bootstrapped value.
    '''

    def __init__(self, environment, alpha, gamma):
        self.env   = environment
        self.alpha = alpha
        self.gamma = gamma
        # Create the value table up front so the planner is usable right
        # after construction (the original never initialised ``Q``).
        self.reset()

    def reset(self):
        '''Reset every action value to zero.'''
        # The environment stores the action count as `actions`, not `act`.
        self.Q = np.zeros((self.env.states, self.env.actions))

    def train_uniform(self):
        '''Run one uniform sweep: one sampled update per state-action pair.'''
        for s in range(self.env.states):
            for a in range(self.env.actions):
                reward = self.env.get_reward()
                if self.env.is_done():
                    # Terminal transition: nothing to bootstrap from.
                    successor = None
                else:
                    # Successors of (s, a) are equally likely.
                    successor = np.random.choice(self.env.trans[s, a])
                self.update_q(s, a, reward, successor)

    def update_q(self, S, A, R, Sn=None):
        '''Apply one Q-learning backup to ``Q[S, A]``.

        Args:
            S:  index of the state being updated.
            A:  index of the action taken in ``S``.
            R:  reward observed on the transition.
            Sn: index of the successor state, or ``None`` when the
                transition ended the episode (its value is then 0).
        '''
        future = 0.0 if Sn is None else np.max(self.Q[Sn])
        # TD target is R + gamma * max_a' Q(Sn, a'); the original multiplied
        # R by the discounted value and used undefined bare alpha/gamma.
        target = R + self.gamma * future
        self.Q[S, A] += self.alpha * (target - self.Q[S, A])
        