In [20]:
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
import numpy as np
import pandas as pd

### Discrete State and Action

In [15]:
# (2 Actions, 3 States, 3 States)

transition_mat = [[
    [0.1, 0.5, 0.4],
    [0.2, 0.7, 0.1],
    [0.3, 0.3, 0.4]
],[
    [0.5, 0.2, 0.3],
    [0.8, 0.1, 0.1],
    [0.2, 0.7, 0.1]
]]

reward_dist = [tfd.Normal(10, 5), tfd.Normal(3, 3), tfd.Normal(2, 1)]

In [16]:
policy = tfd.Categorical(probs=[0.5, 0.5])

In [30]:
N_ROUNDS = 1_0
state = 0
hist = []


for t in range(N_ROUNDS):
    action = policy.sample()
    next_state = tfd.Categorical(transition_mat[action][state]).sample()
    reward = reward_dist[next_state].sample()

    hist.append({"state":int(state), "action":int(action), "next_state":int(next_state), "reward":int(reward)})

    state = next_state

In [31]:
pd.DataFrame(hist)

Unnamed: 0,state,action,next_state,reward
0,0,0,2,2
1,2,1,0,7
2,0,1,2,2
3,2,1,1,5
4,1,1,1,3
5,1,0,0,1
6,0,0,0,14
7,0,0,2,1
8,2,0,0,6
9,0,1,0,15


### Continuous State, Discrete Action, deterministic transition

In [60]:
def Reward(state):
    return - np.sum(state)

In [61]:
state = np.array([1000, 1000, 1000, 1000]) # some warehouse quantities
demand_t = np.array([500, 200, 300, 400])

In [62]:
supplier = np.array([
    [200, 200, 200, 200],
    [400, 200,300,500],
    [700, 100, 200, 200]
])

In [63]:
def true_transition(state, action):
    return state + supplier[action] - demand_t

In [38]:
def T(s, a, w_t):
    """
    predict the next possible state s_prime
    """
    next_state = np.dot([s,a], w_t)

    return s ** w_t + a ** w_t

In [33]:
s = tf.Variable(3.0)
a = tf.Variable(2.0)
w_t = tf.Variable(1.0)

with tf.GradientTape() as tape:
    y = T(s,a,w_t)

In [34]:
dy_dx = tape.gradient(y, w_t)

In [24]:
def R(s,a):
    """
    predicts the next possible reward based on the state that the agent was in and the action
    he took
    """
    reward = None

    return reward

In [25]:
def Q(s,a):
    """
    calculates the Q Value
    """
    q_val = None

    return q_val

In [None]:
N_ROUNDS = 1000
alpha_t = 0.1
alpha_r = 0.1
alpha_q = 0.1
PLANNING_STEPS = 10
gamma = 0.9
s = state = np.array([1000, 1000, 1000, 1000])
action_space = np.array([0,1,2])

for round in N_ROUNDS:

    s_prime, r = true_transition(s, np.random.choice(action_space))


    with tf.GradientTape() as tape:
        t_sa = T(s,a)
        r_sa = R(s,a)
    
    w_t = w_t - alpha_t*(t_sa-s_prime)*tape.gradient(t_sa, w_t)
    w_r = w_r - alpha_r*(r_sa - r)*tape.gradient(r_sa, w_r)

    # planning
    for i in PLANNING_STEPS:
        s_hat = None
        a_hat = None

        with tf.GradientTape() as gt:
            q_val = Q(s_hat, a_hat)


        delta = R(s_hat, a_hat) + gamma * Q(T(s_hat, a_hat), a_hat_prime)
        w_q = w_q - alpha_q*delta*gt.gradient(q_val, w_q)
    
    s = s_prime