In [1]:
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import theano
import theano.tensor as T

# Simple RL agent

In [24]:
# Data: 4 trials, 2 actions
aliens = np.array([0, 0])
actions = np.array([0, 1])
rewards = np.array([0, 1])

n_trials = 2
n_aliens = 2
n_actions = 2

aliens = theano.shared(np.asarray(aliens, dtype='int32'))
actions = theano.shared(np.asarray(actions, dtype='int32'))
rewards = theano.shared(np.asarray(rewards, dtype='int32'))

with pm.Model() as model:
    
    alpha = pm.Uniform('alpha', lower=0, upper=1)
    beta = pm.Bound(pm.Normal, lower=0)('beta', mu=1, sd=5)
    
    # Trial 1
    Q_low0 = 0.5 * T.ones([n_aliens, n_actions])  # initialize Q-values for all trials, aliens, & actions to 0.5
    T.printing.Print('Q_low0')(Q_low0)
    
    RPE = rewards[0] - Q_low0[aliens[0], actions[0]]  # calculate RPE of the first trial
    T.printing.Print('RPE')(RPE)
    
    Q_low1 = T.set_subtensor(Q_low0[aliens[0], actions[0]],
                             Q_low0[aliens[0], actions[0]] + alpha * RPE)
    T.printing.Print('Q_low1')(Q_low1)
    
    # Trial 2
    RPE = rewards[1] - Q_low1[aliens[1], actions[1]]  # calculate RPE of the first trial
    T.printing.Print('RPE')(RPE)
    
    Q_low2 = T.set_subtensor(Q_low1[aliens[1], actions[1]],
                             Q_low1[aliens[1], actions[1]] + alpha * RPE)
    T.printing.Print('Q_low2')(Q_low2)
    
    # Select actions
    Q_low_all = T.concatenate([Q_low1[aliens[0]], Q_low2[aliens[1]]]).reshape((n_trials, n_actions))
    T.printing.Print('Q_low_all')(Q_low_all)
    
    p_low = T.nnet.softmax(beta * Q_low_all)
    T.printing.Print('p_low')(p_low)
    
    actions = pm.Categorical('actions', p_low, observed=actions)

Q_low0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
RPE __str__ = -0.5
Q_low1 __str__ = [[0.25 0.5 ]
 [0.5  0.5 ]]
RPE __str__ = 0.5
Q_low2 __str__ = [[0.25 0.75]
 [0.5  0.5 ]]
Q_low_all __str__ = [[0.25 0.5 ]
 [0.25 0.75]]
p_low __str__ = [[0.4378235  0.5621765 ]
 [0.37754068 0.62245935]]


  rval = inputs[0].__getitem__(inputs[1:])


# Simple TS agent with argmax

In [33]:
# Data: 4 trials, 2 actions
seasons = np.array([0, 0, 0, 0])
aliens = np.array([0, 0, 1, 1])
actions = np.array([0, 1, 0, 1])
rewards = np.array([0, 1, 0, 1])

n_trials = 4
n_aliens = 2
n_actions = 2
n_seasons = 2
n_TS = 2

aliens = theano.shared(np.asarray(aliens, dtype='int32'))
actions = theano.shared(np.asarray(actions, dtype='int32'))
rewards = theano.shared(np.asarray(rewards, dtype='int32'))

with pm.Model() as model:
    
    alpha = pm.Uniform('alpha', lower=0, upper=1)
    beta = pm.Bound(pm.Normal, lower=0)('beta', mu=1, sd=5)
    
    # Trial 1
    ## Get Q-values
    Q_high0 = 0.5 * T.ones([n_seasons, n_TS])  # Q-values linking seasons to TS
    Q_low0 = 0.5 * T.ones([n_TS, n_aliens, n_actions])  # Q-values linking TS & aliens to actions
    T.printing.Print('Q_high0')(Q_high0)
    T.printing.Print('Q_low0')(Q_low0)
    
    ## Select TS
    TS0 = T.argmax(Q_high0[seasons[0]])
    T.printing.Print('TS0')(TS0)
    
    ## Select action based on TS
    Q_low0_sub = Q_low0[TS0]
    p_low0 = T.nnet.softmax(Q_low0_sub)
    T.printing.Print('Q_low0_sub')(Q_low0_sub)
    T.printing.Print('p_low0')(p_low0)

    ## Calculate RPEs and update Q-values
    now_high = seasons[0], TS0
    now_low = TS0, aliens[0], actions[0]
    RPE_high = rewards[0] - Q_high0[now_high]  # calculate RPE of the first trial
    RPE_low = rewards[0] - Q_low0[now_low]  # calculate RPE of the first trial
    T.printing.Print('RPE_high')(RPE_high)
    T.printing.Print('RPE_low')(RPE_low)
    
    Q_high1 = T.set_subtensor(Q_high0[now_high],
                              Q_high0[now_high] + alpha * RPE)
    Q_low1 = T.set_subtensor(Q_low0[now_low],
                             Q_low0[now_low] + alpha * RPE)
    T.printing.Print('Q_high1')(Q_high1)
    T.printing.Print('Q_low1')(Q_low1)
    
#     # Trial 2
#     RPE = rewards[1] - Q_low1[aliens[1], actions[1]]  # calculate RPE of the first trial
#     T.printing.Print('RPE')(RPE)
    
#     Q_low2 = T.set_subtensor(Q_low1[aliens[1], actions[1]],
#                              Q_low1[aliens[1], actions[1]] + alpha * RPE)
#     T.printing.Print('Q_low2')(Q_low2)
        
    # Select actions
#     Q_low_all = T.concatenate([Q_low1[aliens[0]], Q_low2[aliens[1]]]).reshape((n_trials, n_actions))
    Q_low_all = T.concatenate([Q_low1[aliens[0]], Q_low1[aliens[0]]]).reshape((n_trials, n_actions))
    T.printing.Print('Q_low_all')(Q_low_all)
    
    p_low = T.nnet.softmax(beta * Q_low_all)
    T.printing.Print('p_low')(p_low)
    
    actions = pm.Categorical('actions', p_low, observed=actions)

Q_high0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
Q_low0 __str__ = [[[0.5 0.5]
  [0.5 0.5]]

 [[0.5 0.5]
  [0.5 0.5]]]
TS0 __str__ = 0
Q_low0_sub __str__ = [[0.5 0.5]
 [0.5 0.5]]
p_low0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
RPE_high __str__ = -0.5
RPE_low __str__ = -0.5
Q_high1 __str__ = [[0.75 0.5 ]
 [0.5  0.5 ]]
Q_low1 __str__ = [[[0.75 0.5 ]
  [0.5  0.5 ]]

 [[0.5  0.5 ]
  [0.5  0.5 ]]]
Q_low_all __str__ = [[0.75 0.5 ]
 [0.5  0.5 ]
 [0.75 0.5 ]
 [0.5  0.5 ]]
p_low __str__ = [[0.5621765 0.4378235]
 [0.5       0.5      ]
 [0.5621765 0.4378235]
 [0.5       0.5      ]]


  rval = inputs[0].__getitem__(inputs[1:])


# Simple TS agent with custom softmax selection

In [59]:
from theano.tensor.shared_randomstreams import RandomStreams
rs = RandomStreams()
# theano.sandbox.rng_mrg.MRG_RandomStreams.choice(size=1, a=None, replace=True, p=None, ndim=None, dtype='int64', nstreams=None, **kwargs)

a = T.arange(2)
p = T.as_tensor_variable(np.array([0.5, 0.5]))
T.printing.Print('a')(a)
T.printing.Print('p')(p)
TS = rs.choice(size=[1], a=a, p=p)
T.printing.Print('TS')(TS)

a __str__ = [0 1]
p __str__ = [0.5 0.5]
TS __str__ = [0]


Print{message='TS', attrs=('__str__',), global_fn=<function _print_fn at 0x000001519417A9D8>}.0

In [79]:
# Data: 2 trials, 2 actions
seasons = np.array([0, 0])
aliens = np.array([0, 0])
actions = np.array([0, 1])
rewards = np.array([1, 1])

n_trials = 2
n_aliens = 2
n_actions = 2
n_seasons = 2
n_TS = 2

aliens = theano.shared(np.asarray(aliens, dtype='int32'))
actions = theano.shared(np.asarray(actions, dtype='int32'))
rewards = theano.shared(np.asarray(rewards, dtype='int32'))

with pm.Model() as model:
    
    # RL parameters
    alpha = pm.Uniform('alpha', lower=0, upper=1)
    beta = pm.Bound(pm.Normal, lower=0)('beta', mu=1, sd=5)
    T.printing.Print('alpha')(alpha)
    T.printing.Print('beta')(beta)
    
    # Initial Q-values
    Q_high0 = 0.5 * T.ones([n_seasons, n_TS])  # Q-values linking seasons to TS
    Q_low0 = 0.5 * T.ones([n_TS, n_aliens, n_actions])  # Q-values linking TS & aliens to actions
    T.printing.Print('Q_high0')(Q_high0)
    T.printing.Print('Q_low0')(Q_low0)
    
    # Trial 0
    ## Select TS
    p_high0 = T.nnet.softmax(Q_high0[seasons[0]])
    T.printing.Print('p_high0')(p_high0.flatten())

    TS0 = pm.Categorical('TS0', p_high0)
#     TS0 = rs.choice(size=[1], a=T.arange(n_TS), p=p_high.flatten())
#     TS0 = T.argmax(p_high)[0]
    T.printing.Print('TS0')(TS0)
    
    ## Calculate action values for this trial
    p_low0 = T.nnet.softmax(Q_low0[TS0])
    T.printing.Print('p_low0')(p_low0)

    ## Calculate RPEs
    now_high = seasons[0], TS0
    now_low = TS0, aliens[0], actions[0]

    RPE_high = rewards[0] - Q_high0[now_high]  # calculate RPE of the first trial
    RPE_low = rewards[0] - Q_low0[now_low]  # calculate RPE of the first trial
    
    T.printing.Print('RPE_high')(RPE_high)
    T.printing.Print('RPE_low')(RPE_low)
    
    ## Update Q-values
    Q_high1 = T.set_subtensor(Q_high0[now_high],
                              Q_high0[now_high] + alpha * RPE_high)
    Q_low1 = T.set_subtensor(Q_low0[now_low],
                             Q_low0[now_low] + alpha * RPE_low)
    
    T.printing.Print('Q_high1')(Q_high1)
    T.printing.Print('Q_low1')(Q_low1)
    
    # Trial 1
    ## Select TS
    p_high1 = T.nnet.softmax(Q_high1[seasons[1]])
    T.printing.Print('p_high1')(p_high1.flatten())

    TS1 = pm.Categorical('TS1', p_high1)
#     TS1 = rs.choice(size=[1], a=T.arange(n_TS), p=p_high.flatten())
#     TS1 = T.argmax(p_high)[1]
    T.printing.Print('TS1')(TS1)
    
    ## Calculate action values for this trial
    p_low1 = T.nnet.softmax(Q_low1[TS1])
    T.printing.Print('p_low1')(p_low1)

    ## Calculate RPEs
    now_high = seasons[1], TS1
    now_low = TS1, aliens[1], actions[1]
    
    RPE_high = rewards[1] - Q_high1[now_high]  # calculate RPE of the first trial
    RPE_low = rewards[1] - Q_low1[now_low]  # calculate RPE of the first trial
    
    T.printing.Print('RPE_high')(RPE_high)
    T.printing.Print('RPE_low')(RPE_low)
    
    ## Update Q-values
    Q_high2 = T.set_subtensor(Q_high1[now_high],
                              Q_high1[now_high] + alpha * RPE_high)
    Q_low2 = T.set_subtensor(Q_low1[now_low],
                             Q_low1[now_low] + alpha * RPE_low)
    T.printing.Print('Q_high2')(Q_high2)
    T.printing.Print('Q_low2')(Q_low2)
        
    # Select actions
    Q_low_all = T.concatenate([Q_low0[TS0, aliens[0]],
                               Q_low1[TS1, aliens[1]]]).reshape((n_trials, n_actions))
    T.printing.Print('Q_low_all')(Q_low_all)
    
    p_low_all = T.concatenate([p_low0[aliens[0]],
                               p_low1[aliens[1]]]).reshape((n_trials, n_actions))
    T.printing.Print('p_low_all')(p_low_all)
    
    actions = pm.Categorical('actions', p_low_all, observed=actions)

alpha __str__ = 0.5
beta __str__ = 1.0
Q_high0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
Q_low0 __str__ = [[[0.5 0.5]
  [0.5 0.5]]

 [[0.5 0.5]
  [0.5 0.5]]]




p_high0 __str__ = [0.5 0.5]


  rval = inputs[0].__getitem__(inputs[1:])


TS0 __str__ = 0
p_low0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
RPE_high __str__ = 0.5
RPE_low __str__ = 0.5
Q_high1 __str__ = [[0.75 0.5 ]
 [0.5  0.5 ]]
Q_low1 __str__ = [[[0.75 0.5 ]
  [0.5  0.5 ]]

 [[0.5  0.5 ]
  [0.5  0.5 ]]]




p_high1 __str__ = [0.5621765 0.4378235]
TS1 __str__ = 0
p_low1 __str__ = [[0.5621765 0.4378235]
 [0.5       0.5      ]]
RPE_high __str__ = 0.25
RPE_low __str__ = 0.5
Q_high2 __str__ = [[0.875 0.5  ]
 [0.5   0.5  ]]
Q_low2 __str__ = [[[0.75 0.75]
  [0.5  0.5 ]]

 [[0.5  0.5 ]
  [0.5  0.5 ]]]
Q_low_all __str__ = [[0.5  0.5 ]
 [0.75 0.5 ]]
p_low_all __str__ = [[0.5       0.5      ]
 [0.5621765 0.4378235]]


# TS agent, softmax update in loop

In [79]:
# Data: 2 trials, 2 actions
seasons = np.array([0, 0])
aliens = np.array([0, 0])
actions = np.array([0, 1])
rewards = np.array([1, 1])

n_trials = 2
n_aliens = 2
n_actions = 2
n_seasons = 2
n_TS = 2

aliens = theano.shared(np.asarray(aliens, dtype='int32'))
actions = theano.shared(np.asarray(actions, dtype='int32'))
rewards = theano.shared(np.asarray(rewards, dtype='int32'))

with pm.Model() as model:
    
    # RL parameters
    alpha = pm.Uniform('alpha', lower=0, upper=1)
    beta = pm.Bound(pm.Normal, lower=0)('beta', mu=1, sd=5)
    T.printing.Print('alpha')(alpha)
    T.printing.Print('beta')(beta)
    
    # Initial Q-values
    Q_high0 = 0.5 * T.ones([n_seasons, n_TS])  # Q-values linking seasons to TS
    Q_low0 = 0.5 * T.ones([n_TS, n_aliens, n_actions])  # Q-values linking TS & aliens to actions
    T.printing.Print('Q_high0')(Q_high0)
    T.printing.Print('Q_low0')(Q_low0)
    
    # Trial 0
    ## Select TS
    p_high0 = T.nnet.softmax(Q_high0[seasons[0]])
    T.printing.Print('p_high0')(p_high0.flatten())

    TS0 = pm.Categorical('TS0', p_high0)
#     TS0 = rs.choice(size=[1], a=T.arange(n_TS), p=p_high.flatten())
#     TS0 = T.argmax(p_high)[0]
    T.printing.Print('TS0')(TS0)
    
    ## Calculate action values for this trial
    p_low0 = T.nnet.softmax(Q_low0[TS0])
    T.printing.Print('p_low0')(p_low0)

    ## Calculate RPEs
    now_high = seasons[0], TS0
    now_low = TS0, aliens[0], actions[0]

    RPE_high = rewards[0] - Q_high0[now_high]  # calculate RPE of the first trial
    RPE_low = rewards[0] - Q_low0[now_low]  # calculate RPE of the first trial
    
    T.printing.Print('RPE_high')(RPE_high)
    T.printing.Print('RPE_low')(RPE_low)
    
    ## Update Q-values
    Q_high1 = T.set_subtensor(Q_high0[now_high],
                              Q_high0[now_high] + alpha * RPE_high)
    Q_low1 = T.set_subtensor(Q_low0[now_low],
                             Q_low0[now_low] + alpha * RPE_low)
    
    T.printing.Print('Q_high1')(Q_high1)
    T.printing.Print('Q_low1')(Q_low1)
    
    # Trial 1
    ## Select TS
    p_high1 = T.nnet.softmax(Q_high1[seasons[1]])
    T.printing.Print('p_high1')(p_high1.flatten())

    TS1 = pm.Categorical('TS1', p_high1)
#     TS1 = rs.choice(size=[1], a=T.arange(n_TS), p=p_high.flatten())
#     TS1 = T.argmax(p_high)[1]
    T.printing.Print('TS1')(TS1)
    
    ## Calculate action values for this trial
    p_low1 = T.nnet.softmax(Q_low1[TS1])
    T.printing.Print('p_low1')(p_low1)

    ## Calculate RPEs
    now_high = seasons[1], TS1
    now_low = TS1, aliens[1], actions[1]
    
    RPE_high = rewards[1] - Q_high1[now_high]  # calculate RPE of the first trial
    RPE_low = rewards[1] - Q_low1[now_low]  # calculate RPE of the first trial
    
    T.printing.Print('RPE_high')(RPE_high)
    T.printing.Print('RPE_low')(RPE_low)
    
    ## Update Q-values
    Q_high2 = T.set_subtensor(Q_high1[now_high],
                              Q_high1[now_high] + alpha * RPE_high)
    Q_low2 = T.set_subtensor(Q_low1[now_low],
                             Q_low1[now_low] + alpha * RPE_low)
    T.printing.Print('Q_high2')(Q_high2)
    T.printing.Print('Q_low2')(Q_low2)
        
    # Select actions
    Q_low_all = T.concatenate([Q_low0[TS0, aliens[0]],
                               Q_low1[TS1, aliens[1]]]).reshape((n_trials, n_actions))
    T.printing.Print('Q_low_all')(Q_low_all)
    
    p_low_all = T.concatenate([p_low0[aliens[0]],
                               p_low1[aliens[1]]]).reshape((n_trials, n_actions))
    T.printing.Print('p_low_all')(p_low_all)
    
    actions = pm.Categorical('actions', p_low_all, observed=actions)

alpha __str__ = 0.5
beta __str__ = 1.0
Q_high0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
Q_low0 __str__ = [[[0.5 0.5]
  [0.5 0.5]]

 [[0.5 0.5]
  [0.5 0.5]]]




p_high0 __str__ = [0.5 0.5]


  rval = inputs[0].__getitem__(inputs[1:])


TS0 __str__ = 0
p_low0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
RPE_high __str__ = 0.5
RPE_low __str__ = 0.5
Q_high1 __str__ = [[0.75 0.5 ]
 [0.5  0.5 ]]
Q_low1 __str__ = [[[0.75 0.5 ]
  [0.5  0.5 ]]

 [[0.5  0.5 ]
  [0.5  0.5 ]]]




p_high1 __str__ = [0.5621765 0.4378235]
TS1 __str__ = 0
p_low1 __str__ = [[0.5621765 0.4378235]
 [0.5       0.5      ]]
RPE_high __str__ = 0.25
RPE_low __str__ = 0.5
Q_high2 __str__ = [[0.875 0.5  ]
 [0.5   0.5  ]]
Q_low2 __str__ = [[[0.75 0.75]
  [0.5  0.5 ]]

 [[0.5  0.5 ]
  [0.5  0.5 ]]]
Q_low_all __str__ = [[0.5  0.5 ]
 [0.75 0.5 ]]
p_low_all __str__ = [[0.5       0.5      ]
 [0.5621765 0.4378235]]
