In [1]:
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import theano
import theano.tensor as T

# Simple RL agent

In [4]:
# Data: 4 trials, 2 actions
aliens = np.array([0, 0])
actions = np.array([0, 1])
rewards = np.array([0, 1])

n_trials = 2
n_aliens = 2
n_actions = 2

aliens = theano.shared(np.asarray(aliens, dtype='int32'))
actions = theano.shared(np.asarray(actions, dtype='int32'))
rewards = theano.shared(np.asarray(rewards, dtype='int32'))


with pm.Model() as model:
    
    T.printing.Print('rand')(np.random.rand())
    alpha = pm.Uniform('alpha', lower=0, upper=1)
    beta = pm.Bound(pm.Normal, lower=0)('beta', mu=1, sd=5)
    
    # Trial 1
    Q_low0 = 0.5 * T.ones([n_aliens, n_actions])  # initialize Q-values for all trials, aliens, & actions to 0.5
    T.printing.Print('Q_low0')(Q_low0)
    
    RPE = rewards[0] - Q_low0[aliens[0], actions[0]]  # calculate RPE of the first trial
    T.printing.Print('RPE')(RPE)
    
    Q_low1 = T.set_subtensor(Q_low0[aliens[0], actions[0]],
                             Q_low0[aliens[0], actions[0]] + alpha * RPE)
    T.printing.Print('Q_low1')(Q_low1)
    
    # Trial 2
    RPE = rewards[1] - Q_low1[aliens[1], actions[1]]  # calculate RPE of the first trial
    T.printing.Print('RPE')(RPE)
    
    Q_low2 = T.set_subtensor(Q_low1[aliens[1], actions[1]],
                             Q_low1[aliens[1], actions[1]] + alpha * RPE)
    T.printing.Print('Q_low2')(Q_low2)
    
    # Select actions
    Q_low_all = T.concatenate([Q_low1[aliens[0]], Q_low2[aliens[1]]]).reshape((n_trials, n_actions))
    T.printing.Print('Q_low_all')(Q_low_all)
    
    p_low = T.nnet.softmax(beta * Q_low_all)
    T.printing.Print('p_low')(p_low)
    
    actions = pm.Categorical('actions', p_low, observed=actions)

# Simple TS agent with argmax

In [33]:
# Data: 4 trials, 2 actions
seasons = np.array([0, 0, 0, 0])
aliens = np.array([0, 0, 1, 1])
actions = np.array([0, 1, 0, 1])
rewards = np.array([0, 1, 0, 1])

n_trials = 4
n_aliens = 2
n_actions = 2
n_seasons = 2
n_TS = 2

aliens = theano.shared(np.asarray(aliens, dtype='int32'))
actions = theano.shared(np.asarray(actions, dtype='int32'))
rewards = theano.shared(np.asarray(rewards, dtype='int32'))

with pm.Model() as model:
    
    alpha = pm.Uniform('alpha', lower=0, upper=1)
    beta = pm.Bound(pm.Normal, lower=0)('beta', mu=1, sd=5)
    
    # Trial 1
    ## Get Q-values
    Q_high0 = 0.5 * T.ones([n_seasons, n_TS])  # Q-values linking seasons to TS
    Q_low0 = 0.5 * T.ones([n_TS, n_aliens, n_actions])  # Q-values linking TS & aliens to actions
    T.printing.Print('Q_high0')(Q_high0)
    T.printing.Print('Q_low0')(Q_low0)
    
    ## Select TS
    TS0 = T.argmax(Q_high0[seasons[0]])
    T.printing.Print('TS0')(TS0)
    
    ## Select action based on TS
    Q_low0_sub = Q_low0[TS0]
    p_low0 = T.nnet.softmax(Q_low0_sub)
    T.printing.Print('Q_low0_sub')(Q_low0_sub)
    T.printing.Print('p_low0')(p_low0)

    ## Calculate RPEs and update Q-values
    now_high = seasons[0], TS0
    now_low = TS0, aliens[0], actions[0]
    RPE_high = rewards[0] - Q_high0[now_high]  # calculate RPE of the first trial
    RPE_low = rewards[0] - Q_low0[now_low]  # calculate RPE of the first trial
    T.printing.Print('RPE_high')(RPE_high)
    T.printing.Print('RPE_low')(RPE_low)
    
    Q_high1 = T.set_subtensor(Q_high0[now_high],
                              Q_high0[now_high] + alpha * RPE)
    Q_low1 = T.set_subtensor(Q_low0[now_low],
                             Q_low0[now_low] + alpha * RPE)
    T.printing.Print('Q_high1')(Q_high1)
    T.printing.Print('Q_low1')(Q_low1)
    
#     # Trial 2
#     RPE = rewards[1] - Q_low1[aliens[1], actions[1]]  # calculate RPE of the first trial
#     T.printing.Print('RPE')(RPE)
    
#     Q_low2 = T.set_subtensor(Q_low1[aliens[1], actions[1]],
#                              Q_low1[aliens[1], actions[1]] + alpha * RPE)
#     T.printing.Print('Q_low2')(Q_low2)
        
    # Select actions
#     Q_low_all = T.concatenate([Q_low1[aliens[0]], Q_low2[aliens[1]]]).reshape((n_trials, n_actions))
    Q_low_all = T.concatenate([Q_low1[aliens[0]], Q_low1[aliens[0]]]).reshape((n_trials, n_actions))
    T.printing.Print('Q_low_all')(Q_low_all)
    
    p_low = T.nnet.softmax(beta * Q_low_all)
    T.printing.Print('p_low')(p_low)
    
    actions = pm.Categorical('actions', p_low, observed=actions)

Q_high0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
Q_low0 __str__ = [[[0.5 0.5]
  [0.5 0.5]]

 [[0.5 0.5]
  [0.5 0.5]]]
TS0 __str__ = 0
Q_low0_sub __str__ = [[0.5 0.5]
 [0.5 0.5]]
p_low0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
RPE_high __str__ = -0.5
RPE_low __str__ = -0.5
Q_high1 __str__ = [[0.75 0.5 ]
 [0.5  0.5 ]]
Q_low1 __str__ = [[[0.75 0.5 ]
  [0.5  0.5 ]]

 [[0.5  0.5 ]
  [0.5  0.5 ]]]
Q_low_all __str__ = [[0.75 0.5 ]
 [0.5  0.5 ]
 [0.75 0.5 ]
 [0.5  0.5 ]]
p_low __str__ = [[0.5621765 0.4378235]
 [0.5       0.5      ]
 [0.5621765 0.4378235]
 [0.5       0.5      ]]


  rval = inputs[0].__getitem__(inputs[1:])


# Simple TS agent with custom softmax selection

In [59]:
from theano.tensor.shared_randomstreams import RandomStreams
rs = RandomStreams()
# theano.sandbox.rng_mrg.MRG_RandomStreams.choice(size=1, a=None, replace=True, p=None, ndim=None, dtype='int64', nstreams=None, **kwargs)

a = T.arange(2)
p = T.as_tensor_variable(np.array([0.5, 0.5]))
T.printing.Print('a')(a)
T.printing.Print('p')(p)
TS = rs.choice(size=[1], a=a, p=p)
T.printing.Print('TS')(TS)

a __str__ = [0 1]
p __str__ = [0.5 0.5]
TS __str__ = [0]


Print{message='TS', attrs=('__str__',), global_fn=<function _print_fn at 0x000001519417A9D8>}.0

In [79]:
# Data: 2 trials, 2 actions
seasons = np.array([0, 0])
aliens = np.array([0, 0])
actions = np.array([0, 1])
rewards = np.array([1, 1])

n_trials = 2
n_aliens = 2
n_actions = 2
n_seasons = 2
n_TS = 2

aliens = theano.shared(np.asarray(aliens, dtype='int32'))
actions = theano.shared(np.asarray(actions, dtype='int32'))
rewards = theano.shared(np.asarray(rewards, dtype='int32'))

with pm.Model() as model:
    
    # RL parameters
    alpha = pm.Uniform('alpha', lower=0, upper=1)
    beta = pm.Bound(pm.Normal, lower=0)('beta', mu=1, sd=5)
    T.printing.Print('alpha')(alpha)
    T.printing.Print('beta')(beta)
    
    # Initial Q-values
    Q_high0 = 0.5 * T.ones([n_seasons, n_TS])  # Q-values linking seasons to TS
    Q_low0 = 0.5 * T.ones([n_TS, n_aliens, n_actions])  # Q-values linking TS & aliens to actions
    T.printing.Print('Q_high0')(Q_high0)
    T.printing.Print('Q_low0')(Q_low0)
    
    # Trial 0
    ## Select TS
    p_high0 = T.nnet.softmax(Q_high0[seasons[0]])
    T.printing.Print('p_high0')(p_high0.flatten())

    TS0 = pm.Categorical('TS0', p_high0)
#     TS0 = rs.choice(size=[1], a=T.arange(n_TS), p=p_high.flatten())
#     TS0 = T.argmax(p_high)[0]
    T.printing.Print('TS0')(TS0)
    
    ## Calculate action values for this trial
    p_low0 = T.nnet.softmax(Q_low0[TS0])
    T.printing.Print('p_low0')(p_low0)

    ## Calculate RPEs
    now_high = seasons[0], TS0
    now_low = TS0, aliens[0], actions[0]

    RPE_high = rewards[0] - Q_high0[now_high]  # calculate RPE of the first trial
    RPE_low = rewards[0] - Q_low0[now_low]  # calculate RPE of the first trial
    
    T.printing.Print('RPE_high')(RPE_high)
    T.printing.Print('RPE_low')(RPE_low)
    
    ## Update Q-values
    Q_high1 = T.set_subtensor(Q_high0[now_high],
                              Q_high0[now_high] + alpha * RPE_high)
    Q_low1 = T.set_subtensor(Q_low0[now_low],
                             Q_low0[now_low] + alpha * RPE_low)
    
    T.printing.Print('Q_high1')(Q_high1)
    T.printing.Print('Q_low1')(Q_low1)
    
    # Trial 1
    ## Select TS
    p_high1 = T.nnet.softmax(Q_high1[seasons[1]])
    T.printing.Print('p_high1')(p_high1.flatten())

    TS1 = pm.Categorical('TS1', p_high1)
#     TS1 = rs.choice(size=[1], a=T.arange(n_TS), p=p_high.flatten())
#     TS1 = T.argmax(p_high)[1]
    T.printing.Print('TS1')(TS1)
    
    ## Calculate action values for this trial
    p_low1 = T.nnet.softmax(Q_low1[TS1])
    T.printing.Print('p_low1')(p_low1)

    ## Calculate RPEs
    now_high = seasons[1], TS1
    now_low = TS1, aliens[1], actions[1]
    
    RPE_high = rewards[1] - Q_high1[now_high]  # calculate RPE of the first trial
    RPE_low = rewards[1] - Q_low1[now_low]  # calculate RPE of the first trial
    
    T.printing.Print('RPE_high')(RPE_high)
    T.printing.Print('RPE_low')(RPE_low)
    
    ## Update Q-values
    Q_high2 = T.set_subtensor(Q_high1[now_high],
                              Q_high1[now_high] + alpha * RPE_high)
    Q_low2 = T.set_subtensor(Q_low1[now_low],
                             Q_low1[now_low] + alpha * RPE_low)
    T.printing.Print('Q_high2')(Q_high2)
    T.printing.Print('Q_low2')(Q_low2)
        
    # Select actions
    Q_low_all = T.concatenate([Q_low0[TS0, aliens[0]],
                               Q_low1[TS1, aliens[1]]]).reshape((n_trials, n_actions))
    T.printing.Print('Q_low_all')(Q_low_all)
    
    p_low_all = T.concatenate([p_low0[aliens[0]],
                               p_low1[aliens[1]]]).reshape((n_trials, n_actions))
    T.printing.Print('p_low_all')(p_low_all)
    
    actions = pm.Categorical('actions', p_low_all, observed=actions)

alpha __str__ = 0.5
beta __str__ = 1.0
Q_high0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
Q_low0 __str__ = [[[0.5 0.5]
  [0.5 0.5]]

 [[0.5 0.5]
  [0.5 0.5]]]




p_high0 __str__ = [0.5 0.5]


  rval = inputs[0].__getitem__(inputs[1:])


TS0 __str__ = 0
p_low0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
RPE_high __str__ = 0.5
RPE_low __str__ = 0.5
Q_high1 __str__ = [[0.75 0.5 ]
 [0.5  0.5 ]]
Q_low1 __str__ = [[[0.75 0.5 ]
  [0.5  0.5 ]]

 [[0.5  0.5 ]
  [0.5  0.5 ]]]




p_high1 __str__ = [0.5621765 0.4378235]
TS1 __str__ = 0
p_low1 __str__ = [[0.5621765 0.4378235]
 [0.5       0.5      ]]
RPE_high __str__ = 0.25
RPE_low __str__ = 0.5
Q_high2 __str__ = [[0.875 0.5  ]
 [0.5   0.5  ]]
Q_low2 __str__ = [[[0.75 0.75]
  [0.5  0.5 ]]

 [[0.5  0.5 ]
  [0.5  0.5 ]]]
Q_low_all __str__ = [[0.5  0.5 ]
 [0.75 0.5 ]]
p_low_all __str__ = [[0.5       0.5      ]
 [0.5621765 0.4378235]]


# TS agent, softmax update in loop

In [79]:
# Data: 2 trials, 2 actions
seasons = np.array([0, 0])
aliens = np.array([0, 0])
actions = np.array([0, 1])
rewards = np.array([1, 1])

n_trials = 2
n_aliens = 2
n_actions = 2
n_seasons = 2
n_TS = 2

aliens = theano.shared(np.asarray(aliens, dtype='int32'))
actions = theano.shared(np.asarray(actions, dtype='int32'))
rewards = theano.shared(np.asarray(rewards, dtype='int32'))

with pm.Model() as model:
    
    # RL parameters
    alpha = pm.Uniform('alpha', lower=0, upper=1)
    beta = pm.Bound(pm.Normal, lower=0)('beta', mu=1, sd=5)
    T.printing.Print('alpha')(alpha)
    T.printing.Print('beta')(beta)
    
    # Initial Q-values
    Q_high0 = 0.5 * T.ones([n_seasons, n_TS])  # Q-values linking seasons to TS
    Q_low0 = 0.5 * T.ones([n_TS, n_aliens, n_actions])  # Q-values linking TS & aliens to actions
    T.printing.Print('Q_high0')(Q_high0)
    T.printing.Print('Q_low0')(Q_low0)
    
    # Trial 0
    ## Select TS
    p_high0 = T.nnet.softmax(Q_high0[seasons[0]])
    T.printing.Print('p_high0')(p_high0.flatten())

    TS0 = pm.Categorical('TS0', p_high0)
#     TS0 = rs.choice(size=[1], a=T.arange(n_TS), p=p_high.flatten())
#     TS0 = T.argmax(p_high)[0]
    T.printing.Print('TS0')(TS0)
    
    ## Calculate action values for this trial
    p_low0 = T.nnet.softmax(Q_low0[TS0])
    T.printing.Print('p_low0')(p_low0)

    ## Calculate RPEs
    now_high = seasons[0], TS0
    now_low = TS0, aliens[0], actions[0]

    RPE_high = rewards[0] - Q_high0[now_high]  # calculate RPE of the first trial
    RPE_low = rewards[0] - Q_low0[now_low]  # calculate RPE of the first trial
    
    T.printing.Print('RPE_high')(RPE_high)
    T.printing.Print('RPE_low')(RPE_low)
    
    ## Update Q-values
    Q_high1 = T.set_subtensor(Q_high0[now_high],
                              Q_high0[now_high] + alpha * RPE_high)
    Q_low1 = T.set_subtensor(Q_low0[now_low],
                             Q_low0[now_low] + alpha * RPE_low)
    
    T.printing.Print('Q_high1')(Q_high1)
    T.printing.Print('Q_low1')(Q_low1)
    
    # Trial 1
    ## Select TS
    p_high1 = T.nnet.softmax(Q_high1[seasons[1]])
    T.printing.Print('p_high1')(p_high1.flatten())

    TS1 = pm.Categorical('TS1', p_high1)
#     TS1 = rs.choice(size=[1], a=T.arange(n_TS), p=p_high.flatten())
#     TS1 = T.argmax(p_high)[1]
    T.printing.Print('TS1')(TS1)
    
    ## Calculate action values for this trial
    p_low1 = T.nnet.softmax(Q_low1[TS1])
    T.printing.Print('p_low1')(p_low1)

    ## Calculate RPEs
    now_high = seasons[1], TS1
    now_low = TS1, aliens[1], actions[1]
    
    RPE_high = rewards[1] - Q_high1[now_high]  # calculate RPE of the first trial
    RPE_low = rewards[1] - Q_low1[now_low]  # calculate RPE of the first trial
    
    T.printing.Print('RPE_high')(RPE_high)
    T.printing.Print('RPE_low')(RPE_low)
    
    ## Update Q-values
    Q_high2 = T.set_subtensor(Q_high1[now_high],
                              Q_high1[now_high] + alpha * RPE_high)
    Q_low2 = T.set_subtensor(Q_low1[now_low],
                             Q_low1[now_low] + alpha * RPE_low)
    T.printing.Print('Q_high2')(Q_high2)
    T.printing.Print('Q_low2')(Q_low2)
        
    # Select actions
    Q_low_all = T.concatenate([Q_low0[TS0, aliens[0]],
                               Q_low1[TS1, aliens[1]]]).reshape((n_trials, n_actions))
    T.printing.Print('Q_low_all')(Q_low_all)
    
    p_low_all = T.concatenate([p_low0[aliens[0]],
                               p_low1[aliens[1]]]).reshape((n_trials, n_actions))
    T.printing.Print('p_low_all')(p_low_all)
    
    actions = pm.Categorical('actions', p_low_all, observed=actions)

alpha __str__ = 0.5
beta __str__ = 1.0
Q_high0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
Q_low0 __str__ = [[[0.5 0.5]
  [0.5 0.5]]

 [[0.5 0.5]
  [0.5 0.5]]]




p_high0 __str__ = [0.5 0.5]


  rval = inputs[0].__getitem__(inputs[1:])


TS0 __str__ = 0
p_low0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
RPE_high __str__ = 0.5
RPE_low __str__ = 0.5
Q_high1 __str__ = [[0.75 0.5 ]
 [0.5  0.5 ]]
Q_low1 __str__ = [[[0.75 0.5 ]
  [0.5  0.5 ]]

 [[0.5  0.5 ]
  [0.5  0.5 ]]]




p_high1 __str__ = [0.5621765 0.4378235]
TS1 __str__ = 0
p_low1 __str__ = [[0.5621765 0.4378235]
 [0.5       0.5      ]]
RPE_high __str__ = 0.25
RPE_low __str__ = 0.5
Q_high2 __str__ = [[0.875 0.5  ]
 [0.5   0.5  ]]
Q_low2 __str__ = [[[0.75 0.75]
  [0.5  0.5 ]]

 [[0.5  0.5 ]
  [0.5  0.5 ]]]
Q_low_all __str__ = [[0.5  0.5 ]
 [0.75 0.5 ]]
p_low_all __str__ = [[0.5       0.5      ]
 [0.5621765 0.4378235]]


# Custom distribution

In [94]:
# Data: 2 trials, 2 actions
seasons = np.array([0, 0])
aliens = np.array([0, 0])
actions = np.array([0, 1])
rewards = np.array([1, 1])
n_trials = 2
n_aliens = 2
n_actions = 2
n_seasons = 2
n_TS = 2
aliens = theano.shared(np.asarray(aliens, dtype='int32'))
actions = theano.shared(np.asarray(actions, dtype='int32'))
rewards = theano.shared(np.asarray(rewards, dtype='int32'))

class CustomTSDist(pm.DiscreteUniform):
    def __init__(self, lower, upper, z, *args, **kwargs):
        super(CustomTSDist, self).__init__(lower, upper, *args, **kwargs)
        self.z = z  # Raw values as inputs for softmax (len = len(TS domain))
        self.lower = lower
        self.upper = upper
        
    def random(self, Q_high, seasons, rewards):
        
        TSs = T.zeros(len(seasons))
        for trial in range(len(seasons)):
            p_high = T.nnet.softmax(Q_high[seasons[trial]])
            TS = choice(p_high)
            Q_high[TS] += alpha_high * (rewards[trial] - Q_high[TS])
            TSs[trial] = TS
            
        return TSs
        
    def logp(self, value):
        upper = self.upper
        lower = self.lower
        # bound bounds a distribution, takes probability computation as first argument, bounds as 2nd/3rd
        p = T.exp(self.z) / T.sum(T.exp(self.z))  # Softmax
        return pm.distributions.dist_math.bound(-T.log(p), lower <= value, value <= upper)
    
with pm.Model() as model:
    
    # RL parameters
    alpha = pm.Uniform('alpha', lower=0, upper=1)
    beta = pm.Bound(pm.Normal, lower=0)('beta', mu=1, sd=5)
    T.printing.Print('alpha')(alpha)
    T.printing.Print('beta')(beta)
    
    # Initial Q-values
    Q_high0 = 0.5 * T.ones([n_seasons, n_TS])  # Q-values linking seasons to TS
    Q_low0 = 0.5 * T.ones([n_TS, n_aliens, n_actions])  # Q-values linking TS & aliens to actions
    T.printing.Print('Q_high0')(Q_high0)
    T.printing.Print('Q_low0')(Q_low0)
    
    # Set up TS distribution (softmax)
    # Trial 0
    ## Select TS
    p_high0 = T.nnet.softmax(Q_high0[seasons[0]])
    TS0 = CustomTSDist('TS0', 0, 1, z=Q_high0[seasons[0]])
    T.printing.Print('p_high0')(p_high0.flatten())
    T.printing.Print('TS0')(TS0)

    ## Calculate action values for this trial
    p_low0 = T.nnet.softmax(Q_low0[TS0])
    T.printing.Print('p_low0')(p_low0)
    ## Calculate RPEs
    now_high = seasons[0], TS0
    now_low = TS, aliens[0], actions[0]
    RPE_high = rewards[0] - Q_high0[now_high]  # calculate RPE of the first trial
    RPE_low = rewards[0] - Q_low0[now_low]  # calculate RPE of the first trial
    
    T.printing.Print('RPE_high')(RPE_high)
    T.printing.Print('RPE_low')(RPE_low)
    
    ## Update Q-values
    Q_high1 = T.set_subtensor(Q_high0[now_high],
                              Q_high0[now_high] + alpha * RPE_high)
    Q_low1 = T.set_subtensor(Q_low0[now_low],
                             Q_low0[now_low] + alpha * RPE_low)
    
    T.printing.Print('Q_high1')(Q_high1)
    T.printing.Print('Q_low1')(Q_low1)
    
    # Trial 1
    ## Select TS
    p_high1 = T.nnet.softmax(Q_high1[seasons[1]])
    TS1 = CustomTSDist('TS1', 0, 1, z=Q_high1[seasons[1]])
    T.printing.Print('p_high1')(p_high1.flatten())
    T.printing.Print('TS1')(TS1)
    ## Calculate action values for this trial
    p_low1 = T.nnet.softmax(Q_low1[TS1])
    T.printing.Print('p_low1')(p_low1)
    ## Calculate RPEs
    now_high = seasons[1], TS1
    now_low = TS, aliens[1], actions[1]
    
    RPE_high = rewards[1] - Q_high1[now_high]  # calculate RPE of the first trial
    RPE_low = rewards[1] - Q_low1[now_low]  # calculate RPE of the first trial
    
    T.printing.Print('RPE_high')(RPE_high)
    T.printing.Print('RPE_low')(RPE_low)
    
    ## Update Q-values
    Q_high2 = T.set_subtensor(Q_high1[now_high],
                              Q_high1[now_high] + alpha * RPE_high)
    Q_low2 = T.set_subtensor(Q_low1[now_low],
                             Q_low1[now_low] + alpha * RPE_low)
    T.printing.Print('Q_high2')(Q_high2)
    T.printing.Print('Q_low2')(Q_low2)
        
    # Select actions
    Q_low_all = T.concatenate([Q_low0[TS0, aliens[0]],
                               Q_low1[TS1, aliens[1]]]).reshape((n_trials, n_actions))
    T.printing.Print('Q_low_all')(Q_low_all)
    
    p_low_all = T.concatenate([p_low0[aliens[0]],
                               p_low1[aliens[1]]]).reshape((n_trials, n_actions))
    T.printing.Print('p_low_all')(p_low_all)
    
    actions = pm.Categorical('actions', p_low_all, observed=actions)

alpha __str__ = 0.5
beta __str__ = 1.0
Q_high0 __str__ = [[0.5 0.5]
 [0.5 0.5]]
Q_low0 __str__ = [[[0.5 0.5]
  [0.5 0.5]]

 [[0.5 0.5]
  [0.5 0.5]]]




p_high0 __str__ = [0.5 0.5]
TS0 __str__ = 0
p_low0 __str__ = [[0.5 0.5]
 [0.5 0.5]]


  rval = inputs[0].__getitem__(inputs[1:])


RPE_high __str__ = 0.5
RPE_low __str__ = [0.5]


  out[0][inputs[2:]] = inputs[1]


Q_high1 __str__ = [[0.75 0.5 ]
 [0.5  0.5 ]]
Q_low1 __str__ = [[[0.75 0.5 ]
  [0.5  0.5 ]]

 [[0.5  0.5 ]
  [0.5  0.5 ]]]




p_high1 __str__ = [0.5621765 0.4378235]
TS1 __str__ = 0
p_low1 __str__ = [[0.5621765 0.4378235]
 [0.5       0.5      ]]
RPE_high __str__ = 0.25
RPE_low __str__ = [0.5]
Q_high2 __str__ = [[0.875 0.5  ]
 [0.5   0.5  ]]
Q_low2 __str__ = [[[0.75 0.75]
  [0.5  0.5 ]]

 [[0.5  0.5 ]
  [0.5  0.5 ]]]
Q_low_all __str__ = [[0.5  0.5 ]
 [0.75 0.5 ]]
p_low_all __str__ = [[0.5       0.5      ]
 [0.5621765 0.4378235]]


In [60]:
# np.where([True, False, True, False], np.ones(4), np.zeros(4))
# np.where([False, True, True], np.array([0, 1, 2]), np.where(np.array([1, 2, 99]))
print(3 - np.sum([True, True, True]))  # should return TS 0
print(3 - np.sum([False, True, True]))  # should return TS 1
print(3 - np.sum([False, False, True]))  # should return TS 2

0
1
2


In [142]:
p = np.array([[0.1, 0.2, 0.7],
              [0.8, 0.1, 0.1]])
# print(p)

cumsum = np.cumsum(p, axis=1)
print(cumsum)

rand = np.random.rand(2).reshape((2, 1))
print(rand)

rand_cumsum = rand < cumsum
print(rand_cumsum)

choice = 3 - np.sum(rand_cumsum, axis=1)
print(choice)

# b = np.array([[True, True, True],
#               [False, True, True],
#               [False, False, True]])
# print(b)

# a = np.arange(9).reshape((3, 3))
# print(a)
# print(np.sum(a, axis=0))

[[0.1 0.3 1. ]
 [0.8 0.9 1. ]]
[[0.31117788]
 [0.65785024]]
[[False False  True]
 [ True  True  True]]
[2 0]


In [86]:
# Data: 2 trials, 2 actions
seasons = np.array([0, 0, 0, 0, 0, 0])
aliens = np.array([0, 0, 0, 0, 0, 0])
actions = np.array([0, 1, 0, 1, 0, 1])
rewards = np.array([0, 1, 0, 1, 0, 1])
n_trials = 4
n_aliens = 2
n_actions = 2
n_seasons = 2
n_TS = 3

seasons = theano.shared(np.asarray(seasons, dtype='int32'))
aliens = theano.shared(np.asarray(aliens, dtype='int32'))
actions = theano.shared(np.asarray(actions, dtype='int32'))
rewards = theano.shared(np.asarray(rewards, dtype='int32'))

with pm.Model() as model:
    
    # RL parameters
    alpha = pm.Uniform('alpha', lower=0, upper=1)
    beta = pm.Bound(pm.Normal, lower=0)('beta', mu=1, sd=5)
    forget = pm.Uniform('forget', lower=0, upper=0.1)
    alpha_high = pm.Uniform('alpha_high', lower=0.01, upper=0.2)
#     T.printing.Print('alpha')(alpha)
#     T.printing.Print('beta')(beta)
#     T.printing.Print('forget')(forget)
#     T.printing.Print('alpha_high')(alpha_high)
    
    # Initial Q-values
    Q_high0 = 0.5 * T.ones([n_seasons, n_TS])  # Q-values linking seasons to TS
    Q_low0 = 0.5 * T.ones([n_TS, n_aliens, n_actions])  # Q-values linking TS & aliens to actions
#     T.printing.Print('Q_high0')(Q_high0)
#     T.printing.Print('Q_low0')(Q_low0)
    
    # Set up TS distribution (softmax)
    def update_Qs(season, alien, action, reward,
                  Q_low, Q_high,
                  beta, alpha, alpha_high, forget, n_TS):
        
        # Select TS
        Q_high_sub = Q_high[season]
        p_high = T.nnet.softmax(Q_high_sub)
#         T.printing.Print('p_high')(p_high)
        
        rand = rs.uniform()
        T.printing.Print('rand')(rand)
        
        cumsum = T.extra_ops.cumsum(p_high)
        T.printing.Print('cumsum')(cumsum)
        
#         TS = season  # Flat
        TS = n_TS - T.sum(rand < cumsum)
        T.printing.Print('TS')(TS)

        # Calculate action probabilities based on TS
        Q_low_sub = Q_low[TS, alien]  # Q_low_sub.shape -> [n_subj, n_actions]
        T.printing.Print('Q_low_sub')(Q_low_sub)
        p_low = T.nnet.softmax(beta * Q_low_sub)
        T.printing.Print('p_low')(p_low)

        # Forget Q-values a little bit
#         Q_low = (1 - forget) * Q_low + forget * alien_initial_Q
        # Q_high = (1 - forget_high) * Q_high + forget_high * alien_initial_Q

        # Calculate RPEs & update Q-values
        current_trial_high = season, TS
        RPE_high = reward - Q_high[current_trial_high]
        T.printing.Print('RPE_high')(RPE_high)
        Q_high = T.set_subtensor(Q_high[current_trial_high],
                                 Q_high[current_trial_high] + alpha_high * RPE_high)

        current_trial_low = TS, alien, action
        RPE_low = reward - Q_low[current_trial_low]
        Q_low = T.set_subtensor(Q_low[current_trial_low],
                                Q_low[current_trial_low] + alpha * RPE_low)

        return [Q_low, Q_high, TS, p_low]

    [Q_low, _, TS, p_low], _ = theano.scan(fn=update_Qs,
                                           sequences=[seasons, aliens, actions, rewards],
                                           outputs_info=[Q_low0, Q_high0, None, None],
                                           non_sequences=[beta, alpha, alpha_high, forget, n_TS])
    T.printing.Print('TS')(TS)
    T.printing.Print('Q_low')(Q_low)    
    T.printing.Print('p_low')(p_low)
        
    # Select actions    
    actions = pm.Categorical('actions', p_low, observed=actions)



rand __str__ = 0.63858724
cumsum __str__ = [0.33333334 0.6666667  1.        ]
TS __str__ = 1
Q_low_sub __str__ = [0.5 0.5]




p_low __str__ = [[0.5 0.5]]
RPE_high __str__ = -0.5
TS __str__ = [1 2 0 0 2 1]
Q_low __str__ = [[[[0.5  0.5 ]
   [0.5  0.5 ]]

  [[0.25 0.5 ]
   [0.5  0.5 ]]

  [[0.5  0.5 ]
   [0.5  0.5 ]]]


 [[[0.5  0.5 ]
   [0.5  0.5 ]]

  [[0.25 0.5 ]
   [0.5  0.5 ]]

  [[0.5  0.75]
   [0.5  0.5 ]]]


 [[[0.25 0.5 ]
   [0.5  0.5 ]]

  [[0.25 0.5 ]
   [0.5  0.5 ]]

  [[0.5  0.75]
   [0.5  0.5 ]]]


 [[[0.25 0.75]
   [0.5  0.5 ]]

  [[0.25 0.5 ]
   [0.5  0.5 ]]

  [[0.5  0.75]
   [0.5  0.5 ]]]


 [[[0.25 0.75]
   [0.5  0.5 ]]

  [[0.25 0.5 ]
   [0.5  0.5 ]]

  [[0.25 0.75]
   [0.5  0.5 ]]]


 [[[0.25 0.75]
   [0.5  0.5 ]]

  [[0.25 0.75]
   [0.5  0.5 ]]

  [[0.25 0.75]
   [0.5  0.5 ]]]]
p_low __str__ = [[[0.5       0.5      ]]

 [[0.5       0.5      ]]

 [[0.5       0.5      ]]

 [[0.4378235 0.5621765]]

 [[0.4378235 0.5621765]]

 [[0.4378235 0.5621765]]]


  rval = inputs[0].__getitem__(inputs[1:])


ValueError: Input dimension mis-match. (input[0].shape[1] = 6, input[1].shape[1] = 2)

# Custom distribution 2

In [11]:
from theano.tensor.shared_randomstreams import RandomStreams
from theano import function
rs = RandomStreams(seed=234)
rv_u = rs.uniform()
rv_n = rs.normal((2,2))
f = function([], rv_u)
g = function([], rv_n, no_default_updates=True)    #Not updating rv_n.rng
# nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

In [12]:
f_val0 = f()
f_val1 = f()  #different numbers from f_val0

In [13]:
print(f_val0)
print(nearly_zeros())

0.12672381
[[0. 0.]
 [0. 0.]]
