In [1]:
import numpy as np
import ToyQ2
import pandas as pd
import matplotlib.pyplot as plt
# Read the CSV at Data/UH_RL_rats.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv('Data/UH_RL_rats.csv');
# Drop the columns selected by `df.columns[[range(24)]]` (axis=1 targets columns, not rows).
# NOTE(review): the nested-list index `[[range(24)]]` is unusual, later cells still read
# columns such as 'subject', and the CSV is re-read in cell In [3] -- confirm that this
# drop is still intended.
df = df.drop(df.columns[[range(24)]],axis=1);
from scipy.optimize import minimize

  result = getitem(key)


# Packaging into respective subjects

In [2]:
# Every distinct subject number, in order of first appearance in the table.
subjects = df['subject'].unique()
# sub_df maps each subject number to the rows recorded for that subject.
sub_df = {subject_id: df[df['subject'] == subject_id] for subject_id in subjects}

In [3]:
# Re-read the CSV into `df`.
df = pd.read_csv('Data/UH_RL_rats.csv');

# Notes on the data:
# - The update rule used for the given data is different from ours.
# - There are 24 subjects (1 to 24).
# - Each subject has 5 sessions (['1', '11', '16', '6', 'Best']).
# - In each trial the rat chose between two options ([1., 2.]), which turn out to be
#   (['lean', 'rich']) depending on the experiment, and then got a reward ([0., 1.]).
# - `reward` is the same as `response`, and `action` is the same as `lever`.

In [4]:
def get_Logs(sub_df):
    '''Split every subject's DataFrame into one DataFrame per session.

    sub_df maps a subject (rat) number to that subject's DataFrame, which must
    have a 'session' column. The result is a nested dict indexed as
    result[rat number][session name], where each value is the sub-frame of
    rows belonging to that session. In this data set the rat numbers are
    ints from 1 to 24 and the session names are the strings
    '1', '11', '16', '6' and 'Best'.'''
    def _by_session(frame):
        # One boolean-mask selection per distinct session label.
        return {label: frame[frame['session'] == label]
                for label in frame['session'].unique()}

    return {subject: _by_session(sub_df[subject]) for subject in sub_df}
# Nested dict of per-session DataFrames, indexed as epoched_df[subject][session].
epoched_df=get_Logs(sub_df)

In [5]:
# Preview the first rows of subject 1, session '1'.
epoched_df[1]['1'].head()

Unnamed: 0,session,subject,lever,response,feedback,state,action,reward,alpha_gain,alpha_loss,beta,Q,PE
14113,1,1,A,lean,0.0,1.0,1.0,0.0,1.0,0.092515,0.742804,0.453742,-0.5
14114,1,1,B,rich,1.0,1.0,2.0,1.0,1.0,0.092515,0.742804,1.0,0.5
14115,1,1,A,lean,0.0,1.0,1.0,0.0,1.0,0.092515,0.742804,0.411764,-0.453742
14116,1,1,A,lean,0.0,1.0,1.0,0.0,1.0,0.092515,0.742804,0.37367,-0.411764
14117,1,1,B,rich,1.0,1.0,2.0,1.0,1.0,0.092515,0.742804,1.0,0.0


# Simulation

In [285]:
class RL_env():
    '''Environment that replays the recorded rewards of one session, one trial per step.

    Parameters
    ----------
    epoched_df : pandas.DataFrame
        Rows of a single subject/session. Must provide the columns 'Q',
        'reward', 'alpha_loss', 'alpha_gain', 'beta', 'PE' and 'action'.
    '''
    def __init__(self,epoched_df):
        '''Store the session data and start counting trials from 0.'''
        self.epoched_df=epoched_df
        self.Q=epoched_df['Q']
        self.reward=epoched_df['reward']
        # Parameter values stored in the data columns, read from the first row as scalars.
        self.test_alphaL=epoched_df['alpha_loss'].iloc[0]
        self.test_alphaG=epoched_df['alpha_gain'].iloc[0]
        self.test_beta=epoched_df['beta'].iloc[0]
        self.PE=epoched_df['PE']
        self.count=0 # index of the next trial to replay
    def step(self):
        '''Return the recorded reward of the current trial and advance to the next one.

        Raises IndexError once every recorded trial has been consumed.'''
        # `+0` is kept from the original implementation; it is a no-op for numeric rewards.
        reward=self.reward.iloc[self.count]+0
        self.count+=1
        return reward
    def get_switchid(self):
        '''Placeholder, not implemented yet; returns None.'''
        pass
    def init_Q(self):
        '''Return the first recorded Q-value of each action as np.array([Q_action1, Q_action2]).

        The order matches Rat, which indexes its Q-vector with ``action-1``
        (index 0 is action 1, "left"; index 1 is action 2). The previous
        version returned the two values swapped.

        Raises IndexError if one of the two actions never occurs in the session.'''
        frame=self.epoched_df
        left=frame[frame['action']==1]['Q'].iloc[0]
        right=frame[frame['action']==2]['Q'].iloc[0]
        return np.array([left,right])

class Rat():
    '''Two-action learner that replays a session's recorded actions and updates Q-values.

    Parameters
    ----------
    epoched_df : pandas.DataFrame
        Rows of a single subject/session. Must provide the column 'action'
        (values 1 or 2) and, for any parameter not passed explicitly, the
        columns 'alpha_gain', 'alpha_loss' and 'beta'.
    alphaG, alphaL, beta : float, optional
        Learning rate after a reward of 1, learning rate after a reward of 0,
        and the beta parameter (stored only; no method of this class reads it).
        When left as None the value is taken from the first row of the
        matching data column. Previously these arguments were silently ignored.
    gamma : float
        Weight of max(Q) in the update target.
    init_Q : array-like of length 2, optional
        Starting Q-values, index 0 for action 1 ("left") and index 1 for
        action 2. The default sentinel [-1, -1] (or None) requests a random
        start drawn from np.random.rand(2). The values are copied into a float
        array, so the caller's array is never modified and integer input is
        not truncated by later updates.
    '''
    def __init__(self,epoched_df,alphaG=None,alphaL=None,beta=None,gamma=0,init_Q=np.array([-1,-1])):
        self.gamma=gamma
        self.df=epoched_df
        self.actions=epoched_df['action']
        self.count=0 # index of the next recorded action to replay
        self.PE=0 # prediction-error placeholder; no method of this class updates it
        # Explicit arguments win; otherwise fall back to the first data row
        # (the same row RL_env reads, and valid even for a one-row session).
        self.alphaG=epoched_df['alpha_gain'].iloc[0] if alphaG is None else alphaG
        self.alphaL=epoched_df['alpha_loss'].iloc[0] if alphaL is None else alphaL
        self.beta=epoched_df['beta'].iloc[0] if beta is None else beta
        if init_Q is None or np.array_equal(init_Q,[-1,-1]):
            self.Q=np.random.rand(2) # Q[0] represents left (action 1)
        else:
            # Private float copy: update() assigns into self.Q in place.
            self.Q=np.array(init_Q,dtype=float)
    def get_action(self):
        '''Return the next recorded action and advance the replay counter.'''
        current=self.count
        self.count+=1
        return self.actions.iloc[current]
    def update(self,obs): # action 1 represents left
        '''Consume the next recorded action and update its Q-value with reward `obs`.

        The new value is (1-alpha)*Q[a] + alpha*(obs + gamma*max(Q)), with
        alpha = alphaG when int(obs) is 1 and alpha = alphaL when it is 0.
        For any other reward, 'error' is printed and Q is left unchanged (the
        action is still consumed). Returns the Q-value of the chosen action.'''
        action_id=int(self.get_action()-1)
        outcome=int(obs)
        if outcome==1: # alpha_gain
            alpha=self.alphaG
        elif outcome==0: # alpha_loss
            alpha=self.alphaL
        else:
            print('error')
            return self.Q[action_id]
        self.Q[action_id]=(1-alpha)*self.Q[action_id]+alpha*(obs+self.gamma*np.max(self.Q))
        return self.Q[action_id]
    
    
def train_rat(env,rat,it_num):
    '''Run `it_num` learning steps and log the Q-values along the way.

    Each step takes a reward from ``env.step()`` and passes it to
    ``rat.update(obs)``.

    Returns
    -------
    QLog : np.ndarray of shape (it_num+1, len(rat.Q))
        Row 0 is the Q-vector before any update; row k is the Q-vector after
        the k-th update. Every row is a copy, because ``rat.Q`` is modified in
        place by ``update`` (the previous version aliased the live array, so
        row 0 already showed the first update).
    qlog : list
        A leading placeholder 0 followed by the value returned by each
        ``rat.update`` call.
    '''
    q_history=[np.array(rat.Q,copy=True)] # snapshot, not a reference to the live array
    qlog=[0]
    for _ in range(it_num):
        obs=env.step()
        qlog.append(rat.update(obs))
        q_history.append(np.array(rat.Q,copy=True))
    # Stack once at the end instead of re-allocating the log on every step.
    return np.vstack(q_history),qlog



# Demo
## The estimate is still off because the initialization value is different (I just used the first updated value as the starting point), but overall it should converge to the same value.
## The data is organized into the nested dict <epoched_df>, whose first key is the subject number and whose second key is the session name; each value is a DataFrame.

In [296]:
# Replay one recorded session (subject 12, session 'Best') through the simulated learner.
mydf=epoched_df[12]['Best']
rat=Rat(mydf,init_Q=np.array([0.725471,0.266683])) # hard-coded starting Q-values; the get_action function works
env=RL_env(mydf) # the step function works
QLog,qlog=train_rat(env,rat,mydf.shape[0]) # one update per recorded row
# Display the Q-value returned by each update (the leading 0 is a placeholder).
qlog

[0,
 0.3869416399847319,
 0.6633951687563733,
 0.8151842959718735,
 0.8985253885714726,
 0.944284514031321,
 0.9694089454196876,
 0.5170498712651,
 0.7348321837127378,
 0.39193457805945187,
 0.20904461846310635,
 0.5973661544605066,
 0.11149731346683318,
 0.7789305122151531,
 0.059468887559596686,
 0.03171868879717861,
 0.8786199447193247,
 0.016917673430565296,
 0.46023040836559964,
 0.2454712993128988,
 0.9333552632361521,
 0.49782005930626844,
 0.7242738942717653,
 0.8486102705037684,
 0.5857196924010148,
 0.3124030296855442,
 0.16662518645503865,
 0.9168782000659313,
 0.9543612790162802,
 0.08887222633251281,
 0.047401433008620306,
 0.5090239560788589,
 0.025282317592270325,
 0.7304254877062222,
 0.8519878544418346,
 0.9187326908381568,
 0.9553795027191789,
 0.013484731204649006,
 0.9755007419573506,
 0.5202990290436145,
 0.007192298530306761,
 0.7366161610255888,
 0.8553868955182795,
 0.4562343748727263,
 0.45489060975817014,
 0.7014409256608249,
 0.8360736377011667,
 0.2426232317

# MLE

# Visualization

# PE is R-Q

# I/O