In [1]:
import torch
import torch.optim as optim
import pandas as pd
import numpy as np

In [2]:
RETOS_BEBRASK_dataset = pd.read_excel("RETOS_BEBRASK_long.xlsx")

In [3]:
rating_columns = [col for col in RETOS_BEBRASK_dataset.columns if 'Rating0' in col]
fulfilled_columns = [col for col in RETOS_BEBRASK_dataset.columns if 'Fulfilled' in col]
actions_dataset = RETOS_BEBRASK_dataset[rating_columns]
states_dataset = RETOS_BEBRASK_dataset[fulfilled_columns]
np.array(actions_dataset.iloc[0].values)

def calculate_rewards(states_df, actions_df):
    # Define the reward mappings for each state and action
    reward_mapping = {
        0: {1: 1.0, 2: 0.0, 3: -0.5, 4: -1.0},
        1: {1: -1.0, 2: -0.5, 3: 0.0, 4: 1.0}
    }
    
    # Initialize an empty DataFrame with the same shape as states_df
    
    rewards_df = pd.DataFrame(index=states_df.index, columns=range(1,46))
    
    # Iterate through each cell in the DataFrame
    for row in range(len(states_df.index)):
        for col in range(len(states_df.columns)):
            state = states_df.iloc[row, col]
            action = actions_df.iloc[row, col]
            # Set the reward using the mapping
            reward = reward_mapping[state][action]
            rewards_df.iloc[row, col] = reward
    
    return rewards_df

rewards_dataset = calculate_rewards(states_dataset, actions_dataset)
rewards_dataset = rewards_dataset.apply(pd.to_numeric, errors='coerce')
actions_dataset= actions_dataset-1

In [4]:
def calculate_initial_q_values(states, actions, num_states, num_actions):
    # Initialize a DataFrame to count occurrences
    count_matrix = pd.DataFrame(0, index=np.arange(num_states), columns=np.arange(1, num_actions + 1))
    # Populate the count matrix with frequencies
    for state, action in zip(states, actions):
        count_matrix.iloc[state, action] += 1
            
    # Normalize to get probabilities
    normalized_counts = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    return normalized_counts.fillna(0)  # Fill NaN with 0 where division might be undefined

In [5]:
individual_Q_values = {}

# Iterate over each participant
for participant in states_dataset.index:
    participant_states = states_dataset.loc[participant]
    participant_actions = actions_dataset.loc[participant]
    
    # Calculate normalized counts as initial Q-values
    normalized_counts = calculate_initial_q_values(participant_states, participant_actions, 2, 4)
    
    # Convert to tensor
    Q_values_tensor = torch.tensor(normalized_counts.values+0.01, dtype=torch.float, requires_grad=True)
    
    # Store in dictionary
    individual_Q_values[participant] = Q_values_tensor

    print(f"Initial Q-values for Participant {participant}:")
    print(Q_values_tensor)

Initial Q-values for Participant 0:
tensor([[0.6211, 0.0656, 0.1767, 0.1767],
        [0.0100, 0.1581, 0.1581, 0.7137]], requires_grad=True)
Initial Q-values for Participant 1:
tensor([[0.3989, 0.4544, 0.1211, 0.0656],
        [0.0470, 0.2322, 0.4544, 0.3063]], requires_grad=True)
Initial Q-values for Participant 2:
tensor([[0.7878, 0.1767, 0.0656, 0.0100],
        [0.1211, 0.1581, 0.3804, 0.3804]], requires_grad=True)
Initial Q-values for Participant 3:
tensor([[0.6767, 0.1767, 0.0100, 0.1767],
        [0.0841, 0.1581, 0.3433, 0.4544]], requires_grad=True)
Initial Q-values for Participant 4:
tensor([[0.7322, 0.1767, 0.1211, 0.0100],
        [0.1211, 0.2693, 0.3063, 0.3433]], requires_grad=True)
Initial Q-values for Participant 5:
tensor([[0.6211, 0.3433, 0.0656, 0.0100],
        [0.0470, 0.1952, 0.4915, 0.3063]], requires_grad=True)
Initial Q-values for Participant 6:
tensor([[0.8433, 0.1211, 0.0656, 0.0100],
        [0.1581, 0.2322, 0.2693, 0.3804]], requires_grad=True)
Initial Q-val

In [14]:
# Assuming data is already loaded into these DataFrames
# states_df, actions_df, rewards_df are available

# Define the number of states and actions based on your model setup
num_states = 2  # Adjust based on your data
num_actions = 4  # Adjust based on your data

# Training configuration
learning_rate = 0.02
num_epochs = 250  # For demonstration, use more epochs for actual training

# Dictionary to store results for each participant
results = {}
combined_df = pd.DataFrame({
    'State': states_dataset.stack(),
    'Action': actions_dataset.stack()
}).reset_index()

# Now calculate the frequency of each action for each state
# Normalize counts to get the probability of each action per state
torch.autograd.set_detect_anomaly(True)
for participant in states_dataset.index:
    # Initialize participant-specific Q-values and beta
    
    beta_unconstrained_match = torch.tensor([-2.8], requires_grad=True)  # Initialize around 0 to start with sigmoid(beta) ~ 0.5
    beta_unconstrained_no_match = torch.tensor([-2.8], requires_grad=True)  # Initialize around 0 to start with sigmoid(beta) ~ 0.5

    Q_values = individual_Q_values[participant]

    # Set up optimizer for this participant
    optimizer = optim.Adam([Q_values, beta_unconstrained_match,beta_unconstrained_no_match], lr=learning_rate)

    # Convert participant data from DataFrame to Tensor
    states = torch.tensor(states_dataset.loc[participant].values, dtype=torch.int64)
    actions = torch.tensor(actions_dataset.loc[participant].values, dtype=torch.int64)
    rewards = torch.tensor(rewards_dataset.loc[participant].values, dtype=torch.float32)
    # Training loop for current participant
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        beta_match = torch.sigmoid(beta_unconstrained_match)   
        beta_no_match = torch.sigmoid(beta_unconstrained_no_match)   

        Q_updated = Q_values.clone()
        loss = 0
        for i in range(1,len(states)):
            state = states[i]
            action = actions[i]
            reward = rewards[i-1]
            # Update Q_values on a new tensor to avoid in-place operations
            probabilities = torch.nn.functional.softmax(Q_updated, dim=1)  # Assuming Q_updated is indexed appropriately
            selected_probability = probabilities[state].gather(0, action)
            loss -= torch.log(selected_probability)
            
            
            ## Maybe different beta for sequence s0->s0, s0->s1, s1->s0, s1->s1
            if (states[i-1] == 0):
                beta = beta_no_match
            else:
                beta = beta_match
            new_Q_value = beta * (Q_updated[state, action].clone()-reward) + Q_updated[state, action].clone()
            Q_updated[state, action] = new_Q_value
        
        loss.backward()

        optimizer.step()
    
        #Q_values = Q_updated.detach().clone()  # Detach and clone if Q_values needs to be updated
        #Q_values = torch.clamp(Q_values, min=0, max=1)
        if epoch % 50 == 0 or epoch == num_epochs - 1:  # Print last epoch always
            print(f'Participant {participant}, Epoch {epoch}: Loss = {loss.item()}')
            print("Optimized Q-values:")
            print(Q_values)
            print("Probabilities each Action")
            print(torch.nn.functional.softmax(Q_values,dim=1))
            print("Optimized Beta Match:", beta_match.item())
            print("Optimized Beta No Match:", beta_no_match.item())
    
    
    # Store the final optimized Q-values and beta for each participant
    results[participant] = {'Q_values': Q_values.clone().detach(), 'beta': beta.item()}

# Output the results after training for all participants
print("Training completed. Displaying results for each participant:")
for participant, data in results.items():
    print(f'Participant {participant}: Q-values = {data["Q_values"]}, Beta = {data["beta"]}')



Participant 0, Epoch 0: Loss = 44.903770446777344
Optimized Q-values:
tensor([[ 1.1142, -1.0398,  0.0858,  0.0836],
        [-6.4087,  0.8691,  0.8757,  2.1146]], requires_grad=True)
Probabilities each Action
tensor([[5.4633e-01, 6.3386e-02, 1.9535e-01, 1.9493e-01],
        [1.2599e-04, 1.8242e-01, 1.8362e-01, 6.3384e-01]],
       grad_fn=<SoftmaxBackward0>)
Optimized Beta Match: 0.05732417479157448
Optimized Beta No Match: 0.05732417479157448
Participant 0, Epoch 50: Loss = 41.080108642578125
Optimized Q-values:
tensor([[ 0.7288, -1.5073, -0.3635, -0.3488],
        [-7.3713,  0.7770,  0.7941,  1.8449]], requires_grad=True)
Probabilities each Action
tensor([[5.6094e-01, 5.9947e-02, 1.8817e-01, 1.9094e-01],
        [5.8706e-05, 2.0296e-01, 2.0647e-01, 5.9051e-01]],
       grad_fn=<SoftmaxBackward0>)
Optimized Beta Match: 0.0336785614490509
Optimized Beta No Match: 0.03523322194814682
Participant 0, Epoch 100: Loss = 40.71930694580078
Optimized Q-values:
tensor([[ 0.3554, -1.9530, -0.798

KeyboardInterrupt: 