In [8]:
import torch
import torch.optim as optim
import pandas as pd
import numpy as np
import time

In [9]:
# Every tensor in this notebook is created on this device. GPU selection is
# deliberately switched off; the disabled alternative was:
#   torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = "cpu"

class DataLoader:
    """Reads the participant workbook and splits it into action/state tables."""

    def __init__(self, file_path):
        """Load the Excel file at ``file_path`` into ``self.data`` (a DataFrame)."""
        self.data = pd.read_excel(file_path)

    def get_columns_by_keyword(self, keyword):
        """Return, in sheet order, the column labels that contain ``keyword``.

        Labels that are not strings (for example integer headers) are skipped
        instead of raising ``TypeError`` on the substring test.
        """
        return [
            col
            for col in self.data.columns
            if isinstance(col, str) and keyword in col
        ]

    def get_datasets(self, rating_keyword, fulfilled_keyword):
        """Build the ``(actions_dataset, states_dataset)`` pair.

        Args:
            rating_keyword: substring selecting the rating columns.
            fulfilled_keyword: substring selecting the fulfilled columns.

        Returns:
            A tuple of two DataFrames: the rating columns with 1 subtracted
            from every value, and the fulfilled columns unchanged.
        """
        rating_columns = self.get_columns_by_keyword(rating_keyword)
        fulfilled_columns = self.get_columns_by_keyword(fulfilled_keyword)
        # Ratings are shifted down by one so they can be used as 0-based indices.
        actions_dataset = self.data[rating_columns] - 1
        states_dataset = self.data[fulfilled_columns]
        return actions_dataset, states_dataset



class ParticipantOptimizer:
    """Fit a reward matrix and two rates to one participant's state/action sequence.

    ``optimize`` alternates between
    (a) gradient descent on two sigmoid-constrained rates (``beta_match`` and
        ``beta_no_match``) while the reward matrix is held fixed, and
    (b) a random-walk proposal step on the reward matrix (``RandomWalk``).

    Tensor placement uses the module-level ``device``.
    """

    # Maximum number of reward-matrix proposals per random-walk step. Reaching
    # this count is what `optimize` treats as its stopping signal.
    _MAX_PROPOSALS = 50

    # Starting value of the unconstrained rates (sigmoid(-2.8) is about 0.06).
    _INITIAL_BETA_LOGIT = -2.8

    def __init__(self, states, actions,n_states,n_actions, initial_rewards, Rmin=-3, Rmax=3):
        """Store the observed sequence and the search bounds.

        Args:
            states: per-trial state indices (converted to an int64 tensor).
            actions: per-trial action indices (converted to an int64 tensor).
            n_states: number of rows of the reward matrix.
            n_actions: number of columns of the reward matrix.
            initial_rewards: starting reward matrix; `optimize` replaces it via
                `find_initial_rewards`.
            Rmin, Rmax: bounds used when sampling and clipping reward matrices.
        """
        self.states = torch.tensor(states, dtype=torch.int64, device=device)
        self.actions = torch.tensor(actions, dtype=torch.int64, device=device)
        self.n_states = n_states
        self.n_actions = n_actions

        self.best_rewards = initial_rewards
        self.Rmin = Rmin
        self.Rmax = Rmax

        # Number of proposals tried in the most recent RandomWalk call.
        self.conv = 0

        # Per-trial reward looked up from a reward matrix (see _get_rewards_list).
        self.list_rewards = None
        # Snapshot of list_rewards taken at the start of RandomWalk for rollback.
        self.old_rewards = self.list_rewards

        self.beta_no_match = None
        self.beta_match = None

    def calculate_rewards_individual(self, states, actions, rewards_matrix):
        """Negative log-likelihood of ``actions`` under a static softmax policy.

        Each trial's action probabilities are the softmax of the reward-matrix
        row selected by that trial's state.

        Returns:
            ``(loss, rewards_tensor)`` where ``loss`` is the summed negative log
            probability and ``rewards_tensor`` is ``rewards_matrix`` as a tensor.
        """
        rewards_tensor = torch.tensor(rewards_matrix, device=device)
        probabilities = torch.nn.functional.softmax(rewards_tensor[states], dim=1)
        selected_probabilities = probabilities.gather(1, actions.unsqueeze(1)).squeeze(1)
        loss = -torch.sum(torch.log(selected_probabilities))
        return loss, rewards_tensor

    def find_initial_rewards(self, num_trials=100):
        """Random search for a starting reward matrix.

        Draws ``num_trials`` matrices uniformly in ``[Rmin, Rmax)`` and keeps the
        one with the lowest `calculate_rewards_individual` loss. If no candidate
        is produced (``num_trials`` is 0) the current ``best_rewards`` is kept.
        """
        best_loss = float('inf')
        best_rewards = None

        for _ in range(num_trials):
            trial_rewards = np.random.uniform(self.Rmin, self.Rmax, (self.n_states, self.n_actions))
            loss, _ = self.calculate_rewards_individual(self.states, self.actions, trial_rewards)
            loss_value = loss.item()
            if loss_value < best_loss:
                best_loss = loss_value
                best_rewards = trial_rewards

        if best_rewards is not None:
            self.best_rewards = best_rewards

    def _get_rewards_list(self, rewards =None):
        """Set ``self.list_rewards`` to ``rewards[state, action]`` for every trial.

        ``rewards`` defaults to ``self.best_rewards``. The result is a float32
        tensor with one entry per trial.
        """
        if rewards is None:
            rewards = self.best_rewards

        reward_table = np.asarray(rewards)
        per_trial = reward_table[self.states.cpu().numpy(), self.actions.cpu().numpy()]
        self.list_rewards = torch.tensor(per_trial, dtype=torch.float32)

    def optimize(self, num_iterations=150, learning_rate=0.05, num_epochs=250, verbose=False):
        """Run the alternating fit and store the fitted rates.

        Args:
            num_iterations: maximum number of outer iterations.
            learning_rate: Adam learning rate for the two rates.
            num_epochs: gradient steps per outer iteration (must be >= 1).
            verbose: when True, print the outer iteration index.

        After the call ``self.beta_match`` / ``self.beta_no_match`` hold the
        rates from the last gradient epoch; they stay ``None`` when
        ``num_iterations`` is 0.

        Raises:
            ValueError: if ``num_epochs`` is smaller than 1.
        """
        if num_epochs < 1:
            raise ValueError("num_epochs must be at least 1")

        self.find_initial_rewards()

        beta_match = None
        beta_no_match = None
        for m in range(num_iterations):
            if verbose:
                print(m)
            # The rates are re-initialised on every outer iteration because the
            # reward matrix may have changed in the previous random-walk step.
            beta_unconstrained_match = torch.tensor(
                [self._INITIAL_BETA_LOGIT], requires_grad=True, device=device)
            beta_unconstrained_no_match = torch.tensor(
                [self._INITIAL_BETA_LOGIT], requires_grad=True, device=device)
            optimizer = optim.Adam(
                [beta_unconstrained_match, beta_unconstrained_no_match], lr=learning_rate)
            self._get_rewards_list()
            for _ in range(num_epochs):
                optimizer.zero_grad()
                # The sigmoid keeps both rates inside (0, 1).
                beta_match = torch.sigmoid(beta_unconstrained_match)
                beta_no_match = torch.sigmoid(beta_unconstrained_no_match)

                loss, _, probability_data = self.simulate(beta_match, beta_no_match)
                loss.backward()
                optimizer.step()

            self.RandomWalk(0.05, beta_match, beta_no_match, probability_data)
            if self.conv == self._MAX_PROPOSALS:
                break

        self.beta_match = beta_match
        self.beta_no_match = beta_no_match

    def perturb_rewards(self, scale=0.02):
        """Return ``best_rewards`` plus uniform noise in ``[-scale, scale)``, clipped to the bounds."""
        perturbation = np.random.uniform(-scale, scale, self.best_rewards.shape)
        new_rewards = np.clip(self.best_rewards + perturbation, self.Rmin, self.Rmax)
        return new_rewards

    def RandomWalk(self, distance, beta_match, beta_no_match, old_probability):
        """Propose perturbed reward matrices until one is accepted or the limit is hit.

        A proposal's sequence probability is compared with ``old_probability``.
        The ratio is halved when it is below 1, and the proposal is accepted
        when a uniform draw falls below ``min(1, ratio)``. On acceptance
        ``best_rewards`` and ``list_rewards`` describe the accepted matrix; when
        every proposal is rejected ``list_rewards`` is rolled back so it keeps
        matching ``best_rewards``. ``self.conv`` ends up as the number of
        proposals tried.
        """
        accepted = False
        self.old_rewards = self.list_rewards
        self.conv = 0
        while not accepted and self.conv < self._MAX_PROPOSALS:
            candidate = self.perturb_rewards(distance)
            self._get_rewards_list(candidate)

            # Only the probability value is needed here, so no autograd graph is built.
            with torch.no_grad():
                q_init = torch.tensor(candidate, device=device)
                _, _, new_probability_data = self._rollout(
                    q_init, self.list_rewards, beta_match, beta_no_match)
                ratio = new_probability_data / old_probability

            prob = ratio / 2 if ratio < 1 else ratio

            if np.random.random() < min(1, prob):
                accepted = True
                self.best_rewards = candidate
            self.conv += 1

        if not accepted:
            # No proposal survived: restore the rewards of the matrix still in use.
            self.list_rewards = self.old_rewards

    def simulate(self, beta_match, beta_no_match):
        """Replay the sequence starting from ``best_rewards`` as the value table.

        Returns:
            ``(loss, Q_updated, probability_data)``: the summed negative log
            probability of the observed actions, the value table after the last
            trial, and the product of the per-trial action probabilities.
        """
        q_init = torch.tensor(self.best_rewards, requires_grad=True, device=device)
        return self._rollout(q_init, self.list_rewards, beta_match, beta_no_match)

    def _rollout(self, q_init, trial_rewards, beta_match, beta_no_match):
        """Shared trial loop behind `simulate` and `RandomWalk`.

        Starts from a copy of ``q_init`` and walks trials 1..N-1; trial 0 only
        provides the previous state and reward for trial 1.
        """
        q_table = q_init.clone()
        loss = 0
        probability_data = 1
        for i in range(1, len(self.states)):
            state = self.states[i]
            action = self.actions[i]
            reward = trial_rewards[i - 1]  # reward looked up for the previous trial
            probabilities = torch.nn.functional.softmax(q_table, dim=1)
            selected_probability = probabilities[state].gather(0, action)
            probability_data *= selected_probability
            loss -= torch.log(selected_probability)

            # The rate is chosen by the previous trial's state (0 -> beta_no_match).
            beta = beta_no_match if self.states[i - 1] == 0 else beta_match
            # Update applied: Q <- Q + beta * (Q - reward).
            # NOTE(review): the sign is the opposite of the usual delta rule
            # Q + beta * (reward - Q) -- confirm this is intended.
            new_q_value = beta * (q_table[state, action].clone() - reward) + q_table[state, action].clone()
            q_table[state, action] = new_q_value

        return loss, q_table, probability_data

In [10]:
def optimize_participant(participant_index, states, actions, n_states, n_actions):
    """Fit one participant with `ParticipantOptimizer` and report the run time.

    Args:
        participant_index: identifier echoed in the log line and the result.
        states, actions: the participant's per-trial state and action indices.
        n_states, n_actions: dimensions of the reward matrix.

    Returns:
        ``(participant_index, best_rewards, beta_match, beta_no_match,
        execution_time)`` with both rates as Python floats and the elapsed time
        in seconds.
    """
    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    # This draw only seeds the constructor: optimize() begins by calling
    # find_initial_rewards(), which chooses its own starting matrix. The call is
    # kept so the sequence of random numbers consumed stays the same.
    initial_rewards = np.random.uniform(-3, 3, (n_states, n_actions))
    optimizer = ParticipantOptimizer(states, actions, n_states, n_actions, initial_rewards)
    optimizer.optimize()
    execution_time = time.perf_counter() - start_time
    print(f"Participant {participant_index} completed in {execution_time:.2f} seconds")
    return (
        participant_index,
        optimizer.best_rewards,
        optimizer.beta_match.item(),
        optimizer.beta_no_match.item(),
        execution_time,
    )


In [11]:
# Read the workbook once, then split it: the actions come from the 'Rating0'
# columns (shifted down by one inside get_datasets) and the states from the
# 'Fulfilled' columns.
data_loader = DataLoader(file_path='RETOS_BEBRASK_long.xlsx')
actions_dataset, states_dataset = data_loader.get_datasets(
    rating_keyword='Rating0',
    fulfilled_keyword='Fulfilled',
)


In [12]:


# Each row of the two datasets is treated as one subject's full trial sequence.
batches_per_subject = 5

# subject index -> list of ((train_states, train_actions), (test_states, test_actions)),
# one entry per fold.
cross_val_data = {}

subject_rows = zip(actions_dataset.values, states_dataset.values)
for subject_idx, (subject_actions, subject_states) in enumerate(subject_rows):
    n_trials = len(subject_actions)
    fold_len = n_trials // batches_per_subject

    # Folds are contiguous slices; the last one absorbs any remainder trials.
    fold_starts = [k * fold_len for k in range(batches_per_subject)]
    fold_stops = fold_starts[1:] + [n_trials]

    folds = []
    for lo, hi in zip(fold_starts, fold_stops):
        # Held-out slice for this fold.
        held_out = (subject_states[lo:hi], subject_actions[lo:hi])
        # Everything before and after the held-out slice, joined together.
        kept = (
            np.concatenate((subject_states[:lo], subject_states[hi:])),
            np.concatenate((subject_actions[:lo], subject_actions[hi:])),
        )
        folds.append((kept, held_out))

    cross_val_data[subject_idx] = folds

# `cross_val_data` now holds the train/test pairs consumed by the training loop.


In [13]:
class ParticipantOptimizerEval:
    """Train/test variant of the participant fit that records loss curves.

    ``optimize`` alternates between gradient descent on two sigmoid-constrained
    rates (reward matrix fixed) and a random-walk proposal step on the reward
    matrix, using the training sequence only. After each outer iteration the
    per-trial train and test losses are appended to ``epoch_train_losses`` and
    ``epoch_test_losses``.

    Tensor placement uses the module-level ``device``.
    """

    # Maximum number of reward-matrix proposals per random-walk step. Reaching
    # this count is what `optimize` treats as its stopping signal.
    _MAX_PROPOSALS = 50

    # Starting value of the unconstrained rates (sigmoid(-2.8) is about 0.06).
    _INITIAL_BETA_LOGIT = -2.8

    def __init__(self, train_states,train_actions,test_states,test_actions,n_states,n_actions, initial_rewards, Rmin=-3, Rmax=3):
        """Store the train/test sequences and the search bounds.

        Args:
            train_states, train_actions: per-trial indices used for fitting.
            test_states, test_actions: per-trial indices used for evaluation only.
            n_states, n_actions: dimensions of the reward matrix.
            initial_rewards: starting reward matrix; `optimize` replaces it via
                `find_initial_rewards`.
            Rmin, Rmax: bounds used when sampling and clipping reward matrices.
        """
        self.train_states = torch.tensor(train_states, dtype=torch.int64, device=device)
        self.train_actions = torch.tensor(train_actions, dtype=torch.int64, device=device)
        self.test_states = torch.tensor(test_states, dtype=torch.int64, device=device)
        self.test_actions = torch.tensor(test_actions, dtype=torch.int64, device=device)

        self.n_states = n_states
        self.n_actions = n_actions

        self.best_rewards = initial_rewards
        self.Rmin = Rmin
        self.Rmax = Rmax

        # Number of proposals tried in the most recent RandomWalk call.
        self.conv = 0

        # Per-trial rewards looked up from a reward matrix for each split.
        self.list_rewards_train = None
        self.list_rewards_test = None
        # Snapshot of list_rewards_train taken at the start of RandomWalk for rollback.
        self.old_rewards = self.list_rewards_train

        self.beta_no_match = None
        self.beta_match = None

        # One entry per completed outer iteration of `optimize`.
        self.epoch_train_losses = []
        self.epoch_test_losses = []

    def calculate_rewards_individual(self, states, actions, rewards_matrix):
        """Negative log-likelihood of ``actions`` under a static softmax policy.

        Each trial's action probabilities are the softmax of the reward-matrix
        row selected by that trial's state.

        Returns:
            ``(loss, rewards_tensor)`` where ``loss`` is the summed negative log
            probability and ``rewards_tensor`` is ``rewards_matrix`` as a tensor.
        """
        rewards_tensor = torch.tensor(rewards_matrix, device=device)
        probabilities = torch.nn.functional.softmax(rewards_tensor[states], dim=1)
        selected_probabilities = probabilities.gather(1, actions.unsqueeze(1)).squeeze(1)
        loss = -torch.sum(torch.log(selected_probabilities))
        return loss, rewards_tensor

    def find_initial_rewards(self, num_trials=200):
        """Random search, on the training split, for a starting reward matrix.

        Draws ``num_trials`` matrices uniformly in ``[Rmin, Rmax)`` and keeps the
        one with the lowest `calculate_rewards_individual` loss. If no candidate
        is produced (``num_trials`` is 0) the current ``best_rewards`` is kept.
        """
        best_loss = float('inf')
        best_rewards = None

        for _ in range(num_trials):
            trial_rewards = np.random.uniform(self.Rmin, self.Rmax, (self.n_states, self.n_actions))
            loss, _ = self.calculate_rewards_individual(self.train_states, self.train_actions, trial_rewards)
            loss_value = loss.item()
            if loss_value < best_loss:
                best_loss = loss_value
                best_rewards = trial_rewards

        if best_rewards is not None:
            self.best_rewards = best_rewards

    def _lookup_rewards(self, states, actions, rewards):
        """Return ``rewards[state, action]`` per trial as a float32 tensor."""
        if rewards is None:
            rewards = self.best_rewards
        reward_table = np.asarray(rewards)
        per_trial = reward_table[states.cpu().numpy(), actions.cpu().numpy()]
        return torch.tensor(per_trial, dtype=torch.float32)

    def _get_rewards_list_train(self, rewards =None):
        """Set ``list_rewards_train`` from ``rewards`` (default: ``best_rewards``)."""
        self.list_rewards_train = self._lookup_rewards(self.train_states, self.train_actions, rewards)

    def _get_rewards_list_test(self, rewards =None):
        """Set ``list_rewards_test`` from ``rewards`` (default: ``best_rewards``)."""
        self.list_rewards_test = self._lookup_rewards(self.test_states, self.test_actions, rewards)

    def optimize(self, num_iterations=100, learning_rate=0.05, num_epochs=125):
        """Run the alternating fit, recording train/test loss per outer iteration.

        Args:
            num_iterations: maximum number of outer iterations.
            learning_rate: Adam learning rate for the two rates.
            num_epochs: gradient steps per outer iteration (must be >= 1).

        After the call ``self.beta_match`` / ``self.beta_no_match`` hold the
        rates from the last gradient epoch; they stay ``None`` when
        ``num_iterations`` is 0.

        Raises:
            ValueError: if ``num_epochs`` is smaller than 1.
        """
        if num_epochs < 1:
            raise ValueError("num_epochs must be at least 1")

        self.find_initial_rewards()

        # Recorded losses are divided by the number of trials in each split.
        # (The summed loss has one term fewer, because the first trial only
        # supplies context for the second.) For splits of 36 and 9 trials this
        # equals the previously hard-coded divisors.
        n_train = max(len(self.train_states), 1)
        n_test = max(len(self.test_states), 1)

        beta_match = None
        beta_no_match = None
        for _ in range(num_iterations):
            # The rates are re-initialised on every outer iteration because the
            # reward matrix may have changed in the previous random-walk step.
            beta_unconstrained_match = torch.tensor(
                [self._INITIAL_BETA_LOGIT], requires_grad=True, device=device)
            beta_unconstrained_no_match = torch.tensor(
                [self._INITIAL_BETA_LOGIT], requires_grad=True, device=device)
            optimizer = optim.Adam(
                [beta_unconstrained_match, beta_unconstrained_no_match], lr=learning_rate)
            self._get_rewards_list_train()
            for _ in range(num_epochs):
                optimizer.zero_grad()
                # The sigmoid keeps both rates inside (0, 1).
                beta_match = torch.sigmoid(beta_unconstrained_match)
                beta_no_match = torch.sigmoid(beta_unconstrained_no_match)

                loss, _, probability_data = self.simulate(
                    self.train_states, self.train_actions, self.list_rewards_train,
                    beta_match, beta_no_match)
                loss.backward()

                optimizer.step()
            self.epoch_train_losses.append(loss.item() / n_train)

            self._get_rewards_list_test()

            # Evaluation only: no gradient is needed for the held-out sequence.
            with torch.no_grad():
                test_loss, _, _ = self.simulate(
                    self.test_states, self.test_actions, self.list_rewards_test,
                    beta_match, beta_no_match)
            self.epoch_test_losses.append(float(test_loss) / n_test)

            self.RandomWalk(0.05, beta_match, beta_no_match, probability_data)
            if self.conv == self._MAX_PROPOSALS:
                break

        self.beta_match = beta_match
        self.beta_no_match = beta_no_match

    def perturb_rewards(self, scale=0.02):
        """Return ``best_rewards`` plus uniform noise in ``[-scale, scale)``, clipped to the bounds."""
        perturbation = np.random.uniform(-scale, scale, self.best_rewards.shape)
        new_rewards = np.clip(self.best_rewards + perturbation, self.Rmin, self.Rmax)
        return new_rewards

    def RandomWalk(self, distance, beta_match, beta_no_match, old_probability):
        """Propose perturbed reward matrices until one is accepted or the limit is hit.

        A proposal's training-sequence probability is compared with
        ``old_probability``. The ratio is halved when it is below 1, and the
        proposal is accepted when a uniform draw falls below ``min(1, ratio)``.
        On acceptance ``best_rewards`` and ``list_rewards_train`` describe the
        accepted matrix; when every proposal is rejected ``list_rewards_train``
        is rolled back so it keeps matching ``best_rewards``. ``self.conv`` ends
        up as the number of proposals tried.
        """
        accepted = False
        self.old_rewards = self.list_rewards_train
        self.conv = 0
        while not accepted and self.conv < self._MAX_PROPOSALS:
            candidate = self.perturb_rewards(distance)
            self._get_rewards_list_train(candidate)

            # Only the probability value is needed here, so no autograd graph is built.
            with torch.no_grad():
                q_init = torch.tensor(candidate, device=device)
                _, _, new_probability_data = self._rollout(
                    q_init, self.train_states, self.train_actions,
                    self.list_rewards_train, beta_match, beta_no_match)
                ratio = new_probability_data / old_probability

            prob = ratio / 2 if ratio < 1 else ratio

            if np.random.random() < min(1, prob):
                accepted = True
                self.best_rewards = candidate
            self.conv += 1

        if not accepted:
            # No proposal survived: restore the rewards of the matrix still in use.
            self.list_rewards_train = self.old_rewards

    def simulate(self, states,actions,rewards,beta_match, beta_no_match):
        """Replay a sequence starting from ``best_rewards`` as the value table.

        Args:
            states, actions: per-trial index tensors of the sequence to replay.
            rewards: per-trial rewards aligned with ``states`` / ``actions``.
            beta_match, beta_no_match: the two rates.

        Returns:
            ``(loss, Q_updated, probability_data)``: the summed negative log
            probability of the observed actions, the value table after the last
            trial, and the product of the per-trial action probabilities.
        """
        q_init = torch.tensor(self.best_rewards, requires_grad=True, device=device)
        return self._rollout(q_init, states, actions, rewards, beta_match, beta_no_match)

    def _rollout(self, q_init, states, actions, rewards, beta_match, beta_no_match):
        """Shared trial loop behind `simulate` and `RandomWalk`.

        Starts from a copy of ``q_init`` and walks trials 1..N-1; trial 0 only
        provides the previous state and reward for trial 1.
        """
        q_table = q_init.clone()
        loss = 0
        probability_data = 1

        for i in range(1, len(states)):
            state = states[i]
            action = actions[i]
            reward = rewards[i - 1]  # reward looked up for the previous trial
            probabilities = torch.nn.functional.softmax(q_table, dim=1)
            selected_probability = probabilities[state].gather(0, action)
            probability_data *= selected_probability
            loss -= torch.log(selected_probability)

            # The rate is chosen by the previous trial's state (0 -> beta_no_match).
            beta = beta_no_match if states[i - 1] == 0 else beta_match
            # Update applied: Q <- Q + beta * (Q - reward).
            # NOTE(review): the sign is the opposite of the usual delta rule
            # Q + beta * (reward - Q) -- confirm this is intended.
            new_q_value = beta * (q_table[state, action].clone() - reward) + q_table[state, action].clone()
            q_table[state, action] = new_q_value

        return loss, q_table, probability_data

In [35]:
# One entry per trained subject: [per-fold train curves, per-fold test curves].
subject_losses = []
for subject, data_splits in cross_val_data.items():
    # Stop as soon as subject key 1 is reached, so only the subjects listed
    # before it are trained.
    if subject == 1:
        break
    print(f"Training for Subject {subject}")

    train_curves, test_curves = [], []
    for (fit_states, fit_actions), (eval_states, eval_actions) in data_splits:
        # A 2-state x 4-action starting matrix drawn uniformly from [-3, 3).
        start_matrix = np.random.uniform(-3, 3, (2, 4))
        fold_model = ParticipantOptimizerEval(
            fit_states,
            fit_actions,
            eval_states,
            eval_actions,
            n_states=2,
            n_actions=4,
            initial_rewards=start_matrix,
        )
        fold_model.optimize()
        train_curves.append(fold_model.epoch_train_losses)
        test_curves.append(fold_model.epoch_test_losses)

    subject_losses.append([train_curves, test_curves])
        



Training for Subject 0


In [36]:
import plotly.graph_objects as go

def _mean_across_folds(fold_histories):
    """Average loss histories position-by-position across folds.

    Folds can stop after different numbers of iterations (the optimizer may
    break out early), so shorter histories are padded with NaN and ignored by
    the mean instead of being stacked into a ragged array.
    """
    longest = max((len(history) for history in fold_histories), default=0)
    padded = np.full((len(fold_histories), longest), np.nan)
    for row, history in enumerate(fold_histories):
        padded[row, :len(history)] = history
    return np.nanmean(padded, axis=0)


# One figure per subject that was actually trained.
for i in range(len(subject_losses)):
    print(f'Subject {i}')
    # subject_losses[i] is [per-fold train curves, per-fold test curves].
    train_losses = _mean_across_folds(subject_losses[i][0])
    test_losses = _mean_across_folds(subject_losses[i][1])

    fig = go.Figure()
    fig.add_trace(go.Scatter(y=train_losses, mode='lines', name='Train Loss'))
    fig.add_trace(go.Scatter(y=test_losses, mode='lines', name='Test Loss'))

    # White background with black text everywhere so the figure prints cleanly.
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        title='Loss Evolution Over Epochs',
        xaxis_title='Epoch',
        yaxis_title='Loss',
        legend_title='Legend',
        title_font_color="black",
        font=dict(color="black"),  # Sets global font color to black, affecting most text elements
        xaxis=dict(
            title_font=dict(color="black"),
            tickfont=dict(color="black")  # Sets x-axis tick labels to black
        ),
        yaxis=dict(
            title_font=dict(color="black"),
            tickfont=dict(color="black")  # Sets y-axis tick labels to black
        ),
        legend_title_font_color="black",
        legend=dict(
            font=dict(color="black")  # Sets legend text to black
        )
    )

    fig.show()
    

Subject 0
