In [None]:
# Import necessary libraries
from recsim.environments import interest_evolution 
import tensorflow as tf
import numpy as np

# Build the interest-evolution simulation environment from a small config
env_config = dict(
    slate_size=2,
    seed=0,
    num_candidates=10,
    resample_documents=True,
)
env = interest_evolution.create_environment(env_config)

# Define the neural network for estimating Q-values
num_features = env.observation_space['user'].shape[0]
# One output per candidate document. RecSim's action space is a MultiDiscrete
# with one entry per slate position, so `env.action_space.shape[0]` would give
# the slate length rather than the number of documents that can be selected.
num_actions = env_config['num_candidates']

# Maps the user feature vector to one Q-value estimate per action
q_network = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(num_features,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(num_actions)
])

# Define the neural network for slate selection
# Maps the Q-value vector to unnormalised scores (logits) over the same actions
slate_network = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(num_actions,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(num_actions)
])

# Define the optimizer and loss function for Q-values
q_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
q_loss_fn = tf.keras.losses.MeanSquaredError()

# Define the optimizer and loss function for slate selection
slate_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
# The slate network's last layer has no softmax, so the loss must be told it
# receives logits; the default (from_logits=False) would treat them as
# probabilities.
slate_loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Run simulation experiments: act greedily with the two networks and update
# both after every environment step. `rewardset` collects one total reward per
# episode.
num_steps = 50
num_episodes = 1000
rewardset = []
for i in range(num_episodes):
    obs = env.reset()
    done = False
    t = 0
    rewards = []

    while not done and t < num_steps:
        # Keep the pre-step user features: `obs` is overwritten by env.step(),
        # and the Q-update below must be evaluated on the state in which the
        # action was chosen, not on the following state.
        user_state = obs['user'][np.newaxis]

        # Estimate one Q-value per action for the current user state
        q_values = q_network(user_state)

        # Greedily pick the action the slate network scores highest
        slate_logits = slate_network(q_values)
        slate = tf.argmax(slate_logits, axis=-1)
        action = int(slate.numpy()[0])

        # Take action and observe reward and next observation.
        # RecSim iterates over the slate to look up candidate documents, so it
        # has to be a sequence of document indices; passing a dict would make
        # it use the dict's keys and ignore the selected document.
        obs, reward, done, _ = env.step([action])

        # Regress Q(state, action) towards the observed reward. All other
        # entries keep the network's own predictions, so they add no error.
        target_q_values = q_values.numpy().copy()
        target_q_values[0, action] = reward
        with tf.GradientTape() as q_tape:
            q_preds = q_network(user_state)
            q_loss = q_loss_fn(target_q_values, q_preds)
        q_gradients = q_tape.gradient(q_loss, q_network.trainable_variables)
        q_optimizer.apply_gradients(zip(q_gradients, q_network.trainable_variables))

        # Train the slate selection network with cross-entropy against its own
        # greedy choice.
        # NOTE(review): the loss is not weighted by the reward, so this is not
        # a true policy-gradient update - confirm whether that is intended.
        with tf.GradientTape() as slate_tape:
            slate_logits = slate_network(q_values)
            slate_loss = slate_loss_fn(slate, slate_logits)
        slate_gradients = slate_tape.gradient(slate_loss, slate_network.trainable_variables)
        slate_optimizer.apply_gradients(zip(slate_gradients, slate_network.trainable_variables))

        rewards.append(reward)
        t += 1

    episode_reward = np.sum(rewards)
    print('Episode:', i, 'Total reward:', episode_reward)
    rewardset.append(episode_reward)


In [None]:
# Baseline: replay the same number of episodes with a fixed slate and no
# learning. `rewardset2` collects one total reward per episode.
rewardset2 = []
# Always recommend the first two candidate documents
slate = [0, 1]
for i in range(num_episodes):
    obs = env.reset()
    done = False
    t = 0
    rewards = []

    while not done and t < num_steps:
        # Take a fixed action and observe reward and next observation.
        # The slate is passed as a plain list of document indices; RecSim
        # iterates over it, so wrapping it in a dict would present only the
        # dict's keys.
        obs, reward, done, _ = env.step(slate)
        rewards.append(reward)
        t += 1

    episode_reward = np.sum(rewards)
    print('Episode:', i, 'Total reward:', episode_reward)
    rewardset2.append(episode_reward)

In [None]:
window_size = 1

# Moving average of the per-episode totals: one value per full window of
# `window_size` consecutive episodes, rounded to two decimals.
rewardset_averages = [
    round(sum(rewardset[start:start + window_size]) / window_size, 2)
    for start in range(len(rewardset) - window_size + 1)
]
  

In [None]:
# Moving average of the baseline's per-episode totals, using the same
# `window_size`: one value per full window, rounded to two decimals.
rewardset2_averages = [
    round(sum(rewardset2[start:start + window_size]) / window_size, 2)
    for start in range(len(rewardset2) - window_size + 1)
]
  

In [None]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [12.50, 10.50]

# Plot both smoothed reward curves on one labelled axis so the learned policy
# and the fixed-slate baseline can be told apart.
fig, ax = plt.subplots()
ax.plot(rewardset_averages, label="Learned slate policy")
ax.plot(rewardset2_averages, label="Fixed slate [0, 1]")
ax.set_xlabel("Episode")
ax.set_ylabel("Total reward per episode (moving average)")
ax.set_title("Learned policy vs. fixed-slate baseline")
ax.legend()
plt.show()