In [11]:
# Import necessary libraries
from recsim.environments import long_term_satisfaction
import tensorflow as tf
import numpy as np

# Define the simulation environment

env_config = {'slate_size': 2,
              'seed': 0,
              'num_candidates': 15,
              'resample_documents': True}
env = long_term_satisfaction.create_environment(env_config)

# Define the neural network for estimating Q-values
num_features = env.observation_space['user'].shape[0]
num_actions = env.action_space.shape[0]

q_network = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(num_features,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(num_actions)
])

# Define the neural network for slate selection
slate_network = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(num_features,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(num_actions)
])

# Define the optimizer and loss function for Q-values
q_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
q_loss_fn = tf.keras.losses.MeanSquaredError()

# Define the optimizer and loss function for slate selection
slate_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
slate_loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

# Run simulation experiments
num_steps = 1000
num_episodes = 1000
rewardset=[]
for i in range(num_episodes):
    obs = env.reset()
    done = False
    t = 0
    rewards = []

    while not done and t < num_steps:
        # Compute Q-values for all possible slates
        q_values = q_network(obs['user'][np.newaxis])

        # Select the slate with the highest Q-value using the policy network
        slate_logits = slate_network(obs['user'][np.newaxis])
        slate = tf.argmax(slate_logits, axis=-1)

        # Take action and observe reward and next observation
        obs, reward, done, _ = env.step({0: slate.numpy()[0]})

        # Compute target Q-values and train Q-value network
        target_q_values = q_values.numpy().copy()
        target_q_values[0, slate.numpy()[0]] = reward
        with tf.GradientTape() as q_tape:
            q_preds = q_network(obs['user'][np.newaxis])
            q_loss = q_loss_fn(target_q_values, q_preds)
        q_gradients = q_tape.gradient(q_loss, q_network.trainable_variables)
        q_optimizer.apply_gradients(zip(q_gradients, q_network.trainable_variables))

        # Train slate selection network using policy gradient
        with tf.GradientTape() as slate_tape:
            slate_logits = slate_network(obs['user'][np.newaxis])
            slate_loss = slate_loss_fn(slate, slate_logits)
        slate_gradients = slate_tape.gradient(slate_loss, slate_network.trainable_variables)
        slate_optimizer.apply_gradients(zip(slate_gradients, slate_network.trainable_variables))

        rewards.append(reward)
        t += 1

    print('Episode:', i, 'Total reward:', np.sum(rewards))
    rewardset.append( np.sum(rewards))


Episode: 0 Total reward: 967.7289165847804
Episode: 1 Total reward: 1003.6070337719854
Episode: 2 Total reward: 971.6753493706269
Episode: 3 Total reward: 975.0470619727462
Episode: 4 Total reward: 987.9559467577774
Episode: 5 Total reward: 1035.6122368815973
Episode: 6 Total reward: 831.1515531879124
Episode: 7 Total reward: 979.4288563155893
Episode: 8 Total reward: 1142.8023960893433
Episode: 9 Total reward: 939.5833678212085
Episode: 10 Total reward: 1069.3930305123652
Episode: 11 Total reward: 955.6141897918654
Episode: 12 Total reward: 1278.0104404833241
Episode: 13 Total reward: 1068.5820154515357
Episode: 14 Total reward: 840.7090774733614
Episode: 15 Total reward: 836.5567379450723
Episode: 16 Total reward: 949.0834238057274
Episode: 17 Total reward: 787.0577577064873
Episode: 18 Total reward: 899.3195418783839
Episode: 19 Total reward: 868.069974573341
Episode: 20 Total reward: 1098.256572192914
Episode: 21 Total reward: 849.7325188498234
Episode: 22 Total reward: 900.9568302

Episode: 184 Total reward: 799.7938320286044
Episode: 185 Total reward: 1470.999471952106
Episode: 186 Total reward: 826.164888879901
Episode: 187 Total reward: 827.9885740076478
Episode: 188 Total reward: 1131.3634493082404
Episode: 189 Total reward: 1079.3782909108131
Episode: 190 Total reward: 903.6776618248922
Episode: 191 Total reward: 892.3351214917956
Episode: 192 Total reward: 1121.972457963966
Episode: 193 Total reward: 1190.2119140326147
Episode: 194 Total reward: 1120.188031739046
Episode: 195 Total reward: 1292.0020667327576
Episode: 196 Total reward: 767.6037732422942
Episode: 197 Total reward: 980.1443601552961
Episode: 198 Total reward: 1154.8261637864273
Episode: 199 Total reward: 951.6471881243272
Episode: 200 Total reward: 1036.3652458611864
Episode: 201 Total reward: 1019.2220788904851
Episode: 202 Total reward: 1024.289015892677
Episode: 203 Total reward: 1281.434030285245
Episode: 204 Total reward: 812.2242738840586
Episode: 205 Total reward: 1305.1719610521477
Epi

Episode: 366 Total reward: 1208.8310173657887
Episode: 367 Total reward: 1334.3253752294295
Episode: 368 Total reward: 717.5558452444086
Episode: 369 Total reward: 666.1993744658987
Episode: 370 Total reward: 1067.652739535203
Episode: 371 Total reward: 830.1298728053566
Episode: 372 Total reward: 832.248426669909
Episode: 373 Total reward: 1051.653651937517
Episode: 374 Total reward: 880.3327641588781
Episode: 375 Total reward: 996.1864680166258
Episode: 376 Total reward: 991.5737194456232
Episode: 377 Total reward: 930.5888357149365
Episode: 378 Total reward: 947.1302187950522
Episode: 379 Total reward: 845.69417008748
Episode: 380 Total reward: 694.6957020562396
Episode: 381 Total reward: 1216.3535198456032
Episode: 382 Total reward: 977.0942677619764
Episode: 383 Total reward: 932.2657607271403
Episode: 384 Total reward: 898.4186102907371
Episode: 385 Total reward: 797.783892049894
Episode: 386 Total reward: 1077.944780736422
Episode: 387 Total reward: 999.7639255325803
Episode: 38

Episode: 548 Total reward: 926.2055379562835
Episode: 549 Total reward: 734.1935919055192
Episode: 550 Total reward: 908.3933326243117
Episode: 551 Total reward: 870.3580203619335
Episode: 552 Total reward: 786.9212164617729
Episode: 553 Total reward: 665.9450123116454
Episode: 554 Total reward: 932.8803907553662
Episode: 555 Total reward: 879.8883912914622
Episode: 556 Total reward: 773.7673035053515
Episode: 557 Total reward: 752.7597120264666
Episode: 558 Total reward: 1012.293484099903
Episode: 559 Total reward: 995.5336759808325
Episode: 560 Total reward: 1086.9571302613836
Episode: 561 Total reward: 698.4849860596586
Episode: 562 Total reward: 1088.1379623459284
Episode: 563 Total reward: 980.1205437424676
Episode: 564 Total reward: 932.7101335201938
Episode: 565 Total reward: 868.6806220503427
Episode: 566 Total reward: 1222.1781514759778
Episode: 567 Total reward: 950.751286061939
Episode: 568 Total reward: 1049.5844151344709
Episode: 569 Total reward: 697.2627912820799
Episode

Episode: 730 Total reward: 845.4463750712487
Episode: 731 Total reward: 834.8526363868483
Episode: 732 Total reward: 1079.3717112836225
Episode: 733 Total reward: 1222.0474383033054
Episode: 734 Total reward: 914.0920720334639
Episode: 735 Total reward: 1214.956137120545
Episode: 736 Total reward: 1112.8611290650977
Episode: 737 Total reward: 913.8063129295756
Episode: 738 Total reward: 885.114361366577
Episode: 739 Total reward: 1000.7739956879503
Episode: 740 Total reward: 782.7725763768625
Episode: 741 Total reward: 619.7177346305574
Episode: 742 Total reward: 643.443415678061
Episode: 743 Total reward: 939.5864494293237
Episode: 744 Total reward: 953.8984897381791
Episode: 745 Total reward: 1326.478582185556
Episode: 746 Total reward: 942.6278818923356
Episode: 747 Total reward: 1139.6834472514022
Episode: 748 Total reward: 950.1665078703877
Episode: 749 Total reward: 796.2210935263589
Episode: 750 Total reward: 1015.253391466605
Episode: 751 Total reward: 947.757708886727
Episode:

Episode: 912 Total reward: 1215.5353120693003
Episode: 913 Total reward: 783.986219302037
Episode: 914 Total reward: 1008.4126701481848
Episode: 915 Total reward: 1167.3243492720921
Episode: 916 Total reward: 993.411519558312
Episode: 917 Total reward: 831.2881847193146
Episode: 918 Total reward: 906.074562883321
Episode: 919 Total reward: 617.6252784083042
Episode: 920 Total reward: 707.4046349773856
Episode: 921 Total reward: 846.6488366661953
Episode: 922 Total reward: 1101.1889554588986
Episode: 923 Total reward: 770.078991998817
Episode: 924 Total reward: 1342.8322997973685
Episode: 925 Total reward: 1046.247146038502
Episode: 926 Total reward: 858.2919625560138
Episode: 927 Total reward: 771.8033581909424
Episode: 928 Total reward: 921.0431638352167
Episode: 929 Total reward: 959.95655899526
Episode: 930 Total reward: 1042.1750146287184
Episode: 931 Total reward: 1198.4223454815003
Episode: 932 Total reward: 922.550231642072
Episode: 933 Total reward: 791.8595881322099
Episode: 9

In [13]:
rewardset2 = []
for i in range(num_episodes):
    obs = env.reset()
    done = False
    t = 0
    rewards = []

    while not done and t < num_steps:
        # Compute Q-values for all possible slates
        q_values = q_network(obs['user'][np.newaxis])

        # Choose the slate with the highest NeuralUCB score
        # slate_scores = q_values.numpy().squeeze() + exploration_fn(0, 1, t)
        # slate = np.argmax(slate_scores)
        slate = [0,1]
        # Take action and observe reward and next observation
        obs, reward, done, _ = env.step({0: slate})

        # Compute target Q-values and train Q-value network
#         target_q_values = q_values.numpy().copy()
#         target_q_values[0, slate] = reward
#         with tf.GradientTape() as q_tape:
#             q_preds = q_network(obs['user'][np.newaxis])
#             q_loss = q_loss_fn(target_q_values, q_preds)
#         q_gradients = q_tape.gradient(q_loss, q_network.trainable_variables)
#         q_optimizer.apply_gradients(zip(q_gradients, q_network.trainable_variables))

        rewards.append(reward)
        t += 1

    print('Episode:', i, 'Total reward:', np.sum(rewards))
    rewardset2.append(np.sum(rewards))

Episode: 0 Total reward: 984.831629821338
Episode: 1 Total reward: 1029.303294572186
Episode: 2 Total reward: 877.8228305112524
Episode: 3 Total reward: 722.6468533171354
Episode: 4 Total reward: 874.2501250175957
Episode: 5 Total reward: 888.6936899119104
Episode: 6 Total reward: 1077.1663534883442
Episode: 7 Total reward: 1031.8381476306604
Episode: 8 Total reward: 1124.130184305596
Episode: 9 Total reward: 871.0558106816998
Episode: 10 Total reward: 958.4926061073642
Episode: 11 Total reward: 941.6821729991508
Episode: 12 Total reward: 1002.1573042621907
Episode: 13 Total reward: 745.834056009102
Episode: 14 Total reward: 967.5763647762195
Episode: 15 Total reward: 879.4334248238071
Episode: 16 Total reward: 1080.232003374801
Episode: 17 Total reward: 985.1907354185738
Episode: 18 Total reward: 1045.3430840220426
Episode: 19 Total reward: 974.4796231299604
Episode: 20 Total reward: 815.2398067944087
Episode: 21 Total reward: 885.1460297262578
Episode: 22 Total reward: 1147.193699877

Episode: 185 Total reward: 1216.871429467511
Episode: 186 Total reward: 770.7659747458529
Episode: 187 Total reward: 815.3584803898215
Episode: 188 Total reward: 971.9397705066269
Episode: 189 Total reward: 776.2483927993039
Episode: 190 Total reward: 939.4764688163899
Episode: 191 Total reward: 1140.8623091080926
Episode: 192 Total reward: 1005.9305033826724
Episode: 193 Total reward: 1138.1101333975391
Episode: 194 Total reward: 857.602117761849
Episode: 195 Total reward: 901.7455686179998
Episode: 196 Total reward: 1046.7334326791783
Episode: 197 Total reward: 857.9220943896723
Episode: 198 Total reward: 815.5236275294756
Episode: 199 Total reward: 959.2933958029987
Episode: 200 Total reward: 1085.638195147914
Episode: 201 Total reward: 1152.8257258034698
Episode: 202 Total reward: 1048.8847746716617
Episode: 203 Total reward: 830.1428346759543
Episode: 204 Total reward: 933.0577809990651
Episode: 205 Total reward: 1366.155140061571
Episode: 206 Total reward: 832.7900183113122
Episo

Episode: 367 Total reward: 982.1038617783231
Episode: 368 Total reward: 1018.7876202380453
Episode: 369 Total reward: 898.9571963578409
Episode: 370 Total reward: 842.3055457848789
Episode: 371 Total reward: 818.9870364689874
Episode: 372 Total reward: 689.5259438236949
Episode: 373 Total reward: 737.276685004702
Episode: 374 Total reward: 1017.0256977593737
Episode: 375 Total reward: 839.2898059550012
Episode: 376 Total reward: 706.0919685174382
Episode: 377 Total reward: 843.0364271038202
Episode: 378 Total reward: 1250.2314263069304
Episode: 379 Total reward: 905.2305053041083
Episode: 380 Total reward: 848.7738348397195
Episode: 381 Total reward: 899.9189082286679
Episode: 382 Total reward: 940.002323193353
Episode: 383 Total reward: 805.3761033058922
Episode: 384 Total reward: 1138.9935931276843
Episode: 385 Total reward: 891.9361718036961
Episode: 386 Total reward: 953.6514341427988
Episode: 387 Total reward: 1207.7600880211614
Episode: 388 Total reward: 1125.8130514138616
Episod

Episode: 549 Total reward: 970.5538159890311
Episode: 550 Total reward: 851.5664910612896
Episode: 551 Total reward: 846.0244730659052
Episode: 552 Total reward: 1055.093144812828
Episode: 553 Total reward: 931.4531765349674
Episode: 554 Total reward: 1129.0972289321521
Episode: 555 Total reward: 891.5293666562488
Episode: 556 Total reward: 1050.658283404628
Episode: 557 Total reward: 916.1600063400206
Episode: 558 Total reward: 859.4883510492432
Episode: 559 Total reward: 1105.9918073541655
Episode: 560 Total reward: 953.4500524554294
Episode: 561 Total reward: 1044.5693113528735
Episode: 562 Total reward: 677.0387666432127
Episode: 563 Total reward: 1106.332424673234
Episode: 564 Total reward: 984.5985535139657
Episode: 565 Total reward: 853.6956174621914
Episode: 566 Total reward: 988.654542101699
Episode: 567 Total reward: 760.7655892526053
Episode: 568 Total reward: 831.5435389744155
Episode: 569 Total reward: 1041.432533495062
Episode: 570 Total reward: 1005.0665037454271
Episode

Episode: 731 Total reward: 817.8966195069729
Episode: 732 Total reward: 947.9148932897735
Episode: 733 Total reward: 972.0600804376896
Episode: 734 Total reward: 1067.8437636834778
Episode: 735 Total reward: 938.3000992765616
Episode: 736 Total reward: 836.7264004273509
Episode: 737 Total reward: 912.7150037005966
Episode: 738 Total reward: 1037.2174337552333
Episode: 739 Total reward: 923.5993037002463
Episode: 740 Total reward: 1273.781410932041
Episode: 741 Total reward: 657.4656578818264
Episode: 742 Total reward: 1008.6916944548858
Episode: 743 Total reward: 817.7520758636856
Episode: 744 Total reward: 970.2636008202611
Episode: 745 Total reward: 774.3553760888016
Episode: 746 Total reward: 867.6110196377565
Episode: 747 Total reward: 853.0295493178517
Episode: 748 Total reward: 971.9284565742469
Episode: 749 Total reward: 1258.7571706322005
Episode: 750 Total reward: 1030.730106337774
Episode: 751 Total reward: 1306.5700735921403
Episode: 752 Total reward: 965.3267935101876
Episo

Episode: 913 Total reward: 1196.6838530359207
Episode: 914 Total reward: 904.7302742412284
Episode: 915 Total reward: 887.2228760542697
Episode: 916 Total reward: 772.3757257071507
Episode: 917 Total reward: 938.0245327387086
Episode: 918 Total reward: 1067.406092155098
Episode: 919 Total reward: 1034.6509884528089
Episode: 920 Total reward: 942.5949674702881
Episode: 921 Total reward: 876.3780547790922
Episode: 922 Total reward: 1070.6098491103967
Episode: 923 Total reward: 906.6174121380876
Episode: 924 Total reward: 895.4663475725847
Episode: 925 Total reward: 1081.9600570319074
Episode: 926 Total reward: 1092.7084771939922
Episode: 927 Total reward: 1029.580192404698
Episode: 928 Total reward: 940.6612838535812
Episode: 929 Total reward: 902.4709185518245
Episode: 930 Total reward: 1220.5043933244647
Episode: 931 Total reward: 776.7468640305293
Episode: 932 Total reward: 831.9951862536159
Episode: 933 Total reward: 858.8394034664857
Episode: 934 Total reward: 808.0485386723166
Epis

In [14]:
window_size = 10
i = 0
rewardset_averages = []

while i < len(rewardset) - window_size + 1:
    
    # Store elements from i to i+window_size
    # in list to get the current window
    window = rewardset[i : i + window_size]
  
    # Calculate the average of current window
    window_average = round(sum(window) / window_size, 2)
      
    # Store the average of current
    # window in moving average list
    rewardset_averages.append(window_average)
      
    # Shift window to right by one position
    i += 1
  

In [15]:
window_size = 10
i = 0
rewardset2_averages = []

while i < len(rewardset2) - window_size + 1:
    
    # Store elements from i to i+window_size
    # in list to get the current window
    window = rewardset2[i : i + window_size]
  
    # Calculate the average of current window
    window_average = round(sum(window) / window_size, 2)
      
    # Store the average of current
    # window in moving average list
    rewardset2_averages.append(window_average)
      
    # Shift window to right by one position
    i += 1
  

In [None]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [12.50, 10.50]

#plt.plot(rewardset)
#plt.plot(rewardset2)
plt.plot(rewardset_averages)
plt.plot(rewardset2_averages)
plt.show()