# Exercise 2.4 - Nonstationary Greedy

The goal of this task is to create a nonstationary 10-armed bandit problem and to see how two different epsilon-greedy algorithms with epsilon = 0.1 perform, where the first algorithm has an adaptive step-size (1/n) and the second algorithm has a constant step-size. 

We have 10 arms, where each ones action values q*(a) was drawn i.i.d from a normal distribution with mean 0 and variance 1. When we then select an action A_t the reward is calculated from a normal distribution with mean q*(A_t) and variance 1. After each step(so after each time that we chose an action) we update the values of all arms. With a probability of 0.5 we increase them by 0.1 and with probability 0.5 we decrease them by 0.1(random walk). 

In [188]:
import numpy as np
import matplotlib.pyplot as plt

## Stationary case

In [189]:
# initialize data structures
action_values = np.zeros(10)
initial_action_values = np.zeros(10)
action_values_count = np.ones(10)
average_rewards1 = np.zeros(1000)


total_reward = 0

# number of loop iterations 
a = 1
b = 10

# initialize action_values with normal distribution around 0 and variance 1
for i in range(0,10): 
    action_values[i] = np.random.normal(0,1,1)
initial_action_values = action_values

#######################################################

# implement greedy learning algorithm with non-changing rewards(stationarity) and we do a thousand steps 

# First case: adaptive stepsize
for z in range(a,b):
    for i in range(0,1000):
        epsilon = np.random.choice(2, 1, p=[0.1, 0.9])

        # we exploit
        if(epsilon[0] == 1): 
            #find maximal action value 
            max_index = np.argmax(action_values)
            max_reward = np.random.normal(initial_action_values[max_index],1,1)
            print("z: ",z," i: ",i ,"Max_reward: ", max_reward)
            action_values[max_index] = action_values[max_index] + (1.0/action_values_count[max_index])*(max_reward - action_values[max_index])
            total_reward += max_reward
            action_values_count[max_index] +=1
            
            #lets iteratively calculate the mean of the rewards in each timestep of 1000 timesteps
            # repeated over 2000 rounds
            average_rewards1[i] = average_rewards1[i] + (1.0/z)*(max_reward-average_rewards1[i])
            
        # we explore        
        else: 
            k = np.random.choice(10, 1)
            action = initial_action_values[k]
            reward = np.random.normal(action,1,1)
            print("z: ",z," i: ",i ,"reward: ", reward)
            action_values[k] = action_values[k] + (1.0/action_values_count[k])*(reward - action_values[k])
            total_reward += reward
            action_values_count[k] +=1
            
            #lets iteratively calculate the mean of the rewards in each timestep of 1000 timesteps
            # repeated over 2000 rounds
            average_rewards1[i] = average_rewards1[i] + (1.0/z)*(reward-average_rewards1[i])
            
        max_action = 0
                
    # re-initialize action_values with normal distribution around 0 and variance 1 after each one of the 2000 rounds
    for l in range(0,10): 
        action_values[l] = np.random.normal(0,1,1)
    initial_action_values = action_values
    # re-initialize action_values_counts
    action_values_count = np.ones(10)
 

z:  1  i:  0 reward:  [ 2.21161917]
z:  1  i:  1 reward:  [ 2.02461501]
z:  1  i:  2 Max_reward:  [ 2.61700331]
z:  1  i:  3 Max_reward:  [ 2.55431556]
z:  1  i:  4 Max_reward:  [ 3.28918761]
z:  1  i:  5 Max_reward:  [ 2.16406901]
z:  1  i:  6 Max_reward:  [ 2.13401499]
z:  1  i:  7 Max_reward:  [ 3.65576005]
z:  1  i:  8 Max_reward:  [ 3.01529839]
z:  1  i:  9 Max_reward:  [ 4.07325543]
z:  1  i:  10 Max_reward:  [ 2.64749005]
z:  1  i:  11 reward:  [-0.88903526]
z:  1  i:  12 Max_reward:  [ 2.88301742]
z:  1  i:  13 Max_reward:  [ 3.07998181]
z:  1  i:  14 Max_reward:  [ 2.47002377]
z:  1  i:  15 Max_reward:  [ 3.57056201]
z:  1  i:  16 Max_reward:  [ 4.30658722]
z:  1  i:  17 Max_reward:  [ 2.03858655]
z:  1  i:  18 Max_reward:  [ 1.92701603]
z:  1  i:  19 Max_reward:  [ 2.12454158]
z:  1  i:  20 Max_reward:  [ 3.60525888]
z:  1  i:  21 Max_reward:  [ 3.11560412]
z:  1  i:  22 reward:  [ 0.40548444]
z:  1  i:  23 Max_reward:  [ 4.56659223]
z:  1  i:  24 Max_reward:  [ 2.55649198]
z

z:  1  i:  255 Max_reward:  [ 1.58479795]
z:  1  i:  256 Max_reward:  [ 4.26197573]
z:  1  i:  257 reward:  [ 2.95160763]
z:  1  i:  258 Max_reward:  [ 4.13984206]
z:  1  i:  259 Max_reward:  [ 2.65760368]
z:  1  i:  260 reward:  [ 1.57215413]
z:  1  i:  261 Max_reward:  [ 3.11876082]
z:  1  i:  262 Max_reward:  [ 4.35653018]
z:  1  i:  263 Max_reward:  [ 2.63885101]
z:  1  i:  264 Max_reward:  [ 3.8678634]
z:  1  i:  265 reward:  [ 1.04329485]
z:  1  i:  266 Max_reward:  [ 2.87205343]
z:  1  i:  267 reward:  [ 1.66418552]
z:  1  i:  268 reward:  [-0.91017]
z:  1  i:  269 Max_reward:  [ 3.89322389]
z:  1  i:  270 Max_reward:  [ 2.52316969]
z:  1  i:  271 Max_reward:  [ 3.56718795]
z:  1  i:  272 Max_reward:  [ 5.05209344]
z:  1  i:  273 Max_reward:  [ 2.17546667]
z:  1  i:  274 Max_reward:  [ 3.89616396]
z:  1  i:  275 Max_reward:  [ 1.41676221]
z:  1  i:  276 Max_reward:  [ 3.07275819]
z:  1  i:  277 reward:  [ 0.32112564]
z:  1  i:  278 Max_reward:  [ 2.65012996]
z:  1  i:  279 Max_r

z:  1  i:  497 Max_reward:  [ 3.96462511]
z:  1  i:  498 Max_reward:  [ 3.66054489]
z:  1  i:  499 Max_reward:  [ 2.07622205]
z:  1  i:  500 Max_reward:  [ 1.98568016]
z:  1  i:  501 reward:  [-1.59723744]
z:  1  i:  502 Max_reward:  [ 3.74607928]
z:  1  i:  503 Max_reward:  [ 1.09048123]
z:  1  i:  504 Max_reward:  [ 3.35063383]
z:  1  i:  505 Max_reward:  [ 2.88586151]
z:  1  i:  506 Max_reward:  [ 3.35295662]
z:  1  i:  507 reward:  [ 0.84906173]
z:  1  i:  508 Max_reward:  [ 1.66952797]
z:  1  i:  509 Max_reward:  [ 2.93053589]
z:  1  i:  510 Max_reward:  [ 3.43553063]
z:  1  i:  511 Max_reward:  [ 1.8576438]
z:  1  i:  512 Max_reward:  [ 3.82793411]
z:  1  i:  513 Max_reward:  [ 3.35476234]
z:  1  i:  514 reward:  [-0.53995054]
z:  1  i:  515 Max_reward:  [ 5.08305089]
z:  1  i:  516 Max_reward:  [ 4.4304012]
z:  1  i:  517 Max_reward:  [ 3.61587627]
z:  1  i:  518 Max_reward:  [ 4.53044846]
z:  1  i:  519 Max_reward:  [ 3.29764016]
z:  1  i:  520 Max_reward:  [ 1.36835437]
z:  1 

z:  1  i:  813 Max_reward:  [ 3.21378649]
z:  1  i:  814 Max_reward:  [ 4.81368601]
z:  1  i:  815 Max_reward:  [ 3.60665816]
z:  1  i:  816 Max_reward:  [ 4.08498771]
z:  1  i:  817 reward:  [ 0.4889798]
z:  1  i:  818 Max_reward:  [ 2.26599592]
z:  1  i:  819 Max_reward:  [ 2.8466273]
z:  1  i:  820 Max_reward:  [ 3.23430969]
z:  1  i:  821 Max_reward:  [ 2.2478037]
z:  1  i:  822 Max_reward:  [ 2.14617417]
z:  1  i:  823 Max_reward:  [ 3.00684112]
z:  1  i:  824 Max_reward:  [ 1.85263929]
z:  1  i:  825 Max_reward:  [ 2.41996055]
z:  1  i:  826 Max_reward:  [ 2.76705932]
z:  1  i:  827 Max_reward:  [ 2.26133192]
z:  1  i:  828 Max_reward:  [ 4.3882442]
z:  1  i:  829 Max_reward:  [ 1.59086226]
z:  1  i:  830 Max_reward:  [ 3.22582389]
z:  1  i:  831 reward:  [ 2.43072917]
z:  1  i:  832 Max_reward:  [ 3.02829685]
z:  1  i:  833 Max_reward:  [ 3.77495318]
z:  1  i:  834 reward:  [-1.04868406]
z:  1  i:  835 Max_reward:  [ 3.77509632]
z:  1  i:  836 Max_reward:  [ 2.27562825]
z:  1  i

z:  2  i:  214 Max_reward:  [ 0.10586958]
z:  2  i:  215 reward:  [-3.2923516]
z:  2  i:  216 Max_reward:  [ 0.01817224]
z:  2  i:  217 Max_reward:  [ 3.78024239]
z:  2  i:  218 Max_reward:  [ 1.73738588]
z:  2  i:  219 Max_reward:  [ 0.5102256]
z:  2  i:  220 Max_reward:  [ 1.7228016]
z:  2  i:  221 Max_reward:  [ 1.81821777]
z:  2  i:  222 Max_reward:  [ 0.84431251]
z:  2  i:  223 Max_reward:  [ 1.4041309]
z:  2  i:  224 Max_reward:  [ 1.99433054]
z:  2  i:  225 Max_reward:  [ 0.07086377]
z:  2  i:  226 reward:  [-2.07048449]
z:  2  i:  227 Max_reward:  [ 1.34676857]
z:  2  i:  228 Max_reward:  [ 0.99521519]
z:  2  i:  229 Max_reward:  [ 0.54411553]
z:  2  i:  230 Max_reward:  [ 1.61254112]
z:  2  i:  231 reward:  [-1.80248945]
z:  2  i:  232 Max_reward:  [ 0.82068676]
z:  2  i:  233 reward:  [ 2.62639703]
z:  2  i:  234 Max_reward:  [-0.57923759]
z:  2  i:  235 Max_reward:  [ 0.91207188]
z:  2  i:  236 reward:  [ 0.85959689]
z:  2  i:  237 reward:  [-3.06095361]
z:  2  i:  238 Max_r

z:  2  i:  420 Max_reward:  [ 1.82093926]
z:  2  i:  421 Max_reward:  [ 1.62627924]
z:  2  i:  422 Max_reward:  [ 2.03420902]
z:  2  i:  423 Max_reward:  [ 0.8884766]
z:  2  i:  424 Max_reward:  [ 1.2345556]
z:  2  i:  425 Max_reward:  [ 0.10740232]
z:  2  i:  426 Max_reward:  [ 1.42401412]
z:  2  i:  427 reward:  [ 1.89298021]
z:  2  i:  428 Max_reward:  [ 1.69130457]
z:  2  i:  429 reward:  [-0.5433155]
z:  2  i:  430 Max_reward:  [ 1.22580203]
z:  2  i:  431 Max_reward:  [ 0.10056064]
z:  2  i:  432 reward:  [ 0.65603512]
z:  2  i:  433 Max_reward:  [ 0.76682087]
z:  2  i:  434 Max_reward:  [ 2.65369663]
z:  2  i:  435 Max_reward:  [ 0.68151577]
z:  2  i:  436 Max_reward:  [ 1.77182308]
z:  2  i:  437 Max_reward:  [ 1.47556342]
z:  2  i:  438 Max_reward:  [ 0.65603708]
z:  2  i:  439 Max_reward:  [ 0.38455432]
z:  2  i:  440 Max_reward:  [ 0.95601883]
z:  2  i:  441 Max_reward:  [ 1.29992098]
z:  2  i:  442 Max_reward:  [ 1.01318987]
z:  2  i:  443 Max_reward:  [ 2.53448043]
z:  2  

z:  2  i:  646 Max_reward:  [ 2.69135567]
z:  2  i:  647 Max_reward:  [ 2.08439712]
z:  2  i:  648 Max_reward:  [ 1.99753149]
z:  2  i:  649 Max_reward:  [ 1.91413097]
z:  2  i:  650 Max_reward:  [ 1.65188078]
z:  2  i:  651 Max_reward:  [ 1.24874251]
z:  2  i:  652 reward:  [-1.22386633]
z:  2  i:  653 Max_reward:  [ 1.19884297]
z:  2  i:  654 Max_reward:  [ 0.34045231]
z:  2  i:  655 reward:  [ 0.56228227]
z:  2  i:  656 Max_reward:  [ 0.95923737]
z:  2  i:  657 Max_reward:  [-0.24120313]
z:  2  i:  658 Max_reward:  [ 2.43099474]
z:  2  i:  659 Max_reward:  [ 0.90511694]
z:  2  i:  660 Max_reward:  [ 0.58164162]
z:  2  i:  661 Max_reward:  [ 0.68544514]
z:  2  i:  662 reward:  [ 1.83319508]
z:  2  i:  663 Max_reward:  [ 0.82915095]
z:  2  i:  664 reward:  [ 1.03974871]
z:  2  i:  665 Max_reward:  [ 1.61129276]
z:  2  i:  666 Max_reward:  [ 2.38048921]
z:  2  i:  667 Max_reward:  [ 0.47467022]
z:  2  i:  668 Max_reward:  [-0.80139443]
z:  2  i:  669 Max_reward:  [ 2.62729729]
z:  2  i

z:  3  i:  70 Max_reward:  [ 2.40474055]
z:  3  i:  71 Max_reward:  [ 2.86319299]
z:  3  i:  72 Max_reward:  [ 4.08916671]
z:  3  i:  73 Max_reward:  [ 2.87910952]
z:  3  i:  74 Max_reward:  [ 3.58589722]
z:  3  i:  75 Max_reward:  [ 2.04469152]
z:  3  i:  76 Max_reward:  [ 1.49345023]
z:  3  i:  77 Max_reward:  [ 2.6395363]
z:  3  i:  78 Max_reward:  [ 4.165808]
z:  3  i:  79 Max_reward:  [ 1.39605244]
z:  3  i:  80 Max_reward:  [ 3.9966777]
z:  3  i:  81 Max_reward:  [ 3.01117314]
z:  3  i:  82 Max_reward:  [ 2.38738578]
z:  3  i:  83 Max_reward:  [ 2.57108001]
z:  3  i:  84 Max_reward:  [ 1.78719552]
z:  3  i:  85 Max_reward:  [ 2.02345494]
z:  3  i:  86 reward:  [ 0.58855092]
z:  3  i:  87 Max_reward:  [ 1.40943508]
z:  3  i:  88 Max_reward:  [ 0.48549218]
z:  3  i:  89 Max_reward:  [ 3.55788136]
z:  3  i:  90 Max_reward:  [ 3.41494035]
z:  3  i:  91 Max_reward:  [ 2.35378441]
z:  3  i:  92 Max_reward:  [ 4.12719215]
z:  3  i:  93 Max_reward:  [ 1.1731619]
z:  3  i:  94 Max_reward:

z:  3  i:  497 Max_reward:  [ 2.61695046]
z:  3  i:  498 Max_reward:  [ 1.02079299]
z:  3  i:  499 Max_reward:  [ 1.34771411]
z:  3  i:  500 Max_reward:  [ 3.73359607]
z:  3  i:  501 reward:  [-0.12465076]
z:  3  i:  502 Max_reward:  [ 0.57656122]
z:  3  i:  503 reward:  [-2.24172396]
z:  3  i:  504 Max_reward:  [ 3.72605752]
z:  3  i:  505 reward:  [-1.56961035]
z:  3  i:  506 Max_reward:  [ 0.68946887]
z:  3  i:  507 Max_reward:  [ 1.20506378]
z:  3  i:  508 Max_reward:  [ 2.06790397]
z:  3  i:  509 Max_reward:  [ 4.58056685]
z:  3  i:  510 Max_reward:  [ 1.98409076]
z:  3  i:  511 Max_reward:  [ 2.91075663]
z:  3  i:  512 Max_reward:  [ 3.10580942]
z:  3  i:  513 Max_reward:  [ 2.66078063]
z:  3  i:  514 Max_reward:  [ 2.05535371]
z:  3  i:  515 reward:  [-3.69041716]
z:  3  i:  516 Max_reward:  [ 4.23290924]
z:  3  i:  517 Max_reward:  [ 0.73050295]
z:  3  i:  518 Max_reward:  [ 3.72558882]
z:  3  i:  519 Max_reward:  [ 2.2284794]
z:  3  i:  520 Max_reward:  [ 4.38940686]
z:  3  i:

z:  3  i:  883 Max_reward:  [ 2.28645489]
z:  3  i:  884 Max_reward:  [ 3.08376211]
z:  3  i:  885 Max_reward:  [ 3.74784457]
z:  3  i:  886 Max_reward:  [ 3.08501574]
z:  3  i:  887 Max_reward:  [ 3.58948296]
z:  3  i:  888 Max_reward:  [ 2.60529814]
z:  3  i:  889 Max_reward:  [ 2.04056638]
z:  3  i:  890 Max_reward:  [ 1.98264454]
z:  3  i:  891 reward:  [ 2.97084178]
z:  3  i:  892 Max_reward:  [ 1.63657865]
z:  3  i:  893 Max_reward:  [ 2.69799081]
z:  3  i:  894 Max_reward:  [ 2.9893192]
z:  3  i:  895 Max_reward:  [ 3.60458597]
z:  3  i:  896 Max_reward:  [ 3.46287494]
z:  3  i:  897 Max_reward:  [ 3.10809708]
z:  3  i:  898 Max_reward:  [ 2.59693722]
z:  3  i:  899 reward:  [-4.53115657]
z:  3  i:  900 Max_reward:  [ 3.53964207]
z:  3  i:  901 Max_reward:  [ 2.28509759]
z:  3  i:  902 Max_reward:  [ 2.12225457]
z:  3  i:  903 Max_reward:  [ 3.79114463]
z:  3  i:  904 Max_reward:  [ 4.13528591]
z:  3  i:  905 Max_reward:  [ 2.18402037]
z:  3  i:  906 Max_reward:  [ 2.28205315]
z

z:  4  i:  86 Max_reward:  [ 3.6016182]
z:  4  i:  87 Max_reward:  [ 2.86361207]
z:  4  i:  88 Max_reward:  [ 3.11576771]
z:  4  i:  89 Max_reward:  [ 2.58137358]
z:  4  i:  90 Max_reward:  [ 3.03381829]
z:  4  i:  91 reward:  [-0.05302923]
z:  4  i:  92 Max_reward:  [ 3.15599862]
z:  4  i:  93 Max_reward:  [ 4.82339069]
z:  4  i:  94 Max_reward:  [ 3.6874953]
z:  4  i:  95 Max_reward:  [ 4.13815985]
z:  4  i:  96 Max_reward:  [ 5.33585483]
z:  4  i:  97 Max_reward:  [ 3.42776388]
z:  4  i:  98 Max_reward:  [ 1.82713289]
z:  4  i:  99 Max_reward:  [ 5.79303898]
z:  4  i:  100 Max_reward:  [ 1.30623244]
z:  4  i:  101 Max_reward:  [ 2.35077612]
z:  4  i:  102 Max_reward:  [ 2.64625403]
z:  4  i:  103 Max_reward:  [ 3.15437168]
z:  4  i:  104 Max_reward:  [ 4.93220283]
z:  4  i:  105 Max_reward:  [ 3.68453102]
z:  4  i:  106 reward:  [ 0.53520642]
z:  4  i:  107 Max_reward:  [ 4.85793117]
z:  4  i:  108 Max_reward:  [ 3.21113115]
z:  4  i:  109 Max_reward:  [ 3.72747262]
z:  4  i:  110 M

z:  4  i:  499 Max_reward:  [ 5.42496489]
z:  4  i:  500 Max_reward:  [ 3.07352855]
z:  4  i:  501 Max_reward:  [ 4.99350086]
z:  4  i:  502 Max_reward:  [ 2.71607791]
z:  4  i:  503 Max_reward:  [ 3.22842459]
z:  4  i:  504 Max_reward:  [ 4.1340091]
z:  4  i:  505 Max_reward:  [ 3.18957129]
z:  4  i:  506 Max_reward:  [ 3.19002909]
z:  4  i:  507 Max_reward:  [ 5.59929971]
z:  4  i:  508 reward:  [ 3.9429407]
z:  4  i:  509 reward:  [ 4.98866679]
z:  4  i:  510 Max_reward:  [ 3.41980518]
z:  4  i:  511 Max_reward:  [ 6.83840952]
z:  4  i:  512 Max_reward:  [ 2.1321032]
z:  4  i:  513 Max_reward:  [ 3.70802844]
z:  4  i:  514 Max_reward:  [ 3.68565548]
z:  4  i:  515 Max_reward:  [ 3.98579312]
z:  4  i:  516 Max_reward:  [ 3.64278563]
z:  4  i:  517 Max_reward:  [ 1.99756031]
z:  4  i:  518 Max_reward:  [ 4.95012307]
z:  4  i:  519 Max_reward:  [ 3.62005724]
z:  4  i:  520 Max_reward:  [ 2.98973097]
z:  4  i:  521 Max_reward:  [ 4.51178742]
z:  4  i:  522 Max_reward:  [ 2.22893272]
z: 

z:  4  i:  727 Max_reward:  [ 3.92511651]
z:  4  i:  728 Max_reward:  [ 4.44721754]
z:  4  i:  729 Max_reward:  [ 1.82540063]
z:  4  i:  730 Max_reward:  [ 2.19603149]
z:  4  i:  731 Max_reward:  [ 1.81482165]
z:  4  i:  732 Max_reward:  [ 4.30146652]
z:  4  i:  733 Max_reward:  [ 2.47161473]
z:  4  i:  734 Max_reward:  [ 3.00309577]
z:  4  i:  735 Max_reward:  [ 2.07891158]
z:  4  i:  736 Max_reward:  [ 5.18114845]
z:  4  i:  737 Max_reward:  [ 3.7282423]
z:  4  i:  738 Max_reward:  [ 1.81938562]
z:  4  i:  739 Max_reward:  [ 3.34108375]
z:  4  i:  740 Max_reward:  [ 3.65660839]
z:  4  i:  741 Max_reward:  [ 4.15529221]
z:  4  i:  742 Max_reward:  [ 3.91554571]
z:  4  i:  743 Max_reward:  [ 3.43207966]
z:  4  i:  744 Max_reward:  [ 3.24341658]
z:  4  i:  745 Max_reward:  [ 3.25811192]
z:  4  i:  746 Max_reward:  [ 3.39460071]
z:  4  i:  747 Max_reward:  [ 3.88356967]
z:  4  i:  748 Max_reward:  [ 3.8795791]
z:  4  i:  749 Max_reward:  [ 2.1525542]
z:  4  i:  750 Max_reward:  [ 5.36268

z:  5  i:  76 reward:  [-1.77647432]
z:  5  i:  77 Max_reward:  [-0.16629905]
z:  5  i:  78 Max_reward:  [ 2.88204661]
z:  5  i:  79 Max_reward:  [ 2.01745887]
z:  5  i:  80 Max_reward:  [ 2.94511683]
z:  5  i:  81 Max_reward:  [ 0.17600823]
z:  5  i:  82 reward:  [-2.50922986]
z:  5  i:  83 Max_reward:  [ 1.75867624]
z:  5  i:  84 reward:  [ 0.80274419]
z:  5  i:  85 Max_reward:  [ 3.72992571]
z:  5  i:  86 Max_reward:  [ 2.97338252]
z:  5  i:  87 Max_reward:  [ 1.43271313]
z:  5  i:  88 Max_reward:  [ 2.06839991]
z:  5  i:  89 Max_reward:  [ 2.64834402]
z:  5  i:  90 Max_reward:  [ 3.14250999]
z:  5  i:  91 reward:  [ 0.47801298]
z:  5  i:  92 Max_reward:  [ 2.52479485]
z:  5  i:  93 Max_reward:  [ 1.39904462]
z:  5  i:  94 Max_reward:  [ 1.64671758]
z:  5  i:  95 Max_reward:  [ 3.03212247]
z:  5  i:  96 reward:  [-0.92310283]
z:  5  i:  97 reward:  [ 1.17070848]
z:  5  i:  98 Max_reward:  [ 2.0408643]
z:  5  i:  99 Max_reward:  [ 1.10505569]
z:  5  i:  100 Max_reward:  [ 1.50918938]

z:  5  i:  403 Max_reward:  [ 1.70235886]
z:  5  i:  404 reward:  [ 2.56124454]
z:  5  i:  405 Max_reward:  [ 2.10229348]
z:  5  i:  406 Max_reward:  [ 2.09306984]
z:  5  i:  407 Max_reward:  [ 3.67864148]
z:  5  i:  408 Max_reward:  [ 1.69744642]
z:  5  i:  409 Max_reward:  [ 1.57063301]
z:  5  i:  410 reward:  [ 0.42651799]
z:  5  i:  411 Max_reward:  [ 2.09585887]
z:  5  i:  412 reward:  [-0.64896035]
z:  5  i:  413 Max_reward:  [ 0.48949162]
z:  5  i:  414 Max_reward:  [ 1.92805817]
z:  5  i:  415 Max_reward:  [ 2.60462096]
z:  5  i:  416 Max_reward:  [ 0.92090369]
z:  5  i:  417 Max_reward:  [ 2.21459043]
z:  5  i:  418 Max_reward:  [ 0.36407656]
z:  5  i:  419 Max_reward:  [ 2.03438378]
z:  5  i:  420 Max_reward:  [ 1.37987108]
z:  5  i:  421 Max_reward:  [ 0.69102374]
z:  5  i:  422 Max_reward:  [ 1.42954746]
z:  5  i:  423 Max_reward:  [ 0.82464745]
z:  5  i:  424 Max_reward:  [ 2.19494197]
z:  5  i:  425 Max_reward:  [ 1.07824651]
z:  5  i:  426 Max_reward:  [ 0.94007571]
z:  

z:  5  i:  709 Max_reward:  [ 0.93784713]
z:  5  i:  710 Max_reward:  [ 2.83004205]
z:  5  i:  711 Max_reward:  [ 1.12440236]
z:  5  i:  712 Max_reward:  [ 1.66902191]
z:  5  i:  713 Max_reward:  [ 1.26396077]
z:  5  i:  714 reward:  [-0.58059191]
z:  5  i:  715 Max_reward:  [ 2.1429494]
z:  5  i:  716 reward:  [ 0.49017607]
z:  5  i:  717 Max_reward:  [ 3.69701155]
z:  5  i:  718 Max_reward:  [ 2.32287267]
z:  5  i:  719 reward:  [-2.17347499]
z:  5  i:  720 Max_reward:  [ 3.22647566]
z:  5  i:  721 Max_reward:  [ 0.38986646]
z:  5  i:  722 Max_reward:  [ 1.57630864]
z:  5  i:  723 Max_reward:  [ 2.13256327]
z:  5  i:  724 Max_reward:  [ 2.30906962]
z:  5  i:  725 reward:  [ 0.20059924]
z:  5  i:  726 reward:  [ 0.43311274]
z:  5  i:  727 Max_reward:  [ 2.06268038]
z:  5  i:  728 Max_reward:  [ 2.40670226]
z:  5  i:  729 Max_reward:  [ 1.41059439]
z:  5  i:  730 reward:  [ 0.51850862]
z:  5  i:  731 Max_reward:  [ 0.75728307]
z:  5  i:  732 Max_reward:  [ 1.4773034]
z:  5  i:  733 Max

z:  5  i:  920 Max_reward:  [ 2.38644799]
z:  5  i:  921 Max_reward:  [ 1.79457054]
z:  5  i:  922 Max_reward:  [ 1.30249441]
z:  5  i:  923 Max_reward:  [ 0.28999899]
z:  5  i:  924 Max_reward:  [ 1.14322578]
z:  5  i:  925 Max_reward:  [ 1.46616088]
z:  5  i:  926 Max_reward:  [ 1.57486821]
z:  5  i:  927 Max_reward:  [ 0.62226688]
z:  5  i:  928 Max_reward:  [ 1.63239419]
z:  5  i:  929 Max_reward:  [ 1.38820806]
z:  5  i:  930 Max_reward:  [ 1.94475329]
z:  5  i:  931 reward:  [-3.14254633]
z:  5  i:  932 Max_reward:  [ 2.01337482]
z:  5  i:  933 Max_reward:  [ 0.44831563]
z:  5  i:  934 Max_reward:  [ 2.23866552]
z:  5  i:  935 Max_reward:  [ 2.68816475]
z:  5  i:  936 Max_reward:  [ 1.80352763]
z:  5  i:  937 Max_reward:  [ 0.19687482]
z:  5  i:  938 Max_reward:  [ 2.26817724]
z:  5  i:  939 Max_reward:  [ 0.96556253]
z:  5  i:  940 Max_reward:  [ 1.10904043]
z:  5  i:  941 Max_reward:  [ 0.40116146]
z:  5  i:  942 Max_reward:  [ 3.06185627]
z:  5  i:  943 Max_reward:  [ 1.625356

z:  6  i:  138 Max_reward:  [ 2.82242243]
z:  6  i:  139 Max_reward:  [ 4.89540106]
z:  6  i:  140 Max_reward:  [ 5.26900875]
z:  6  i:  141 Max_reward:  [ 4.59101901]
z:  6  i:  142 Max_reward:  [ 4.16700285]
z:  6  i:  143 reward:  [ 3.93163407]
z:  6  i:  144 Max_reward:  [ 4.81566343]
z:  6  i:  145 Max_reward:  [ 4.22403041]
z:  6  i:  146 Max_reward:  [ 3.40597928]
z:  6  i:  147 Max_reward:  [ 5.14774508]
z:  6  i:  148 Max_reward:  [ 2.95110907]
z:  6  i:  149 reward:  [ 0.93614255]
z:  6  i:  150 Max_reward:  [ 4.76740922]
z:  6  i:  151 Max_reward:  [ 4.39223314]
z:  6  i:  152 Max_reward:  [ 4.72372072]
z:  6  i:  153 Max_reward:  [ 6.21512347]
z:  6  i:  154 Max_reward:  [ 5.28893165]
z:  6  i:  155 reward:  [ 4.93185413]
z:  6  i:  156 reward:  [ 5.81001986]
z:  6  i:  157 Max_reward:  [ 5.75977078]
z:  6  i:  158 Max_reward:  [ 5.01146157]
z:  6  i:  159 Max_reward:  [ 5.58348126]
z:  6  i:  160 Max_reward:  [ 5.39269196]
z:  6  i:  161 Max_reward:  [ 3.78703079]
z:  6  i

z:  6  i:  510 Max_reward:  [ 6.16867341]
z:  6  i:  511 Max_reward:  [ 4.65818704]
z:  6  i:  512 Max_reward:  [ 4.67365848]
z:  6  i:  513 Max_reward:  [ 3.23479643]
z:  6  i:  514 Max_reward:  [ 5.7812323]
z:  6  i:  515 Max_reward:  [ 2.25122216]
z:  6  i:  516 Max_reward:  [ 4.39414475]
z:  6  i:  517 Max_reward:  [ 4.69670629]
z:  6  i:  518 reward:  [ 1.83537755]
z:  6  i:  519 Max_reward:  [ 5.29872425]
z:  6  i:  520 Max_reward:  [ 4.10913085]
z:  6  i:  521 Max_reward:  [ 4.43270935]
z:  6  i:  522 reward:  [ 5.21775216]
z:  6  i:  523 reward:  [ 0.59330775]
z:  6  i:  524 Max_reward:  [ 7.07522376]
z:  6  i:  525 Max_reward:  [ 5.07419321]
z:  6  i:  526 reward:  [ 2.47736317]
z:  6  i:  527 Max_reward:  [ 3.80883259]
z:  6  i:  528 Max_reward:  [ 3.96527631]
z:  6  i:  529 Max_reward:  [ 4.96805066]
z:  6  i:  530 Max_reward:  [ 4.92222146]
z:  6  i:  531 Max_reward:  [ 3.71243382]
z:  6  i:  532 Max_reward:  [ 3.74534623]
z:  6  i:  533 Max_reward:  [ 5.21797712]
z:  6  i:

z:  6  i:  761 Max_reward:  [ 3.4080835]
z:  6  i:  762 Max_reward:  [ 5.05927259]
z:  6  i:  763 reward:  [ 4.61967731]
z:  6  i:  764 Max_reward:  [ 3.33904212]
z:  6  i:  765 Max_reward:  [ 4.89917458]
z:  6  i:  766 Max_reward:  [ 4.09400212]
z:  6  i:  767 Max_reward:  [ 3.72511704]
z:  6  i:  768 Max_reward:  [ 3.23276643]
z:  6  i:  769 Max_reward:  [ 4.69391175]
z:  6  i:  770 Max_reward:  [ 3.84428922]
z:  6  i:  771 Max_reward:  [ 3.77999596]
z:  6  i:  772 Max_reward:  [ 3.40875926]
z:  6  i:  773 Max_reward:  [ 4.34950673]
z:  6  i:  774 Max_reward:  [ 3.37985325]
z:  6  i:  775 Max_reward:  [ 4.97866473]
z:  6  i:  776 Max_reward:  [ 2.96344032]
z:  6  i:  777 Max_reward:  [ 4.14672673]
z:  6  i:  778 reward:  [ 2.43599768]
z:  6  i:  779 Max_reward:  [ 5.19071872]
z:  6  i:  780 Max_reward:  [ 3.12463465]
z:  6  i:  781 Max_reward:  [ 4.8978122]
z:  6  i:  782 Max_reward:  [ 3.12981641]
z:  6  i:  783 reward:  [ 1.35595364]
z:  6  i:  784 Max_reward:  [ 5.02975518]
z:  6 

z:  7  i:  10 Max_reward:  [ 3.69596203]
z:  7  i:  11 reward:  [-3.69684617]
z:  7  i:  12 Max_reward:  [ 3.31574201]
z:  7  i:  13 Max_reward:  [ 2.79264665]
z:  7  i:  14 Max_reward:  [ 3.33841534]
z:  7  i:  15 Max_reward:  [ 3.53865864]
z:  7  i:  16 Max_reward:  [ 3.23259806]
z:  7  i:  17 Max_reward:  [ 3.45477822]
z:  7  i:  18 Max_reward:  [ 0.56619451]
z:  7  i:  19 reward:  [ 0.98798883]
z:  7  i:  20 Max_reward:  [ 3.16155374]
z:  7  i:  21 reward:  [ 1.11762612]
z:  7  i:  22 Max_reward:  [ 3.42051901]
z:  7  i:  23 reward:  [ 3.80487812]
z:  7  i:  24 Max_reward:  [ 4.19314169]
z:  7  i:  25 Max_reward:  [ 3.0580941]
z:  7  i:  26 Max_reward:  [ 2.96063757]
z:  7  i:  27 Max_reward:  [ 2.4579224]
z:  7  i:  28 Max_reward:  [ 2.72993381]
z:  7  i:  29 Max_reward:  [ 2.16494179]
z:  7  i:  30 Max_reward:  [ 3.48964343]
z:  7  i:  31 Max_reward:  [ 2.45326626]
z:  7  i:  32 reward:  [ 2.01848232]
z:  7  i:  33 Max_reward:  [ 1.70653704]
z:  7  i:  34 Max_reward:  [ 3.6203234

z:  7  i:  260 Max_reward:  [ 4.83813243]
z:  7  i:  261 Max_reward:  [ 2.66834556]
z:  7  i:  262 Max_reward:  [ 4.55535109]
z:  7  i:  263 reward:  [ 2.28105398]
z:  7  i:  264 Max_reward:  [ 3.16174948]
z:  7  i:  265 Max_reward:  [ 3.46696463]
z:  7  i:  266 Max_reward:  [ 3.02672866]
z:  7  i:  267 Max_reward:  [ 2.68108123]
z:  7  i:  268 Max_reward:  [ 2.35321233]
z:  7  i:  269 Max_reward:  [ 4.92353488]
z:  7  i:  270 Max_reward:  [ 2.75318249]
z:  7  i:  271 Max_reward:  [ 2.88740238]
z:  7  i:  272 Max_reward:  [ 3.85437326]
z:  7  i:  273 Max_reward:  [ 2.8420514]
z:  7  i:  274 Max_reward:  [ 1.61518917]
z:  7  i:  275 Max_reward:  [ 0.46553425]
z:  7  i:  276 Max_reward:  [ 3.25447352]
z:  7  i:  277 Max_reward:  [ 1.49785563]
z:  7  i:  278 Max_reward:  [ 4.55427949]
z:  7  i:  279 Max_reward:  [ 1.5593845]
z:  7  i:  280 Max_reward:  [ 4.92843491]
z:  7  i:  281 Max_reward:  [ 3.38542812]
z:  7  i:  282 Max_reward:  [ 3.48081661]
z:  7  i:  283 Max_reward:  [ 1.04678977

z:  7  i:  506 Max_reward:  [ 3.73935036]
z:  7  i:  507 Max_reward:  [ 3.87693745]
z:  7  i:  508 Max_reward:  [ 2.39491156]
z:  7  i:  509 Max_reward:  [ 1.88905749]
z:  7  i:  510 Max_reward:  [ 3.26210034]
z:  7  i:  511 reward:  [ 2.77259978]
z:  7  i:  512 Max_reward:  [ 2.55131729]
z:  7  i:  513 Max_reward:  [ 4.522076]
z:  7  i:  514 Max_reward:  [ 2.14898165]
z:  7  i:  515 Max_reward:  [ 3.08407212]
z:  7  i:  516 Max_reward:  [ 1.54518045]
z:  7  i:  517 reward:  [-3.71583931]
z:  7  i:  518 Max_reward:  [ 2.96927525]
z:  7  i:  519 Max_reward:  [ 3.85660124]
z:  7  i:  520 Max_reward:  [ 2.27177334]
z:  7  i:  521 Max_reward:  [ 3.34345866]
z:  7  i:  522 Max_reward:  [ 3.29289431]
z:  7  i:  523 Max_reward:  [ 2.88067185]
z:  7  i:  524 Max_reward:  [ 0.77919349]
z:  7  i:  525 Max_reward:  [ 1.6403738]
z:  7  i:  526 Max_reward:  [ 1.15079469]
z:  7  i:  527 Max_reward:  [ 2.6849902]
z:  7  i:  528 Max_reward:  [ 2.74863409]
z:  7  i:  529 Max_reward:  [ 2.41643146]
z:  

z:  7  i:  758 Max_reward:  [ 3.7953624]
z:  7  i:  759 Max_reward:  [ 2.48166549]
z:  7  i:  760 Max_reward:  [ 3.43703244]
z:  7  i:  761 reward:  [ 2.14272291]
z:  7  i:  762 Max_reward:  [ 1.87940795]
z:  7  i:  763 Max_reward:  [ 3.39439627]
z:  7  i:  764 Max_reward:  [ 4.2746256]
z:  7  i:  765 Max_reward:  [ 3.36604095]
z:  7  i:  766 Max_reward:  [ 2.31205002]
z:  7  i:  767 Max_reward:  [ 2.39074173]
z:  7  i:  768 Max_reward:  [ 3.57578709]
z:  7  i:  769 Max_reward:  [ 5.29120419]
z:  7  i:  770 Max_reward:  [ 4.71819049]
z:  7  i:  771 Max_reward:  [ 2.9460182]
z:  7  i:  772 Max_reward:  [ 4.14470036]
z:  7  i:  773 Max_reward:  [ 2.35516816]
z:  7  i:  774 Max_reward:  [ 3.7260327]
z:  7  i:  775 Max_reward:  [ 4.24217731]
z:  7  i:  776 Max_reward:  [ 2.81438252]
z:  7  i:  777 Max_reward:  [ 3.85494551]
z:  7  i:  778 Max_reward:  [ 3.12081685]
z:  7  i:  779 Max_reward:  [ 4.21910651]
z:  7  i:  780 Max_reward:  [ 2.73545167]
z:  7  i:  781 Max_reward:  [ 4.74176217]


z:  8  i:  4 Max_reward:  [ 4.23774799]
z:  8  i:  5 Max_reward:  [ 5.39436423]
z:  8  i:  6 Max_reward:  [ 1.8942082]
z:  8  i:  7 Max_reward:  [ 2.16430761]
z:  8  i:  8 reward:  [-0.17762888]
z:  8  i:  9 Max_reward:  [ 4.72237597]
z:  8  i:  10 Max_reward:  [ 2.92259855]
z:  8  i:  11 reward:  [-2.09809683]
z:  8  i:  12 Max_reward:  [ 3.74290495]
z:  8  i:  13 Max_reward:  [ 4.04545594]
z:  8  i:  14 Max_reward:  [ 2.33193643]
z:  8  i:  15 Max_reward:  [ 1.30866623]
z:  8  i:  16 Max_reward:  [ 3.91070777]
z:  8  i:  17 Max_reward:  [ 3.3395708]
z:  8  i:  18 Max_reward:  [ 3.27574758]
z:  8  i:  19 Max_reward:  [ 3.81381304]
z:  8  i:  20 Max_reward:  [ 3.67451482]
z:  8  i:  21 Max_reward:  [ 3.17636131]
z:  8  i:  22 Max_reward:  [ 1.45186921]
z:  8  i:  23 Max_reward:  [ 2.25819248]
z:  8  i:  24 Max_reward:  [ 2.86607255]
z:  8  i:  25 Max_reward:  [ 2.65361154]
z:  8  i:  26 Max_reward:  [ 3.65648211]
z:  8  i:  27 Max_reward:  [ 2.80395277]
z:  8  i:  28 Max_reward:  [ 3.5

z:  8  i:  385 Max_reward:  [ 4.06795547]
z:  8  i:  386 Max_reward:  [ 3.41607448]
z:  8  i:  387 reward:  [-4.55592123]
z:  8  i:  388 Max_reward:  [ 3.71856144]
z:  8  i:  389 Max_reward:  [ 2.17921908]
z:  8  i:  390 Max_reward:  [ 2.5218943]
z:  8  i:  391 Max_reward:  [ 3.26010863]
z:  8  i:  392 Max_reward:  [ 2.44161709]
z:  8  i:  393 reward:  [ 0.53481522]
z:  8  i:  394 Max_reward:  [ 4.31732223]
z:  8  i:  395 Max_reward:  [ 2.46376531]
z:  8  i:  396 Max_reward:  [ 2.27963943]
z:  8  i:  397 Max_reward:  [ 2.65692959]
z:  8  i:  398 Max_reward:  [ 3.11249649]
z:  8  i:  399 Max_reward:  [ 4.79594347]
z:  8  i:  400 Max_reward:  [ 4.01518405]
z:  8  i:  401 Max_reward:  [ 1.10391536]
z:  8  i:  402 Max_reward:  [ 1.8510248]
z:  8  i:  403 Max_reward:  [ 3.23812201]
z:  8  i:  404 Max_reward:  [ 3.72749953]
z:  8  i:  405 Max_reward:  [ 3.72427411]
z:  8  i:  406 Max_reward:  [ 3.05380143]
z:  8  i:  407 Max_reward:  [ 2.78881316]
z:  8  i:  408 Max_reward:  [ 3.0206663]
z: 

z:  8  i:  756 Max_reward:  [ 4.16781071]
z:  8  i:  757 Max_reward:  [ 2.00256313]
z:  8  i:  758 Max_reward:  [ 4.75536678]
z:  8  i:  759 Max_reward:  [ 4.6883305]
z:  8  i:  760 Max_reward:  [ 3.52507308]
z:  8  i:  761 Max_reward:  [ 3.79504057]
z:  8  i:  762 Max_reward:  [ 3.21274956]
z:  8  i:  763 Max_reward:  [ 1.86702527]
z:  8  i:  764 reward:  [ 2.12059842]
z:  8  i:  765 Max_reward:  [ 3.99143822]
z:  8  i:  766 Max_reward:  [ 2.54740344]
z:  8  i:  767 Max_reward:  [ 3.20418414]
z:  8  i:  768 Max_reward:  [ 3.87020453]
z:  8  i:  769 Max_reward:  [ 2.5200817]
z:  8  i:  770 Max_reward:  [ 1.0430398]
z:  8  i:  771 Max_reward:  [ 3.75850603]
z:  8  i:  772 Max_reward:  [ 3.17698939]
z:  8  i:  773 Max_reward:  [ 2.48603923]
z:  8  i:  774 Max_reward:  [ 2.7002098]
z:  8  i:  775 Max_reward:  [ 3.55744557]
z:  8  i:  776 Max_reward:  [ 4.33307891]
z:  8  i:  777 Max_reward:  [ 2.02404092]
z:  8  i:  778 Max_reward:  [ 4.06336568]
z:  8  i:  779 Max_reward:  [ 4.01705039]


z:  9  i:  3 Max_reward:  [ 1.83271921]
z:  9  i:  4 Max_reward:  [ 1.61269524]
z:  9  i:  5 Max_reward:  [ 2.16248508]
z:  9  i:  6 Max_reward:  [ 1.30388797]
z:  9  i:  7 Max_reward:  [ 3.55898102]
z:  9  i:  8 Max_reward:  [ 2.85654588]
z:  9  i:  9 Max_reward:  [ 1.68141264]
z:  9  i:  10 Max_reward:  [ 2.49039334]
z:  9  i:  11 Max_reward:  [ 1.59205057]
z:  9  i:  12 Max_reward:  [ 2.28538361]
z:  9  i:  13 Max_reward:  [ 1.1695282]
z:  9  i:  14 Max_reward:  [ 2.86273626]
z:  9  i:  15 reward:  [-0.99787641]
z:  9  i:  16 Max_reward:  [ 2.71221412]
z:  9  i:  17 reward:  [-0.6868089]
z:  9  i:  18 reward:  [-0.79493464]
z:  9  i:  19 Max_reward:  [ 0.50781734]
z:  9  i:  20 Max_reward:  [ 3.04315243]
z:  9  i:  21 Max_reward:  [ 3.5690355]
z:  9  i:  22 Max_reward:  [ 2.0107874]
z:  9  i:  23 Max_reward:  [ 1.89715505]
z:  9  i:  24 Max_reward:  [ 1.34476907]
z:  9  i:  25 Max_reward:  [ 1.72175228]
z:  9  i:  26 reward:  [ 0.6970919]
z:  9  i:  27 Max_reward:  [ 2.66423181]
z: 

z:  9  i:  248 Max_reward:  [ 2.87286783]
z:  9  i:  249 Max_reward:  [ 2.45249819]
z:  9  i:  250 Max_reward:  [ 2.97688543]
z:  9  i:  251 Max_reward:  [ 3.53209073]
z:  9  i:  252 Max_reward:  [ 1.63120926]
z:  9  i:  253 Max_reward:  [ 0.640754]
z:  9  i:  254 Max_reward:  [ 2.16225543]
z:  9  i:  255 Max_reward:  [ 2.5638079]
z:  9  i:  256 Max_reward:  [ 1.68055661]
z:  9  i:  257 Max_reward:  [ 2.03811296]
z:  9  i:  258 Max_reward:  [ 2.1195064]
z:  9  i:  259 Max_reward:  [ 1.4048225]
z:  9  i:  260 Max_reward:  [ 1.86202649]
z:  9  i:  261 Max_reward:  [ 2.64570067]
z:  9  i:  262 Max_reward:  [ 2.83120724]
z:  9  i:  263 Max_reward:  [ 1.7685382]
z:  9  i:  264 Max_reward:  [ 2.64350558]
z:  9  i:  265 Max_reward:  [ 2.89479254]
z:  9  i:  266 Max_reward:  [ 2.51516323]
z:  9  i:  267 Max_reward:  [ 1.36460939]
z:  9  i:  268 Max_reward:  [ 2.4717457]
z:  9  i:  269 Max_reward:  [ 3.87883912]
z:  9  i:  270 Max_reward:  [ 3.67759002]
z:  9  i:  271 Max_reward:  [ 3.32094355]

z:  9  i:  476 Max_reward:  [ 2.2495274]
z:  9  i:  477 Max_reward:  [ 3.29925284]
z:  9  i:  478 Max_reward:  [ 2.32942129]
z:  9  i:  479 Max_reward:  [ 2.17695692]
z:  9  i:  480 Max_reward:  [ 3.2689317]
z:  9  i:  481 Max_reward:  [ 2.378265]
z:  9  i:  482 Max_reward:  [ 1.95057453]
z:  9  i:  483 Max_reward:  [ 2.85758836]
z:  9  i:  484 Max_reward:  [ 1.82481177]
z:  9  i:  485 Max_reward:  [ 1.95553486]
z:  9  i:  486 Max_reward:  [ 1.71130199]
z:  9  i:  487 Max_reward:  [ 2.22321803]
z:  9  i:  488 Max_reward:  [ 3.13048812]
z:  9  i:  489 Max_reward:  [ 3.70107035]
z:  9  i:  490 Max_reward:  [ 2.26013919]
z:  9  i:  491 Max_reward:  [ 3.44050313]
z:  9  i:  492 Max_reward:  [ 1.19982183]
z:  9  i:  493 Max_reward:  [ 0.48112736]
z:  9  i:  494 Max_reward:  [ 1.7506732]
z:  9  i:  495 Max_reward:  [ 1.11192067]
z:  9  i:  496 Max_reward:  [ 2.08603344]
z:  9  i:  497 Max_reward:  [ 1.19272007]
z:  9  i:  498 Max_reward:  [ 2.34445665]
z:  9  i:  499 Max_reward:  [ 3.0547887

z:  9  i:  845 Max_reward:  [ 0.37450409]
z:  9  i:  846 Max_reward:  [ 1.65598923]
z:  9  i:  847 Max_reward:  [ 2.37770117]
z:  9  i:  848 Max_reward:  [ 2.70731312]
z:  9  i:  849 Max_reward:  [ 1.59131723]
z:  9  i:  850 Max_reward:  [ 3.0643802]
z:  9  i:  851 Max_reward:  [ 2.9626684]
z:  9  i:  852 Max_reward:  [ 2.65699952]
z:  9  i:  853 Max_reward:  [ 1.88999287]
z:  9  i:  854 Max_reward:  [ 0.2473405]
z:  9  i:  855 Max_reward:  [ 1.78203223]
z:  9  i:  856 Max_reward:  [ 1.22122982]
z:  9  i:  857 reward:  [-0.34527353]
z:  9  i:  858 Max_reward:  [ 2.94762128]
z:  9  i:  859 Max_reward:  [ 2.61689804]
z:  9  i:  860 Max_reward:  [ 2.22369904]
z:  9  i:  861 Max_reward:  [ 2.34487586]
z:  9  i:  862 Max_reward:  [ 3.06315233]
z:  9  i:  863 reward:  [-0.31141543]
z:  9  i:  864 Max_reward:  [ 0.78448243]
z:  9  i:  865 Max_reward:  [ 1.3376379]
z:  9  i:  866 Max_reward:  [ 1.39054547]
z:  9  i:  867 Max_reward:  [ 2.8120942]
z:  9  i:  868 Max_reward:  [ 0.56129469]
z:  9

In [190]:
# initialize data structures
action_values = np.zeros(10)
action_values_count = np.ones(10)
initial_action_values = np.zeros(10)
average_rewards2 = np.zeros(1000)
total_reward2 = 0
max_action = 0

# number of loop iterations 
a = 1
b = 10

# initialize action_values with normal distribution around 0 and variance 1
for i in range(0,10): 
    action_values[i] = np.random.normal(0,1,1)
initial_action_values = action_values

#######################################################

# implement greedy learning algorithm with changing rewards(non-stationarity) and we do a thousand steps 
stepsize = 1
# First case: adaptive stepsize
for z in range(a,b):
    for i in range(0,1000):
        epsilon = np.random.choice(2, 1, p=[0.1, 0.9])

        # we exploit
        if(epsilon[0] == 1): 
            #find maximal action value 
            max_index = np.argmax(action_values)
            max_reward = np.random.normal(initial_action_values[max_index],1,1) 
            print("z: ",z," i: ",i ,"Max_reward: ", max_reward)
            action_values[max_index] = action_values[max_index] + stepsize*(max_reward - action_values[max_index])
            total_reward2 += max_reward
            action_values_count[max_index] +=1
            
            #lets iteratively calculate the mean of the rewards in each timestep of 1000 timesteps
            # repeated over 2000 rounds
            average_rewards2[i] = average_rewards2[i] + (1.0/z)*(max_reward-average_rewards2[i])
            
        # we explore        
        else: 
            k = np.random.choice(10, 1)
            action = initial_action_values[k]
            reward = np.random.normal(action,1,1)
            print("z: ",z," i: ",i ,"reward: ", reward)
            action_values[k] = action_values[k] + stepsize*(reward - action_values[k])
            total_reward2 += reward
            action_values_count[k] +=1
            
            #lets iteratively calculate the mean of the rewards in each timestep of 1000 timesteps
            # repeated over 2000 rounds
            average_rewards2[i] = average_rewards2[i] + (1.0/z)*(reward-average_rewards2[i])
            
        max_action = 0
                
    # re-initialize action_values with normal distribution around 0 and variance 1 after each one of the 2000 rounds
    for l in range(0,10): 
        action_values[l] = np.random.normal(0,1,1)
    initial_action_values = action_values
    # re-initialize action_values_counts
    action_values_count = np.ones(10)
 

z:  1  i:  0 Max_reward:  [ 1.96135233]
z:  1  i:  1 Max_reward:  [ 2.63442203]
z:  1  i:  2 Max_reward:  [ 3.12486276]
z:  1  i:  3 Max_reward:  [ 2.77603294]
z:  1  i:  4 Max_reward:  [ 1.96703295]
z:  1  i:  5 Max_reward:  [ 1.37207055]
z:  1  i:  6 Max_reward:  [ 2.67474248]
z:  1  i:  7 Max_reward:  [ 1.27624101]
z:  1  i:  8 Max_reward:  [-0.13783591]
z:  1  i:  9 Max_reward:  [ 0.9655921]
z:  1  i:  10 Max_reward:  [ 0.80393705]
z:  1  i:  11 Max_reward:  [-1.04208742]
z:  1  i:  12 Max_reward:  [ 1.15568185]
z:  1  i:  13 Max_reward:  [ 1.88029161]
z:  1  i:  14 Max_reward:  [ 1.85146283]
z:  1  i:  15 Max_reward:  [ 2.94912071]
z:  1  i:  16 Max_reward:  [ 3.00852022]
z:  1  i:  17 Max_reward:  [ 4.04987216]
z:  1  i:  18 reward:  [-3.45995269]
z:  1  i:  19 Max_reward:  [ 2.7436755]
z:  1  i:  20 Max_reward:  [ 3.57488415]
z:  1  i:  21 Max_reward:  [ 4.79972087]
z:  1  i:  22 reward:  [ 0.13577099]
z:  1  i:  23 Max_reward:  [ 5.50887646]
z:  1  i:  24 Max_reward:  [ 6.81558

z:  1  i:  245 Max_reward:  [ 14.18280846]
z:  1  i:  246 Max_reward:  [ 14.21266875]
z:  1  i:  247 Max_reward:  [ 14.62445814]
z:  1  i:  248 Max_reward:  [ 15.84238447]
z:  1  i:  249 Max_reward:  [ 15.31565514]
z:  1  i:  250 Max_reward:  [ 16.28205416]
z:  1  i:  251 Max_reward:  [ 15.44417594]
z:  1  i:  252 Max_reward:  [ 18.13348802]
z:  1  i:  253 reward:  [ 3.65748634]
z:  1  i:  254 Max_reward:  [ 20.59015052]
z:  1  i:  255 Max_reward:  [ 20.95620333]
z:  1  i:  256 Max_reward:  [ 19.30412052]
z:  1  i:  257 Max_reward:  [ 19.7198721]
z:  1  i:  258 Max_reward:  [ 19.70389765]
z:  1  i:  259 Max_reward:  [ 19.33560069]
z:  1  i:  260 Max_reward:  [ 20.74324305]
z:  1  i:  261 Max_reward:  [ 19.35040186]
z:  1  i:  262 Max_reward:  [ 19.39024515]
z:  1  i:  263 Max_reward:  [ 19.71057592]
z:  1  i:  264 Max_reward:  [ 20.78098484]
z:  1  i:  265 Max_reward:  [ 20.1677914]
z:  1  i:  266 Max_reward:  [ 21.24049025]
z:  1  i:  267 Max_reward:  [ 21.62934476]
z:  1  i:  268 Max

z:  1  i:  440 Max_reward:  [ 20.82793613]
z:  1  i:  441 Max_reward:  [ 21.84273563]
z:  1  i:  442 Max_reward:  [ 22.44498368]
z:  1  i:  443 Max_reward:  [ 21.43286225]
z:  1  i:  444 reward:  [-1.67935861]
z:  1  i:  445 Max_reward:  [ 21.34833828]
z:  1  i:  446 Max_reward:  [ 20.55754261]
z:  1  i:  447 Max_reward:  [ 21.87060377]
z:  1  i:  448 Max_reward:  [ 21.51471746]
z:  1  i:  449 Max_reward:  [ 20.87117351]
z:  1  i:  450 Max_reward:  [ 22.32503744]
z:  1  i:  451 Max_reward:  [ 22.20049409]
z:  1  i:  452 Max_reward:  [ 20.66480612]
z:  1  i:  453 reward:  [ 1.82711025]
z:  1  i:  454 Max_reward:  [ 22.06414721]
z:  1  i:  455 reward:  [-2.77113906]
z:  1  i:  456 Max_reward:  [ 23.41641435]
z:  1  i:  457 Max_reward:  [ 23.94588883]
z:  1  i:  458 reward:  [-1.17185627]
z:  1  i:  459 Max_reward:  [ 24.37814062]
z:  1  i:  460 Max_reward:  [ 23.03582058]
z:  1  i:  461 Max_reward:  [ 22.96116329]
z:  1  i:  462 Max_reward:  [ 22.21017764]
z:  1  i:  463 Max_reward:  [ 2

z:  1  i:  799 Max_reward:  [ 10.11730253]
z:  1  i:  800 Max_reward:  [ 10.84217115]
z:  1  i:  801 Max_reward:  [ 9.49840939]
z:  1  i:  802 Max_reward:  [ 11.53914235]
z:  1  i:  803 Max_reward:  [ 12.10423421]
z:  1  i:  804 Max_reward:  [ 13.21280107]
z:  1  i:  805 Max_reward:  [ 12.69577239]
z:  1  i:  806 Max_reward:  [ 13.73000194]
z:  1  i:  807 Max_reward:  [ 16.78779132]
z:  1  i:  808 Max_reward:  [ 14.71392122]
z:  1  i:  809 Max_reward:  [ 14.95310485]
z:  1  i:  810 Max_reward:  [ 14.20729859]
z:  1  i:  811 Max_reward:  [ 13.72516206]
z:  1  i:  812 Max_reward:  [ 12.07815217]
z:  1  i:  813 Max_reward:  [ 12.45132904]
z:  1  i:  814 Max_reward:  [ 13.49284394]
z:  1  i:  815 Max_reward:  [ 12.53567901]
z:  1  i:  816 reward:  [-2.67513051]
z:  1  i:  817 Max_reward:  [ 12.50070765]
z:  1  i:  818 Max_reward:  [ 13.07546281]
z:  1  i:  819 Max_reward:  [ 13.34573568]
z:  1  i:  820 Max_reward:  [ 13.06654854]
z:  1  i:  821 Max_reward:  [ 12.80486725]
z:  1  i:  822 Ma

z:  2  i:  13 Max_reward:  [ 1.6221672]
z:  2  i:  14 reward:  [ 0.3857264]
z:  2  i:  15 Max_reward:  [ 1.85038783]
z:  2  i:  16 Max_reward:  [ 1.52002986]
z:  2  i:  17 Max_reward:  [ 0.68945088]
z:  2  i:  18 Max_reward:  [ 0.96971735]
z:  2  i:  19 reward:  [-0.45696597]
z:  2  i:  20 Max_reward:  [ 2.01500671]
z:  2  i:  21 Max_reward:  [ 1.22817041]
z:  2  i:  22 Max_reward:  [ 0.20511034]
z:  2  i:  23 Max_reward:  [ 2.2985515]
z:  2  i:  24 reward:  [-3.35761434]
z:  2  i:  25 Max_reward:  [ 1.78152693]
z:  2  i:  26 Max_reward:  [ 1.43895675]
z:  2  i:  27 Max_reward:  [-0.57005237]
z:  2  i:  28 Max_reward:  [ 1.97083481]
z:  2  i:  29 Max_reward:  [ 2.22755901]
z:  2  i:  30 Max_reward:  [ 2.73729271]
z:  2  i:  31 Max_reward:  [ 2.70168457]
z:  2  i:  32 Max_reward:  [ 3.13053922]
z:  2  i:  33 Max_reward:  [ 2.40713265]
z:  2  i:  34 Max_reward:  [ 2.14772288]
z:  2  i:  35 Max_reward:  [ 0.50840554]
z:  2  i:  36 Max_reward:  [ 1.91043855]
z:  2  i:  37 reward:  [-0.3332

z:  2  i:  246 Max_reward:  [ 16.69618726]
z:  2  i:  247 Max_reward:  [ 18.52834075]
z:  2  i:  248 Max_reward:  [ 17.70677811]
z:  2  i:  249 Max_reward:  [ 17.21128353]
z:  2  i:  250 Max_reward:  [ 17.33745773]
z:  2  i:  251 Max_reward:  [ 18.18324258]
z:  2  i:  252 Max_reward:  [ 18.08171718]
z:  2  i:  253 Max_reward:  [ 18.17595742]
z:  2  i:  254 Max_reward:  [ 17.80905499]
z:  2  i:  255 Max_reward:  [ 18.20691077]
z:  2  i:  256 Max_reward:  [ 18.78035907]
z:  2  i:  257 Max_reward:  [ 19.28817097]
z:  2  i:  258 Max_reward:  [ 19.17563952]
z:  2  i:  259 Max_reward:  [ 16.64487797]
z:  2  i:  260 Max_reward:  [ 16.77155847]
z:  2  i:  261 Max_reward:  [ 16.18180124]
z:  2  i:  262 Max_reward:  [ 16.41499213]
z:  2  i:  263 Max_reward:  [ 17.2724995]
z:  2  i:  264 Max_reward:  [ 17.067055]
z:  2  i:  265 Max_reward:  [ 16.8971956]
z:  2  i:  266 reward:  [-1.81664074]
z:  2  i:  267 Max_reward:  [ 16.65641129]
z:  2  i:  268 Max_reward:  [ 16.1706932]
z:  2  i:  269 reward

z:  2  i:  464 Max_reward:  [ 5.44373572]
z:  2  i:  465 Max_reward:  [ 4.3348129]
z:  2  i:  466 Max_reward:  [ 3.49692969]
z:  2  i:  467 Max_reward:  [ 4.89777028]
z:  2  i:  468 Max_reward:  [ 4.92243061]
z:  2  i:  469 reward:  [-1.61243999]
z:  2  i:  470 Max_reward:  [ 3.97585898]
z:  2  i:  471 Max_reward:  [ 3.01897812]
z:  2  i:  472 Max_reward:  [ 1.8536733]
z:  2  i:  473 Max_reward:  [ 2.35718962]
z:  2  i:  474 Max_reward:  [ 2.03757017]
z:  2  i:  475 Max_reward:  [ 3.24034334]
z:  2  i:  476 Max_reward:  [ 1.2052706]
z:  2  i:  477 reward:  [-0.85598985]
z:  2  i:  478 Max_reward:  [ 2.05705138]
z:  2  i:  479 Max_reward:  [ 1.87785479]
z:  2  i:  480 Max_reward:  [ 0.34330787]
z:  2  i:  481 Max_reward:  [ 0.98809026]
z:  2  i:  482 Max_reward:  [ 1.3009054]
z:  2  i:  483 Max_reward:  [ 0.30924012]
z:  2  i:  484 Max_reward:  [-0.41912248]
z:  2  i:  485 Max_reward:  [ 0.72489534]
z:  2  i:  486 Max_reward:  [ 3.8996255]
z:  2  i:  487 Max_reward:  [ 5.73551523]
z:  2

KeyboardInterrupt: 

In [None]:
# plot stuff and yield results 
total_reward = total_reward/10.0
total_reward2 = total_reward2/10.0
print("Adaptive stepsize has total reward of: ", total_reward)
print("Constant stepsize has total reward of: ", total_reward2)

x_axis = np.arange(0,1000)
plt.xlabel('steps')
plt.ylabel('average rewards')
plt.plot(x_axis, average_rewards1,'r.', x_axis, average_rewards2,'b.')
plt.show()