In [188]:
import torch
import random
import numpy as np
import matplotlib.pyplot as plt
import copy
from agent import *
from selection import *
from utils import *
from env import Environment
from simulation import constructAgent, twoSimulate
from main import Config

## DQN vs TfT

In [43]:
def determine_convergence(agent:object, threshold:int, k:int=100):
    if agent.play_times < 2*k:
        return False
    history_1 = agent.own_memory[agent.play_times-k:agent.play_times]
    history_2 = agent.own_memory[agent.play_times-2*k:agent.play_times-k]
    difference = torch.sum(torch.abs(history_1 - history_2))
    if difference > threshold:
        return False
    else:
        return True

In [88]:
# choices = {'0-alwaysCooperate','1-alwaysDefect','2-titForTat','3-reverseTitForTat','4-random','5-grudger','6-pavlov','7-qLearning','8-lstm-TFT','9-dqn','10-lstmqn','11-a2c','12-a2c-lstm'}
# rl_choices = {'7-qLearning','8-lstm-pavlov','9-dqn','10-lstmqn','11-a2c','12-a2c-lstm'}
strategies = {0:'ALLC',1:'ALLD',2:'TitForTat',3:'revTitForTat',4:'Random',5:'Grudger',6:'Pavlov',7:'QLearning',8:'LSTM',9:'DQN',10:'LSTMQN',11:'A2C',12:'A2CLSTM'}

h = [1,2,5,10]
epsilon_decay = [0.95, 0.99, 0.995, 0.999]

config = {
    'reward': 3, 
    'sucker': 0, 
    'temptation': 5, 
    'punishment': 1, 
    'n_episodes': 10000, 
    'discount': 0.99,
    'play_epsilon': 1,
    'select_epsilon': 1,
    'epsilon_decay': 0.999,
    'min_epsilon': 0.01,
    'alpha': 0.1,
    'n_actions': 2,
    'h': 10,
    'state_repr': 'bi',
    'batch_size': 64,
    'learning_rate': 1e-3,
}

epsilon_dict = {'epsilon_decay=0.95':[],'epsilon_decay=0.99':[],'epsilon_decay=0.995':[],'epsilon_decay=0.999':[]}
result_dict={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
loss_dict={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
strategy_dict={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
                
for key in result_dict:
    result_dict[key] = copy.deepcopy(epsilon_dict)
    loss_dict[key] = copy.deepcopy(epsilon_dict)
    strategy_dict[key] = copy.deepcopy(epsilon_dict)

for i in h:
    for j in epsilon_decay:
        config['h'] = 1
        config['epsilon_decay'] = j
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')
        
        num = 2
        rl_num = 9
        for _ in range(5):
            convergence = False
            # twoSimulate(dict({num: strategies[num],rl_num: strategies[rl_num]}), rl_num, config)
            with HiddenPrints():
                agent1 = constructAgent(strategies[num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                while not convergence:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if agent2.play_times >= 20*k:
                        break
        
                result_dict[f'h={i}'][f'epsilon_decay={j}'].append(agent2.play_times)
                loss_dict[f'h={i}'][f'epsilon_decay={j}'].append(np.mean(agent2.loss))
                strategy_dict[f'h={i}'][f'epsilon_decay={j}'].append(list(agent2.own_memory[agent2.play_times-10:agent2.play_times]))
        print(f'playing times: {agent2.play_times}')
        print(f'length of loss: {len(agent2.loss)}, average of loss (interval is 2): {np.mean(agent2.loss[::2])}, average of loss (interval is 20): {np.mean(agent2.loss[::20])}, average of loss (interval is 100): {np.mean(agent2.loss[::100])}')
        agent2.show()
        print()
        print()
        
    # plt.plot(agent2.loss[::20])
    # plt.title(f'agent:{agent2.name}')
    # plt.show()
# agent1.show()
# agent2.show()
# print("==================================================")
# print(f'{agent1.name} score: {agent1.running_score}\n{agent2.name} score: {agent2.running_score}')
# print("------------------------------------------------------------------------------------------------------------------------------------------------")
# print()

# x = [i for i in range(0, agent1.play_times)]
# plt.figure(figsize=(20, 10))
# plt.plot(x, agent1.own_memory[0:agent1.play_times], label=agent1.name, alpha=0.5)
# plt.plot(x, agent2.own_memory[0:agent2.play_times], label=agent2.name, alpha=0.5)
# plt.legend()
# plt.ylim(-0.5, 2)
# plt.xlim(0, agent1.play_times)
# plt.title(f'agent:{agent1.name} vs agent:{agent2.name}')
# plt.savefig(f'images/{agent1.name}vs{agent2.name}_result_h={config.h}.png')
# plt.show()

config: h=1, epsilon_decay=0.95
playing times: 4000
length of loss: 3936, average of loss (interval is 2): 9.324725299452743, average of loss (interval is 20): 9.414865579550641, average of loss (interval is 100): 9.450661483407021
Your action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=1, epsilon_decay=0.99
playing times: 3000
length of loss: 2936, average of loss (interval is 2): 6.902885661084409, average of loss (interval is 20): 7.005591902314831, average of loss (interval is 100): 7.062497464815776
Your action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=1, epsilon_decay=0.995
playing times: 3000
length of loss: 2936, average of loss (interval is 2): 5.65398275408540

In [110]:
for i in h:
    for j in epsilon_decay:
        strategy_dict[f'h={i}'][f'epsilon_decay={j}'] = torch.Tensor(strategy_dict[f'h={i}'][f'epsilon_decay={j}']).numpy().astype(int).tolist()
print(result_dict)
print()
print(strategy_dict)

{'h=1': {'epsilon_decay=0.95': [3000, 3000, 3000, 3000, 4000], 'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000], 'epsilon_decay=0.995': [4000, 3000, 20000, 3000, 3000], 'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]}, 'h=2': {'epsilon_decay=0.95': [3000, 20000, 3000, 2000, 3000], 'epsilon_decay=0.99': [3000, 4000, 3000, 3000, 3000], 'epsilon_decay=0.995': [3000, 7000, 3000, 3000, 3000], 'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]}, 'h=5': {'epsilon_decay=0.95': [5000, 3000, 3000, 3000, 3000], 'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000], 'epsilon_decay=0.995': [3000, 3000, 3000, 4000, 3000], 'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]}, 'h=10': {'epsilon_decay=0.95': [3000, 5000, 3000, 3000, 4000], 'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000], 'epsilon_decay=0.995': [3000, 6000, 20000, 7000, 5000], 'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]}}

{'h=1': {'epsilon_decay=0.95': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0

In [108]:
for i in h:
    for j in epsilon_decay:
        print(np.mean(loss_dict[f'h={i}'][f'epsilon_decay={j}']))

11.84565812599636
6.681979909030754
6.685040653032973
6.696518058278707
5.219096498522428
6.6952139011328855
6.726328886213844
6.935448207144164
6.702649024437409
6.949632831592234
6.9972721526957375
7.1748613247461845
7.147196131551553
6.770848050045555
8.551492503719853
6.891393396063504


## DQN vs DQN

In [119]:
result_dict_dqn={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
loss_dict_dqn={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
strategy_dict_dqn={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}

for key in result_dict_dqn:
    result_dict_dqn[key] = copy.deepcopy(epsilon_dict)
    loss_dict_dqn[key] = copy.deepcopy(epsilon_dict)
    strategy_dict_dqn[key] = copy.deepcopy(epsilon_dict)

for i in h:
    for j in epsilon_decay:
        config['h'] = 1
        config['epsilon_decay'] = j
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')
        
        rl_num = 9
        for _ in range(5):
            convergence = False
            with HiddenPrints():
                agent1 = constructAgent(strategies[rl_num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                while not convergence:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if agent2.play_times >= 20*k:
                        break
        
                result_dict_dqn[f'h={i}'][f'epsilon_decay={j}'].append(agent2.play_times)
                loss_dict_dqn[f'h={i}'][f'epsilon_decay={j}'].append([np.mean(agent1.loss),np.mean(agent2.loss)])
                strategy_dict_dqn[f'h={i}'][f'epsilon_decay={j}'].append([agent1.own_memory[agent1.play_times-10:agent1.play_times].numpy().astype(int).tolist(), agent2.own_memory[agent2.play_times-10:agent2.play_times].numpy().astype(int).tolist()])
        print(f'playing times: {agent2.play_times}')
        print(f'length of loss: {len(agent1.loss)}, average of loss (interval is 2): {np.mean(agent1.loss[::2])}, average of loss (interval is 20): {np.mean(agent1.loss[::20])}, average of loss (interval is 100): {np.mean(agent1.loss[::100])}')
        print(f'length of loss: {len(agent2.loss)}, average of loss (interval is 2): {np.mean(agent2.loss[::2])}, average of loss (interval is 20): {np.mean(agent2.loss[::20])}, average of loss (interval is 100): {np.mean(agent2.loss[::100])}')
        agent2.show()
        print()
        print()

config: h=1, epsilon_decay=0.95
playing times: 4000
length of loss: 3936, average of loss (interval is 2): 9.74302608461709, average of loss (interval is 20): 10.596129937853835, average of loss (interval is 100): 9.4769208105281
length of loss: 3936, average of loss (interval is 2): 10.071894292226553, average of loss (interval is 20): 10.309910644696751, average of loss (interval is 100): 9.359448426775634
Your action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=1, epsilon_decay=0.99
playing times: 4000
length of loss: 3936, average of loss (interval is 2): 9.600981240357628, average of loss (interval is 20): 9.171886406271591, average of loss (interval is 100): 9.538195848464966
length of loss: 3936, average of loss (interval is 2): 9.990633483828477, average of loss (interval is 20): 9.74006481582138, average of loss (interva

playing times: 11000
length of loss: 10936, average of loss (interval is 2): 11.49228885657806, average of loss (interval is 20): 11.618063856631354, average of loss (interval is 100): 11.271099874648181
length of loss: 10936, average of loss (interval is 2): 13.133397008050178, average of loss (interval is 20): 12.947969336260172, average of loss (interval is 100): 12.110137082907286
Your action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=10, epsilon_decay=0.99
playing times: 20000
length of loss: 19936, average of loss (interval is 2): 6.12775451077845, average of loss (interval is 20): 6.061126431601576, average of loss (interval is 100): 5.861278599724174
length of loss: 19936, average of loss (interval is 2): 5.032747965510763, average of loss (interval is 20): 5.155075607367361, average of loss (interval is 100): 5

In [124]:
result_dict_dqn

{'h=1': {'epsilon_decay=0.95': [20000, 3000, 20000, 20000, 4000],
  'epsilon_decay=0.99': [20000, 5000, 20000, 20000, 4000],
  'epsilon_decay=0.995': [20000, 18000, 3000, 6000, 20000],
  'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]},
 'h=2': {'epsilon_decay=0.95': [3000, 20000, 4000, 4000, 20000],
  'epsilon_decay=0.99': [20000, 5000, 3000, 20000, 20000],
  'epsilon_decay=0.995': [5000, 20000, 20000, 18000, 18000],
  'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]},
 'h=5': {'epsilon_decay=0.95': [3000, 4000, 6000, 3000, 18000],
  'epsilon_decay=0.99': [20000, 5000, 20000, 20000, 4000],
  'epsilon_decay=0.995': [20000, 20000, 4000, 8000, 20000],
  'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]},
 'h=10': {'epsilon_decay=0.95': [3000, 20000, 13000, 3000, 11000],
  'epsilon_decay=0.99': [20000, 8000, 20000, 20000, 20000],
  'epsilon_decay=0.995': [20000, 4000, 20000, 20000, 20000],
  'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]}}

In [125]:
strategy_dict_dqn

{'h=1': {'epsilon_decay=0.95': [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [0, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[1, 0, 1, 1, 0, 1, 1, 0, 1, 1], [0, 1, 1, 0, 1, 1, 0, 1, 1, 0]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]],
  'epsilon_decay=0.99': [[[0, 1, 0, 1, 0, 1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 0]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 0, 1, 0, 1, 0, 1, 0, 1, 0], [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 1, 0, 1, 1, 1]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]],
  'epsilon_decay=0.995': [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[0, 0, 0, 0

In [131]:
for i in h:
    for j in epsilon_decay:
        print(np.mean(np.array(loss_dict_dqn[f'h={i}'][f'epsilon_decay={j}'])[:,0]))

6.787715951313224
7.0768090728648145
8.595668000553706
5.504095430615758
7.706253216702057
7.208648023710969
6.405157787942431
6.026369777740031
7.165410507373098
6.166338333760331
5.14131538263794
6.066017281508488
11.990292821132114
9.489932365717634
5.2823971095843785
5.448982174054747


## DRQN vs TfT

In [120]:
result_dict_drqn_t={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
loss_dict_drqn_t={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
strategy_dict_drqn_t={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}

for key in result_dict_drqn_t:
    result_dict_drqn_t[key] = copy.deepcopy(epsilon_dict)
    loss_dict_drqn_t[key] = copy.deepcopy(epsilon_dict)
    strategy_dict_drqn_t[key] = copy.deepcopy(epsilon_dict)

for i in h:
    for j in epsilon_decay:
        config['h'] = 1
        config['epsilon_decay'] = j
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')
        
        n_num = 2
        rl_num = 10
        for _ in range(5):
            convergence = False
            with HiddenPrints():
                agent1 = constructAgent(strategies[n_num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                while not convergence:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if agent2.play_times >= 20*k:
                        break
        
                result_dict_drqn_t[f'h={i}'][f'epsilon_decay={j}'].append(agent2.play_times)
                loss_dict_drqn_t[f'h={i}'][f'epsilon_decay={j}'].append(np.mean(agent2.loss))
                strategy_dict_drqn_t[f'h={i}'][f'epsilon_decay={j}'].append(agent2.own_memory[agent2.play_times-10:agent2.play_times].numpy().astype(int).tolist())
        print(f'playing times: {agent2.play_times}')
        print(f'length of loss: {len(agent2.loss)}, average of loss (interval is 2): {np.mean(agent2.loss[::2])}, average of loss (interval is 20): {np.mean(agent2.loss[::20])}, average of loss (interval is 100): {np.mean(agent2.loss[::100])}')
        agent2.show()
        print()
        print()

config: h=1, epsilon_decay=0.95
playing times: 2000
length of loss: 1936, average of loss (interval is 2): 0.10399050374947488, average of loss (interval is 20): 0.11866714815979912, average of loss (interval is 100): 0.10686213557346491
Your action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])
Oppo action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])


config: h=1, epsilon_decay=0.99
playing times: 3000
length of loss: 2936, average of loss (interval is 2): 0.33830433815402033, average of loss (interval is 20): 0.4948061129347925, average of loss (interval is 100): 0.1756123941411109
Your action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=1, epsilon_decay=0.995
playing times: 3000
length of loss: 2936, average of loss (interv

In [132]:
result_dict_drqn_t

{'h=1': {'epsilon_decay=0.95': [3000, 3000, 3000, 3000, 2000],
  'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.995': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.999': [6000, 5000, 6000, 5000, 6000]},
 'h=2': {'epsilon_decay=0.95': [3000, 2000, 2000, 2000, 2000],
  'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.995': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.999': [6000, 6000, 6000, 5000, 5000]},
 'h=5': {'epsilon_decay=0.95': [3000, 2000, 3000, 2000, 3000],
  'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.995': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.999': [7000, 6000, 6000, 6000, 5000]},
 'h=10': {'epsilon_decay=0.95': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.995': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.999': [5000, 6000, 6000, 6000, 6000]}}

In [133]:
strategy_dict_drqn_t

{'h=1': {'epsilon_decay=0.95': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
  'epsilon_decay=0.99': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
  'epsilon_decay=0.995': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
  'epsilon_decay=0.999': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]},
 'h=2': {'epsilon_decay=0.95': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   [1, 1, 1, 1, 1, 1, 1, 1, 1

In [134]:
for i in h:
    for j in epsilon_decay:
        print(np.mean(loss_dict_drqn_t[f'h={i}'][f'epsilon_decay={j}']))

0.14955716687125167
0.32202200703229
0.31961403267176447
0.22819994771071772
0.10022256314383701
0.3143290042001776
0.31912981373774035
0.23007589212767704
0.29201921030987105
0.32237076219563315
0.31479959637606914
0.22131568557035958
0.2689748312757962
0.31211304511570065
0.3172150835656422
0.22538436881343943


## DRQN vs DRQN

In [121]:
result_dict_drqn={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
loss_dict_drqn={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
strategy_dict_drqn={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}

for key in result_dict_drqn:
    result_dict_drqn[key] = copy.deepcopy(epsilon_dict)
    loss_dict_drqn[key] = copy.deepcopy(epsilon_dict)
    strategy_dict_drqn[key] = copy.deepcopy(epsilon_dict)

for i in h:
    for j in epsilon_decay:
        config['h'] = 1
        config['epsilon_decay'] = j
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')
        
        rl_num = 10
        for _ in range(5):
            convergence = False
            with HiddenPrints():
                agent1 = constructAgent(strategies[rl_num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                while not convergence:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if agent2.play_times >= 20*k:
                        break
        
                result_dict_drqn[f'h={i}'][f'epsilon_decay={j}'].append(agent2.play_times)
                loss_dict_drqn[f'h={i}'][f'epsilon_decay={j}'].append([np.mean(agent1.loss),np.mean(agent2.loss)])
                strategy_dict_drqn[f'h={i}'][f'epsilon_decay={j}'].append([agent1.own_memory[agent1.play_times-10:agent1.play_times].numpy().astype(int).tolist(), agent2.own_memory[agent2.play_times-10:agent2.play_times].numpy().astype(int).tolist()])
        print(f'playing times: {agent2.play_times}')
        print(f'length of loss: {len(agent1.loss)}, average of loss (interval is 2): {np.mean(agent1.loss[::2])}, average of loss (interval is 20): {np.mean(agent1.loss[::20])}, average of loss (interval is 100): {np.mean(agent1.loss[::100])}')
        print(f'length of loss: {len(agent2.loss)}, average of loss (interval is 2): {np.mean(agent2.loss[::2])}, average of loss (interval is 20): {np.mean(agent2.loss[::20])}, average of loss (interval is 100): {np.mean(agent2.loss[::100])}')
        agent2.show()
        print()
        print()

config: h=1, epsilon_decay=0.95
playing times: 3000
length of loss: 2936, average of loss (interval is 2): 0.6526697659626508, average of loss (interval is 20): 0.6657035927524567, average of loss (interval is 100): 0.5939308361465616
length of loss: 2936, average of loss (interval is 2): 0.1415143483039591, average of loss (interval is 20): 0.13582485945880762, average of loss (interval is 100): 0.11111775233306011
Your action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])
Oppo action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])


config: h=1, epsilon_decay=0.99
playing times: 3000
length of loss: 2936, average of loss (interval is 2): 0.17202555021594662, average of loss (interval is 20): 0.18493436405741995, average of loss (interval is 100): 0.1769069066460361
length of loss: 2936, average of loss (interval is 2): 0.19043408928435976, average of loss (interval is 20): 0.208805103

playing times: 4000
length of loss: 3936, average of loss (interval is 2): 1.2995826105044657, average of loss (interval is 20): 1.4000101721338134, average of loss (interval is 100): 1.1315500372606038
length of loss: 3936, average of loss (interval is 2): 0.9955691047924238, average of loss (interval is 20): 1.1591022541870004, average of loss (interval is 100): 0.804607459478575
Your action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=10, epsilon_decay=0.99
playing times: 3000
length of loss: 2936, average of loss (interval is 2): 0.24617277398932197, average of loss (interval is 20): 0.2572297518623216, average of loss (interval is 100): 0.24088269153520134
length of loss: 2936, average of loss (interval is 2): 0.17137975524918023, average of loss (interval is 20): 0.1752142835971519, average of loss (interval is 100): 0.1651

In [135]:
result_dict_drqn

{'h=1': {'epsilon_decay=0.95': [3000, 4000, 3000, 4000, 3000],
  'epsilon_decay=0.99': [3000, 5000, 3000, 3000, 3000],
  'epsilon_decay=0.995': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.999': [6000, 5000, 6000, 6000, 5000]},
 'h=2': {'epsilon_decay=0.95': [4000, 3000, 3000, 3000, 4000],
  'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.995': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.999': [6000, 5000, 6000, 5000, 6000]},
 'h=5': {'epsilon_decay=0.95': [3000, 3000, 4000, 4000, 3000],
  'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.995': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.999': [5000, 6000, 6000, 5000, 5000]},
 'h=10': {'epsilon_decay=0.95': [3000, 4000, 3000, 3000, 4000],
  'epsilon_decay=0.99': [3000, 4000, 3000, 3000, 3000],
  'epsilon_decay=0.995': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.999': [5000, 6000, 6000, 6000, 5000]}}

In [137]:
strategy_dict_drqn

{'h=1': {'epsilon_decay=0.95': [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 0, 1, 0, 1, 0, 1, 0, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]],
  'epsilon_decay=0.99': [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]],
  'epsilon_decay=0.995': [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 1, 1, 1

In [138]:
for i in h:
    for j in epsilon_decay:
        print(np.mean(np.array(loss_dict_drqn[f'h={i}'][f'epsilon_decay={j}'])[:,0]))

0.8541613318540993
0.23367255136915593
0.25134189097108084
0.3811893631586258
0.6240091759693913
0.2727832893012461
0.3144180846513758
0.37814567458965065
0.6750504486773959
0.3165014527001072
0.27966370917910305
0.3883578428508924
0.5543067492746468
0.45279564580553255
0.26032672400135487
0.3807973351521633


## QLearning vs TfT

In [123]:
result_dict_q={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
strategy_dict_q={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}

for key in result_dict_q:
    result_dict_q[key] = copy.deepcopy(epsilon_dict)
    strategy_dict_q[key] = copy.deepcopy(epsilon_dict)

for i in h:
    for j in epsilon_decay:
        config['h'] = 1
        config['epsilon_decay'] = j
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')
        
        n_num = 2
        rl_num = 7
        for _ in range(5):
            convergence = False
            with HiddenPrints():
                agent1 = constructAgent(strategies[n_num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                while not convergence:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if agent2.play_times >= 20*k:
                        break
        
                result_dict_q[f'h={i}'][f'epsilon_decay={j}'].append(agent2.play_times)
                strategy_dict_q[f'h={i}'][f'epsilon_decay={j}'].append(list(agent2.own_memory[agent2.play_times-10:agent2.play_times]))
        print(f'playing times: {agent2.play_times}')
        agent2.show()
        print()
        print()

config: h=1, epsilon_decay=0.95
playing times: 6000
QLearning play 6000 rounds
Q_table:
tensor([[ 87.7800, 213.4048],
        [211.8785, 128.9350]])
Your action: tensor([1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0.,
        0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1.])
Oppo action: tensor([1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1.,
        0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0.])


config: h=1, epsilon_decay=0.99
playing times: 3000
QLearning play 3000 rounds
Q_table:
tensor([[ 2.1978, 56.2419],
        [57.2711, 80.9001]])
Your action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
Oppo action: tensor([0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])


config

playing times: 6000
QLearning play 6000 rounds
Q_table:
tensor([[ 77.9678, 102.8538],
        [101.4865, 102.3831]])
Your action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
Oppo action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])




## DRQN-Varient vs TfT

In [140]:
# choices = {'0-alwaysCooperate','1-alwaysDefect','2-titForTat','3-reverseTitForTat','4-random','5-grudger','6-pavlov','7-qLearning','8-lstm-TFT','9-dqn','10-lstmqn','11-a2c','12-a2c-lstm'}
# rl_choices = {'7-qLearning','8-lstm-pavlov','9-dqn','10-lstmqn','11-a2c','12-a2c-lstm'}
strategies = {0:'ALLC',1:'ALLD',2:'TitForTat',3:'revTitForTat',4:'Random',5:'Grudger',6:'Pavlov',7:'QLearning',8:'LSTM',9:'DQN',10:'LSTMQN',11:'A2C',12:'A2CLSTM'}

h = [1,2,5,10]
epsilon_decay = [0.95, 0.99, 0.995, 0.999]

config = {
    'reward': 3, 
    'sucker': 0, 
    'temptation': 5, 
    'punishment': 1, 
    'n_episodes': 10000, 
    'discount': 0.99,
    'play_epsilon': 1,
    'select_epsilon': 1,
    'epsilon_decay': 0.999,
    'min_epsilon': 0.01,
    'alpha': 0.1,
    'n_actions': 2,
    'h': 10,
    'state_repr': 'bi-repr',
    'batch_size': 64,
    'learning_rate': 1e-3,
}

result_dict_drqnv_t={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
loss_dict_drqnv_t={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
strategy_dict_drqnv_t={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
                
for key in result_dict_drqnv_t:
    result_dict_drqnv_t[key] = copy.deepcopy(epsilon_dict)
    loss_dict_drqnv_t[key] = copy.deepcopy(epsilon_dict)
    strategy_dict_drqnv_t[key] = copy.deepcopy(epsilon_dict)

for i in h:
    for j in epsilon_decay:
        config['h'] = 1
        config['epsilon_decay'] = j
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')
        
        num = 2
        rl_num = 10
        for _ in range(5):
            convergence = False
            # twoSimulate(dict({num: strategies[num],rl_num: strategies[rl_num]}), rl_num, config)
            with HiddenPrints():
                agent1 = constructAgent(strategies[num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                while not convergence:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if agent2.play_times >= 20*k:
                        break
        
                result_dict_drqnv_t[f'h={i}'][f'epsilon_decay={j}'].append(agent2.play_times)
                loss_dict_drqnv_t[f'h={i}'][f'epsilon_decay={j}'].append(np.mean(agent2.loss))
                strategy_dict_drqnv_t[f'h={i}'][f'epsilon_decay={j}'].append(list(agent2.own_memory[agent2.play_times-10:agent2.play_times]))
        print(f'playing times: {agent2.play_times}')
        print(f'length of loss: {len(agent2.loss)}, average of loss (interval is 2): {np.mean(agent2.loss[::2])}, average of loss (interval is 20): {np.mean(agent2.loss[::20])}, average of loss (interval is 100): {np.mean(agent2.loss[::100])}')
        agent2.show()
        print()
        print()
        

config: h=1, epsilon_decay=0.95
playing times: 2000
length of loss: 1936, average of loss (interval is 2): 19.958214418566207, average of loss (interval is 20): 20.756882024564078, average of loss (interval is 100): 17.16371552553028
Your action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=1, epsilon_decay=0.99
playing times: 10000
length of loss: 9936, average of loss (interval is 2): 4.62382058109342, average of loss (interval is 20): 4.710781207529292, average of loss (interval is 100): 4.070250947065651
Your action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=1, epsilon_decay=0.995
playing times: 13000
length of loss: 12936, average of loss (interval is 2): 8.8268795505

In [142]:
result_dict_drqnv_t

{'h=1': {'epsilon_decay=0.95': [3000, 2000, 3000, 3000, 2000],
  'epsilon_decay=0.99': [3000, 18000, 13000, 20000, 10000],
  'epsilon_decay=0.995': [7000, 8000, 20000, 5000, 13000],
  'epsilon_decay=0.999': [16000, 6000, 7000, 13000, 9000]},
 'h=2': {'epsilon_decay=0.95': [5000, 3000, 4000, 20000, 4000],
  'epsilon_decay=0.99': [10000, 3000, 4000, 16000, 11000],
  'epsilon_decay=0.995': [9000, 11000, 7000, 10000, 7000],
  'epsilon_decay=0.999': [20000, 7000, 19000, 20000, 8000]},
 'h=5': {'epsilon_decay=0.95': [5000, 3000, 3000, 4000, 4000],
  'epsilon_decay=0.99': [3000, 3000, 10000, 6000, 20000],
  'epsilon_decay=0.995': [15000, 20000, 10000, 5000, 6000],
  'epsilon_decay=0.999': [8000, 16000, 7000, 8000, 13000]},
 'h=10': {'epsilon_decay=0.95': [3000, 20000, 9000, 6000, 9000],
  'epsilon_decay=0.99': [5000, 8000, 5000, 13000, 6000],
  'epsilon_decay=0.995': [10000, 12000, 17000, 4000, 4000],
  'epsilon_decay=0.999': [11000, 11000, 7000, 6000, 20000]}}

In [147]:
for i in h:
    for j in epsilon_decay:
        strategy_dict_drqnv_t[f'h={i}'][f'epsilon_decay={j}'] = torch.Tensor(strategy_dict_drqnv_t[f'h={i}'][f'epsilon_decay={j}']).numpy().astype(int).tolist()
    print(f'h={i}')
    print(strategy_dict_drqnv_t[f'h={i}'])

h=1
{'epsilon_decay=0.95': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'epsilon_decay=0.99': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 0, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'epsilon_decay=0.995': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'epsilon_decay=0.999': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}
h=2
{'epsilon_decay=0.95': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'epsilon_decay=0.99': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 

In [149]:
for i in h:
    for j in epsilon_decay:
        print(np.mean(np.array(loss_dict_drqnv_t[f'h={i}'][f'epsilon_decay={j}'])))

12.414868480210291
64.1770756559956
7.172251405337815
12.621518122548967
15.981460710378439
10.155779616291039
8.0922326763928
7.329782502905262
9.076925424397809
8.973471412228323
6.925937601916589
11.510299137688111
7.22495812126611
12.386840782386141
6.774935636954588
10.22713355665719


## DRQN-Varient vs DRQN-Varient

In [141]:
result_dict_drqnv={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
loss_dict_drqnv={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
strategy_dict_drqnv={'h=1':{}, 'h=2':{}, 'h=5':{}, 'h=10':{}}
                
for key in result_dict_drqnv:
    result_dict_drqnv[key] = copy.deepcopy(epsilon_dict)
    loss_dict_drqnv[key] = copy.deepcopy(epsilon_dict)
    strategy_dict_drqnv[key] = copy.deepcopy(epsilon_dict)

for i in h:
    for j in epsilon_decay:
        config['h'] = 1
        config['epsilon_decay'] = j
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')
        
        rl_num = 10
        for _ in range(5):
            convergence = False
            # twoSimulate(dict({num: strategies[num],rl_num: strategies[rl_num]}), rl_num, config)
            with HiddenPrints():
                agent1 = constructAgent(strategies[rl_num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                while not convergence:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if agent2.play_times >= 20*k:
                        break
        
                result_dict_drqnv[f'h={i}'][f'epsilon_decay={j}'].append(agent2.play_times)
                loss_dict_drqnv[f'h={i}'][f'epsilon_decay={j}'].append([np.mean(agent1.loss),np.mean(agent2.loss)])
                strategy_dict_drqnv[f'h={i}'][f'epsilon_decay={j}'].append([agent1.own_memory[agent1.play_times-10:agent1.play_times].numpy().astype(int).tolist(), agent2.own_memory[agent2.play_times-10:agent2.play_times].numpy().astype(int).tolist()])
        print(f'playing times: {agent2.play_times}')
        print(f'length of loss: {len(agent1.loss)}, average of loss (interval is 2): {np.mean(agent1.loss[::2])}, average of loss (interval is 20): {np.mean(agent1.loss[::20])}, average of loss (interval is 100): {np.mean(agent1.loss[::100])}')
        print(f'length of loss: {len(agent2.loss)}, average of loss (interval is 2): {np.mean(agent2.loss[::2])}, average of loss (interval is 20): {np.mean(agent2.loss[::20])}, average of loss (interval is 100): {np.mean(agent2.loss[::100])}')
        agent2.show()
        print()
        print()
        

config: h=1, epsilon_decay=0.95
playing times: 6000
length of loss: 5936, average of loss (interval is 2): 3.873874127867348, average of loss (interval is 20): 3.979258805663899, average of loss (interval is 100): 3.188942242289583
length of loss: 5936, average of loss (interval is 2): 1.9740417012050522, average of loss (interval is 20): 2.0249151966067327, average of loss (interval is 100): 1.6428610040495792
Your action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=1, epsilon_decay=0.99
playing times: 4000
length of loss: 3936, average of loss (interval is 2): 5.398667894546094, average of loss (interval is 20): 5.473265752120671, average of loss (interval is 100): 4.036063334345817
length of loss: 3936, average of loss (interval is 2): 3.614369572677869, average of loss (interval is 20): 3.5694336149898276, average of 

playing times: 5000
length of loss: 4936, average of loss (interval is 2): 3.8621226131185478, average of loss (interval is 20): 4.048364943639952, average of loss (interval is 100): 3.0765360298752786
length of loss: 4936, average of loss (interval is 2): 1.6482140220825467, average of loss (interval is 20): 1.7004396611078065, average of loss (interval is 100): 1.741843352019787
Your action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=10, epsilon_decay=0.995
playing times: 4000
length of loss: 3936, average of loss (interval is 2): 2.115105207630895, average of loss (interval is 20): 2.142013110939016, average of loss (interval is 100): 1.848880621790886
length of loss: 3936, average of loss (interval is 2): 2.1006690108982045, average of loss (interval is 20): 2.127445940438866, average of loss (interval is 100): 2.11390341818

In [152]:
result_dict_drqnv

{'h=1': {'epsilon_decay=0.95': [13000, 6000, 2000, 13000, 6000],
  'epsilon_decay=0.99': [3000, 6000, 4000, 3000, 4000],
  'epsilon_decay=0.995': [5000, 8000, 4000, 5000, 6000],
  'epsilon_decay=0.999': [8000, 6000, 6000, 10000, 8000]},
 'h=2': {'epsilon_decay=0.95': [5000, 5000, 4000, 7000, 5000],
  'epsilon_decay=0.99': [3000, 4000, 3000, 3000, 4000],
  'epsilon_decay=0.995': [5000, 4000, 5000, 3000, 4000],
  'epsilon_decay=0.999': [7000, 6000, 6000, 7000, 9000]},
 'h=5': {'epsilon_decay=0.95': [11000, 2000, 6000, 3000, 6000],
  'epsilon_decay=0.99': [3000, 5000, 18000, 4000, 5000],
  'epsilon_decay=0.995': [4000, 4000, 3000, 4000, 8000],
  'epsilon_decay=0.999': [9000, 6000, 6000, 9000, 8000]},
 'h=10': {'epsilon_decay=0.95': [3000, 4000, 2000, 2000, 5000],
  'epsilon_decay=0.99': [5000, 6000, 5000, 5000, 5000],
  'epsilon_decay=0.995': [3000, 6000, 3000, 4000, 4000],
  'epsilon_decay=0.999': [10000, 8000, 8000, 7000, 8000]}}

In [153]:
strategy_dict_drqnv

{'h=1': {'epsilon_decay=0.95': [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]],
  'epsilon_decay=0.99': [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]],
  'epsilon_decay=0.995': [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[1, 1, 1, 1

In [190]:
for i in h:
    for j in epsilon_decay:
        print(np.mean(np.array(loss_dict_drqnv[f'h={i}'][f'epsilon_decay={j}'])[:,0]))

8.71502027226603
3.0871757431699516
3.9225479145171795
4.036711904387093
3.3467126590067187
3.0363597030676535
2.382156602091908
3.1833075166015696
6.580258945382764
7.736673866204468
4.135884103423476
4.6798164707273795
9.016569304337873
4.0369239787560565
2.8074456614052847
3.3759989282246763
