In [1]:
import torch
import random
import numpy as np
import matplotlib.pyplot as plt
import copy
from agent import *
from selection import *
from utils import *
from env import Environment
from simulation import constructAgent, twoSimulate
from main import Config

In [2]:
# seed_everything is pulled in by one of the star imports above (project code);
# presumably it seeds the random number generators so runs are repeatable --
# TODO confirm which libraries it covers and which seed it uses.
seed_everything()
# Final expression of the cell, so its value is displayed: whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

## DQN vs TfT

In [3]:
def determine_convergence(agent:object, threshold:int, k:int=100):
    """Tell whether the agent's two most recent k-action windows nearly coincide.

    The window covering the last ``k`` recorded actions is compared element-wise
    with the ``k`` actions recorded just before it; the summed absolute
    difference is the mismatch score.

    Args:
        agent: object exposing ``play_times`` (number of actions recorded so far)
            and ``own_memory`` (sliceable tensor of the agent's own actions).
        threshold: largest mismatch score still counted as converged.
        k: length of each comparison window.

    Returns:
        False while fewer than ``2*k`` actions exist or when the mismatch score
        exceeds ``threshold``; True otherwise.
    """
    played = agent.play_times
    # Two full windows are required before any comparison is possible.
    if played < 2*k:
        return False
    newest = agent.own_memory[played-k:played]
    previous = agent.own_memory[played-2*k:played-k]
    mismatch = torch.sum(torch.abs(newest - previous))
    # `not (a > b)` rather than `a <= b` so the result is a plain Python bool
    # with exactly the truth table of the strict comparison.
    return not (mismatch > threshold)

In [4]:
# Strategy id -> name handed to constructAgent. According to the original notes,
# ids 7-12 are the reinforcement-learning agents and ids 0-6 the scripted ones.
strategies = dict(enumerate([
    'ALLC', 'ALLD', 'TitForTat', 'revTitForTat', 'Random', 'Grudger', 'Pavlov',
    'QLearning', 'LSTM', 'DQN', 'LSTMQN', 'A2C', 'A2CLSTM',
]))

# Hyper-parameter grid; later experiment cells reuse `h` and `epsilon_decay`.
h = [1, 2, 5, 10]
epsilon_decay = [0.95, 0.99, 0.995, 0.999]

# Base settings wrapped by Config. 'h' and 'epsilon_decay' are overwritten for
# every grid point, so the values given here are only placeholders.
config = {
    # payoff values (presumably the prisoner's-dilemma payoffs -- confirm in env)
    'reward': 3,
    'sucker': 0,
    'temptation': 5,
    'punishment': 1,
    # learning / exploration settings
    'n_episodes': 10000,
    'discount': 0.99,
    'play_epsilon': 1,
    'select_epsilon': 1,
    'epsilon_decay': 0.999,
    'min_epsilon': 0.01,
    'alpha': 0.1,
    'n_actions': 2,
    'h': 10,
    'state_repr': 'bi',
    'batch_size': 64,
    'learning_rate': 1e-3,
}

# Template with one empty list per epsilon_decay setting. It is deep-copied for
# every h value so no two grid points ever share a list. Later cells reuse it.
epsilon_dict = {
    decay_name: []
    for decay_name in ('epsilon_decay=0.95', 'epsilon_decay=0.99', 'epsilon_decay=0.995', 'epsilon_decay=0.999')
}
result_dict, loss_dict, strategy_dict = (
    {h_name: copy.deepcopy(epsilon_dict) for h_name in ('h=1', 'h=2', 'h=5', 'h=10')}
    for _ in range(3)
)

for i in h:
    for j in epsilon_decay:
        config.update({'h': i, 'epsilon_decay': j})
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')

        # strategies[2] is 'TitForTat' (opponent), strategies[9] is 'DQN' (learner).
        num, rl_num = 2, 9
        h_key, decay_key = f'h={i}', f'epsilon_decay={j}'
        for _ in range(5):
            # HiddenPrints comes from a star import; presumably it silences the
            # agents' own console output -- verify in the project code.
            with HiddenPrints():
                agent1 = constructAgent(strategies[num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                # Play in batches of k, stopping once the learner's last two
                # k-action windows differ in at most 20 positions or once
                # 20*k plays have been reached.
                while True:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if convergence or agent2.play_times >= 20*k:
                        break

                final_actions = agent2.own_memory[agent2.play_times-10:agent2.play_times]
                result_dict[h_key][decay_key].append(agent2.play_times)
                loss_dict[h_key][decay_key].append(np.mean(agent2.loss))
                strategy_dict[h_key][decay_key].append(list(final_actions))
        # Only the last of the five repetitions is summarised below.
        print(f'playing times: {agent2.play_times}')
        print(f'length of loss: {len(agent2.loss)}, average of loss (interval is 2): {np.mean(agent2.loss[::2])}, average of loss (interval is 20): {np.mean(agent2.loss[::20])}, average of loss (interval is 100): {np.mean(agent2.loss[::100])}')
        agent2.show()
        print()
        print()

config: h=1, epsilon_decay=0.95
playing times: 20000
length of loss: 19936, average of loss (interval is 2): 4.116038464949306, average of loss (interval is 20): 4.226298858814218, average of loss (interval is 100): 4.110161491520703
Your action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
        1., 0.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1.,
        0., 1.])


config: h=1, epsilon_decay=0.99
playing times: 3000
length of loss: 2936, average of loss (interval is 2): 7.489043057898417, average of loss (interval is 20): 8.077306690796906, average of loss (interval is 100): 5.947919905744493
Your action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=1, epsilon_decay=0.995
playing times: 3000
length of loss: 2936, average of loss (interval is 2)

In [5]:
# Turn every stored list of action tensors into nested lists of plain ints so
# the tables print compactly.
for i in h:
    per_decay = strategy_dict[f'h={i}']
    for j in epsilon_decay:
        decay_key = f'epsilon_decay={j}'
        stacked_actions = torch.Tensor(per_decay[decay_key])
        per_decay[decay_key] = stacked_actions.numpy().astype(int).tolist()
# Plays needed per run, a blank line, then the final 10 actions of each run.
print(result_dict, strategy_dict, sep='\n\n')

{'h=1': {'epsilon_decay=0.95': [3000, 20000, 3000, 3000, 20000], 'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000], 'epsilon_decay=0.995': [3000, 3000, 3000, 3000, 3000], 'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]}, 'h=2': {'epsilon_decay=0.95': [4000, 3000, 5000, 8000, 3000], 'epsilon_decay=0.99': [8000, 3000, 4000, 3000, 3000], 'epsilon_decay=0.995': [3000, 4000, 9000, 4000, 3000], 'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]}, 'h=5': {'epsilon_decay=0.95': [3000, 3000, 7000, 4000, 5000], 'epsilon_decay=0.99': [3000, 3000, 5000, 8000, 20000], 'epsilon_decay=0.995': [7000, 20000, 6000, 4000, 20000], 'epsilon_decay=0.999': [20000, 9000, 20000, 20000, 20000]}, 'h=10': {'epsilon_decay=0.95': [20000, 3000, 3000, 3000, 5000], 'epsilon_decay=0.99': [5000, 6000, 20000, 3000, 4000], 'epsilon_decay=0.995': [4000, 4000, 3000, 4000, 10000], 'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]}}

{'h=1': {'epsilon_decay=0.95': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [6]:
# One line per grid point (h outer, epsilon_decay inner): the mean over the
# five runs of each run's average DQN loss.
for i in h:
    losses_for_h = loss_dict[f'h={i}']
    for j in epsilon_decay:
        run_means = losses_for_h[f'epsilon_decay={j}']
        print(np.mean(run_means))

5.8020073184537395
7.096781128811779
6.426352696914155
7.009957177002221
9.312634296046323
7.32161069686447
6.298518353351357
7.8245764660819574
7.374517959213087
13.117284897510364
22.171735876204444
6.650255216491738
41.68496656525413
54.37895088756359
75.1673582437496
5.0361007256933545


## DQN vs DQN

In [7]:
# Result tables for DQN self-play: one private deep copy of the empty
# epsilon_dict template per h value and per table.
result_dict_dqn, loss_dict_dqn, strategy_dict_dqn = (
    {h_name: copy.deepcopy(epsilon_dict) for h_name in ('h=1', 'h=2', 'h=5', 'h=10')}
    for _ in range(3)
)

for i in h:
    for j in epsilon_decay:
        config.update({'h': i, 'epsilon_decay': j})
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')

        rl_num = 9  # both players are built from strategies[9]
        h_key, decay_key = f'h={i}', f'epsilon_decay={j}'
        for _ in range(5):
            with HiddenPrints():
                agent1 = constructAgent(strategies[rl_num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                # Convergence is judged on agent2 only; the run is capped at 20*k plays.
                while True:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if convergence or agent2.play_times >= 20*k:
                        break

                result_dict_dqn[h_key][decay_key].append(agent2.play_times)
                # Pairs are ordered [agent1, agent2].
                loss_dict_dqn[h_key][decay_key].append(
                    [np.mean(player.loss) for player in (agent1, agent2)]
                )
                strategy_dict_dqn[h_key][decay_key].append([
                    player.own_memory[player.play_times-10:player.play_times].numpy().astype(int).tolist()
                    for player in (agent1, agent2)
                ])
        # Summary of the last of the five repetitions: agent1's loss line, then agent2's.
        print(f'playing times: {agent2.play_times}')
        print(f'length of loss: {len(agent1.loss)}, average of loss (interval is 2): {np.mean(agent1.loss[::2])}, average of loss (interval is 20): {np.mean(agent1.loss[::20])}, average of loss (interval is 100): {np.mean(agent1.loss[::100])}')
        print(f'length of loss: {len(agent2.loss)}, average of loss (interval is 2): {np.mean(agent2.loss[::2])}, average of loss (interval is 20): {np.mean(agent2.loss[::20])}, average of loss (interval is 100): {np.mean(agent2.loss[::100])}')
        agent2.show()
        print()
        print()

config: h=1, epsilon_decay=0.95
playing times: 6000
length of loss: 5936, average of loss (interval is 2): 16.66003415278756, average of loss (interval is 20): 16.415073569776233, average of loss (interval is 100): 15.694142721096675
length of loss: 5936, average of loss (interval is 2): 7.205223576020138, average of loss (interval is 20): 7.050136550257503, average of loss (interval is 100): 7.05993527273337
Your action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=1, epsilon_decay=0.99
playing times: 20000
length of loss: 19936, average of loss (interval is 2): 3.765200049057304, average of loss (interval is 20): 3.766812394022822, average of loss (interval is 100): 3.842576918378472
length of loss: 19936, average of loss (interval is 2): 8.724930017325706, average of loss (interval is 20): 8.81258524953058, average of loss (int

playing times: 4000
length of loss: 3927, average of loss (interval is 2): 39.74156008666863, average of loss (interval is 20): 39.14970471350675, average of loss (interval is 100): 38.4372915238142
length of loss: 3927, average of loss (interval is 2): 53.9493605391314, average of loss (interval is 20): 55.327261114483555, average of loss (interval is 100): 57.51042265892029
Your action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])
Oppo action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])


config: h=10, epsilon_decay=0.99
playing times: 4000
length of loss: 3927, average of loss (interval is 2): 92.06164374024951, average of loss (interval is 20): 92.61497359106383, average of loss (interval is 100): 90.12103570997715
length of loss: 3927, average of loss (interval is 2): 83.13210276840539, average of loss (interval is 20): 84.06750825667744, average of loss (interval is 100): 88.4

In [8]:
# Display agent2.play_times at the end of each DQN-vs-DQN run, per (h, epsilon_decay) setting.
result_dict_dqn

{'h=1': {'epsilon_decay=0.95': [20000, 20000, 3000, 3000, 6000],
  'epsilon_decay=0.99': [20000, 20000, 3000, 20000, 20000],
  'epsilon_decay=0.995': [4000, 20000, 20000, 8000, 16000],
  'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]},
 'h=2': {'epsilon_decay=0.95': [3000, 3000, 2000, 3000, 3000],
  'epsilon_decay=0.99': [20000, 4000, 8000, 3000, 11000],
  'epsilon_decay=0.995': [4000, 20000, 18000, 5000, 6000],
  'epsilon_decay=0.999': [20000, 20000, 20000, 20000, 20000]},
 'h=5': {'epsilon_decay=0.95': [20000, 8000, 3000, 5000, 5000],
  'epsilon_decay=0.99': [3000, 6000, 8000, 4000, 9000],
  'epsilon_decay=0.995': [20000, 8000, 8000, 20000, 9000],
  'epsilon_decay=0.999': [20000, 7000, 20000, 20000, 6000]},
 'h=10': {'epsilon_decay=0.95': [3000, 4000, 10000, 20000, 4000],
  'epsilon_decay=0.99': [3000, 5000, 19000, 4000, 4000],
  'epsilon_decay=0.995': [6000, 5000, 3000, 3000, 3000],
  'epsilon_decay=0.999': [20000, 6000, 9000, 15000, 9000]}}

In [9]:
# Display the final 10 recorded actions of each run as [agent1, agent2] pairs.
strategy_dict_dqn

{'h=1': {'epsilon_decay=0.95': [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 1, 0, 1, 1, 1, 1, 1, 1, 1]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]],
  'epsilon_decay=0.99': [[[1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]],
  'epsilon_decay=0.995': [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[1, 0, 1, 0, 1, 0, 1, 0, 1, 0], [1, 0, 1, 0, 1, 0, 1, 1, 1, 1]],
   [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
   [[0, 0, 0, 0

In [10]:
# One line per grid point: agent1's average loss (column 0 of the stored
# [agent1, agent2] pairs), averaged over the five runs.
for i in h:
    losses_for_h = loss_dict_dqn[f'h={i}']
    for j in epsilon_decay:
        paired_losses = np.array(losses_for_h[f'epsilon_decay={j}'])
        print(np.mean(paired_losses[:,0]))

10.36281814012781
5.30305792075878
8.368344093669322
6.437139419724933
16.61400641145889
22.313856821682563
25.244923164907213
9.55751421001342
33.46241106023884
36.018553720553626
22.911297939896606
33.22744478771189
34.79207039937955
68.35792606106725
41.8682668179988
41.587205091281305


## DRQN vs TfT

In [11]:
# Result tables for the LSTMQN learner against TitForTat: one private deep copy
# of the empty epsilon_dict template per h value and per table.
result_dict_drqn_t, loss_dict_drqn_t, strategy_dict_drqn_t = (
    {h_name: copy.deepcopy(epsilon_dict) for h_name in ('h=1', 'h=2', 'h=5', 'h=10')}
    for _ in range(3)
)

for i in h:
    for j in epsilon_decay:
        config.update({'h': i, 'epsilon_decay': j})
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')

        # strategies[2] is 'TitForTat' (opponent), strategies[10] is 'LSTMQN' (learner).
        n_num, rl_num = 2, 10
        h_key, decay_key = f'h={i}', f'epsilon_decay={j}'
        for _ in range(5):
            with HiddenPrints():
                agent1 = constructAgent(strategies[n_num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                # Batches of k plays until the learner's behaviour settles or
                # 20*k plays have been reached.
                while True:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if convergence or agent2.play_times >= 20*k:
                        break

                final_actions = agent2.own_memory[agent2.play_times-10:agent2.play_times]
                result_dict_drqn_t[h_key][decay_key].append(agent2.play_times)
                loss_dict_drqn_t[h_key][decay_key].append(np.mean(agent2.loss))
                strategy_dict_drqn_t[h_key][decay_key].append(final_actions.numpy().astype(int).tolist())
        # Only the last of the five repetitions is summarised below.
        print(f'playing times: {agent2.play_times}')
        print(f'length of loss: {len(agent2.loss)}, average of loss (interval is 2): {np.mean(agent2.loss[::2])}, average of loss (interval is 20): {np.mean(agent2.loss[::20])}, average of loss (interval is 100): {np.mean(agent2.loss[::100])}')
        agent2.show()
        print()
        print()

config: h=1, epsilon_decay=0.95
playing times: 3000
length of loss: 2936, average of loss (interval is 2): 0.10015642578853146, average of loss (interval is 20): 0.11335388142257823, average of loss (interval is 100): 0.09106246220738588
Your action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])
Oppo action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])


config: h=1, epsilon_decay=0.99
playing times: 3000
length of loss: 2936, average of loss (interval is 2): 0.33327397430011857, average of loss (interval is 20): 0.49396941360202223, average of loss (interval is 100): 0.19131687412736936
Your action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Oppo action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


config: h=1, epsilon_decay=0.995
playing times: 3000
length of loss: 2936, average of loss (inte

In [12]:
# Display the learner's play_times at the end of each run against TitForTat, per (h, epsilon_decay) setting.
result_dict_drqn_t

{'h=1': {'epsilon_decay=0.95': [3000, 3000, 3000, 2000, 3000],
  'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.995': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.999': [6000, 5000, 5000, 6000, 5000]},
 'h=2': {'epsilon_decay=0.95': [4000, 2000, 5000, 2000, 9000],
  'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.995': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.999': [5000, 5000, 5000, 8000, 6000]},
 'h=5': {'epsilon_decay=0.95': [3000, 13000, 2000, 4000, 3000],
  'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.995': [3000, 3000, 3000, 6000, 3000],
  'epsilon_decay=0.999': [11000, 9000, 5000, 6000, 6000]},
 'h=10': {'epsilon_decay=0.95': [3000, 4000, 2000, 3000, 3000],
  'epsilon_decay=0.99': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.995': [3000, 3000, 3000, 3000, 3000],
  'epsilon_decay=0.999': [10000, 5000, 8000, 9000, 6000]}}

In [13]:
# Display the learner's final 10 recorded actions for each run.
strategy_dict_drqn_t

{'h=1': {'epsilon_decay=0.95': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
  'epsilon_decay=0.99': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
  'epsilon_decay=0.995': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
  'epsilon_decay=0.999': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]},
 'h=2': {'epsilon_decay=0.95': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   [0, 0, 0, 0, 0, 0, 0, 0, 0

In [14]:
# One line per grid point (h outer, epsilon_decay inner): the mean over the
# five runs of each run's average learner loss.
for i in h:
    losses_for_h = loss_dict_drqn_t[f'h={i}']
    for j in epsilon_decay:
        run_means = losses_for_h[f'epsilon_decay={j}']
        print(np.mean(run_means))

0.19736206660347236
0.3169081791133227
0.3141808840314436
0.2345171351167666
0.1611805972969937
0.31566605395002834
0.29898199706499035
0.20585462610902844
0.13222151485549857
0.3299996406784215
0.388512719779985
0.2090834334060912
0.20063491137410666
0.3059607806440088
0.37549847163298866
0.2417215784707242


## DRQN vs DRQN

In [None]:
# Result tables for LSTMQN self-play: one private deep copy of the empty
# epsilon_dict template per h value and per table.
result_dict_drqn, loss_dict_drqn, strategy_dict_drqn = (
    {h_name: copy.deepcopy(epsilon_dict) for h_name in ('h=1', 'h=2', 'h=5', 'h=10')}
    for _ in range(3)
)

for i in h:
    for j in epsilon_decay:
        config.update({'h': i, 'epsilon_decay': j})
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')

        rl_num = 10  # both players are built from strategies[10]
        h_key, decay_key = f'h={i}', f'epsilon_decay={j}'
        for _ in range(5):
            with HiddenPrints():
                agent1 = constructAgent(strategies[rl_num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                # Convergence is judged on agent2 only; the run is capped at 20*k plays.
                while True:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if convergence or agent2.play_times >= 20*k:
                        break

                result_dict_drqn[h_key][decay_key].append(agent2.play_times)
                # Pairs are ordered [agent1, agent2].
                loss_dict_drqn[h_key][decay_key].append(
                    [np.mean(player.loss) for player in (agent1, agent2)]
                )
                strategy_dict_drqn[h_key][decay_key].append([
                    player.own_memory[player.play_times-10:player.play_times].numpy().astype(int).tolist()
                    for player in (agent1, agent2)
                ])
        # Summary of the last of the five repetitions: agent1's loss line, then agent2's.
        print(f'playing times: {agent2.play_times}')
        print(f'length of loss: {len(agent1.loss)}, average of loss (interval is 2): {np.mean(agent1.loss[::2])}, average of loss (interval is 20): {np.mean(agent1.loss[::20])}, average of loss (interval is 100): {np.mean(agent1.loss[::100])}')
        print(f'length of loss: {len(agent2.loss)}, average of loss (interval is 2): {np.mean(agent2.loss[::2])}, average of loss (interval is 20): {np.mean(agent2.loss[::20])}, average of loss (interval is 100): {np.mean(agent2.loss[::100])}')
        agent2.show()
        print()
        print()

config: h=1, epsilon_decay=0.95
playing times: 3000
length of loss: 2936, average of loss (interval is 2): 0.4487104880408006, average of loss (interval is 20): 0.47947113510663936, average of loss (interval is 100): 0.3961803452776318
length of loss: 2936, average of loss (interval is 2): 0.6912727479154854, average of loss (interval is 20): 0.7411777464347163, average of loss (interval is 100): 0.5318989552227625
Your action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])
Oppo action: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])


config: h=1, epsilon_decay=0.99
playing times: 3000
length of loss: 2936, average of loss (interval is 2): 0.27358664700737984, average of loss (interval is 20): 0.2932020850775789, average of loss (interval is 100): 0.28660630601807496
length of loss: 2936, average of loss (interval is 2): 0.19517364247438146, average of loss (interval is 20): 0.2064740381

In [None]:
# Display agent2.play_times at the end of each LSTMQN self-play run, per (h, epsilon_decay) setting.
result_dict_drqn

In [None]:
# Display the final 10 recorded actions of each run as [agent1, agent2] pairs.
strategy_dict_drqn

In [None]:
# One line per grid point: agent1's average loss (column 0 of the stored
# [agent1, agent2] pairs), averaged over the five runs.
for i in h:
    losses_for_h = loss_dict_drqn[f'h={i}']
    for j in epsilon_decay:
        paired_losses = np.array(losses_for_h[f'epsilon_decay={j}'])
        print(np.mean(paired_losses[:,0]))

## QLearning vs TfT

In [None]:
# Result tables for the QLearning agent against TitForTat (no loss table is
# kept here): one private deep copy of the empty epsilon_dict template per h.
result_dict_q, strategy_dict_q = (
    {h_name: copy.deepcopy(epsilon_dict) for h_name in ('h=1', 'h=2', 'h=5', 'h=10')}
    for _ in range(2)
)

for i in h:
    for j in epsilon_decay:
        config.update({'h': i, 'epsilon_decay': j})
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')

        # strategies[2] is 'TitForTat' (opponent), strategies[7] is 'QLearning' (learner).
        n_num, rl_num = 2, 7
        h_key, decay_key = f'h={i}', f'epsilon_decay={j}'
        for _ in range(5):
            with HiddenPrints():
                agent1 = constructAgent(strategies[n_num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                # Batches of k plays until the learner's behaviour settles or
                # 20*k plays have been reached.
                while True:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if convergence or agent2.play_times >= 20*k:
                        break

                final_actions = agent2.own_memory[agent2.play_times-10:agent2.play_times]
                result_dict_q[h_key][decay_key].append(agent2.play_times)
                strategy_dict_q[h_key][decay_key].append(list(final_actions))
        # Only the last of the five repetitions is summarised below.
        print(f'playing times: {agent2.play_times}')
        agent2.show()
        print()
        print()

## DRQN-Variant vs TfT

In [None]:
# Strategy id -> name handed to constructAgent. According to the original notes,
# ids 7-12 are the reinforcement-learning agents and ids 0-6 the scripted ones.
strategies = dict(enumerate([
    'ALLC', 'ALLD', 'TitForTat', 'revTitForTat', 'Random', 'Grudger', 'Pavlov',
    'QLearning', 'LSTM', 'DQN', 'LSTMQN', 'A2C', 'A2CLSTM',
]))

# Hyper-parameter grid for this section.
h = [1, 2, 5, 10]
epsilon_decay = [0.95, 0.99, 0.995, 0.999]

# Same base settings as the earlier sections except 'state_repr', which is
# 'bi-repr' here. 'h' and 'epsilon_decay' are overwritten for every grid point.
config = {
    # payoff values (presumably the prisoner's-dilemma payoffs -- confirm in env)
    'reward': 3,
    'sucker': 0,
    'temptation': 5,
    'punishment': 1,
    # learning / exploration settings
    'n_episodes': 10000,
    'discount': 0.99,
    'play_epsilon': 1,
    'select_epsilon': 1,
    'epsilon_decay': 0.999,
    'min_epsilon': 0.01,
    'alpha': 0.1,
    'n_actions': 2,
    'h': 10,
    'state_repr': 'bi-repr',
    'batch_size': 64,
    'learning_rate': 1e-3,
}

# Result tables: one private deep copy of the empty epsilon_dict template
# (defined in the first experiment cell) per h value and per table.
result_dict_drqnv_t, loss_dict_drqnv_t, strategy_dict_drqnv_t = (
    {h_name: copy.deepcopy(epsilon_dict) for h_name in ('h=1', 'h=2', 'h=5', 'h=10')}
    for _ in range(3)
)

for i in h:
    for j in epsilon_decay:
        config.update({'h': i, 'epsilon_decay': j})
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')

        # strategies[2] is 'TitForTat' (opponent), strategies[10] is 'LSTMQN' (learner).
        num, rl_num = 2, 10
        h_key, decay_key = f'h={i}', f'epsilon_decay={j}'
        for _ in range(5):
            with HiddenPrints():
                agent1 = constructAgent(strategies[num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                # Batches of k plays until the learner's behaviour settles or
                # 20*k plays have been reached.
                while True:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if convergence or agent2.play_times >= 20*k:
                        break

                final_actions = agent2.own_memory[agent2.play_times-10:agent2.play_times]
                result_dict_drqnv_t[h_key][decay_key].append(agent2.play_times)
                loss_dict_drqnv_t[h_key][decay_key].append(np.mean(agent2.loss))
                strategy_dict_drqnv_t[h_key][decay_key].append(list(final_actions))
        # Only the last of the five repetitions is summarised below.
        print(f'playing times: {agent2.play_times}')
        print(f'length of loss: {len(agent2.loss)}, average of loss (interval is 2): {np.mean(agent2.loss[::2])}, average of loss (interval is 20): {np.mean(agent2.loss[::20])}, average of loss (interval is 100): {np.mean(agent2.loss[::100])}')
        agent2.show()
        print()
        print()
        

In [None]:
# Display the learner's play_times at the end of each run against TitForTat, per (h, epsilon_decay) setting.
result_dict_drqnv_t

In [None]:
# Convert each stored list of action tensors to nested lists of plain ints,
# then print the table one h value at a time.
for i in h:
    h_key = f'h={i}'
    per_decay = strategy_dict_drqnv_t[h_key]
    for j in epsilon_decay:
        decay_key = f'epsilon_decay={j}'
        stacked_actions = torch.Tensor(per_decay[decay_key])
        per_decay[decay_key] = stacked_actions.numpy().astype(int).tolist()
    print(h_key)
    print(per_decay)

In [None]:
# One line per grid point (h outer, epsilon_decay inner): the mean over the
# five runs of each run's average learner loss.
for i in h:
    losses_for_h = loss_dict_drqnv_t[f'h={i}']
    for j in epsilon_decay:
        run_means = np.array(losses_for_h[f'epsilon_decay={j}'])
        print(np.mean(run_means))

## DRQN-Variant vs DRQN-Variant

In [None]:
# Result tables for LSTMQN self-play in this section: one private deep copy of
# the empty epsilon_dict template per h value and per table.
result_dict_drqnv, loss_dict_drqnv, strategy_dict_drqnv = (
    {h_name: copy.deepcopy(epsilon_dict) for h_name in ('h=1', 'h=2', 'h=5', 'h=10')}
    for _ in range(3)
)

for i in h:
    for j in epsilon_decay:
        config.update({'h': i, 'epsilon_decay': j})
        config_ob = Config(config)
        env = Environment(config_ob)
        print(f'config: h={i}, epsilon_decay={j}')

        rl_num = 10  # both players are built from strategies[10]
        h_key, decay_key = f'h={i}', f'epsilon_decay={j}'
        for _ in range(5):
            with HiddenPrints():
                agent1 = constructAgent(strategies[rl_num], config_ob)
                agent2 = constructAgent(strategies[rl_num], config_ob)

                k = 1000
                # Convergence is judged on agent2 only; the run is capped at 20*k plays.
                while True:
                    env.play(agent1, agent2, k)
                    convergence = determine_convergence(agent2, 20, k=k)
                    if convergence or agent2.play_times >= 20*k:
                        break

                result_dict_drqnv[h_key][decay_key].append(agent2.play_times)
                # Pairs are ordered [agent1, agent2].
                loss_dict_drqnv[h_key][decay_key].append(
                    [np.mean(player.loss) for player in (agent1, agent2)]
                )
                strategy_dict_drqnv[h_key][decay_key].append([
                    player.own_memory[player.play_times-10:player.play_times].numpy().astype(int).tolist()
                    for player in (agent1, agent2)
                ])
        # Summary of the last of the five repetitions: agent1's loss line, then agent2's.
        print(f'playing times: {agent2.play_times}')
        print(f'length of loss: {len(agent1.loss)}, average of loss (interval is 2): {np.mean(agent1.loss[::2])}, average of loss (interval is 20): {np.mean(agent1.loss[::20])}, average of loss (interval is 100): {np.mean(agent1.loss[::100])}')
        print(f'length of loss: {len(agent2.loss)}, average of loss (interval is 2): {np.mean(agent2.loss[::2])}, average of loss (interval is 20): {np.mean(agent2.loss[::20])}, average of loss (interval is 100): {np.mean(agent2.loss[::100])}')
        agent2.show()
        print()
        print()
        

In [None]:
# Display agent2.play_times at the end of each self-play run, per (h, epsilon_decay) setting.
result_dict_drqnv

In [None]:
# Display the final 10 recorded actions of each run as [agent1, agent2] pairs.
strategy_dict_drqnv

In [None]:
# One line per grid point: agent1's average loss (column 0 of the stored
# [agent1, agent2] pairs), averaged over the five runs.
for i in h:
    losses_for_h = loss_dict_drqnv[f'h={i}']
    for j in epsilon_decay:
        paired_losses = np.array(losses_for_h[f'epsilon_decay={j}'])
        print(np.mean(paired_losses[:,0]))