In [1]:
import torch
import random
import numpy as np
import matplotlib.pyplot as plt
from agent import *
from selection import *
from utils import *
from env import Environment
from simulation import constructAgent
from main import Config

In [2]:
# Settings for this batch of runs. MULTI_SELECTION_METHOD is only assigned in
# this cell; nothing else here reads it.
MULTI_SELECTION_METHOD = 'LSTM-VAR'
config = Config({
    # payoff entries
    'reward': 3,
    'sucker': 0,
    'temptation': 5,
    'punishment': 1,
    # run length / discounting
    'n_episodes': 5000,
    'discount': 0.99,
    # epsilon settings (starting values, decay factor, floor)
    'play_epsilon': 1,
    'select_epsilon': 1,
    'epsilon_decay': 0.999,
    'min_epsilon': 0.01,
    # remaining agent / learning settings
    'alpha': 0.1,
    'n_actions': 2,
    'h': 10,
    'state_repr': 'bi-repr',
    'batch_size': 64,
    'learning_rate': 1e-3,
})
# The environment is built before seeding, exactly as in the original ordering.
env = Environment(config)
seed_everything()

In [3]:
# Population for this run: 15 'LSTMQN' agents, keyed by consecutive integer ids.
# Other mixes are expressed by editing the (kind, count) list below.
agents = {}
index = 0
with HiddenPrints():  # agents are constructed inside HiddenPrints (star-imported; presumably mutes stdout)
    for kind, how_many in [('LSTMQN', 15)]:
        for _ in range(how_many):
            agents[index] = constructAgent(kind, config)
            index += 1

# Run the selection experiment; the returned mapping replaces the freshly built one.
agents = lstm_variant_selection(config, agents, env)

# Per-agent report. A recorded move equal to 1 is what the report counts as "D".
for n in range(len(agents)):
    # Slice each memory to the first `play_times` entries once, instead of once per printed field.
    own_moves = list(agents[n].own_memory[:agents[n].play_times])
    faced_moves = list(agents[n].opponent_memory[:agents[n].play_times])
    own_d_count = own_moves.count(1)
    # An agent with no recorded plays would otherwise raise ZeroDivisionError
    # and abort the whole report after the long simulation; print nan instead.
    own_d_ratio = own_d_count/len(own_moves) if own_moves else float('nan')
    faced_d_ratio = faced_moves.count(1)/len(faced_moves) if faced_moves else float('nan')
    print('Agent{}: name:{}  final score:{}  play time:{}  times to play D:{}  ratio: {}  faced D ratio: {}'
        .format(n, agents[n].name, agents[n].running_score,
                len(own_moves), own_d_count, own_d_ratio, faced_d_ratio))
print('The reward for total society: {}'.format(env.running_score/len(agents)))

100%|███████████████████████████████████████| 5000/5000 [41:01<00:00,  2.03it/s]


Agent0: name:LSTMQN  final score:192.49930955572668  play time:8537  times to play D:6976  ratio: 0.8171488813400491  faced D ratio: 0.8707977041115146
Agent1: name:LSTMQN  final score:202.49820512681507  play time:9046  times to play D:6805  ratio: 0.7522661950033164  faced D ratio: 0.7066106566438205
Agent2: name:LSTMQN  final score:129.95001785988626  play time:7369  times to play D:6029  ratio: 0.8181571447957661  faced D ratio: 0.6201655584204099
Agent3: name:LSTMQN  final score:150.30131529770244  play time:9386  times to play D:6839  ratio: 0.7286383976134668  faced D ratio: 0.7939484338376305
Agent4: name:LSTMQN  final score:103.96503257143743  play time:8374  times to play D:6470  ratio: 0.7726295677095772  faced D ratio: 0.8865536183424887
Agent5: name:LSTMQN  final score:118.55748120551888  play time:7597  times to play D:6514  ratio: 0.8574437277872845  faced D ratio: 0.8473081479531394
Agent6: name:LSTMQN  final score:192.62982131390172  play time:17573  times to play D:13

In [4]:
def get_index_from_action(action, idx):
    """Map action values onto indices that skip ``idx``.

    Every value greater than or equal to ``idx`` is shifted up by one; smaller
    values are returned unchanged, so for integer inputs ``idx`` itself is
    never produced.

    Args:
        action: iterable of action values.
        idx: the index to skip (the caller passes the selecting agent's key).

    Returns:
        A list with one mapped value per input element, in input order.
    """
    return [a + 1 if a >= idx else a for a in action]

# For every agent, summarise its selection memory: which opponent indices it
# picked, how often, and the mean reward recorded for each of them.
for idx in agents:
    transitions = agents[idx].SelectMemory.memory
    # Each transition is unpacked as a 4-tuple; only the 2nd (action) and
    # 3rd (reward) fields are used.
    _, chosen, earned, _ = zip(*transitions)
    # Shift raw action values to opponent indices (skipping this agent's own key).
    actions = np.array(get_index_from_action(np.array(chosen, dtype=int), idx))
    rewards = np.array(earned)
    print(f'Agent {idx}:', end='')
    values, counts = np.unique(actions, return_counts=True)
    print(f' opponent_idx: {list(values)}, counts: {list(counts)} ', end='')
    # Rewards grouped by the opponent index they were recorded against.
    dict_idx = {x: rewards[actions == x] for x in values}
    print(f'rewards: {[np.mean(y) for y in dict_idx.values()]}')
    # Disabled alternative: per-opponent spread instead of the mean.
    # print(f'rewards: {[np.std(y) for _, y in dict_idx.items()]}')
# Disabled debugging views of the last agent's data:
# plt.plot(actions)
# plt.show()
# print(rewards)

Agent 0: opponent_idx: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [5, 8, 315, 411, 644, 8, 76, 6, 597, 7, 7, 120, 2414, 374] rewards: [1.6, 1.875, 1.4603174603174602, 1.3819951338199514, 1.0962732919254659, 1.625, 1.9210526315789473, 2.6666666666666665, 1.8442211055276383, 1.5714285714285714, 1.7142857142857142, 1.3833333333333333, 1.096106048053024, 0.9171122994652406]
Agent 1: opponent_idx: [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [23, 41, 9, 43, 422, 1109, 1322, 82, 179, 4, 11, 1734, 7, 6] rewards: [0.6086956521739131, 1.0731707317073171, 1.6666666666666667, 3.372093023255814, 1.8056872037914693, 1.8440036068530208, 2.2783661119515886, 0.6707317073170732, 2.1284916201117317, 4.5, 2.5454545454545454, 2.086505190311419, 1.5714285714285714, 2.0]
Agent 2: opponent_idx: [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [6, 81, 2, 8, 35, 2089, 1664, 593, 427, 30, 6, 7, 4, 40] rewards: [0.6666666666666666, 1.345679012345679, 4.0, 1.75, 1.057142857142857

In [5]:
# Settings for this batch of runs. MULTI_SELECTION_METHOD is only assigned in
# this cell; nothing else here reads it.
MULTI_SELECTION_METHOD = 'LSTM-VAR'
config = Config({
    # payoff entries
    'reward': 3,
    'sucker': 0,
    'temptation': 5,
    'punishment': 1,
    # run length / discounting
    'n_episodes': 10000,
    'discount': 0.99,
    # epsilon settings (starting values, decay factor, floor)
    'play_epsilon': 1,
    'select_epsilon': 1,
    'epsilon_decay': 0.999,
    'min_epsilon': 0.01,
    # remaining agent / learning settings
    'alpha': 0.1,
    'n_actions': 2,
    'h': 10,
    'state_repr': 'bi-repr',
    'batch_size': 64,
    'learning_rate': 1e-3,
})
# The environment is built before seeding, exactly as in the original ordering.
env = Environment(config)
seed_everything()

In [6]:
# Population for this run: 15 'LSTMQN' agents, keyed by consecutive integer ids.
# Other mixes are expressed by editing the (kind, count) list below.
agents = {}
index = 0
with HiddenPrints():  # agents are constructed inside HiddenPrints (star-imported; presumably mutes stdout)
    for kind, how_many in [('LSTMQN', 15)]:
        for _ in range(how_many):
            agents[index] = constructAgent(kind, config)
            index += 1

# Run the selection experiment; the returned mapping replaces the freshly built one.
agents = lstm_variant_selection(config, agents, env)

# Per-agent report. A recorded move equal to 1 is what the report counts as "D".
for n in range(len(agents)):
    # Slice each memory to the first `play_times` entries once, instead of once per printed field.
    own_moves = list(agents[n].own_memory[:agents[n].play_times])
    faced_moves = list(agents[n].opponent_memory[:agents[n].play_times])
    own_d_count = own_moves.count(1)
    # An agent with no recorded plays would otherwise raise ZeroDivisionError
    # and abort the whole report after the long simulation; print nan instead.
    own_d_ratio = own_d_count/len(own_moves) if own_moves else float('nan')
    faced_d_ratio = faced_moves.count(1)/len(faced_moves) if faced_moves else float('nan')
    print('Agent{}: name:{}  final score:{}  play time:{}  times to play D:{}  ratio: {}  faced D ratio: {}'
        .format(n, agents[n].name, agents[n].running_score,
                len(own_moves), own_d_count, own_d_ratio, faced_d_ratio))
print('The reward for total society: {}'.format(env.running_score/len(agents)))

100%|███████████████████████████████████| 10000/10000 [1:57:31<00:00,  1.42it/s]


Agent0: name:LSTMQN  final score:211.0547075668551  play time:16989  times to play D:13169  ratio: 0.7751486255812584  faced D ratio: 0.6967449526163988
Agent1: name:LSTMQN  final score:208.695557622663  play time:21989  times to play D:13590  ratio: 0.6180362908727091  faced D ratio: 0.6442766837964436
Agent2: name:LSTMQN  final score:207.26825968312187  play time:20928  times to play D:14874  ratio: 0.7107224770642202  faced D ratio: 0.6801892201834863
Agent3: name:LSTMQN  final score:103.3718353843269  play time:14973  times to play D:12495  ratio: 0.8345021037868162  faced D ratio: 0.7452748280237761
Agent4: name:LSTMQN  final score:213.1758166448058  play time:15510  times to play D:8522  ratio: 0.549451966473243  faced D ratio: 0.6444874274661508
Agent5: name:LSTMQN  final score:134.87981524057417  play time:20054  times to play D:16109  ratio: 0.8032811409195173  faced D ratio: 0.7484292410491672
Agent6: name:LSTMQN  final score:84.9413001144647  play time:14542  times to play D

In [7]:
def get_index_from_action(action, idx):
    """Map action values onto indices that skip ``idx``.

    Every value greater than or equal to ``idx`` is shifted up by one; smaller
    values are returned unchanged, so for integer inputs ``idx`` itself is
    never produced.

    Args:
        action: iterable of action values.
        idx: the index to skip (the caller passes the selecting agent's key).

    Returns:
        A list with one mapped value per input element, in input order.
    """
    return [a + 1 if a >= idx else a for a in action]

# For every agent, summarise its selection memory: which opponent indices it
# picked, how often, and the mean reward recorded for each of them.
for idx in agents:
    transitions = agents[idx].SelectMemory.memory
    # Each transition is unpacked as a 4-tuple; only the 2nd (action) and
    # 3rd (reward) fields are used.
    _, chosen, earned, _ = zip(*transitions)
    # Shift raw action values to opponent indices (skipping this agent's own key).
    actions = np.array(get_index_from_action(np.array(chosen, dtype=int), idx))
    rewards = np.array(earned)
    print(f'Agent {idx}:', end='')
    values, counts = np.unique(actions, return_counts=True)
    print(f' opponent_idx: {list(values)}, counts: {list(counts)} ', end='')
    # Rewards grouped by the opponent index they were recorded against.
    dict_idx = {x: rewards[actions == x] for x in values}
    print(f'rewards: {[np.mean(y) for y in dict_idx.values()]}')
    # Disabled alternative: per-opponent spread instead of the mean.
    # print(f'rewards: {[np.std(y) for _, y in dict_idx.items()]}')
# Disabled debugging views of the last agent's data:
# plt.plot(actions)
# plt.show()
# print(rewards)

Agent 0: opponent_idx: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [105, 388, 106, 345, 99, 471, 585, 212, 3038, 575, 79, 3205, 505, 279] rewards: [1.9142857142857144, 1.384020618556701, 1.0377358490566038, 0.9623188405797102, 1.5656565656565657, 3.129511677282378, 1.029059829059829, 1.1415094339622642, 1.3568136932192232, 3.102608695652174, 1.2278481012658229, 2.77597503900156, 1.7861386138613862, 1.946236559139785]
Agent 1: opponent_idx: [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [257, 656, 119, 87, 364, 249, 146, 1240, 276, 38, 837, 228, 5123, 372] rewards: [0.933852140077821, 2.1966463414634148, 1.1092436974789917, 3.4367816091954024, 1.1043956043956045, 1.6907630522088353, 1.3493150684931507, 2.2983870967741935, 2.199275362318841, 1.131578947368421, 2.3273596176821982, 2.1271929824561404, 1.6449346086277572, 2.575268817204301]
Agent 2: opponent_idx: [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [30, 180, 5, 23, 9, 576, 196, 88, 26, 125, 283, 4

In [8]:
# Population for this run: 5 'LSTM' agents followed by 10 'LSTMQN' agents,
# keyed by consecutive integer ids. Other mixes are expressed by editing the
# (kind, count) list below.
agents = {}
index = 0
with HiddenPrints():  # agents are constructed inside HiddenPrints (star-imported; presumably mutes stdout)
    for kind, how_many in [('LSTM', 5), ('LSTMQN', 10)]:
        for _ in range(how_many):
            agents[index] = constructAgent(kind, config)
            index += 1

# NOTE(review): `env` is not re-created in this cell. If Environment accumulates
# running_score across runs, the society reward printed below also includes
# earlier runs -- confirm against Environment / lstm_variant_selection.
# Run the selection experiment; the returned mapping replaces the freshly built one.
agents = lstm_variant_selection(config, agents, env)

# Per-agent report. A recorded move equal to 1 is what the report counts as "D".
for n in range(len(agents)):
    # Slice each memory to the first `play_times` entries once, instead of once per printed field.
    own_moves = list(agents[n].own_memory[:agents[n].play_times])
    faced_moves = list(agents[n].opponent_memory[:agents[n].play_times])
    own_d_count = own_moves.count(1)
    # An agent with no recorded plays would otherwise raise ZeroDivisionError
    # and abort the whole report after the long simulation; print nan instead.
    own_d_ratio = own_d_count/len(own_moves) if own_moves else float('nan')
    faced_d_ratio = faced_moves.count(1)/len(faced_moves) if faced_moves else float('nan')
    print('Agent{}: name:{}  final score:{}  play time:{}  times to play D:{}  ratio: {}  faced D ratio: {}'
        .format(n, agents[n].name, agents[n].running_score,
                len(own_moves), own_d_count, own_d_ratio, faced_d_ratio))
print('The reward for total society: {}'.format(env.running_score/len(agents)))

100%|███████████████████████████████████| 10000/10000 [1:56:37<00:00,  1.43it/s]


Agent0: name:LSTM  final score:149.73053338078785  play time:19030  times to play D:8940  ratio: 0.4697845507094062  faced D ratio: 0.47840252233315816
Agent1: name:LSTM  final score:235.3781250111217  play time:28753  times to play D:14097  ratio: 0.49027927520606546  faced D ratio: 0.4348068027684068
Agent2: name:LSTM  final score:145.1183547486282  play time:23669  times to play D:12156  ratio: 0.5135831678566902  faced D ratio: 0.5177236047150281
Agent3: name:LSTM  final score:195.44771600088603  play time:15586  times to play D:9085  ratio: 0.5828949056845887  faced D ratio: 0.6652765302194277
Agent4: name:LSTM  final score:157.8239237599182  play time:19764  times to play D:8495  ratio: 0.4298218984011334  faced D ratio: 0.4472272819267355
Agent5: name:LSTMQN  final score:108.50122727705985  play time:21526  times to play D:14144  ratio: 0.6570658738269999  faced D ratio: 0.632909040230419
Agent6: name:LSTMQN  final score:106.75738463799674  play time:21945  times to play D:16950

In [9]:
def get_index_from_action(action, idx):
    """Map action values onto indices that skip ``idx``.

    Every value greater than or equal to ``idx`` is shifted up by one; smaller
    values are returned unchanged, so for integer inputs ``idx`` itself is
    never produced.

    Args:
        action: iterable of action values.
        idx: the index to skip (the caller passes the selecting agent's key).

    Returns:
        A list with one mapped value per input element, in input order.
    """
    return [a + 1 if a >= idx else a for a in action]

# For every agent, summarise its selection memory: which opponent indices it
# picked, how often, and the mean reward recorded for each of them.
for idx in agents:
    transitions = agents[idx].SelectMemory.memory
    # Each transition is unpacked as a 4-tuple; only the 2nd (action) and
    # 3rd (reward) fields are used.
    _, chosen, earned, _ = zip(*transitions)
    # Shift raw action values to opponent indices (skipping this agent's own key).
    actions = np.array(get_index_from_action(np.array(chosen, dtype=int), idx))
    rewards = np.array(earned)
    print(f'Agent {idx}:', end='')
    values, counts = np.unique(actions, return_counts=True)
    print(f' opponent_idx: {list(values)}, counts: {list(counts)} ', end='')
    # Rewards grouped by the opponent index they were recorded against.
    dict_idx = {x: rewards[actions == x] for x in values}
    print(f'rewards: {[np.mean(y) for y in dict_idx.values()]}')
    # Disabled alternative: per-opponent spread instead of the mean.
    # print(f'rewards: {[np.std(y) for _, y in dict_idx.items()]}')
# Disabled debugging views of the last agent's data:
# plt.plot(actions)
# plt.show()
# print(rewards)

Agent 0: opponent_idx: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [152, 613, 253, 2, 102, 1088, 214, 560, 421, 83, 1362, 1251, 177, 3715] rewards: [2.0592105263157894, 2.900489396411093, 2.8537549407114624, 3.0, 1.411764705882353, 1.5661764705882353, 3.116822429906542, 1.4642857142857142, 2.441805225653207, 1.7710843373493976, 1.5969162995594715, 2.1119104716227017, 2.0338983050847457, 1.723822341857335]
Agent 1: opponent_idx: [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [108, 5, 698, 836, 24, 288, 524, 1826, 730, 1280, 824, 2739, 2, 109] rewards: [2.7685185185185186, 1.8, 1.654727793696275, 2.6555023923444976, 2.7083333333333335, 0.9930555555555556, 2.3244274809160306, 3.31763417305586, 2.8438356164383563, 3.46796875, 1.5728155339805825, 2.298284045271997, 2.5, 1.5045871559633028]
Agent 2: opponent_idx: [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [86, 1487, 5, 2, 508, 745, 1081, 238, 2621, 416, 379, 338, 264, 1823] rewards: [3.255813953488372, 1.

In [10]:
# Population for this run: 5 'TitForTat' agents followed by 10 'LSTMQN' agents,
# keyed by consecutive integer ids. Other mixes are expressed by editing the
# (kind, count) list below.
agents = {}
index = 0
with HiddenPrints():  # agents are constructed inside HiddenPrints (star-imported; presumably mutes stdout)
    for kind, how_many in [('TitForTat', 5), ('LSTMQN', 10)]:
        for _ in range(how_many):
            agents[index] = constructAgent(kind, config)
            index += 1

# NOTE(review): `env` is not re-created in this cell. If Environment accumulates
# running_score across runs, the society reward printed below also includes
# earlier runs -- confirm against Environment / lstm_variant_selection.
# Run the selection experiment; the returned mapping replaces the freshly built one.
agents = lstm_variant_selection(config, agents, env)

# Per-agent report. A recorded move equal to 1 is what the report counts as "D".
for n in range(len(agents)):
    # Slice each memory to the first `play_times` entries once, instead of once per printed field.
    own_moves = list(agents[n].own_memory[:agents[n].play_times])
    faced_moves = list(agents[n].opponent_memory[:agents[n].play_times])
    own_d_count = own_moves.count(1)
    # An agent with no recorded plays would otherwise raise ZeroDivisionError
    # and abort the whole report after the long simulation; print nan instead.
    own_d_ratio = own_d_count/len(own_moves) if own_moves else float('nan')
    faced_d_ratio = faced_moves.count(1)/len(faced_moves) if faced_moves else float('nan')
    print('Agent{}: name:{}  final score:{}  play time:{}  times to play D:{}  ratio: {}  faced D ratio: {}'
        .format(n, agents[n].name, agents[n].running_score,
                len(own_moves), own_d_count, own_d_ratio, faced_d_ratio))
print('The reward for total society: {}'.format(env.running_score/len(agents)))

100%|███████████████████████████████████| 10000/10000 [1:15:40<00:00,  2.20it/s]


Agent0: name:TitForTat  final score:232.11253210870288  play time:23242  times to play D:11616  ratio: 0.4997848722140952  faced D ratio: 0.5083469580931073
Agent1: name:TitForTat  final score:314.5650976132172  play time:19614  times to play D:7771  ratio: 0.3961965942693994  faced D ratio: 0.31706944019577854
Agent2: name:TitForTat  final score:317.47321732554417  play time:22206  times to play D:9667  ratio: 0.43533279293884536  faced D ratio: 0.4254255606592813
Agent3: name:TitForTat  final score:222.35111793410692  play time:25738  times to play D:8770  ratio: 0.3407413163415961  faced D ratio: 0.2682026575491491
Agent4: name:TitForTat  final score:257.3447925237049  play time:23256  times to play D:10493  ratio: 0.4511953904368765  faced D ratio: 0.5446766425868593
Agent5: name:LSTMQN  final score:198.51260718589975  play time:16161  times to play D:10860  ratio: 0.6719881195470577  faced D ratio: 0.5874017696924695
Agent6: name:LSTMQN  final score:297.146164414369  play time:185

In [11]:
def get_index_from_action(action, idx):
    """Map action values onto indices that skip ``idx``.

    Every value greater than or equal to ``idx`` is shifted up by one; smaller
    values are returned unchanged, so for integer inputs ``idx`` itself is
    never produced.

    Args:
        action: iterable of action values.
        idx: the index to skip (the caller passes the selecting agent's key).

    Returns:
        A list with one mapped value per input element, in input order.
    """
    return [a + 1 if a >= idx else a for a in action]

# For every agent, summarise its selection memory: which opponent indices it
# picked, how often, and the mean reward recorded for each of them.
for idx in agents:
    transitions = agents[idx].SelectMemory.memory
    # Each transition is unpacked as a 4-tuple; only the 2nd (action) and
    # 3rd (reward) fields are used.
    _, chosen, earned, _ = zip(*transitions)
    # Shift raw action values to opponent indices (skipping this agent's own key).
    actions = np.array(get_index_from_action(np.array(chosen, dtype=int), idx))
    rewards = np.array(earned)
    print(f'Agent {idx}:', end='')
    values, counts = np.unique(actions, return_counts=True)
    print(f' opponent_idx: {list(values)}, counts: {list(counts)} ', end='')
    # Rewards grouped by the opponent index they were recorded against.
    dict_idx = {x: rewards[actions == x] for x in values}
    print(f'rewards: {[np.mean(y) for y in dict_idx.values()]}')
    # Disabled alternative: per-opponent spread instead of the mean.
    # print(f'rewards: {[np.std(y) for _, y in dict_idx.items()]}')
# Disabled debugging views of the last agent's data:
# plt.plot(actions)
# plt.show()
# print(rewards)

Agent 0: opponent_idx: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [186, 343, 1, 5296, 564, 348, 1147, 133, 602, 418, 66, 582, 8, 297] rewards: [2.0913978494623655, 1.3906705539358601, 0.0, 2.474320241691843, 1.5691489361702127, 2.117816091954023, 1.7384481255448998, 2.0902255639097747, 2.0199335548172757, 2.0526315789473686, 1.696969696969697, 1.3127147766323024, 1.875, 3.1818181818181817]
Agent 1: opponent_idx: [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [2017, 179, 269, 562, 122, 57, 907, 3582, 20, 221, 613, 465, 921, 56] rewards: [2.778879524045612, 3.8435754189944134, 2.646840148698885, 2.4590747330960854, 1.6721311475409837, 2.491228070175439, 2.5093715545755235, 2.66499162479062, 1.75, 2.823529411764706, 2.6704730831973897, 1.6301075268817204, 2.516829533116178, 2.8214285714285716]
Agent 2: opponent_idx: [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [587, 778, 709, 965, 140, 1412, 50, 823, 316, 1880, 422, 1063, 305, 541] rewards: [3.238500851

In [12]:
# Population for this run: 1 'TitForTat', then 4 'LSTM', then 10 'LSTMQN'
# agents, keyed by consecutive integer ids. Other mixes are expressed by
# editing the (kind, count) list below.
agents = {}
index = 0
with HiddenPrints():  # agents are constructed inside HiddenPrints (star-imported; presumably mutes stdout)
    for kind, how_many in [('TitForTat', 1), ('LSTM', 4), ('LSTMQN', 10)]:
        for _ in range(how_many):
            agents[index] = constructAgent(kind, config)
            index += 1

# NOTE(review): `env` is not re-created in this cell. If Environment accumulates
# running_score across runs, the society reward printed below also includes
# earlier runs -- confirm against Environment / lstm_variant_selection.
# Run the selection experiment; the returned mapping replaces the freshly built one.
agents = lstm_variant_selection(config, agents, env)

# Per-agent report. A recorded move equal to 1 is what the report counts as "D".
for n in range(len(agents)):
    # Slice each memory to the first `play_times` entries once, instead of once per printed field.
    own_moves = list(agents[n].own_memory[:agents[n].play_times])
    faced_moves = list(agents[n].opponent_memory[:agents[n].play_times])
    own_d_count = own_moves.count(1)
    # An agent with no recorded plays would otherwise raise ZeroDivisionError
    # and abort the whole report after the long simulation; print nan instead.
    own_d_ratio = own_d_count/len(own_moves) if own_moves else float('nan')
    faced_d_ratio = faced_moves.count(1)/len(faced_moves) if faced_moves else float('nan')
    print('Agent{}: name:{}  final score:{}  play time:{}  times to play D:{}  ratio: {}  faced D ratio: {}'
        .format(n, agents[n].name, agents[n].running_score,
                len(own_moves), own_d_count, own_d_ratio, faced_d_ratio))
print('The reward for total society: {}'.format(env.running_score/len(agents)))

100%|███████████████████████████████████| 10000/10000 [1:47:46<00:00,  1.55it/s]


Agent0: name:TitForTat  final score:244.39913736288526  play time:23867  times to play D:8645  ratio: 0.36221561151380566  faced D ratio: 0.2793396740268991
Agent1: name:LSTM  final score:99.96799720884074  play time:16640  times to play D:10176  ratio: 0.6115384615384616  faced D ratio: 0.6274038461538461
Agent2: name:LSTM  final score:253.83676149407972  play time:17886  times to play D:10102  ratio: 0.56479928435648  faced D ratio: 0.5861567706586157
Agent3: name:LSTM  final score:234.0960943092598  play time:21448  times to play D:11395  ratio: 0.5312849682954122  faced D ratio: 0.5059679224170086
Agent4: name:LSTM  final score:219.7052153663711  play time:22325  times to play D:13799  ratio: 0.6180963045912654  faced D ratio: 0.654020156774916
Agent5: name:LSTMQN  final score:132.109304169317  play time:15363  times to play D:12463  ratio: 0.8112347848727463  faced D ratio: 0.721278396146586
Agent6: name:LSTMQN  final score:326.70707219187125  play time:16690  times to play D:6369

In [13]:
def get_index_from_action(action, idx):
    """Map action values onto indices that skip ``idx``.

    Every value greater than or equal to ``idx`` is shifted up by one; smaller
    values are returned unchanged, so for integer inputs ``idx`` itself is
    never produced.

    Args:
        action: iterable of action values.
        idx: the index to skip (the caller passes the selecting agent's key).

    Returns:
        A list with one mapped value per input element, in input order.
    """
    return [a + 1 if a >= idx else a for a in action]

# For every agent, summarise its selection memory: which opponent indices it
# picked, how often, and the mean reward recorded for each of them.
for idx in agents:
    transitions = agents[idx].SelectMemory.memory
    # Each transition is unpacked as a 4-tuple; only the 2nd (action) and
    # 3rd (reward) fields are used.
    _, chosen, earned, _ = zip(*transitions)
    # Shift raw action values to opponent indices (skipping this agent's own key).
    actions = np.array(get_index_from_action(np.array(chosen, dtype=int), idx))
    rewards = np.array(earned)
    print(f'Agent {idx}:', end='')
    values, counts = np.unique(actions, return_counts=True)
    print(f' opponent_idx: {list(values)}, counts: {list(counts)} ', end='')
    # Rewards grouped by the opponent index they were recorded against.
    dict_idx = {x: rewards[actions == x] for x in values}
    print(f'rewards: {[np.mean(y) for y in dict_idx.values()]}')
    # Disabled alternative: per-opponent spread instead of the mean.
    # print(f'rewards: {[np.std(y) for _, y in dict_idx.items()]}')
# Disabled debugging views of the last agent's data:
# plt.plot(actions)
# plt.show()
# print(rewards)

Agent 0: opponent_idx: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [106, 933, 38, 2057, 406, 351, 520, 101, 214, 5056, 51, 82, 75, 2] rewards: [0.7735849056603774, 2.437299035369775, 4.157894736842105, 2.1934856587263005, 1.5443349753694582, 3.547008547008547, 1.7576923076923077, 1.7920792079207921, 1.4205607476635513, 2.970530063291139, 1.0784313725490196, 2.0853658536585367, 2.32, 3.0]
Agent 1: opponent_idx: [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [790, 1341, 610, 1154, 409, 191, 222, 456, 764, 171, 1770, 188, 1737, 189] rewards: [2.288607594936709, 2.3862788963460106, 2.411475409836066, 1.6299826689774697, 1.215158924205379, 2.0471204188481678, 2.536036036036036, 0.9671052631578947, 0.9895287958115183, 2.017543859649123, 1.1101694915254237, 3.0106382978723403, 2.3932066781807713, 1.08994708994709]
Agent 2: opponent_idx: [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [1161, 350, 3433, 325, 368, 653, 154, 97, 591, 4, 92, 1715, 663, 386] rewards:

In [14]:
# Population for this run: 15 'LSTM' agents, keyed by consecutive integer ids.
# Other mixes are expressed by editing the (kind, count) list below.
agents = {}
index = 0
with HiddenPrints():  # agents are constructed inside HiddenPrints (star-imported; presumably mutes stdout)
    for kind, how_many in [('LSTM', 15)]:
        for _ in range(how_many):
            agents[index] = constructAgent(kind, config)
            index += 1

# NOTE(review): `env` is not re-created in this cell. If Environment accumulates
# running_score across runs, the society reward printed below also includes
# earlier runs -- confirm against Environment / lstm_variant_selection.
# Run the selection experiment; the returned mapping replaces the freshly built one.
agents = lstm_variant_selection(config, agents, env)

# Per-agent report. A recorded move equal to 1 is what the report counts as "D".
for n in range(len(agents)):
    # Slice each memory to the first `play_times` entries once, instead of once per printed field.
    own_moves = list(agents[n].own_memory[:agents[n].play_times])
    faced_moves = list(agents[n].opponent_memory[:agents[n].play_times])
    own_d_count = own_moves.count(1)
    # An agent with no recorded plays would otherwise raise ZeroDivisionError
    # and abort the whole report after the long simulation; print nan instead.
    own_d_ratio = own_d_count/len(own_moves) if own_moves else float('nan')
    faced_d_ratio = faced_moves.count(1)/len(faced_moves) if faced_moves else float('nan')
    print('Agent{}: name:{}  final score:{}  play time:{}  times to play D:{}  ratio: {}  faced D ratio: {}'
        .format(n, agents[n].name, agents[n].running_score,
                len(own_moves), own_d_count, own_d_ratio, faced_d_ratio))
print('The reward for total society: {}'.format(env.running_score/len(agents)))

100%|███████████████████████████████████| 10000/10000 [1:53:56<00:00,  1.46it/s]


Agent0: name:LSTM  final score:100.61701778603704  play time:19891  times to play D:17954  ratio: 0.9026192750490172  faced D ratio: 0.8955809159921573
Agent1: name:LSTM  final score:100.49861477418634  play time:16024  times to play D:14931  ratio: 0.9317898152770844  faced D ratio: 0.9311657513729406
Agent2: name:LSTM  final score:101.03020984960094  play time:20835  times to play D:18829  ratio: 0.9037197024238061  faced D ratio: 0.9036237101031918
Agent3: name:LSTM  final score:99.68633891027822  play time:14271  times to play D:13104  ratio: 0.9182257725457221  faced D ratio: 0.9192067829864761
Agent4: name:LSTM  final score:103.54485132308076  play time:18584  times to play D:17441  ratio: 0.9384954799827809  faced D ratio: 0.9379573826947912
Agent5: name:LSTM  final score:99.57625476879019  play time:17413  times to play D:16224  ratio: 0.9317176821914661  faced D ratio: 0.9372882329294205
Agent6: name:LSTM  final score:100.9195041731359  play time:27668  times to play D:24433  

In [15]:
def get_index_from_action(action, idx):
    """Map action values onto indices that skip ``idx``.

    Every value greater than or equal to ``idx`` is shifted up by one; smaller
    values are returned unchanged, so for integer inputs ``idx`` itself is
    never produced.

    Args:
        action: iterable of action values.
        idx: the index to skip (the caller passes the selecting agent's key).

    Returns:
        A list with one mapped value per input element, in input order.
    """
    return [a + 1 if a >= idx else a for a in action]

# For every agent, summarise its selection memory: which opponent indices it
# picked, how often, and the mean reward recorded for each of them.
for idx in agents:
    transitions = agents[idx].SelectMemory.memory
    # Each transition is unpacked as a 4-tuple; only the 2nd (action) and
    # 3rd (reward) fields are used.
    _, chosen, earned, _ = zip(*transitions)
    # Shift raw action values to opponent indices (skipping this agent's own key).
    actions = np.array(get_index_from_action(np.array(chosen, dtype=int), idx))
    rewards = np.array(earned)
    print(f'Agent {idx}:', end='')
    values, counts = np.unique(actions, return_counts=True)
    print(f' opponent_idx: {list(values)}, counts: {list(counts)} ', end='')
    # Rewards grouped by the opponent index they were recorded against.
    dict_idx = {x: rewards[actions == x] for x in values}
    print(f'rewards: {[np.mean(y) for y in dict_idx.values()]}')
    # Disabled alternative: per-opponent spread instead of the mean.
    # print(f'rewards: {[np.std(y) for _, y in dict_idx.items()]}')
# Disabled debugging views of the last agent's data:
# plt.plot(actions)
# plt.show()
# print(rewards)

Agent 0: opponent_idx: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [1, 762, 106, 147, 852, 3596, 2830, 202, 2, 207, 267, 739, 217, 64] rewards: [1.0, 1.068241469816273, 1.0377358490566038, 1.0272108843537415, 1.0258215962441315, 1.5795328142380423, 1.0551236749116608, 1.0, 1.0, 0.9951690821256038, 1.9775280898876404, 1.0243572395128553, 0.9815668202764977, 1.765625]
Agent 1: opponent_idx: [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [137, 634, 541, 3508, 384, 654, 391, 66, 105, 73, 338, 973, 44, 2144] rewards: [1.0583941605839415, 1.0488958990536277, 1.0683918669131238, 1.040478905359179, 1.28125, 1.0412844036697249, 1.4194373401534526, 1.0, 1.0, 1.1095890410958904, 1.1035502958579881, 1.039054470709147, 1.1818181818181819, 1.6399253731343284]
Agent 2: opponent_idx: [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [624, 233, 293, 180, 79, 4070, 489, 219, 364, 3, 1678, 970, 536, 254] rewards: [1.1746794871794872, 1.0171673819742488, 1.023890784982935, 1.

In [16]:
# Population for this run: 5 'TitForTat' agents followed by 10 'LSTM' agents,
# keyed by consecutive integer ids. Other mixes are expressed by editing the
# (kind, count) list below.
agents = {}
index = 0
with HiddenPrints():  # agents are constructed inside HiddenPrints (star-imported; presumably mutes stdout)
    for kind, how_many in [('TitForTat', 5), ('LSTM', 10)]:
        for _ in range(how_many):
            agents[index] = constructAgent(kind, config)
            index += 1

# NOTE(review): `env` is not re-created in this cell. If Environment accumulates
# running_score across runs, the society reward printed below also includes
# earlier runs -- confirm against Environment / lstm_variant_selection.
# Run the selection experiment; the returned mapping replaces the freshly built one.
agents = lstm_variant_selection(config, agents, env)

# Per-agent report. A recorded move equal to 1 is what the report counts as "D".
for n in range(len(agents)):
    # Slice each memory to the first `play_times` entries once, instead of once per printed field.
    own_moves = list(agents[n].own_memory[:agents[n].play_times])
    faced_moves = list(agents[n].opponent_memory[:agents[n].play_times])
    own_d_count = own_moves.count(1)
    # An agent with no recorded plays would otherwise raise ZeroDivisionError
    # and abort the whole report after the long simulation; print nan instead.
    own_d_ratio = own_d_count/len(own_moves) if own_moves else float('nan')
    faced_d_ratio = faced_moves.count(1)/len(faced_moves) if faced_moves else float('nan')
    print('Agent{}: name:{}  final score:{}  play time:{}  times to play D:{}  ratio: {}  faced D ratio: {}'
        .format(n, agents[n].name, agents[n].running_score,
                len(own_moves), own_d_count, own_d_ratio, faced_d_ratio))
print('The reward for total society: {}'.format(env.running_score/len(agents)))

100%|███████████████████████████████████| 10000/10000 [1:14:53<00:00,  2.23it/s]


Agent0: name:TitForTat  final score:224.5442113793691  play time:20434  times to play D:5118  ratio: 0.25046491142213956  faced D ratio: 0.23465792306939415
Agent1: name:TitForTat  final score:115.41369094224146  play time:23675  times to play D:6895  ratio: 0.2912354804646251  faced D ratio: 0.3089334741288279
Agent2: name:TitForTat  final score:272.5430055103634  play time:20959  times to play D:5363  ratio: 0.2558805286511761  faced D ratio: 0.27434515005486904
Agent3: name:TitForTat  final score:125.42595977955563  play time:21179  times to play D:3940  ratio: 0.1860333349072194  faced D ratio: 0.1765428018320034
Agent4: name:TitForTat  final score:97.15111685546731  play time:17225  times to play D:4101  ratio: 0.23808417997097242  faced D ratio: 0.22432510885341075
Agent5: name:LSTM  final score:118.49426302695123  play time:26714  times to play D:3917  ratio: 0.1466272366549375  faced D ratio: 0.1310923111477128
Agent6: name:LSTM  final score:134.27669304860606  play time:17323 

In [17]:
def get_index_from_action(action, idx):
    """Map action values onto indices that skip ``idx``.

    Every value greater than or equal to ``idx`` is shifted up by one; smaller
    values are returned unchanged, so for integer inputs ``idx`` itself is
    never produced.

    Args:
        action: iterable of action values.
        idx: the index to skip (the caller passes the selecting agent's key).

    Returns:
        A list with one mapped value per input element, in input order.
    """
    return [a + 1 if a >= idx else a for a in action]

# For every agent, summarise its selection memory: which opponent indices it
# picked, how often, and the mean reward recorded for each of them.
for idx in agents:
    transitions = agents[idx].SelectMemory.memory
    # Each transition is unpacked as a 4-tuple; only the 2nd (action) and
    # 3rd (reward) fields are used.
    _, chosen, earned, _ = zip(*transitions)
    # Shift raw action values to opponent indices (skipping this agent's own key).
    actions = np.array(get_index_from_action(np.array(chosen, dtype=int), idx))
    rewards = np.array(earned)
    print(f'Agent {idx}:', end='')
    values, counts = np.unique(actions, return_counts=True)
    print(f' opponent_idx: {list(values)}, counts: {list(counts)} ', end='')
    # Rewards grouped by the opponent index they were recorded against.
    dict_idx = {x: rewards[actions == x] for x in values}
    print(f'rewards: {[np.mean(y) for y in dict_idx.values()]}')
    # Disabled alternative: per-opponent spread instead of the mean.
    # print(f'rewards: {[np.std(y) for _, y in dict_idx.items()]}')
# Disabled debugging views of the last agent's data:
# plt.plot(actions)
# plt.show()
# print(rewards)

Agent 0: opponent_idx: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [359, 2243, 170, 33, 271, 128, 2863, 93, 324, 120, 725, 686, 1575, 403] rewards: [2.9415041782729805, 2.646455639768168, 2.5941176470588236, 2.696969696969697, 2.5719557195571956, 2.8984375, 3.106531610199092, 2.806451612903226, 2.79320987654321, 3.0, 1.5006896551724138, 2.628279883381924, 2.273015873015873, 2.925558312655087]
Agent 1: opponent_idx: [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [145, 64, 817, 9, 2631, 3, 965, 1579, 281, 1, 1348, 1611, 256, 283] rewards: [2.496551724137931, 2.8125, 2.747858017135863, 1.1111111111111112, 2.8350437096161154, 1.3333333333333333, 2.6922279792746115, 2.35085497150095, 1.5907473309608542, 3.0, 2.6313056379821957, 2.3438857852265675, 2.921875, 2.081272084805654]
Agent 2: opponent_idx: [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [2928, 128, 91, 435, 38, 11, 99, 137, 124, 1759, 3542, 596, 2, 103] rewards: [2.5631830601092895, 2.390625, 3.0, 1.

In [18]:
# Settings for this batch of runs. MULTI_SELECTION_METHOD is only assigned in
# this cell; nothing else here reads it.
MULTI_SELECTION_METHOD = 'LSTM-VAR'
config = Config({
    # payoff entries
    'reward': 3,
    'sucker': 0,
    'temptation': 5,
    'punishment': 1,
    # run length / discounting
    'n_episodes': 5000,
    'discount': 0.99,
    # epsilon settings (starting values, decay factor, floor)
    'play_epsilon': 1,
    'select_epsilon': 1,
    'epsilon_decay': 0.999,
    'min_epsilon': 0.01,
    # remaining agent / learning settings
    'alpha': 0.1,
    'n_actions': 2,
    'h': 10,
    'state_repr': 'bi',
    'batch_size': 64,
    'learning_rate': 1e-3,
})
# The environment is built before seeding, exactly as in the original ordering.
env = Environment(config)
seed_everything()

In [19]:
# Population for this run: 15 'LSTMQN' agents, keyed by consecutive integer ids.
# Other mixes are expressed by editing the (kind, count) list below.
agents = {}
index = 0
with HiddenPrints():  # agents are constructed inside HiddenPrints (star-imported; presumably mutes stdout)
    for kind, how_many in [('LSTMQN', 15)]:
        for _ in range(how_many):
            agents[index] = constructAgent(kind, config)
            index += 1

# Run the selection experiment; the returned mapping replaces the freshly built one.
agents = lstm_variant_selection(config, agents, env)

# Per-agent report. A recorded move equal to 1 is what the report counts as "D".
for n in range(len(agents)):
    # Slice each memory to the first `play_times` entries once, instead of once per printed field.
    own_moves = list(agents[n].own_memory[:agents[n].play_times])
    faced_moves = list(agents[n].opponent_memory[:agents[n].play_times])
    own_d_count = own_moves.count(1)
    # An agent with no recorded plays would otherwise raise ZeroDivisionError
    # and abort the whole report after the long simulation; print nan instead.
    own_d_ratio = own_d_count/len(own_moves) if own_moves else float('nan')
    faced_d_ratio = faced_moves.count(1)/len(faced_moves) if faced_moves else float('nan')
    print('Agent{}: name:{}  final score:{}  play time:{}  times to play D:{}  ratio: {}  faced D ratio: {}'
        .format(n, agents[n].name, agents[n].running_score,
                len(own_moves), own_d_count, own_d_ratio, faced_d_ratio))
print('The reward for total society: {}'.format(env.running_score/len(agents)))

100%|███████████████████████████████████████| 5000/5000 [22:16<00:00,  3.74it/s]


Agent0: name:LSTMQN  final score:104.47790262485135  play time:7286  times to play D:6731  ratio: 0.9238265166071918  faced D ratio: 0.9363162228932199
Agent1: name:LSTMQN  final score:100.26787130492221  play time:6744  times to play D:6241  ratio: 0.9254151838671412  faced D ratio: 0.9427639383155397
Agent2: name:LSTMQN  final score:104.55159986219213  play time:10392  times to play D:9866  ratio: 0.9493841416474211  faced D ratio: 0.9409160892994611
Agent3: name:LSTMQN  final score:102.39313437042964  play time:10469  times to play D:9944  ratio: 0.9498519438341771  faced D ratio: 0.9236794345209667
Agent4: name:LSTMQN  final score:100.03581536490722  play time:10976  times to play D:10424  ratio: 0.9497084548104956  faced D ratio: 0.9452441690962099
Agent5: name:LSTMQN  final score:100.21228276632905  play time:7476  times to play D:6943  ratio: 0.928705189941145  faced D ratio: 0.9514446227929374
Agent6: name:LSTMQN  final score:99.99345148607813  play time:11014  times to play D:

In [20]:
def get_index_from_action(action, idx):
    """Map action values onto indices that skip ``idx``.

    Every value greater than or equal to ``idx`` is shifted up by one; smaller
    values are returned unchanged, so for integer inputs ``idx`` itself is
    never produced.

    Args:
        action: iterable of action values.
        idx: the index to skip (the caller passes the selecting agent's key).

    Returns:
        A list with one mapped value per input element, in input order.
    """
    return [a + 1 if a >= idx else a for a in action]

# For every agent, summarise its selection memory: which opponent indices it
# picked, how often, and the mean reward recorded for each of them.
for idx in agents:
    transitions = agents[idx].SelectMemory.memory
    # Each transition is unpacked as a 4-tuple; only the 2nd (action) and
    # 3rd (reward) fields are used.
    _, chosen, earned, _ = zip(*transitions)
    # Shift raw action values to opponent indices (skipping this agent's own key).
    actions = np.array(get_index_from_action(np.array(chosen, dtype=int), idx))
    rewards = np.array(earned)
    print(f'Agent {idx}:', end='')
    values, counts = np.unique(actions, return_counts=True)
    print(f' opponent_idx: {list(values)}, counts: {list(counts)} ', end='')
    # Rewards grouped by the opponent index they were recorded against.
    dict_idx = {x: rewards[actions == x] for x in values}
    print(f'rewards: {[np.mean(y) for y in dict_idx.values()]}')
    # Disabled alternative: per-opponent spread instead of the mean.
    # print(f'rewards: {[np.std(y) for _, y in dict_idx.items()]}')
# Disabled debugging views of the last agent's data:
# plt.plot(actions)
# plt.show()
# print(rewards)

Agent 0: opponent_idx: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [4, 3102, 5, 529, 106, 81, 206, 194, 60, 10, 169, 418, 13, 95] rewards: [0.5, 1.1376531270148291, 1.8, 1.0189035916824196, 1.990566037735849, 0.9753086419753086, 1.266990291262136, 1.0360824742268042, 1.1166666666666667, 2.0, 1.017751479289941, 1.0287081339712918, 2.076923076923077, 1.0736842105263158]
Agent 1: opponent_idx: [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [842, 115, 2288, 251, 173, 233, 98, 127, 59, 38, 113, 148, 460, 47] rewards: [1.0831353919239906, 1.0695652173913044, 1.1350524475524475, 1.0318725099601593, 1.0809248554913296, 1.0600858369098713, 1.0, 1.1023622047244095, 1.0508474576271187, 1.0789473684210527, 1.0707964601769913, 1.0337837837837838, 1.1978260869565218, 1.297872340425532]
Agent 2: opponent_idx: [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], counts: [5, 268, 10, 174, 80, 4, 5, 163, 5, 3642, 189, 243, 200, 4] rewards: [0.6, 1.0186567164179106, 3.1, 1.936781609195