In [1]:
import torch as th
import numpy as np
import laserhockey.hockey_env as h_env
from network import Network
from own_env import OwnEnv
from action_selection import ActionSelection
from utils import ACTIONS_T
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
from shared_memory import SharedMemory
from replay_buffer import ReplayBuffer
import time

# Discount factor.
# NOTE(review): not referenced in the visible cells -- the run-configuration
# cell assigns its own `gamma` and passes that one to SelfPlay.
GAMMA = 0.95

In [2]:
# Notebook-level debug logs that SelfPlay.do_self_play appends to on every call.
# Each name is bound to its own, initially empty, list.
(
    l_winners,
    l_obs,
    l_reward,
    l_done,
    l_action_idx,
    l_policy_distr,
    l_trunc,
) = ([] for _ in range(7))

In [3]:
# Stand-alone environment and weak scripted opponent.
# NOTE(review): SelfPlay constructs its own OwnEnv and BasicOpponent, so in the
# visible cells `env` is only closed later and `opponent` is never used.
env = OwnEnv()
opponent = h_env.BasicOpponent(weak=True)

In [4]:
class SelfPlay():
    """Collects transitions against a weak ``BasicOpponent`` and feeds episodes to a replay buffer.

    One call to :meth:`do_self_play` selects one agent action, applies it to
    the environment for up to two consecutive environment steps and records
    the resulting transition. When the environment reports ``trunc`` the
    collected episode is passed to ``replay_buffer.add_samples`` and the
    environment is reseeded and reset.

    In addition to the replay buffer, every call also appends to the
    notebook-level debug lists ``l_obs``, ``l_reward``, ``l_done``,
    ``l_action_idx``, ``l_policy_distr``, ``l_trunc`` (and ``l_winners`` at
    episode end). These lists must exist before :meth:`do_self_play` is called.
    """

    # Exclusive upper bound of the random seeds handed to the environment.
    # Kept as an integer because ``np.random.randint`` is specified for integer bounds.
    _SEED_UPPER_BOUND = 100_000_000

    def __init__(self, replay_buffer, writer, gamma, K, unroll_steps, shared_memory):
        """Create the environment, opponent, network and empty episode buffers.

        Args:
            replay_buffer: Object providing ``add_samples(observations, rewards,
                action_indices, policy_distributions, dones)``.
            writer: Currently unused (TensorBoard logging is disabled); kept so
                existing callers do not break.
            gamma: Passed to ``ActionSelection``.
            K: Together with ``unroll_steps`` determines how many padding
                entries are appended after a terminal episode.
            unroll_steps: See ``K``.
            shared_memory: Currently unused (model syncing is disabled); kept so
                existing callers do not break.
        """
        self.replay_buffer = replay_buffer
        self.gamma = gamma
        self.K = K
        self.unroll_steps = unroll_steps

        self.action_selection = ActionSelection(self.gamma)

        self.env = OwnEnv()
        self.opponent = h_env.BasicOpponent(weak=True)
        self._reset_env()

        # Per-episode staging lists; emptied again by push_temp_buffer.
        self.temp_obs_1 = []
        self.temp_action_idx_1 = []
        self.temp_reward = []
        self.policy_distr = []

        self.network = Network()

        # Number of do_self_play calls so far; drives the epsilon schedule.
        self.step_idx = 0

    def _reset_env(self):
        """Reseed the environment randomly, reset it and refresh both players' observations."""
        self.env.seed(seed=np.random.randint(0, self._SEED_UPPER_BOUND))
        self.obs_1, _ = self.env.reset(customized=False)
        self.obs_2 = self.env.obs_agent_two()

    @staticmethod
    def _log_step(obs, reward, done, action_idx, policy_distr, trunc):
        """Append one entry to each of the notebook-level per-step debug lists."""
        l_obs.append(obs)
        l_reward.append(reward)
        l_done.append(done)
        l_action_idx.append(action_idx)
        l_policy_distr.append(policy_distr)
        l_trunc.append(trunc)

    def select_actions(self, net):
        """Ask the action selection for the agent's next action.

        ``eps`` decays linearly with ``step_idx`` from 1 and is floored at 0.05
        (reached after 380,000 calls).

        Returns:
            The ``(action, action_idx, policy_distr)`` triple produced by
            ``ActionSelection.select_action``.
        """
        eps = max(0.05, 1 - self.step_idx/400_000)
        action_1, action_idx_1, policy_distr = self.action_selection.select_action(eps, net, self.obs_1, False)
        return action_1, action_idx_1, policy_distr

    def store_sample(self, obs_1, action_idx_1, reward, policy_distr):
        """Stage one transition of the running episode.

        ``obs_1`` must be a numpy array; it is wrapped (not copied) as a tensor.
        """
        self.temp_obs_1.append(th.from_numpy(obs_1))
        self.temp_action_idx_1.append(action_idx_1)
        self.temp_reward.append(reward)
        self.policy_distr.append(policy_distr)

    def push_temp_buffer(self, last_obs_1, done):
        """Move the staged episode into the replay buffer and clear the staging lists.

        In both cases rewards and action indices are shifted by one position
        relative to the observations: entry ``i`` holds the reward/action that
        preceded observation ``i`` and the first entry is a placeholder.

        Args:
            last_obs_1: Final observation of the episode (numpy array). Only
                used when ``done`` is true.
            done: If true, the episode is extended with ``K + unroll_steps + 1``
                copies of ``last_obs_1`` that are flagged as done; otherwise
                only the staged observations are pushed and the last staged
                reward/action are dropped.
        """
        if done:
            n_often = self.K + self.unroll_steps

            # Padding policy: all mass on the last action.
            # NOTE(review): the value 100 suggests these are unnormalised scores -- confirm.
            default_policy = np.zeros(ACTIONS_T.shape[0])
            default_policy[-1] = 100.
            last_obs = th.from_numpy(last_obs_1).float()

            observations = th.from_numpy(np.vstack(self.temp_obs_1 + ((n_often+1)*[last_obs]))).float()
            rewards = th.FloatTensor([0] + self.temp_reward + (n_often*[0]))
            policy_distributions = th.from_numpy(np.vstack(self.policy_distr + ((n_often+1)*[default_policy]))).float()
            # Actions "taken" inside the padding are drawn uniformly at random.
            random_actions = np.random.randint(0, ACTIONS_T.shape[0], n_often).tolist()
            action_indices = th.LongTensor([-1] + self.temp_action_idx_1 + random_actions)

            # Only the appended copies of the final observation are marked as done.
            dones = th.zeros(action_indices.shape[0], dtype=th.float)
            dones[-(n_often+1):] = 1.
        else:
            observations = th.from_numpy(np.vstack(self.temp_obs_1)).float()
            rewards = th.FloatTensor([-100] + self.temp_reward[:-1])
            policy_distributions = th.from_numpy(np.vstack(self.policy_distr)).float()
            # Built as a LongTensor directly, consistent with the terminal branch.
            action_indices = th.LongTensor([-100] + self.temp_action_idx_1[:-1])
            dones = th.zeros(observations.shape[0], dtype=th.float)

        self.replay_buffer.add_samples(observations, rewards, action_indices, policy_distributions, dones)
        self.temp_obs_1, self.temp_reward, self.temp_action_idx_1, self.policy_distr = [], [], [], []

    def do_self_play(self):
        """Play one agent decision and handle the end of an episode.

        The selected action is applied for up to two environment steps (the
        loop stops early when ``trunc`` is reported). When ``trunc`` is set,
        the winner is logged, a second debug entry holding the final
        observation is appended, the episode is pushed to the replay buffer
        and the environment is reset.
        """
        action_1, action_idx_1, policy_distr = self.select_actions(self.network)
        self.step_idx += 1

        # NOTE(review): `self.obs_2` is not refreshed between the two iterations,
        # so the opponent acts twice on the same observation, and `reward` keeps
        # only the value of the last executed step -- confirm both are intended.
        for _ in range(2):
            action_2 = self.opponent.act(self.obs_2)
            next_obs_1, reward, done, trunc, info = self.env.step(np.hstack([action_1, action_2]), customized=False)
            next_obs_2 = self.env.obs_agent_two()
            if trunc:
                break

        self.store_sample(self.obs_1, action_idx_1, reward, policy_distr)
        self._log_step(self.obs_1, reward, done, action_idx_1, policy_distr, trunc)

        self.obs_1 = np.copy(next_obs_1)  # copying is probably not necessary
        self.obs_2 = np.copy(next_obs_2)  # copying is probably not necessary

        if trunc:
            l_winners.append(info['winner'])
            # Second debug entry for the episode: same reward/flags, final observation.
            self._log_step(self.obs_1, reward, done, action_idx_1, policy_distr, trunc)

            self.push_temp_buffer(next_obs_1, done)
            self._reset_env()

In [5]:
# --- Run configuration -------------------------------------------------------
# TensorBoard logging (SummaryWriter) and model sharing (SharedMemory) are
# disabled in this notebook, so SelfPlay receives None for both.

# Replay buffer settings, passed positionally to ReplayBuffer below.
# NOTE(review): alpha/beta look like prioritised-replay exponents -- confirm
# against the ReplayBuffer implementation.
max_buffer_size = 50_000
n_warmup = 10_000
alpha = 0.6
beta = 1

# Settings shared by the replay buffer and the self-play worker.
K = 5
unroll_steps = 3

# Handed to SelfPlay, which forwards it to ActionSelection.
gamma = 0.95

replay_buffer = ReplayBuffer(max_buffer_size, K, unroll_steps, n_warmup, alpha, beta)
self_play = SelfPlay(
    replay_buffer=replay_buffer,
    writer=None,
    gamma=gamma,
    K=K,
    unroll_steps=unroll_steps,
    shared_memory=None,
)

In [6]:
# Number of agent decisions to collect (each one may span two environment steps).
N_SELF_PLAY_CALLS = 1_000

for _call in range(N_SELF_PLAY_CALLS):
    self_play.do_self_play()

In [7]:
# Close the stand-alone environment created near the top of the notebook.
# NOTE(review): `self_play.env` is a separate OwnEnv instance and is not closed here.
env.close()

In [8]:
# Freeze the debug logs filled by SelfPlay.do_self_play into numpy arrays.
# Row-like logs are stacked into 2-D arrays; the remaining logs become 1-D arrays.
observations = np.vstack(l_obs)
policy_distributions = np.vstack(l_policy_distr)

winners, rewards, dones, truncs, action_indices = (
    np.array(log)
    for log in (l_winners, l_reward, l_done, l_trunc, l_action_idx)
)

In [9]:
class Game():
    """Holds the logged data of one episode.

    The second-to-last entry of ``dones`` and ``truncs`` is cleared so that
    only the final entry of the episode keeps its flag. (The logging code
    appends the last step of an episode twice, which is why the flag would
    otherwise appear on the last two entries.)

    ``dones`` and ``truncs`` are copied before being modified. Callers usually
    pass numpy slices, which are views; writing into them directly would
    silently alter the caller's full-length arrays and make the splitting
    code give different results when run a second time.
    """

    def __init__(self, observations, rewards, dones, truncs, action_indices, policy_distributions):
        """Store the episode arrays.

        Args:
            observations: Per-entry observations of the episode.
            rewards: Per-entry rewards.
            dones: Per-entry done flags; needs at least two entries.
            truncs: Per-entry truncation flags; needs at least two entries.
            action_indices: Per-entry action indices.
            policy_distributions: Per-entry policy distributions.
        """
        self.observations = observations
        self.rewards = rewards

        # Work on private copies so the caller's arrays stay untouched.
        self.dones = np.array(dones, copy=True)
        self.dones[-2] = 0
        self.truncs = np.array(truncs, copy=True)
        self.truncs[-2] = False

        self.action_indices = action_indices
        self.policy_distributions = policy_distributions

# Positions at which `truncs` is set. Every episode end is logged twice in a
# row, so the flagged positions come in pairs.
trunc_positions = np.argwhere(truncs).flatten()
print(trunc_positions)

# Keep the second position of each pair as the (inclusive) last index of an
# episode; the leading -1 makes the first episode start at index 0.
ind = [-1] + list(trunc_positions[1::2])
print(ind)

games = []
for prev_last, this_last in zip(ind[:-1], ind[1:]):
    from_idx, to_idx = prev_last + 1, this_last + 1
    episode = slice(from_idx, to_idx)
    game = Game(
        observations[episode],
        rewards[episode],
        dones[episode],
        truncs[episode],
        action_indices[episode],
        policy_distributions[episode],
    )
    games.append(game)

[ 125  126  252  253  270  271  371  372  389  390  516  517  547  548
  674  675  692  693  819  820  938  939  958  959 1008 1009]
[-1, 126, 253, 271, 372, 390, 517, 548, 675, 693, 820, 939, 959, 1009]


In [14]:
# One entry per finished episode, taken from `info['winner']` at episode end.
# NOTE(review): presumably 1 = agent won, -1 = opponent won, 0 = draw -- confirm against the environment.
winners

array([ 0,  0, -1, -1, -1,  0, -1,  0, -1,  0, -1,  1, -1])

In [101]:
# Debug check: inspect the replay buffer's `indices` for slots 118..236.
# NOTE(review): `indices` is defined by ReplayBuffer (not visible here); the slot
# range appears to have been picked by hand for one particular run.
print(replay_buffer.indices[118:237])

tensor([126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
        154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
        182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
        196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
        210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
        238, 239, 240, 241, 242, 243, 252])


In [118]:
# Debug check: first three observation components of rows 127..253 of the raw
# step log, for comparison with the replay buffer contents.
observations[127:254,:3]

array([[-3.        ,  0.        ,  0.        ],
       [-3.        , -0.10800004,  0.        ],
       [-3.        , -0.12099719,  0.        ],
       [-3.00091791, -0.0830462 ,  0.12256545],
       [-3.11260724, -0.0461235 ,  0.27527273],
       [-3.34743452, -0.01484942,  0.41600773],
       [-3.57377577,  0.11943007,  0.54570913],
       [-3.65149045,  0.26234388,  0.6652419 ],
       [-3.6559844 ,  0.25946903,  0.71234292],
       [-3.65116811,  0.11109543,  0.74003905],
       [-3.63941002, -0.033849  ,  0.81779659],
       [-3.62882423, -0.14840364,  0.90687817],
       [-3.62593079, -0.24127769,  0.94096518],
       [-3.62017536, -0.30812311,  1.03254557],
       [-3.60479212, -0.25798631,  1.03556442],
       [-3.58208609, -0.19061518,  0.93767691],
       [-3.56319141, -0.24561954,  0.84746373],
       [-3.54750919, -0.43312502,  0.76432329],
       [-3.42652702, -0.61982703,  0.68770111],
       [-3.28690124, -0.78497601,  0.49452063],
       [-3.17445421, -1.03551745,  0.276

In [119]:
# Presumably the attributes exposed by the replay buffer (TODO confirm):
# observations, policy_distr, action_indices, rewards, dones, indices
# Debug check: first three observation components of replay-buffer rows 126..251.
replay_buffer.observations[126:252,:3]

tensor([[-3.0000,  0.0000,  0.0000],
        [-3.0000, -0.1080,  0.0000],
        [-3.0000, -0.1210,  0.0000],
        [-3.0009, -0.0830,  0.1226],
        [-3.1126, -0.0461,  0.2753],
        [-3.3474, -0.0148,  0.4160],
        [-3.5738,  0.1194,  0.5457],
        [-3.6515,  0.2623,  0.6652],
        [-3.6560,  0.2595,  0.7123],
        [-3.6512,  0.1111,  0.7400],
        [-3.6394, -0.0338,  0.8178],
        [-3.6288, -0.1484,  0.9069],
        [-3.6259, -0.2413,  0.9410],
        [-3.6202, -0.3081,  1.0325],
        [-3.6048, -0.2580,  1.0356],
        [-3.5821, -0.1906,  0.9377],
        [-3.5632, -0.2456,  0.8475],
        [-3.5475, -0.4331,  0.7643],
        [-3.4265, -0.6198,  0.6877],
        [-3.2869, -0.7850,  0.4945],
        [-3.1745, -1.0355,  0.2767],
        [-3.0872, -1.2762,  0.0760],
        [-3.0203, -1.3661, -0.1090],
        [-2.8619, -1.4076, -0.2794],
        [-2.6981, -1.4287, -0.3140],
        [-2.5640, -1.5496, -0.3061],
        [-2.4554, -1.5729, -0.2988],
 