In [1]:
import torch
import torch.nn as nn
import numpy as np
from single_agent_env import SinglePlayerFootball, ACTION_SPACE_SIZE, STATE_SPACE_SIZE
from RL import DeepQNetworkAgent
from RL.utils import ReplayBuffer
# One shared seed for every random number generator used in this notebook,
# so a fresh "Restart & Run All" starts from the same RNG state each time.
SEED = 3407
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)

In [2]:
class DQN(nn.Module):
    """Fully connected network: Linear + LeakyReLU hidden layers, then a Linear head.

    Args:
        input_shape: Number of input features; passed directly to the first
            ``nn.Linear`` as ``in_features`` (an int, despite the name).
        output_shape: Number of outputs of the final ``nn.Linear``.
        hidden_sizes: Widths of the hidden layers, in order. The default
            ``(1024, 512, 256, 128, 64)`` builds the same stack (same module
            order and therefore the same ``state_dict`` keys) as before this
            parameter was introduced. An empty sequence yields a single
            ``nn.Linear(input_shape, output_shape)``.
    """

    def __init__(
        self,
        input_shape,
        output_shape,
        *,
        hidden_sizes=(1024, 512, 256, 128, 64),
    ) -> None:
        super().__init__()
        # widths[i] -> widths[i + 1] describes each hidden Linear layer.
        widths = [input_shape, *hidden_sizes]
        layers = []
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.LeakyReLU())
        # Output head has no activation: the raw linear outputs are returned.
        layers.append(nn.Linear(widths[-1], output_shape))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.model(x)


In [3]:
# Use the first CUDA device when one exists; otherwise fall back to the CPU so
# the notebook still runs (slowly) on a machine without a GPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

env = SinglePlayerFootball(title="DQN train")
agent = DeepQNetworkAgent(STATE_SPACE_SIZE, ACTION_SPACE_SIZE, device=DEVICE)
# NOTE(review): `y` is presumably the discount factor, `e_decay` the per-step
# multiplicative epsilon decay and `batchs` the minibatch size — confirm
# against DeepQNetworkAgent.create_model.
agent.create_model(DQN, lr=0.0001, y=0.99, e_decay=0.999999, batchs=64)
# NOTE(review): the first two ReplayBuffer arguments look like a capacity and a
# minimum fill / warm-up size — confirm against RL.utils.ReplayBuffer.
agent.create_buffer(ReplayBuffer(1_000_000, 100_000, STATE_SPACE_SIZE))

# One entry per finished training episode: the sum of that episode's rewards.
scores = []

# Train one episode per outer iteration for as long as the environment reports
# that it is still running.
while env.running:
    rewards = []
    # NOTE(review): this resets the wrapped env (env.env) directly, whereas the
    # evaluation loop further down calls env.reset() — confirm both return the
    # same kind of initial state.
    state = env.env.reset()
    # The inner loop ends when env.loop_once() returns a truthy value
    # (presumably end of episode or the window being closed — confirm).
    while not env.loop_once():
        action = agent.policy(state)
        n_state, reward, done = env.step(action)
        # Record the step reward; without this `rewards` stays empty and every
        # episode score below would be 0.
        rewards.append(reward)
        agent.learn(state, action, n_state, reward, done, update="soft")
        state = n_state
    scores.append(sum(rewards))

# Watch the trained agent: switch rendering on, re-arm the running flag (the
# training loop above only exits once it is falsy) and play ten episodes.
# NOTE(review): actions still come from agent.policy(); if that policy is
# epsilon-greedy it may still be exploring heavily here — confirm.
env.rendering = True
env.running = True
for episode in range(10):
    eval_state = env.reset()
    while True:
        if env.loop_once():
            break
        # Only the next state is needed; reward and done flag are discarded.
        eval_state, _reward, _done = env.step(agent.policy(eval_state))
del env

Episode: 341 | Train: 100 | Loss: 0.000004 | e: 0.999901
Episode: 341 | Train: 200 | Loss: 0.000007 | e: 0.999801
Episode: 342 | Train: 300 | Loss: 0.000002 | e: 0.999701
Episode: 342 | Train: 400 | Loss: 0.003127 | e: 0.999601
Episode: 342 | Train: 500 | Loss: 0.000006 | e: 0.999501
Episode: 343 | Train: 600 | Loss: 0.003048 | e: 0.999401
Episode: 343 | Train: 700 | Loss: 0.000018 | e: 0.999301
Episode: 343 | Train: 800 | Loss: 0.000003 | e: 0.999201
Episode: 344 | Train: 900 | Loss: 0.000004 | e: 0.999101
Episode: 344 | Train: 1000 | Loss: 0.000004 | e: 0.999001
Episode: 345 | Train: 1100 | Loss: 0.000013 | e: 0.998902
Episode: 345 | Train: 1200 | Loss: 0.000016 | e: 0.998802
Episode: 345 | Train: 1300 | Loss: 0.000012 | e: 0.998702
Episode: 346 | Train: 1400 | Loss: 0.000003 | e: 0.998602
Episode: 346 | Train: 1500 | Loss: 0.003064 | e: 0.998502
Episode: 346 | Train: 1600 | Loss: 0.000002 | e: 0.998402
Episode: 347 | Train: 1700 | Loss: 0.000003 | e: 0.998302
Episode: 347 | Train: 1