In [1]:
import os

import numpy as np
import torch
import torch.nn as nn

from single_agent_env import SinglePlayerFootball, ACTION_SPACE_SIZE, STATE_SPACE_SIZE
from RL import DeepQNetworkAgent
from RL.utils import ReplayBuffer
# Fix every random number generator used in this notebook to one seed so
# that runs are repeatable.
SEED = 3407
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
    seed_fn(SEED)

In [2]:
class DQN(nn.Module):
    """Fully connected network of six ``nn.Linear`` layers.

    Layer widths are
    ``input_shape -> 1024 -> 512 -> 256 -> 128 -> 64 -> output_shape``.
    Every linear layer except the last is followed by ``nn.LeakyReLU``;
    the final layer's output is returned without an activation.

    Args:
        input_shape: number of input features (passed to the first
            ``nn.Linear`` as ``in_features``).
        output_shape: number of output features of the final layer.
    """

    def __init__(self, input_shape, output_shape) -> None:
        super().__init__()
        widths = [input_shape, 1024, 512, 256, 128, 64]
        layers = []
        # Hidden stack: Linear followed by LeakyReLU for each adjacent pair.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.LeakyReLU())
        # Output head has no activation.
        layers.append(nn.Linear(widths[-1], output_shape))
        # The attribute must stay named ``model`` and the layers must stay in
        # this order so state-dict keys remain ``model.<index>.weight/bias``.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        out = self.model(x)
        return out

In [3]:
# Build the agent, its network and its replay buffer.
# NOTE(review): the keyword meanings below are inferred from their names
# (lr = learning rate, y = discount factor, e_decay = epsilon decay,
# batchs = mini-batch size) -- confirm against RL.DeepQNetworkAgent.
model_kwargs = dict(lr=0.0001, y=0.99, e_decay=0.999999, batchs=64)

agent = DeepQNetworkAgent(STATE_SPACE_SIZE, ACTION_SPACE_SIZE, device="cuda:1")
agent.create_model(DQN, **model_kwargs)

# NOTE(review): the first two ReplayBuffer arguments are presumably the
# capacity and a warm-up / minimum fill size -- confirm in RL.utils.
replay_buffer = ReplayBuffer(1_000_000, 100_000, STATE_SPACE_SIZE)
agent.create_buffer(replay_buffer)

In [4]:
# Warm-start the agent from a previously saved TorchScript checkpoint.
CHECKPOINT_PATH = "models/random_ball/first/-0.01_every_to_1 iteration 2_31275_0.99.pt"

# map_location puts the checkpoint tensors on the agent's device while
# loading, so the file also loads when the device it was saved from is
# not available on this machine.
pretrained = torch.jit.load(CHECKPOINT_PATH, map_location=agent.device)

# Copy the weights into the existing network in place instead of rebinding
# agent.model to a new module. An optimizer keeps references to the parameter
# objects it was constructed with, so replacing the module would leave such an
# optimizer updating the old, discarded network.
# NOTE(review): assumes create_model() built agent.model as a DQN instance
# (and likely its optimizer) -- confirm in RL.DeepQNetworkAgent.
agent.model.load_state_dict(pretrained.state_dict())

# Start the target network from the same weights as the online network.
agent.target_model.load_state_dict(agent.model.state_dict())

<All keys matched successfully>

In [5]:
# Training run: play episodes until the environment stops running, learning
# from every step and saving a TorchScript snapshot on each new best reward.
train_id = "random_ball_every_-1_goal_300_transfer"

# Snapshots are only considered once the agent's epsilon is below this value.
SAVE_EPSILON_THRESHOLD = 0.1
BEST_MODEL_DIR = "best_models"

# Saving a scripted module does not create missing parent directories; make
# sure the folder exists up front so the first save cannot abort the run.
os.makedirs(BEST_MODEL_DIR, exist_ok=True)

env = SinglePlayerFootball(title=train_id)
# Starts at 0, so only an entry of reward_history strictly greater than 0
# can trigger the first snapshot.
best_score = 0
try:
    while env.running:
        state = env.reset(ball_random=True)
        # loop_once() returning a truthy value ends the current episode.
        while not env.loop_once():
            action = agent.policy(state)
            next_state, reward, done = env.step(action)
            agent.learn(state, action, next_state, reward, done)
            state = next_state
        # NOTE(review): reward_history[-1] is presumably the total reward of
        # the episode that just finished -- confirm in RL.DeepQNetworkAgent.
        if agent.e < SAVE_EPSILON_THRESHOLD and agent.reward_history[-1] > best_score:
            best_score = agent.reward_history[-1]
            model_scripted = torch.jit.script(agent.model)
            model_scripted.save(
                f"{BEST_MODEL_DIR}/{train_id}_{agent.episode_count}_{round(best_score, 6)}.pt"
            )
finally:
    # Drop the environment reference even when training is interrupted
    # (e.g. KeyboardInterrupt from the notebook) or an error is raised.
    del env

Episode: 1 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 2 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 3 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 4 | Train: 0 | e: 1.000000 | r: -29.000000
Episode: 5 | Train: 0 | e: 1.000000 | r: -136.000000
Episode: 6 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 7 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 8 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 9 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 10 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 11 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 12 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 13 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 14 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 15 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 16 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 17 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 18 | Train: 0 | e: 1.000000 | r: -300.000000
Episode: 19 | Train: 0 | e: 1.000000 |

In [9]:
# Write every entry of agent.reward_history to rewards.txt, one value per
# line, rounded to 6 decimal places. An existing file is overwritten.
with open('rewards.txt', 'w') as rewards_file:
    rewards_text = "".join(f"{round(value, 6)}\n" for value in agent.reward_history)
    rewards_file.write(rewards_text)

In [8]:
# Roll out the current policy for 10 episodes without calling agent.learn().
# NOTE(review): agent.train = False presumably switches the agent out of
# training mode (e.g. no exploration) -- confirm in RL.DeepQNetworkAgent.
agent.train = False
env = SinglePlayerFootball(title=train_id)
for _episode in range(10):
    obs = env.reset(ball_random=True)
    # loop_once() returning a truthy value ends the current episode.
    while not env.loop_once():
        chosen_action = agent.policy(obs)
        # Only the next observation is needed; reward and done are ignored.
        obs, _reward, _done = env.step(chosen_action)
del env