In [1]:
import torch
import torch.nn as nn
from torch.distributions import Categorical
import numpy as np
from RL import ReinforceAgent
from single_agent_env import SinglePlayerFootball, ACTION_SPACE_SIZE, STATE_SPACE_SIZE
import matplotlib.pyplot as plt
# Seed both the PyTorch and the NumPy global generators with the same value
# so that sampling done through either library is repeatable across runs.
SEED = 3407
torch.manual_seed(SEED)
np.random.seed(SEED)

In [2]:
class PG(nn.Module):
    """Policy-gradient network: an MLP that maps an observation to action probabilities.

    The network is a stack of four linear layers (observation_size -> 512 ->
    256 -> 128 -> action_size) with LeakyReLU activations, followed by a
    softmax over the action dimension.
    """

    def __init__(self, observation_size, action_size):
        """Build the network.

        Args:
            observation_size: number of input features per observation.
            action_size: number of discrete actions (width of the output layer).
        """
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(observation_size, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 128),
            nn.LeakyReLU(),
            nn.Linear(128, action_size),
            # Normalise over the LAST axis (the action axis). For a 1-D
            # observation this is identical to dim=0; for an input with a
            # leading batch axis, dim=0 would normalise across the batch
            # instead of across actions and yield a meaningless policy.
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Sample one action from the policy for observation ``x``.

        Args:
            x: float tensor whose last dimension is ``observation_size``.
                Because the sampled action is converted with ``.item()``,
                ``x`` must describe a single observation (a 1-D tensor, or a
                tensor with leading dimensions of size 1).

        Returns:
            A tuple ``(action, log_prob)`` where ``action`` is the sampled
            action index as a Python int and ``log_prob`` is a tensor holding
            the log-probability of that action under the current policy.
        """
        probs = self.model(x)
        dist = Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)

In [None]:
# --- Training -----------------------------------------------------------
env = SinglePlayerFootball(title="Reinforce train")

# Prefer the second GPU (the original setting), but fall back gracefully so
# the notebook still runs on hosts with a single GPU or no GPU at all.
# NOTE(review): assumes ReinforceAgent accepts any torch device string - confirm.
if torch.cuda.device_count() > 1:
    device = "cuda:1"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

agent = ReinforceAgent(STATE_SPACE_SIZE, ACTION_SPACE_SIZE, device=device)
# `y` is presumably the discount factor (gamma) - verify against ReinforceAgent.
agent.create_model(PG, lr=0.00025, y=0.99)
scores = []  # sum of rewards collected in each training episode

# Keep starting new episodes for as long as the environment reports it is running.
while env.running:
    reward = []
    s = env.reset()
    # An episode lasts until env.loop_once() returns a truthy value.
    while not env.loop_once():
        a = agent.policy(s)
        s, r, d = env.step(a)
        agent.learn(r, d)
        reward.append(r)
    scores.append(sum(reward))

# --- Evaluation ---------------------------------------------------------
# Turn rendering on and play 10 more episodes with the trained agent;
# agent.learn() is not called here, so rewards and done flags are discarded.
env.rendering = True

for _ in range(10):
    s = env.reset()
    while not env.loop_once():
        s, _, _ = env.step(agent.policy(s))

# Drop the notebook's reference to the environment.
del env