<a href="https://colab.research.google.com/github/Lee-1997/fuzzy-tribble/blob/master/DoubleDQN_gfootball.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Double DQN算法应用于academy_empty_goal环境：

In [0]:
!apt-get update
!apt-get install libsdl2-gfx-dev libsdl2-ttf-dev

# Make sure that the Branch in git clone and in wget call matches !!
!git clone -b v2.0.6 https://github.com/google-research/football.git
!mkdir -p football/third_party/gfootball_engine/lib

!wget https://storage.googleapis.com/gfootball/prebuilt_gameplayfootball_v2.0.6.so -O football/third_party/gfootball_engine/lib/prebuilt_gameplayfootball.so
!cd football && GFOOTBALL_USE_PREBUILT_SO=1 pip3 install .

In [0]:
import gfootball.env as football_env
import collections
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


hyper_episodes = 1000
hyper_gamma = 0.95
hyper_gradient_step = 1e-3
hyper_soft_update = 0.999
hyper_batch_size = 50
hyper_replay_size = 2000
hyper_replay_start = 200
hyper_epsilon_start = 1.0
hyper_epsilon_decay = 0.99
hyper_epsilon_final = 0.
torch.manual_seed(1234)

class NeuralNetworks(nn.Module):
  def __init__(self, input_size, output_size, embedding_size=100, hidden_size=20):
    super(NeuralNetworks, self).__init__()
    self.fc1 = nn.Linear(input_size, embedding_size)
    self.bn1 = nn.BatchNorm1d(embedding_size)
    self.fc2 = nn.Linear(embedding_size, hidden_size)
    self.bn2 = nn.BatchNorm1d(hidden_size)
    self.fc3 = nn.Linear(hidden_size, hidden_size)
    self.bn3 = nn.BatchNorm1d(hidden_size)
    self.fc4 = nn.Linear(hidden_size, hidden_size)
    self.bn4 = nn.BatchNorm1d(hidden_size)
    self.fc5 = nn.Linear(hidden_size, hidden_size)
    self.bn5 = nn.BatchNorm1d(hidden_size)
    self.fc6 = nn.Linear(hidden_size, hidden_size)
    self.bn6 = nn.BatchNorm1d(hidden_size)
    self.fc7 = nn.Linear(hidden_size, hidden_size)
    self.bn7 = nn.BatchNorm1d(hidden_size)
    self.fc8 = nn.Linear(hidden_size, hidden_size)
    self.bn8 = nn.BatchNorm1d(hidden_size)
    self.fc9 = nn.Linear(hidden_size, hidden_size)
    self.bn9 = nn.BatchNorm1d(hidden_size)
    self.fc10 = nn.Linear(hidden_size, hidden_size)
    self.bn10 = nn.BatchNorm1d(hidden_size)
    self.fc11 = nn.Linear(hidden_size, output_size)    

  def forward(self, x):
    x = F.relu(self.bn1(self.fc1(x)))
    x = F.relu(self.bn2(self.fc2(x)))
    x = F.relu(x + self.bn4(self.fc4(F.relu(self.bn3(self.fc3(x))))))
    x = F.relu(x + self.bn6(self.fc6(F.relu(self.bn5(self.fc5(x))))))
    x = F.relu(x + self.bn8(self.fc8(F.relu(self.bn7(self.fc7(x))))))
    x = F.relu(x + self.bn10(self.fc10(F.relu(self.bn9(self.fc9(x))))))
    x = self.fc11(x)
    return x

Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

class ExperienceReplay:
  def __init__(self, capacity):
    self.buffer = collections.deque(maxlen=capacity)

  def __len__(self):
    return len(self.buffer)

  def append(self, experience):
    self.buffer.append(experience)

  def sample(self, batch_size):
    indices = np.random.choice(len(self.buffer), batch_size, replace=False)
    states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])
    return np.array(states), np.array(actions), np.array(rewards, dtype=np.float32), np.array(dones, dtype=np.uint8), np.array(next_states)


class DdqnAgent:
  def __init__(self, env, exp_buffer):
    self.env = env
    self.exp_buffer = exp_buffer
    self._reset()

  def _reset(self):
    self.state = env.reset()
    self.total_reward = 0.0

  def play_step(self, net, epsilon=0.0, device="cpu"):
    done_reward = None
    if np.random.random() < epsilon:
      action = env.action_space.sample()
    else:
      net.eval()
      state_a = np.array([self.state], copy=False)
      state_re = torch.tensor(state_a)
      qvalus_re = net(state_re)
      _, act_re = torch.max(qvalus_re, dim=1)
      action = int(act_re.item())
  
    new_state, reward, is_done, _ = self.env.step(action)
    self.total_reward += reward
    exp = Experience(self.state, action, reward, is_done, new_state)
    self.exp_buffer.append(exp)
    self.state = new_state
    if is_done:
      done_reward = self.total_reward
      self._reset()
    return done_reward


def loss_calculation(batch, net, target_net, device="cpu"):
  states, actions, rewards, dones, next_states = batch
  states_re = torch.tensor(states)
  next_states_re = torch.tensor(next_states)
  actions_re = torch.tensor(actions)
  rewards_re = torch.tensor(rewards)
  done_mask = torch.ByteTensor(dones)
  state_action_values = net(states_re).gather(1, actions_re.unsqueeze(-1)).squeeze(-1)
  next_state_values = net(next_states_re)
  _, next_action_re = torch.max(next_state_values, dim=1)
  next_state_action_values = target_net(next_states_re).gather(1, next_action_re.unsqueeze(-1)).squeeze(-1)
  next_state_action_values[done_mask] = 0.0
  next_state_action_values = next_state_action_values.detach()
  expected_state_action_values = next_state_action_values * hyper_gamma + rewards_re
  return nn.MSELoss()(state_action_values, expected_state_action_values)


if __name__ == "__main__":
  env = football_env.create_environment(env_name="academy_empty_goal", representation='simple115', number_of_left_players_agent_controls=1, stacked=False, logdir='/tmp/football', write_goal_dumps=False, write_full_episode_dumps=False, render=False)
  net = NeuralNetworks(env.observation_space.shape[0], env.action_space.n)
  target_net = NeuralNetworks(env.observation_space.shape[0], env.action_space.n)
  target_net.load_state_dict(net.state_dict())
  buffer = ExperienceReplay(hyper_replay_size)
  agent = DdqnAgent(env, buffer)
  epsilon = hyper_epsilon_start
  optimizer = optim.Adam(net.parameters(), lr=hyper_gradient_step)
  total_rewards = []
  step_list = []
  Eps = []
  loss_episode = []

  while True:    
    reward = agent.play_step(net, epsilon,)
    if len(buffer) > hyper_replay_start and reward is not None:
      break

  for episode_index in range(1,hyper_episodes+1):
    step_index = 0
    loss_step = []
    epsilon = max(hyper_epsilon_final, hyper_epsilon_start * (hyper_epsilon_decay ** episode_index) )
    Eps.append(epsilon)

    while True:
      step_index += 1
      reward = agent.play_step(net, epsilon)

      optimizer.zero_grad()
      batch = buffer.sample(hyper_batch_size)
      loss_t = loss_calculation(batch, net, target_net)
      loss_step.append(loss_t.item())
      loss_t.backward()
      optimizer.step()

      temp_net = collections.OrderedDict()
      for param_tensor in net.state_dict():
        temp_net_new = net.state_dict()[param_tensor].numpy()
        temp_net_old = target_net.state_dict()[param_tensor].numpy() 
        temp_net[param_tensor] = torch.from_numpy(np.array(temp_net_new * (1-hyper_soft_update) + temp_net_old * hyper_soft_update))
      target_net.load_state_dict(temp_net)

      if reward is not None:
        step_list.append(step_index)
        total_rewards.append(reward)
        loss_episode.append(np.mean(np.array(loss_step)))
        print("episode: %d/%d, step: %d, reward: %.2f, epsilon: %.2f" % (episode_index, hyper_episodes, step_index, reward, epsilon))
        break

  plt.plot(list(range(1,hyper_episodes+1)),step_list)
  plt.title('Steps of each Episode')
  plt.ylabel('Steps')
  plt.xlabel('Episodes')
  plt.show()
  plt.plot(list(range(1,hyper_episodes+1)),Eps)
  plt.title('Epsilon of each Episode')
  plt.ylabel('Epsilon')
  plt.xlabel('Episodes')
  plt.show()
  plt.plot(list(range(1,hyper_episodes+1)),total_rewards)
  plt.title('Reward of each Episode')
  plt.ylabel('Reward')
  plt.xlabel('Episodes')
  plt.show()
  plt.plot(list(range(1,hyper_episodes+1)),loss_episode)
  plt.title('Mean Loss of each Episode')
  plt.ylabel('Mean Loss')
  plt.xlabel('Episodes')
  plt.show()

episode: 1/1000, step: 73, reward: 1.00, epsilon: 0.99
episode: 2/1000, step: 324, reward: 0.00, epsilon: 0.98
episode: 3/1000, step: 61, reward: 0.00, epsilon: 0.97
episode: 4/1000, step: 204, reward: 0.00, epsilon: 0.96
episode: 5/1000, step: 66, reward: 0.00, epsilon: 0.95
episode: 6/1000, step: 77, reward: 0.00, epsilon: 0.94
episode: 7/1000, step: 33, reward: 0.00, epsilon: 0.93
episode: 8/1000, step: 68, reward: 0.00, epsilon: 0.92
episode: 9/1000, step: 169, reward: 0.00, epsilon: 0.91
episode: 10/1000, step: 77, reward: 0.00, epsilon: 0.90
episode: 11/1000, step: 112, reward: 0.00, epsilon: 0.90
episode: 12/1000, step: 112, reward: 0.00, epsilon: 0.89
episode: 13/1000, step: 62, reward: 0.00, epsilon: 0.88
episode: 14/1000, step: 61, reward: 0.00, epsilon: 0.87
episode: 15/1000, step: 302, reward: 0.00, epsilon: 0.86
episode: 16/1000, step: 43, reward: 0.00, epsilon: 0.85
episode: 17/1000, step: 104, reward: 0.00, epsilon: 0.84
episode: 18/1000, step: 177, reward: 0.00, epsilon

KeyboardInterrupt: ignored