In [None]:
"""
Setup gfootball 
"""
!apt-get update
!apt-get install libsdl2-gfx-dev libsdl2-ttf-dev

# Make sure that the Branch in git clone and in wget call matches !!
!git clone -b v2.0.6 https://github.com/google-research/football.git
!mkdir -p football/third_party/gfootball_engine/lib

!wget https://storage.googleapis.com/gfootball/prebuilt_gameplayfootball_v2.0.6.so -O football/third_party/gfootball_engine/lib/prebuilt_gameplayfootball.so
!cd football && GFOOTBALL_USE_PREBUILT_SO=1 pip3 install .

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease    
Get:7 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]    
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:12 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease
Get:13 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:14 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [1,376 kB]
Get:

In [None]:
import gfootball.env as football_env

from sklearn.metrics import mean_squared_error

import numpy as np
from matplotlib import pyplot as plt

import random

import tensorflow as tf
from keras.callbacks import TensorBoard
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten,concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K
from keras.applications.mobilenet_v2 import MobileNetV2

from numpy import random

from collections import deque

In [None]:
"""
Define env
"""
# Build the 'academy_empty_goal' scenario using the 'simple115' observation representation.
env = football_env.create_environment(
    env_name='academy_empty_goal',
    representation='simple115',
)

# Reset once so an initial observation is available.
state = env.reset()

# Sizes the actor / critic networks are built from.
n_actions = env.action_space.n
state_dims = env.observation_space.shape



In [None]:
def actor_model(input_dims, output_dims):
  """Build and compile the actor (policy) network.

  Args:
    input_dims: shape tuple of a single state observation (without the batch axis).
    output_dims: number of discrete actions; size of the softmax output.

  Returns:
    A compiled Keras ``Model`` whose inputs are ``[state, oldpolicy_probs]`` and whose
    output is a softmax distribution over ``output_dims`` actions. It is compiled
    with Adam (lr=1e-4) and an MSE loss.
  """
  state_input = Input(shape=input_dims)
  # This input is not connected to the output; it is kept as a model input so that
  # callers can keep feeding two arrays to predict()/fit().
  oldpolicy_probs = Input(shape=(1, output_dims,))

  # Fully connected trunk.
  x = Dense(512, activation='relu', name='fc1')(state_input)
  x = Dense(256, activation='relu', name='fc2')(x)
  # Softmax turns the final layer into action probabilities.
  out_actions = Dense(output_dims, activation='softmax', name='predictions')(x)

  model = Model(inputs=[state_input, oldpolicy_probs],
                  outputs=[out_actions])
  model.compile(optimizer=Adam(lr=1e-4), loss='mse')
  return model




def critic_model(input_dims,extra_dims):
  """Build and compile the critic network.

  Args:
    input_dims: one-element shape tuple of a single state observation.
    extra_dims: one-element shape tuple of the second input (callers pass the
      action vector shape).

  Returns:
    A compiled Keras ``Model`` with inputs ``[state, extra]`` and an output of size
    ``n_actions`` (read from the notebook's global scope), compiled with
    Adam (lr=1e-4) and an MSE loss.
  """
  state_input = Input(shape = input_dims)
  extra_input = Input(shape = extra_dims)

  # Width of the first layer = state size + extra-input size.
  # np.prod gives the same value as before for one-element shapes without relying on
  # NumPy's deprecated conversion of a non-0-d array to a Python scalar.
  fc1_units = int(np.prod(input_dims)) + int(np.prod(extra_dims))

  # Fully connected net: the state goes through fc1, then is joined with the extra input.
  x = Dense(fc1_units, activation='relu', name='fc1')(state_input)
  ax = Dense(512, activation='relu', name='fc2')(concatenate([x,extra_input]))
  ax = Dense(512, activation='relu', name='fc3')(ax)
  # Final activation is softmax, so the outputs are non-negative and sum to 1.
  # NOTE(review): a Q-value critic normally ends in a linear layer — confirm intent.
  out_actions = Dense(n_actions, activation='softmax')(ax)

  model = Model(inputs=[state_input,extra_input], outputs=[out_actions])
  model.compile(optimizer=Adam(lr=1e-4), loss='mse')
  return model



In [None]:
"""
Define networks
"""
# Actor
actor = actor_model(input_dims = state_dims, output_dims = n_actions)
actor_target = actor_model(input_dims = state_dims, output_dims = n_actions)

# Critic
critic = critic_model(input_dims = state_dims,extra_dims = (n_actions,))
critic_target = critic_model(input_dims = state_dims,extra_dims = (n_actions,))

In [None]:
"""
Training

"""
# Per-episode statistics collected by the training loop.
ep_av_policyloss=[]
ep_av_criticloss=[]
rewards=[]
avg_rewards = []
steps_to_reward=[]

# The imports cell runs `from numpy import random` after `import random`, which
# rebinds the name to numpy.random. The loop below needs the stdlib module
# (random.sample), so bind it again here.
import random

# Minibatch size sampled from the per-episode transition record.
batch_size = 64

# episodes
max_iters = 10

# steps within episode
max_steps = 300

# Discount factor for the TD target.
gamma = 0.99

# Soft-update rate for the target networks:
#   target <- tau * online + (1 - tau) * target
# The loop uses `tau`, but it was never defined in this notebook.
tau = 1e-2

# Placeholder arrays for the actor's extra (unused) input.
dummy_n = np.zeros((1,1,n_actions))
dummy_1 = np.zeros((1,1,1))

# Step index at which the last goal was scored (updated by the loop).
steps=0
# Outer loop: one iteration is a training window of up to `max_steps` environment
# steps. The environment may terminate and be reset several times inside a window.
for episode in range(max_iters):
    print("episode {}...".format(episode))

    # A single reset is enough; its return value is the first observation.
    state = env.reset()
    episode_reward = 0

    # Per-episode transition record that minibatches are sampled from.
    state_rec = []
    action_rec = []
    reward_rec = []
    next_state_rec = []
    done_rec = []
    action_probs_rec = []

    episode_policy_loss = []
    episode_critic_loss = []

    # Batch-sized placeholder for the actor's unused second input, so every array
    # passed to predict() has the same number of samples.
    dummy_batch = np.zeros((batch_size, 1, n_actions))

    for step in range(max_steps):
        # Add the batch axis with NumPy rather than the Keras backend, so no backend
        # tensor is created on every step and predict() receives plain arrays.
        state_input = np.expand_dims(state, 0)

        # Forward pass of the actor: probability distribution over actions
        # (dummy_n only fills the unused second input).
        action_dist = actor.predict([state_input, dummy_n])

        # Sample an action index from that distribution.
        action = np.random.choice(n_actions, p=action_dist[0, :])

        next_state, reward, done, _ = env.step(action)

        # Count the reward immediately so that it is included even when the loop
        # breaks on a goal further down.
        episode_reward += reward

        state_rec.append(state)
        action_rec.append(action)
        reward_rec.append(reward)
        next_state_rec.append(next_state)
        done_rec.append(done)
        action_probs_rec.append(action_dist)

        # Train once more than `batch_size` transitions have been recorded.
        if len(reward_rec) > batch_size:
            # Sample a minibatch (without replacement) from the record.
            indices = random.sample(range(len(reward_rec)), batch_size)

            states = np.array([state_rec[i] for i in indices])
            next_states = np.array([next_state_rec[i] for i in indices])
            probs_batch = np.array([action_probs_rec[i] for i in indices])
            reward_col = np.array([reward_rec[i] for i in indices]).reshape(-1, 1)
            done_col = np.array([done_rec[i] for i in indices], dtype=np.float32).reshape(-1, 1)

            # One-hot encode the sampled action indices.
            actions_onehot = np.eye(n_actions)[[action_rec[i] for i in indices]]

            # Critic target: r + gamma * Q'(s', mu'(s')), with no bootstrapping
            # from transitions that ended an environment episode.
            next_actions = actor_target.predict([next_states, dummy_batch])
            next_Q = critic_target.predict([next_states, next_actions])
            Qprime = reward_col + gamma * (1.0 - done_col) * next_Q

            # Critic loss (logged only): current estimate for the taken
            # (state, action) pairs against the TD target.
            Qvals = critic.predict([states, actions_onehot])
            critic_loss = mean_squared_error(Qvals, Qprime)

            # Actor loss (logged only): negated mean critic output for the actor's
            # current action distribution.
            current_actions = actor.predict([states, dummy_batch])
            policy_loss = -np.mean(critic.predict([states, current_actions]))

            # Update networks.
            # NOTE(review): the actor is fitted with supervised MSE towards the actions
            # it sampled itself; this is not a deterministic policy-gradient step.
            print("episode: {}, step: {} / {}, fitting actor...".format(episode,step, max_steps))
            actor.fit([states, probs_batch], [actions_onehot], verbose=True, shuffle=True, epochs=10)
            print("episode: {}, step: {} / {}, fitting critic...".format(episode,step, max_steps))
            # Regress Q(s, a) for the taken actions onto the TD target.
            critic.fit([states, actions_onehot], [Qprime], verbose=True, shuffle=True, epochs=10)

            # Soft-update the target networks towards the online networks.
            actor_target.set_weights([
                tau * online_w + (1 - tau) * target_w
                for online_w, target_w in zip(actor.get_weights(), actor_target.get_weights())
            ])
            critic_target.set_weights([
                tau * online_w + (1 - tau) * target_w
                for online_w, target_w in zip(critic.get_weights(), critic_target.get_weights())
            ])

            episode_policy_loss.append(policy_loss)
            episode_critic_loss.append(critic_loss)

            # A reward above 0.9 is treated as reaching the target: stop this window.
            # NOTE(review): `steps` keeps its previous value when no goal is scored,
            # so steps_to_reward can repeat a stale entry — confirm intended.
            if reward>0.9:
                print("episode: {}, step: {}, reward: {}, average _reward: {} \n, policy loss: {}, value loss: {}".format(episode, step, episode_reward, np.mean(rewards[-10:]),policy_loss,critic_loss))
                steps =step
                break

        # Continue from a fresh observation when the environment episode ended;
        # otherwise from the observation just produced.
        state = env.reset() if done else next_state

    ep_av_policyloss.append(np.mean(episode_policy_loss))
    ep_av_criticloss.append(np.mean(episode_critic_loss))
    rewards.append(episode_reward)
    steps_to_reward.append(steps)

    avg_rewards.append(np.mean(rewards[-10:]))
    print("episode: {}, policy loss: {}, critic_loss: {}, reward_cum: {} ".format(episode, np.mean(episode_policy_loss),np.mean(episode_critic_loss), sum(rewards)))
    #tot_loss.append(tot_loss)

(115,)