In [1]:
import environment as env
import policy
import numpy as np
import random
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from operator import itemgetter

# Deep Q Network for Mille Bornes - Second Attempt
Goals for this attempt:
- Learn how to store a trained model
- Implement a replay buffer (instead of training every game, create a replay buffer and sample experiences)
- Implement Fixed Q-Value Targets (two models, an online and a target model)

In [2]:
# Global parameters and variables
# These module-level names are read directly by the functions defined in later cells.
discount_factor = 0.9  # Weight applied to the bootstrapped next-state Q value in the Bellman target
optimizer = keras.optimizers.Adam(learning_rate=.001)  # Single optimizer instance shared by every train_model call
loss_fn = keras.losses.mean_squared_error  # Loss between target Q values and the Q values of the actions taken
model_input_shape = [47]  # Input shape of the first Dense layer (presumably the length of Game.state() - TODO confirm)
model_n_outputs = 97  # Width of the output layer: one Q value per action index (also the one-hot depth in train_model)
model_folder = "Larry"  # Path passed to model.save / keras.models.load_model

# Card and Action matrix
# Built once and passed to env.actions_space when filtering next-state Q values to valid actions
card_matrix = env.card_matrix_build()
action_matrix = env.action_matrix_build(card_matrix)

# Replay buffer
#  Only experiences for the DQN player are stored
#  Five parallel inner lists of equal length, in this order:
#    [0] states, [1] actions, [2] rewards, [3] next states, [4] dones
#    Next states: for the final ("done") experience of a game there is no real next state,
#      so the state itself is stored as a placeholder (it is masked out by "dones" in the target)
#    dones: 0 = no, game continued; 1 = yes, game over
#  The buffer is never trimmed in this notebook, so it grows with every game appended.
replay_buffer = [ [], [], [], [], [] ]



In [3]:
# Model functions

def model_create():
    """
    Build a fresh, untrained Q-network

    The network is a plain feed-forward stack: two hidden Dense layers of 32
    ELU units followed by a linear Dense layer with one output per action
    (model_n_outputs). The input shape is taken from model_input_shape.

    Return - Sequential model object
    """
    q_network = keras.models.Sequential()

    # Hidden layers (the first one also declares the input shape)
    q_network.add(keras.layers.Dense(32, activation="elu", input_shape=model_input_shape))
    q_network.add(keras.layers.Dense(32, activation="elu"))

    # Output layer: no activation, raw Q value per action index
    q_network.add(keras.layers.Dense(model_n_outputs))

    return q_network

In [4]:
# Adds to the replay buffer at the conclusion of a game
def replay_buffer_append(game, player_index):
    """
    Collect one player's experiences from a finished game and add them to the replay buffer

    The action history is walked backwards. Every entry is read as
    (player, state, action, points). Points scored by anyone between two of the
    DQN player's recorded actions are folded into the reward of the earlier
    action: added when the acting player is on the DQN player's team,
    subtracted otherwise. Entries of the DQN player with an action of -1 or
    lower are not recorded as experiences; their points are folded in like any
    other team-mate's.

    game (environment.Game) - the instance of the Game class that was played

    player_index (int) - the index (into game.players) of the DQN player to store experiences for
    """
    dqn_player = game.players[player_index]

    # One list per replay-buffer column, filled newest-first while walking backwards
    states, actions, rewards, next_states, dones = [], [], [], [], []

    # Points accumulated since (chronologically: after) the DQN player's action being built
    pending_reward = 0

    for entry in reversed(game.action_history):
        actor = entry[0]

        if not (actor == dqn_player and entry[2] > -1):
            # Someone else's move (or a non-recordable one): only adjusts the running reward
            sign = 1 if actor.team == dqn_player.team else -1
            pending_reward += entry[3] * sign
            continue

        # A recordable action of the DQN player closes the current reward window
        pending_reward += entry[3]
        state = entry[1]

        if states:
            # The experience recorded just before this one is the chronologically next state
            following_state, finished = states[-1], 0
        else:
            # Last action of the game for this player: no next state, reuse the state itself
            following_state, finished = state, 1

        states.append(state)
        actions.append(entry[2])
        rewards.append(pending_reward)
        next_states.append(following_state)
        dones.append(finished)

        pending_reward = 0

    # Flip each column back into game-play order and append it to the shared buffer
    collected = (states, actions, rewards, next_states, dones)
    for column_index in range(5):
        collected[column_index].reverse()
        replay_buffer[column_index].extend(collected[column_index])
    

In [5]:
# Return a sample of experiences from the replay buffer
def replay_buffer_sample(batch_size):
    """
    Return a random sample (with replacement) of experiences from the replay buffer

    batch_size (int) - the number of experiences to sample

    Return - tuple of five numpy arrays (states, actions, rewards, next_states, dones),
             each with batch_size entries along the first axis

    Raises ValueError if the replay buffer is empty.
    """
    buffer_size = len(replay_buffer[0])
    if buffer_size == 0:
        raise ValueError("Cannot sample from an empty replay buffer")

    indices = np.random.randint(buffer_size, size=batch_size)

    # Gather the same indices from every column. A list comprehension is used
    # instead of operator.itemgetter because itemgetter returns a bare item
    # (not a tuple) when given a single index, which dropped the batch axis
    # for batch_size == 1, and it raises TypeError when given no index at all.
    states, actions, rewards, next_states, dones = (
        np.array([column[index] for index in indices])
        for column in replay_buffer
    )

    return states, actions, rewards, next_states, dones

In [6]:
# Train model using a sample of experiences from the buffer
def train_model(batch_size):
    """
    Train the online model with one gradient step on a batch sampled from the replay buffer

    Targets are computed with the target model (model_target); gradients are
    applied to the online model (model) using the module-level optimizer.

    batch_size (int) - the number of experiences to sample from the replay buffer
    """

    # Batch of experiences to learn from
    states, actions, rewards, next_states, dones = replay_buffer_sample(batch_size)

    # ----- Q values for next states
    Q_values_next_state = model_target.predict(next_states, verbose=0)

    # Filter next state Q values to only valid actions
    # (the result of env.actions_space is used as a list of indices into each Q-value row)
    valid_actions = [env.actions_space(next_state, card_matrix, action_matrix) for next_state in next_states]

    # Get the max Q value over the valid actions of each next state
    Q_values_next_state_max = np.array([
        max(q_row[action] for action in valid)
        for q_row, valid in zip(Q_values_next_state, valid_actions)
    ])

    # Target Q values: Bellman function: rewards + discounted future rewards (max Q value from the next state as model is assumed to act optimally)
    # (1 - dones) removes the bootstrapped term for terminal experiences, whose stored next state is only a placeholder
    Q_values_target = rewards + (1 - dones) * discount_factor * Q_values_next_state_max

    # Make the targets a column vector so they line up element-for-element with
    # Q_values_state below, which has shape (batch, 1). Without this the loss
    # broadcasts (batch,) against (batch, 1) into a (batch, batch) matrix and
    # compares every prediction with every target.
    Q_values_target = Q_values_target.reshape(-1, 1)

    # ----- Q values for states, compute loss and train model

    # Create mask to retain only the action that was taken in each state
    mask = tf.one_hot(actions, model_n_outputs)

    # Record the forward pass so the loss can be differentiated w.r.t. the online model's weights
    with tf.GradientTape() as tape:
        # Retrieve Q values for all possible actions in each state
        Q_values_state_all = model(states)

        # Retain only Q values for actions taken (state-action pair - uses "mask")
        Q_values_state = tf.reduce_sum(Q_values_state_all * mask, axis=1, keepdims=True)

        # Compute loss
        loss = tf.reduce_mean(loss_fn(Q_values_target, Q_values_state))
    gradients = tape.gradient(loss, model.trainable_variables)

    # Optimize the model
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [7]:
# Play a game
def play_game(players_name, players_policy, epsilon):
    """
    Play a single game from start to finish

    On every turn the current player's policy is looked up by the position of
    the player's name in players_name, an action is chosen with that policy and
    the action is played. The loop runs while game.play_status is below 4.

    players_name ([str]) - name of players
    players_policy ([str]) - policy to apply to each player, parallel to players_name
                             ("dqn", "program"; any other value selects the random policy)
    epsilon (float) - value passed to the DQN policy for its explore/exploit decision

    Return Game
    """
    game = env.Game(players_name)

    def choose_action(policy_type):
        # Dispatch to the matching policy; only the DQN policy needs the model, state and epsilon
        if policy_type == "dqn":
            return policy.dqn(model, np.array(game.state()), game.player_actions, epsilon)
        if policy_type == "program":
            return policy.program(game.player_actions)
        return policy.rand(game.player_actions)

    # Loop game play
    while game.play_status < 4:
        seat = players_name.index(game.player_current.name)
        game.play_action(choose_action(players_policy[seat]))

    return game

In [8]:
# Play a tournament (multiple games)
def play_tournament(players_name, players_policy, n_games, train_batch_size):
    """
    Play multiple games for the given players based on their policies, training the DQN model along the way

    After each game the DQN player's experiences are appended to the replay
    buffer. Once more than the first few games have been played, the online
    model is trained on one sampled batch per game, and the target model is
    synchronised with the online model every 50 games (including game 0).

    players_name ([str]) - players' names
    players_policy ([str]) - policy to decide action for each player (must contain 'dqn')
    n_games (int) - number of games to play
    train_batch_size (int) - number of steps/turns to pull from the buffer to train the model each time

    Return - tuple (winner_team, dqn_rewards), one entry per game:
             winner_team - index of the winning team, or -1 when the top score is tied
             dqn_rewards - the DQN team's final points minus every other team's final points
    """

    # General variables
    players_count = len(players_name)
    teams_count = players_count if players_count < 4 else players_count // 2
    dqn_player_index = players_policy.index('dqn')
    dqn_team_index = dqn_player_index if players_count < 4 else dqn_player_index % teams_count  # Team the DQN player is on

    # Statistics to track
    winner_team = []
    dqn_rewards = []

    # Epsilon decays linearly from 1; after 90% of games have been run it sits at its
    # floor of 0.01 (highest likelihood the policy will select the DQN option).
    # Clamped to at least 1 so that a tournament of a single game does not divide by zero.
    epsilon_max = max(int(n_games * .9), 1)

    for game_number in range(n_games):
        # Determine epsilon for DQN
        epsilon = max(1 - game_number / epsilon_max, 0.01)

        # Play a game and then get player's experiences (replay buffer)
        game = play_game(players_name, players_policy, epsilon)
        replay_buffer_append(game, dqn_player_index)

        # Track statistics
        team_points = game.final_team_points()
        max_points = max(team_points)
        winner_team.append(team_points.index(max_points) if team_points.count(max_points) == 1 else -1)
        dqn_rewards.append(np.sum([
            team_points[team] * (1 if team == dqn_team_index else -1)
            for team in range(teams_count)
        ]))

        # Train the model (skip the first few episodes/games to ensure the buffer is large enough)
        if game_number > 5:
            train_model(train_batch_size)

        # Update the target model after every 50 games
        if game_number % 50 == 0:
            model_target.set_weights(model.get_weights())

    # Return statistics
    return (winner_team, dqn_rewards)

In [52]:
# Core code: First pass
# Creates a brand-new model, trains it over one tournament, saves it and appends the results to the log files.

# Create a new model
model = model_create()

# Setup target model (same architecture and weights as the online model)
model_target = keras.models.clone_model(model)
model_target.set_weights(model.get_weights())

# Setup parameters
players = ['Bob', 'Larry', 'Jr']
policies = ['random', 'dqn', 'program']
n_games = 500
train_batch_size = 50

# Play the tournament
winner_team, dqn_rewards = play_tournament(players, policies, n_games, train_batch_size)

# Save the model
model.save(model_folder)

# Write team wins to file (note, this code only works for 2 or 3 players,
# where every player is their own team and the team index equals the policy index)
# "with" guarantees the file is closed even if a write fails
with open('log_wins.csv', 'a+') as file_wins:
    for i in range(len(policies)):
        file_wins.write(f"{policies[i]},{winner_team.count(i)},{n_games}\n")

    # A winner of -1 marks a game whose top score was tied
    file_wins.write(f"ties,{winner_team.count(-1)},{n_games}\n")

# Write DQN Rewards to file (one reward per line, appended to earlier runs)
with open('log_rewards.csv', mode='a+') as file_dqn_rewards:
    for reward in dqn_rewards:
        file_dqn_rewards.write(f"{reward}\n")

INFO:tensorflow:Assets written to: Larry\assets


In [27]:
# Resume training
# Loads the previously saved model, trains it over another tournament, saves it and appends the results to the log files.

# Load saved model
model = keras.models.load_model(model_folder)

# Setup target model (same architecture and weights as the online model)
model_target = keras.models.clone_model(model)
model_target.set_weights(model.get_weights())

# NOTE(review): train_model uses the module-level `optimizer`, so any optimizer state built up by
# earlier training in this kernel carries over to the freshly loaded model - confirm that is intended.

# Setup parameters
players = ['Bob', 'Larry', 'Jr']
policies = ['random', 'dqn', 'program']
n_games = 1000
train_batch_size = 50

# Play the tournament
winner_team, dqn_rewards = play_tournament(players, policies, n_games, train_batch_size)

# Save the model
model.save(model_folder)

# Write team wins to file (note, this code only works for 2 or 3 players,
# where every player is their own team and the team index equals the policy index)
# "with" guarantees the file is closed even if a write fails
with open('log_wins.csv', 'a+') as file_wins:
    for i in range(len(policies)):
        file_wins.write(f"{policies[i]},{winner_team.count(i)},{n_games}\n")

    # A winner of -1 marks a game whose top score was tied
    file_wins.write(f"ties,{winner_team.count(-1)},{n_games}\n")

# Write DQN Rewards to file (one reward per line, appended to earlier runs)
with open('log_rewards.csv', mode='a+') as file_dqn_rewards:
    for reward in dqn_rewards:
        file_dqn_rewards.write(f"{reward}\n")

INFO:tensorflow:Assets written to: Larry\assets


In [29]:
# Play some games using only the model (exploit not explore)
# No training happens in this cell: epsilon is 0 and the replay buffer is not touched.

# Load saved model
model = keras.models.load_model(model_folder)

# Setup parameters
players = ['Bob', 'Larry', 'Jr']
policies = ['random', 'dqn', 'program']
n_games = 500
epsilon = 0.0
winner_team = []

# Play games
for i in range(n_games):
    game = play_game(players, policies, epsilon)

    # Record the winning team index, or -1 when the top score is shared
    team_points = game.final_team_points()
    max_points = max(team_points)
    if team_points.count(max_points) == 1:
        winner_team.append(team_points.index(max_points))
    else:
        winner_team.append(-1)

# Report winner counts (team index i is labelled with players[i])
for i in range(len(players)):
    print(f"{players[i]}: {winner_team.count(i)} wins")

print(f"Ties: {winner_team.count(-1)} wins")
print(f"Total Games: {n_games}")

Bob: 20 wins
Larry: 42 wins
Jr: 432 wins
Ties: 6 wins
Total Games: 500
