In [None]:
# Mount the user's Google Drive inside the Colab VM.
from google.colab import drive
drive.mount('/content/drive')
# Make the project's code folder the working directory, presumably so that the
# local modules imported further down (utils, q_learning) can be found.
%cd /content/drive/Othercomputers/Il mio laptop/ANN_Project/code

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/Othercomputers/Il mio laptop/ANN_Project/code


In [None]:
!pip install line_profiler
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


## Setup

In [None]:
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from utils import *
from q_learning import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Configuration parameters for the whole setup
#seed = 0

# Environment instance handed to the training function in the last cell.
# NOTE(review): TictactoeEnv is not defined in this notebook; presumably it comes
# from one of the star imports above — confirm which module provides it.
env = TictactoeEnv()

### Definition of the Network


### Training
 

In [None]:
# NOTE(review): deep_q_learning_against_opt takes gamma and num_episodes as its own
# arguments and defines batch_size, max_memory_length and update_target_network
# locally, so it does not read the module-level values below. Only epsilon_min and
# epsilon_max are used later in this notebook (by the exploration rule).
gamma = 0.99  # Discount factor applied to future rewards
epsilon = 1.0  # Epsilon greedy parameter
decaying_exploration = True
epsilon_min = 0.1  # Minimum epsilon greedy parameter
epsilon_max = 0.8  # Maximum epsilon greedy parameter
epsilon_interval = (epsilon_max - epsilon_min)  # Width of the range over which epsilon can be reduced

num_episodes = 20000
batch_size = 64  # Size of batch taken from replay buffer
max_memory_length = 10000 # Size of the replay buffer
# How often to update the target network
update_target_network = 500

In [24]:
# Network defined in the project description
def create_q_model(hidden_neurons=128, num_actions=9):
    """
    Build the fully connected Q-network.

    The network flattens an input of shape (3, 3, 2), passes it through two
    ReLU-activated dense layers of equal width and ends with a linear dense
    layer that outputs one value per action.

    :param hidden_neurons: number of units in each of the two hidden layers (128 by default)
    :param num_actions: number of units of the linear output layer (9 by default)
    :return: an uncompiled keras.Model mapping (batch, 3, 3, 2) inputs to (batch, num_actions) outputs
    """
    # Inputs of shape = (3, 3, 2)
    board_input = layers.Input(shape=(3, 3, 2))
    flattened = layers.Flatten()(board_input)

    # Two fully connected hidden layers with ReLU activation
    hidden = layers.Dense(units=hidden_neurons, activation="relu")(flattened)
    hidden = layers.Dense(units=hidden_neurons, activation="relu")(hidden)

    # Output with linear activation function
    q_values = layers.Dense(units=num_actions, activation="linear")(hidden)

    return keras.Model(inputs=board_input, outputs=q_values)

# Sanity check: build one network and print its layer / parameter summary.
test_model = create_q_model()
test_model.summary()


def grid_to_tensor(grid, player):
    """
    Encode a board as two 0/1 planes stacked along a new last axis.

    For player 'X' the first plane flags the cells equal to 1 and the second
    plane the cells equal to -1; for any other value of `player` the two
    planes are swapped.

    :param grid: board array whose entries are compared against 1 and -1
    :param player: 'X' or 'O'
    :return: a tensor built from the stacked planes
    """
    # First value: mark shown in plane 0; second value: mark shown in plane 1.
    first_mark, second_mark = (1, -1) if player == 'X' else (-1, 1)
    first_plane = np.where(grid == first_mark, 1, 0)
    second_plane = np.where(grid == second_mark, 1, 0)
    planes = np.stack((first_plane, second_plane), -1)
    return tf.convert_to_tensor(planes)


class DeepQPlayer:
    """
    Player that always picks the action with the highest value predicted by a
    Q-network (no exploration).
    """
    def __init__(self, model, player='X'):
        """
        Store the network and the side this player plays.
        :param self: self
        :param model: callable network; called as model(batch, training=False) in act
        :param player: 'X' or 'O'
        """
        self.model = model  # network used to score the actions
        self.player = player  # side played, used to encode the board in act

    def set_player(self, player='X'):
        """
        Set player to be either 'X' or 'O'
        :param self: self
        :param player: 'X' or 'O' ('X' by default)
        """
        self.player = player

    def act(self, grid, **kwargs):
        """
        Performs a greedy move, i.e. a (1-epsilon)-greedy action with epsilon equal to zero.
        The choice is made over all the network outputs: cells are not filtered
        by availability here.
        :param self: self
        :param grid: current state
        :param kwargs: keyword arguments (ignored)
        :return: the index (int) of the action chosen greedily
        """
        state = tf.expand_dims(grid_to_tensor(grid, self.player), axis=0)
        q_values = self.model(state, training=False)[0]
        # One row of coordinates per maximal entry
        best_actions = tf.where(q_values == tf.reduce_max(q_values))
        pick = np.random.randint(0, len(best_actions))  # ties are split randomly
        # Index the single coordinate column explicitly so that int() receives a
        # scalar instead of a one-element vector (that conversion is deprecated
        # by recent NumPy versions).
        return int(best_actions[pick, 0])


def deep_q_learning_against_opt(env, lr=5e-4, gamma=0.99, num_episodes=20000, epsilon_exploration=0.1,
                                epsilon_exploration_rule=None, epsilon_opt=0.5, test_freq=None, verbose=False):
    """
    Train a Q-network with experience replay and a target network by playing
    against an OptimalPlayer.

    The agent and the opponent swap sides every game. One transition is stored
    per agent move: (state before the move, action, state after the opponent's
    reply, reward, done). A ValueError raised by env.step on the agent's move
    ends the game and declares the opponent the winner.

    :param env: game environment; must provide reset, step, reward, end, winner and current_player
    :param lr: learning rate of the Adam optimizer
    :param gamma: discount factor used in the Q-learning target
    :param num_episodes: number of training games
    :param epsilon_exploration: constant exploration rate, used only when no rule is given
    :param epsilon_exploration_rule: callable mapping the 1-based game number to the exploration rate
    :param epsilon_opt: epsilon passed to the OptimalPlayer used as training opponent
    :param test_freq: if not None, measure_performance is evaluated before training and
        then every test_freq games against OptimalPlayer(epsilon=0.) and OptimalPlayer(epsilon=1.)
    :param verbose: if True, print the test results and the target-network updates
    :return: (model, stats) where stats is a dict with keys
        'rewards' (final reward of every game),
        'test_Mopt' / 'test_Mrand' (lists of test results, empty when test_freq is None),
        'loss_train' (last training loss available at the end of every game,
        NaN for the games played before the first gradient step)
    """
    num_actions = 9
    batch_size = 64  # Size of batch taken from replay buffer
    max_memory_length = 10000  # Size of the replay buffer
    # How often to update the target network (num games)
    update_target_network = 500
    # Number of agent moves between two gradient steps
    update_freq = 1

    # Experience replay buffers (parallel lists, oldest transition first)
    action_history = []
    state_history = []
    state_next_history = []
    rewards_history = []
    done_history = []
    frame_count = 0

    model = create_q_model()
    model_target = create_q_model()

    # Adam optimizer
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    # Using huber loss for stability
    loss_function = keras.losses.Huber()

    turns = np.array(['X', 'O'])
    # Stats of training
    episode_rewards = np.empty(num_episodes)
    # NaN marks the games for which no gradient step has been taken yet
    loss_train = np.full(num_episodes, np.nan)
    loss = None
    if test_freq is not None:
        episode_Mopt = [measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=0.))]
        episode_Mrand = [measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=1.))]
        if verbose:
            print('Episode  0 :\tM_opt = ', episode_Mopt[0], '\tM_rand = ', episode_Mrand[0])
    else:
        episode_Mopt = []  # initialize
        episode_Mrand = []  # initialize
    # Rule for exploration
    if epsilon_exploration_rule is None:
        def epsilon_exploration_rule(n):
            return epsilon_exploration  # if an exploration rule is not given, it is the constant one

    for itr in range(num_episodes):
        # Sides are swapped at every game
        my_player = turns[itr % 2]
        player_opt = OptimalPlayer(epsilon=epsilon_opt, player=turns[(itr+1) % 2])
        state, _, _ = env.reset()
        # Let the opponent open the game when it is its turn
        if env.current_player == player_opt.player:
            move = player_opt.act(state)
            state, _, _ = env.step(move)

        for _ in range(num_actions):
            state_tensor = tf.expand_dims(grid_to_tensor(state, my_player), axis=0)
            # Use epsilon-greedy for exploration
            if epsilon_exploration_rule(itr+1) > np.random.uniform(0, 1):
                # Take random action
                action = np.random.choice(num_actions)
            else:
                # Predict action Q-values from environment state
                action_probs = model(state_tensor, training=False)
                # Take best action
                max_indices = tf.where(action_probs[0] == tf.reduce_max(action_probs[0]))
                action = int(max_indices[np.random.randint(0, len(max_indices))])  # ties are split randomly

            # Apply the sampled action in our environment
            try:
                state_adv, _, _ = env.step(action)
            except ValueError:
                # The move was rejected by the environment: the game is over and lost
                env.end = True
                env.winner = player_opt.player
            if not env.end:
                action_adv = player_opt.act(state_adv)
                state_next, _, _ = env.step(action_adv)
            else:
                # TODO: to be reviewed. The next state of a terminal transition is
                # never used in the target because of the (1 - done) factor below.
                state_next = state

            reward = env.reward(player=my_player)
            done = env.end

            frame_count += 1
            # Save actions and states in replay buffer
            action_history.append(action)
            state_history.append(grid_to_tensor(state, my_player))
            state_next_history.append(grid_to_tensor(state_next, my_player))
            done_history.append(done)
            rewards_history.append(reward)

            state = state_next
            # Update after every update_freq steps and once the buffer holds more than batch_size samples
            if frame_count % update_freq == 0 and len(done_history) > batch_size:
                # Get indices of samples for replay buffers (sampled with replacement)
                indices = np.random.choice(len(done_history), size=batch_size)

                # Using list comprehension to sample from replay buffer
                state_sample = tf.stack([state_history[j] for j in indices], axis=0)
                state_next_sample = tf.stack([state_next_history[j] for j in indices], axis=0)
                rewards_sample = [rewards_history[j] for j in indices]
                action_sample = [action_history[j] for j in indices]
                done_sample = tf.convert_to_tensor([float(done_history[j]) for j in indices])

                # Build the updated Q-values for the sampled future states
                # Use the target model for stability
                # TODO: check what changes without training=False
                future_rewards = model_target(state_next_sample, training=False)
                # Q value = reward + discount factor * expected future reward (zero for terminal transitions)
                updated_q_values = rewards_sample + gamma * tf.reduce_max(future_rewards, axis=1) * (1 - done_sample)

                # Create a mask so we only calculate loss on the updated Q-values
                masks = tf.one_hot(action_sample, num_actions)

                with tf.GradientTape() as tape:
                    # Train the model on the states and updated Q-values
                    q_values = model(state_sample)
                    # Apply the masks to the Q-values to get the Q-value for action taken
                    q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                    # Calculate loss between new Q-value and old Q-value
                    loss = loss_function(updated_q_values, q_action)

                # Backpropagation
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Limit the state and reward history: drop the oldest transition
            if len(rewards_history) > max_memory_length:
                del rewards_history[:1]
                del state_history[:1]
                del state_next_history[:1]
                del action_history[:1]
                del done_history[:1]

            if done:
                break

        if itr % update_target_network == 0:
            # update the target network with new weights
            if verbose:
                print("Updating target network")
            model_target.set_weights(model.get_weights())

        episode_rewards[itr] = env.reward(player=my_player)
        if loss is not None:
            loss_train[itr] = float(loss)
        # Testing the performance
        if (test_freq is not None) and ((itr+1) % test_freq == 0):
            M_opt = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=0.))
            M_rand = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=1.))
            episode_Mopt.append(M_opt)
            episode_Mrand.append(M_rand)
            if verbose:
                print('Episode ', itr+1, ':\tM_opt = ', M_opt, '\tM_rand = ', M_rand)

    # Dictionary of stats
    stats = {
        'rewards': episode_rewards,
        'test_Mopt': episode_Mopt,
        'test_Mrand': episode_Mrand,
        'loss_train': loss_train
    }
    return model, stats


Model: "model_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_18 (InputLayer)       [(None, 3, 3, 2)]         0         
                                                                 
 flatten_17 (Flatten)        (None, 18)                0         
                                                                 
 dense_51 (Dense)            (None, 128)               2432      
                                                                 
 dense_52 (Dense)            (None, 128)               16512     
                                                                 
 dense_53 (Dense)            (None, 9)                 1161      
                                                                 
Total params: 20,105
Trainable params: 20,105
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Game number at which the linearly decaying term of the rule reaches zero
n_star = 1


def rule(n):
    """
    Exploration rate for game number n: epsilon_max decreased linearly with
    n / n_star and bounded below by epsilon_min.
    """
    decayed = epsilon_max * (1 - n / n_star)
    return max(epsilon_min, decayed)


model, stats = deep_q_learning_against_opt(
    env,
    num_episodes=20000,
    epsilon_exploration_rule=rule,
    test_freq=250,
    verbose=True,
)

Episode  0 :	M_opt =  -1.0 	M_rand =  -0.936
Updating target network
Episode  250 :	M_opt =  -0.884 	M_rand =  0.244


KeyboardInterrupt: ignored