## Setup

In [None]:
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from utils import *
from q_learning import *
from deep_q_learning import *

In [None]:
# Notebook-wide switches
save_stats, save_figs = True, True  # output flags; `save_figs` gates the fig.savefig calls below
train = True                        # gates the `if train:` sweep cells
load = not train                    # always the complement of `train`

# NOTE(review): no cell visible in this notebook applies `seed` to an RNG -- confirm where it is consumed.
seed = 0

In [None]:
# Configuration parameters shared by the experiments below

# Environment instance handed to every training call
env = TictactoeEnv()

# NOTE(review): `gamma` and `lr` are not referenced by the cells visible here;
# presumably the discount factor and learning rate -- confirm how they are consumed.
gamma, lr = 0.99, 5e-4

# Bounds forwarded to return_lambda_explor(...) when building exploration schedules
epsilon_min, epsilon_max = 0.1, 0.8

# Experiment sizing
num_episodes = 20000  # forwarded as `num_episodes` to the training routines
test_freq = 250       # forwarded as `test_freq` to the training routines
num_avg = 1           # number of repeated runs per setting

## Learning from experts

### Question 11: Average reward and average loss during training

In [None]:
# Question 11: train with a fixed exploration rate and collect, for each of the
# `num_avg` repetitions, the reward and training-loss histories returned in `stats`.
epsilon = 0.1
rewards_list = []
losses_list = []
for run_idx in range(num_avg):
    # `model` is rebound on every repetition, so only the model from the last
    # repetition is available to the cells that follow.
    # `test_freq` comes from the configuration cell (previously a hard-coded 250),
    # matching how the sweep cells further down pass it.
    model, stats = deep_q_learning(env, verbose=True, epsilon_exploration=epsilon,
                                   test_freq=test_freq, num_episodes=num_episodes, against_opt=True)
    rewards_list.append(stats['rewards'])
    losses_list.append(stats['loss_train'])

In [None]:
# Plot the running averages of reward and training loss for Question 11,
# then benchmark the trained model against reference players.

# Running average of each repetition's reward history
running_rewards_list = []
for reward in rewards_list:
    running_average_rewards, x = running_average(reward)
    running_rewards_list.append(running_average_rewards)

# Running average of each repetition's loss history
running_losses_list = []
for loss in losses_list:
    running_average_losses, x = running_average(loss)
    running_losses_list.append(running_average_losses)

# --- Reward curve: mean across repetitions with a +/- one standard deviation band ---
running_average_rewards = np.mean(running_rewards_list, axis=0)
stds = np.std(running_rewards_list, axis=0)
fig = plt.figure()
plt.plot(x, running_average_rewards)
plt.fill_between(x, running_average_rewards - stds, running_average_rewards + stds, alpha=0.2)
plt.ylim([-1, 1])
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Average reward during training')
plt.show()
if save_figs:
    output_folder = os.path.join(os.getcwd(), 'figures')
    os.makedirs(output_folder, exist_ok=True)
    fig.savefig(os.path.join(output_folder, 'rewards_Q11_DEEP.png'))
    fig.savefig(os.path.join(output_folder, 'rewards_Q11_DEEP.pdf'), format='pdf')

# --- Loss curve ---
running_average_losses = np.mean(running_losses_list, axis=0)
# The band must come from the loss curves themselves; the reward standard
# deviation was used here before, which shaded the loss plot with unrelated values.
stds = np.std(running_losses_list, axis=0)
fig = plt.figure()
plt.plot(x, running_average_losses)
plt.fill_between(x, running_average_losses - stds, running_average_losses + stds, alpha=0.2)
plt.ylim([0, 0.1])
plt.xlabel('Episode')
plt.ylabel('Loss')
plt.title('Average loss during training')
plt.show()
if save_figs:
    output_folder = os.path.join(os.getcwd(), 'figures')
    os.makedirs(output_folder, exist_ok=True)
    fig.savefig(os.path.join(output_folder, 'loss_Q11_DEEP.png'))
    fig.savefig(os.path.join(output_folder, 'loss_Q11_DEEP.pdf'), format='pdf')


# Comparing the performance with the optimal player and the random player
turns = np.array(['X', 'O'])
player_opt = OptimalPlayer(epsilon=0.)
player_rand = OptimalPlayer(epsilon=1.)
teacher = OptimalPlayer(epsilon=0.5)

# The trained model is evaluated alongside two reference players built the same
# way as `player_opt` / `player_rand`.
names = ['Trained', 'Optimal', 'Random']
players = [DeepQPlayer(model=model), OptimalPlayer(epsilon=0.), OptimalPlayer(epsilon=1.)]

for (name, player) in zip(names, players):
    print("\n-----", name, " player-----")
    m_opt = measure_performance(player, player_opt)
    print("M_opt = ", m_opt)
    m_rand = measure_performance(player, player_rand)
    print("M_rand = ", m_rand)
    m_teacher = measure_performance(player, teacher)
    print("M_teacher = ", m_teacher)

### Question 12: No replay buffer and `batch_size = 1`

In [None]:
# Question 12: same training as Question 11, but with batch_size = 1 and
# max_memory_length = 1 passed to deep_q_learning.
epsilon = 0.1
batch_size = 1
max_memory_length = 1
rewards_list = []
losses_list = []
for run_idx in range(num_avg):
    # `model` is rebound on every repetition, so only the model from the last
    # repetition is available to the cells that follow.
    # `test_freq` comes from the configuration cell (previously a hard-coded 250),
    # matching how the sweep cells further down pass it.
    model, stats = deep_q_learning(env, verbose=True, epsilon_exploration=epsilon,
                                   test_freq=test_freq, num_episodes=num_episodes, batch_size=batch_size,
                                   max_memory_length=max_memory_length, against_opt=True)
    rewards_list.append(stats['rewards'])
    losses_list.append(stats['loss_train'])

In [None]:
# Plot the running averages of reward and training loss for Question 12,
# then benchmark the trained model against reference players.

# Running average of each repetition's reward history
running_rewards_list = []
for reward in rewards_list:
    running_average_rewards, x = running_average(reward)
    running_rewards_list.append(running_average_rewards)

# Running average of each repetition's loss history
running_losses_list = []
for loss in losses_list:
    running_average_losses, x = running_average(loss)
    running_losses_list.append(running_average_losses)

# --- Reward curve: mean across repetitions with a +/- one standard deviation band ---
running_average_rewards = np.mean(running_rewards_list, axis=0)
stds = np.std(running_rewards_list, axis=0)
fig = plt.figure()
plt.plot(x, running_average_rewards)
plt.fill_between(x, running_average_rewards - stds, running_average_rewards + stds, alpha=0.2)
plt.ylim([-1, 1])
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Average reward during training')
plt.show()
if save_figs:
    output_folder = os.path.join(os.getcwd(), 'figures')
    os.makedirs(output_folder, exist_ok=True)
    fig.savefig(os.path.join(output_folder, 'rewards_Q12_DEEP.png'))
    fig.savefig(os.path.join(output_folder, 'rewards_Q12_DEEP.pdf'), format='pdf')

# --- Loss curve ---
running_average_losses = np.mean(running_losses_list, axis=0)
# The band must come from the loss curves themselves; the reward standard
# deviation was used here before, which shaded the loss plot with unrelated values.
stds = np.std(running_losses_list, axis=0)
fig = plt.figure()
plt.plot(x, running_average_losses)
plt.fill_between(x, running_average_losses - stds, running_average_losses + stds, alpha=0.2)
plt.ylim([0, 0.1])
plt.xlabel('Episode')
plt.ylabel('Loss')
plt.title('Average loss during training')
plt.show()
if save_figs:
    output_folder = os.path.join(os.getcwd(), 'figures')
    os.makedirs(output_folder, exist_ok=True)
    fig.savefig(os.path.join(output_folder, 'loss_Q12_DEEP.png'))
    fig.savefig(os.path.join(output_folder, 'loss_Q12_DEEP.pdf'), format='pdf')


# Comparing the performance with the optimal player and the random player
turns = np.array(['X', 'O'])
player_opt = OptimalPlayer(epsilon=0.)
player_rand = OptimalPlayer(epsilon=1.)
teacher = OptimalPlayer(epsilon=0.5)

# The trained model is evaluated alongside two reference players built the same
# way as `player_opt` / `player_rand`.
names = ['Trained', 'Optimal', 'Random']
players = [DeepQPlayer(model=model), OptimalPlayer(epsilon=0.), OptimalPlayer(epsilon=1.)]

for (name, player) in zip(names, players):
    print("\n-----", name, " player-----")
    m_opt = measure_performance(player, player_opt)
    print("M_opt = ", m_opt)
    m_rand = measure_performance(player, player_rand)
    print("M_rand = ", m_rand)
    m_teacher = measure_performance(player, teacher)
    print("M_teacher = ", m_teacher)

### Question 13: Decreasing exploration for different values of $n^{*}$

In [None]:
# Candidate n* values: four hand-picked small ones, followed by 16 rounded
# log-spaced values running from 1e3 up to 4e4.
small_n_star = np.array([1, 100, 500, 750])
log_spaced_n_star = np.round(np.logspace(3, np.log10(40000), 16))
vec_n_star = np.concatenate((small_n_star, log_spaced_n_star))

In [None]:
# Question 13: one parameter dictionary per n* value. The keys mirror the keyword
# arguments used in the direct deep_q_learning(...) calls elsewhere in this notebook;
# only the exploration schedule changes between entries.
var_name = "n_star_deep"
deep_q_learning_params_list = []
for n_star in vec_n_star:
    params = dict(
        env=env,
        num_episodes=num_episodes,
        epsilon_exploration_rule=return_lambda_explor(epsilon_min, epsilon_max, n_star),
        test_freq=test_freq,
        against_opt=True,
    )
    deep_q_learning_params_list.append(params)

In [None]:
# Run the n* sweep; skipped entirely when `train` is False, in which case
# `stats_dict_nstar_list_deep` is not defined by this cell.
if train:
    # Honour the notebook-level `save_stats` switch instead of a hard-coded True.
    stats_dict_nstar_list_deep = deep_train_avg(var_name, vec_n_star, deep_q_learning_params_list,
                                                num_avg=num_avg, save_stats=save_stats)

### Question 14: Different values of $\epsilon_{\text{opt}}$

In [None]:
# Question 14: one parameter dictionary per epsilon_opt value, all sharing the
# exploration schedule built from `best_n_star`.
#
# The original assignment was an unfilled placeholder (`best_n_star = #########`),
# which is a SyntaxError because everything after `#` is a comment.
# TODO(review): 15000 is the n_star that the Question 19 cell lists under
# "Optimal parameters"; confirm it against the Question 13 sweep results.
best_n_star = 15000
M = 11
vec_epsilon_opt = np.linspace(0, 1, M)  # M evenly spaced values over [0, 1]
deep_q_learning_params_list = []
var_name = "epsilon_opt_deep"
for eps in vec_epsilon_opt:
    params = {'env': env,
              'num_episodes': num_episodes,
              'epsilon_exploration_rule': return_lambda_explor(epsilon_min, epsilon_max, best_n_star),
              'epsilon_opt': eps,
              'test_freq': test_freq,
              'against_opt': True}
    deep_q_learning_params_list.append(params)

In [None]:
# Run the epsilon_opt sweep; skipped entirely when `train` is False, in which case
# `stats_dict_epsilon_opt_list_deep` is not defined by this cell.
if train:
    # Honour the notebook-level `save_stats` switch instead of a hard-coded True.
    stats_dict_epsilon_opt_list_deep = deep_train_avg(var_name, vec_epsilon_opt, deep_q_learning_params_list,
                                                      num_avg=num_avg, save_stats=save_stats)

### Question 15: Best values of $M_{\text{opt}}$ and $M_{\text{rand}}$

In [None]:
# TODO

## Learning by self-practice

### Question 16: Different values of the exploration rate $\epsilon$

In [None]:
# Question 16: one parameter dictionary per fixed exploration rate, with the
# 'self_practice' flag set instead of 'against_opt'.
M = 10
vec_eps = np.linspace(0, 1, M)  # M evenly spaced values over [0, 1]
var_name = "epsilon_deep_self"
deep_q_learning_params_list = []
for eps in vec_eps:
    params = dict(
        env=env,
        num_episodes=num_episodes,
        epsilon_exploration=eps,
        test_freq=test_freq,
        self_practice=True,
    )
    deep_q_learning_params_list.append(params)

In [None]:
# Run the self-practice exploration-rate sweep; skipped entirely when `train` is
# False, in which case `stats_dict_epsilon_self_list_deep` is not defined by this cell.
if train:
    # Honour the notebook-level `save_stats` switch instead of a hard-coded True.
    stats_dict_epsilon_self_list_deep = deep_train_avg(var_name, vec_eps, deep_q_learning_params_list,
                                                       num_avg=num_avg, save_stats=save_stats)

### Question 17: Decreasing exploration for different values of $n^*$

In [None]:
# Candidate n* values for the self-practice sweep: four hand-picked small ones,
# followed by 16 rounded log-spaced values running from 1e3 up to 4e4.
small_n_star = np.array([1, 100, 500, 750])
log_spaced_n_star = np.round(np.logspace(3, np.log10(40000), 16))
vec_n_star = np.concatenate((small_n_star, log_spaced_n_star))

In [None]:
# Question 17: one parameter dictionary per n* value, with the 'self_practice'
# flag set; only the exploration schedule changes between entries.
var_name = "n_star_self_deep"
deep_q_learning_params_list = []
for n_star in vec_n_star:
    params = dict(
        env=env,
        num_episodes=num_episodes,
        epsilon_exploration_rule=return_lambda_explor(epsilon_min, epsilon_max, n_star),
        test_freq=test_freq,
        self_practice=True,
    )
    deep_q_learning_params_list.append(params)

In [None]:
# Run the self-practice n* sweep; skipped entirely when `train` is False, in which
# case `stats_dict_nstar_self_list_deep` is not defined by this cell.
if train:
    # Honour the notebook-level `save_stats` switch instead of a hard-coded True.
    stats_dict_nstar_self_list_deep = deep_train_avg(var_name, vec_n_star, deep_q_learning_params_list,
                                                     num_avg=num_avg, save_stats=save_stats)

### Question 18: Best values of $M_{\text{opt}}$ and $M_{\text{rand}}$

In [None]:
# TODO

### Question 19: Heatmaps of the Q-values in 3 significant states

In [None]:
# Optimal parameters
epsilon_min = 0.1
epsilon_max = 0.8
n_star = 15000
epsilon_exploration_rule = return_lambda_explor(epsilon_min, epsilon_max, n_star)
model, stats = deep_q_learning(env, epsilon_exploration_rule=epsilon_exploration_rule, num_episodes=20000, 
                               verbose=True, against_opt=True)

In [None]:
# Board state to visualise, stacked into an array with a leading axis of length 1.
# NOTE(review): the section title mentions 3 states, but only one is defined here.
board = np.array([[1., 0., 0.],
                  [-1., 1., 0.],
                  [0., -1., 0.]])
grids = np.array([board])
print(grids[0])
heatmaps_deep_subplots(grids, model, save=save_figs)