## 1. Imports and utilities

In [1]:
import os
import pickle
import random
import time

import numpy as np

%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from utils import *
from q_learning import *
from deep_q_learning import *

In [2]:
# Notebook-wide switches
seed = 0            # seed passed to every RNG seeding call below

train = False       # True: re-run the experiments; False: reuse pickled results
load = not train    # stored results are loaded exactly when nothing is trained

save_figs = False   # forwarded to the plotting helpers as `save`
save_stats = False  # gates pickling of freshly computed statistics

## 2. Q-Learning

In [3]:
# Environment instance shared by the Q-learning experiments
env = TictactoeEnv()

# Experiment sizes
num_episodes = 20000   # number of episodes
num_avg = 10           # training runs
test_freq = 250        # test frequency

# Hyper-parameters
alpha = 0.05           # Learning rate
gamma = 0.99           # Discount factor
epsilon_opt = 0.5      # Optimal player's epsilon
epsilon_min = 0.1      # minimum exploration rate for n^star
epsilon_max = 0.8      # maximum exploration rate for n^star

### 2.1 Learning from experts

#### Question 1: Average reward with $\epsilon = 0.1$

In [None]:
# Seed both random generators so the runs are reproducible
np.random.seed(seed)
random.seed(seed)

# Question 1 uses a fixed exploration rate
epsilon_exploration = 0.1
stats_dict_list = []

# Training, we average the results over num_avg training runs
if train:
    for i in range(num_avg):
        print('************** RUN', i+1, 'OF', num_avg, '**************')
        start = time.time()
        Q, stats = q_learning(env, epsilon_exploration=epsilon_exploration, num_episodes=num_episodes,
                              verbose=False, against_opt=True)
        # Final performance against OptimalPlayer with epsilon=0 (M_opt) and epsilon=1 (M_rand)
        M_opt = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=0.))
        M_rand = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=1.))
        print("M_opt =", M_opt)
        print("M_rand =", M_rand)
        # One dictionary per run: {epsilon_exploration: (stats, M_opt, M_rand)}
        stats_dict = {epsilon_exploration: (stats, M_opt, M_rand)}
        stats_dict_list.append(stats_dict)
        print('RUN', i+1, 'took', np.round(time.time()-start,decimals=1), 'seconds')
    # Saving the results
    if save_stats:
        output_folder = os.path.join(os.getcwd(), 'results')
        os.makedirs(output_folder, exist_ok=True)
        fname = os.path.join(output_folder, 'Q1.pkl')
        with open(fname, 'wb') as handle:
            pickle.dump(stats_dict_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Load results from dictionary
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    fname = os.path.join(output_folder, 'Q1.pkl')
    # NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
    with open(fname, 'rb') as handle:
        stats_dict_list = pickle.load(handle)

# Plot of the average reward over num_avg training runs with 25th and 75th percentiles.
# The LaTeX label is a raw string because "\e" is not a valid Python escape sequence.
plot_stats(stats_dict_list, [epsilon_exploration], 'epsilon_exploration_Q1', r'\epsilon', save=save_figs, keys=['rewards'], perc=True)

#### Questions 2 and 3: Average reward and performance measures for different values of $n^{\star}$ 

In [None]:
# Test frequency used for the n_star sweep
test_freq = 250

# Candidate n_star values: four small hand-picked values followed by
# 16 rounded, log-spaced values between 1e3 and 4e4
n_star_small = np.array([1, 100, 500, 750])
n_star_log = np.round(np.logspace(3, np.log10(40000), 16))
vec_n_star = np.hstack((n_star_small, n_star_log))

In [None]:
# One parameter dictionary per candidate n_star (training against the optimal player)
var_name = 'q_learning_n_star_experts'
q_learning_params_list = []
for n_star in vec_n_star:
    params = dict(
        env=env,
        epsilon_exploration_rule=return_lambda_explor(epsilon_min, epsilon_max, n_star),
        test_freq=test_freq,
        against_opt=True,
    )
    q_learning_params_list.append(params)

In [None]:
# Seed both random generators so the sweep is reproducible
np.random.seed(seed)
random.seed(seed)

# Training, performs num_avg complete training runs.
# The result is bound to the same name the load branch and the plots use,
# so the cells below work for train=True as well as load=True.
if train:
    stats_dict_nstar_list = train_avg(var_name, vec_n_star, q_learning_params_list, num_avg=num_avg, save_stats=save_stats)

# Load results from dictionary
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/Q2_Q3.pkl'
    # NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
    with open(fname, 'rb') as handle:
        stats_dict_nstar_list = pickle.load(handle)

In [None]:
# Show plot presented in the report
plot_n_star = [1., 9146., 19127., 40000]
# Raw string: "\s" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_nstar_list, plot_n_star, 'n_star', r"n^{\star}", save=save_figs, decaying_exploration=True, perc=True)

In [None]:
# Show plot of all the experimented values
# Raw string: "\s" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_nstar_list, vec_n_star, 'n_star', r"n^{\star}", save=save_figs, decaying_exploration=True, perc=True)

#### Question 4: Good experts and bad experts

In [None]:
best_n_star = 9146 # best value according to previous experiments
M = 11
# M evenly spaced values of the expert's epsilon in [0, 1]
vec_eps_opt = np.round(np.linspace(0, 1, M), decimals=2)
var_name = 'epsilon_opt'
q_learning_params_list = []
# The loop variable is deliberately not called `epsilon_opt`: that name is the
# global hyper-parameter defined in the configuration cell and must not be overwritten.
for eps_opt in vec_eps_opt:
    params = {'env': env,
              'num_episodes': num_episodes,
              'epsilon_exploration_rule': return_lambda_explor(epsilon_min, epsilon_max, best_n_star),
              'epsilon_opt': eps_opt,
              'against_opt': True}
    q_learning_params_list.append(params)

In [None]:
# Reproducibility: reset both random generators
random.seed(seed)
np.random.seed(seed)

if train:
    # Training, performs num_avg complete training runs
    stats_dict_eps_opt_list = train_avg(
        var_name,
        vec_eps_opt,
        q_learning_params_list,
        num_avg=num_avg,
        save_stats=save_stats,
    )

if load:
    # Read the stored Question 4 results from the results folder
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/Q4.pkl'
    with open(fname, 'rb') as handle:
        stats_dict_eps_opt_list = pickle.load(handle)

In [None]:
# Show plot presented in the report
plot_eps_opt = [0., 0.2, 0.5, 0.8, 1.]
# Raw string: "\e" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_eps_opt_list, plot_eps_opt, "epsilon_opt", r"\epsilon_{opt}", save=save_figs, keys=['test_Mopt', 'test_Mrand'], perc=True)

In [None]:
# Show plot of all the experimented values
# Raw string: "\e" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_eps_opt_list, vec_eps_opt, "epsilon_opt", r"\epsilon_{opt}", save=False, keys=['test_Mopt', 'test_Mrand'])

#### Question 5: Optimal values of $M_{\text{opt}}$ and $M_{\text{rand}}$

In [None]:
# Seed
np.random.seed(seed)
random.seed(seed)

# Setting hyperparams
n_star = 9146 # best n_star
epsilon_min = 0.1
epsilon_max = 0.8
epsilon_exploration_rule = return_lambda_explor(epsilon_min, epsilon_max, n_star)

# Training, single sample run
print("****************** SAMPLE RUN ****************")
Q, stats = q_learning(env, epsilon_exploration_rule=epsilon_exploration_rule, against_opt=True)

# Print results.
# Evaluate the Q-table that was just trained: this cell produces `Q`, not a
# deep model, so the tabular QPlayer is the agent to measure.
M_opt = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=0.))
print("M_opt =", M_opt)
M_rand = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=1.))
print("M_rand =", M_rand)

In [4]:
# *************** RESULTS FOR THE SAME PARAMETERS FROM DICTIONARY *************
output_folder = os.path.join(os.getcwd(), 'results')
os.makedirs(output_folder, exist_ok=True)
fname = output_folder + '/Q4.pkl'
# NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
with open(fname, 'rb') as handle:
    stats_dict = pickle.load(handle)

# Entry stored under key 0.5 for every run in the file (presumably epsilon_opt = 0.5,
# since Q4.pkl is the Question 4 result file — verify). Iterating over the loaded
# list avoids hard-coding the number of runs.
stats_dict_best_n_star = [run_stats[0.5] for run_stats in stats_dict]
# Each entry is indexed as (stats, M_opt, M_rand)
m_opt_vec = [entry[1] for entry in stats_dict_best_n_star]
m_rand_vec = [entry[2] for entry in stats_dict_best_n_star]


# Results
print('*********************', ' AVERAGE STATS ', '*********************')
print("Median: M_opt = ", np.round(np.median(m_opt_vec),decimals=2), " M_rand = ", np.round(np.median(m_rand_vec),decimals=2))
print("25th quantile: M_opt = ", np.round(np.percentile(m_opt_vec, q=25),decimals=2), 
      " M_rand = ", np.round(np.percentile(m_rand_vec, q=25),decimals=2))
print("75th quantile: M_opt = ", np.round(np.percentile(m_opt_vec, q=75),decimals=2),
      " M_rand = ", np.round(np.percentile(m_rand_vec, q=75),decimals=2))

*********************  AVERAGE STATS  *********************
Median: M_opt =  0.0  M_rand =  0.85
25th quantile: M_opt =  0.0  M_rand =  0.82
75th quantile: M_opt =  0.0  M_rand =  0.89


### 2.2 Learning by self-practice

#### Question 7: performance measures $M_{\text{opt}}$ and $M_{\text{rand}}$ for different exploration rates $\epsilon$

In [None]:
# Ten exploration rates 0.0, 0.1, ..., 0.9 for the self-practice sweep
eps_vec = np.round(np.linspace(0, 0.9, 10), decimals=1)
var_name = 'eps_self'

# One parameter dictionary per exploration rate
q_learning_params_list = []
for eps in eps_vec:
    params = dict(
        env=env,
        verbose=True,
        num_episodes=num_episodes,
        epsilon_exploration=eps,
        test_freq=test_freq,
        self_practice=True,
    )
    q_learning_params_list.append(params)

In [None]:
# Seed
np.random.seed(seed)
random.seed(seed)

# Training, performs num_avg complete training runs.
# `save_stats` follows the notebook-wide switch, like every other sweep in this notebook.
if train:
    stats_dict_eps_self_list = train_avg(var_name, eps_vec, q_learning_params_list, num_avg=num_avg, save_stats=save_stats)

# Load results from dictionary
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/Q7.pkl'
    # NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
    with open(fname, 'rb') as handle:
        stats_dict_eps_self_list = pickle.load(handle)

In [None]:
# Show plot presented in the report
eps_vec_plot = [0, 0.2, 0.5, 0.8]
# Raw string: "\e" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_eps_self_list, eps_vec_plot, "epsilon_self", r"\epsilon", save=save_figs, perc=True, keys=['test_Mopt', 'test_Mrand'])

In [None]:
# Show plot of all the experimented values
# Raw string: "\e" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_eps_self_list, eps_vec, "epsilon_self", r"\epsilon", save=False, keys=['test_Mopt', 'test_Mrand'])

#### Question 8: Performance measures  $M_{\text{opt}}$ and $M_{\text{rand}}$ for different values of $n^{\star}$

In [None]:
# Bounds of the decaying exploration rate
epsilon_max = 0.8
epsilon_min = 0.1

# Candidate n_star values: four small hand-picked values followed by
# 16 rounded, log-spaced values between 1e3 and 4e4
n_star_small = np.array([1, 100, 500, 750])
n_star_log = np.round(np.logspace(3, np.log10(40000), 16))
vec_n_star = np.hstack((n_star_small, n_star_log))

In [None]:
# One parameter dictionary per candidate n_star (self-practice training)
var_name = 'q_learning_n_star_self'
q_learning_params_list = []
for n_star in vec_n_star:
    params = dict(
        env=env,
        num_episodes=num_episodes,
        epsilon_exploration_rule=return_lambda_explor(epsilon_min, epsilon_max, n_star),
        test_freq=test_freq,
        self_practice=True,
    )
    q_learning_params_list.append(params)

In [None]:
# Seed
np.random.seed(seed)
random.seed(seed)

# Training, performs num_avg complete training runs.
# The result is bound to the same name the load branch and the plots use,
# so the cells below work for train=True as well as load=True.
if train:
    stats_dict_nstar_list = train_avg(var_name, vec_n_star, q_learning_params_list, num_avg=num_avg, save_stats=save_stats)

# Load results from dictionary
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/Q8.pkl'
    # NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
    with open(fname, 'rb') as handle:
        stats_dict_nstar_list = pickle.load(handle)

In [None]:
# Show plot presented in the report
plot_n_star = [1., 9146., 24460., 40000.]
# Raw string: "\s" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_nstar_list, plot_n_star, 'n_star_self', r"n^{\star}", 
           decaying_exploration=True, save=save_figs, perc=True, keys=['test_Mopt', 'test_Mrand'])

In [None]:
# Show plot of all the experimented values
# Raw string: "\s" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_nstar_list, vec_n_star, 'n_star_self', r"n^{\star}", save=False, keys=['test_Mopt', 'test_Mrand'])

#### Question 9: Optimal values of $M_{\text{opt}}$ and $M_{\text{rand}}$

In [None]:
# Seed
np.random.seed(seed)
random.seed(seed)

# Setting hyperparams
n_star = 24460
epsilon_exploration_rule = return_lambda_explor(epsilon_min, epsilon_max, n_star)


# Training, single sample run
print("************* SAMPLE RUN *************")
Q, stats = q_learning(env, epsilon_exploration_rule=epsilon_exploration_rule, self_practice=True)

# Print results.
# Evaluate the Q-table that was just trained: this cell produces `Q`, not a
# deep model, so the tabular QPlayer is the agent to measure.
M_opt = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=0.))
print("M_opt =", M_opt)
M_rand = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=1.))
print("M_rand =", M_rand)

In [5]:
# *************** RESULTS FOR THE SAME PARAMETERS FROM DICTIONARY *************
output_folder = os.path.join(os.getcwd(), 'results')
os.makedirs(output_folder, exist_ok=True)
fname = output_folder + '/Q8.pkl'
# NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
with open(fname, 'rb') as handle:
    stats_dict = pickle.load(handle)

# Best_n_star: entry stored under key 24460 for every run in the file.
# Iterating over the loaded list avoids hard-coding the number of runs.
stats_dict_best_n_star = [run_stats[24460.] for run_stats in stats_dict]
# Each entry is indexed as (stats, M_opt, M_rand)
m_opt_vec = [entry[1] for entry in stats_dict_best_n_star]
m_rand_vec = [entry[2] for entry in stats_dict_best_n_star]


# Results
print('*********************', ' AVERAGE STATS ', '*********************')
print("Median: M_opt = ", np.round(np.median(m_opt_vec),decimals=2), " M_rand = ", np.round(np.median(m_rand_vec),decimals=2))
print("25th quantile: M_opt = ", np.round(np.percentile(m_opt_vec, q=25),decimals=2), 
      " M_rand = ", np.round(np.percentile(m_rand_vec, q=25),decimals=2))
print("75th quantile: M_opt = ", np.round(np.percentile(m_opt_vec, q=75),decimals=2),
      " M_rand = ", np.round(np.percentile(m_rand_vec, q=75),decimals=2))

*********************  AVERAGE STATS  *********************
Median: M_opt =  -0.07  M_rand =  0.87
25th quantile: M_opt =  -0.19  M_rand =  0.86
75th quantile: M_opt =  0.0  M_rand =  0.9


#### Question 10: Heatmaps of the Q-values in 3 significant states

In [None]:
# Reproducibility: reset both random generators
random.seed(seed)
np.random.seed(seed)

# Exploration schedule built from the chosen n_star and the epsilon bounds
n_star = 24460
epsilon_min = 0.1
epsilon_max = 0.8
epsilon_exploration_rule = return_lambda_explor(epsilon_min, epsilon_max, n_star)

# Single self-practice training run; Q is used by the heatmap cell below
Q, stats = q_learning(
    env,
    epsilon_exploration_rule=epsilon_exploration_rule,
    self_practice=True,
)

In [None]:
# Three hand-written board states, each given as a flat list of 9 cells
win_chance = [1., 0., 0., -1., 1., 0., 0., -1., 0.]
block_win = [1., 0., 1., 0., -1., 0., 0., 0., 0.]
fork_chance = [1., -1., 1., -1., 0., 0., 0., 0., 0.]
grids = [win_chance, block_win, fork_chance]

# Turn every flat list into a 3x3 numpy board
grid_numpy = [np.array(flat_board).reshape(3, 3) for flat_board in grids]

# Plot heatmaps
heatmaps_subplots(grid_numpy, Q, save=save_figs)

## 3. Deep Q-Learning

In [None]:
# Configuration parameters for the whole deep Q-learning setup
env = TictactoeEnv()

# Experiment sizes
num_episodes = 20000
num_avg = 4
test_freq = 250

# Optimisation and exploration settings
lr = 1e-4 # learning rate obtained by grid search
epsilon_min = 0.1
epsilon_max = 0.8

# Folder for results (created if it does not exist yet)
output_folder = os.path.join(os.getcwd(), 'results')
os.makedirs(output_folder, exist_ok=True)

### 3.1 Learning from experts

#### Question 11: Average reward and average loss during training for $\epsilon = 0.1$

In [None]:
# Seed
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

# Training, performs num_avg complete training runs without performance test
epsilon_exploration = 0.1
stats_dict_list = []
if train:
    for i in range(num_avg):
        print('************** RUN', i+1, 'OF', num_avg, '**************')
        start = time.time()
        model, stats = deep_q_learning(env, lr=lr, epsilon_exploration=epsilon_exploration, num_episodes=num_episodes,
                                       against_opt=True, verbose=True)
        print('Only training time: ', time.time() - start)
        # Final performance against OptimalPlayer with epsilon=0 (M_opt) and epsilon=1 (M_rand)
        M_opt = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=0.))
        M_rand = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=1.))
        print("M_opt =", M_opt)
        print("M_rand =", M_rand)
        # One dictionary per run: {epsilon_exploration: (stats, M_opt, M_rand)}
        stats_dict = {epsilon_exploration: (stats, M_opt, M_rand)}
        stats_dict_list.append(stats_dict)
        print('RUN', i+1, 'took', np.round(time.time()-start,decimals=1), 'seconds')
    if save_stats:
        # Save under the same file name the load branch reads, so a
        # train-then-load round trip finds the results.
        fname = output_folder + '/Q11.pkl'
        with open(fname, 'wb') as handle:
            pickle.dump(stats_dict_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Load results from dictionary
if load:
    fname = output_folder + '/Q11.pkl'
    # NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
    with open(fname, 'rb') as handle:
        stats_dict_list = pickle.load(handle)

# Plot of the average reward and average training loss over num_avg training runs with 25th and 75th percentiles.
# The LaTeX label is a raw string because "\e" is not a valid Python escape sequence.
plot_stats(stats_dict_list, [epsilon_exploration], 'epsilon_exploration_Q11', r'\epsilon', save=save_figs, keys=['rewards', 'loss'], perc=True)

#### Question 12: Same as Question 11, but no replay buffer and batch_size = 1

In [None]:
# Seed
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

# Training, performs num_avg complete training runs without performance test
epsilon_exploration = 0.1
stats_dict_list = []
if train:
    for i in range(num_avg):
        print('************** RUN', i+1, 'OF', num_avg, '**************')
        start = time.time()
        # batch_size=1 and max_memory_length=1 implement the "no replay buffer" setting of this question.
        # NOTE(review): unlike Question 11, `lr` is not passed here — confirm the default of
        # deep_q_learning matches the grid-searched value.
        model, stats = deep_q_learning(env, epsilon_exploration=epsilon_exploration, num_episodes=num_episodes,
                                       against_opt=True, batch_size=1, max_memory_length=1)
        M_opt = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=0.))
        M_rand = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=1.))
        print("M_opt =", M_opt)
        print("M_rand =", M_rand)
        # One dictionary per run: {epsilon_exploration: (stats, M_opt, M_rand)}
        stats_dict = {epsilon_exploration: (stats, M_opt, M_rand)}
        stats_dict_list.append(stats_dict)
        print('RUN', i+1, 'took', np.round(time.time()-start,decimals=1), 'seconds')
    if save_stats:
        fname = output_folder + '/Q12.pkl'
        with open(fname, 'wb') as handle:
            pickle.dump(stats_dict_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Load results from dictionary
if load:
    fname = output_folder + '/Q12.pkl'
    # NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
    with open(fname, 'rb') as handle:
        stats_dict_list = pickle.load(handle)

# Plot of the average reward and average training loss over num_avg training runs with 25th and 75th percentiles.
# The LaTeX label is a raw string because "\e" is not a valid Python escape sequence.
plot_stats(stats_dict_list, [epsilon_exploration], 'epsilon_exploration_Q12', r'\epsilon', save=save_figs, keys=['rewards', 'loss'], perc=True)

#### Question 13: Performance measures $M_{\text{opt}}$ and $M_{\text{rand}}$ for different values of $n^{\star}$

In [None]:
# Candidate n_star values: three small hand-picked values followed by
# 7 rounded, log-spaced values between 5e3 and 4e4
n_star_small = np.array([1, 100, 1000])
n_star_log = np.round(np.logspace(np.log10(5000), np.log10(40000), 7))
vec_n_star = np.hstack((n_star_small, n_star_log))
var_name = 'deep_q_learning_n_star_experts'

# One parameter dictionary per candidate n_star (training against the optimal player)
dqn_params_list = []
for n_star in vec_n_star:
    params = dict(
        env=env,
        num_episodes=num_episodes,
        epsilon_exploration_rule=return_lambda_explor(epsilon_min, epsilon_max, n_star),
        test_freq=test_freq,
        against_opt=True,
    )
    dqn_params_list.append(params)

In [None]:
# Seed every random generator, including TensorFlow's, as the other
# deep Q-learning training cells do
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

# Training, performs num_avg complete training runs.
# The result is bound to the same name the load branch and the plots use,
# so the cells below work for train=True as well as load=True.
if train:
    stats_dict_nstar_list = deep_train_avg(var_name, vec_n_star, dqn_params_list, num_avg=num_avg, save_stats=save_stats)

# Load results from dictionary
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/Q13.pkl'
    # NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
    with open(fname, 'rb') as handle:
        stats_dict_nstar_list = pickle.load(handle)

In [None]:
# Show plot presented in the report
plot_n_star = [1., 10000., 20000., 40000.]
# Raw string: "\s" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_nstar_list, plot_n_star, 'dqn_n_star', r"n^{\star}", save=save_figs, perc=True, keys=['test_Mopt', 'test_Mrand'])

In [None]:
# Show plot of all the experimented values
# Raw string: "\s" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_nstar_list, vec_n_star, 'dqn_n_star', r"n^{\star}", save=False, decaying_exploration=False)

#### Question 14: Performance measures  $M_{\text{opt}}$ and $M_{\text{rand}}$ for different values of $\epsilon_{\text{opt}}$

In [None]:
# n_star is fixed to the best value from the previous experiments
best_n_star = 20000 # best n_star according to the previous experiments

# M evenly spaced values of the expert's epsilon in [0, 1]
M = 11
vec_eps_opt = np.round(np.linspace(0, 1, M), decimals=2)
var_name = 'deep_q_learning_epsilon_opt_experts'

# One parameter dictionary per expert epsilon
deep_q_learning_params_list = []
for eps in vec_eps_opt:
    params = dict(
        env=env,
        num_episodes=num_episodes,
        epsilon_exploration_rule=return_lambda_explor(epsilon_min, epsilon_max, best_n_star),
        epsilon_opt=eps,
        test_freq=test_freq,
        against_opt=True,
    )
    deep_q_learning_params_list.append(params)

In [None]:
# Seed
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

# Training, performs num_avg complete training runs.
# The result is bound to the same name the load branch and the plots use,
# so the cells below work for train=True as well as load=True.
if train:
    stats_dict_eps_opt_list = deep_train_avg(var_name, vec_eps_opt, deep_q_learning_params_list, 
                                             num_avg=num_avg, save_stats=save_stats)

# Load results from dictionary
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/Q14.pkl'
    # NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
    with open(fname, 'rb') as handle:
        stats_dict_eps_opt_list = pickle.load(handle)

In [None]:
# Show plot presented in the report
plot_epsilon_opt = [0.0, 0.2, 0.5, 0.7, 1.0]
# Raw string: "\e" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_eps_opt_list, plot_epsilon_opt, 'dqn_epsilon_opt_experts', r"\epsilon_{opt}", save=save_figs, perc=True)

In [None]:
# Show plot of all the experimented values
# Raw string: "\e" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_eps_opt_list, vec_eps_opt, 'dqn_epsilon_opt_experts', r"\epsilon_{opt}", save=False)

#### Question 15: Optimal values of $M_{\text{opt}}$ and $M_{\text{rand}}$

In [None]:
print("*************** SAMPLE RUN *******************")

# Reproducibility: reset all three random generators
tf.random.set_seed(seed)
random.seed(seed)
np.random.seed(seed)

# Exploration schedule built from the chosen n_star and the epsilon bounds
n_star = 20000
epsilon_min = 0.1
epsilon_max = 0.8
epsilon_exploration_rule = return_lambda_explor(epsilon_min, epsilon_max, n_star)

# Single training run against the optimal player
model, stats = deep_q_learning(
    env,
    epsilon_exploration_rule=epsilon_exploration_rule,
    against_opt=True,
)

# Measure and report M_opt first, then M_rand
M_opt = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=0.))
print("M_opt = ", M_opt)
M_rand = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=1.))
print("M_rand = ", M_rand)

In [6]:
# *************** RESULTS FOR THE SAME PARAMETERS FROM DICTIONARY *************
output_folder = os.path.join(os.getcwd(), 'results')
os.makedirs(output_folder, exist_ok=True)
fname = output_folder + '/Q14.pkl'
# NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
with open(fname, 'rb') as handle:
    stats_dict = pickle.load(handle)

# Entry stored under key 0.5 for every run in the file (presumably epsilon_opt = 0.5,
# since Q14.pkl is the Question 14 result file — verify). Iterating over the loaded
# list avoids hard-coding the number of runs.
stats_dict_best_n_star = [run_stats[0.5] for run_stats in stats_dict]
# Each entry is indexed as (stats, M_opt, M_rand)
final_m_opt = [entry[1] for entry in stats_dict_best_n_star]
final_m_rand = [entry[2] for entry in stats_dict_best_n_star]

# Results
print('*********************', ' AVERAGE STATS ', '*********************')
print("Median:\t M_opt = ", np.round(np.median(final_m_opt),decimals=2),
      "\t M_rand = ", np.round(np.median(final_m_rand),decimals=2))

*********************  AVERAGE STATS  *********************
Median:	 M_opt =  0.0 	 M_rand =  0.94


### 3.2 Learning by self-practice

#### Question 16: Performance measures $M_{\text{opt}}$ and $M_{\text{rand}}$ for different exploration rates $\epsilon$

In [None]:
# M evenly spaced exploration rates in [0, 0.9]
M = 10
vec_eps = np.linspace(0, 0.9, M)
var_name = 'deep_q_learning_epsilon_self'

# One parameter dictionary per exploration rate (self-practice training)
dqn_params_list = []
for eps in vec_eps:
    params = dict(
        env=env,
        num_episodes=num_episodes,
        epsilon_exploration=eps,
        verbose=True,
        test_freq=test_freq,
        self_practice=True,
    )
    dqn_params_list.append(params)

In [None]:
# Reproducibility: reset all three random generators
tf.random.set_seed(seed)
random.seed(seed)
np.random.seed(seed)

if train:
    # Training, performs num_avg complete training runs
    stats_dict_epsilon_self_list_deep = deep_train_avg(
        var_name,
        vec_eps,
        dqn_params_list,
        num_avg=num_avg,
        save_stats=save_stats,
    )

if load:
    # Read the stored Question 16 results from the results folder
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/Q16.pkl'
    with open(fname, 'rb') as handle:
        stats_dict_epsilon_self_list_deep = pickle.load(handle)

In [None]:
# Show plot presented in the report
plot_eps = [0, 0.2, 0.5, 0.8]
# Raw string: "\e" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_epsilon_self_list_deep, plot_eps, 'epsilon_dqn_self', r"\epsilon",
           save=save_figs, perc=True, keys=['test_Mopt', 'test_Mrand'])

In [None]:
# Show plot of all the experimented values
# Raw string: "\e" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_epsilon_self_list_deep, vec_eps, 'epsilon', r"\epsilon", save=False)

#### Question 17: Performance measures $M_{\text{opt}}$ and $M_{\text{rand}}$ for different values of $n^{\star}$

In [None]:
# Candidate n_star values: three small hand-picked values followed by
# 7 rounded, log-spaced values between 5e3 and 4e4
n_star_small = np.array([1, 100, 1000])
n_star_log = np.round(np.logspace(np.log10(5000), np.log10(40000), 7))
vec_n_star = np.hstack((n_star_small, n_star_log))
var_name = 'deep_q_learning_n_star_self'

# One parameter dictionary per candidate n_star (self-practice training)
dqn_params_list = []
for n_star in vec_n_star:
    params = dict(
        env=env,
        num_episodes=num_episodes,
        epsilon_exploration_rule=return_lambda_explor(epsilon_min, epsilon_max, n_star),
        verbose=True,
        test_freq=test_freq,
        self_practice=True,
    )
    dqn_params_list.append(params)

In [None]:
# Reproducibility: reset all three random generators
tf.random.set_seed(seed)
random.seed(seed)
np.random.seed(seed)

if train:
    # Training, performs num_avg complete training runs
    stats_dict_nstar_self_list_deep = deep_train_avg(
        var_name,
        vec_n_star,
        dqn_params_list,
        num_avg=num_avg,
        save_stats=save_stats,
    )

if load:
    # Read the stored Question 17 results from the results folder
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/Q17.pkl'
    with open(fname, 'rb') as handle:
        stats_dict_nstar_self_list_deep = pickle.load(handle)

In [None]:
# Show plot presented in the report
plot_n_star = [1., 10000., 20000., 40000.]
# Raw string: "\s" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_nstar_self_list_deep, plot_n_star, 'n_star_dqn_self', r"n^{\star}", save=save_figs, decaying_exploration=True, perc=True)

In [None]:
# Show plot of all the experimented values
# Raw string: "\s" is not a valid Python escape sequence, so a plain literal triggers a warning
plot_stats(stats_dict_nstar_self_list_deep, vec_n_star, 'n_star', r"n^{\star}", save=False, decaying_exploration=True)

#### Question 18: Optimal values of $M_{\text{opt}}$ and $M_{\text{rand}}$

In [None]:
# **************** SAMPLE RUN ***********************

# Reproducibility: reset all three random generators
tf.random.set_seed(seed)
random.seed(seed)
np.random.seed(seed)

# Exploration schedule built from the chosen n_star and the epsilon bounds
n_star = 10000
epsilon_min = 0.1
epsilon_max = 0.8
epsilon_exploration_rule = return_lambda_explor(epsilon_min, epsilon_max, n_star)

# Single self-practice training run
print("***************** SAMPLE RUN ***************")

model, stats = deep_q_learning(
    env,
    epsilon_exploration_rule=epsilon_exploration_rule,
    self_practice=True,
)

# Measure and report M_opt first, then M_rand
M_opt = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=0.))
print("M_opt =", M_opt)
M_rand = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=1.))
print("M_rand =", M_rand)

In [7]:
# *************** RESULTS FOR THE SAME PARAMETERS FROM DICTIONARY *************
output_folder = os.path.join(os.getcwd(), 'results')
os.makedirs(output_folder, exist_ok=True)
fname = output_folder + '/Q17.pkl'
# NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
with open(fname, 'rb') as handle:
    stats_dict = pickle.load(handle)

# Best_n_star: entry stored under key 10000 for every run in the file.
# Iterating over the loaded list avoids hard-coding the number of runs.
stats_dict_best_n_star = [run_stats[10000.0] for run_stats in stats_dict]
# Each entry is indexed as (stats, M_opt, M_rand)
final_m_opt = [entry[1] for entry in stats_dict_best_n_star]
final_m_rand = [entry[2] for entry in stats_dict_best_n_star]

# Results
print('*********************', ' AVERAGE STATS ', '*********************')
print("Median:\t M_opt = ", np.round(np.median(final_m_opt),decimals=2),
      "\t M_rand = ", np.round(np.median(final_m_rand),decimals=2))

*********************  AVERAGE STATS  *********************
Median:	 M_opt =  0.0 	 M_rand =  0.91


#### Question 19: Heatmaps of the Q-values in 3 significant states

In [None]:
# Seed every random generator, including TensorFlow's, as the other
# deep Q-learning training cells do
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

# Optimal params
epsilon_min = 0.1
epsilon_max = 0.8
n_star = 10000
epsilon_exploration_rule = return_lambda_explor(epsilon_min, epsilon_max, n_star)

# Training
# NOTE(review): no other deep_q_learning call in this notebook passes `alpha`
# (Question 11 passes `lr`) — confirm deep_q_learning accepts this argument.
model, stats = deep_q_learning(env, alpha=0.05, epsilon_exploration_rule=epsilon_exploration_rule, self_practice=True)

In [None]:
# Three hand-written board states, each given as a flat list of 9 cells
win_chance = [1., 0., 0., -1., 1., 0., 0., -1., 0.]
block_win = [1., 0., 1., 0., -1., 0., 0., 0., 0.]
fork_chance = [1., -1., 1., -1., 0., 0., 0., 0., 0.]
grids = [win_chance, block_win, fork_chance]

# Turn every flat list into a 3x3 numpy board
grid_numpy = [np.array(flat_board).reshape(3, 3) for flat_board in grids]

# Plot heatmaps
heatmaps_deep_subplots(grid_numpy, model, save=save_figs)

## 4. Comparison between Q-Learning and Deep Q-Learning

#### Question 20: Training times for both learning methods

In [8]:
# Load results of best parameters from dictionary
output_folder = os.path.join(os.getcwd(), 'results')
os.makedirs(output_folder, exist_ok=True)
fname = output_folder + '/Q4.pkl'
# NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
with open(fname, 'rb') as handle:
    stats_dict = pickle.load(handle)

# Entry stored under key 0.5 for every run in the file; each entry is indexed as
# (stats, final M_opt, final M_rand), with the test curves stored inside stats.
# Iterating over the loaded list avoids hard-coding the number of runs.
stats_dict_best_n_star = [run_stats[0.5] for run_stats in stats_dict]
final_m_opt = [entry[1] for entry in stats_dict_best_n_star]
test_m_opt = [entry[0]['test_Mopt'] for entry in stats_dict_best_n_star]
final_m_rand = [entry[2] for entry in stats_dict_best_n_star]
test_m_rand = [entry[0]['test_Mrand'] for entry in stats_dict_best_n_star]

# Compute training time: index of the first test whose value exceeds
# start + 0.8 * (final - start), converted to episodes with test_freq.
# NOTE: [0][0] raises IndexError when a curve never exceeds that threshold.
starting_m_opt = [curve[0] for curve in test_m_opt]
train_times_m_opt = np.array([np.where(np.array(curve) > start + 0.8 * (final - start))[0][0]
                              for curve, start, final in zip(test_m_opt, starting_m_opt, final_m_opt)])
train_times_m_opt = train_times_m_opt * test_freq
starting_m_rand = [curve[0] for curve in test_m_rand]
train_times_m_rand = np.array([np.where(np.array(curve) > start + 0.8 * (final - start))[0][0]
                               for curve, start, final in zip(test_m_rand, starting_m_rand, final_m_rand)])
train_times_m_rand = train_times_m_rand * test_freq
# A run counts as trained once both measures have crossed their thresholds
train_times = [np.maximum(t_opt, t_rand) for t_opt, t_rand in zip(train_times_m_opt, train_times_m_rand)]

# Results
print('*********************', ' AVERAGE STATS ', '*********************')
print("Median:\t M_opt = ", np.round(np.median(final_m_opt),decimals=2),
      "\t M_rand = ", np.round(np.median(final_m_rand),decimals=2), 
      "\t T_train = ", np.round(np.median(train_times),decimals=2))
print("25th quantile:\t M_opt = ", np.round(np.percentile(final_m_opt, q=25),decimals=2), 
      "\t M_rand = ", np.round(np.percentile(final_m_rand, q=25),decimals=2), 
      "\t T_train = ", np.round(np.percentile(train_times, q=25),decimals=2))
print("75th quantile:\t M_opt = ", np.round(np.percentile(final_m_opt, q=75),decimals=2),
      "\t M_rand = ", np.round(np.percentile(final_m_rand, q=75),decimals=2),
      "\t T_train = ", np.round(np.percentile(train_times, q=75),decimals=2))

*********************  AVERAGE STATS  *********************
Median:	 M_opt =  0.0 	 M_rand =  0.85 	 T_train =  6625.0
25th quantile:	 M_opt =  0.0 	 M_rand =  0.82 	 T_train =  5812.5
75th quantile:	 M_opt =  0.0 	 M_rand =  0.89 	 T_train =  7375.0


In [9]:
# Load results of best parameters from dictionary
output_folder = os.path.join(os.getcwd(), 'results')
os.makedirs(output_folder, exist_ok=True)
fname = output_folder + '/Q8.pkl'
# NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
with open(fname, 'rb') as handle:
    stats_dict = pickle.load(handle)

# Best_n_star: entry stored under key 24460 for every run in the file; each entry is
# indexed as (stats, final M_opt, final M_rand), with the test curves stored inside stats.
# Iterating over the loaded list avoids hard-coding the number of runs.
stats_dict_best_n_star = [run_stats[24460.0] for run_stats in stats_dict]
final_m_opt = [entry[1] for entry in stats_dict_best_n_star]
test_m_opt = [entry[0]['test_Mopt'] for entry in stats_dict_best_n_star]
final_m_rand = [entry[2] for entry in stats_dict_best_n_star]
test_m_rand = [entry[0]['test_Mrand'] for entry in stats_dict_best_n_star]

# Compute training time: index of the first test whose value exceeds
# start + 0.8 * (final - start), converted to episodes with test_freq.
# NOTE: [0][0] raises IndexError when a curve never exceeds that threshold.
starting_m_opt = [curve[0] for curve in test_m_opt]
train_times_m_opt = np.array([np.where(np.array(curve) > start + 0.8 * (final - start))[0][0]
                              for curve, start, final in zip(test_m_opt, starting_m_opt, final_m_opt)])
train_times_m_opt = train_times_m_opt * test_freq
starting_m_rand = [curve[0] for curve in test_m_rand]
train_times_m_rand = np.array([np.where(np.array(curve) > start + 0.8 * (final - start))[0][0]
                               for curve, start, final in zip(test_m_rand, starting_m_rand, final_m_rand)])
train_times_m_rand = train_times_m_rand * test_freq
# A run counts as trained once both measures have crossed their thresholds
train_times = [np.maximum(t_opt, t_rand) for t_opt, t_rand in zip(train_times_m_opt, train_times_m_rand)]

# Results
print('*********************', ' AVERAGE STATS ', '*********************')
print("Median:\t M_opt = ", np.round(np.median(final_m_opt),decimals=2),
      "\t M_rand = ", np.round(np.median(final_m_rand),decimals=2), 
      "\t T_train = ", np.round(np.median(train_times),decimals=2))
print("25th quantile:\t M_opt = ", np.round(np.percentile(final_m_opt, q=25),decimals=2), 
      "\t M_rand = ", np.round(np.percentile(final_m_rand, q=25),decimals=2), 
      "\t T_train = ", np.round(np.percentile(train_times, q=25),decimals=2))
print("75th quantile:\t M_opt = ", np.round(np.percentile(final_m_opt, q=75),decimals=2),
      "\t M_rand = ", np.round(np.percentile(final_m_rand, q=75),decimals=2),
      "\t T_train = ", np.round(np.percentile(train_times, q=75),decimals=2))

*********************  AVERAGE STATS  *********************
Median:	 M_opt =  -0.07 	 M_rand =  0.87 	 T_train =  6625.0
25th quantile:	 M_opt =  -0.19 	 M_rand =  0.86 	 T_train =  5062.5
75th quantile:	 M_opt =  0.0 	 M_rand =  0.9 	 T_train =  8812.5


In [10]:
# Load results of best parameters from dictionary
output_folder = os.path.join(os.getcwd(), 'results')
os.makedirs(output_folder, exist_ok=True)
fname = output_folder + '/Q14.pkl'
# NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
with open(fname, 'rb') as handle:
    stats_dict = pickle.load(handle)

# Entry stored under key 0.5 for every run in the file; each entry is indexed as
# (stats, final M_opt, final M_rand), with the test curves stored inside stats.
# Iterating over the loaded list avoids hard-coding the number of runs.
stats_dict_best_n_star = [run_stats[0.5] for run_stats in stats_dict]
final_m_opt = [entry[1] for entry in stats_dict_best_n_star]
test_m_opt = [entry[0]['test_Mopt'] for entry in stats_dict_best_n_star]
final_m_rand = [entry[2] for entry in stats_dict_best_n_star]
test_m_rand = [entry[0]['test_Mrand'] for entry in stats_dict_best_n_star]

# Compute training time: index of the first test whose value exceeds
# start + 0.8 * (final - start), converted to episodes with test_freq.
# NOTE: [0][0] raises IndexError when a curve never exceeds that threshold.
starting_m_opt = [curve[0] for curve in test_m_opt]
train_times_m_opt = np.array([np.where(np.array(curve) > start + 0.8 * (final - start))[0][0]
                              for curve, start, final in zip(test_m_opt, starting_m_opt, final_m_opt)])
train_times_m_opt = train_times_m_opt * test_freq
starting_m_rand = [curve[0] for curve in test_m_rand]
train_times_m_rand = np.array([np.where(np.array(curve) > start + 0.8 * (final - start))[0][0]
                               for curve, start, final in zip(test_m_rand, starting_m_rand, final_m_rand)])
train_times_m_rand = train_times_m_rand * test_freq
# A run counts as trained once both measures have crossed their thresholds
train_times = [np.maximum(t_opt, t_rand) for t_opt, t_rand in zip(train_times_m_opt, train_times_m_rand)]

# Results
print('*********************', ' AVERAGE STATS ', '*********************')
print("Median:\t M_opt = ", np.round(np.median(final_m_opt),decimals=2),
      "\t M_rand = ", np.round(np.median(final_m_rand),decimals=2), 
      "\t T_train = ", np.round(np.median(train_times),decimals=2))

*********************  AVERAGE STATS  *********************
Median:	 M_opt =  0.0 	 M_rand =  0.94 	 T_train =  3500.0


In [11]:
# Load results of best parameters from dictionary
output_folder = os.path.join(os.getcwd(), 'results')
os.makedirs(output_folder, exist_ok=True)
fname = output_folder + '/Q17.pkl'
# NOTE: pickle.load can execute arbitrary code; only open result files produced by this project
with open(fname, 'rb') as handle:
    stats_dict = pickle.load(handle)

# Best_n_star: entry stored under key 10000 for every run in the file; each entry is
# indexed as (stats, final M_opt, final M_rand), with the test curves stored inside stats.
# Iterating over the loaded list avoids hard-coding the number of runs.
stats_dict_best_n_star = [run_stats[10000.0] for run_stats in stats_dict]
final_m_opt = [entry[1] for entry in stats_dict_best_n_star]
test_m_opt = [entry[0]['test_Mopt'] for entry in stats_dict_best_n_star]
final_m_rand = [entry[2] for entry in stats_dict_best_n_star]
test_m_rand = [entry[0]['test_Mrand'] for entry in stats_dict_best_n_star]

# Compute training time: index of the first test whose value exceeds
# start + 0.8 * (final - start), converted to episodes with test_freq.
# NOTE: [0][0] raises IndexError when a curve never exceeds that threshold.
starting_m_opt = [curve[0] for curve in test_m_opt]
train_times_m_opt = np.array([np.where(np.array(curve) > start + 0.8 * (final - start))[0][0]
                              for curve, start, final in zip(test_m_opt, starting_m_opt, final_m_opt)])
train_times_m_opt = train_times_m_opt * test_freq
starting_m_rand = [curve[0] for curve in test_m_rand]
train_times_m_rand = np.array([np.where(np.array(curve) > start + 0.8 * (final - start))[0][0]
                               for curve, start, final in zip(test_m_rand, starting_m_rand, final_m_rand)])
train_times_m_rand = train_times_m_rand * test_freq
# A run counts as trained once both measures have crossed their thresholds
train_times = [np.maximum(t_opt, t_rand) for t_opt, t_rand in zip(train_times_m_opt, train_times_m_rand)]

# Results
print('*********************', ' AVERAGE STATS ', '*********************')
print("Median:\t M_opt = ", np.round(np.median(final_m_opt),decimals=2),
      "\t M_rand = ", np.round(np.median(final_m_rand),decimals=2), 
      "\t T_train = ", np.round(np.median(train_times),decimals=2))

*********************  AVERAGE STATS  *********************
Median:	 M_opt =  0.0 	 M_rand =  0.91 	 T_train =  4000.0
