## Setup

In [1]:
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
%load_ext line_profiler

%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from utils import *
from q_learning import *
from deep_q_learning import *

In [2]:
# Notebook-wide switches.
seed = 0  # NOTE(review): no cell visible here consumes this value — confirm where seeding happens

# Either retrain (train=True) or reuse stored results; `load` is always the opposite of `train`.
train = False
load = not train

# Passed as `save=` to the plotting helpers and as `save_stats=` to train_avg.
save_figs = True
save_stats = True

In [3]:
# Hyper-parameters shared by every experiment in this notebook.
num_avg = 1            # repetitions of each experiment (loop bound / train_avg argument)
num_episodes = 20_000  # episodes per training run
test_freq = 250        # forwarded to the training helpers as `test_freq`

lr = 1e-4
gamma = 0.99  # NOTE(review): not referenced by any cell visible here — confirm it is still needed

# Bounds handed to return_lambda_explor() when building exploration schedules.
epsilon_max = 0.8
epsilon_min = 0.1

# Single environment instance reused by all training calls below.
env = TictactoeEnv()

## Learning from experts

### Question 11: Average reward and average loss during training

In [5]:
# Q11: train with a fixed exploration rate (against_opt=True) and record the
# training statistics plus the final scores against OptimalPlayer(epsilon=0) and
# OptimalPlayer(epsilon=1).
epsilon_exploration = 0.2
stats_dict_list = []
for i in range(num_avg):
    print('************** RUN', i+1, 'OF', num_avg, '**************')
    stats_dict = {}
    start = time.time()
    model, stats = deep_q_learning(env, lr=lr, epsilon_exploration=epsilon_exploration, num_episodes=num_episodes,
                                   against_opt=True, verbose=True)
    print('Only training time: ', time.time() - start)
    M_opt = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=0.))
    M_rand = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=1.))
    print("M_opt =", M_opt)
    print("M_rand =", M_rand)
    # One dict per run, keyed by the exploration rate used for that run.
    stats_dict.update({epsilon_exploration: (stats, M_opt, M_rand)})
    stats_dict_list.append(stats_dict)
    print('RUN', i+1, 'took', np.round(time.time()-start,decimals=1), 'seconds')

# Raw string: '\e' is not a valid Python escape and triggers a warning in a plain literal.
plot_stats(stats_dict_list, [epsilon_exploration], 'epsilon_exploration_Q11', r'\epsilon', save=save_figs,
           keys=['rewards', 'loss'])

# Create the results folder first so the dump cannot fail on a fresh checkout.
output_folder = os.path.join(os.getcwd(), 'results')
os.makedirs(output_folder, exist_ok=True)

fname = output_folder + '/dqn_stats_dict_q11.pkl'
with open(fname, 'wb') as handle:
    pickle.dump(stats_dict_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

************** RUN 1 OF 1 **************


  0%|          | 0/20000 [00:00<?, ?it/s]

******* Updating target network *******


  3%|▎         | 505/20000 [00:17<13:39, 23.80it/s]

******* Updating target network *******


  3%|▎         | 579/20000 [00:20<11:29, 28.16it/s]


KeyboardInterrupt: 

In [None]:
# Re-plot only the most recent run. Raw string avoids the invalid '\e' escape sequence.
plot_stats([stats_dict_list[-1]], [epsilon_exploration], 'epsilon_exploration_Q11', r'\epsilon', save=save_figs)

### Question 12: No replay buffer and batch_size = 1

In [None]:
# Q12: same experiment as Q11 but with batch_size=1 and max_memory_length=1,
# i.e. the configuration named in the section title (no replay buffer).
epsilon_exploration = 0.1
stats_dict_list = []
for i in range(num_avg):
    print('************** RUN', i+1, 'OF', num_avg, '**************')
    stats_dict = {}
    start = time.time()
    model, stats = deep_q_learning(env, epsilon_exploration=epsilon_exploration, num_episodes=num_episodes, against_opt=True,
                                   batch_size=1, max_memory_length=1)
    M_opt = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=0.))
    M_rand = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=1.))
    print("M_opt =", M_opt)
    print("M_rand =", M_rand)
    # One dict per run, keyed by the exploration rate used for that run.
    stats_dict.update({epsilon_exploration: (stats, M_opt, M_rand)})
    stats_dict_list.append(stats_dict)
    print('RUN', i+1, 'took', np.round(time.time()-start,decimals=1), 'seconds')

# The figure name used to be the Q11 one (copy-paste), which presumably made this
# plot overwrite the Q11 figure when save_figs is on — TODO confirm against plot_stats.
# Raw string avoids the invalid '\e' escape sequence.
plot_stats(stats_dict_list, [epsilon_exploration], 'epsilon_exploration_Q12', r'\epsilon', save=save_figs,
           keys=['rewards', 'loss'])

### Question 13: Decreasing exploration for different values of $n^{*}$

In [None]:
# n* sweep: four hand-picked small values followed by 16 log-spaced values
# from 1e3 to 4e4, rounded to whole numbers.
num_splits = 4
log_spaced_n_star = np.round(np.logspace(3, np.log10(40000), 16))
vec_n_star = np.concatenate(([1, 100, 500, 750], log_spaced_n_star))

# Cut the sweep into `num_splits` chunks that are handled one at a time below.
chunks_list = np.array_split(vec_n_star, num_splits)
print(chunks_list)

In [None]:
# One inner list of training-parameter dicts per chunk, in the same order as
# chunks_list (each dict is handed to train_avg together with its chunk).
dqn_params_list = [
    [
        {
            'env': env,
            'num_episodes': num_episodes,
            'epsilon_exploration_rule': return_lambda_explor(epsilon_min, epsilon_max, n_star),
            'test_freq': test_freq,
            'against_opt': True,
        }
        for n_star in chunk
    ]
    for chunk in chunks_list
]

In [None]:
# Index of the chunk trained by this cell; the merge cell reads one result file per chunk.
chunk_num = 0
if train:
    chunk = chunks_list[chunk_num]
    var_name = f'dqn_n_star_experts_{chunk_num}'
    stats_dict_nstar = train_avg(
        var_name,
        chunk,
        dqn_params_list[chunk_num],
        dqn=True,
        num_avg=num_avg,
        save_stats=save_stats,
    )

In [None]:
# Merge the per-chunk result files into a single list holding one dict per run.
stats_dicts = {}
if save_stats:
    stats_dict_nstar_list = []
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)

    # Load every chunk's pickle, keyed by chunk index.
    for chunk_idx, _ in enumerate(chunks_list):
        fname = output_folder + '/stats_dict_dqn_n_star_experts_' + str(chunk_idx) + '_list.pkl'
        with open(fname, 'rb') as handle:
            stats_dicts[chunk_idx] = pickle.load(handle)

    # For each run, fold the dicts coming from all chunks into one.
    for run_idx in range(num_avg):
        merged = {}
        for per_chunk_runs in stats_dicts.values():
            merged.update(per_chunk_runs[run_idx])
        stats_dict_nstar_list.append(merged)

    # Store the merged list next to the chunk files.
    fname = output_folder + '/dqn_stats_dict_nstar_experts_list.pkl'
    with open(fname, 'wb') as handle:
        pickle.dump(stats_dict_nstar_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Load the merged Q13 results (when not training in this session) and plot a
# selection of n* values.
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/dqn_stats_dict_nstar_experts_list.pkl'
    # pickle.load can execute arbitrary code: only open files written by this notebook.
    with open(fname, 'rb') as handle:
        stats_dict_nstar_list = pickle.load(handle)

# Values picked from the n* sweep defined earlier in this section.
plot_n_star = [1., 500., 1000., 9146., 19127., 40000]
# Raw string: '\s' is not a valid Python escape and triggers a warning in a plain literal.
plot_stats(stats_dict_nstar_list, plot_n_star, 'dqn_n_star', r"n^{\star}", save=save_figs, decaying_exploration=True)

### Question 14: Different values of $\epsilon_{\text{opt}}$

In [None]:
# Q14: sweep `epsilon_opt` over M evenly spaced values in [0, 1] while the agent
# uses a decaying exploration schedule built from `best_n_star`.
#
# TODO: set this to the best n* found in Question 13. The assignment was left
# blank, which is a SyntaxError; 15000 is only a placeholder that mirrors the
# n_star used in the Question 19 cell of this notebook.
best_n_star = 15000
M = 11
vec_epsilon_opt = np.linspace(0, 1, M)
deep_q_learning_params_list = []
var_name = "epsilon_opt_deep"
for eps in vec_epsilon_opt:
    # One parameter dict per epsilon_opt value, in the same order as vec_epsilon_opt.
    params = {'env': env,
              'num_episodes': num_episodes,
              'epsilon_exploration_rule': return_lambda_explor(epsilon_min, epsilon_max, best_n_star),
              'epsilon_opt': eps,
              'test_freq': test_freq,
              'against_opt': True}
    deep_q_learning_params_list.append(params)

In [None]:
# Run the epsilon_opt sweep only when training is enabled.
if train:
    stats_dict_epsilon_opt_list_deep = train_avg(
        var_name,
        vec_epsilon_opt,
        deep_q_learning_params_list,
        dqn=True,
        num_avg=num_avg,
        save_stats=save_stats,
    )

### Question 15: Best values of $M_{\text{opt}}$ and $M_{\text{rand}}$

In [None]:
# TODO

## Learning by self-practice

In [None]:
# Self-practice baseline with a fixed exploration rate.
epsilon_exploration = 0.2
# Start from an empty list: previously this cell appended to whatever an earlier
# cell had left in `stats_dict_list`, mixing unrelated experiments into the
# saved file and into the "all runs" plot.
stats_dict_list = []
for i in range(num_avg):
    print('************** RUN', i+1, 'OF', num_avg, '**************')
    stats_dict = {}
    start = time.time()
    # Use the shared `lr` from the configuration cell (same value as the former literal 1e-4).
    model, stats = deep_q_learning(env, lr=lr, epsilon_exploration=epsilon_exploration, num_episodes=num_episodes,
                                   test_freq=test_freq, verbose=True, self_practice=True)
    M_opt = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=0.))
    M_rand = measure_performance(DeepQPlayer(model=model), OptimalPlayer(epsilon=1.))
    print("M_opt =", M_opt)
    print("M_rand =", M_rand)
    # One dict per run, keyed by the exploration rate used for that run.
    stats_dict.update({epsilon_exploration: (stats, M_opt, M_rand)})
    stats_dict_list.append(stats_dict)
    print('RUN', i+1, 'took', np.round(time.time()-start,decimals=1), 'seconds')

# Raw string avoids the invalid '\e' escape sequence.
plot_stats([stats_dict_list[-1]], [epsilon_exploration], 'self_practice_test', r'\epsilon', save=save_figs)

# Statistics go to the results folder under their own name. The old target,
# figures/dqn_stats_dict_q11.pkl, was a copy-paste of the Q11 file name.
output_folder = os.path.join(os.getcwd(), 'results')
os.makedirs(output_folder, exist_ok=True)
fname = output_folder + '/dqn_stats_dict_self_practice.pkl'
with open(fname, 'wb') as handle:
    pickle.dump(stats_dict_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Re-plot only the most recent self-practice run. Raw string avoids the invalid '\e' escape.
plot_stats([stats_dict_list[-1]], [epsilon_exploration], 'self_practice_test', r'\epsilon', save=save_figs)

In [None]:
# Plot every run collected in stats_dict_list. Raw string avoids the invalid '\e' escape.
plot_stats(stats_dict_list, [epsilon_exploration], 'self_practice_test', r'\epsilon', save=save_figs)

### Question 16: Different values of the exploration rate $\epsilon$

In [None]:
# Q16: fixed exploration rates for self-practice, M evenly spaced values in [0, 0.9].
M = 10
num_splits = 2
vec_eps = np.linspace(0, 0.9, M)

# Cut the sweep into `num_splits` chunks that are handled one at a time below.
chunks_list = np.array_split(vec_eps, num_splits)
print(chunks_list)

var_name = "epsilon_deep_self"
# One parameter dict per exploration rate, in the same order as vec_eps.
deep_q_learning_params_list = [
    {
        'env': env,
        'num_episodes': num_episodes,
        'epsilon_exploration': eps,
        'test_freq': test_freq,
        'self_practice': True,
    }
    for eps in vec_eps
]

In [None]:
# Index of the chunk trained by this cell; the merge cell reads one result file per chunk.
chunk_num = 0
if train:
    var_name = 'dqn_eps_self_practice' + str(chunk_num)
    chunk = chunks_list[chunk_num]
    # deep_q_learning_params_list spans the whole sweep, so pass only the entries
    # that belong to this chunk. Handing over the full list only lined up with the
    # chunk values for chunk_num == 0.
    offset = sum(len(c) for c in chunks_list[:chunk_num])
    chunk_params = deep_q_learning_params_list[offset:offset + len(chunk)]
    stats_dict_epsilon_self_list_deep = train_avg(var_name, chunk, chunk_params, dqn=True,
                                                  num_avg=num_avg, save_stats=save_stats)

In [None]:
# Merge the per-chunk result files into a single list holding one dict per run.
stats_dicts = {}
if save_stats:
    stats_dict_epsilon_self_list_deep = []
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)

    # Load every chunk's pickle, keyed by chunk index.
    for chunk_idx, _ in enumerate(chunks_list):
        fname = output_folder + '/stats_dict_dqn_eps_self_practice' + str(chunk_idx) + '_list.pkl'
        with open(fname, 'rb') as handle:
            stats_dicts[chunk_idx] = pickle.load(handle)

    # For each run, fold the dicts coming from all chunks into one.
    for run_idx in range(num_avg):
        merged = {}
        for per_chunk_runs in stats_dicts.values():
            merged.update(per_chunk_runs[run_idx])
        stats_dict_epsilon_self_list_deep.append(merged)

    # Store the merged list next to the chunk files.
    fname = output_folder + '/stats_dict_dqn_eps_self_practice.pkl'
    with open(fname, 'wb') as handle:
        pickle.dump(stats_dict_epsilon_self_list_deep, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Load the merged Q16 results (when not training in this session) and plot a
# selection of exploration rates.
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/stats_dict_dqn_eps_self_practice.pkl'
    # pickle.load can execute arbitrary code: only open files written by this notebook.
    with open(fname, 'rb') as handle:
        stats_dict_epsilon_self_list_deep = pickle.load(handle)

# NOTE(review): the sweep comes from np.linspace(0, 0.9, 10), whose entries are
# not all bit-identical to these literals (e.g. 6 * 0.1 != 0.6). If plot_stats
# looks the values up as exact dict keys this may need vec_eps entries — confirm.
plot_eps = [0,0.1,0.2, 0.4, 0.6]
# This experiment uses fixed exploration rates, so it gets its own figure name
# and an epsilon label. The previous 'n_star' / n^{\star} / decaying_exploration=True
# arguments were copied from the n* plots and reused their figure name; the other
# fixed-epsilon plots in this notebook do not pass decaying_exploration.
plot_stats(stats_dict_epsilon_self_list_deep, plot_eps, 'dqn_eps_self_practice', r'\epsilon', save=save_figs)

In [None]:
# Unsaved overview of the whole epsilon sweep. The label and flags follow the other
# fixed-epsilon plots instead of the n* ones this call was copied from.
plot_stats(stats_dict_epsilon_self_list_deep, vec_eps, 'dqn_eps_self_practice', r'\epsilon', save=False)

### Question 17: Decreasing exploration for different values of $n^*$

In [None]:
# Q17: n* sweep for self-practice — four hand-picked small values followed by
# 16 log-spaced values from 1e3 to 4e4, rounded to whole numbers.
num_splits = 4
log_spaced_n_star = np.round(np.logspace(3, np.log10(40000), 16))
vec_n_star = np.concatenate(([1, 100, 500, 750], log_spaced_n_star))

# Cut the sweep into `num_splits` chunks that are handled one at a time below.
chunks_list = np.array_split(vec_n_star, num_splits)
print(chunks_list)

var_name = "n_star_self_deep"
# One parameter dict per n* value, in the same order as vec_n_star.
deep_q_learning_params_list = [
    {
        'env': env,
        'num_episodes': num_episodes,
        'epsilon_exploration_rule': return_lambda_explor(epsilon_min, epsilon_max, n_star),
        'test_freq': test_freq,
        'self_practice': True,
    }
    for n_star in vec_n_star
]

In [None]:
# Index of the chunk trained by this cell; the merge cell reads one result file per chunk.
chunk_num = 0
if train:
    var_name = 'dqn_n_star_self_practice' + str(chunk_num)
    chunk = chunks_list[chunk_num]
    # deep_q_learning_params_list spans the whole sweep, so pass only the entries
    # that belong to this chunk. Handing over the full list only lined up with the
    # chunk values for chunk_num == 0.
    offset = sum(len(c) for c in chunks_list[:chunk_num])
    chunk_params = deep_q_learning_params_list[offset:offset + len(chunk)]
    # Honour the notebook-wide save_stats switch like the other training cells
    # (it was hard-coded to True here).
    stats_dict_nstar_self_list_deep = train_avg(var_name, chunk, chunk_params, dqn=True,
                                                num_avg=num_avg, save_stats=save_stats)

In [None]:
# Merge the per-chunk result files into a single list holding one dict per run.
stats_dicts = {}
if save_stats:
    # Use the name the train/load/plot cells use (`..._nstar_...`). The merge
    # used to fill `stats_dict_n_star_self_list_deep`, which nothing else reads,
    # so after training the plots saw only the single trained chunk.
    stats_dict_nstar_self_list_deep = []
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    # Load every chunk's pickle, keyed by chunk index.
    for i in range(len(chunks_list)):
        fname = output_folder + '/stats_dict_dqn_n_star_self_practice' + str(i) + '_list.pkl'
        with open(fname, 'rb') as handle:
            stats_dicts.update({i: pickle.load(handle)})
    # For each run, fold the dicts coming from all chunks into one.
    for i in range(num_avg):
        to_append = {}
        for key in stats_dicts.keys():
            to_append.update(stats_dicts[key][i])
        stats_dict_nstar_self_list_deep.append(to_append)
    # Keep the old spelling as an alias for anything that still refers to it.
    stats_dict_n_star_self_list_deep = stats_dict_nstar_self_list_deep
    fname = output_folder + '/stats_dict_dqn_n_star_self_practice.pkl'
    with open(fname, 'wb') as handle:
        pickle.dump(stats_dict_nstar_self_list_deep, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Load the merged Q17 results (when not training in this session) and plot a
# selection of n* values.
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/stats_dict_dqn_n_star_self_practice.pkl'
    # pickle.load can execute arbitrary code: only open files written by this notebook.
    with open(fname, 'rb') as handle:
        stats_dict_nstar_self_list_deep = pickle.load(handle)

plot_n_star = [1., 9146., 19127., 40000]
# Raw string: '\s' is not a valid Python escape and triggers a warning in a plain literal.
plot_stats(stats_dict_nstar_self_list_deep, plot_n_star, 'n_star', r"n^{\star}", save=save_figs, decaying_exploration=True)

In [None]:
# Unsaved overview of the whole n* sweep. Raw string avoids the invalid '\s' escape.
plot_stats(stats_dict_nstar_self_list_deep, vec_n_star, 'n_star', r"n^{\star}", save=False, decaying_exploration=True)

### Question 18: Best values of $M_{\text{opt}}$ and $M_{\text{rand}}$

In [None]:
# TODO

### Question 19: Heatmaps of the Q-values in 3 significant states

In [None]:
# Train the model that the heatmap cell below inspects, using a decaying
# exploration schedule built from the values chosen here.
n_star = 15000
epsilon_min, epsilon_max = 0.1, 0.8
epsilon_exploration_rule = return_lambda_explor(epsilon_min, epsilon_max, n_star)

model, stats = deep_q_learning(
    env,
    epsilon_exploration_rule=epsilon_exploration_rule,
    num_episodes=20000,
    verbose=True,
    against_opt=True,
)

In [None]:
# Board position(s) to visualise, stacked along the first axis (one 3x3 grid each).
# NOTE(review): 1 / -1 presumably mark the two players and 0 an empty cell — confirm.
board = [
    [1., 0., 0.],
    [-1., 1., 0.],
    [0., -1., 0.],
]
grids = np.array([board])
print(grids[0])
heatmaps_deep_subplots(grids, model, save=save_figs)