In [13]:
import time
import pickle

import numpy as np

%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from utils import *
from q_learning import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# Configurations
seed = 0            # fed to np.random.seed before the final self-practice run

save_stats = True   # forwarded to train_avg(save_stats=...)
save_figs = False   # gates writing figures to disk
load = False        # when True, plotting cells read pickled stats from ./results

## 2. Q-Learning

In [15]:
# Hyper-parameters
alpha = 0.05          # Learning rate
gamma = 0.99          # Discount factor
epsilon_opt = 0.5     # Optimal player's epsilon
num_episodes = 20000  # number of episodes
num_avg = 2           # training runs
test_freq = 250       # passed as 'test_freq' in the q_learning parameter dicts

# Environment instance shared by the experiments below
env = TictactoeEnv()

### 2.1 Learning from experts

In [16]:
# One q_learning keyword-argument set per fixed exploration rate
var_name = 'epsilon'
epsilon_vec = [0.1]

q_learning_params_list = []
for eps in epsilon_vec:
    params = dict(
        env=env,
        num_episodes=num_episodes,
        epsilon_exploration=eps,
        test_freq=test_freq,
        against_opt=True,
    )
    q_learning_params_list.append(params)

In [None]:
# train_avg sweep over epsilon_vec using the parameter sets built above
stats_dict_eps_list = train_avg(
    var_name,
    epsilon_vec,
    q_learning_params_list,
    num_avg=num_avg,
    save_stats=save_stats,
)

************** RUN 1 OF 2 **************
------------- Training with epsilon = 0.1 -------------


In [None]:
# Optionally replace the in-memory statistics with the pickled copy in ./results.
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/stats_dict_eps_list.pkl'
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
    with open(fname, 'rb') as handle:
        stats_dict_eps_list = pickle.load(handle)

# Raw string: "\e" is not a valid Python escape sequence, so the non-raw
# literal triggers an invalid-escape warning (and is slated to become an error).
plot_stats(stats_dict_eps_list, epsilon_vec, "epsilon", r"\epsilon", save=save_figs, std=True)

#### Question 1: Average reward with $\epsilon = 0.1$

In [None]:
# Run q_learning num_avg times with a fixed exploration rate and keep the
# 'rewards' entry of each run's stats. Q and stats hold the last run afterwards.
epsilon_exploration = 0.1
rewards_list = []
for i in range(num_avg):
    Q, stats = q_learning(
        env,
        epsilon_exploration=epsilon_exploration,
        num_episodes=num_episodes,
        verbose=False,
        against_opt=True,
    )
    rewards_list.append(stats['rewards'])

In [None]:
# Plotting the average reward for every 250 games during training:
# one running-average curve per training run.
running_rewards_list = []
for reward in rewards_list:
    running_average_rewards, x = running_average(reward)
    running_rewards_list.append(running_average_rewards)

# Aggregate across runs: mean curve and standard deviation per point
stds = np.std(running_rewards_list, axis=0)
running_average_rewards = np.mean(running_rewards_list, axis=0)

fig = plt.figure()
plt.plot(x, running_average_rewards)
plt.fill_between(
    x,
    running_average_rewards - stds,
    running_average_rewards + stds,
    alpha=0.2,
)
plt.ylim([-1,1])
plt.title('Average reward during training')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()
if save_figs:
    output_folder = os.path.join(os.getcwd(), 'figures')
    os.makedirs(output_folder, exist_ok=True)
    fig.savefig(output_folder + '/rewards_Q1.png')
    fig.savefig(output_folder + '/rewards_Q1.eps', format = 'eps')

# Comparing the performance with the optimal player and the random player
turns = np.array(['X','O'])
player_opt = OptimalPlayer(epsilon=0.)
player_rand = OptimalPlayer(epsilon=1.)
teacher = OptimalPlayer(epsilon=0.5)

names = ['Trained', 'Optimal', 'Random']
players = [QPlayer(Q=Q), OptimalPlayer(epsilon=0.), OptimalPlayer(epsilon=1.)]

for name, player in zip(names, players):
    print("\n-----", name, " player-----")
    # Only the first evaluation (against player_opt) is timed
    start = time.time()
    m_opt = measure_performance(player, player_opt)
    elapsed = time.time() - start
    print(elapsed)
    print("M_opt = ", m_opt)
    m_rand = measure_performance(player, player_rand)
    print("M_rand = ", m_rand)
    m_teacher = measure_performance(player, teacher)
    print("M_teacher = ", m_teacher)

#### Questions 2 and 3: Decreasing exploration for different values of $n^{*}$

In [8]:
# Evaluation frequency and bounds of the decaying exploration schedule
test_freq = 250
epsilon_min, epsilon_max = 0.1, 0.8

# Candidate n* values: four small hand-picked values followed by 16
# log-spaced values from 1e3 to 4e4, rounded to whole numbers.
small_n_star = np.array([1, 100, 500, 750])
log_spaced_n_star = np.round(np.logspace(3, np.log10(40000), 16))
vec_n_star = np.hstack((small_n_star, log_spaced_n_star))

In [9]:
# Parameter sets for the first ten n* values, trained against the expert.
vec_n_star_first = vec_n_star[:10]
var_name = 'nstar_first'
q_learning_params_list = []
for n_star in vec_n_star_first:
    # Bind the current n_star (and the epsilon bounds) as default arguments.
    # A plain closure looks n_star up when the rule is *called*, i.e. after this
    # loop has finished, so every stored rule would use the last n_star.
    def epsilon_exploration_rule(n, n_star=n_star, epsilon_min=epsilon_min, epsilon_max=epsilon_max):
        """Return max(epsilon_min, epsilon_max * (1 - n / n_star)) for this n_star."""
        return np.max([epsilon_min, epsilon_max * (1 - n/n_star)])
    params = {'env': env,
              'num_episodes': num_episodes,
              'epsilon_exploration_rule': epsilon_exploration_rule,
              'test_freq': test_freq,
              'against_opt': True}
    q_learning_params_list.append(params)
save_stats = True
load = False

In [None]:
# train_avg sweep over vec_n_star_first (expert opponent)
stats_dict_nstar_first_list = train_avg(
    var_name,
    vec_n_star_first,
    q_learning_params_list,
    num_avg=num_avg,
    save_stats=save_stats,
)

In [12]:
# Parameter sets for the remaining n* values (plus n* = 1), trained against the expert.
var_name = 'nstar_second'
vec_n_star_second = np.hstack([[1], vec_n_star[10:]])
q_learning_params_list = []
for n_star in vec_n_star_second:
    # Bind the current n_star (and the epsilon bounds) as default arguments.
    # A plain closure looks n_star up when the rule is *called*, i.e. after this
    # loop has finished, so every stored rule would use the last n_star.
    def epsilon_exploration_rule(n, n_star=n_star, epsilon_min=epsilon_min, epsilon_max=epsilon_max):
        """Return max(epsilon_min, epsilon_max * (1 - n / n_star)) for this n_star."""
        return np.max([epsilon_min, epsilon_max * (1 - n/n_star)])
    params = {'env': env,
              'num_episodes': num_episodes,
              'epsilon_exploration_rule': epsilon_exploration_rule,
              'test_freq': test_freq,
              'against_opt': True}
    q_learning_params_list.append(params)
save_stats = True
load = False

[1.0000e+00 4.3730e+03 5.5930e+03 7.1520e+03 9.1460e+03 1.1696e+04
 1.4957e+04 1.9127e+04 2.4460e+04 3.1279e+04 4.0000e+04]


In [11]:
# train_avg sweep over vec_n_star_second (expert opponent)
stats_dict_nstar_second_list = train_avg(
    var_name,
    vec_n_star_second,
    q_learning_params_list,
    num_avg=num_avg,
    save_stats=save_stats,
)

************** RUN 1 OF 2 **************
------------- Training with nstar_second = 4373.0 -------------


KeyboardInterrupt: 

In [None]:
## if not load:
##    stats_dict_nstar_list = []
##    for i in range(num_avg):
##        print('************** RUN', i+1, 'OF', num_avg, '**************')
##        stats_dict_nstar = {}
##        for n_star in vec_n_star:
##            print("------------- Training with n_star =", n_star, "-------------")
##            def epsilon_exploration_rule(n):
##                return np.max([epsilon_min, epsilon_max * (1 - n/n_star)])
##            start = time.time()
##            Q, stats = q_learning(env, epsilon_exploration_rule=epsilon_exploration_rule, test_freq=test_freq, against_opt=True)
##            M_opt = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=0.), num_episodes=2000)
##            M_rand = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=1.), num_episodes=2000)
##            print("M_opt =", M_opt)
##            print("M_rand =", M_rand)
##            stats_dict_nstar.update({n_star: (stats, M_opt, M_rand)})
##            elapsed = time.time() - start
##            print("Training with n_star =", n_star, " took:", time.strftime("%Hh%Mm%Ss", time.gmtime(elapsed)), "\n\n")
##        stats_dict_nstar_list.append(stats_dict_nstar)

##    if save_stats:
##        output_folder = os.path.join(os.getcwd(), 'results')
##        os.makedirs(output_folder, exist_ok=True)
##        fname = output_folder + '/stats_dict_nstar_list.pkl'
##        with open(fname, 'wb') as handle:
##            pickle.dump(stats_dict_nstar_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# NOTE(review): `load` is forced to True here, overriding the configuration cell,
# and the loaded `stats_dict_nstar_list` is not used by the plots below — confirm
# whether this load is still needed.
load = True
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/stats_dict_nstar_list.pkl'
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
    with open(fname, 'rb') as handle:
        stats_dict_nstar_list = pickle.load(handle)

# Raw strings below: "\s" is not a valid Python escape sequence.
# NOTE(review): 24460 and 40000 belong to vec_n_star[10:] (the "second" sweep),
# not to vec_n_star_first — verify these are the intended values for this plot.
plot_n_star = [1., 24460., 40000]
plot_stats(stats_dict_nstar_first_list, plot_n_star, 'n_star_first', r"n^{\star}", save=True, decaying_exploration=True)

plot_n_star = [1., 4373., 9146.]
plot_stats(stats_dict_nstar_second_list, plot_n_star, 'n_star_second', r"n^{\star}", save=True, decaying_exploration=True)
# plot_stats(stats_dict_nstar, vec_n_star, 'n_star', "n^{\star}", save=False)

#### Questions 4 and 5: Good experts and bad experts

In [None]:
n_star = 4000 # we pick the best according to previous experiments
M = 11
var_name = 'epsilon_opt'
vec_epsilon_opt = np.linspace(0, 1, M)


# The rule is identical for every epsilon_opt, so it is defined once.
# n_star and the epsilon bounds are bound as defaults so that later cells
# reassigning the global n_star cannot change the behaviour of this rule.
def epsilon_exploration_rule(n, n_star=n_star, epsilon_min=epsilon_min, epsilon_max=epsilon_max):
    """Return max(epsilon_min, epsilon_max * (1 - n / n_star))."""
    return np.max([epsilon_min, epsilon_max * (1 - n/n_star)])


q_learning_params_list = []
for epsilon_opt in vec_epsilon_opt:
    params = {'env': env,
              'num_episodes': num_episodes,
              'epsilon_exploration_rule': epsilon_exploration_rule,
              'test_freq': test_freq,  # same value (250) as the other experiments
              'epsilon_opt': epsilon_opt,
              'against_opt': True}
    q_learning_params_list.append(params)

In [None]:
# train_avg sweep over vec_epsilon_opt.
# Fixed: the list built in the previous cell is `q_learning_params_list`
# (the former `q_learning_param_list` raised NameError).
stats_dict_eps_opt_list = train_avg(var_name, vec_epsilon_opt, q_learning_params_list, num_avg=num_avg, save_stats=save_stats)

In [None]:
## if not load:
##    stats_dict_nstar_list = []
##    for i in range(num_avg):
##        print('************** RUN', i+1, 'OF', num_avg, '**************')
##        stats_dict_nstar = {}
##        for n_star in vec_n_star:
##            print("------------- Training with n_star =", n_star, "-------------")
##            def epsilon_exploration_rule(n):
##                return np.max([epsilon_min, epsilon_max * (1 - n/n_star)])
##            start = time.time()
##            Q, stats = q_learning(env, epsilon_exploration_rule=epsilon_exploration_rule, test_freq=test_freq, against_opt=True)
##            M_opt = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=0.), num_episodes=2000)
##            M_rand = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=1.), num_episodes=2000)
##            print("M_opt =", M_opt)
##            print("M_rand =", M_rand)
##            stats_dict_nstar.update({n_star: (stats, M_opt, M_rand)})
##            elapsed = time.time() - start
##            print("Training with n_star =", n_star, " took:", time.strftime("%Hh%Mm%Ss", time.gmtime(elapsed)), "\n\n")
##        stats_dict_nstar_list.append(stats_dict_nstar)

##    if save_stats:
##        output_folder = os.path.join(os.getcwd(), 'results')
##        os.makedirs(output_folder, exist_ok=True)
##        fname = output_folder + '/stats_dict_nstar_list.pkl'
##        with open(fname, 'wb') as handle:
##            pickle.dump(stats_dict_nstar_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
## if not load:
##    stats_dict_epsilon_opt = {}
##    for epsilon_opt in vec_epsilon_opt:
##        print("------------- Training with epsilon_opt =", epsilon_opt, "-------------")
##        start = time.time()
##        Q, stats = q_learning(env, epsilon_opt=epsilon_opt, test_freq=test_freq, against_opt=True)
##        M_opt = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=0.), num_episodes=2000)
##        M_rand = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=1.), num_episodes=2000)
##        print("M_opt =", M_opt)
##        print("M_rand =", M_rand)
##        stats_dict_epsilon_opt.update({epsilon_opt: (stats, M_opt, M_rand)})
##        elapsed = time.time() - start
##        print("Training with epsilon_opt =", epsilon_opt, " took:", time.strftime("%Hh%Mm%Ss", time.gmtime(elapsed)))

##    if save_stats:
##        output_folder = os.path.join(os.getcwd(), 'results')
##        os.makedirs(output_folder, exist_ok=True)
##        fname = output_folder + '/stats_dict_epsilon_opt.pkl'
##        with open(fname, 'wb') as handle:
##            pickle.dump(stats_dict_epsilon_opt, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Optionally replace the in-memory statistics with the pickled copy in ./results.
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/stats_dict_epsilon_opt.pkl'
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
    with open(fname, 'rb') as handle:
        stats_dict_eps_opt_list = pickle.load(handle)

# Raw string: "\e" is not a valid Python escape sequence.
plot_epsilon_opt = [0., 0.5, 1.]
plot_stats(stats_dict_eps_opt_list, plot_epsilon_opt, "epsilon_opt", r"\epsilon_{opt}", save=True)
# plot_stats(stats_dict_epsilon_opt, vec_epsilon_opt, "epsilon_opt", "\epsilon_{opt}", save=False)

### 2.2 Learning by self-practice

#### Question 7: Self-learning with constant rate of exploration $\epsilon$

In [None]:
# Parameter sets for self-practice with a constant exploration rate.
epsilon_vec = np.array([0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1])
var_name = 'eps_self'
q_learning_params_list = []
for eps in epsilon_vec:
    # Both statements of the loop body must share one indentation level; the
    # previous 5-space / 4-space mix raised IndentationError.
    params = {'env': env,
              'num_episodes': num_episodes,
              'epsilon_exploration': eps,
              'test_freq': test_freq,
              'self_practice': True}
    q_learning_params_list.append(params)
# epsilon_vec = np.hstack((epsilon_vec_plot, np.setdiff1d(np.logspace(-4, -1, 10), epsilon_vec_plot)))

In [None]:
# train_avg sweep over epsilon_vec (self-practice)
stats_dict_eps_self_list = train_avg(
    var_name,
    epsilon_vec,
    q_learning_params_list,
    num_avg=num_avg,
    save_stats=save_stats,
)

In [None]:
## Disabled earlier version of the self-practice epsilon sweep, kept for reference.
## Every line must stay commented: a stray uncommented continuation line
## previously made this cell raise IndentationError.
## if not load:
##    env = TictactoeEnv()
##    stats_dict_eps_self = {}
##    for epsilon in epsilon_vec:
##        print("------------- Training with epsilon =", epsilon, "-------------")
##        Q, stats = q_learning(env, num_episodes=num_episodes,
##                            epsilon_exploration=epsilon, test_freq = 250, verbose=False, self_practice=True)
##        M_opt = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=0.), num_episodes=2000)
##        M_rand = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=1.), num_episodes=2000)
##        print("M_opt =", M_opt)
##        print("M_rand =", M_rand)
##        stats_dict_eps_self.update({epsilon: (stats, M_opt, M_rand)})

##    if save_stats:
##        output_folder = os.path.join(os.getcwd(), 'results')
##        os.makedirs(output_folder, exist_ok=True)
##        fname = output_folder + '/stats_dict_eps_self.pkl'
##        with open(fname, 'wb') as handle:
##            pickle.dump(stats_dict_eps_self, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Optionally replace the in-memory statistics with the pickled copy in ./results.
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/stats_dict_eps_self.pkl'
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
    with open(fname, 'rb') as handle:
        stats_dict_eps_self_list = pickle.load(handle)

# Raw string: "\e" is not a valid Python escape sequence.
epsilon_vec_plot = [0, 0.001, 0.01, 0.1] # are these okay?
plot_stats(stats_dict_eps_self_list, epsilon_vec_plot, "epsilon_self", r"\epsilon", save=save_figs)
# plot_stats(stats_dict_eps_self_list, epsilon_vec, "epsilon_self", "\epsilon", save=False)

#### Question 8: Decaying exploration rule for different values of $n^*$

In [None]:
# Exploration bounds and n* grid for the self-practice experiments
epsilon_min = 0.1
epsilon_max = 0.8
log_spaced = np.round(np.logspace(3, np.log10(40000), 16))
vec_n_star = np.hstack((np.array([1, 100, 500, 750]), log_spaced))

In [None]:
# Parameter sets for the first ten n* values, trained by self-practice.
var_name = 'nstar_first_self'
vec_n_star_first = vec_n_star[:10]
q_learning_params_list = []
for n_star in vec_n_star_first:
    # Bind the current n_star (and the epsilon bounds) as default arguments.
    # A plain closure looks n_star up when the rule is *called*, i.e. after this
    # loop has finished, so every stored rule would use the last n_star.
    def epsilon_exploration_rule(n, n_star=n_star, epsilon_min=epsilon_min, epsilon_max=epsilon_max):
        """Return max(epsilon_min, epsilon_max * (1 - n / n_star)) for this n_star."""
        return np.max([epsilon_min, epsilon_max * (1 - n/n_star)])
    params = {'env': env,
              'num_episodes': num_episodes,
              'epsilon_exploration_rule': epsilon_exploration_rule,
              'test_freq': test_freq,
              'self_practice': True}
    q_learning_params_list.append(params)
save_stats = True
load = False

In [None]:
# train_avg sweep over vec_n_star_first (self-practice)
stats_dict_nstar_self_first_list = train_avg(
    var_name,
    vec_n_star_first,
    q_learning_params_list,
    num_avg=num_avg,
    save_stats=save_stats,
)

In [None]:
# Parameter sets for the remaining n* values (plus n* = 1), trained by self-practice.
# Fixes to copy-paste slips in this cell:
#   * the label now parallels 'nstar_first_self' instead of reusing the
#     expert-experiment label 'nstar_second';
#   * the vector iterated below is now defined here as vec_n_star_second
#     (slice [10:]), instead of overwriting vec_n_star_first and silently
#     relying on a vec_n_star_second left over from an earlier cell.
var_name = 'nstar_second_self'
vec_n_star_second = np.hstack([[1], vec_n_star[10:]])
q_learning_params_list = []
for n_star in vec_n_star_second:
    # Bind the current n_star (and the epsilon bounds) as default arguments.
    # A plain closure looks n_star up when the rule is *called*, i.e. after this
    # loop has finished, so every stored rule would use the last n_star.
    def epsilon_exploration_rule(n, n_star=n_star, epsilon_min=epsilon_min, epsilon_max=epsilon_max):
        """Return max(epsilon_min, epsilon_max * (1 - n / n_star)) for this n_star."""
        return np.max([epsilon_min, epsilon_max * (1 - n/n_star)])
    params = {'env': env,
              'num_episodes': num_episodes,
              'epsilon_exploration_rule': epsilon_exploration_rule,
              'test_freq': test_freq,
              'self_practice': True}
    q_learning_params_list.append(params)
save_stats = True
load = False

In [None]:
# train_avg sweep over vec_n_star_second (self-practice)
stats_dict_nstar_self_second_list = train_avg(
    var_name,
    vec_n_star_second,
    q_learning_params_list,
    num_avg=num_avg,
    save_stats=save_stats,
)

In [None]:
## load = True
## if not load:
##    stats_dict_nstar_self = {}
##    for n_star in vec_n_star:
##        print("------------- Training with n_star =", n_star, "-------------")
##        def epsilon_exploration_rule(n):
##            return np.max([epsilon_min, epsilon_max * (1 - n/n_star)])
##        start = time.time()
##        Q, stats = q_learning(env, epsilon_exploration_rule=epsilon_exploration_rule,
##                                test_freq=test_freq,  num_episodes=num_episodes, verbose=False, self_practice=True)
##        M_opt = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=0.), num_episodes=200)
##        M_rand = measure_performance(QPlayer(Q=Q), OptimalPlayer(epsilon=1.), num_episodes=200)
##        print("M_opt =", M_opt)
##        print("M_rand =", M_rand)
##        stats_dict_nstar_self.update({n_star: (stats, M_opt, M_rand)})
##        elapsed = time.time() - start
##        print("Training with n_star =", n_star, "took:", time.strftime("%Hh%Mm%Ss", time.gmtime(elapsed)), "\n\n")

##    if save_stats:
##        output_folder = os.path.join(os.getcwd(), 'results')
##        os.makedirs(output_folder, exist_ok=True)
##        fname = output_folder + '/stats_dict_nstar_self.pkl'
##        with open(fname, 'wb') as handle:
##            pickle.dump(stats_dict_nstar_self, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Optionally load the pickled self-practice statistics from ./results.
# NOTE(review): the loaded `stats_dict_nstar_self_list` is not used by the
# plots below — confirm whether this load is still needed.
if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/stats_dict_nstar_self.pkl'
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
    with open(fname, 'rb') as handle:
        stats_dict_nstar_self_list = pickle.load(handle)

# Raw strings below: "\s" is not a valid Python escape sequence.
# NOTE(review): 4373, 24460 and 40000 belong to vec_n_star[10:], not to
# vec_n_star_first — verify these are the intended values for this plot.
plot_n_star = [1., 4373., 24460., 40000]
plot_stats(stats_dict_nstar_self_first_list, plot_n_star, 'n_star_self_1', r"n^{\star}", save=save_figs)

plot_n_star = [1., 4373., 9146.]
plot_stats(stats_dict_nstar_self_second_list, plot_n_star, 'n_star_self_2', r"n^{\star}", save=save_figs)
# plot_stats(stats_dict_nstar_self_list, vec_n_star, 'n_star', "n^{\star}", save=False)

#### Question 10: Heatmaps of the Q-values in 3 significant states

In [None]:
# Optimal parameters for Q
np.random.seed(seed)
epsilon_min, epsilon_max = 0.1, 0.8
n_star = 4373


def epsilon_exploration_rule(n):
    """Exploration rate: epsilon_max scaled by (1 - n / n_star), floored at epsilon_min."""
    decayed = epsilon_max * (1 - n/n_star)
    return np.max([epsilon_min, decayed])


Q, stats = q_learning(
    env,
    epsilon_exploration_rule=epsilon_exploration_rule,
    num_episodes=20000,
    verbose=True,
    self_practice=True,
)

In [None]:
# The three states (nine entries each) whose Q-values are shown as heatmaps
grids = [
    [1., 0., 0., -1., 1., 0., 0., -1., 0.],
    [-1., 0., 0., 1., -1., 0., 0., 1., 0.],
    [1., -1., 0., 0., 0., 0., 0., 0., 0.],
]
heatmaps_subplots(grids, Q)

## Deep Q-Learning