In [1]:
import math
import operator
import numpy as np
import matplotlib.pyplot as plt
from numpy.random import default_rng
from tic_env import TictactoeEnv, OptimalPlayer
from QLearning_env import QPlayer, QTraining, get_state_best_Q
from utils import plots_several_trainings, plots_several_trainings_subfigures, plot_game_heatmaps
# from utils import action_to_key, grid_to_string
from tqdm import tqdm
import random

In [None]:
# Part 2.1 (question 1)
# Baseline run: Q-learning with a constant exploration rate, followed by the
# two built-in training plots.


def eps_policy(ep):
    """Exploration schedule that ignores the game index `ep` and always returns 0.1."""
    return 0.1


training = QTraining(eps_policy)
# First positional argument is presumably the opponent's epsilon (the
# Part 2.1.2 cell passes `epsilon_opt` in this position) -- TODO confirm.
training.train(0.5, run_test=True)

training.plot_mean_reward_during_training()
training.plot_mopt_mrng_during_training()

 34%|████████████████████████████████████████████████████████████▍                                                                                                                      | 6746/20000 [01:17<01:01, 216.36it/s]

In [None]:
# Part 2.1.1
# Sweep over n_star for the decreasing exploration schedule
#     eps(ep) = max(epsilon_min, epsilon_max * (1 - ep / n_star))
# and collect, for every run, the training reward curve and the test scores.
n_stars = [1, 5000, 10000, 15000, 25000, 40000]
epsilon_min = 0.1
epsilon_max = 0.8

values = []            # one `avg_reward` entry per n_star
values_mopt_mrng = []  # one [score_test_opt, score_test_rng] pair per n_star
names = []             # legend labels, in the same order as n_stars
epochs = 0             # overwritten by every run; the last run's value is kept
avg_step = 0           # overwritten by every run; the last run's value is kept

for n_star in n_stars:
    def epsilon_greedy_policy(ep, n_star=n_star, eps_min=epsilon_min, eps_max=epsilon_max):
        """Return the exploration rate for game index `ep`.

        The rate decreases linearly from `eps_max` and is floored at `eps_min`.
        The schedule parameters are bound as defaults when the function is
        created, so the policy does not silently change if `n_star`,
        `epsilon_min` or `epsilon_max` are reassigned later in the notebook.
        """
        return max(eps_min, eps_max * (1 - ep / n_star))

    training2 = QTraining(epsilon_greedy_policy)
    training2.train(0.5, run_test=True)
    epochs = training2.epoch
    avg_step = training2.avg_step

    values.append(training2.avg_reward)
    values_mopt_mrng.append([training2.score_test_opt, training2.score_test_rng])
    names.append(f"$n^*=${n_star}")

In [None]:
# (question 2)
# Plot the collected training reward curves (`values`, labelled by `names`)
# with the project helper. `avg_step` and `epochs` hold the values left by the
# last run of the sweep cell executed before this one.
plots_several_trainings(values, names, avg_step, epochs)

In [None]:
# Same inputs as the previous plot, passed to the subfigures helper
# (presumably one panel per run -- see utils to confirm).
plots_several_trainings_subfigures(values, names, avg_step, epochs)

In [None]:
# (question 3)
# Plot the [score_test_opt, score_test_rng] pairs collected for every run.
# The step comes from `training2.test_step`, i.e. from whichever QTraining
# object was created last (the final run of the sweep).
plots_several_trainings_subfigures(values_mopt_mrng, names, training2.test_step, epochs, mopt_mrng=True)

In [None]:
# (DEBUG) Plotting just one of the curves
# Draws only the curve stored at index 2 of `values`, against the number of
# games played (one point every `avg_step` games).

curve_idx = 2
games_played = range(0, epochs, avg_step)

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(games_played, values[curve_idx], label=names[curve_idx], lw=2)

ax.set_xlabel('Number of games played', fontsize=20)
ax.set_ylabel('Mean reward over {} games'.format(avg_step), fontsize=20)
ax.set_title(
    'Evolution of mean reward (every {} games played) of the learner'.format(avg_step),
    fontsize=20,
)
ax.grid()
ax.legend(loc=2)
plt.show()

In [None]:
# (DEBUG) Avg reward
# Print the mean of every collected reward curve, one value per line,
# in the same order as `values`.

mean_rewards = [np.mean(reward_curve) for reward_curve in values]
for mean_reward in mean_rewards:
    print(mean_reward)

# ==> it seems that the lower n_star is, the faster we learn, which does not make intuitive sense

In [None]:
# Part 2.1.2
# Sweep over the value passed as first argument to `train` (named epsilon_opt
# here), using the decreasing exploration schedule with the selected n_star.

# TODO: how do we want to select the "best" n_star? Atm just one with best average reward during whole training
# NOTE: `values` must still hold the results of the n_star sweep when this line
# runs. The list is overwritten further down in this same cell, so re-running
# this cell on its own would rank the epsilon_opt runs instead.
n_star_best = n_stars[np.argmax(np.mean(values, axis=1))]

epsilon_opts = [0.0, 0.1, 0.3, 0.5, 0.8, 1.0]
epsilon_min = 0.1
epsilon_max = 0.8


def epsilon_greedy_policy(ep, n_star=n_star_best, eps_min=epsilon_min, eps_max=epsilon_max):
    """Return the exploration rate for game index `ep`.

    Decreases linearly from `eps_max` and is floored at `eps_min`. The
    parameters are bound when the function is defined, so later reassignment
    of `n_star_best`, `epsilon_min` or `epsilon_max` does not alter it.
    """
    return max(eps_min, eps_max * (1 - ep / n_star))


values = []            # one `avg_reward` entry per epsilon_opt
values_mopt_mrng = []  # one [score_test_opt, score_test_rng] pair per epsilon_opt
names = []             # legend labels, in the same order as epsilon_opts
epochs = 0             # overwritten by every run; the last run's value is kept
avg_step = 0           # overwritten by every run; the last run's value is kept

for epsilon_opt in epsilon_opts:
    training2 = QTraining(epsilon_greedy_policy)
    training2.train(epsilon_opt, run_test=True)
    epochs = training2.epoch
    avg_step = training2.avg_step

    values.append(training2.avg_reward)
    values_mopt_mrng.append([training2.score_test_opt, training2.score_test_rng])
    # Raw f-string with doubled braces yields the mathtext `$\epsilon_{opt}=$`.
    # The previous `$epsilon_opt=$` rendered as the letters "epsilon" with a
    # subscript "o" followed by "pt=".
    names.append(rf"$\epsilon_{{opt}}=${epsilon_opt}")

In [None]:
# (question 4)
# Plot the training reward curves currently stored in `values` (labelled by
# `names`). `avg_step` and `epochs` are those left by the last training run.
plots_several_trainings(values, names, avg_step, epochs)

In [None]:
# Same inputs as the previous plot, passed to the subfigures helper
# (presumably one panel per run -- see utils to confirm).
plots_several_trainings_subfigures(values, names, avg_step, epochs)

In [None]:
# Plot the [score_test_opt, score_test_rng] pairs collected for every run.
# The step comes from `training2.test_step`, i.e. from whichever QTraining
# object was created last.
plots_several_trainings_subfigures(values_mopt_mrng, names, training2.test_step, epochs, mopt_mrng=True)

In [None]:
# (DEBUG) Note: against Opt(0), it looks like at some point the agent finds a perfect strategy that always ends in a draw

In [None]:
# (question 5)
# NOTE(review): get_max_Mopt_Mrng_for_epsilon is not among the names imported
# in the first cell of this notebook. Unless it is defined somewhere not shown
# here, this call raises NameError on a fresh kernel -- presumably it should be
# added to the `from utils import ...` line (confirm where it is defined).
get_max_Mopt_Mrng_for_epsilon(values_mopt_mrng,epsilon_opts, "epsilon-opt")

Looking at the figures above, we achieve a mean reward of 0 (on testing) in every case except when training against a fully random adversary. The best $\epsilon_{opt}$ therefore seems to be 0.8.

In [None]:
# (question 7)
# Self-learning runs with a fixed exploration rate: one training per value in
# `eps`, each followed by its two built-in training plots.

eps = [0.5]
# Presumably the full sweep; only one value is enabled above.
#eps = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9]


values = []            # one `avg_reward` entry per epsilon
values_mopt_mrng = []  # one [score_test_opt, score_test_rng] pair per epsilon
names = []             # legend labels, in the same order as eps
epochs = 0             # overwritten by every run; the last run's value is kept
avg_step = 0           # overwritten by every run; the last run's value is kept

for eps_value in eps:
    def epsilon_greedy_policy(ep, eps_value=eps_value):
        """Constant schedule: return `eps_value` whatever the game index `ep`.

        The previous `lambda ep: ep` shadowed the loop variable with its own
        parameter and therefore returned the game index itself as epsilon.
        """
        return eps_value

    training = QTraining(epsilon_greedy_policy)
    training.train(eps_value, run_test=True, self_learning=True)
    epochs = training.epoch
    avg_step = training.avg_step

    values.append(training.avg_reward)
    # Read the test scores from this run's object. The previous code read them
    # from `training2`, a stale object left over from an earlier cell.
    values_mopt_mrng.append([training.score_test_opt, training.score_test_rng])
    names.append(f"$epsilon=${eps_value}")
    training.plot_mean_reward_during_training()
    training.plot_mopt_mrng_during_training()

In [None]:
# Self-learning variant of the n_star sweep: same decreasing exploration
# schedule eps(ep) = max(epsilon_min, epsilon_max * (1 - ep / n_star)),
# trained with self_learning=True.
n_stars = [1, 5000, 10000, 15000, 25000, 40000]
epsilon_min = 0.1
epsilon_max = 0.8

values = []            # one `avg_reward` entry per n_star
values_mopt_mrng = []  # one [score_test_opt, score_test_rng] pair per n_star
names = []             # legend labels, in the same order as n_stars
epochs = 0             # overwritten by every run; the last run's value is kept
avg_step = 0           # overwritten by every run; the last run's value is kept

for n_star in n_stars:
    def epsilon_greedy_policy(ep, n_star=n_star, eps_min=epsilon_min, eps_max=epsilon_max):
        """Return the exploration rate for game index `ep`.

        The rate decreases linearly from `eps_max` and is floored at `eps_min`.
        The schedule parameters are bound as defaults when the function is
        created, so the policy does not silently change if `n_star`,
        `epsilon_min` or `epsilon_max` are reassigned later in the notebook.
        """
        return max(eps_min, eps_max * (1 - ep / n_star))

    training2 = QTraining(epsilon_greedy_policy)
    training2.train(0.5, run_test=True, self_learning=True)
    epochs = training2.epoch
    avg_step = training2.avg_step

    values.append(training2.avg_reward)
    values_mopt_mrng.append([training2.score_test_opt, training2.score_test_rng])
    names.append(f"$n^*=${n_star}")

In [None]:
# Plot the training reward curves currently stored in `values` (labelled by
# `names`). `avg_step` and `epochs` are those left by the last training run.
plots_several_trainings(values, names, avg_step, epochs)

In [None]:
# Same inputs as the previous plot, passed to the subfigures helper
# (presumably one panel per run -- see utils to confirm).
plots_several_trainings_subfigures(values, names, avg_step, epochs)

In [None]:
# (question 3)
# NOTE(review): the "question 3" label above looks copied from the earlier
# n_star sweep. When run after the self-learning sweep cell, this plots the
# self-learning test scores -- confirm the intended question number.
# The step comes from `training2.test_step` (the last QTraining object created).
plots_several_trainings_subfigures(values_mopt_mrng, names, training2.test_step, epochs, mopt_mrng=True)
# TODO: compare in a plot best eps fixed with best epsilon decreasing

In [None]:
# (question 9)
# NOTE(review): get_max_Mopt_Mrng_for_epsilon is not among the names imported
# in the first cell of this notebook. Unless it is defined somewhere not shown
# here, this call raises NameError on a fresh kernel -- presumably it should be
# added to the `from utils import ...` line (confirm where it is defined).
get_max_Mopt_Mrng_for_epsilon(values_mopt_mrng, n_stars, "n_star")

In [None]:
# (question 10)
# Train a self-learning agent with the decreasing exploration schedule, then
# plot heatmaps of its Q-values for three hand-picked board states.

# Each board is a 9-character string read row by row; '-' is an empty cell.
states = ["---------", "XX-OO----", "X--O-X-O-"]
titles = ["Starting board", "Winning position", "Tactical move"]

# situation1: Starting board
#   - - -
#   - - -
#   - - -
#
# situation2: Winning position
#   X X -
#   O O -
#   - - -
#
# situation3: Tactical move
#   X - -
#   O - X
#   - O -

# Train the self-learners to obtain final Q-values
epsilon_min = 0.1
epsilon_max = 0.8
n_star = 10000 # TODO change to the best n_star


def _decreasing_epsilon(ep, n_star=n_star, eps_min=epsilon_min, eps_max=epsilon_max):
    """Return the exploration rate for game index `ep`.

    Decreases linearly from `eps_max` and is floored at `eps_min`. The
    parameters are bound when the function is defined, so later reassignment
    of `n_star`, `epsilon_min` or `epsilon_max` does not alter the schedule.
    """
    return max(eps_min, eps_max * (1 - ep / n_star))


training2 = QTraining(_decreasing_epsilon)
training2.train(0.5, run_test=False, self_learning=True)

plot_game_heatmaps(states, training2.Q_vals, titles)