# BRAZ Lucas & DURAND Pierre-Alain
## SCIPER: 343141 & SCIPER: 344313

In [None]:
import multiprocessing as mp
import random
from collections import deque

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import torch
import torch.nn.functional as nn_functional
from joblib import Parallel, delayed
from plotly.subplots import make_subplots

from src.utils import DQNAgent, OptimalPlayer, QlearningAgent, play_games, play_games_with_m

In [None]:
# Seed every random number generator this notebook imports so that runs are
# repeatable: Python's `random`, NumPy's global generator and PyTorch (used by
# the Deep Q-Learning cells).
# NOTE(review): the joblib workers started further down presumably do not
# inherit this RNG state - confirm whether per-worker seeding is needed.
SEED = 343141

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Abort early when no CUDA device is available (the message explains why a GPU
# is wanted). RuntimeError is still an Exception subclass, so anything that
# caught the previous bare Exception keeps working.
if not torch.cuda.is_available():
    raise RuntimeError("Things will go much quicker if you use a GPU")

# 2 *Q*-Learning

## 2.1 Learning from experts

#### Question 1

In [None]:
# Question 1: a Q-learning agent with a fixed epsilon plays 20,000 games against
# OptimalPlayer(epsilon=0.5); the values returned by play_games are averaged per
# block of `group_size` games and plotted.
epsilon = 0.1

player_opt = OptimalPlayer(epsilon=0.5)
agent = QlearningAgent(epsilon=epsilon)

winner_list = play_games(player_opt, agent, max_games=20_000)

group_size = 250
# One row per block of `group_size` consecutive games, then one mean per row.
y = winner_list.reshape((winner_list.size // group_size, group_size)).mean(axis=1)
# Index of the first game of each block.
x = group_size * np.arange(y.size)

fig = px.line(
    x=x,
    y=y,
    title=f"Average reward over time of RL agent with policy epsilon={epsilon}",
)
fig.update_layout(
    width=1000,
    xaxis_title="Game number",
    yaxis_title="Average reward",
)
fig.show()

### 2.1.1 Decreasing exploration

#### Question 2

In [None]:
# Question 2: one Q-learning agent per n* value, each trained in its own joblib
# worker against the same OptimalPlayer(epsilon=0.5) configuration.
n_max_list = [1, 100, 1_000, 10_000, 20_000, 30_000, 40_000]

epsilon = (0.1, 0.8)  # forwarded unchanged to QlearningAgent(epsilon=...)

max_games_total = 20_000
group_size = 250
# x-axis: index of the first game of every averaged block.
df = {"Game number": group_size * np.arange(max_games_total // group_size)}

player_opt = OptimalPlayer(epsilon=0.5)

# Never start more workers than there are n* values to evaluate.
num_cores = min(len(n_max_list), mp.cpu_count())


def parallel_games(n_max, player_opt_, epsilon_, max_games_total_, group_size_):
    """Train one Q-learning agent and return its block-averaged results.

    Args:
        n_max: Passed to ``QlearningAgent(n_max=...)``; also used in the column label.
        player_opt_: Opponent handed to ``play_games``.
        epsilon_: Passed to ``QlearningAgent(epsilon=...)``.
        max_games_total_: Number of games requested from ``play_games``.
        group_size_: Number of consecutive games averaged into one value.

    Returns:
        A one-entry dict mapping ``"Average reward (n*=<n_max>)"`` to the
        per-block means.
    """
    learner = QlearningAgent(epsilon=epsilon_, n_max=n_max)
    outcomes = play_games(player_opt_, learner, max_games=max_games_total_)

    n_blocks = outcomes.size // group_size_
    block_means = outcomes.reshape(n_blocks, group_size_).mean(axis=1)

    return {f"Average reward (n*={n_max})": block_means}


dfs = Parallel(n_jobs=num_cores)(
    delayed(parallel_games)(n_star, player_opt, epsilon, max_games_total, group_size)
    for n_star in n_max_list
)

# Merge the one-column dict of every worker into the shared column mapping.
for d in dfs:
    df.update(d)

df = pd.DataFrame(df)
fig = px.line(
    df,
    x="Game number",
    # Every column except the x-axis is drawn as its own line.
    y=df.columns.difference(["Game number"]),
    title=f"Average reward over time of RL agent with policy epsilon={epsilon}",
)
fig.update_layout(width=1000)
fig.show()

#### Question 3

In [None]:
# Question 3: record the m_opt and m_random series returned by
# play_games_with_m for several n* values, one joblib worker per value.
n_max_list = [1, 100, 10_000, 40_000]

epsilon = (0.1, 0.8)

max_games_total = 20_000
group_size = 250
df = {"Game number": np.arange(max_games_total // group_size) * group_size}


num_cores = min(len(n_max_list), mp.cpu_count())


def parallel_games(n_max, epsilon_, max_games_total_, group_size_):
    """Train one Q-learning agent against OptimalPlayer(epsilon=0.5).

    Args:
        n_max: Passed to ``QlearningAgent(n_max=...)``; also used in the column labels.
        epsilon_: Passed to ``QlearningAgent(epsilon=...)``.
        max_games_total_: Third positional argument of ``play_games_with_m``.
        group_size_: Fourth positional argument of ``play_games_with_m``.

    Returns:
        A dict with the ``m_opt`` and ``m_random`` values returned by
        ``play_games_with_m``, keyed by labels that mention ``n_max``.
    """
    player_opt_ = OptimalPlayer(epsilon=0.5)
    agent_ = QlearningAgent(epsilon=epsilon_, n_max=n_max)

    # The first return value (the winner list) is not plotted in this cell.
    _, m_opt, m_random = play_games_with_m(player_opt_, agent_, max_games_total_, group_size_)

    return {
        f"m_opt (n*={n_max})": m_opt,
        f"m_random (n*={n_max})": m_random,
    }


dfs = Parallel(n_jobs=num_cores)(
    delayed(parallel_games)(n_max, epsilon, max_games_total, group_size) for n_max in n_max_list
)

# Merge the columns produced by every worker.
for d in dfs:
    df.update(d)

df = pd.DataFrame(df)
fig = px.line(
    df,
    x="Game number",
    y=df.columns.difference(["Game number"]),
    # The plotted columns are m_opt / m_random, not the average reward.
    title=f"m_opt and m_random over time of RL agent with policy epsilon={epsilon}",
)
fig.update_layout(width=1000)
fig.show()

### 2.1.2 Good experts and bad experts

#### Question 4

In [None]:
# Question 4: train a Q-learning agent (epsilon=(0.1, 0.8), n_max=1) against
# OptimalPlayer opponents with different epsilon values, one worker per value.
epsilon_opt_list = [0, 0.1, 0.7, 1.0]

n_max = 1

epsilon = (0.1, 0.8)

max_games_total = 20_000
group_size = 250
df = {"Game number": np.arange(max_games_total // group_size) * group_size}

# Same rule as the other cells: no more workers than configurations to run.
num_cores = min(len(epsilon_opt_list), mp.cpu_count())


def parallel_games(epsilon_opt, max_games_total_, group_size_, epsilon_, n_max):
    """Train one Q-learning agent against ``OptimalPlayer(epsilon=epsilon_opt)``.

    Args:
        epsilon_opt: Epsilon of the opponent; also used in the column labels.
        max_games_total_: Third positional argument of ``play_games_with_m``.
        group_size_: Fourth positional argument of ``play_games_with_m``.
        epsilon_: Passed to ``QlearningAgent(epsilon=...)``.
        n_max: Passed to ``QlearningAgent(n_max=...)``.

    Returns:
        A dict with the ``m_opt`` and ``m_random`` values returned by
        ``play_games_with_m``, keyed by labels that mention ``epsilon_opt``.
    """
    agent_ = QlearningAgent(epsilon=epsilon_, n_max=n_max)
    player_opt_ = OptimalPlayer(epsilon=epsilon_opt)

    # The first return value (the winner list) is not plotted in this cell.
    _, m_opt, m_random = play_games_with_m(player_opt_, agent_, max_games_total_, group_size_)

    return {
        f"m_opt (epsilon_opt={epsilon_opt})": m_opt,
        f"m_random (epsilon_opt={epsilon_opt})": m_random,
    }


dfs = Parallel(n_jobs=num_cores)(
    delayed(parallel_games)(epsilon_opt, max_games_total, group_size, epsilon, n_max)
    for epsilon_opt in epsilon_opt_list
)

# Merge the columns produced by every worker.
for d in dfs:
    df.update(d)

df = pd.DataFrame(df)
fig = px.line(
    df,
    x="Game number",
    y=df.columns.difference(["Game number"]),
    # The plotted columns are m_opt / m_random, not the average reward.
    title=f"m_opt and m_random over time of RL agent with policy epsilon={epsilon}",
)
fig.update_layout(width=1000)
fig.show()

## 2.2 Learning by self-practice

#### Question 7

In [None]:
# Question 7: self-practice with tabular Q-learning for several fixed epsilon
# values, one joblib worker per value.
epsilon_list = [0, 0.25, 0.5, 0.75, 1.0]

n_max = 1

max_games_total = 20_000
group_size = 250
df = {"Game number": np.arange(max_games_total // group_size) * group_size}

# Same rule as the other cells: no more workers than configurations to run.
num_cores = min(len(epsilon_list), mp.cpu_count())


def parallel_games(epsilon_, max_games_total_, group_size_, n_max):
    """Let a Q-learning agent play against a second agent built from its own settings.

    The second agent is constructed with the first one's ``epsilon``,
    ``learning_rate``, ``discount_factor`` and ``n_max`` and is handed the very
    same ``q`` object.

    Args:
        epsilon_: Passed to ``QlearningAgent(epsilon=...)``; also used in the column labels.
        max_games_total_: Third positional argument of ``play_games_with_m``.
        group_size_: Fourth positional argument of ``play_games_with_m``.
        n_max: Passed to ``QlearningAgent(n_max=...)``.

    Returns:
        A dict with the ``m_opt`` and ``m_random`` values returned by
        ``play_games_with_m``, keyed by labels that mention ``epsilon_``.
    """
    agent_ = QlearningAgent(epsilon=epsilon_, n_max=n_max)
    player_opt_ = QlearningAgent(
        epsilon=agent_.epsilon,
        learning_rate=agent_.learning_rate,
        discount_factor=agent_.discount_factor,
        n_max=agent_.n_max,
        q=agent_.q,
    )

    # The first return value (the winner list) is not plotted in this cell.
    _, m_opt, m_random = play_games_with_m(player_opt_, agent_, max_games_total_, group_size_)

    return {
        f"m_opt (epsilon={epsilon_})": m_opt,
        f"m_random (epsilon={epsilon_})": m_random,
    }


dfs = Parallel(n_jobs=num_cores)(
    delayed(parallel_games)(epsilon, max_games_total, group_size, n_max) for epsilon in epsilon_list
)

# Merge the columns produced by every worker.
for d in dfs:
    df.update(d)

# `df` is still a plain dict at this point; it has to become a DataFrame before
# `df.columns` can be used below (the conversion was missing in this cell).
df = pd.DataFrame(df)
fig = px.line(
    df,
    x="Game number",
    y=df.columns.difference(["Game number"]),
    # The plotted columns are m_opt / m_random, not the average reward.
    title="m_opt and m_random over time of RL agent against himself",
)
fig.update_layout(width=1000)
fig.show()

#### Question 8

In [None]:
# Question 8: self-practice with tabular Q-learning for several n* values, one
# joblib worker per value.
n_max_list = [1, 1_000, 10_000, 40_000]

epsilon = (0.1, 0.8)

max_games_total = 20_000
group_size = 250
df = {"Game number": np.arange(max_games_total // group_size) * group_size}

num_cores = min(len(n_max_list), mp.cpu_count())


def parallel_games(n_max, epsilon_, max_games_total_, group_size_):
    """Let a Q-learning agent play against a second agent built from its own settings.

    The second agent is constructed with the first one's ``epsilon``,
    ``learning_rate``, ``discount_factor`` and ``n_max`` and is handed the very
    same ``q`` object.

    Args:
        n_max: Passed to ``QlearningAgent(n_max=...)``; also used in the column labels.
        epsilon_: Passed to ``QlearningAgent(epsilon=...)``.
        max_games_total_: Third positional argument of ``play_games_with_m``.
        group_size_: Fourth positional argument of ``play_games_with_m``.

    Returns:
        A dict with the ``m_opt`` and ``m_random`` values returned by
        ``play_games_with_m``, keyed by labels that mention ``n_max``.
    """
    agent_ = QlearningAgent(epsilon=epsilon_, n_max=n_max)
    player_opt_ = QlearningAgent(
        epsilon=agent_.epsilon,
        learning_rate=agent_.learning_rate,
        discount_factor=agent_.discount_factor,
        n_max=agent_.n_max,
        q=agent_.q,
    )

    # The first return value (the winner list) is not plotted in this cell.
    _, m_opt, m_random = play_games_with_m(player_opt_, agent_, max_games_total_, group_size_)

    return {
        f"m_opt (n*={n_max})": m_opt,
        f"m_random (n*={n_max})": m_random,
    }


dfs = Parallel(n_jobs=num_cores)(
    delayed(parallel_games)(n_max, epsilon, max_games_total, group_size) for n_max in n_max_list
)

# Merge the columns produced by every worker.
for d in dfs:
    df.update(d)

df = pd.DataFrame(df)
fig = px.line(
    df,
    x="Game number",
    y=df.columns.difference(["Game number"]),
    # The plotted columns are m_opt / m_random, not the average reward.
    title=f"m_opt and m_random over time of RL agent with policy epsilon={epsilon}",
)
fig.update_layout(width=1000)
fig.show()

#### Question 10

In [None]:
def _q_table_heatmap(q_agent, board):
    """Build a heatmap of the tabular Q-values of all nine cells of ``board``.

    Args:
        q_agent: Agent whose ``q`` attribute can be indexed with
            ``(board, (row, col))`` and exposes a ``q_tab`` mapping.
        board: 3x3 NumPy array; entries 1 / -1 / 0 are displayed as "X" / "O" / "-".

    Returns:
        A ``go.Heatmap`` whose colour range spans the smallest to the largest
        value stored in ``q_agent.q.q_tab``.
    """
    # Going through float makes every entry render as "1.0" / "-1.0" / "0.0",
    # which the replacements below rely on.
    symbols = board.copy().astype(float).astype(str)
    symbols[symbols == "1.0"] = "X"
    symbols[symbols == "-1.0"] = "O"
    symbols[symbols == "0.0"] = "-"

    # Look the Q-values up first, as before, so the colour range below is
    # computed after those lookups.
    q_grid = [[q_agent.q[board, (row, col)] for col in range(3)] for row in range(3)]

    return go.Heatmap(
        x=["0", "1", "2"],
        y=["0", "1", "2"],
        z=q_grid,
        text=symbols,
        texttemplate="%{text}",
        textfont={"size": 20},
        colorscale="gray",
        zmin=min(q_agent.q.q_tab.values()),
        zmax=max(q_agent.q.q_tab.values()),
    )


# NOTE(review): `agent` is whatever an earlier cell left in the notebook
# namespace; the parallel cells only create agents inside their workers -
# confirm this is the agent meant to be visualised here.
fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=("initial state", "State of highest Q-value", "State of lowest Q-value"),
)

# Empty board.
state = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]])
fig.add_trace(_q_table_heatmap(agent, state), row=1, col=1)

# Key with the largest stored Q-value; element 0 of reverse_hash(...) is used
# as the board (presumably the state part of the key - verify in src.utils).
state = agent.q.reverse_hash(max(agent.q.q_tab, key=agent.q.q_tab.get))[0]
fig.add_trace(_q_table_heatmap(agent, state), row=1, col=2)

# Same for the key with the smallest stored Q-value.
state = agent.q.reverse_hash(min(agent.q.q_tab, key=agent.q.q_tab.get))[0]
fig.add_trace(_q_table_heatmap(agent, state), row=1, col=3)

# The former `update_layout(width=1000)` was overridden by this call and is gone.
fig.update_layout(width=1300, height=500, title="Different Q-values for different states")

fig.show()

# 3 Deep *Q*-Learning

## 3.2 Learning from experts

#### Question 11

In [None]:
# Question 11: a DQN agent with a fixed epsilon plays 20,000 games against
# OptimalPlayer(epsilon=0.5); plot the block-averaged results and the agent's
# recorded loss curve.
epsilon = 0.1

player_opt = OptimalPlayer(epsilon=0.5)
agent = DQNAgent(epsilon=epsilon)

winner_list = play_games(player_opt, agent, max_games=20_000)

group_size = 250
# One row per block of `group_size` consecutive games, then one mean per row.
y = winner_list.reshape((winner_list.size // group_size, group_size)).mean(axis=1)
x = group_size * np.arange(y.size)

fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=(
        f"Average reward over time of DQN agent with policy epsilon={epsilon}",
        f"Loss over time of DQN agent with policy epsilon={epsilon}",
    ),
)

# Top panel: averaged results.
fig.add_trace(go.Scatter(x=x, y=y), row=1, col=1)
fig.update_xaxes(title_text="Game number", row=1, col=1)
fig.update_yaxes(title_text="Average reward", row=1, col=1)

# Bottom panel: one point per entry of the agent's loss curve.
fig.add_trace(
    go.Scatter(x=np.arange(len(agent.loss_curve)), y=agent.loss_curve),
    row=2,
    col=1,
)
fig.update_xaxes(title_text="Game number", row=2, col=1)
fig.update_yaxes(title_text="Loss", row=2, col=1)

fig.update_layout(height=700, width=1000)
fig.show()

#### Question 12

In [None]:
# Question 12: same experiment as the previous cell, but the DQN agent is built
# with batch_size=1 and a deque of maximum length 1 as its `r` argument.
epsilon = 0.1

player_opt = OptimalPlayer(epsilon=0.5)
agent = DQNAgent(epsilon=epsilon, batch_size=1, r=deque(maxlen=1))

winner_list = play_games(player_opt, agent, max_games=20_000)

group_size = 250
# One row per block of `group_size` consecutive games, then one mean per row.
y = winner_list.reshape((winner_list.size // group_size, group_size)).mean(axis=1)
x = group_size * np.arange(y.size)

fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=(
        f"Average reward over time of DQN agent with policy epsilon={epsilon}",
        f"Loss over time of DQN agent with policy epsilon={epsilon}",
    ),
)

# Top panel: averaged results.
fig.add_trace(go.Scatter(x=x, y=y), row=1, col=1)
fig.update_xaxes(title_text="Game number", row=1, col=1)
fig.update_yaxes(title_text="Average reward", row=1, col=1)

# Bottom panel: one point per entry of the agent's loss curve.
fig.add_trace(
    go.Scatter(x=np.arange(len(agent.loss_curve)), y=agent.loss_curve),
    row=2,
    col=1,
)
fig.update_xaxes(title_text="Game number", row=2, col=1)
fig.update_yaxes(title_text="Loss", row=2, col=1)

fig.update_layout(height=700, width=1000)
fig.show()

#### Question 13

In [None]:
# Question 13: train a DQN agent against OptimalPlayer(epsilon=0.5) for several
# n* values and record the m_opt / m_random series.
n_max_list = [1, 1_000, 10_000, 40_000]

epsilon = (0.1, 0.8)

max_games_total = 20_000
group_size = 250
df = {"Game number": np.arange(max_games_total // group_size) * group_size}


# This cell has always run with at most 2 joblib workers; keep that cap but
# express it through `num_cores`, which was previously computed and never used.
# NOTE(review): the cap of 2 presumably limits GPU contention - confirm.
num_cores = min(2, len(n_max_list), mp.cpu_count())


def parallel_games(n_max, epsilon_, max_games_total_, group_size_):
    """Train one DQN agent against ``OptimalPlayer(epsilon=0.5)``.

    Args:
        n_max: Passed to ``DQNAgent(n_max=...)``; also used in the column labels.
        epsilon_: Passed to ``DQNAgent(epsilon=...)``.
        max_games_total_: Third positional argument of ``play_games_with_m``.
        group_size_: Fourth positional argument of ``play_games_with_m``.

    Returns:
        A dict with the ``m_opt`` and ``m_random`` values returned by
        ``play_games_with_m``, keyed by labels that mention ``n_max``.
    """
    player_opt_ = OptimalPlayer(epsilon=0.5)
    agent_ = DQNAgent(epsilon=epsilon_, n_max=n_max)

    # The first return value (the winner list) is not plotted in this cell.
    _, m_opt, m_random = play_games_with_m(player_opt_, agent_, max_games_total_, group_size_)

    return {
        f"m_opt (n*={n_max})": m_opt,
        f"m_random (n*={n_max})": m_random,
    }


dfs = Parallel(n_jobs=num_cores)(
    delayed(parallel_games)(n_max, epsilon, max_games_total, group_size) for n_max in n_max_list
)

# Merge the columns produced by every worker.
for d in dfs:
    df.update(d)

df = pd.DataFrame(df)
fig = px.line(
    df,
    x="Game number",
    y=df.columns.difference(["Game number"]),
    # The plotted columns are m_opt / m_random, not the average reward.
    title=f"m_opt and m_random over time of DQN agent with policy epsilon={epsilon}",
)
fig.update_layout(width=1000)
fig.show()

#### Question 14

In [None]:
# Question 14: train a DQN agent (epsilon=(0.1, 0.8), n_max=10) against
# OptimalPlayer opponents with different epsilon values.
epsilon_opt_list = [0, 0.25, 0.5, 0.75, 1.0]

n_max = 10

epsilon = (0.1, 0.8)

max_games_total = 20_000
group_size = 250
df = {"Game number": np.arange(max_games_total // group_size) * group_size}

# This cell has always run with at most 2 joblib workers; keep that cap but
# express it through `num_cores`, which was previously computed and never used.
# NOTE(review): the cap of 2 presumably limits GPU contention - confirm.
num_cores = min(2, len(epsilon_opt_list), mp.cpu_count())


def parallel_games(epsilon_opt, max_games_total_, group_size_, epsilon_, n_max):
    """Train one DQN agent against ``OptimalPlayer(epsilon=epsilon_opt)``.

    Args:
        epsilon_opt: Epsilon of the opponent; also used in the column labels.
        max_games_total_: Third positional argument of ``play_games_with_m``.
        group_size_: Fourth positional argument of ``play_games_with_m``.
        epsilon_: Passed to ``DQNAgent(epsilon=...)``.
        n_max: Passed to ``DQNAgent(n_max=...)``.

    Returns:
        A dict with the ``m_opt`` and ``m_random`` values returned by
        ``play_games_with_m``, keyed by labels that mention ``epsilon_opt``.
    """
    agent_ = DQNAgent(epsilon=epsilon_, n_max=n_max)
    player_opt_ = OptimalPlayer(epsilon=epsilon_opt)

    # The first return value (the winner list) is not plotted in this cell.
    _, m_opt, m_random = play_games_with_m(player_opt_, agent_, max_games_total_, group_size_)

    return {
        f"m_opt (epsilon_opt={epsilon_opt})": m_opt,
        f"m_random (epsilon_opt={epsilon_opt})": m_random,
    }


dfs = Parallel(n_jobs=num_cores)(
    delayed(parallel_games)(epsilon_opt, max_games_total, group_size, epsilon, n_max)
    for epsilon_opt in epsilon_opt_list
)

# Merge the columns produced by every worker.
for d in dfs:
    df.update(d)

df = pd.DataFrame(df)
fig = px.line(
    df,
    x="Game number",
    y=df.columns.difference(["Game number"]),
    # The plotted columns are m_opt / m_random, not the average reward.
    title=f"m_opt and m_random over time of DQN agent with policy epsilon={epsilon}",
)
fig.update_layout(width=1000)
fig.show()

## 3.3 Learning by self-practice

#### Question 16

In [None]:
# Question 16: DQN self-practice for several fixed epsilon values, run with two
# joblib workers.
epsilon_list = [0, 0.1, 0.6, 1.0]

max_games_total = 20_000
group_size = 250
df = {"Game number": np.arange(max_games_total // group_size) * group_size}


def parallel_games(epsilon_, max_games_total_, group_size_):
    """Let two DQN agents that share one model and one ``r`` object play each other.

    The first agent is created with ``second_player=True``; the second one is
    handed the first one's ``r`` and ``q_model`` objects.

    Args:
        epsilon_: Passed to both ``DQNAgent(epsilon=...)`` calls; also used in
            the column labels.
        max_games_total_: Third positional argument of ``play_games_with_m``.
        group_size_: Fourth positional argument of ``play_games_with_m``.

    Returns:
        A dict with the ``m_opt`` and ``m_random`` values returned by
        ``play_games_with_m``, keyed by labels that mention ``epsilon_``.
    """
    player_opt_ = DQNAgent(epsilon=epsilon_, second_player=True)
    agent_ = DQNAgent(epsilon=epsilon_, r=player_opt_.r, q_model=player_opt_.q_model)

    # The first return value (the winner list) is not plotted in this cell.
    _, m_opt, m_random = play_games_with_m(player_opt_, agent_, max_games_total_, group_size_)

    return {
        f"m_opt (epsilon={epsilon_})": m_opt,
        f"m_random (epsilon={epsilon_})": m_random,
    }


dfs = Parallel(n_jobs=2)(
    delayed(parallel_games)(epsilon, max_games_total, group_size) for epsilon in epsilon_list
)

# Merge the columns produced by every worker.
for d in dfs:
    df.update(d)

df = pd.DataFrame(df)
fig = px.line(
    df,
    x="Game number",
    y=df.columns.difference(["Game number"]),
    # The plotted columns are m_opt / m_random, not the average reward.
    title="m_opt and m_random over time of DQN agent against himself",
)
fig.update_layout(width=1000)
fig.show()

#### Question 17

In [None]:
# Question 17: DQN self-practice for several n* values, run with two joblib
# workers.
n_max_list = [1, 1000, 10_000, 40_000]
epsilon = (0.1, 0.8)

max_games_total = 20_000
group_size = 250
df = {"Game number": np.arange(max_games_total // group_size) * group_size}


def parallel_games(epsilon_, n_max, max_games_total_, group_size_):
    """Let two DQN agents that share one model and one ``r`` object play each other.

    The first agent is created with ``second_player=True``; the second one is
    handed the first one's ``r`` and ``q_model`` objects.

    Args:
        epsilon_: Passed to both ``DQNAgent(epsilon=...)`` calls.
        n_max: Passed to both ``DQNAgent(n_max=...)`` calls; also used in the
            column labels.
        max_games_total_: Third positional argument of ``play_games_with_m``.
        group_size_: Fourth positional argument of ``play_games_with_m``.

    Returns:
        A dict with the ``m_opt`` and ``m_random`` values returned by
        ``play_games_with_m``, keyed by labels that mention ``n_max``.
    """
    player_opt_ = DQNAgent(epsilon=epsilon_, n_max=n_max, second_player=True)
    agent_ = DQNAgent(epsilon=epsilon_, r=player_opt_.r, q_model=player_opt_.q_model, n_max=n_max)

    # The first return value (the winner list) is not plotted in this cell.
    _, m_opt, m_random = play_games_with_m(player_opt_, agent_, max_games_total_, group_size_)

    return {
        f"m_opt (n*={n_max})": m_opt,
        f"m_random (n*={n_max})": m_random,
    }


dfs = Parallel(n_jobs=2)(
    delayed(parallel_games)(epsilon, n_max, max_games_total, group_size) for n_max in n_max_list
)

# Merge the columns produced by every worker.
for d in dfs:
    df.update(d)

df = pd.DataFrame(df)
fig = px.line(
    df,
    x="Game number",
    y=df.columns.difference(["Game number"]),
    # The plotted columns are m_opt / m_random, not the average reward.
    title="m_opt and m_random over time of DQN agent against himself",
)
fig.update_layout(width=1000)
fig.show()

#### Question 19

In [None]:
def _dqn_q_heatmap(dqn_agent, board):
    """Build a heatmap of the network outputs of ``dqn_agent`` for ``board``.

    Args:
        dqn_agent: Agent exposing ``q_model`` (called through ``forward``) and
            ``device``.
        board: 3x3 NumPy integer array; entries 1 / -1 / 0 are displayed as
            "X" / "O" / "-".

    Returns:
        A ``go.Heatmap`` showing the first nine outputs of the model laid out
        row by row on a 3x3 grid, with the colour range fixed to [0, 1].
    """
    # Going through float makes every entry render as "1.0" / "-1.0" / "0.0",
    # which the replacements below rely on.
    symbols = board.copy().astype(float).astype(str)
    symbols[symbols == "1.0"] = "X"
    symbols[symbols == "-1.0"] = "O"
    symbols[symbols == "0.0"] = "-"

    # Shift the entries to 0/1/2, one-hot encode them into three channels, then
    # keep channel 2 (entries equal to 1) and channel 0 (entries equal to -1).
    encoded = torch.tensor(board, dtype=torch.int64)
    encoded = nn_functional.one_hot(encoded + 1, 3)
    encoded = encoded[:, :, (2, 0)]
    # Add a leading batch dimension and move the input to the agent's device.
    encoded = encoded.unsqueeze(0)
    encoded = encoded.type(torch.float).to(dqn_agent.device)
    with torch.no_grad():
        q_values = dqn_agent.q_model.forward(encoded).detach().cpu().numpy()

    return go.Heatmap(
        x=["0", "1", "2"],
        y=["0", "1", "2"],
        z=[[q_values[0][3 * row + col] for col in range(3)] for row in range(3)],
        text=symbols,
        texttemplate="%{text}",
        textfont={"size": 20},
        colorscale="gray",
        zmin=0,
        zmax=1,
    )


# NOTE(review): `agent` is whatever an earlier cell left in the notebook
# namespace; the parallel cells only create agents inside their workers -
# confirm this is the agent meant to be visualised here.
# NOTE(review): the three boards below are hard-coded, while two subplot titles
# speak of the highest / lowest Q-value - confirm how these boards were chosen.
fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=("initial state", "State of highest Q-value", "State of lowest Q-value"),
)

boards = (
    np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]]),
    np.array([[0, 0, 0], [1, -1, -1], [1, 0, 0]]),
    np.array([[-1, 0, 0], [1, -1, 0], [1, -1, 1]]),
)

# One subplot column per board, in the order of the subplot titles.
for subplot_col, board in enumerate(boards, start=1):
    fig.add_trace(_dqn_q_heatmap(agent, board), row=1, col=subplot_col)

fig.update_layout(width=1300, height=500, title="Different Q-values for different states")

fig.show()

.