# Notebook to experiment with DQN training

Reference: PyTorch "Reinforcement Learning (DQN)" tutorial —
https://docs.pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

## Code:

In [None]:
import wandb
import numpy as np
from gymnasium import spaces
import random
import torch

In [None]:
# One seed shared by every random number generator used in this notebook.
SEED = 42

# Host-side generators: Python's `random`, NumPy's global RNG, PyTorch.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if torch.cuda.is_available():
    # Seed the current CUDA device, then every CUDA device.
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

    # Request deterministic cuDNN kernels and switch off its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


In [None]:
import sys
sys.path.append("/home/martina/codi2/4year/tfg")  # add parent folder of general.py

from general import prepare, Glioblastoma, GlioblastomaPositionalEncoding, testing
from training_dqn import DQNPaper, DQNPositionalEncoding, DQNMriLite
from training_agents import DQNAgent
from training_buffers import ReplayBuffer

In [None]:
# Experiment identifier: passed to the agent as `save_name` and to W&B as the
# run name and run id.
RUN_NAME = "Extension044"
NOTES = "Using different parameters and dataset of 200 samples"

# Classes plugged into the experiment (environment, network, agent, buffer).
ENVIRONMENT = GlioblastomaPositionalEncoding
NET = DQNPositionalEncoding
AGENT = DQNAgent
BUFFER = ReplayBuffer

# Keyword arguments unpacked into the environment constructor.
CURRENT_CONFIG = dict(
    grid_size=6,
    rewards=[3.0, -1.0, -0.2],  # [on_tumor, off_tumor, move]
    action_space=spaces.Discrete(5),
    max_steps=40,
    stop=False,
)

# --- Optimisation and training length ---
LR = 5e-5
MEMORY_SIZE = 50000
MAX_EPISODES = 1200

# --- Epsilon-greedy exploration schedule ---
EPSILON = 0.7
EPSILON_MIN = 0.01
DECAY_TYPE = "exponential"
# DECAY_TYPE = "subtractive"
if DECAY_TYPE == "exponential":
    EPSILON_DECAY = 0.9992  # multiplicative factor, applied each episode per the message below
    # Smallest n such that EPSILON * EPSILON_DECAY**n <= EPSILON_MIN.
    # Round UP: truncating reports an episode at which epsilon is still
    # above the floor (off by one).
    EPISODES_TO_EPSILON_MIN = max(
        0,
        int(np.ceil(np.log(EPSILON_MIN / EPSILON) / np.log(EPSILON_DECAY))),
    )
    print(f"Starting at {EPSILON}, decaying {EPSILON_DECAY} each episode, will reach {EPSILON_MIN} after {EPISODES_TO_EPSILON_MIN} episodes")
else:
    # Linear schedule sized to hit the floor on the last training episode.
    EPSILON_DECAY = (EPSILON - EPSILON_MIN) / MAX_EPISODES
    EPISODES_TO_EPSILON_MIN = MAX_EPISODES
    print(f"Starting at {EPSILON}, decaying {EPSILON_DECAY}, will reach {EPSILON_MIN} after {MAX_EPISODES} episodes")

# Make it obvious when the schedule is longer than the training run itself.
if EPISODES_TO_EPSILON_MIN > MAX_EPISODES:
    print(f"Note: training lasts only {MAX_EPISODES} episodes, so epsilon will not reach {EPSILON_MIN} during this run")


# --- DQN update settings ---
GAMMA = 0.99       # passed to the agent as `gamma`
BATCH_SIZE = 64    # passed to the agent as `batch_size`
BURN_IN = 5000     # passed to the agent as `buffer_initial`
DNN_UPD = 15       # passed to `agent.train` as `dnn_update_frequency`
DNN_SYNC = 4000    # passed to `agent.train` as `dnn_sync_frequency`

In [None]:
# Load the training pairs (presumably the 200-sample dataset mentioned in NOTES).
train_pairs = prepare(dataset=200)

In [None]:
# Build one environment from the first training pair and show its spaces.
env = ENVIRONMENT(*train_pairs[0], **CURRENT_CONFIG)
print(
    env.observation_space.shape,
    env.action_space.n,
    np.arange(env.action_space.n),
    sep="\n",
)

# Q-network on the CPU.
net = NET(env, learning_rate=LR, device="cpu")

# Standalone buffer instance; the agent itself is handed the buffer class
# together with `memory_size`.
buffer = BUFFER(capacity=MEMORY_SIZE)

agent = AGENT(
    env_config=CURRENT_CONFIG,
    dnnetwork=net,
    buffer_class=BUFFER,
    train_pairs=train_pairs,
    env_class=ENVIRONMENT,
    epsilon=EPSILON,
    eps_decay=EPSILON_DECAY,
    eps_decay_type=DECAY_TYPE,
    epsilon_min=EPSILON_MIN,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    memory_size=MEMORY_SIZE,
    buffer_initial=BURN_IN,
    save_name=RUN_NAME,
)

print(f"Using Glioblastoma class {ENVIRONMENT}, DQN class {NET}, Agent class {AGENT}, Buffer class {BUFFER}")

In [None]:
# Authenticate with Weights & Biases and open the run for this experiment.
wandb.login()

wandb.init(
    project="TFG_Glioblastoma",
    name=RUN_NAME,
    id=RUN_NAME,
    # Settings only take effect when handed to `init`; a bare
    # `wandb.Settings(quiet=True)` expression is created and thrown away.
    settings=wandb.Settings(quiet=True),
    # Everything needed to reproduce the run: chosen classes plus hyperparameters.
    config={
        "environment": ENVIRONMENT,
        "configuration": CURRENT_CONFIG,
        "model": NET,
        "agent": AGENT,
        "buffer": BUFFER,
        "notes": NOTES,
        "lr": LR,
        "MEMORY_SIZE": MEMORY_SIZE,
        "MAX_EPISODES": MAX_EPISODES,
        "EPSILON": EPSILON,
        "EPSILON_DECAY": EPSILON_DECAY,
        "Decay type": DECAY_TYPE,
        "EPSILON_MIN": EPSILON_MIN,
        "GAMMA": GAMMA,
        "BATCH_SIZE": BATCH_SIZE,
        "BURN_IN": BURN_IN,
        "DNN_UPD": DNN_UPD,
        "DNN_SYNC": DNN_SYNC,
    },
)

In [None]:
# Train the agent and always close the W&B run, even when training raises or
# the cell is interrupted; in that case the run is marked as failed.
try:
    agent.train(
        train_pairs=train_pairs,
        gamma=GAMMA,
        max_episodes=MAX_EPISODES,
        dnn_update_frequency=DNN_UPD,
        dnn_sync_frequency=DNN_SYNC,
    )
except BaseException:
    # BaseException so that KeyboardInterrupt (kernel interrupt) is covered too.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

In [None]:
# Close the W&B run. Presumably kept as its own cell so it can be run by hand
# when a run is still open — confirm before removing.
wandb.finish()