In [None]:
import gym
from gym_marioai import levels

# Action ids handed to the environment as `enabled_actions` (0 through 12).
all_actions = (0,1,2,3,4,5,6,7,8,9,10,11,12)

# Create the Mario environment on the easy level with rendering switched on.
env = gym.make('Marioai-v0', render=True,
               level_path=levels.easy_level,
               compact_observation=False, # must stay False so observations can be saved to the dataset properly
               enabled_actions=all_actions,
               rf_width=20, rf_height=10)
# Play episodes back to back, recording one trajectory per episode.
# NOTE(review): this loop has no exit condition, so the cell only stops when
# the kernel is interrupted and later cells never run under "Run All".
while True:
    done = False
    total_reward = 0  # running sum of rewards; not read anywhere else in this cell
    
    # Start every per-episode list with the initial entry for the reset state.
    # NOTE(review): these lists are rebuilt on every pass of the outer loop and
    # nothing in this cell stores them, so earlier episodes are dropped -- confirm
    # where the dataset is meant to be written.
    observations = [env.reset(seed=0)]
    actions = [12] # placeholder for the initial state: action id 12 ("nothing")
    rewards = [0]
    terminals = [done]

    # Step the environment until it reports the episode as finished.
    while not done:
        action = 3  # the same fixed action id is sent on every step
        next_state, reward, done, info = env.step(action)
        
        # Append this step so index i of each list describes the same time step.
        observations.append(next_state)
        actions.append(action)
        rewards.append(reward)
        terminals.append(done)
     
        total_reward += reward

In the previous section, we stored the transitions in the experience replay memory, mapping each (state, action) pair to its (next state, reward).
Now, let's train a policy that maximizes the discounted cumulative reward (a.k.a. the return). For this, we use the DQN algorithm:

### Deep Q-Learning (DQN)  

#### Bellman equation:

$Q(s,a;\theta) = r + \gamma \cdot \max_{a'} Q(s',a';\tilde{\theta})$


#### Temporal difference (TD) error: 
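The TD error is the difference between the current Q-value estimate $Q(s,a;\theta)$ and the bootstrapped target $r + \gamma \cdot \max_{a'} Q(s',a';\tilde{\theta})$, which is computed with the target network.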

$\delta = Q(s,a;\theta) - \left(r + \gamma \cdot \max_{a'} Q(s',a';\tilde{\theta})\right)$


#### Huber Loss:
To minimize the TD error, we use the Huber loss as our loss function. It is quadratic for small errors and linear for large ones, which makes it more robust to outliers than the squared error.

$L(\delta) = \begin{cases} \frac{1}{2} \left(Q(s,a;\theta) - \left(r + \gamma \cdot \max_{a'} Q(s',a';\tilde{\theta})\right)\right)^2 & \text{for } |\delta| \leq 1 \\ |\delta| - \frac{1}{2} & \text{otherwise} \end{cases}$


In [None]:
from exercise_dqn import DQN 
from d3rlpy.dataset import MDPDataset
from constants import DATAPATH
import torch

### TODO: Please implement the Huber Loss Function from above. Note that 'value' corresponds to the current estimate Q(s, a; theta) and 'target' to max_a' Q(s', a'; theta~), so the TD target is rewards + gamma * target. ###

def huber_loss(beta, gamma, rewards, target, value):
  """Element-wise Huber loss of the TD error.

  The TD error is ``value - (rewards + gamma * target)``. Where its magnitude
  is below ``beta`` the loss is ``0.5 * error**2``; elsewhere it is
  ``beta * (|error| - 0.5 * beta)``.

  Args:
    beta: threshold between the quadratic and the linear regime.
    gamma: discount factor applied to ``target``.
    rewards: tensor of rewards.
    target: tensor that is discounted and added to ``rewards``.
    value: tensor the discounted target is subtracted from.

  Returns:
    A tensor of per-element losses (no reduction is applied).
  """
  td_error = value - (rewards + gamma * target)
  abs_error = td_error.abs()

  quadratic_term = 0.5 * td_error ** 2
  linear_term = beta * (abs_error - 0.5 * beta)

  # The branch mask is computed on detached values, so it carries no gradient.
  in_quadratic_zone = abs_error.detach() < beta
  return torch.where(in_quadratic_zone, quadratic_term, linear_term)

In [None]:
### Now, load the dataset and run the DQN algorithm with your implemented loss function ###
# This cell relies on MDPDataset, DATAPATH, DQN and huber_loss from the cell
# above, so run that cell first.

# Load the recorded dataset from the path configured in constants.DATAPATH.
dataset = MDPDataset.load(DATAPATH)

# Hand the custom loss implementation to the exercise DQN.
dqn = DQN(huber_loss = huber_loss)

dqn.fit(dataset, n_epochs=10) # n_epochs = number of complete passes through the training dataset; feel free to adjust

### Convergent DQN (CDQN)

DQN is a rather simple algorithm that does not always converge. The Convergent DQN (https://arxiv.org/pdf/2106.15419.pdf) ensures that the loss converges by taking, for each transition, the maximum of the DQN loss $l_{DQN}$, computed with the target network, and the Mean Squared Bellman Error $l_{MSBE}$, computed with the current network.



$l_{DQN} = \left(Q(s,a;\theta) - \left(r + \gamma \cdot \max_{a'} Q(s',a';\tilde{\theta})\right)\right)^2$

$l_{MSBE} = \left(Q(s,a;\theta) - \left(r + \gamma \cdot \max_{a'} Q(s',a';\theta)\right)\right)^2$


$L_{CDQN} = \mathbb{E}\left[\max\left(l_{DQN},\, l_{MSBE}\right)\right]$


In [None]:
### Now, run the CDQN and compare the loss in loss.txt ###

from cdqn import CDQN 
from constants import DATAPATH

cdqn = CDQN()

dataset = MDPDataset.load(DATAPATH)

# Split the loaded episodes into a training and a held-out evaluation part.
# The names passed to fit() below were not defined in any earlier cell, so the
# cell failed with a NameError on a fresh kernel and `dataset` went unused.
# The split is by position (first 80% train, last 20% eval) so it is reproducible.
episodes = list(dataset.episodes)
n_train = max(1, int(0.8 * len(episodes)))
train_episodes = episodes[:n_train]
test_episodes = episodes[n_train:]

# Directory that receives the training logs and the per-epoch model snapshots.
log_dir = "d3rlpy_logs"

# save_interval=1 stores the model after every epoch; shuffle=True shuffles the
# training data.
cdqn.fit(train_episodes, eval_episodes=test_episodes, n_epochs=15, logdir=log_dir, save_interval=1, shuffle=True)

### Super Mario Evaluation 