In [1]:
import gym
from gym import logger as gymlogger
from gym.wrappers import RecordEpisodeStatistics
gymlogger.set_level(40) #error only
from torch import nn
import torch
from torch.nn import functional as F
import numpy as np
import pandas as pd
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
from base64 import b64encode
from collections import namedtuple, deque
from itertools import count
import torch.optim as optim

from IPython import display as ipythondisplay
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"

In [2]:
class duelingNetwork(nn.Module):
  """Two-stream network that outputs a state value V and centred advantages A.

  Each stream is two stacked ``nn.Linear`` layers applied directly to the
  input. NOTE: there is no activation between the stacked layers, so each
  stream is an affine function of the input.

  Args:
    n_observations: size of the last dimension of the input tensor.
    n_actions: number of advantage outputs.
  """

  def __init__(self, n_observations, n_actions):
    super(duelingNetwork, self).__init__()
    # Value stream: n_observations -> 10 -> 1
    self.layer1 = nn.Linear(n_observations, 10)
    # Advantage stream: n_observations -> 10 -> n_actions
    self.layer2 = nn.Linear(n_observations, 10)
    self.VLayer = nn.Linear(10, 1)
    self.ALayer = nn.Linear(10, n_actions)

  def forward(self, x, user_mean=True):
    """Compute the value and the centred advantages for ``x``.

    Args:
      x: tensor whose last dimension is ``n_observations``. The centring
        reduces over dim 1, so a 2-D ``(batch, n_observations)`` input is
        expected.
      user_mean: if True, subtract each row's mean advantage; otherwise
        subtract each row's maximum advantage.

    Returns:
      Tuple ``(V, A)``: ``V`` is the value-stream output and ``A`` the
      advantage-stream output after centring along dim 1.
    """
    V_x = self.layer1(x)
    A_x = self.layer2(x)
    V = self.VLayer(V_x)
    A = self.ALayer(A_x)
    if user_mean:
      # torch.mean returns a plain tensor; indexing it with [0] would pick
      # row 0's mean and apply it to every row of the batch.
      A = A - torch.mean(A, 1, keepdim=True)
    else:
      # torch.max with a dim returns (values, indices); [0] selects values.
      A = A - torch.max(A, 1, keepdim=True)[0]
    return (V, A)

In [3]:
def get_V_and_A(iteration=150, user_mean=True, seed=None):
  """Fit a fresh duelingNetwork to a fixed 3-action Q target and print the result.

  A single random state of shape (1, 10) is drawn, a new
  ``duelingNetwork(10, 3)`` is trained with Adam (lr=1e-2) to minimise the
  MSE between ``V + A`` and the fixed target ``[[3.5, 4.2, 7.6]]``, and the
  final loss, target, prediction, V and A are printed.

  Args:
    iteration: number of optimisation steps. Zero or a negative value skips
      training and reports the untrained network.
    user_mean: forwarded to ``duelingNetwork.forward``; selects mean-centred
      (True) or max-centred (False) advantages.
    seed: optional int. When given, seeds NumPy's and PyTorch's global RNGs
      so the random state and the weight initialisation are reproducible.

  Returns:
    None. Results are printed.
  """
  if seed is not None:
    np.random.seed(seed)
    torch.manual_seed(seed)

  state = torch.tensor(np.random.random([1, 10]), dtype=torch.float32)
  cost = nn.MSELoss()
  Q_target = torch.tensor([3.5, 4.2, 7.6]).unsqueeze(0)
  dueling_network = duelingNetwork(10, 3)
  opt = optim.Adam(dueling_network.parameters(), lr=1e-2)

  for _ in range(iteration):
    V, A = dueling_network(state, user_mean)
    loss = cost(V + A, Q_target)
    opt.zero_grad()
    loss.backward()
    opt.step()

  # Evaluate once more after the last update: the printed numbers then match
  # the trained parameters, and the names exist even when the loop ran 0 times.
  V, A = dueling_network(state, user_mean)
  y = V + A
  loss = cost(y, Q_target)

  print(f'loss:{loss}')
  print(f'Q_target:{Q_target}')
  print(f'y:{y}')
  print(f'V:{V}')
  print(f'A:{A}')

In [4]:
# Default run: 150 Adam steps with the advantages centred on their mean.
get_V_and_A(iteration=150, user_mean=True)


loss:7.5516613833315205e-06
Q_target:tensor([[3.5000, 4.2000, 7.6000]])
y:tensor([[3.4965, 4.1974, 7.5981]], grad_fn=<AddBackward0>)
V:tensor([[5.0973]], grad_fn=<AddmmBackward0>)
A:tensor([[-1.6008, -0.9000,  2.5008]], grad_fn=<SubBackward0>)


In [5]:
# Same 150-step run, but the advantages are shifted by their maximum instead of their mean.
get_V_and_A(iteration=150, user_mean=False)

loss:4.762131720781326e-05
Q_target:tensor([[3.5000, 4.2000, 7.6000]])
y:tensor([[3.4991, 4.1952, 7.6109]], grad_fn=<AddBackward0>)
V:tensor([[7.6109]], grad_fn=<AddmmBackward0>)
A:tensor([[-4.1118, -3.4157,  0.0000]], grad_fn=<SubBackward0>)


In [6]:
# Max-shifted variant again, trained for 1500 steps instead of the default 150.
get_V_and_A(user_mean=False, iteration=1500)

loss:0.0
Q_target:tensor([[3.5000, 4.2000, 7.6000]])
y:tensor([[3.5000, 4.2000, 7.6000]], grad_fn=<AddBackward0>)
V:tensor([[7.6000]], grad_fn=<AddmmBackward0>)
A:tensor([[-4.1000, -3.4000,  0.0000]], grad_fn=<SubBackward0>)
