In [None]:
!pip install --quiet gymnasium

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/925.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.1/925.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m925.5/925.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch as T

In [None]:
def plot_learning_curve(x, scores, epsilons, filename):
  """Plot epsilon and the running-average score on overlaid axes and save the figure.

  Epsilon is drawn as a line against the left y-axis; the running average of
  ``scores`` is drawn as a scatter against the right y-axis of a second,
  frameless subplot placed on top of the first.

  Args:
    x: x-axis values; must have the same length as ``scores`` and ``epsilons``.
    scores: sequence of scores, one per x value.
    epsilons: sequence of epsilon values, one per x value.
    filename: path the figure is written to.
  """
  fig = plt.figure()
  ax = fig.add_subplot(111, label='1')
  ax2 = fig.add_subplot(111, label='2', frame_on=False)

  # Left axis: epsilon.
  ax.plot(x, epsilons, color='C0')
  ax.set_xlabel('Training steps', color='C0')
  ax.set_ylabel('Epsilon', color='C0')
  ax.tick_params(axis='x', colors='C0')
  ax.tick_params(axis='y', colors='C0')

  # Running mean over the current score and up to 100 preceding ones.
  N = len(scores)
  running_avg = np.empty(N)
  for t in range(N):
    running_avg[t] = np.mean(scores[max(0, t-100): (t+1)])

  # Right axis: running-average score; the x-axis is hidden because it is
  # shared visually with the first subplot.
  ax2.scatter(x, running_avg, color='C1')
  ax2.axes.get_xaxis().set_visible(False)
  ax2.yaxis.tick_right()
  ax2.set_ylabel('Score', color='C1')
  ax2.yaxis.set_label_position('right')
  ax2.tick_params(axis='y', colors='C1')

  # pyplot has no `save` function; write the figure with Figure.savefig.
  fig.savefig(filename)

In [None]:
class Network(nn.Module):
  """Fully connected network with one hidden layer of 128 ReLU units.

  The module owns its own Adam optimizer and MSE loss, and moves itself to
  the first CUDA device when one is available, otherwise to the CPU.

  Args:
    lr: learning rate handed to the Adam optimizer.
    n_actions: number of output units.
    input_dims: iterable that is unpacked into the input size of the first
      linear layer.
  """

  def __init__(self, lr, n_actions, input_dims):
    super().__init__()

    hidden_units = 128
    self.fc1 = nn.Linear(*input_dims, hidden_units)
    self.fc2 = nn.Linear(hidden_units, n_actions)

    # nn.Module.parameters() yields the weights of both layers registered above.
    self.optimizer = optim.Adam(self.parameters(), lr=lr)
    self.loss = nn.MSELoss()

    if T.cuda.is_available():
      self.device = T.device('cuda:0')
    else:
      self.device = T.device('cpu')
    self.to(self.device)

  def forward(self, state):
    """Return the output of the second linear layer for ``state``."""
    hidden = self.fc1(state)
    hidden = F.relu(hidden)
    return self.fc2(hidden)

In [None]:
class Agent():
    """Epsilon-greedy Q-learning agent that updates on one transition at a time.

    The agent holds a single ``Network`` (``self.Q``) used both for action
    selection and for computing the bootstrap target; there is no replay
    buffer and no separate target network.

    Args:
        lr: learning rate passed to the network.
        gamma: discount factor applied to the next-state value.
        input_dims: observation shape, passed to the network.
        n_actions: number of discrete actions.
        epsilon: initial exploration probability.
        eps_end: lower bound for epsilon (stored as ``eps_min``).
        eps_dec: amount subtracted from epsilon after every ``learn`` call.
    """

    def __init__(self, lr, gamma, input_dims, n_actions, epsilon, eps_end,
                 eps_dec):
        self.lr = lr
        self.gamma = gamma
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.action_space = list(range(self.n_actions))

        self.Q = Network(self.lr, self.n_actions, self.input_dims)

    def choose_action(self, observation):
        """Pick an action with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest network output for
        ``observation`` is returned. A larger epsilon means more random actions.

        Returns:
            The chosen action as a Python ``int``.
        """
        if np.random.random() > self.epsilon:
            state = T.tensor(observation, dtype=T.float).to(self.Q.device)
            # Action selection needs no gradients; skip building the graph.
            with T.no_grad():
                actions = self.Q(state)
            action = T.argmax(actions).item()
        else:
            # Cast so both branches return a plain int rather than np.int64.
            action = int(np.random.choice(self.action_space))

        return action

    def decrement_epsilon(self):
        """Decrease epsilon linearly by ``eps_dec``, never going below ``eps_min``.

        Other schedules (e.g. logarithmic or exponential) are possible; this
        one is linear.
        """
        # max() clamps the final step so epsilon cannot undershoot eps_min.
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

    def learn(self, state, action, reward, state_, done=False):
        """Run one gradient step on a single transition, then decay epsilon.

        The loss is the MSE between ``Q(state)[action]`` and the target
        ``reward + gamma * max(Q(state_))``.

        Args:
            state: observation before the action.
            action: index of the action taken.
            reward: scalar reward received.
            state_: observation after the action.
            done: when true, ``state_`` is treated as terminal and the target
                is just ``reward`` (no bootstrapping). Defaults to ``False``,
                which always bootstraps.
        """
        device = self.Q.device
        self.Q.optimizer.zero_grad()

        states = T.tensor(state, dtype=T.float).to(device)
        states_ = T.tensor(state_, dtype=T.float).to(device)
        reward_t = T.tensor(reward, dtype=T.float).to(device)

        q_pred = self.Q(states)[int(action)]

        # The target is a fixed regression label: compute it without gradient
        # tracking so the update only moves the prediction towards the target
        # and does not also pull the target towards the prediction.
        with T.no_grad():
            if done:
                q_target = reward_t
            else:
                q_target = reward_t + self.gamma * self.Q(states_).max()

        loss = self.Q.loss(q_pred, q_target)
        loss.backward()
        self.Q.optimizer.step()
        self.decrement_epsilon()

In [None]:
# Train the agent on CartPole, updating the network after every environment step.
env = gym.make('CartPole-v1')

agent = Agent(input_dims=env.observation_space.shape,
              n_actions=env.action_space.n,
              lr=0.001,
              gamma=0.99,
              epsilon=1.0,
              eps_end=0.01,
              eps_dec=1e-5)

scores = []        # total reward collected in each episode
eps_history = []   # epsilon at the end of each episode
n_games = 10000

for i in range(n_games):
  done = False
  truncated = False

  # reset() returns (observation, info); only the observation is used.
  observation, _ = env.reset()
  score = 0

  # End the episode as soon as the environment reports termination OR
  # truncation. (`not done or not truncated` would keep stepping a finished
  # episode until both flags happened to be true at once.)
  while not (done or truncated):
    action = agent.choose_action(observation)
    observation_, reward, done, truncated, info = env.step(action)
    score += reward
    agent.learn(observation, action, reward, observation_)
    observation = observation_

  scores.append(score)
  eps_history.append(agent.epsilon)

  # Report progress every 100 episodes, overwriting the previous line.
  if i % 100 == 0:
    avg_score = np.mean(scores[-100:])
    print(f'\r episode {i}, score {score}, avg score {avg_score}, epsilon {agent.epsilon}', end='', flush=True)

  logger.warn(


 episode 9700, score 10.0, avg score 9.46, epsilon 0.01

In [None]:
# Save the learning curve, using 1-based episode numbers on the x-axis.
filename = 'cartpole_naive_dqn.png'
x = list(range(1, n_games + 1))

plot_learning_curve(x, scores, eps_history, filename)