In [None]:
import gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque


In [None]:
class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one value per action.

    The input passes through two 64-unit hidden layers, each followed by a
    ReLU, and then through a linear output layer with ``action_size`` units.
    """

    def __init__(self, state_size, action_size):
        """Build the three linear layers.

        Args:
            state_size: Number of input features.
            action_size: Number of output units.
        """
        super().__init__()
        hidden_units = 64
        # Attribute names are part of the state_dict keys, so they stay fc1..fc3.
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Run ``x`` through the network and return the raw (un-activated) outputs."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)


In [None]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of experiences with uniform random sampling."""

    def __init__(self, buffer_size, batch_size):
        """Create an empty buffer.

        Args:
            buffer_size: Maximum number of experiences kept. The backing
                ``deque`` is created with this ``maxlen``, so once it is full
                each new entry evicts the oldest one.
            batch_size: Number of experiences returned by each ``sample()``.
        """
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, experience):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        self.memory.append(experience)

    def sample(self):
        """Draw ``batch_size`` distinct experiences and batch them into tensors.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)``. ``actions``
            is ``int64``; the other four tensors are ``float32``. The leading
            dimension of every tensor is ``batch_size``.

        Raises:
            ValueError: Propagated from ``random.sample`` when the buffer holds
                fewer than ``batch_size`` experiences.
        """
        experiences = random.sample(self.memory, k=self.batch_size)
        states, actions, rewards, next_states, dones = zip(*experiences)
        # Stack each field into a single ndarray before handing it to torch.
        # Passing a tuple of ndarrays straight to torch.tensor() converts
        # element by element, which is very slow and triggers a UserWarning.
        states = torch.as_tensor(np.asarray(states, dtype=np.float32))
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64))
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
        next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32))
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32))
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.memory)


In [None]:
def epsilon_greedy_action(state, q_network, epsilon, action_size):
    """Pick an action index with an epsilon-greedy rule.

    A uniform draw from ``random.random()`` is compared with ``epsilon``: when
    the draw is larger, the index of the largest value returned by
    ``q_network`` is chosen; otherwise a uniformly random index is chosen.

    Args:
        state: A single observation (array-like) without a batch dimension.
        q_network: Callable taking a ``(1, state_size)`` float32 tensor and
            returning a ``(1, action_size)`` tensor of action values.
        epsilon: Exploration threshold; ``1.0`` always explores.
        action_size: Number of available actions.

    Returns:
        int: An action index in ``[0, action_size)``. Both branches return a
        plain Python ``int``.
    """
    if random.random() > epsilon:
        with torch.no_grad():
            # Add the batch dimension the network expects.
            batch = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            return int(q_network(batch).argmax(dim=1).item())
    # randrange avoids allocating np.arange(action_size) on every call and
    # returns a Python int rather than a NumPy scalar.
    return random.randrange(action_size)


In [None]:
def _reset_env(env):
    """Reset ``env`` and return only the initial observation.

    ``reset()`` may return either the bare observation or a tuple whose first
    element is the observation; both forms are accepted.
    """
    result = env.reset()
    return result[0] if isinstance(result, tuple) else result


def _step_env(env, action):
    """Step ``env`` and return ``(next_state, reward, terminated, truncated)``.

    Accepts both the five-element step result
    ``(obs, reward, terminated, truncated, info)`` and the four-element one
    ``(obs, reward, done, info)``. For the four-element form ``done`` is
    reported as ``terminated`` and ``truncated`` is ``False``.
    """
    result = env.step(action)
    if len(result) == 5:
        next_state, reward, terminated, truncated, _ = result
    else:
        next_state, reward, terminated, _ = result
        truncated = False
    return next_state, reward, terminated, truncated


def dqn(
    env,
    n_episodes=1000,
    max_t=200,
    gamma=0.99,
    epsilon_start=1.0,
    epsilon_end=0.01,
    epsilon_decay=0.995,
    buffer_size=10000,
    batch_size=64,
    learning_rate=0.001,
    update_every=4,
):
    """Train a Q-network on ``env`` with experience replay and a target network.

    Args:
        env: Environment exposing ``observation_space.shape``,
            ``action_space.n``, ``reset()`` and ``step(action)``.
        n_episodes: Number of training episodes.
        max_t: Maximum number of steps per episode.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon_start: Initial exploration rate.
        epsilon_end: Lower bound on the exploration rate.
        epsilon_decay: Multiplicative decay applied to epsilon after each episode.
        buffer_size: Capacity of the replay buffer.
        batch_size: Number of experiences per gradient step.
        learning_rate: Adam learning rate.
        update_every: A gradient step is taken every ``update_every``
            environment steps (counted across episodes).

    Returns:
        The trained online ``QNetwork``.
    """
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    q_network = QNetwork(state_size, action_size)
    target_network = QNetwork(state_size, action_size)
    target_network.load_state_dict(q_network.state_dict())
    target_network.eval()

    optimizer = optim.Adam(q_network.parameters(), lr=learning_rate)
    loss_fn = nn.MSELoss()  # built once instead of on every update
    memory = ReplayBuffer(buffer_size, batch_size)

    epsilon = epsilon_start
    timestep = 0  # global step counter; not reset between episodes

    for episode in range(1, n_episodes + 1):
        state = _reset_env(env)
        total_reward = 0

        for _ in range(max_t):
            action = epsilon_greedy_action(state, q_network, epsilon, action_size)
            next_state, reward, terminated, truncated = _step_env(env, action)
            # Only true termination is stored as `done`: it masks the bootstrap
            # term below, and a time-limit truncation should still bootstrap.
            memory.add((state, action, reward, next_state, terminated))
            state = next_state
            total_reward += reward

            timestep += 1
            if timestep % update_every == 0 and len(memory) > batch_size:
                states, actions, rewards, next_states, dones = memory.sample()

                # Target: r + gamma * max_a' Q_target(s', a'), zeroed for terminal s'.
                with torch.no_grad():
                    q_targets_next = target_network(next_states).max(1)[0].unsqueeze(1)
                q_targets = rewards.unsqueeze(1) + gamma * q_targets_next * (1 - dones.unsqueeze(1))

                # Q-value the online network assigns to the action actually taken.
                q_expected = q_network(states).gather(1, actions.unsqueeze(1))
                loss = loss_fn(q_expected, q_targets)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # End the episode on either signal; previously a truncation was ignored.
            if terminated or truncated:
                break

        epsilon = max(epsilon_end, epsilon_decay * epsilon)
        # The target network is synchronised with the online network once per episode.
        target_network.load_state_dict(q_network.state_dict())

        print(f"Episode {episode}/{n_episodes}, Total Reward: {total_reward}, Epsilon: {epsilon}")

    return q_network


In [None]:
# Seed every random source so a Restart-and-Run-All reproduces the same training run.
SEED = 0
random.seed(SEED)        # drives the epsilon-greedy draws and replay-buffer sampling
np.random.seed(SEED)     # not used directly by the code in this notebook; seeded defensively
torch.manual_seed(SEED)  # drives network weight initialisation

# NOTE(review): `new_step_api=True` makes `env.step` return the five-element
# (obs, reward, terminated, truncated, info) tuple that `dqn` unpacks. The flag
# is believed to be specific to gym 0.25.x -- confirm against the installed version.
env = gym.make('CartPole-v1', new_step_api=True)
# NOTE(review): assumes a seeded reset fixes the environment's RNG for the
# unseeded resets that `dqn` performs afterwards -- confirm for this gym version.
env.reset(seed=SEED)
trained_q_network = dqn(env)


  if not isinstance(terminated, (bool, np.bool8)):
  states = torch.tensor(states, dtype=torch.float32)


Episode 1/1000, Total Reward: 21.0, Epsilon: 0.995
Episode 2/1000, Total Reward: 32.0, Epsilon: 0.990025
Episode 3/1000, Total Reward: 46.0, Epsilon: 0.985074875
Episode 4/1000, Total Reward: 20.0, Epsilon: 0.9801495006250001
Episode 5/1000, Total Reward: 17.0, Epsilon: 0.9752487531218751
Episode 6/1000, Total Reward: 15.0, Epsilon: 0.9703725093562657
Episode 7/1000, Total Reward: 23.0, Epsilon: 0.9655206468094844
Episode 8/1000, Total Reward: 29.0, Epsilon: 0.960693043575437
Episode 9/1000, Total Reward: 16.0, Epsilon: 0.9558895783575597
Episode 10/1000, Total Reward: 11.0, Epsilon: 0.9511101304657719
Episode 11/1000, Total Reward: 20.0, Epsilon: 0.946354579813443
Episode 12/1000, Total Reward: 15.0, Epsilon: 0.9416228069143757
Episode 13/1000, Total Reward: 10.0, Epsilon: 0.9369146928798039
Episode 14/1000, Total Reward: 15.0, Epsilon: 0.9322301194154049
Episode 15/1000, Total Reward: 41.0, Epsilon: 0.9275689688183278
Episode 16/1000, Total Reward: 39.0, Epsilon: 0.9229311239742362
E