In [4]:
import random
from collections import deque

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class DQN(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    The network has two hidden layers of 64 units, each followed by a ReLU,
    and a final linear layer of width ``action_size`` with no activation.
    """

    def __init__(self, state_size, action_size):
        """Create the three linear layers.

        Args:
            state_size: Number of input features.
            action_size: Number of outputs (one per action).
        """
        super().__init__()
        hidden_units = 64
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the un-activated outputs of the last layer for input ``x``."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)

class ReplayBuffer:
    """Fixed-capacity store of transitions with uniform random sampling.

    Transitions are kept in a ``deque`` with ``maxlen=capacity``, so once the
    buffer is full each new transition evicts the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of NumPy arrays ``(states, actions, rewards,
            next_states, dones)``, each with ``batch_size`` entries along
            the first axis.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        # Requires the standard-library ``random`` module at file level.
        batch = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into per-field tuples.
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards),
            np.array(next_states),
            np.array(dones),
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

def epsilon_greedy_policy(state, epsilon):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index is drawn;
    otherwise the index of the largest output of the module-level ``model``
    is returned. Reads the module-level ``env`` for the number of actions.

    Args:
        state: Observation convertible to a float32 tensor.
        epsilon: Exploration probability.

    Returns:
        The selected action index.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(env.action_space.n)

    # Greedy branch: no gradients are needed for action selection.
    with torch.no_grad():
        state_tensor = torch.tensor(state, dtype=torch.float32)
        action_values = model(state_tensor)
        return np.argmax(action_values.numpy())

def train_model(model, optimizer, batch_size, gamma, replay_buffer=None):
    """Run one optimisation step of Q-learning on a sampled minibatch.

    Args:
        model: Network mapping a batch of states to per-action Q-values.
        optimizer: Optimiser that updates ``model``'s parameters.
        batch_size: Number of transitions to sample.
        gamma: Discount factor applied to the next-state value.
        replay_buffer: Object supporting ``len()`` and ``sample(batch_size)``
            returning ``(states, actions, rewards, next_states, dones)``.
            Defaults to the module-level ``buffer``.

    Returns:
        None. Does nothing when the buffer holds fewer than ``batch_size``
        transitions.
    """
    if replay_buffer is None:
        replay_buffer = buffer
    if len(replay_buffer) < batch_size:
        return

    states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
    states = torch.tensor(states, dtype=torch.float32)
    actions = torch.tensor(actions, dtype=torch.long)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    next_states = torch.tensor(next_states, dtype=torch.float32)
    dones = torch.tensor(dones, dtype=torch.float32)

    # Q-value of the action that was actually taken in each transition.
    q_value = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # The bootstrap target must be treated as a constant: without no_grad the
    # loss would also push gradients through the next-state estimate, which
    # partially cancels the update of the current estimate.
    with torch.no_grad():
        next_q_value = model(next_states).max(1)[0]
        # (1 - dones) removes the bootstrap term for terminal transitions.
        expected_q_value = rewards + gamma * next_q_value * (1 - dones)

    loss = F.smooth_l1_loss(q_value, expected_q_value)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# --- Environment and network dimensions ---
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# --- Learner: Q-network, its optimiser, and the replay memory ---
model = DQN(state_size, action_size)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
buffer = ReplayBuffer(capacity=10_000)

# --- Hyperparameters ---
gamma = 0.99           # discount factor
epsilon = 1.0          # initial exploration probability
epsilon_decay = 0.995  # multiplicative decay applied to epsilon
min_epsilon = 0.01     # threshold below which epsilon stops decaying
batch_size = 64        # transitions sampled per training step
num_episodes = 100     # number of training episodes

def _reset_env(environment):
    """Reset ``environment`` and return only the initial observation.

    Newer Gym releases return ``(observation, info)`` from ``reset()``,
    older ones return the observation alone; both are accepted.
    """
    result = environment.reset()
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(environment, action):
    """Step ``environment`` and return ``(next_state, reward, terminated, truncated)``.

    Accepts both the 5-tuple step result of newer Gym releases and the
    older 4-tuple ``(obs, reward, done, info)``. For the 4-tuple, a time-limit
    cut-off is recognised through ``info["TimeLimit.truncated"]``.
    """
    result = environment.step(action)
    if len(result) == 5:
        next_state, reward, terminated, truncated, _ = result
    else:
        next_state, reward, done, info = result
        truncated = bool(done) and bool(info.get("TimeLimit.truncated", False))
        terminated = bool(done) and not truncated
    return next_state, reward, bool(terminated), bool(truncated)


# Training loop
for episode in range(num_episodes):
    state = _reset_env(env)
    done = False
    total_reward = 0

    while not done:
        action = epsilon_greedy_policy(state, epsilon)
        next_state, reward, terminated, truncated = _step_env(env, action)
        done = terminated or truncated
        total_reward += reward
        # Store only true termination as the terminal flag: a time-limit
        # cut-off is not a terminal state, so its target should still bootstrap.
        buffer.push(state, action, reward, next_state, terminated)
        state = next_state

        train_model(model, optimizer, batch_size, gamma)

        # Epsilon decays once per environment step and never drops below the floor.
        epsilon = max(min_epsilon, epsilon * epsilon_decay)

    print(f"Episode: {episode + 1}, Total Reward: {total_reward}")

# Evaluate the trained model
total_rewards = []
num_eval_episodes = 10

for _ in range(num_eval_episodes):
    state = _reset_env(env)
    done = False
    episode_reward = 0

    while not done:
        # Purely greedy action selection; no gradients are needed here.
        with torch.no_grad():
            q_values = model(torch.tensor(state, dtype=torch.float32))
        action = int(np.argmax(q_values.numpy()))
        next_state, reward, terminated, truncated = _step_env(env, action)
        done = terminated or truncated
        state = next_state
        episode_reward += reward

    total_rewards.append(episode_reward)

avg_reward = sum(total_rewards) / num_eval_episodes
print(f"Average Reward over {num_eval_episodes} episodes: {avg_reward}")


Episode: 1, Total Reward: 27.0
Episode: 2, Total Reward: 19.0
Episode: 3, Total Reward: 15.0
Episode: 4, Total Reward: 31.0
Episode: 5, Total Reward: 17.0
Episode: 6, Total Reward: 18.0
Episode: 7, Total Reward: 14.0
Episode: 8, Total Reward: 10.0
Episode: 9, Total Reward: 33.0
Episode: 10, Total Reward: 15.0
Episode: 11, Total Reward: 9.0
Episode: 12, Total Reward: 35.0
Episode: 13, Total Reward: 15.0
Episode: 14, Total Reward: 13.0
Episode: 15, Total Reward: 22.0
Episode: 16, Total Reward: 16.0
Episode: 17, Total Reward: 30.0
Episode: 18, Total Reward: 11.0
Episode: 19, Total Reward: 31.0
Episode: 20, Total Reward: 15.0
Episode: 21, Total Reward: 22.0
Episode: 22, Total Reward: 22.0
Episode: 23, Total Reward: 16.0
Episode: 24, Total Reward: 21.0
Episode: 25, Total Reward: 15.0
Episode: 26, Total Reward: 110.0
Episode: 27, Total Reward: 87.0
Episode: 28, Total Reward: 33.0
Episode: 29, Total Reward: 19.0
Episode: 30, Total Reward: 48.0
Episode: 31, Total Reward: 44.0
Episode: 32, Tota