In [8]:
# Importing the required packages
import numpy as np
import gym
from gym.wrappers import GrayScaleObservation
from gym.wrappers import ResizeObservation
from gym.wrappers import FrameStack
from qnetwork import QNetwork
import ale_py

import torch
from torch import nn
from torch.functional import F

from matplotlib import pyplot as plt
from collections import deque
import sys
import importlib

In [9]:
# Spawning the environment
env = gym.make('ALE/Pong-v5', render_mode='rgb_array')

In [10]:
# Testing the environment 
done = True
print('Press interrupt to stop execution')
rewards = 0.0
try:
    for step in range(5000):
        if done:
            state = env.reset()
        state, reward, interupted, terminated, info = env.step(env.action_space.sample())
        rewards += reward
        done = interupted or terminated
except KeyboardInterrupt:
       print('Execution Interrupted.')
finally:
    env.close()
print('Total Reward:', rewards)

# Structure the environment for processing
env = GrayScaleObservation(env)
env = ResizeObservation(env, 84)
env = FrameStack(env, 4)

Press interrupt to stop execution


  if not isinstance(terminated, (bool, np.bool8)):


Total Reward: -107.0


In [11]:
# Initialize the Q-Network
import math


q_net = QNetwork(num_channels=4, num_actions=env.action_space.n)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(q_net.parameters(), lr=0.0001)
loss_fn = nn.MSELoss()

# Initialize the replay buffer
replay_buffer = deque(maxlen=100000)

# Initialize hyperparameters
epsilon_start = 1.0
epsilon_end = 0.02
epsilon_decay = 1000000
batch_size = 32
gamma = 0.99
update_freq = 10000
num_episodes = 200
step = 0
epsilon_by_frame = lambda frame_idx: epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1. * frame_idx / epsilon_decay)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
# Training loop
for episode in range(num_episodes):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        
        epsilon = epsilon_by_frame(frame_idx)
        # Choose action using epsilon-greedy policy
        if np.random.uniform() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                obs_tensor = torch.tensor(obs).unsqueeze(0).to(device)
                action = q_net(obs_tensor).argmax().item()

        # Take action and observe next state and reward
        next_obs, reward, done, info = env.step(action)
        total_reward += reward

        # Store transition in replay buffer
        replay_buffer.store_transition(obs, action, reward, next_obs, done)

        # Sample mini-batch from replay buffer
        obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = replay_buffer.sample_batch(batch_size)

        # Compute targets using Bellman equation
        with torch.no_grad():
            next_Q = q_net(torch.tensor(next_obs_batch).to(device)).max(dim=1)[0]
            target_Q = torch.where(done_batch, torch.tensor(reward_batch).to(device), reward_batch + gamma * next_Q)

        # Compute loss and update Q-network parameters
        obs_batch_tensor = torch.tensor(obs_batch).to(device)
        action_batch_tensor = torch.tensor(action_batch).unsqueeze(1).to(device)
        Q = q_net(obs_batch_tensor).gather(1, action_batch_tensor)
        loss = F.mse_loss(Q, target_Q.unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        obs = next_obs

    # Decay epsilon
    epsilon *= epsilon_decay

    # Print total reward for episode
    print("Episode {} total reward: {:.2f}".format(episode+1, total_reward))

    # Save model checkpoint every 100 episodes
    if (episode+1) % 100 == 0:
        torch.save(q_net.state_dict(), "QNet_checkpoint_{}.pt".format(episode+1))


NameError: name 'frame_idx' is not defined