In [None]:
pip install gymnasium



In [None]:
from time import sleep
import numpy as np
from IPython.display import clear_output
import gymnasium as gym
from gymnasium.envs.registration import register

In [None]:
#Give colab access to your google drive:
# The drive is mounted at /gdrive; the %cd cell that follows relies on this mount point.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# Switch the working directory to the course folder on the mounted drive.
# Presumably this is where MiniPacManGym.py lives, since it is imported right after -- confirm.
%cd /gdrive/MyDrive/CSCI181V/

/gdrive/MyDrive/CSCI181V


In [None]:
#Import MiniPacMan environment class definition
from MiniPacManGym import MiniPacManEnv

In [None]:
#Register MiniPacMan in your gymnasium environments
# After this call, gym.make("MiniPacMan-v0", ...) constructs a MiniPacManEnv.
# max_episode_steps=20 asks gymnasium to truncate an episode once it reaches 20 steps.
register(
    id="MiniPacMan-v0",
    entry_point=MiniPacManEnv,
    max_episode_steps=20
)

In [None]:
#Create a MiniPacMan gymnasium environment
# render_mode and frozen_ghost are passed along with the registered id;
# frozen_ghost=True presumably keeps the ghost stationary -- confirm in MiniPacManGym.
env = gym.make("MiniPacMan-v0", render_mode="human", frozen_ghost=True)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class QNetwork(nn.Module):
    """MLP that maps a 2-feature state (Pac-Man's row and column) to 4 action values."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 4)  # output head: one Q-value per action

    def forward(self, x):
        """Return Q-values of shape (batch, 4) for an input of shape (batch, 2)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # The output head must be applied: stopping at fc3 would return 64 values,
        # and an argmax over them could select an action index outside 0..3.
        return self.fc4(x)

# Prefer the GPU when one is visible to torch; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the network on the chosen device, then the optimizer and loss used by the training loop.
q_network = QNetwork()
q_network = q_network.to(device)
opt = optim.Adam(q_network.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

In [None]:
# Hyperparameters for the position-based DQN run
gamma = 0.995           # discount factor
num_episodes = 10000
epsilon = 1.0           # start fully exploratory
epsilon_min = 0.1       # exploration floor
epsilon_decay = 0.9996  # multiplicative decay applied once per episode

for e in range(num_episodes):
    new_obs, info = env.reset()
    new_pos = np.argwhere(new_obs == 1)[0]  # Pac-Man is the cell whose value is 1
    done = False
    truncated = False
    steps = 0

    while not done and not truncated:
        pos = new_pos
        state_tensor = torch.tensor([pos[0], pos[1]], dtype=torch.float32, device=device).unsqueeze(0)

        # Epsilon-greedy action selection
        if np.random.random() > epsilon:
            with torch.no_grad():  # exploitation
                q_values = q_network(state_tensor)
                action = torch.argmax(q_values, dim=1).item()
        else:
            action = np.random.randint(4)  # exploration

        new_obs, reward, done, truncated, info = env.step(action)
        steps += 1
        new_pos = np.argwhere(new_obs == 1)[0]
        next_state_tensor = torch.tensor([new_pos[0], new_pos[1]], dtype=torch.float32, device=device).unsqueeze(0)

        # Prediction for the action actually taken (keeps the graph for backprop)
        q_pred = q_network(state_tensor)[0, action]

        # One-step TD target, computed without tracking gradients
        with torch.no_grad():
            max_q_next = torch.max(q_network(next_state_tensor))
            # Do not bootstrap past a terminal transition: once the episode has ended
            # (done), the return is just the immediate reward. Truncation by the step
            # limit is not terminal, so it still bootstraps.
            q_target = reward + gamma * max_q_next * (0.0 if done else 1.0)

        loss = loss_fn(q_pred, q_target)

        # Backpropagation. The optimizer created in the setup cell is named `opt`;
        # referring to `optimizer` here raised NameError.
        opt.zero_grad()
        loss.backward()
        opt.step()

    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    if e % 100 == 0:
        print(f"Episode: {e}, Steps: {steps}, Epsilon: {epsilon:.3f}, Win: {reward == 10}")

NameError: name 'optimizer' is not defined

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class QNetwork(nn.Module):
    """Two-layer MLP: flattened 6x6 board (36 inputs) -> 128 hidden units -> 4 action values."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(6 * 6, 128)
        self.activation = nn.ReLU()
        self.fc2 = nn.Linear(128, 4)

    def forward(self, x):
        """Map a (batch, 36) float tensor to (batch, 4) Q-values."""
        hidden = self.activation(self.fc1(x))
        return self.fc2(hidden)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
q_network = QNetwork().to(device)

# The learning rate has to exist before the optimizer is built. It was otherwise only
# assigned in a later training cell, so running the notebook top-to-bottom on a fresh
# kernel failed here with NameError. The value matches that later assignment.
lr = 0.002
opt = torch.optim.Adam(q_network.parameters(), lr=lr)
loss_fn = nn.MSELoss()



In [None]:
# Hyperparameters
gamma = 0.995           # discount factor
num_episodes = 10000
epsilon = 1.0           # start fully exploratory
epsilon_min = 0.1       # exploration floor
epsilon_decay = 0.9997  # Slower decay

for e in range(num_episodes):
    new_obs, info = env.reset()
    done = False
    truncated = False
    steps = 0

    while not done and not truncated:
        # Flatten full game state for the Q-network
        state_tensor = torch.tensor(new_obs.flatten(), dtype=torch.float32, device=device).unsqueeze(0)

        # Choose action (ε-greedy)
        if np.random.random() > epsilon:
            with torch.no_grad():
                q_values = q_network(state_tensor)
                action = torch.argmax(q_values, dim=1).item()
        else:
            action = np.random.randint(4)

        # Take action in the environment
        new_obs, reward, done, truncated, info = env.step(action)
        steps += 1

        # Prepare next state
        next_state_tensor = torch.tensor(new_obs.flatten(), dtype=torch.float32, device=device).unsqueeze(0)

        # Q-value prediction for the chosen action (gradient flows through this one)
        q_pred = q_network(state_tensor)[0, action]

        # Compute target Q-value; no_grad already keeps it out of the graph
        with torch.no_grad():
            max_q_next = torch.max(q_network(next_state_tensor))
            # Do not bootstrap past a terminal transition: once the episode has ended
            # (done), the return is just the immediate reward. Truncation by the step
            # limit is not terminal, so it still bootstraps.
            q_target = reward + gamma * max_q_next * (0.0 if done else 1.0)

        # Compute loss
        loss = loss_fn(q_pred, q_target)

        # Backpropagation
        opt.zero_grad()
        loss.backward()
        opt.step()

    # Update epsilon (slower decay)
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Progress monitoring
    if e % 100 == 0:
        print(f"Episode: {e}, Steps: {steps}, Epsilon: {epsilon:.3f}, Win: {reward == 10}")

Episode: 0, Steps: 12, Epsilon: 1.000, Win: True
Episode: 100, Steps: 1, Epsilon: 0.970, Win: False
Episode: 200, Steps: 3, Epsilon: 0.941, Win: False
Episode: 300, Steps: 1, Epsilon: 0.914, Win: False
Episode: 400, Steps: 5, Epsilon: 0.887, Win: False
Episode: 500, Steps: 8, Epsilon: 0.860, Win: False
Episode: 600, Steps: 4, Epsilon: 0.835, Win: False
Episode: 700, Steps: 1, Epsilon: 0.810, Win: False
Episode: 800, Steps: 1, Epsilon: 0.786, Win: False
Episode: 900, Steps: 4, Epsilon: 0.763, Win: False
Episode: 1000, Steps: 20, Epsilon: 0.741, Win: False
Episode: 1100, Steps: 3, Epsilon: 0.719, Win: False
Episode: 1200, Steps: 1, Epsilon: 0.697, Win: False
Episode: 1300, Steps: 1, Epsilon: 0.677, Win: False
Episode: 1400, Steps: 2, Epsilon: 0.657, Win: False
Episode: 1500, Steps: 1, Epsilon: 0.637, Win: False
Episode: 1600, Steps: 12, Epsilon: 0.619, Win: False
Episode: 1700, Steps: 2, Epsilon: 0.600, Win: False
Episode: 1800, Steps: 18, Epsilon: 0.583, Win: False
Episode: 1900, Steps:

In [None]:
# Greedy rollout of the board-based Q-network: render, act, pause one second, repeat.
obs, info = env.reset()
done, truncated = False, False

while not (done or truncated):
    env.render()

    # Network input: the whole board flattened into a single-row float batch
    flat_board = obs.flatten()
    state_tensor = torch.tensor(flat_board, dtype=torch.float32, device=device).unsqueeze(0)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        q_values = q_network(state_tensor)

    # Greedy choice: index of the largest Q-value in the row
    action = q_values.argmax(dim=1).item()

    # Advance the environment by one step
    obs, reward, done, truncated, info = env.step(action)

    sleep(1)
    clear_output(wait=True)

env.close()


xxxxxx
x····x
x··ᗣ·x
x···ᗧx
x···◯x
xxxxxx



In [None]:
# Hyperparameters for the board-based DQN run with a linear epsilon schedule
gamma = 0.995  # discount factor
# NOTE: the optimizer is constructed in the network-setup cell, so changing `lr` here
# has no effect unless that cell is re-run afterwards.
lr = 0.002
num_episodes = 10000
epsilon = 1.0
# Linear schedule: epsilon drops by 1/num_episodes per episode and never goes below 0.01.
# (The multiplicative epsilon_min / epsilon_decay pair was declared here but never used.)
epsilon_floor = 0.01
epsilon_step = 1.0 / num_episodes

for e in range(num_episodes):
    new_obs, info = env.reset()
    done = False
    truncated = False
    steps = 0

    while not done and not truncated:
        # The current network takes the flattened 6x6 board (36 values) as input.
        state_tensor = torch.tensor(new_obs.flatten(), dtype=torch.float32, device=device).unsqueeze(0)

        # Epsilon-greedy action selection
        if np.random.random() > epsilon:
            with torch.no_grad():  # exploitation
                q_values = q_network(state_tensor)
                action = torch.argmax(q_values, dim=1).item()
        else:
            action = np.random.randint(4)  # exploration

        new_obs, reward, done, truncated, info = env.step(action)
        steps += 1

        # The next state must use the same encoding as the current one. Feeding the
        # 2-value position here raised
        # "mat1 and mat2 shapes cannot be multiplied (1x2 and 36x128)".
        next_state_tensor = torch.tensor(new_obs.flatten(), dtype=torch.float32, device=device).unsqueeze(0)

        q_pred = q_network(state_tensor)[0, action]

        with torch.no_grad():
            max_q_next = torch.max(q_network(next_state_tensor))
            # Do not bootstrap past a terminal transition: once the episode has ended
            # (done), the return is just the immediate reward.
            q_target = reward + gamma * max_q_next * (0.0 if done else 1.0)

        loss = loss_fn(q_pred, q_target)

        # Backpropagation. The optimizer object is named `opt`; `optimizer` is undefined.
        opt.zero_grad()
        loss.backward()
        opt.step()

    epsilon = max(epsilon_floor, epsilon - epsilon_step)

    if e % 100 == 0:
        print(f"Episode: {e}, Steps: {steps}, Epsilon: {epsilon:.3f}, Win: {reward == 10}")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x2 and 36x128)

In [None]:
#Run this code cell to see your trained agent in action!

obs, info = env.reset()
done = False
truncated = False

while not done and not truncated:
    env.render()
    # The most recently defined QNetwork starts with nn.Linear(6 * 6, 128), so it needs
    # the flattened board as input. Passing the 2-value Pac-Man position raised
    # "mat1 and mat2 shapes cannot be multiplied (1x2 and 36x128)".
    state_tensor = torch.tensor(obs.flatten(), dtype=torch.float32, device=device).unsqueeze(0)
    # Get the Q-values for the current state from Q-network.
    with torch.no_grad():
        q_values = q_network(state_tensor)
    action = torch.argmax(q_values, dim=1).item()     # Choose the action with the highest Q-value.
    obs, reward, done, truncated, info = env.step(action)
    sleep(1)
    clear_output(wait=True)

env.close()

xxxxxx
xᗧ···x
x··ᗣ·x
x····x
x···◯x
xxxxxx



RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x2 and 36x128)

In [None]:
import torch
import numpy as np
from time import sleep
from IPython.display import clear_output

# Replay greedy episodes until Pac-Man wins, giving up after `max_attempts` tries.
# The policy here is purely greedy (no exploration), so if env.reset() is deterministic
# a losing policy would lose the same way every time; without a cap this cell can spin
# until it is interrupted by hand.
max_attempts = 10
success = False
attempts = 0  # Track the number of attempts

while not success and attempts < max_attempts:
    attempts += 1
    print(f"Attempt: {attempts}")

    obs, info = env.reset()
    done = False
    truncated = False
    total_reward = 0  # Track total reward

    while not done and not truncated:
        env.render()

        # The most recently defined QNetwork starts with nn.Linear(6 * 6, 128), so it
        # needs the flattened board; the 2-value position does not match its input size.
        state_tensor = torch.tensor(obs.flatten(), dtype=torch.float32, device=device).unsqueeze(0)

        # Get Q-values from the Q-network
        with torch.no_grad():
            q_values = q_network(state_tensor)

        # Choose the best action
        action = torch.argmax(q_values, dim=1).item()

        # Take the step
        obs, reward, done, truncated, info = env.step(action)
        total_reward += reward  # Accumulate total reward

        sleep(0.5)  # Adjust sleep time for visibility
        clear_output(wait=True)

    # Check if Pac-Man won
    if total_reward >= 10:
        success = True
        print(f"✅ Success! Pac-Man won after {attempts} attempts.")
    else:
        print("❌ Failed. Restarting...")

if not success:
    print(f"Gave up after {attempts} attempts without a win.")


xxxxxx
xᗧ···x
x··ᗣ·x
x····x
x···◯x
xxxxxx



KeyboardInterrupt: 

In [None]:
#Run this code cell to see your trained agent in action!

obs, info = env.reset()
done = False
truncated = False

while not done and not truncated:
    env.render()
    # Encode the state the way the most recently defined QNetwork expects it: that
    # network starts with nn.Linear(6 * 6, 128), so the input is the flattened board,
    # not the 2-value Pac-Man position.
    state_tensor = torch.tensor(obs.flatten(), dtype=torch.float32, device=device).unsqueeze(0)
    # Get the Q-values for the current state from your Q-network.
    with torch.no_grad():
        q_values = q_network(state_tensor)
    action = torch.argmax(q_values, dim=1).item()     # Choose the action with the highest Q-value.
    obs, reward, done, truncated, info = env.step(action)
    sleep(1)
    clear_output(wait=True)

env.close()

xxxxxx
x···ᗧx
x··ᗣ·x
x····x
x···◯x
xxxxxx



In [None]:
# Tabular Q-learning hyperparameters -- feel free to play with these!
gamma = 0.95          # discount factor applied to the best next-state value
alpha = 0.9           # learning rate: how far each update moves toward the TD target
num_episodes = 10000

# Start fully exploratory with an all-zero table.
epsilon = 1
Q = np.zeros((6, 6, 4))  # axes: (row, column, action)

for e in range(num_episodes):
  new_obs, info = env.reset()
  new_pos = np.argwhere(new_obs == 1)[0]  # Pac-Man is the cell whose value is 1
  done, truncated = False, False
  steps = 0

  while not (done or truncated):  # one episode
    obs, pos = new_obs, new_pos

    # Epsilon-greedy: draw t in [0, 1); explore when t <= epsilon, otherwise exploit.
    # Exploration gathers information about the environment; exploitation uses the
    # values learned so far.
    t = np.random.random()
    if t <= epsilon:
      action = np.random.randint(4)  # random action index 0..3
    else:
      action = np.argmax(Q[pos[0], pos[1], :])  # best known action for this cell

    # Advance the environment and locate Pac-Man in the new observation.
    new_obs, reward, done, truncated, info = env.step(action)
    steps += 1
    new_pos = np.argwhere(new_obs == 1)[0]

    # Q-learning update:
    #   Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
    best_next = np.max(Q[new_pos[0], new_pos[1], :])
    td_error = reward + gamma * best_next - Q[pos[0], pos[1], action]
    Q[pos[0], pos[1], action] = Q[pos[0], pos[1], action] + alpha * td_error

  # Shrink epsilon by 0.5% per episode, never below 0.1, so at least 10% of actions
  # stay random. With this factor the floor is reached after roughly 460 episodes.
  epsilon = max(0.1, epsilon * 0.995)

  # Periodic progress report
  if e % 100 == 0:
    print(f'episode: {e}, steps: {steps}, epislon: {epsilon}, win: {reward==10}')


episode: 0, steps: 8, epislon: 0.995, win: False
episode: 100, steps: 5, epislon: 0.6027415843082742, win: False
episode: 200, steps: 4, epislon: 0.36512303261753626, win: False
episode: 300, steps: 8, epislon: 0.2211807388415433, win: True
episode: 400, steps: 1, epislon: 0.13398475271138335, win: False
episode: 500, steps: 10, epislon: 0.1, win: True
episode: 600, steps: 6, epislon: 0.1, win: True
episode: 700, steps: 6, epislon: 0.1, win: True
episode: 800, steps: 6, epislon: 0.1, win: True
episode: 900, steps: 6, epislon: 0.1, win: True
episode: 1000, steps: 6, epislon: 0.1, win: True
episode: 1100, steps: 9, epislon: 0.1, win: False
episode: 1200, steps: 6, epislon: 0.1, win: True
episode: 1300, steps: 1, epislon: 0.1, win: False
episode: 1400, steps: 5, epislon: 0.1, win: False
episode: 1500, steps: 6, epislon: 0.1, win: True
episode: 1600, steps: 14, epislon: 0.1, win: True
episode: 1700, steps: 6, epislon: 0.1, win: True
episode: 1800, steps: 7, epislon: 0.1, win: False
episode

In [None]:
#Run this code cell to see your trained agent in action!

obs, info = env.reset()
done, truncated = False, False

while not (done or truncated):
    env.render()

    # Locate Pac-Man: `obs == 1` marks his cell, np.argwhere lists the matching
    # (row, column) indices, and [0] takes the first match.
    pos = np.argwhere(obs == 1)[0]

    # Q is indexed (row, column, action); slicing by position leaves the 4 action
    # values for this cell, and the greedy policy takes the largest.
    action_values = Q[pos[0], pos[1], :]
    action = np.argmax(action_values)

    obs, reward, done, truncated, info = env.step(action)
    sleep(1)
    clear_output(wait=True)

env.close()

In [None]:
#Run this code cell to see your trained agent in action!

obs, info = env.reset()
done, truncated = False, False

while not (done or truncated):
    env.render()

    # Pac-Man's (row, column): first index at which the observation equals 1.
    pos = np.argwhere(obs == 1)[0]

    # Greedy policy over the Q-table: look up the 4 action values stored for this
    # cell and pick the index of the largest one.
    action_values = Q[pos[0], pos[1], :]
    action = np.argmax(action_values)

    obs, reward, done, truncated, info = env.step(action)
    sleep(1)
    clear_output(wait=True)

env.close()

xxxxxx
xᗧ···x
x··ᗣ·x
x····x
x···◯x
xxxxxx



NameError: name 'Q' is not defined