In [None]:
pip install gymnasium



In [None]:
from time import sleep
import numpy as np
from IPython.display import clear_output
import gymnasium as gym
from gymnasium.envs.registration import register
import torch
from torch import nn


In [None]:
#Give colab access to your google drive:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
#Change current directory to folder with MiniPacMan
%cd /gdrive/MyDrive/CSCI181V/

/gdrive/MyDrive/CSCI181V


In [None]:
#Import MiniPacMan environment class definition
from MiniPacManGymV2 import MiniPacManEnv

In [None]:
#Register MiniPacMan in your gymnasium environments
register(
    id="MiniPacMan-v0",
    entry_point=MiniPacManEnv,  # Update with your actual module path
    max_episode_steps=20          # You can also set a default here
)

In [None]:
#Create a MiniPacMan gymnasium environment
env = gym.make("MiniPacMan-v0", render_mode="human", frozen_ghost=False)

In [None]:
import torch
import torch.nn as nn

class QNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(6 * 6, 128)  # Increase hidden units
        self.activation1 = nn.ReLU()
        self.linear2 = nn.Linear(128, 64)
        self.activation2 = nn.ReLU()
        self.linear3 = nn.Linear(64, 4)  # 4 actions

    def forward(self, x):
        x = self.flatten(x)
        x = self.linear1(x)
        x = self.activation1(x)
        x = self.linear2(x)
        x = self.activation2(x)
        x = self.linear3(x)
        return x


In [None]:
pred_network = QNetwork()
target_network = QNetwork()
target_network.load_state_dict(pred_network.state_dict())
Q_optimizer = torch.optim.Adam(pred_network.parameters(), lr=0.0001) #feel free to change this
loss_fn = nn.MSELoss()

In [None]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, next_states, dones = zip(*[self.buffer[i] for i in indices])
        return torch.stack(states), actions, torch.tensor(rewards), torch.stack(next_states), torch.tensor(dones)

In [None]:
def update_target_network(pred_network, target_network):
    target_network.load_state_dict(pred_network.state_dict())

In [None]:
#set hyperparams -- play with any of these!
gamma=0.99
buffer_size=1000
batch_size=64
num_episodes=10000
epsilon_min = 0.1
epsilon_decay = 0.9997
target_update_interval = 100
RB=ReplayBuffer(buffer_size) #initialize Replay Buffer
epsilon=1 #initialize epsilon

for e in range(num_episodes):
  new_obs,info=env.reset()
  new_obs=torch.tensor(new_obs,dtype=torch.float32)

  done=False
  truncated=False
  steps=0

  while not done and not truncated: #Loop for one episode
    obs=new_obs
    state_tensor = obs.unsqueeze(0)

    #choose action
    t=np.random.random()
    if t>epsilon:
      with torch.no_grad():
          q_values = pred_network(state_tensor)
          action = torch.argmax(q_values, dim=1).item()
    else:
      action=torch.randint(4,(1,)).item()

    #take a step:
    new_obs,reward, done, truncated, info=env.step(action)
    new_obs=torch.tensor(new_obs,dtype=torch.float32)
    RB.push(obs,action,reward,new_obs,done)
    steps+=1

    if len(RB.buffer)>=batch_size:
      states, actions, rewards, next_states, dones=RB.sample(batch_size)

      #Define predictions, targets, loss here
      actions = torch.tensor(actions, dtype=torch.long)  # Ensure it's integer type
      current_q = pred_network(states)[range(batch_size), actions]


      # Compute the next state's Q-values.
      with torch.no_grad():
        next_q = target_network(next_states)
        max_next_q, _ = next_q.max(dim=1)  # Get max Q-value for next state
      target_q = rewards + gamma * max_next_q * (1 - dones.float())

      #loss
      loss = loss_fn(current_q, target_q)
      Q_optimizer.zero_grad()
      loss.backward()
      Q_optimizer.step()

  #reduce episilon if its not too low:
  epsilon = max(epsilon_min, epsilon * epsilon_decay)

  #periodic reporting:
  if e>0 and e%100==0:
    print(f'episode: {e}, steps: {steps}, epislon: {epsilon},win: {reward==10}')

  if e % target_update_interval == 0:
        update_target_network(pred_network, target_network)
        print(f'Target network updated at episode {e}')


Target network updated at episode 0
episode: 100, steps: 1, epislon: 0.9701500333301386,win: False
Target network updated at episode 100
episode: 200, steps: 5, epislon: 0.9414735292292377,win: False
Target network updated at episode 200
episode: 300, steps: 4, epislon: 0.913644669161937,win: False
Target network updated at episode 300
episode: 400, steps: 1, epislon: 0.8866383977586842,win: False
Target network updated at episode 400
episode: 500, steps: 1, epislon: 0.8604304002574455,win: False
Target network updated at episode 500
episode: 600, steps: 1, epislon: 0.8349970806122093,win: False
Target network updated at episode 600
episode: 700, steps: 2, epislon: 0.8103155402485782,win: False
Target network updated at episode 700
episode: 800, steps: 5, epislon: 0.786363557447322,win: False
Target network updated at episode 800
episode: 900, steps: 1, epislon: 0.7631195673373274,win: False
Target network updated at episode 900
episode: 1000, steps: 5, epislon: 0.7405626424799333,win:

In [None]:
import torch
import numpy as np
from time import sleep
from IPython.display import clear_output

# Set epsilon to 0 for full exploitation (greedy policy)
epsilon = 0

win = False  # Track whether Pac-Man has won

while not win:  # Keep running episodes until a win
    obs, info = env.reset()
    done = False
    truncated = False
    episode_reward = 0  # Track reward per episode

    while not done and not truncated:
        env.render()
        obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)  # Add batch dimension

        # Choose action (greedy exploitation)
        with torch.no_grad():
            q_values = pred_network(obs)
            action = torch.argmax(q_values, dim=1).item()

        # Take action
        obs, reward, done, truncated, info = env.step(action)
        episode_reward += reward  # Accumulate reward

        sleep(0.5)  # Adjust for better visualization
        clear_output(wait=True)

    env.render()

    # Check if Pac-Man won
    if episode_reward >= 20:  # Adjust based on reward structure
        print(f"Pac-Man won with a reward of {episode_reward}!")
        win = True  # Exit loop

env.close()


xxxxxx
x····x
x····x
x····x
x·ᗣᗧ◯x
xxxxxx



In [4]:
import torch
import numpy as np
from time import sleep
from IPython.display import clear_output

obs, info = env.reset()
done = False
truncated = False

while not done and not truncated:
    env.render()
    obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)  # Add batch dimension

    t = np.random.random()
    if t > epsilon:  # Exploitation
        with torch.no_grad():
            q_values = pred_network(obs)  # Get Q-values from the network
            action = torch.argmax(q_values, dim=1).item()  # Select best action
    else:  # Exploration
        action = env.action_space.sample()  # Choose a random action

    # Take action
    obs, reward, done, truncated, info = env.step(action)

    sleep(1)  # Pause for visualization
    clear_output(wait=True)

env.render()
env.close()

NameError: name 'env' is not defined

In [None]:
import torch
import numpy as np
from time import sleep
from IPython.display import clear_output

obs, info = env.reset()
done = False
truncated = False

while not done and not truncated:
    obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)  # Add batch dimension

    t = np.random.random()
    if t > epsilon:  # Exploitation
        with torch.no_grad():
            q_values = pred_network(obs)  # Get Q-values from the network
            action = torch.argmax(q_values, dim=1).item()  # Select best action
    else:  # Exploration
        action = env.action_space.sample()  # Choose a random action

    # Take action
    obs, reward, done, truncated, info = env.step(action)
    obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)  # Convert new observation


    sleep(1)  # Pause for visualization
    clear_output(wait=True)

env.render()
env.close()



xxxxxx
x····x
xᙉ···x
x··ᗧ·x
x····x
xxxxxx



In [None]:
obs, info = env.reset()
done = False
truncated = False

while not done and not truncated:
    env.render()
    obs=torch.tensor(obs,dtype=torch.float32)
    action=#your code here
    obs, reward, done, truncated, info = env.step(action)
    sleep(1)
    clear_output(wait=True)

env.render()
env.close()

In [None]:
import torch
import torch.nn as nn

class QNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(6 * 6, 128)  # Increase hidden units
        self.activation1 = nn.ReLU()
        self.linear2 = nn.Linear(128, 64)
        self.activation2 = nn.ReLU()
        self.linear3 = nn.Linear(64, 4)  # 4 actions

    def forward(self, x):
        x = self.flatten(x)
        x = self.linear1(x)
        x = self.activation1(x)
        x = self.linear2(x)
        x = self.activation2(x)
        x = self.linear3(x)
        return x

In [None]:
# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize networks on GPU
pred_network = QNetwork().to(device)
target_network = QNetwork().to(device)
secondary_target_network = QNetwork().to(device)  # If using double DQN

# Optimizer (it automatically moves params to GPU)
Q_optimizer = torch.optim.Adam(pred_network.parameters(), lr=0.01)


In [2]:
print("
xxxxxx
x····x
xᗧ···x
x····x
x····x
xxxxxx

"")

SyntaxError: unterminated string literal (detected at line 1) (<ipython-input-2-c723e25f4224>, line 1)

In [3]:
print("""
xxxxxx
x····x
xᗧ···x
x····x
x····x
xxxxxx
""")



xxxxxx
x····x
xᗧ···x
x····x
x····x
xxxxxx

