In [1]:
import random
# Install required libraries
# Import required libraries
import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import namedtuple, deque
# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

print(device)
# NOTE(review): this only binds a Python variable; it does not set the
# CUDA_LAUNCH_BLOCKING environment variable. Confirm whether synchronous
# CUDA launches were actually intended here.
CUDA_LAUNCH_BLOCKING = 1

cuda


In [2]:
class Net(nn.Module):
    """Two-layer fully connected network mapping an observation to one value per action.

    Args:
        obs: number of input features.
        action: number of outputs (one per action).
    """

    def __init__(self, obs, action):
        super().__init__()
        # Attribute names are part of the state-dict keys, so they are kept as-is.
        self.layer1 = nn.Linear(obs, 64)
        self.layer2 = nn.Linear(64, action)

    def forward(self, x):
        """Apply layer1, a ReLU, then layer2, and return the result."""
        hidden = self.layer1(x)
        activated = F.relu(hidden)
        return self.layer2(activated)

In [3]:
class DQN:
  """Deep Q-Network agent with an experience-replay buffer and a target network.

  The agent owns two `Net` instances: `policy_net` (trained every step) and
  `target_net` (a periodically refreshed copy used to compute bootstrap
  targets). Transitions are stored as
  `(state, action, next_state, reward)` tuples where `next_state` is `None`
  for a terminal step.
  """

  # Number of transitions sampled from the replay buffer per gradient step.
  BATCH_SIZE = 128

  def __init__(self, N, C, env):
    """Create the agent.

    Args:
      N: capacity of the replay buffer (number of transitions kept).
      C: the target network is re-synchronised every C stored transitions.
      env: a Gymnasium-style environment with a discrete action space.
    """
    # initialize environment
    self.env = env
    # initialize replay memory to capacity N (a list used as a ring buffer)
    self.replay = []
    self.capacity = N
    # Total number of transitions stored so far; doubles as the global step count.
    self.pointer = 0
    # `device` is the module-level torch device chosen in the setup cell.
    self.device = device

    state, info = env.reset()
    # Scalar observations (Python or NumPy integers) become one input feature.
    obs_size = 1 if np.ndim(state) == 0 else len(state)

    self.policy_net = Net(obs_size, self.env.action_space.n).to(self.device)
    self.target_net = Net(obs_size, self.env.action_space.n).to(self.device)
    # Both networks must start from the same weights; otherwise the first
    # targets come from an unrelated random network.
    self.target_net.load_state_dict(self.policy_net.state_dict())

    self.optimizer = optim.SGD(self.policy_net.parameters(), lr=0.1)
    self.C = C

  def _to_tensor(self, observation):
    """Return `observation` as a float32 tensor of shape (1, n_features).

    Tensors are returned unchanged; scalars are treated as a single feature.
    """
    if torch.is_tensor(observation):
      return observation
    features = np.atleast_1d(np.asarray(observation, dtype=np.float32))
    # torch.tensor copies, so stored transitions never alias the env's buffers.
    return torch.tensor(features, device=self.device, dtype=torch.float32).unsqueeze(0)

  def _store(self, transition):
    """Append `transition`, overwriting the oldest entry once the buffer is full."""
    if len(self.replay) < self.capacity:
      self.replay.append(transition)
    else:
      self.replay[self.pointer % self.capacity] = transition
    self.pointer += 1

  # Main training function
  def train(self, episodes, epsilon, discount, action_function, greedy):
    """Run `episodes` episodes, learning after every environment step.

    Args:
      episodes: number of episodes to run.
      epsilon: base of the exploration schedule; episode i uses epsilon ** i.
      discount: discount factor applied to the bootstrapped next-state value.
      action_function: callable `(state, eps) -> action`.
      greedy: when true, exploration is disabled (eps = 0).

    Returns:
      A list with the cumulative (shaped) reward of each episode.

    The environment is left open so that `train` can be called again on the
    same agent; call `close()` (or `env.close()`) when finished.
    """
    total_reward = [0] * episodes
    for i in range(episodes):
      # initialize sequence S and preprocessed sequence o
      state, info = self.env.reset()
      state = self._to_tensor(state)
      stopped = False
      rewards = 0
      eps = epsilon ** i if not greedy else 0
      while not stopped:
        # Select action and observe reward
        action_type = action_function(state, eps)
        observation, reward, terminated, truncated, _ = self.env.step(action_type)
        stopped = terminated or truncated
        # Reward shaping (for Lake): a zero reward becomes -1, and an
        # episode that ends on such a step is penalised with -10.
        reward = reward if reward != 0 else -1
        reward = -10 if reward == -1 and stopped else reward
        self.env.render()

        # A terminal step has no successor to bootstrap from. A truncated
        # (time-limit) step still does, so its next state is kept.
        next_state = None if terminated else self._to_tensor(observation)
        # Encode action type for ease of use
        action_tensor = torch.tensor([action_type], device=self.device, dtype=torch.int64)
        # Store exactly one transition per environment step.
        self._store((state, action_tensor, next_state, reward))
        state = next_state

        # Add rewards to the count
        rewards += reward
        # One gradient step with a constant discount factor.
        self.replay_function(discount)
        # Every C stored transitions (counted across episodes) refresh the target network.
        if self.pointer % self.C == 0:
          self.target_net.load_state_dict(self.policy_net.state_dict())

      print("Episode: ", i, " Reward: ", rewards)
      total_reward[i] = rewards
    return total_reward

  def close(self):
    """Close the wrapped environment."""
    self.env.close()

  def classic_action(self, state, epsilon):
    """Epsilon-greedy action selection.

    With probability `epsilon` a random action is sampled from the action
    space; otherwise the action with the highest policy-network output is
    returned.
    """
    state = self._to_tensor(state)
    if np.random.rand() < epsilon:
      return self.env.action_space.sample()
    with torch.no_grad():
      return self.policy_net(state).max(1).indices.item()

  def replay_function(self, discount):
    """Sample a mini-batch from the replay buffer and take one optimisation step.

    Does nothing until the buffer holds at least `BATCH_SIZE` transitions.
    The target is `reward` for terminal transitions and
    `reward + discount * max_a target_net(next_state)` otherwise.
    """
    batch_size = self.BATCH_SIZE
    if len(self.replay) < batch_size:
      return

    batch = random.sample(self.replay, k=batch_size)
    states, actions, next_states, rewards = zip(*batch)

    state_batch = torch.cat(states)
    action_batch = torch.cat(actions)
    reward_batch = torch.tensor(rewards, device=self.device, dtype=torch.float32)

    # Terminal transitions keep a next-state value of zero.
    has_next = torch.tensor([s is not None for s in next_states],
                            device=self.device, dtype=torch.bool)
    next_values = torch.zeros(batch_size, device=self.device)
    if has_next.any():
      next_batch = torch.cat([s for s in next_states if s is not None])
      # Targets are constants: no gradient flows through the target network.
      with torch.no_grad():
        next_values[has_next] = self.target_net(next_batch).max(1).values
    target_batch = reward_batch + discount * next_values

    # Q-value of the action that was actually taken in each sampled state.
    selected_q_values = self.policy_net(state_batch).gather(1, action_batch.unsqueeze(1))
    loss_function = nn.SmoothL1Loss()
    loss = loss_function(selected_q_values, target_batch.unsqueeze(1))
    # backprop
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

  # Save the current weights
  def save(self, filename):
    """Pickle the policy network to `pickles/<filename>`, creating the directory if needed."""
    import os
    os.makedirs("pickles", exist_ok=True)
    with open("pickles/" + filename, 'wb') as file:
      pickle.dump(self.policy_net, file)


In [4]:
# Plots the reward per episode graph
def reward_print(reward_per_episode, episodes, info):
    """Plot cumulative reward against episode index.

    Args:
        reward_per_episode: sequence of per-episode rewards.
        episodes: value used for the x-axis limit and tick spacing.
        info: label inserted into the plot title.
    """
    lowest = int(min(reward_per_episode))
    highest = int(max(reward_per_episode))
    # Pad the y-range by 20% below the minimum and 30% above the maximum.
    y_floor = lowest - abs(lowest * (.2))
    y_ceiling = highest + abs(highest * (.3))
    tick_positions = [0, episodes * .2, episodes * .4, episodes * .6, episodes * .8, episodes]

    plt.figure()
    plt.plot(reward_per_episode)
    plt.title(f'Cumulative Reward Per Episode ({info})', fontsize=24)
    plt.xlabel('Episode', fontsize=20)
    plt.ylabel('Cumulative Reward', fontsize=20)
    plt.xticks(tick_positions, fontsize=18)
    plt.yticks(fontsize=18)
    plt.ylim(ymin=y_floor, ymax=y_ceiling)
    plt.xlim(xmin=0, xmax=episodes)
    plt.grid()
    plt.show()

# Plots the epsilon decay graph
def ep_decay(eps, episodes):
    """Plot the exploration rate eps ** i for i in range(episodes).

    Args:
        eps: base of the exponential decay.
        episodes: number of points plotted; also the x-axis limit.
    """
    decay_curve = []
    for episode_index in range(episodes):
        decay_curve.append(eps ** episode_index)
    tick_positions = [0, episodes * .2, episodes * .4, episodes * .6, episodes * .8, episodes]

    plt.figure()
    plt.plot(decay_curve, linewidth=4)
    plt.title(f"Epsilon Decay for {eps}", fontsize=24)
    plt.xlabel('Episode', fontsize=20)
    plt.ylabel('Epsilon Value', fontsize=20)
    plt.xticks(tick_positions, fontsize=18)
    plt.yticks(fontsize=18)
    plt.ylim(ymin=0, ymax=1)
    plt.xlim(xmin=0, xmax=episodes)
    plt.grid()
    plt.show()


In [5]:
# FrozenLake experiment: train with decaying exploration, then run greedily.
N = 5000  # replay buffer capacity
C = 10    # target network sync interval
env = gym.make("FrozenLake-v1", render_mode="human")
env.reset()
Lake = DQN(N, C, env)

episodes = 100
epsilon = .95
discount = .9
action = Lake.classic_action
total_rewards = Lake.train(episodes, epsilon, discount, action, False)
print("Best reward: ", max(total_rewards))
Lake.save("drpreisl_angustsa_assignment2_part2_dqn_FrozenLake.pickle")
reward_print(total_rewards, episodes, "FrozenLake")
ep_decay(epsilon, episodes)
# Greedy run: size the plot from the rewards actually returned so the
# x-axis always matches the number of episodes that were run.
total_rewards = Lake.train(6, epsilon, discount, action, True)
reward_print(total_rewards, len(total_rewards), "FrozenLake")
# Release the render window once all runs are finished.
env.close()

here:  1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1


KeyboardInterrupt: 

In [None]:
# CartPole experiment: train with decaying exploration, then run greedily.
N = 5000  # replay buffer capacity
C = 3     # target network sync interval
env = gym.make("CartPole-v1", render_mode="human")
env.reset()
cart = DQN(N, C, env)

episodes = 300
epsilon = .99
# A discount factor above 1 makes bootstrapped Q targets grow without bound,
# so keep it below 1.
discount = .99
action = cart.classic_action
total_rewards = cart.train(episodes, epsilon, discount, action, False)
cart.save("drpreisl_angustsa_assignment2_part2_dqn_cartpole.pickle")
reward_print(total_rewards, episodes, "CartPole")
ep_decay(epsilon, episodes)
# Greedy run: size the plot from the rewards actually returned so the
# x-axis always matches the number of episodes that were run.
total_rewards = cart.train(6, epsilon, discount, action, True)
reward_print(total_rewards, len(total_rewards), "CartPole")
# Release the render window once all runs are finished.
env.close()

In [None]:
# Show the target network's layer structure (an nn.Module cannot be subscripted).
print(cart.target_net)