In [1]:
import gym
import torch
import random
import warnings
import itertools
import numpy as np
import pandas as pd
import torch.nn as nn
from typing import Sequence
import matplotlib.pyplot as plt
import torch.nn.functional as F
from collections import namedtuple, deque
from plot_script import plot_result, draw_neural_net
from torch.distributions.categorical import Categorical

warnings.filterwarnings("ignore")

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [2]:
GAMMA = 0.99
BATCH_SIZE = 64
BUFFER_SIZE = 10000
MIN_REPLAY_SIZE = 5000
EPS_START = 1.0
EPS_END = 0.05
EPS_DECAY = 0.995
TARGET_UPDATE_FREQ = 5

In [3]:
env = gym.make("LunarLander-v2")
obs = env.reset()
episode_reward = 0.0

In [4]:
Transition = namedtuple('Transition', ('states', 'actions', 'rewards', 'dones', 'next_states'))
class Replay_memory():

    def __init__(self, env, fullsize, minsize, batchsize):
        self.env = env
        self.memory = deque(maxlen=fullsize)
        self.rewards = deque(maxlen=50)
        self.batchsize = batchsize
        self.minsize = minsize

    def append(self, transition):
        self.memory.append(transition)
    
    def sample_batch(self):
        # Filter out empty tuples and handle observations with additional information
        non_empty_batch = [obs for obs in self.memory if isinstance(obs.states[0], np.ndarray)]

        # Check if there are enough non-empty transitions
        if len(non_empty_batch) < self.batchsize:
            #print("Not enough non-empty transitions.")
            return None

        # Sample from the filtered batch
        batch = random.sample(non_empty_batch, self.batchsize)
        batch = Transition(*zip(*batch))

        # Add debug prints
        #print("States in batch:")
        for obs in batch.states:
            if isinstance(obs[0], np.ndarray):  # Check if states[0] is iterable
                state_array = obs[0]
                #print(f"Observation shape: {state_array.shape}")
                #print(f"Observation: {state_array}")
            elif isinstance(obs, tuple) and isinstance(obs[0], np.ndarray):
                state_array = obs[0]
                #print(f"Observation shape: {state_array.shape}")
                #print(f"Observation: {state_array}")
            else:
                print("Unsupported observation format")
                continue

        # Check if there are any valid observations
        valid_obs = [obs for obs in batch.states if isinstance(obs[0], np.ndarray)]
        if not valid_obs:
            # Handle the case when there are no valid observations
            #print("No valid observations in the batch.")
            return None

        # Find the maximum shape for each dimension
        max_shapes = [max(obs[0].shape[j] for obs in valid_obs) for j in range(len(valid_obs[0][0].shape))]

        #print("Max Shapes:", max_shapes)

        # Pad or crop state observations to have the same shape
        padded_states = np.array([
            [np.pad(obs[j], (0, max_shapes[j] - obs[j].shape[0])) for j in range(min(len(obs), len(max_shapes)))]
            for obs in valid_obs
        ])



        states = torch.from_numpy(padded_states.astype(np.float32)).to(device)
        actions = torch.from_numpy(np.array(batch.actions, dtype=np.int64)).to(device).unsqueeze(1)



        rewards = torch.from_numpy(np.array(batch.rewards, dtype=np.float32)).unsqueeze(1).to(device)
        dones = torch.from_numpy(np.array(batch.dones, dtype=np.bool8)).unsqueeze(1).to(device)
        next_states = torch.from_numpy(np.array(batch.next_states, dtype=np.float32)).to(device)

        return states, actions, rewards, dones, next_states

    def initialize(self):
        obs = env.reset()
        for _ in range(self.minsize):
            action = self.env.action_space.sample()
            #print(env.step(action))
            new_obs, reward, done, info, _ = env.step(action)
            transition = Transition(obs, action, reward, done, new_obs)
            self.append(transition)
            obs = new_obs
            if done:
                self.env.reset()
        return self


In [5]:
replay_memory = Replay_memory(env, BUFFER_SIZE, MIN_REPLAY_SIZE, BATCH_SIZE).initialize()

In [6]:
class DQN(nn.Module):
    def __init__(self, ninputs, noutputs):
        super(DQN, self).__init__()
        self.a1 = nn.Linear(ninputs, 64, device=device)
        self.a2 = nn.Linear(64, noutputs, device=device)
    
    def forward(self, X):
        o = self.a1(X)
        o = torch.tanh(o)
        o = self.a2(o)
        
        return o
    
    def __call__(self, X):
        return self.forward(X)

In [7]:
dqn_policy = DQN(env.observation_space.shape[0], env.action_space.n)
dqn_target = DQN(env.observation_space.shape[0], env.action_space.n)
dqn_target.load_state_dict(dqn_policy.state_dict())
dqn_target.eval()

DQN(
  (a1): Linear(in_features=8, out_features=64, bias=True)
  (a2): Linear(in_features=64, out_features=4, bias=True)
)

In [8]:
loss_fn = nn.SmoothL1Loss()
learning_rate = 0.001
optimizer = torch.optim.Adam(dqn_policy.parameters(), lr=learning_rate)

In [9]:
def epsilon_greedy_policy(epsilon, obs):
    rnd_sample = random.random()
    
    if rnd_sample <= epsilon:
        action = env.action_space.sample()
    else:
        with torch.no_grad():
            # Extract the state array from the observation tuple
            state_array = obs[0] if isinstance(obs, tuple) else obs
            obs_tensor = torch.Tensor(state_array).to(device)
            action = int(torch.argmax(dqn_policy(obs_tensor)))
    
    return action


In [10]:
obs = env.reset()
eps_threshold = EPS_START
episode = 1
history = []

for step in itertools.count():
    
    # Get action using Epsilon-Greedy Policy
    action = epsilon_greedy_policy(eps_threshold, obs)

    # Print the shape of the observation
    #print(f'Observation: {obs}')


    # Get the new observation and reward.
    new_obs, reward, done, _, _ = env.step(action)

    # Append to Replay Memory
    replay_memory.append(Transition(obs, action, reward, done, new_obs))
    episode_reward += reward
    obs = new_obs
    
    # If the episode is finished
    if done:
        
        episode += 1
        eps_threshold = np.max((eps_threshold * EPS_DECAY, EPS_END))
        replay_memory.rewards.append(episode_reward)
        obs = env.reset()
        avg_res = np.mean(replay_memory.rewards)
        history.append((episode-1, avg_res))
        
        if episode % 100 == 0: 
            print(f'Episode: {episode} Avg Results: {avg_res} Epsilon: {eps_threshold}')

        if avg_res >= 195:
            print(f'Solved at episode: {episode} Avg Results: {avg_res}')
            break
        
        if step % TARGET_UPDATE_FREQ == 0:
            dqn_target.load_state_dict(dqn_policy.state_dict())

        episode_reward = 0
    
    # Sample from the Replay Memory
    batch_result = replay_memory.sample_batch()

    # Check if sample_batch returned None
    if batch_result is None:
        continue  # Skip this iteration if there are not enough non-empty transitions

    b_states, b_actions, b_rewards, b_dones, b_next_states = batch_result
    
    # Get Q-Values of every state-action pair from the Replay Memory Sample
    #print("Shapes before gather:", b_states.shape, b_actions.shape)
    qvalues = dqn_policy(b_states).gather(2, b_actions.unsqueeze(2))




    

    
    # Train the Neural Network to better evaluate the states observed.
    with torch.no_grad():
        target_qvalues = dqn_target(b_next_states)
        max_target_qvalues = torch.max(target_qvalues, axis=1).values.unsqueeze(1)
        expected_qvalues = b_rewards + GAMMA * (1 - b_dones.type(torch.int64)) * max_target_qvalues

    loss = loss_fn(qvalues, expected_qvalues)
    optimizer.zero_grad()
    loss.backward()
    for param in dqn_policy.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()


Episode: 100 Avg Results: -205.1715481053213 Epsilon: 0.6088145090359074
Episode: 200 Avg Results: -340.15041594550667 Epsilon: 0.36880183088056995
Episode: 300 Avg Results: -399.1352068865279 Epsilon: 0.22340924607110255
Episode: 400 Avg Results: -264.6417827248626 Epsilon: 0.1353347165085562
Episode: 500 Avg Results: -391.3400722584682 Epsilon: 0.08198177029173696
Episode: 600 Avg Results: -329.97895601668256 Epsilon: 0.05
Episode: 700 Avg Results: -244.6880772658695 Epsilon: 0.05
Episode: 800 Avg Results: -143.95188489160543 Epsilon: 0.05
Episode: 900 Avg Results: -174.76430046386463 Epsilon: 0.05
Episode: 1000 Avg Results: -199.1591453015795 Epsilon: 0.05
Episode: 1100 Avg Results: -190.8595251073349 Epsilon: 0.05
Episode: 1200 Avg Results: -241.2524842499609 Epsilon: 0.05
Episode: 1300 Avg Results: -159.14867219848483 Epsilon: 0.05


KeyboardInterrupt: 

In [None]:
history = pd.DataFrame(
    history, columns=['Episode', 'Score'])

(fig, ax) = plt.subplots(1, 1)

ax.plot(history['Episode'], history['Score'])

ax.set_xlabel('Epoch')
ax.set_ylabel('Score over last 50 episodes')

plt.show()