In [1]:
import torch
import torch.nn as nn
import random
import gym
import numpy as np
import cv2
from torch.utils.tensorboard import SummaryWriter
import statistics

# Preprocess each frame from 210x160 to 84x84
class Preprocess(gym.ObservationWrapper):
    
    def __init__(self, env=None):
        super(Preprocess, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, obs):
        return Preprocess.process(obs)

    def process(frame):
        frame = np.reshape(frame, [210, 160, 1]).astype(np.uint8)
        resized_frame = cv2.resize(frame, (84, 110), interpolation=cv2.INTER_AREA)
        result = resized_frame[18:102, :]
        result = np.reshape(result, [84, 84, 1])
        return result.astype(np.uint8)

# Change axis accordingly to neural network input
class ChangeAxis(gym.ObservationWrapper):
    
    def __init__(self, env):
        super(ChangeAxis, self).__init__(env)
        old_shape = self.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(old_shape[-1], old_shape[0], old_shape[1]), dtype=np.uint8)

    def observation(self, observation):
        return np.moveaxis(observation, 2, 0)

  from .autonotebook import tqdm as notebook_tqdm
  if not hasattr(tensorboard, "__version__") or LooseVersion(


In [2]:
#Convolutional neural network
class DQN_Network(nn.Module):
    
    def __init__(self, input_shape, n_actions):
        super(DQN_Network, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )
        
        output_size = self.get_output_size(input_shape)
        self.fc = nn.Sequential(
            nn.Linear(output_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )
    
    def get_output_size(self, input_shape):
        ot = self.conv(torch.zeros(1, *input_shape))
        return int(np.prod(ot.size()))
    
    def forward(self, x):
        x = x.float() / 255.0
        mid_output = self.conv(x).view(x.size()[0], -1)
        return self.fc(mid_output)

#DQN agent 
class DQN_Agent:

    def __init__(self, state_space, action_space, epsilon_max, epsilon_min, rb_size, batch_size, gamma, lr):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.state_space = state_space
        self.action_space = action_space        
       
        self.dqn = DQN_Network(state_space, action_space).to(self.device)
        self.dqn_target = DQN_Network(state_space, action_space).to(self.device)
        
        self.optimizer = torch.optim.Adam(self.dqn.parameters(), lr=lr)
        self.l1 = nn.SmoothL1Loss().to(self.device)
        self.batch_size = batch_size
        self.gamma = gamma
        
        self.epsilon = epsilon_max
        self.epsilon_max = epsilon_max
        self.epsilon_min = epsilon_min
        self.epsilon_decay = (epsilon_max - epsilon_min)/20000

        self.mem_size = rb_size
        self.state_buffer = torch.zeros(self.mem_size, *self.state_space, dtype=torch.uint8)
        self.action_buffer = torch.zeros(self.mem_size, 1)
        self.reward_buffer = torch.zeros(self.mem_size, 1)
        self.next_state_buffer = torch.zeros(self.mem_size, *self.state_space, dtype=torch.uint8)
        self.done_buffer = torch.zeros(self.mem_size, 1)
        self.last_pos = 0
        self.current = 0
    
    def select_action(self, state):
        if random.random() < self.epsilon:  
            return torch.tensor([[random.randrange(self.action_space)]])
        else:
            return torch.argmax(self.dqn(state.to(self.device))).unsqueeze(0).unsqueeze(0).cpu()
    
    def train(self):
        if self.batch_size > self.current:
            return
        
        states, actions, rewards, next_states, dones = self.sample()
        states = states.float().to(self.device)
        actions = actions.float().to(self.device)
        rewards = rewards.float().to(self.device)
        next_states = next_states.float().to(self.device)
        dones = dones.float().to(self.device)
        
        self.optimizer.zero_grad()
        current = self.dqn(states).gather(1, actions.long())
        target = rewards + torch.mul((self.gamma * self.dqn_target(next_states).max(1).values.unsqueeze(1)), 1 - dones)
    
        loss = self.l1(current, target)
        loss.backward()
        self.optimizer.step()

        self.epsilon -= self.epsilon_decay
        self.epsilon = max(self.epsilon, self.epsilon_min)
        
    def store(self, state, action, reward, next_state, done):
        self.state_buffer[self.last_pos] = state
        self.action_buffer[self.last_pos] = action
        self.reward_buffer[self.last_pos] = reward
        self.next_state_buffer[self.last_pos] = next_state
        self.done_buffer[self.last_pos] = done
        
        self.last_pos = (self.last_pos + 1) % self.mem_size
        self.current = min(self.current + 1, self.mem_size)
    
    def sample(self):
        indices = random.choices(range(self.current), k=self.batch_size)
        states = self.state_buffer[indices]
        actions = self.action_buffer[indices]
        rewards = self.reward_buffer[indices]
        next_states = self.next_state_buffer[indices]
        dones = self.done_buffer[indices]      
        return states, actions, rewards, next_states, dones
        
    def update_target(self):
         self.dqn_target.load_state_dict(self.dqn.state_dict())
            
    def save_models(self):
        torch.save(self.dqn.state_dict(), "state_dict_model.pt")
        torch.save(self.dqn_target.state_dict(), "state_dict_target_model.pt")

In [3]:
#Train run
def train_agent(episodes=2000):
    
    writer = SummaryWriter()
   
    env = gym.make('ALE/MsPacman-v5', frameskip=16, obs_type='grayscale')
    env.seed(12)
    env.action_space.seed(12)
    env = Preprocess(env)
    env = ChangeAxis(env) 
     
    state_space = env.observation_space.shape
    action_space = env.action_space.n
    agent = DQN_Agent(state_space=state_space, action_space=action_space, epsilon_max=1.0, epsilon_min=0.02, rb_size=200000, batch_size=32, gamma=0.95, lr=0.00025)
    
    update_target = 200
    total_steps = 0
          
    for episode in range(episodes):
        episode_score = 0
        step = 0
        
        state = env.reset()
        state = torch.from_numpy(np.array(state)).unsqueeze(0)
        
        while True:
            action = agent.select_action(state)
            step += 1
            
            next_state, reward, done, info = env.step(int(action[0]))
            next_state = torch.from_numpy(np.array(next_state)).unsqueeze(0)
            episode_score += reward  
                    
            agent.store(state, action, reward, next_state, done)
            agent.train()
            if step % update_target == 0:
                agent.update_target()
            
            state = next_state
            
            if done:
                total_steps += step
                writer.add_scalar("Score/Step", episode_score, total_steps)
                writer.add_scalar("Score/Episode", episode_score, episode)
                writer.flush()
                break        
        
    agent.save_models()
    env.close()
    writer.close()

In [4]:
train_agent(episodes=20000)

  logger.warn(


Episode 101 score = 1580.0, average score = 593.6633663366337, total steps = 14886
Episode 201 score = 400.0, average score = 713.2835820895523, total steps = 30918
Episode 301 score = 1410.0, average score = 781.5614617940199, total steps = 47986
Episode 401 score = 700.0, average score = 907.4812967581047, total steps = 66324
Episode 501 score = 720.0, average score = 993.7325349301398, total steps = 84939
Episode 601 score = 1040.0, average score = 1053.5607321131447, total steps = 103501
Episode 701 score = 2140.0, average score = 1104.7646219686162, total steps = 122487
Episode 801 score = 1850.0, average score = 1147.6404494382023, total steps = 142504
Episode 901 score = 1810.0, average score = 1203.6625971143174, total steps = 162951
Episode 1001 score = 1040.0, average score = 1257.4925074925075, total steps = 183498
Episode 1101 score = 1110.0, average score = 1296.4850136239781, total steps = 204242
Episode 1201 score = 1210.0, average score = 1340.7160699417152, total steps

Episode 9701 score = 2320.0, average score = 2509.4516029275333, total steps = 2031312
Episode 9801 score = 5860.0, average score = 2522.7619630649933, total steps = 2052852
Episode 9901 score = 5720.0, average score = 2538.028481971518, total steps = 2074666
Episode 10001 score = 5020.0, average score = 2549.76902309769, total steps = 2097273
Episode 10101 score = 4000.0, average score = 2567.123057123057, total steps = 2119070
Episode 10201 score = 1540.0, average score = 2579.457896284678, total steps = 2141656
Episode 10301 score = 2900.0, average score = 2587.532278419571, total steps = 2162887
Episode 10401 score = 4240.0, average score = 2591.826747428132, total steps = 2184191
Episode 10501 score = 3700.0, average score = 2597.3564422435957, total steps = 2205620
Episode 10601 score = 2750.0, average score = 2608.492595038204, total steps = 2227646
Episode 10701 score = 1570.0, average score = 2617.8674890197176, total steps = 2249972
Episode 10801 score = 1630.0, average score

Episode 19101 score = 4330.0, average score = 3247.592272655882, total steps = 4120295
Episode 19201 score = 4390.0, average score = 3251.5921045778864, total steps = 4142873
Episode 19301 score = 4420.0, average score = 3254.0324335526657, total steps = 4165496
Episode 19401 score = 1690.0, average score = 3255.929075820834, total steps = 4187821
Episode 19501 score = 4290.0, average score = 3258.533408543152, total steps = 4209698
Episode 19601 score = 4350.0, average score = 3265.321157083822, total steps = 4232881
Episode 19701 score = 1240.0, average score = 3268.7432110045174, total steps = 4255070
Episode 19801 score = 4140.0, average score = 3274.7436998131407, total steps = 4277744
Episode 19901 score = 4790.0, average score = 3282.43756595146, total steps = 4301891
Episode 20000 score = 5160.0, average score = 3289.3625, total steps = 4324932


In [3]:
#Agent evaluation after training
def evaluate():
    env = gym.make('ALE/MsPacman-v5', frameskip=16, obs_type='grayscale', render_mode= 'human')
    env = Preprocess(env)
    env = ChangeAxis(env) 

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = DQN_Network(env.observation_space.shape, env.action_space.n).to(device)
    model.load_state_dict(torch.load("state_dict_model.pt"))
    model.eval()

    epsilon = 0.02

    scores = []

    for i in range(30):
        episode_score = 0
        state = env.reset()
        state = torch.from_numpy(np.array(state)).unsqueeze(0)
    
        while True:
            if random.random() < epsilon:  
                action =  torch.tensor([[random.randrange(9)]])
            else:
                action = torch.argmax(model(state.to(device))).unsqueeze(0).unsqueeze(0).cpu()
            
            next_state, reward, done, info = env.step(int(action[0]))
            next_state = torch.from_numpy(np.array(next_state)).unsqueeze(0)
            episode_score += reward  
        
            state = next_state
        
            if done:
                scores.append(episode_score)
                print("Episode: {}, Score: {}" .format(i + 1, scores[-1]))
                break
            
    print("Avarage score: {}" .format(np.mean(scores)))
    print(statistics.stdev(scores))
                

In [None]:
evaluate()

  logger.warn(
