# Training pong Agent with deep Q learning

This first notebook contains part of our code for the RL project "Simulating the Atari Pong game using deep reinforcement learning methods". This part covers only DQN; our code for the A2C model is in a separate notebook.

In [None]:
import gym
import random
import matplotlib.pyplot as plt
import cv2
import numpy as np
import torch.nn as nn 
import torch
import pickle
from torch.autograd import Variable


First we set the name of the environment.

In [None]:
# Gym environment id; it is passed to AGENT, which hands it to gym.make.
env_name = 'PongDeterministic-v4'  

Then we do the image preprocessing (part 2.1 in the project report).

In [None]:
def ProcessFrame(image):
    """
    Turn a raw colour frame into an 84x84 grayscale uint8 frame.

    The frame is cropped (rows 30..-12, columns 5..-4), averaged over its
    channel axis (axis 2), resized to 84x84 with nearest-neighbour
    interpolation and cast to uint8. A trailing axis of length 1 is added, so
    the result has shape (84, 84, 1).
    """
    cropped = image[30:-12, 5:-4]
    gray = cropped.mean(axis=2)
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_NEAREST)
    as_bytes = resized.astype(np.uint8)
    return as_bytes[..., np.newaxis]

Next we define the dueling network architecture (part 2.2.1).

In [None]:
class DQNetwork(nn.Module):
    """
    Dueling Q-network.

    Four convolutions (the first three followed by batch normalisation) feed
    two fully connected streams: one predicts a single state value, the other
    one advantage per action. The two are combined into Q-values in `forward`.

    Args:
        n_actions: number of outputs of the advantage stream.
        n_hidden: number of output channels of the last convolution, which is
            also the input size of both streams.
        n_hidden_inter: width of the hidden layer of each stream.
        history_length: number of input channels (stacked frames).
    """

    def __init__(self, n_actions, n_hidden=512, n_hidden_inter=256, history_length=4):
        super().__init__()
        self.n_actions = n_actions

        # Convolutional trunk. Convolutions have no bias; the first three are
        # each followed by a BatchNorm layer.
        self.conv1 = nn.Conv2d(history_length, 32, kernel_size=8, stride=4, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2, bias=False)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, bias=False)
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(64, n_hidden, kernel_size=7, stride=1, bias=False)

        # Two streams: hidden layers first, then the output layers.
        self.state_pred_hidd = nn.Linear(n_hidden, n_hidden_inter)
        self.advantage_pred_hidd = nn.Linear(n_hidden, n_hidden_inter)
        self.state_pred = nn.Linear(n_hidden_inter, 1)
        self.advantage_pred = nn.Linear(n_hidden_inter, n_actions)

        self.relu = nn.ReLU()

    def forward(self, x):
        """
        Return the Q-values, one row per sample and one column per action.

        `x` is fed straight into `conv1`, so it needs `history_length`
        channels. The flattened output of `conv4` must have `n_hidden`
        features per sample, i.e. a 1x1 map, which is what 84x84 inputs yield.
        """
        features = x
        trunk = ((self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3))
        for conv, norm in trunk:
            features = self.relu(norm(conv(features)))
        features = self.relu(self.conv4(features))

        batch_size = features.shape[0]
        flat = features.view(batch_size, -1)

        # Value stream and advantage stream.
        value = self.state_pred(self.relu(self.state_pred_hidd(flat)))
        advantage = self.advantage_pred(self.relu(self.advantage_pred_hidd(flat)))

        # Q = V + A - mean(A): the advantages are centred per sample.
        uncentred = value + advantage
        return uncentred - advantage.mean(dim=1, keepdim=True)

Here we define the function for training ( part 2.2.2)

In [None]:

def train_batch(minibatch, gamma, n_actions):
    """
    Perform one gradient-descent step on a minibatch (double DQN target).

    Uses the module-level globals `device`, `main_model`, `target_model`,
    `optimizer` and `hubert_loss`.

    Args:
        minibatch: tuple (states, actions, rewards, next_states,
            terminal_flags) of tensors, in that order.
        gamma: discount factor applied to the bootstrapped next-state value.
        n_actions: unused; kept so existing callers keep working.

    Returns:
        The loss of this step as a Python float.
    """
    states, actions, rewards, next_states, terminal_flags = minibatch
    states = states.to(device, dtype=torch.float32)
    next_states = next_states.to(device, dtype=torch.float32)
    actions = actions.to(device)
    # Explicit float32 so the target has the same dtype as the prediction.
    rewards = rewards.to(device, dtype=torch.float32)
    terminal_flags = terminal_flags.to(device, dtype=torch.float32)

    # Keep only the predictions of the actions that were actually taken.
    # squeeze(1) (not a bare squeeze()) keeps the batch dimension even when
    # the minibatch holds a single transition.
    Q_pred = main_model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    with torch.no_grad():
        # The main network picks the best next action, the target network
        # evaluates it.
        next_states_q_values = main_model(next_states)
        next_states_target_q_values = target_model(next_states)
        best_action_main = torch.argmax(next_states_q_values, dim=1)
        Q_pred_next = next_states_target_q_values.gather(
            1, best_action_main.unsqueeze(1)).squeeze(1)
        # No bootstrapping from terminal transitions.
        Q_target = rewards + gamma * Q_pred_next * (1 - terminal_flags)

    optimizer.zero_grad()
    loss = hubert_loss(Q_pred, Q_target)
    loss.backward()
    # Clamp every gradient element to [-1, 1] to avoid gradient explosion.
    # clip_grad_value_ skips parameters that received no gradient.
    nn.utils.clip_grad_value_(main_model.parameters(), clip_value=1.0)
    optimizer.step()

    return loss.item()

Next we will define the replay memory to sample from when learning ( part 2.3 )

In [None]:
class MEMORY(object):
    """
    Replay memory stored as a ring buffer of single frames.

    Slot `i` holds the frame observed after an action, together with that
    action, its reward and its terminal flag. States are rebuilt at sampling
    time by stacking `frames_stack` consecutive frames.

    Args:
        batch_size: number of transitions returned by `sample_minibatch`.
        img_size: (height, width) of one stored frame.
        max_transitions: capacity of the buffer.
        frames_stack: number of frames stacked into one state.
    """

    def __init__(self, batch_size=32, img_size=(84, 84), max_transitions=100000, frames_stack=4):
        self.batch_size = batch_size
        self.max_transitions = max_transitions
        self.frames_stack = frames_stack
        self.frames = np.empty((max_transitions, img_size[0], img_size[1]), dtype=np.uint8)
        self.actions = np.empty(max_transitions, dtype=np.int32)
        self.rewards = np.empty(max_transitions, dtype=np.float32)
        # np.bool_ rather than the np.bool alias, which NumPy 1.24 removed.
        self.terminal_flags = np.empty(max_transitions, dtype=np.bool_)
        self.current = 0  # next slot to write
        self.count = 0    # number of valid slots

    def add_experience(self, action, frame, reward, terminal):
        """
        Store one transition. Once the buffer is full, the oldest slots are
        overwritten in a cyclic way.
        """
        self.frames[self.current] = frame
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.terminal_flags[self.current] = terminal
        self.count = max(self.current + 1, self.count)
        self.current = (self.current + 1) % self.max_transitions

    def _is_valid_index(self, index):
        """Tell whether the frame window around `index` is one continuous run."""
        # A terminal flag among the frames of the current state means the
        # window spans two episodes.
        if self.terminal_flags[index - self.frames_stack:index].any():
            return False
        # Slot `current - 1` is the newest frame and slot `current` the oldest
        # one (once the buffer has wrapped). A window holding both is not
        # contiguous in time. Before the buffer wraps, `current == count` and
        # this test never fires.
        if self.current <= index < self.current + self.frames_stack:
            return False
        return True

    def sample_minibatch(self):
        """
        Sample a random minibatch.

        For each sampled index i, frames (i-frames_stack, ..., i-1) form the
        current state and frames (i-frames_stack+1, ..., i) the next state;
        action, reward and terminal flag are those stored at i. Frames are
        scaled by 1/255.

        Returns:
            Tuple (states, actions, rewards, next_states, terminal_flags) of
            tensors; the terminal flags are stored as integers 0/1.

        Raises:
            ValueError: if the memory does not hold more than `frames_stack`
                transitions yet.
        """
        if self.count <= self.frames_stack:
            raise ValueError("not enough transitions in memory to sample a minibatch")

        states, next_states = [], []
        actions_batch, rewards_batch, terminal_batch = [], [], []
        for _ in range(self.batch_size):
            # Rejection sampling until the window is valid.
            while True:
                index = np.random.randint(low=self.frames_stack, high=self.count)
                if self._is_valid_index(index):
                    break
            first = index - self.frames_stack
            current_state = torch.from_numpy(self.frames[first:index, ...] / 255).unsqueeze(0)
            next_state = torch.from_numpy(self.frames[first + 1:index + 1, ...] / 255).unsqueeze(0)
            states.append(current_state)
            next_states.append(next_state)
            actions_batch.append(self.actions[index])
            rewards_batch.append(self.rewards[index])
            terminal_batch.append(int(self.terminal_flags[index]))
        return (torch.cat(states, dim=0),
                torch.tensor(actions_batch, dtype=torch.int64),
                torch.tensor(rewards_batch),
                torch.cat(next_states, dim=0),
                torch.tensor(terminal_batch))
        
                

Next we implement epsilon greedy strategy ( part 2.4 ) 

In [None]:
class EpsilonGreedy():
    """
    Epsilon-greedy action selection with a piecewise-linear epsilon schedule.

    Epsilon stays at `eps_start` for the first `explore_iter` frames, then
    decreases linearly to `eps_interm` over the next `decrease_iter` frames,
    then decreases linearly to `eps_final`, which is reached at frame
    `exploit_iter` and kept afterwards.
    """

    def __init__(self, n_actions, eps_start=1., eps_interm=0.1, eps_final=0.01,
                 explore_iter=50000, decrease_iter=1000000, exploit_iter=25000000):
        self.explore_iter = explore_iter
        self.decrease_iter = decrease_iter
        self.exploit_iter = exploit_iter
        self.n_actions = n_actions
        self.eps_start = eps_start
        self.eps_final = eps_final
        self.eps_interm = eps_interm

        # First linear segment: eps_start at explore_iter, eps_interm at
        # explore_iter + decrease_iter.
        self.slope = -(self.eps_start - self.eps_interm) / self.decrease_iter
        self.intercept = self.eps_start - self.slope * self.explore_iter
        # Second linear segment: eps_interm at explore_iter + decrease_iter,
        # eps_final at exploit_iter.
        self.slope_2 = -(self.eps_interm - self.eps_final) / (
            self.exploit_iter - self.decrease_iter - self.explore_iter)
        self.intercept_2 = self.eps_final - self.slope_2 * self.exploit_iter

    def get_epsilon(self, frame_iter, eval):
        """
        Return epsilon for frame number `frame_iter`.

        With `eval` set, epsilon is 0 (pure exploitation at test time).
        """
        if eval:
            return 0.  ## only exploitation at test time
        if frame_iter < self.explore_iter:
            return self.eps_start
        if frame_iter < self.decrease_iter + self.explore_iter:
            return self.slope * frame_iter + self.intercept
        if frame_iter >= self.exploit_iter:
            # End of the schedule: without this the second segment would keep
            # decreasing below eps_final and eventually below zero.
            return self.eps_final
        return self.slope_2 * frame_iter + self.intercept_2

    def sample_action(self, frame_iter, state, eval=False):
        """
        Sample an action with the epsilon that applies at `frame_iter`.

        With probability epsilon a uniformly random action is returned;
        otherwise the action with the highest Q-value according to the
        module-level `main_model` (evaluated on the module-level `device`).

        Args:
            frame_iter: current frame number, used for the epsilon schedule.
            state: array with the stacked frames on its last axis; it is
                scaled by 1/255 and permuted to channels-first before being
                fed to the network. Not used when a random action is drawn.
            eval: if True, always act greedily.

        Returns:
            The selected action as a Python int.
        """
        epsilon = self.get_epsilon(frame_iter, eval)
        if random.random() < epsilon:
            # int() so both branches return the same type.
            return int(np.random.choice(np.arange(self.n_actions), 1)[0])

        with torch.no_grad():
            tensor_state = torch.from_numpy(state / 255).unsqueeze(0).to(device, dtype=torch.float32)
            Q_pred = main_model(tensor_state.permute((0, 3, 1, 2)))
        return torch.argmax(Q_pred, dim=1)[0].item()
          

Next we define the agent class. It mainly contains two methods: `reset`, to restart a game, and `step`, to perform an action.

In [None]:
class AGENT(object):
    """
    Thin wrapper around a gym environment that keeps a stack of processed
    frames as the current state.

    The code relies on an environment whose `reset()` returns a frame and
    whose `step()` returns a 4-tuple.

    params :
        env_name : environment id handed to gym.make
                   (in our case PongDeterministic-v4).
        no_op_steps : upper bound of the random number of times action 1
                      ('Fire') is repeated after a reset in evaluation mode,
                      so that evaluation games start in different situations.
        frames_stacked : the number of frames to stack to get a state.
    """

    def __init__(self, env_name, no_op_steps=10, frames_stacked=4):
        self.env = gym.make(env_name)
        self.state = None
        self.no_op_steps = no_op_steps
        self.frames_stacked = frames_stacked

    def reset(self, evaluation=False):
        """
        Restart the game and rebuild the state from the first frame.

        In evaluation mode, action 1 is first played between 1 and
        `no_op_steps` times. The state is the processed frame repeated
        `frames_stacked` times along the last axis.
        """
        frame = self.env.reset()
        if evaluation:
            n_fire_steps = random.randint(1, self.no_op_steps)
            for _ in range(n_fire_steps):
                frame, _reward, _terminal, _info = self.env.step(1)  # Action 'Fire'

        first_frame = ProcessFrame(frame)
        self.state = np.repeat(first_frame, self.frames_stacked, axis=2)

    def step(self, action):
        """
        Perform an action and observe the outcome.

        Args:
            action: Integer, action the agent performs

        Returns:
            (processed frame, reward, terminal flag, raw frame). The state
            stack is updated by dropping its oldest frame and appending the
            processed new frame.
        """
        raw_frame, reward, terminal, info = self.env.step(action)
        processed = ProcessFrame(raw_frame)
        kept_frames = self.state[:, :, 1:]
        self.state = np.concatenate((kept_frames, processed), axis=2)
        return processed, reward, terminal, raw_frame

In [None]:
import imageio
from skimage.transform import resize

def clip_reward(reward):
    """Map a reward to its sign: 1 if positive, 0 if zero, -1 otherwise."""
    if reward == 0:
        return 0
    return 1 if reward > 0 else -1

def generate_gif(n_frame, frames_for_gif, reward, path):
    """
    Save the given frames as an animated gif.

        Args:
            n_frame: Integer, determining the number of the current frame
            frames_for_gif: A sequence of (210, 160, 3) frames of an Atari game in RGB
            reward: Integer, total reward of the episode that is output as a gif
            path: String, prefix of the output file; the file name
                  "ATARI_frame_<n_frame>_reward_<reward>.gif" is appended to it as is

    The frames are upscaled to (420, 320, 3) into a new list, so the sequence
    passed in by the caller is left untouched.
    """
    upscaled_frames = [
        resize(frame, (420, 320, 3), preserve_range=True, order=0).astype(np.uint8)
        for frame in frames_for_gif
    ]
    file_name = "ATARI_frame_{0}_reward_{1}.gif".format(n_frame, reward)
    imageio.mimsave(f'{path}{file_name}', upscaled_frames, duration=1/30)

# Results ( part 2.5 in report )  

First we define main and target models.

In [None]:
# Environment wrapper and compute device.
agent = AGENT(env_name=env_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## models: the target network starts as a copy of the main network and is
## kept in eval mode.
main_model = DQNetwork(n_actions=agent.env.action_space.n).to(device)
target_model = DQNetwork(n_actions=agent.env.action_space.n).to(device)
target_model.load_state_dict(main_model.state_dict())
target_model.eval()

## replay memory and exploration strategy
memory = MEMORY(max_transitions=1000000)
epsilon_strategy = EpsilonGreedy(
    n_actions=agent.env.action_space.n,
    exploit_iter=30000000,
)

Next we define the set of configurations and hyperparameters for the model.

In [None]:
learning_rate = 0.0001

# Smooth L1 loss and Adam optimiser for the main network.
hubert_loss = nn.SmoothL1Loss()
optimizer = torch.optim.Adam(main_model.parameters(), lr=learning_rate)

gamma = 0.99
eval_freq = 200_000  ## evaluate every eval_freq steps (was "200 000", a syntax error)
eval_steps = 10000    ## Number of frames for one evaluation
update_target = 10000 ## update target network weights with the main model once every update_target steps
max_episode_length = 18000    ## maximum number of steps played in one training episode
max_frames = 30000000         ## total number of training frames
start_memory = 50000          ## no network update before this many frames have been played
update_main = 4 # Every update_main actions a gradient descent step is performed

## Paths 
path_weights = '/content/drive/MyDrive/data_CS_MVA/RL/second experience/model_state.pth'
path_save = '/content/drive/MyDrive/data_CS_MVA/RL/second experience/'
path_results = '/content/drive/MyDrive/data_CS_MVA/RL/second experience/dict_res.pkl'

Next is the main cell , where we train and evaluate. 

In [None]:
# Main loop: alternate between a training phase of at least eval_freq frames
# and an evaluation phase of eval_steps frames, until max_frames frames have
# been played in training.
n_frame = 0         # total number of training frames played so far
rewards = []        # unclipped total reward of every training episode
loss_list = []      # losses collected since the last results dump
dict_results = {'loss': [], 'rewards': [], 'rewards-eval': []}
while n_frame < max_frames:
    ############# train ###################
    epoch = 0  # frames played in the current training phase
    while epoch < eval_freq:
        agent.reset()
        episode_reward_sum = 0
        for _ in range(max_episode_length):
            selected_action = epsilon_strategy.sample_action(n_frame, agent.state)
            processed_new_frame, reward, terminal, _raw_frame = agent.step(selected_action)
            n_frame += 1
            epoch += 1
            episode_reward_sum += reward

            # Clip the reward
            clipped_reward = clip_reward(reward)

            # Store transition in the replay memory
            memory.add_experience(action=selected_action,
                                  frame=processed_new_frame[:, :, 0],
                                  reward=clipped_reward,
                                  terminal=terminal)

            # Update the main network every update_main frames, once more
            # than start_memory frames have been played.
            if n_frame % update_main == 0 and n_frame > start_memory:
                minibatch = memory.sample_minibatch()
                loss = train_batch(minibatch=minibatch, gamma=gamma,
                                   n_actions=agent.env.action_space.n)
                loss_list.append(loss)

            # Copy the main network into the target network every
            # update_target frames, once more than start_memory frames have
            # been played.
            if n_frame % update_target == 0 and n_frame > start_memory:
                target_model.load_state_dict(main_model.state_dict())

            # The episode is over.
            if terminal:
                break

        rewards.append(episode_reward_sum)
        # Output the progress every 10 episodes:
        if len(rewards) % 10 == 0:
            mean_reward = np.mean(rewards[-100:])
            if n_frame > start_memory:
                dict_results['loss'].append(loss_list)
                dict_results['rewards'].append([len(rewards), n_frame, mean_reward])
                # Context manager so the file is flushed and closed.
                with open(path_results, 'wb') as results_file:
                    pickle.dump(dict_results, results_file)
                loss_list = []

            print('We are at episode {} , frame number is {} : mean rewards for the last 100 episodes is {}'.format(
                len(rewards), n_frame, mean_reward))

    ####################### evaluate ############################
    terminal = True
    gif = True
    frames_for_gif = []
    eval_rewards = []

    for _ in range(eval_steps):
        if terminal:
            agent.reset(evaluation=True)
            episode_reward_sum = 0
            terminal = False

        # Greedy action (eval=True makes epsilon 0).
        selected_action = epsilon_strategy.sample_action(n_frame, agent.state, eval=True)

        processed_new_frame, reward, terminal, new_frame = agent.step(selected_action)
        episode_reward_sum += reward

        if gif:
            frames_for_gif.append(new_frame)
        if terminal:
            eval_rewards.append(episode_reward_sum)
            gif = False  # Save only the first game of the evaluation as a gif

    # NaN when no evaluation game finished; avoids np.mean's empty-list warning.
    mean_eval_reward = np.mean(eval_rewards) if eval_rewards else float('nan')
    print("Evaluation score:\n", mean_eval_reward)
    if eval_rewards:
        generate_gif(n_frame, frames_for_gif, eval_rewards[0], path_save)
    else:
        print("No evaluation game finished")

    # Save the network parameters
    torch.save(main_model.state_dict(), path_weights)
    frames_for_gif = []

    # Record the evaluation score next to the training results
    dict_results['rewards-eval'].append([n_frame, mean_eval_reward])
    with open(path_results, 'wb') as results_file:
        pickle.dump(dict_results, results_file)

We are at episode 10 , frame number is 8770 : mean rewards for the last 100 episodes is -20.5
We are at episode 20 , frame number is 18120 : mean rewards for the last 100 episodes is -20.4
We are at episode 30 , frame number is 27331 : mean rewards for the last 100 episodes is -20.333333333333332
We are at episode 40 , frame number is 36590 : mean rewards for the last 100 episodes is -20.35
We are at episode 50 , frame number is 45851 : mean rewards for the last 100 episodes is -20.32
We are at episode 60 , frame number is 55405 : mean rewards for the last 100 episodes is -20.266666666666666
We are at episode 70 , frame number is 65022 : mean rewards for the last 100 episodes is -20.257142857142856
We are at episode 80 , frame number is 74484 : mean rewards for the last 100 episodes is -20.2875
We are at episode 90 , frame number is 83324 : mean rewards for the last 100 episodes is -20.344444444444445
We are at episode 100 , frame number is 92378 : mean rewards for the last 100 episode

KeyboardInterrupt: ignored

In [None]:
# Stand-alone evaluation: play eval_steps frames greedily with the current
# main network and save the first game as a gif.
terminal = True
gif = True
frames_for_gif = []
eval_rewards = []

for _ in range(eval_steps):
    if terminal:
        agent.reset(evaluation=True)
        episode_reward_sum = 0
        terminal = False

    # Greedy action (eval=True makes epsilon 0).
    selected_action = epsilon_strategy.sample_action(n_frame, agent.state, eval=True)

    processed_new_frame, reward, terminal, new_frame = agent.step(selected_action)
    episode_reward_sum += reward

    if gif:
        frames_for_gif.append(new_frame)
    if terminal:
        eval_rewards.append(episode_reward_sum)
        gif = False  # Save only the first game of the evaluation as a gif

# NaN when no evaluation game finished; avoids np.mean's empty-list warning.
mean_eval_reward = np.mean(eval_rewards) if eval_rewards else float('nan')
print("Evaluation score:\n", mean_eval_reward)
if eval_rewards:
    generate_gif(n_frame, frames_for_gif, eval_rewards[0], path_save)
else:
    print("No evaluation game finished")

Evaluation score:
 21.0
