<a href="https://colab.research.google.com/github/KGF2/DeepLearningAmateur/blob/Test/CartNPole_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# SOURCE: https://www.youtube.com/watch?v=PyQNfsGUnQA&list=PLZbbT5o_s2xoWNVdDudn51XM8lOuZ_Njv&index=16

# DQN using PyTorch

# import libraries

%matplotlib inline
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [0]:
# Display setup: when matplotlib reports an 'inline' backend (i.e. the code is
# running in a notebook), pull in IPython's display helper so plots can be
# refreshed interactively.

is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
  from IPython import display


In [0]:
# Deep Q network using PyTorch

class DQN(nn.Module):
  """Fully connected Deep Q-Network that maps a screen image to Q-values.

  Extends ``nn.Module``, the base class for all PyTorch networks. The input
  image tensor is flattened per sample and passed through two hidden layers
  and a linear output layer with one unit per action.

  Args:
    img_height: height of the input image in pixels.
    img_width: width of the input image in pixels.
  """

  def __init__(self, img_height, img_width):
    super().__init__()

    # Two fully connected hidden layers followed by an output layer.
    # The factor 3 accounts for the three colour channels (RGB) of the
    # flattened input image.
    self.fc1 = nn.Linear(in_features=img_height*img_width*3, out_features=24)
    self.fc2 = nn.Linear(in_features=24, out_features=32)
    # Must be stored on self: otherwise the layer is not registered as a
    # sub-module (no parameters to train) and forward() cannot reach it.
    # 2 outputs, one per CartPole action (left, right).
    self.out = nn.Linear(in_features=32, out_features=2)

  def forward(self, t):
    """Run a forward pass.

    Args:
      t: image tensor whose first dimension is the batch; the remaining
        dimensions must multiply out to ``img_height * img_width * 3``.

    Returns:
      Tensor of shape ``(batch, 2)`` holding one Q-value per action.
    """
    t = t.flatten(start_dim=1)  # keep the batch dimension, flatten the rest
    t = F.relu(self.fc1(t))
    t = F.relu(self.fc2(t))
    t = self.out(t)  # raw Q-values, no activation on the output layer
    return t


In [0]:
  # Experiences drawn from replay memory are used to train the network.
# Experience is the record type for a single transition; instances of it are
# stored in, and later sampled from, replay memory.
# The typename passed to namedtuple matches the variable name so that repr()
# and pickling refer to the same class.
Experience = namedtuple('Experience', ('state', 'action', 'reward', 'new_state'))
# Example: Experience(1, 2, 4, 5) -> Experience(state=1, action=2, reward=4, new_state=5)

In [0]:
# replay memory class to store the above experiences

class ReplayMemory():
  """Fixed-capacity buffer of experiences used to train the DQN.

  Once the buffer is full, each new experience overwrites the slot at
  ``push_count % capacity``, i.e. the oldest entry.
  """

  def __init__(self, capacity):
    """Create an empty buffer able to hold up to `capacity` experiences."""
    self.capacity = capacity
    self.memory = []      # stored experiences
    self.push_count = 0   # total number of experiences ever pushed

  def push(self, experience):
    """Store `experience`, replacing the oldest one when the buffer is full."""
    if len(self.memory) >= self.capacity:
      slot = self.push_count % self.capacity
      self.memory[slot] = experience
    else:
      self.memory.append(experience)
    self.push_count += 1

  def sample(self, batch_size):
    """Return `batch_size` experiences picked at random without replacement."""
    batch = random.sample(self.memory, batch_size)
    return batch

  def can_provide_sample(self, batch_size):
    """Return True when at least `batch_size` experiences are stored."""
    stored = len(self.memory)
    return stored >= batch_size


In [0]:
# Epsilon-greedy strategy class

class EpsilonGreedyStrategy():
  """Exponentially decaying exploration rate (epsilon) for epsilon-greedy.

  The rate starts at the larger of the two bounds and decays towards the
  smaller one as the step count grows.

  Args:
    min: one bound of epsilon.
    max: the other bound of epsilon.
    decay: decay constant; larger values make epsilon fall faster.
      The decay direction described here assumes `decay` is non-negative.
  """

  def __init__(self, min, max, decay):
    # The parameter names shadow builtins; they are kept unchanged so that
    # existing keyword callers continue to work.
    self.min = min
    self.max = max
    self.decay = decay

  def get_exploration_rate(self, current_step):
    """Return epsilon for `current_step`.

    Computes ``low + (high - low) * exp(-decay * current_step)``, which equals
    the upper bound at step 0 and approaches the lower bound as the step grows.
    The bounds are ordered here so the result is the same whether the caller
    passed them as (min, max) or as (start, end).
    """
    low, high = sorted((self.min, self.max))
    return low + (high - low) * math.exp(-1*self.decay*current_step)
    

In [0]:
# RL agent

class Agent():
  """Reinforcement-learning agent that picks actions epsilon-greedily.

  Args:
    strategy: object exposing ``get_exploration_rate(current_step)``.
    num_actions: number of available actions (2 for CartPole: left, right).
    device: device the returned action tensors are moved to (CPU or GPU).
  """

  def __init__(self, strategy, num_actions, device):
    self.strategy = strategy
    self.num_actions = num_actions
    self.current_step = 0  # number of actions selected so far
    self.device = device

  def select_action(self, state, policy_net):
    """Choose an action for `state`, either exploring or exploiting.

    Args:
      state: input passed to `policy_net` when exploiting.
      policy_net: the network being trained; called as ``policy_net(state)``
        and expected to return per-action Q-values along dim 1.

    Returns:
      A tensor on ``self.device`` holding the chosen action index.
    """
    # Use the agent's own strategy and device rather than module-level
    # globals, and keep the rate in a local since it is not instance state.
    explor_rate = self.strategy.get_exploration_rate(self.current_step)
    self.current_step += 1

    if explor_rate > random.random():  # explore: uniformly random action
      action = random.randrange(self.num_actions)
      return torch.tensor([action]).to(self.device)
    else:  # exploit: action with the highest predicted Q-value
      # Inference only, so gradient tracking is switched off.
      with torch.no_grad():
        return policy_net(state).argmax(dim=1).to(self.device)


In [0]:
# Environment manager

class CartPoleEnvManager():
  """Thin wrapper around the gym 'CartPole-v0' environment."""

  def __init__(self, device):
    """Create the unwrapped environment and reset it to an initial state."""
    self.done = False            # becomes relevant once an action ends the episode
    self.current_screen = None   # None: no screen rendered yet for this episode
    self.device = device
    cartpole = gym.make('CartPole-v0')
    self.env = cartpole.unwrapped
    self.env.reset()

  def reset(self):
    """Reset the environment and forget the current screen."""
    self.env.reset()
    self.current_screen = None

  def close(self):
    """Close the underlying environment."""
    self.env.close()

  def render(self, mode="human"):
    """Render the environment in `mode` and return whatever gym returns."""
    rendered = self.env.render(mode)
    return rendered
  


