<a href="https://colab.research.google.com/github/Hazem9806/Deep_Learning/blob/Artificial_Neural_Network/Pong_Mine_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import cv2
import gym
import gym.spaces
from torch.distributions import Categorical
from itertools import count

import argparse

In [None]:
'''
    Pong requires the player to press the FIRE button to start the game.
    The following code is the FireResetEnv wrapper, which presses the FIRE button on reset in
    environments that need it for the game to start.
    
    In addition to pressing FIRE, this wrapper checks for several corner cases that are present in some games.

'''

class FireResetEnv(gym.Wrapper):
    """Wrapper that kicks off the game on reset by issuing actions 1 and 2.

    The wrapped environment must expose at least three actions and its
    action at index 1 must be named 'FIRE'.
    """

    def __init__(self, env):
        """Take action on reset for environments that are fixed until firing."""
        super(FireResetEnv, self).__init__(env)
        action_meanings = env.unwrapped.get_action_meanings()
        assert action_meanings[1] == 'FIRE'
        assert len(action_meanings) >= 3

    def reset(self, **kwargs):
        """Reset the environment, then step with action 1 followed by action 2.

        If either of those steps ends the episode, the environment is reset
        again. The observation produced by the last step is returned.
        """
        self.env.reset(**kwargs)
        for start_action in (1, 2):
            obs, _, done, _ = self.env.step(start_action)
            if done:
                self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Forward the action to the wrapped environment unchanged."""
        return self.env.step(ac)

In [None]:
'''

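The next wrapper we need is MaxAndSkipEnv, which implements two important transformations for Pong:
it repeats each chosen action for several consecutive frames, and it returns the pixel-wise maximum
of the two most recent frames as the observation.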
'''

class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action for ``skip`` frames and max-pool the last two frames.

    The reward returned by ``step`` is the sum of the rewards collected over
    the repeated frames, and the observation is the element-wise maximum of
    the (at most) two most recent raw observations.
    """

    def __init__(self, env=None, skip=4):
        """Wrap ``env``; ``skip`` is the number of frames each action is repeated."""
        super(MaxAndSkipEnv, self).__init__(env)
        # Only the two most recent raw frames are kept for max-pooling.
        self._obs_buffer = collections.deque(maxlen=2)
        self._skip = skip

    def step(self, action):
        """Apply ``action`` up to ``skip`` times, stopping early when the episode ends.

        Returns:
            (max_frame, total_reward, done, info) where ``info`` comes from
            the last inner step that was executed.
        """
        total_reward = 0.0
        done = None
        # Defined up front so the return below is valid even if the loop
        # body never runs (skip <= 0).
        info = {}
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            self._obs_buffer.append(obs)
            total_reward += reward
            if done:
                break
        max_frame = np.max(np.stack(self._obs_buffer), axis=0)
        return max_frame, total_reward, done, info

    def reset(self, **kwargs):
        """Clear the frame buffer and reset the wrapped environment.

        Keyword arguments are forwarded to the wrapped ``reset`` so that outer
        wrappers which pass ``**kwargs`` down (e.g. FireResetEnv) keep working.
        """
        self._obs_buffer.clear()
        obs = self.env.reset(**kwargs)
        self._obs_buffer.append(obs)
        return obs

In [None]:
class ScaledFloatFrame(gym.ObservationWrapper):
    """Convert observations to float32 and divide them by 255.

    For frames whose pixel values lie in 0..255 this maps them into 0..1.
    """

    def observation(self, obs):
        """Return ``obs`` as a new float32 array scaled by 1/255."""
        frame = np.array(obs)
        as_float = frame.astype(np.float32)
        return as_float / 255.0

In [None]:
class BufferWrapper(gym.ObservationWrapper):
    """Stack the most recent observations along axis 0.

    The observation space is the wrapped space repeated ``n_steps`` times
    along axis 0. Each new observation is written into the last slot of a
    rolling buffer after the older entries are shifted one slot forward.
    """

    def __init__(self, env, n_steps, dtype=np.float32):
        """Wrap ``env`` and build the stacked observation space.

        Args:
            env: environment whose observations are stacked.
            n_steps: number of repetitions of the wrapped space along axis 0.
            dtype: dtype of the stacked observation space and of the buffer.
        """
        super(BufferWrapper, self).__init__(env)
        self.dtype = dtype
        old_space = env.observation_space
        self.observation_space = gym.spaces.Box(old_space.low.repeat(n_steps, axis=0),
                                                old_space.high.repeat(n_steps, axis=0), dtype=dtype)
        # Allocate the buffer here as well so observation() is usable even
        # before the first reset().
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)

    def reset(self, **kwargs):
        """Zero the buffer, reset the wrapped env and return the first stacked observation."""
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        return self.observation(self.env.reset(**kwargs))

    def observation(self, observation):
        """Push ``observation`` into the rolling buffer and return a snapshot of it.

        NOTE(review): the shift moves entries by one slot on axis 0, so this
        presumably expects each incoming observation to occupy a single slot
        on that axis (as produced by ImageToPyTorch on a 1-channel frame).
        """
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        # Return a copy: the buffer is mutated in place on every call, so
        # handing out the buffer itself would silently overwrite observations
        # the caller is still holding (e.g. `state` vs. `next_state`).
        return self.buffer.copy()

In [None]:
class ImageToPyTorch(gym.ObservationWrapper):
    """Move the last (channel) axis of an image observation to the front.

    Conv2d expects images laid out as (channels, height, width), while the
    wrapped environment produces (height, width, channels).
    """

    def __init__(self, env):
        """Wrap ``env`` and declare the channel-first observation space."""
        super(ImageToPyTorch, self).__init__(env)
        old_shape = self.observation_space.shape
        channel_first_shape = (old_shape[-1], old_shape[0], old_shape[1])
        self.observation_space = gym.spaces.Box(
            low=0.0,
            high=1.0,
            shape=channel_first_shape,
            dtype=np.float32,
        )

    def observation(self, observation):
        """Return ``observation`` with axis 2 moved to position 0."""
        channel_first = np.moveaxis(observation, 2, 0)
        return channel_first

In [None]:
def make_env(env_name):
    """Create the gym environment ``env_name`` wrapped in the preprocessing stack.

    Wrappers are applied innermost to outermost in this order:
    MaxAndSkipEnv, FireResetEnv, ProcessFrame84, ImageToPyTorch,
    BufferWrapper (4 steps) and finally ScaledFloatFrame.
    """
    raw_env = gym.make(env_name)
    skipping_env = MaxAndSkipEnv(raw_env)
    firing_env = FireResetEnv(skipping_env)
    small_frame_env = ProcessFrame84(firing_env)
    channel_first_env = ImageToPyTorch(small_frame_env)
    stacked_env = BufferWrapper(channel_first_env, 4)
    return ScaledFloatFrame(stacked_env)

In [None]:
class PolicyNet(nn.Module):
    """Convolutional actor network that outputs a distribution over actions.

    Three Conv2d+ReLU layers extract features, which are flattened and fed
    through a two-layer fully connected head producing one logit per action.
    """

    def __init__(self, input_shape, n_actions):
        """Build the network.

        Args:
            input_shape: shape of a single observation, (channels, height, width).
            n_actions: number of discrete actions (size of the output layer).
        """
        super(PolicyNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 128, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        conv_out_size = self._get_conv_out(input_shape)
        self.policy = nn.Sequential(
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )

    def _get_conv_out(self, shape):
        """Return the flattened size of the conv output for one input of ``shape``."""
        # Shape probing only: no autograd graph is needed for the dummy pass.
        with torch.no_grad():
            o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, x):
        """Return a Categorical distribution over actions for a batch ``x``.

        ``x`` is a batch of observations shaped (batch, channels, height, width).
        """
        conv_out = self.conv(x).view(x.size()[0], -1)
        # Pass raw logits instead of softmax probabilities: Categorical then
        # normalises in log-space, so log_prob stays exact for very unlikely
        # actions instead of being clamped at log(eps).
        return Categorical(logits=self.policy(conv_out))

In [None]:
class CriticNet(nn.Module):
    """Convolutional critic network that outputs one state value per input.

    Uses three Conv2d+ReLU layers followed by a two-layer fully connected
    head with a single output unit.
    """

    def __init__(self, input_shape):
        """Build the network for observations shaped (channels, height, width)."""
        super(CriticNet, self).__init__()
        in_channels = input_shape[0]
        feature_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 128, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*feature_layers)

        n_features = self._get_conv_out(input_shape)
        value_head = [
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        ]
        self.critic = nn.Sequential(*value_head)

    def _get_conv_out(self, shape):
        """Return the flattened size of the conv output for one input of ``shape``."""
        probe = torch.zeros(1, *shape)
        feature_map = self.conv(probe)
        return int(np.prod(feature_map.size()))

    def forward(self, x):
        """Return the value estimate, shaped (batch, 1), for a batch ``x``."""
        batch_size = x.size()[0]
        features = self.conv(x)
        flat_features = features.view(batch_size, -1)
        return self.critic(flat_features)

In [None]:
'''
    Before feeding the frames to the neural network, every frame is scaled down from 210x160,
    with three color channels (RGB),
    to a single-channel 84x84 image using a colorimetric grayscale conversion.
    Different approaches are possible.
    One of them, used here, is to scale the image down and then crop the non-relevant parts:
    the grayscale frame is thresholded to black/white, resized to 84x110, and cropped to 84 rows.
'''

from google.colab.patches import cv2_imshow
class ProcessFrame84(gym.ObservationWrapper):
    """
    Converts an RGB frame to greyscale and thresholds it to black/white
    Downsamples the image to 84x110 and crops it to 84x84
    Returns a uint8 numpy array of shape (84, 84, 1)
    """
    def __init__(self, env=None):
        super(ProcessFrame84, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, obs):
        """Preprocess a raw frame coming from the wrapped environment."""
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        """Turn a raw RGB frame into an 84x84x1 uint8 image.

        Args:
            frame: array holding 210*160*3 or 250*160*3 values.

        Returns:
            uint8 array of shape (84, 84, 1).

        Raises:
            ValueError: if the frame has neither of the supported sizes.
        """
        if frame.size == 210 * 160 * 3:
            img = np.reshape(frame, [210, 160, 3]).astype(np.float32)
        elif frame.size == 250 * 160 * 3:
            img = np.reshape(frame, [250, 160, 3]).astype(np.float32)
        else:
            # Raise explicitly rather than `assert False`: asserts are removed
            # under `python -O`, which would leave `img` undefined below.
            raise ValueError("Unknown resolution.")

        ## Conversion from RGB to Gray Scale --> weights [0.2989, 0.5870, 0.1140]
        gray_scale_parameters = [0.2989, 0.5870, 0.1140]
        img = (img[:, :, 0] * gray_scale_parameters[0]
               + img[:, :, 1] * gray_scale_parameters[1]
               + img[:, :, 2] * gray_scale_parameters[2])

        ## Binarize: pixels brighter than `thresh` become 255, the rest 0
        thresh = 120
        img = cv2.threshold(img, thresh, 255, cv2.THRESH_BINARY)[1]

        ## cv2 takes the target size as (width, height): 84 wide, 110 high
        img = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA)
        ## keep rows 18..101 so the result is 84x84
        img = img[18:102, :]
        img = np.reshape(img, [84, 84, 1])
        return img.astype(np.uint8)


In [None]:
# env = make_env(ENVIROMENT_NAME)
# while True:
#   state = env.reset()

In [None]:
## one online actor-critic update from a single (state, action, reward, next_state) transition
def learn_step(action_log_prob, critic, value, reward, done, next_state, optimizerA, optimizerC, gamma=0.9):
    """Perform one one-step (TD(0)) actor-critic update.

    Args:
        action_log_prob: log-probability of the action taken, still attached
            to the actor's autograd graph.
        critic: callable mapping a batch of states to value estimates.
        value: critic output for the current state, still attached to the
            critic's autograd graph.
        reward: reward received for the transition.
        done: whether the transition ended the episode; if so, the
            next-state value is not bootstrapped.
        next_state: observation after the transition (without batch axis).
        optimizerA: optimizer for the actor parameters.
        optimizerC: optimizer for the critic parameters.
        gamma: discount factor.

    Returns:
        None. The actor and critic parameters are updated in place.
    """
    # Convert directly from the array: wrapping it in a Python list first
    # (torch.FloatTensor([next_state])) is very slow and ignores the device.
    next_state_t = torch.as_tensor(next_state, dtype=torch.float32, device=value.device).unsqueeze(0)

    # The bootstrapped target is treated as a constant: gradients must not
    # flow through the critic's estimate of the next state.
    with torch.no_grad():
        next_value = critic(next_state_t)
    target_q = reward + gamma * (1 - int(done)) * next_value
    advantage = target_q - value

    ## actor update: policy gradient weighted by the (constant) advantage
    optimizerA.zero_grad()
    actor_loss = -action_log_prob * advantage.detach()
    actor_loss.backward()
    optimizerA.step()

    ## critic update: squared TD error
    optimizerC.zero_grad()
    critic_loss = advantage.pow(2)
    critic_loss.backward()
    optimizerC.step()

In [None]:
# env = make_env(ENVIROMENT_NAME)


# #observations = env.reset()

# GPU = False
# device = ("cuda" if GPU else "cpu")
# #print('env.observation_space.shape: ',env.observation_space.shape)
# policy_nn = PolicyNet(env.observation_space.shape, env.action_space.n).to(device)
# critic_nn = CriticNet(env.observation_space.shape).to(device)
# #print(policy_nn)
# #print(critic_nn)

# ## initialize an optimizer
# policy_optimizer = torch.optim.Adam(policy_nn.parameters(), lr=1e-3)
# critic_optimizer = torch.optim.Adam(critic_nn.parameters(), lr=1e-3)

# state = env.reset()
# #print(state.shape[0])


# action_log_probs = list()
# rewards = list()
# values = list()
# next_states = list()
# running_reward = 10
# #while True:
# ## take an action sampled from a categorical distribution given the state
# action_prob = policy_nn(torch.FloatTensor(state).unsqueeze(0))
# action = action_prob.sample()
# action_log_prob = action_prob.log_prob(action)

# #print(action_log_prob)
# # print(action)
# # print(action_log_prob)

# value = critic_nn(torch.FloatTensor(state).unsqueeze(0))
# #print(value)
# #values.append(value[0])
# #print(action)

# next_state, reward, is_done, _ = env.step(action.item()) # take a random action
# rewards.append(reward)

# ## do a learning step (online) ##
# learn_step(action_log_prob, critic_nn, value, reward, is_done, next_state, policy_optimizer, critic_optimizer)

# ## current state is next state now
# state = next_state


In [None]:
# Gym environment id handed to make_env() when the training environment is built.
ENVIROMENT_NAME = 'Pong-v0'

##################################################################
##                   HYPERPARAMETERS                            ##
##################################################################
# Discount factor applied to the critic's next-state value in the TD target.
GAMMA = 0.99

In [None]:
## Online one-step actor-critic training loop: one actor update and one
## critic update after every environment step.
env = make_env(ENVIROMENT_NAME)

GPU = False
device = ("cuda" if GPU else "cpu")
print('env.observation_space.shape: ',env.observation_space.shape)
policy_nn = PolicyNet(env.observation_space.shape, env.action_space.n).to(device)
critic_nn = CriticNet(env.observation_space.shape).to(device)

## initialize an optimizer
policy_optimizer = torch.optim.Adam(policy_nn.parameters(), lr=1e-3)
critic_optimizer = torch.optim.Adam(critic_nn.parameters(), lr=1e-3)

## Exponential moving average of the episode reward. It is seeded with the
## first episode's reward instead of an arbitrary constant, so the reported
## average is not skewed during the first episodes.
running_reward = None
for e in count(1):

    rewards = []
    state = env.reset()

    for t in range(100000):
        ## build the input once, on the same device as the networks
        state_t = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

        ## take an action sampled from a categorical distribution given the state
        action_prob = policy_nn(state_t)
        action = action_prob.sample()
        action_log_prob = action_prob.log_prob(action)

        value = critic_nn(state_t)

        next_state, reward, is_done, _ = env.step(action.item())  # apply the sampled action
        rewards.append(reward)

        ## do a learning step (online) ##
        next_state_t = torch.as_tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0)
        ## the bootstrapped target is a constant: no gradient through V(s')
        with torch.no_grad():
            next_value = critic_nn(next_state_t)
        target_q = reward + GAMMA * (1 - int(is_done)) * next_value
        advantage = target_q - value

        ## compute losses
        policy_optimizer.zero_grad()
        actor_loss = -action_log_prob * advantage.detach()
        actor_loss.backward()
        policy_optimizer.step()

        critic_optimizer.zero_grad()
        critic_loss = advantage.pow(2)
        critic_loss.backward()
        critic_optimizer.step()

        ## current state is next state now
        state = next_state

        if is_done:
            break

    ## get stats
    ep_reward = sum(rewards)
    if running_reward is None:
        running_reward = ep_reward
    else:
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
    if e % 10 == 0:
        print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  e, ep_reward, running_reward))
    if (env.spec.reward_threshold is not None):
        if (running_reward > env.spec.reward_threshold):
            print("Solved! Running reward is now {} and "
                      "the last episode runs to {} time steps!".format(running_reward, t))
            break

env.observation_space.shape:  (4, 84, 84)
Episode 10	Last reward: -21.00	Average reward: -2.44
Episode 20	Last reward: -21.00	Average reward: -9.89
Episode 30	Last reward: -21.00	Average reward: -14.35
Episode 40	Last reward: -21.00	Average reward: -17.02
Episode 50	Last reward: -21.00	Average reward: -18.61
Episode 60	Last reward: -21.00	Average reward: -19.57
Episode 70	Last reward: -21.00	Average reward: -20.14
Episode 80	Last reward: -21.00	Average reward: -20.49
Episode 90	Last reward: -21.00	Average reward: -20.69
Episode 100	Last reward: -21.00	Average reward: -20.82
Episode 110	Last reward: -21.00	Average reward: -20.89
