In [None]:
'''
Installing packages for rendering the game on Colab
'''

!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!pip install git+https://github.com/tensorflow/docs > /dev/null 2>&1
!pip install gym[classic_control]

Collecting pygame==2.1.0 (from gym[classic_control])
  Downloading pygame-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pygame
  Attempting uninstall: pygame
    Found existing installation: pygame 2.5.2
    Uninstalling pygame-2.5.2:
      Successfully uninstalled pygame-2.5.2
Successfully installed pygame-2.1.0


In [None]:
'''
A bunch of imports, you don't have to worry about these
'''

import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple, deque
import torch.optim as optim
import datetime
import gym
from gym.wrappers.record_video import RecordVideo
import glob
import io
import base64
import matplotlib.pyplot as plt
from IPython.display import HTML
from pyvirtualdisplay import Display
import tensorflow as tf
from IPython import display as ipythondisplay
from PIL import Image
import tensorflow_probability as tfp

from scipy.special import softmax

  if (distutils.version.LooseVersion(tf.__version__) <


## Environment 1 : CartPole-v1 and Type 1 Algorithm

In [None]:
'''
'Cartpole-v1 Type 1'
'''

# Run five independent experiments; the loop index is also used as the environment seed.
for this_iterator in range(5):
    print("Experiment ",this_iterator+1," Starting")

    # Fresh environment for every experiment, seeded with the experiment index.
    env = gym.make('CartPole-v1')
    env.seed(this_iterator)

    # Length of the observation vector and number of discrete actions.
    state_shape = env.observation_space.shape[0]
    no_of_actions = env.action_space.n

    # Sanity prints: sizes plus one randomly sampled action.
    print(state_shape)
    print(no_of_actions)
    print(env.action_space.sample())
    print("----")

    '''
    The Environment keeps a variable specifically for the current state.
    - Everytime an action is passed to the environment, it calculates the new state and updates the current state variable.
    - It returns the new current state and reward for the agent to take the next action
    '''

    # NOTE(review): written against the legacy gym API, where reset() returns only the
    # observation and step() returns a 4-tuple (next_state, reward, done, info).
    state = env.reset()
    ''' This returns the initial state (when environment is reset) '''

    print(state)
    print("----")

    action = env.action_space.sample()
    ''' We take a random action now '''

    print(action)
    print("----")

    # One probe step with the random action, only to display what step() returns.
    next_state, reward, done, info = env.step(action)
    ''' env.step is used to calculate new state and obtain reward based on old state and action taken  '''

    print(next_state)
    print(reward)
    print(done)
    print(info)
    print("----")

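    '''
    ### Q Network & Some 'hyperparameters'

    QNetwork1 (dueling architecture):
    Input Layer - 4 nodes (State Shape) \
    Shared Hidden Layer - 128 nodes \
    Value Stream - 64 hidden nodes, then 1 output node V(s) \
    Advantage Stream - 64 hidden nodes, then 2 output nodes A(s, a) (Action Space) \
    Output - Q(s, a) = V(s) + (A(s, a) - mean over actions of A(s, a)) \
    Optimizer - Adam
    '''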

    import torch
    import torch.nn as nn
    import torch.nn.functional as F


    '''
    Bunch of Hyper parameters (Which you might have to tune later)
    '''
    BUFFER_SIZE = int(1e5)  # replay buffer capacity (deque maxlen); the oldest transitions are dropped once it is full
    BATCH_SIZE = 64         # transitions sampled per learning step; learning starts once the buffer holds this many
    GAMMA = 0.99            # discount factor applied to the bootstrapped next-state value
    LR = 5e-4               # learning rate of the Adam optimizer
    UPDATE_EVERY = 20       # number of agent steps between hard copies of the local weights into the target network


    class QNetwork1(nn.Module):
        """Dueling Q-network: one shared hidden layer feeding separate value and advantage streams.

        The two streams are recombined as Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
        """

        def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
            """Initialize parameters and build model.
            Params
            ======
                state_size (int): Dimension of each state
                action_size (int): Number of discrete actions (width of the advantage output)
                seed (int): Random seed passed to torch.manual_seed before the layers are created
                fc1_units (int): Number of nodes in the shared first hidden layer
                fc2_units (int): Number of nodes in the hidden layer of each stream
            """
            super().__init__()
            # Seeding right before layer creation means two networks built with the same
            # seed start from identical weights.
            self.seed = torch.manual_seed(seed)
            self.fc1 = nn.Linear(state_size, fc1_units)
            self.fc_value = nn.Linear(fc1_units, fc2_units)
            self.fc_adv = nn.Linear(fc1_units, fc2_units)
            self.out_value = nn.Linear(fc2_units, 1)
            self.out_adv = nn.Linear(fc2_units, action_size)

        def forward(self, state):
            """Map a state, or a batch of states, to one Q-value per action.

            Params
            ======
                state (Tensor): shape (..., state_size); batched or a single unbatched state
            Returns
            =======
                Tensor of shape (..., action_size)
            """
            x = F.relu(self.fc1(state))
            x1 = F.relu(self.fc_value(x))
            x2 = F.relu(self.fc_adv(x))

            value = self.out_value(x1)
            adv = self.out_adv(x2)

            # Reduce over the last (action) axis instead of a hard-coded dim=1 so that
            # unbatched 1-D states work as well as (batch, state_size) inputs.
            Q = value + (adv - adv.mean(dim=-1, keepdim=True))

            return Q

    import random
    import torch
    import numpy as np
    from collections import deque, namedtuple

    # Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    class ReplayBuffer:
        """Bounded FIFO store of transitions from which random minibatches are drawn."""

        def __init__(self, action_size, buffer_size, batch_size, seed):
            """Set up an empty buffer.

            Params
            ======
                action_size (int): dimension of each action (stored; not otherwise used by the buffer)
                buffer_size (int): maximum number of transitions kept; older ones are evicted first
                batch_size (int): number of transitions returned by `sample`
                seed (int): value passed to `random.seed`
            """
            fields = ["state", "action", "reward", "next_state", "done"]
            self.experience = namedtuple("Experience", field_names=fields)
            self.memory = deque(maxlen=buffer_size)
            self.action_size = action_size
            self.batch_size = batch_size
            # random.seed returns None, so this attribute holds None; the seeding call is what matters.
            self.seed = random.seed(seed)

        def add(self, state, action, reward, next_state, done):
            """Append one transition to the buffer."""
            self.memory.append(self.experience(state, action, reward, next_state, done))

        def sample(self):
            """Draw `batch_size` transitions at random and return them as column tensors on `device`.

            Returns
            =======
                (states, actions, rewards, next_states, dones): float tensors, except `actions` which is long
            """
            batch = [exp for exp in random.sample(self.memory, k=self.batch_size) if exp is not None]

            def stack(field):
                # One row per sampled transition.
                return np.vstack([getattr(exp, field) for exp in batch])

            states = torch.from_numpy(stack("state")).float().to(device)
            actions = torch.from_numpy(stack("action")).long().to(device)
            rewards = torch.from_numpy(stack("reward")).float().to(device)
            next_states = torch.from_numpy(stack("next_state")).float().to(device)
            # Done flags are cast to uint8 and then to float.
            dones = torch.from_numpy(stack("done").astype(np.uint8)).float().to(device)

            return (states, actions, rewards, next_states, dones)

        def __len__(self):
            """Number of transitions currently stored."""
            return len(self.memory)

    class TutorialAgent_epsilon():
        """DQN agent with epsilon-greedy exploration, experience replay and a periodically synced target network."""

        def __init__(self, state_size, action_size, seed):
            """Build the two Q-networks, the optimizer and the replay memory.

            Params
            ======
                state_size (int): dimension of each state
                action_size (int): number of discrete actions
                seed (int): seed forwarded to `random`, to both networks and to the replay buffer
            """
            # Agent / environment interface
            self.state_size = state_size
            self.action_size = action_size
            self.seed = random.seed(seed)

            # The local network is the one being trained; the target network supplies bootstrap values.
            self.qnetwork_local = QNetwork1(state_size, action_size, seed).to(device)
            self.qnetwork_target = QNetwork1(state_size, action_size, seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

            # Replay memory
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

            # Step counter that gates the target-network sync (wraps every UPDATE_EVERY steps).
            self.t_step = 0

        def step(self, state, action, reward, next_state, done):
            """Store one transition, learn from a sampled minibatch, and periodically sync the target network."""
            self.memory.add(state, action, reward, next_state, done)

            # Learn on every step once at least one full minibatch is stored.
            enough_samples = len(self.memory) >= BATCH_SIZE
            if enough_samples:
                self.learn(self.memory.sample(), GAMMA)

            # Hard-copy the local weights into the target network every UPDATE_EVERY steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step != 0:
                return
            self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

        def act(self, state, eps=0.):
            """Choose an action epsilon-greedily from the local network's Q-values.

            Params
            ======
                state (np.ndarray): current observation
                eps (float): probability threshold for taking a random action
            """
            state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(device)

            # Evaluate without tracking gradients, then put the network back in training mode.
            self.qnetwork_local.eval()
            with torch.no_grad():
                q_values = self.qnetwork_local(state_tensor)
            self.qnetwork_local.train()

            exploit = random.random() > eps
            if not exploit:
                return random.choice(np.arange(self.action_size))
            return np.argmax(q_values.cpu().data.numpy())

        def learn(self, experiences, gamma):
            """Run one gradient step on a minibatch of transitions.

            Params
            ======
                experiences (tuple): (states, actions, rewards, next_states, dones) tensors
                gamma (float): discount factor
            """
            states, actions, rewards, next_states, dones = experiences

            # Bootstrap value: max over actions of the target network's Q, detached from autograd.
            next_q = self.qnetwork_target(next_states).detach()
            best_next_q = next_q.max(1)[0].unsqueeze(1)

            # TD target; (1 - dones) removes the bootstrap term for transitions flagged done.
            td_target = rewards + (gamma * best_next_q * (1 - dones))

            # Q-values the local network assigns to the actions that were actually taken.
            predicted_q = self.qnetwork_local(states).gather(1, actions)

            loss = F.mse_loss(predicted_q, td_target)

            self.optimizer.zero_grad()
            loss.backward()

            # Clamp every gradient element to [-1, 1] before the optimizer step.
            for weight in self.qnetwork_local.parameters():
                weight.grad.data.clamp_(-1, 1)

            self.optimizer.step()

    ''' Defining DQN Algorithm '''

    # Observation length and number of discrete actions, read from the environment;
    # these are the sizes handed to the agent constructor further down.
    state_shape = env.observation_space.shape[0]
    action_shape = env.action_space.n


    def dqn_epsilon(agent, n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        """Train `agent` on the enclosing `env` with epsilon-greedy exploration.

        Params
        ======
            agent: object exposing `act(state, eps)` and `step(state, action, reward, next_state, done)`
            n_episodes (int): number of episodes to run (there is no early stopping)
            max_t (int): maximum number of steps per episode
            eps_start (float): initial epsilon
            eps_end (float): lower bound for epsilon
            eps_decay (float): multiplicative decay applied to epsilon after every episode

        Returns
        =======
            list with the total reward of every episode, in order
        """
        recent_scores = deque(maxlen=100)  # rolling window used only for the printed average
        episode_returns = []
        epsilon = eps_start

        for episode in range(1, n_episodes + 1):

            state = env.reset()
            episode_return = 0
            for _ in range(max_t):
                action = agent.act(state, epsilon)
                next_state, reward, done, _info = env.step(action)
                agent.step(state, action, reward, next_state, done)
                state = next_state
                episode_return += reward
                if done:
                    break

            recent_scores.append(episode_return)
            episode_returns.append(episode_return)
            # Decay epsilon once per episode, never going below eps_end.
            epsilon = max(eps_end, eps_decay * epsilon)

            # The in-place progress line is overwritten each episode; every 100th episode
            # it is printed again with a newline so it stays in the log.
            progress = '\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(recent_scores))
            print(progress, end="")
            if episode % 100 == 0:
                print(progress)

        return episode_returns

    ''' Trial run to check if algorithm runs and saves the data '''

    import os

    # Create the results folder BEFORE the long training run, so the np.save below
    # cannot fail with FileNotFoundError after training has already finished.
    # NOTE(review): if Google Drive is not mounted, this creates the folder on the
    # local Colab disk instead - confirm that is acceptable.
    results_dir = '/content/drive/MyDrive/Gaurav_Jikooshokai'
    os.makedirs(results_dir, exist_ok=True)
    results_path = os.path.join(results_dir, 'CartPole_Type_1_Exp_' + str(this_iterator + 1) + '.npy')

    begin_time = datetime.datetime.now()

    # The experiment index doubles as the agent seed.
    agent_epsilon = TutorialAgent_epsilon(state_size=state_shape, action_size=action_shape, seed=this_iterator)
    rewards_epsilon = dqn_epsilon(agent_epsilon)

    time_taken = datetime.datetime.now() - begin_time
    # One .npy file of per-episode rewards per experiment.
    np.save(results_path, rewards_epsilon)
    print(time_taken)
    print("============================================================================================")

Experiment  4  Starting
4
2
1
----
[-0.04143508 -0.02631895  0.03012745  0.0082162 ]
----
0
----
[-0.04196146 -0.22185972  0.03029177  0.3102504 ]
1.0
False
{}
----


  deprecation(
  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Episode 100	Average Score: 40.11
Episode 200	Average Score: 126.56
Episode 300	Average Score: 160.11
Episode 400	Average Score: 139.45
Episode 500	Average Score: 107.50
Episode 600	Average Score: 150.19
Episode 700	Average Score: 135.62
Episode 800	Average Score: 82.44
Episode 900	Average Score: 33.33
Episode 1000	Average Score: 25.75
Episode 1100	Average Score: 20.85
Episode 1200	Average Score: 19.25
Episode 1300	Average Score: 18.48
Episode 1400	Average Score: 25.27
Episode 1500	Average Score: 160.13
Episode 1600	Average Score: 273.39
Episode 1700	Average Score: 208.18
Episode 1800	Average Score: 209.06
Episode 1900	Average Score: 221.37
Episode 2000	Average Score: 209.93
Episode 2100	Average Score: 201.30
Episode 2200	Average Score: 192.22
Episode 2300	Average Score: 186.68
Episode 2400	Average Score: 169.40
Episode 2500	Average Score: 79.02
Episode 2600	Average Score: 24.20
Episode 2700	Average Score: 16.82
Episode 2800	Average Score: 14.80
Episode 2900	Average Score: 14.17
Episode

  deprecation(
  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Episode 100	Average Score: 30.64
Episode 200	Average Score: 139.00
Episode 300	Average Score: 160.54
Episode 400	Average Score: 87.04
Episode 500	Average Score: 14.53
Episode 600	Average Score: 31.03
Episode 700	Average Score: 200.81
Episode 800	Average Score: 181.72
Episode 900	Average Score: 187.00
Episode 1000	Average Score: 194.82
Episode 1100	Average Score: 215.00
Episode 1200	Average Score: 186.23
Episode 1300	Average Score: 100.35
Episode 1400	Average Score: 17.58
Episode 1500	Average Score: 11.73
Episode 1600	Average Score: 10.58
Episode 1700	Average Score: 9.78
Episode 1800	Average Score: 9.85
Episode 1900	Average Score: 11.04
Episode 2000	Average Score: 75.17
Episode 2100	Average Score: 41.74
Episode 2200	Average Score: 207.04
Episode 2300	Average Score: 207.15
Episode 2400	Average Score: 229.76
Episode 2500	Average Score: 247.00
Episode 2600	Average Score: 245.39
Episode 2700	Average Score: 217.00
Episode 2800	Average Score: 205.14
Episode 2900	Average Score: 198.43
Episode 

## Environment 1 : CartPole-v1 and Type 2 Algorithm

In [None]:
'''
'Cartpole-v1 Type 2'
'''

# Run five independent experiments; the loop index is also used as the environment seed.
for this_iterator in range(5):
    print("Experiment ",this_iterator+1," Starting")

    # Fresh environment for every experiment, seeded with the experiment index.
    env = gym.make('CartPole-v1')
    env.seed(this_iterator)

    # Length of the observation vector and number of discrete actions.
    state_shape = env.observation_space.shape[0]
    no_of_actions = env.action_space.n

    # Sanity prints: sizes plus one randomly sampled action.
    print(state_shape)
    print(no_of_actions)
    print(env.action_space.sample())
    print("----")

    '''
    The Environment keeps a variable specifically for the current state.
    - Everytime an action is passed to the environment, it calculates the new state and updates the current state variable.
    - It returns the new current state and reward for the agent to take the next action
    '''

    # NOTE(review): written against the legacy gym API, where reset() returns only the
    # observation and step() returns a 4-tuple (next_state, reward, done, info).
    state = env.reset()
    ''' This returns the initial state (when environment is reset) '''

    print(state)
    print("----")

    action = env.action_space.sample()
    ''' We take a random action now '''

    print(action)
    print("----")

    # One probe step with the random action, only to display what step() returns.
    next_state, reward, done, info = env.step(action)
    ''' env.step is used to calculate new state and obtain reward based on old state and action taken  '''

    print(next_state)
    print(reward)
    print(done)
    print(info)
    print("----")

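    '''
    ### Q Network & Some 'hyperparameters'

    QNetwork1 (dueling architecture):
    Input Layer - 4 nodes (State Shape) \
    Shared Hidden Layer - 128 nodes \
    Value Stream - 64 hidden nodes, then 1 output node V(s) \
    Advantage Stream - 64 hidden nodes, then 2 output nodes A(s, a) (Action Space) \
    Output - Q(s, a) = V(s) + (A(s, a) - max over actions of A(s, a)) \
    Optimizer - Adam
    '''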

    import torch
    import torch.nn as nn
    import torch.nn.functional as F


    '''
    Bunch of Hyper parameters (Which you might have to tune later)
    '''
    BUFFER_SIZE = int(1e5)  # replay buffer capacity (deque maxlen); the oldest transitions are dropped once it is full
    BATCH_SIZE = 64         # transitions sampled per learning step; learning starts once the buffer holds this many
    GAMMA = 0.99            # discount factor applied to the bootstrapped next-state value
    LR = 5e-4               # learning rate of the Adam optimizer
    UPDATE_EVERY = 20       # number of agent steps between hard copies of the local weights into the target network


    class QNetwork1(nn.Module):
        """Dueling Q-network (Type 2): one shared hidden layer feeding separate value and advantage streams.

        The two streams are recombined as Q(s, a) = V(s) + (A(s, a) - max_a A(s, a)).
        """

        def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
            """Initialize parameters and build model.
            Params
            ======
                state_size (int): Dimension of each state
                action_size (int): Number of discrete actions (width of the advantage output)
                seed (int): Random seed passed to torch.manual_seed before the layers are created
                fc1_units (int): Number of nodes in the shared first hidden layer
                fc2_units (int): Number of nodes in the hidden layer of each stream
            """
            super().__init__()
            # Seeding right before layer creation means two networks built with the same
            # seed start from identical weights.
            self.seed = torch.manual_seed(seed)
            self.fc1 = nn.Linear(state_size, fc1_units)
            self.fc_value = nn.Linear(fc1_units, fc2_units)
            self.fc_adv = nn.Linear(fc1_units, fc2_units)
            self.out_value = nn.Linear(fc2_units, 1)
            self.out_adv = nn.Linear(fc2_units, action_size)

        def forward(self, state):
            """Map a state, or a batch of states, to one Q-value per action.

            Params
            ======
                state (Tensor): shape (..., state_size); batched or a single unbatched state
            Returns
            =======
                Tensor of shape (..., action_size)
            """
            x = F.relu(self.fc1(state))
            x1 = F.relu(self.fc_value(x))
            x2 = F.relu(self.fc_adv(x))

            value = self.out_value(x1)
            adv = self.out_adv(x2)

            # Type 2 Algorithm: subtract the maximum advantage (Type 1 subtracts the mean).
            # Reduce over the last (action) axis instead of a hard-coded dim=1 so that
            # unbatched 1-D states work as well as (batch, state_size) inputs.
            Q = value + (adv - adv.max(dim=-1, keepdim=True)[0])

            return Q

    import random
    import torch
    import numpy as np
    from collections import deque, namedtuple

    # Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    class ReplayBuffer:
        """Bounded FIFO store of transitions from which random minibatches are drawn."""

        def __init__(self, action_size, buffer_size, batch_size, seed):
            """Set up an empty buffer.

            Params
            ======
                action_size (int): dimension of each action (stored; not otherwise used by the buffer)
                buffer_size (int): maximum number of transitions kept; older ones are evicted first
                batch_size (int): number of transitions returned by `sample`
                seed (int): value passed to `random.seed`
            """
            fields = ["state", "action", "reward", "next_state", "done"]
            self.experience = namedtuple("Experience", field_names=fields)
            self.memory = deque(maxlen=buffer_size)
            self.action_size = action_size
            self.batch_size = batch_size
            # random.seed returns None, so this attribute holds None; the seeding call is what matters.
            self.seed = random.seed(seed)

        def add(self, state, action, reward, next_state, done):
            """Append one transition to the buffer."""
            self.memory.append(self.experience(state, action, reward, next_state, done))

        def sample(self):
            """Draw `batch_size` transitions at random and return them as column tensors on `device`.

            Returns
            =======
                (states, actions, rewards, next_states, dones): float tensors, except `actions` which is long
            """
            batch = [exp for exp in random.sample(self.memory, k=self.batch_size) if exp is not None]

            def stack(field):
                # One row per sampled transition.
                return np.vstack([getattr(exp, field) for exp in batch])

            states = torch.from_numpy(stack("state")).float().to(device)
            actions = torch.from_numpy(stack("action")).long().to(device)
            rewards = torch.from_numpy(stack("reward")).float().to(device)
            next_states = torch.from_numpy(stack("next_state")).float().to(device)
            # Done flags are cast to uint8 and then to float.
            dones = torch.from_numpy(stack("done").astype(np.uint8)).float().to(device)

            return (states, actions, rewards, next_states, dones)

        def __len__(self):
            """Number of transitions currently stored."""
            return len(self.memory)

    class TutorialAgent_epsilon():
        """DQN agent with epsilon-greedy exploration, experience replay and a periodically synced target network."""

        def __init__(self, state_size, action_size, seed):
            """Build the two Q-networks, the optimizer and the replay memory.

            Params
            ======
                state_size (int): dimension of each state
                action_size (int): number of discrete actions
                seed (int): seed forwarded to `random`, to both networks and to the replay buffer
            """
            # Agent / environment interface
            self.state_size = state_size
            self.action_size = action_size
            self.seed = random.seed(seed)

            # The local network is the one being trained; the target network supplies bootstrap values.
            self.qnetwork_local = QNetwork1(state_size, action_size, seed).to(device)
            self.qnetwork_target = QNetwork1(state_size, action_size, seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

            # Replay memory
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

            # Step counter that gates the target-network sync (wraps every UPDATE_EVERY steps).
            self.t_step = 0

        def step(self, state, action, reward, next_state, done):
            """Store one transition, learn from a sampled minibatch, and periodically sync the target network."""
            self.memory.add(state, action, reward, next_state, done)

            # Learn on every step once at least one full minibatch is stored.
            enough_samples = len(self.memory) >= BATCH_SIZE
            if enough_samples:
                self.learn(self.memory.sample(), GAMMA)

            # Hard-copy the local weights into the target network every UPDATE_EVERY steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step != 0:
                return
            self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

        def act(self, state, eps=0.):
            """Choose an action epsilon-greedily from the local network's Q-values.

            Params
            ======
                state (np.ndarray): current observation
                eps (float): probability threshold for taking a random action
            """
            state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(device)

            # Evaluate without tracking gradients, then put the network back in training mode.
            self.qnetwork_local.eval()
            with torch.no_grad():
                q_values = self.qnetwork_local(state_tensor)
            self.qnetwork_local.train()

            exploit = random.random() > eps
            if not exploit:
                return random.choice(np.arange(self.action_size))
            return np.argmax(q_values.cpu().data.numpy())

        def learn(self, experiences, gamma):
            """Run one gradient step on a minibatch of transitions.

            Params
            ======
                experiences (tuple): (states, actions, rewards, next_states, dones) tensors
                gamma (float): discount factor
            """
            states, actions, rewards, next_states, dones = experiences

            # Bootstrap value: max over actions of the target network's Q, detached from autograd.
            next_q = self.qnetwork_target(next_states).detach()
            best_next_q = next_q.max(1)[0].unsqueeze(1)

            # TD target; (1 - dones) removes the bootstrap term for transitions flagged done.
            td_target = rewards + (gamma * best_next_q * (1 - dones))

            # Q-values the local network assigns to the actions that were actually taken.
            predicted_q = self.qnetwork_local(states).gather(1, actions)

            loss = F.mse_loss(predicted_q, td_target)

            self.optimizer.zero_grad()
            loss.backward()

            # Clamp every gradient element to [-1, 1] before the optimizer step.
            for weight in self.qnetwork_local.parameters():
                weight.grad.data.clamp_(-1, 1)

            self.optimizer.step()

    ''' Defining DQN Algorithm '''

    # Observation length and number of discrete actions, read from the environment;
    # these are the sizes handed to the agent constructor further down.
    state_shape = env.observation_space.shape[0]
    action_shape = env.action_space.n


    def dqn_epsilon(agent, n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        """Train `agent` on the enclosing `env` with epsilon-greedy exploration.

        Params
        ======
            agent: object exposing `act(state, eps)` and `step(state, action, reward, next_state, done)`
            n_episodes (int): number of episodes to run (there is no early stopping)
            max_t (int): maximum number of steps per episode
            eps_start (float): initial epsilon
            eps_end (float): lower bound for epsilon
            eps_decay (float): multiplicative decay applied to epsilon after every episode

        Returns
        =======
            list with the total reward of every episode, in order
        """
        recent_scores = deque(maxlen=100)  # rolling window used only for the printed average
        episode_returns = []
        epsilon = eps_start

        for episode in range(1, n_episodes + 1):

            state = env.reset()
            episode_return = 0
            for _ in range(max_t):
                action = agent.act(state, epsilon)
                next_state, reward, done, _info = env.step(action)
                agent.step(state, action, reward, next_state, done)
                state = next_state
                episode_return += reward
                if done:
                    break

            recent_scores.append(episode_return)
            episode_returns.append(episode_return)
            # Decay epsilon once per episode, never going below eps_end.
            epsilon = max(eps_end, eps_decay * epsilon)

            # The in-place progress line is overwritten each episode; every 100th episode
            # it is printed again with a newline so it stays in the log.
            progress = '\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(recent_scores))
            print(progress, end="")
            if episode % 100 == 0:
                print(progress)

        return episode_returns

    ''' Trial run to check if algorithm runs and saves the data '''

    import os

    # Create the results folder BEFORE the long training run, so the np.save below
    # cannot fail with FileNotFoundError after training has already finished.
    # NOTE(review): if Google Drive is not mounted, this creates the folder on the
    # local Colab disk instead - confirm that is acceptable.
    results_dir = '/content/drive/MyDrive/Gaurav_Jikooshokai'
    os.makedirs(results_dir, exist_ok=True)
    results_path = os.path.join(results_dir, 'CartPole_Type_2_Exp_' + str(this_iterator + 1) + '.npy')

    begin_time = datetime.datetime.now()

    # The experiment index doubles as the agent seed.
    agent_epsilon = TutorialAgent_epsilon(state_size=state_shape, action_size=action_shape, seed=this_iterator)
    rewards_epsilon = dqn_epsilon(agent_epsilon)

    time_taken = datetime.datetime.now() - begin_time
    # One .npy file of per-episode rewards per experiment.
    np.save(results_path, rewards_epsilon)
    print(time_taken)
    print("============================================================================================")

Experiment  4  Starting
4
2
1
----
[-0.04143508 -0.02631895  0.03012745  0.0082162 ]
----
1
----
[-0.04196146  0.16835827  0.03029177 -0.27481097]
1.0
False
{}
----


  deprecation(
  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Episode 100	Average Score: 40.56
Episode 200	Average Score: 126.41
Episode 300	Average Score: 94.47
Episode 400	Average Score: 59.36
Episode 500	Average Score: 94.37
Episode 600	Average Score: 161.83
Episode 700	Average Score: 166.59
Episode 800	Average Score: 144.59
Episode 900	Average Score: 144.06
Episode 1000	Average Score: 138.74
Episode 1100	Average Score: 138.97
Episode 1200	Average Score: 128.20
Episode 1300	Average Score: 120.83
Episode 1400	Average Score: 106.69
Episode 1500	Average Score: 89.96
Episode 1600	Average Score: 75.62
Episode 1700	Average Score: 67.14
Episode 1800	Average Score: 68.61
Episode 1900	Average Score: 80.23
Episode 2000	Average Score: 85.72
Episode 2100	Average Score: 160.48
Episode 2200	Average Score: 177.29
Episode 2300	Average Score: 181.70
Episode 2400	Average Score: 185.76
Episode 2500	Average Score: 188.83
Episode 2600	Average Score: 153.67
Episode 2700	Average Score: 111.21
Episode 2800	Average Score: 90.48
Episode 2900	Average Score: 70.58
Episod

  deprecation(
  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Episode 100	Average Score: 38.49
Episode 200	Average Score: 128.22
Episode 300	Average Score: 111.59
Episode 400	Average Score: 41.83
Episode 500	Average Score: 97.85
Episode 600	Average Score: 143.17
Episode 700	Average Score: 121.50
Episode 800	Average Score: 117.19
Episode 900	Average Score: 107.75
Episode 1000	Average Score: 118.22
Episode 1100	Average Score: 100.27
Episode 1200	Average Score: 111.42
Episode 1300	Average Score: 156.51
Episode 1400	Average Score: 154.50
Episode 1500	Average Score: 143.22
Episode 1600	Average Score: 130.91
Episode 1700	Average Score: 131.72
Episode 1800	Average Score: 146.82
Episode 1900	Average Score: 150.20
Episode 2000	Average Score: 122.13
Episode 2100	Average Score: 94.81
Episode 2200	Average Score: 88.84
Episode 2300	Average Score: 156.87
Episode 2400	Average Score: 169.84
Episode 2500	Average Score: 169.27
Episode 2600	Average Score: 150.46
Episode 2700	Average Score: 113.19
Episode 2800	Average Score: 83.78
Episode 2900	Average Score: 71.08
E

## Environment 2 : Acrobot-v1 and Type 1 Algorithm

In [None]:
'''
'Acrobot-v1 Type 1'
'''

# Run five independent experiments; the loop index is also used as the environment seed.
for this_iterator in range(5):
    print("Experiment ",this_iterator+1," Starting")

    # Fresh environment for every experiment, seeded with the experiment index.
    env = gym.make('Acrobot-v1')
    env.seed(this_iterator)

    # Length of the observation vector and number of discrete actions.
    state_shape = env.observation_space.shape[0]
    no_of_actions = env.action_space.n

    # Sanity prints: sizes plus one randomly sampled action.
    print(state_shape)
    print(no_of_actions)
    print(env.action_space.sample())
    print("----")

    '''
    The Environment keeps a variable specifically for the current state.
    - Everytime an action is passed to the environment, it calculates the new state and updates the current state variable.
    - It returns the new current state and reward for the agent to take the next action
    '''

    # NOTE(review): written against the legacy gym API, where reset() returns only the
    # observation and step() returns a 4-tuple (next_state, reward, done, info).
    state = env.reset()
    ''' This returns the initial state (when environment is reset) '''

    print(state)
    print("----")

    action = env.action_space.sample()
    ''' We take a random action now '''

    print(action)
    print("----")

    # One probe step with the random action, only to display what step() returns.
    next_state, reward, done, info = env.step(action)
    ''' env.step is used to calculate new state and obtain reward based on old state and action taken  '''

    print(next_state)
    print(reward)
    print(done)
    print(info)
    print("----")

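    '''
    ### Q Network & Some 'hyperparameters'

    QNetwork1 (dueling architecture):
    Input Layer - 6 nodes (State Shape) \
    Shared Hidden Layer - 128 nodes \
    Value Stream - 64 hidden nodes, then 1 output node V(s) \
    Advantage Stream - 64 hidden nodes, then 3 output nodes A(s, a) (Action Space) \
    Output - Q(s, a) = V(s) + (A(s, a) - mean over actions of A(s, a)) \
    Optimizer - Adam
    '''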

    import torch
    import torch.nn as nn
    import torch.nn.functional as F


    '''
    Bunch of Hyper parameters (Which you might have to tune later)
    '''
    BUFFER_SIZE = int(1e5)  # replay buffer capacity (deque maxlen); the oldest transitions are dropped once it is full
    BATCH_SIZE = 64         # transitions sampled per learning step; learning starts once the buffer holds this many
    GAMMA = 0.99            # discount factor handed to the agent's learn step
    LR = 5e-4               # learning rate of the Adam optimizer
    UPDATE_EVERY = 20       # period, in agent steps, of the counter gating the target-network update (presumably a hard weight copy - confirm)


    class QNetwork1(nn.Module):
        """Dueling Q-network: one shared hidden layer feeding separate value and advantage streams.

        The two streams are recombined as Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
        """

        def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
            """Initialize parameters and build model.
            Params
            ======
                state_size (int): Dimension of each state
                action_size (int): Number of discrete actions (width of the advantage output)
                seed (int): Random seed passed to torch.manual_seed before the layers are created
                fc1_units (int): Number of nodes in the shared first hidden layer
                fc2_units (int): Number of nodes in the hidden layer of each stream
            """
            super().__init__()
            # Seeding right before layer creation means two networks built with the same
            # seed start from identical weights.
            self.seed = torch.manual_seed(seed)
            self.fc1 = nn.Linear(state_size, fc1_units)
            self.fc_value = nn.Linear(fc1_units, fc2_units)
            self.fc_adv = nn.Linear(fc1_units, fc2_units)
            self.out_value = nn.Linear(fc2_units, 1)
            self.out_adv = nn.Linear(fc2_units, action_size)

        def forward(self, state):
            """Map a state, or a batch of states, to one Q-value per action.

            Params
            ======
                state (Tensor): shape (..., state_size); batched or a single unbatched state
            Returns
            =======
                Tensor of shape (..., action_size)
            """
            x = F.relu(self.fc1(state))
            x1 = F.relu(self.fc_value(x))
            x2 = F.relu(self.fc_adv(x))

            value = self.out_value(x1)
            adv = self.out_adv(x2)

            # Reduce over the last (action) axis instead of a hard-coded dim=1 so that
            # unbatched 1-D states work as well as (batch, state_size) inputs.
            Q = value + (adv - adv.mean(dim=-1, keepdim=True))

            return Q

    import random
    import torch
    import numpy as np
    from collections import deque, namedtuple

    # Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    class ReplayBuffer:
        """Bounded FIFO store of transitions from which random minibatches are drawn."""

        def __init__(self, action_size, buffer_size, batch_size, seed):
            """Set up an empty buffer.

            Params
            ======
                action_size (int): dimension of each action (stored; not otherwise used by the buffer)
                buffer_size (int): maximum number of transitions kept; older ones are evicted first
                batch_size (int): number of transitions returned by `sample`
                seed (int): value passed to `random.seed`
            """
            fields = ["state", "action", "reward", "next_state", "done"]
            self.experience = namedtuple("Experience", field_names=fields)
            self.memory = deque(maxlen=buffer_size)
            self.action_size = action_size
            self.batch_size = batch_size
            # random.seed returns None, so this attribute holds None; the seeding call is what matters.
            self.seed = random.seed(seed)

        def add(self, state, action, reward, next_state, done):
            """Append one transition to the buffer."""
            self.memory.append(self.experience(state, action, reward, next_state, done))

        def sample(self):
            """Draw `batch_size` transitions at random and return them as column tensors on `device`.

            Returns
            =======
                (states, actions, rewards, next_states, dones): float tensors, except `actions` which is long
            """
            batch = [exp for exp in random.sample(self.memory, k=self.batch_size) if exp is not None]

            def stack(field):
                # One row per sampled transition.
                return np.vstack([getattr(exp, field) for exp in batch])

            states = torch.from_numpy(stack("state")).float().to(device)
            actions = torch.from_numpy(stack("action")).long().to(device)
            rewards = torch.from_numpy(stack("reward")).float().to(device)
            next_states = torch.from_numpy(stack("next_state")).float().to(device)
            # Done flags are cast to uint8 and then to float.
            dones = torch.from_numpy(stack("done").astype(np.uint8)).float().to(device)

            return (states, actions, rewards, next_states, dones)

        def __len__(self):
            """Number of transitions currently stored."""
            return len(self.memory)

    class TutorialAgent_epsilon():
        """DQN agent with epsilon-greedy exploration, experience replay and a target network."""

        def __init__(self, state_size, action_size, seed):
            """Build the local/target Q-networks, the Adam optimizer and the replay memory.

            Params
            ======
                state_size (int): dimension of each state
                action_size (int): number of discrete actions
                seed (int): random seed; applied to Python's `random` and forwarded to the
                    networks and the replay buffer
            """
            # Agent-environment interaction
            self.state_size = state_size
            self.action_size = action_size
            # random.seed() returns None, so seed the generator first and keep the seed itself.
            random.seed(seed)
            self.seed = seed

            # Q-networks: the local one is trained, the target one supplies bootstrap targets.
            self.qnetwork_local = QNetwork1(state_size, action_size, seed).to(device)
            self.qnetwork_target = QNetwork1(state_size, action_size, seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

            # Replay memory
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

            # Step counter used to sync the target network every UPDATE_EVERY steps.
            self.t_step = 0

        def step(self, state, action, reward, next_state, done):
            """Store one transition, learn from a sampled batch, and periodically sync the target network."""
            # Save experience in replay memory.
            self.memory.add(state, action, reward, next_state, done)

            # Learn on every step once at least one full batch is available.
            if len(self.memory) >= BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

            # Hard-copy the local weights into the target network every UPDATE_EVERY steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

        def act(self, state, eps=0.):
            """Return an action for `state` using an epsilon-greedy policy.

            Params
            ======
                state (numpy array): current observation
                eps (float): probability threshold for taking a uniformly random action
            """
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()

            # Epsilon-greedy action selection.
            if random.random() > eps:
                # Computed under no_grad, so the tensor converts to numpy directly.
                return np.argmax(action_values.cpu().numpy())
            else:
                return random.choice(np.arange(self.action_size))

        def learn(self, experiences, gamma):
            """Run one gradient step on the local network from a batch of experiences.

            Params
            ======
                experiences (tuple): (states, actions, rewards, next_states, dones) tensors
                gamma (float): discount factor
            """
            states, actions, rewards, next_states, dones = experiences

            # Max predicted Q value of each next state, taken from the target network.
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

            # Bootstrap target; the (1 - dones) factor removes the bootstrap term where done == 1.
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

            # Q values the local network currently assigns to the actions actually taken.
            Q_expected = self.qnetwork_local(states).gather(1, actions)

            loss = F.mse_loss(Q_expected, Q_targets)

            self.optimizer.zero_grad()
            loss.backward()

            # Clamp every gradient element to [-1, 1] before the optimizer step.
            torch.nn.utils.clip_grad_value_(self.qnetwork_local.parameters(), 1)

            self.optimizer.step()

    ''' Defining DQN Algorithm '''

    # Sizes handed to the agent constructor below: the first dimension of the
    # observation space and the number of discrete actions.
    state_shape = env.observation_space.shape[0]
    action_shape = env.action_space.n


    def dqn_epsilon(agent, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        """Train `agent` on the global `env` with a decaying epsilon-greedy policy.

        Params
        ======
            agent: object exposing act(state, eps) and step(state, action, reward, next_state, done)
            n_episodes (int): number of training episodes
            max_t (int): maximum number of steps per episode
            eps_start (float): initial epsilon
            eps_end (float): lower bound on epsilon
            eps_decay (float): factor applied to epsilon after every episode

        Returns
        =======
            list with the total reward of each episode, in order.
        """
        progress_line = '\rEpisode {}\tAverage Score: {:.2f}'
        # Rolling window of the last 100 episode totals; it only feeds the printed average.
        recent_returns = deque(maxlen=100)
        rewards_list = []
        epsilon = eps_start

        for episode in range(1, n_episodes + 1):
            observation = env.reset()
            episode_return = 0

            for _ in range(max_t):
                chosen_action = agent.act(observation, epsilon)
                next_observation, reward, episode_over, _info = env.step(chosen_action)
                agent.step(observation, chosen_action, reward, next_observation, episode_over)
                observation = next_observation
                episode_return += reward
                if episode_over:
                    break

            recent_returns.append(episode_return)
            rewards_list.append(episode_return)

            # Decay epsilon once per episode, never going below eps_end.
            epsilon = max(eps_end, eps_decay * epsilon)

            # Overwrite the same console line each episode; keep one line per 100 episodes.
            progress = progress_line.format(episode, np.mean(recent_returns))
            print(progress, end="")
            if episode % 100 == 0:
                print(progress)

        return rewards_list

    ''' Trial run to check if algorithm runs and saves the data '''

    # Time one full training run for this experiment.
    begin_time = datetime.datetime.now()

    # A fresh agent per experiment; the loop index doubles as its random seed.
    agent_epsilon = TutorialAgent_epsilon(state_size=state_shape,action_size = action_shape,seed = this_iterator)
    rewards_epsilon = dqn_epsilon(agent_epsilon)

    time_taken = datetime.datetime.now() - begin_time
    # Save the per-episode scores, one .npy file per experiment (numbered from 1).
    # NOTE(review): the path is under /content/drive/MyDrive, presumably a mounted Google Drive;
    # confirm the folder exists before running.
    np.save('/content/drive/MyDrive/Gaurav_Jikooshokai/Acrobot_Type_1_Exp_'+str(this_iterator+1)+'.npy', rewards_epsilon)
    print(time_taken)
    print("============================================================================================")

Experiment  1  Starting
6
3
2
----
[ 0.99962485  0.02738891  0.9989402  -0.04602639 -0.09180529 -0.09669447]
----
2
----
[ 0.99996984 -0.0077642   0.9997182  -0.02373883 -0.25169677  0.31000718]
-1.0
False
{}
----


  deprecation(
  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Episode 100	Average Score: -382.12
Episode 200	Average Score: -172.20
Episode 300	Average Score: -135.85
Episode 400	Average Score: -104.41
Episode 500	Average Score: -92.81
Episode 600	Average Score: -92.52
Episode 700	Average Score: -87.62
Episode 800	Average Score: -82.70
Episode 900	Average Score: -79.84
Episode 1000	Average Score: -79.67
Episode 1100	Average Score: -79.49
Episode 1200	Average Score: -78.57
Episode 1300	Average Score: -77.84
Episode 1400	Average Score: -79.39
Episode 1500	Average Score: -80.47
Episode 1600	Average Score: -76.37
Episode 1700	Average Score: -80.36
Episode 1800	Average Score: -78.87
Episode 1900	Average Score: -78.76
Episode 2000	Average Score: -79.01
0:16:11.360481
Experiment  2  Starting
6
3
2
----
[ 0.9999972   0.00236432  0.9959444   0.08997092 -0.07116808  0.08972989]
----
1
----
[ 0.9999519  -0.00980622  0.9952099   0.0977615  -0.0478776  -0.0145893 ]
-1.0
False
{}
----
Episode 100	Average Score: -374.57
Episode 200	Average Score: -163.34
Episod

In [None]:
'''
'Acrobot-v1 Type 2'
'''

# Five independent experiments. Everything down to the final separator print is the
# loop body, so the classes and functions below are redefined on every iteration.
for this_iterator in range(5):
    print("Experiment ",this_iterator+1," Starting")

    # The loop index is used as the environment seed (and later as the agent seed).
    env = gym.make('Acrobot-v1')
    env.seed(this_iterator)

    state_shape = env.observation_space.shape[0]
    no_of_actions = env.action_space.n

    # Quick look at the environment: state size, action count, one sampled action.
    print(state_shape)
    print(no_of_actions)
    print(env.action_space.sample())
    print("----")

    '''
    The Environment keeps a variable specifically for the current state.
    - Everytime an action is passed to the environment, it calculates the new state and updates the current state variable.
    - It returns the new current state and reward for the agent to take the next action
    '''

    # This code relies on reset() returning just the observation.
    state = env.reset()
    ''' This returns the initial state (when environment is reset) '''

    print(state)
    print("----")

    action = env.action_space.sample()
    ''' We take a random action now '''

    print(action)
    print("----")

    # This code relies on step() returning four values.
    next_state, reward, done, info = env.step(action)
    ''' env.step is used to calculate new state and obtain reward based on old state and action taken  '''

    print(next_state)
    print(reward)
    print(done)
    print(info)
    print("----")

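    '''
    ### Q Network & Some 'hyperparameters'

    QNetwork1 (dueling architecture):
    Input Layer - state_shape nodes (6 for Acrobot-v1) \
    Shared Hidden Layer - 128 nodes \
    Value Stream - 64 hidden nodes, then 1 output V(s) \
    Advantage Stream - 64 hidden nodes, then no_of_actions outputs A(s, a) (3 for Acrobot-v1) \
    Output - Q(s, a) = V(s) + (A(s, a) - max over a' of A(s, a')) \
    Optimizer - Adam (gradients are reset with zero_grad() before each update)
    '''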

    import torch
    import torch.nn as nn
    import torch.nn.functional as F


    '''
    Bunch of Hyper parameters (Which you might have to tune later)
    '''
    # These names are read as globals by the classes defined further down in this loop body.
    BUFFER_SIZE = int(1e5)  # replay buffer size (maximum number of stored transitions)
    BATCH_SIZE = 64         # minibatch size; learning starts once this many transitions are stored
    GAMMA = 0.99            # discount factor
    LR = 5e-4               # learning rate passed to the Adam optimizer
    UPDATE_EVERY = 20       # number of agent steps between hard copies of the local network into the target network


    class QNetwork1(nn.Module):
        """Dueling Q-network: one shared layer followed by separate value and advantage streams."""

        def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
            """Create the layers of the network.

            Params
            ======
                state_size (int): Dimension of each state
                action_size (int): Dimension of each action
                seed (int): Random seed
                fc1_units (int): Number of nodes in the shared first hidden layer
                fc2_units (int): Number of nodes in the hidden layer of each stream
            """
            super().__init__()
            self.seed = torch.manual_seed(seed)

            # Shared trunk.
            self.fc1 = nn.Linear(state_size, fc1_units)

            # Hidden layers of the two streams.
            self.fc_value = nn.Linear(fc1_units, fc2_units)
            self.fc_adv = nn.Linear(fc1_units, fc2_units)

            # Stream heads: one state value, and one advantage per action.
            self.out_value = nn.Linear(fc2_units, 1)
            self.out_adv = nn.Linear(fc2_units, action_size)

        def forward(self, state):
            """Map a batch of states to Q-values, Q = V + (A - max(A)) with the max over dim 1."""
            shared = F.relu(self.fc1(state))

            value = self.out_value(F.relu(self.fc_value(shared)))
            adv = self.out_adv(F.relu(self.fc_adv(shared)))

            # Shift the advantages so the best action in each row has advantage 0.
            best_adv, _ = torch.max(adv, dim=1, keepdim=True)
            return value + (adv - best_adv)

    # NOTE(review): random, torch, numpy, deque and namedtuple are already imported in the
    # notebook's import cell; re-importing them here is harmless.
    import random
    import torch
    import numpy as np
    from collections import deque, namedtuple

    # Use the first CUDA GPU when PyTorch can see one, otherwise the CPU.
    # ReplayBuffer.sample and the agent defined below read this name as a global.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    class ReplayBuffer:
        """Fixed-size buffer to store experience tuples and sample them uniformly at random."""

        def __init__(self, action_size, buffer_size, batch_size, seed):
            """Initialize a ReplayBuffer object.

            Params
            ======
                action_size (int): dimension of each action (stored; not used by the buffer itself)
                buffer_size (int): maximum size of buffer; once full, the oldest entries are dropped
                batch_size (int): size of each training batch
                seed (int): random seed, applied to Python's global `random` module
            """
            self.action_size = action_size
            self.memory = deque(maxlen=buffer_size)
            self.batch_size = batch_size
            self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
            # random.seed() returns None, so seed the generator first and keep the seed itself.
            random.seed(seed)
            self.seed = seed

        def add(self, state, action, reward, next_state, done):
            """Add a new experience to memory."""
            self.memory.append(self.experience(state, action, reward, next_state, done))

        def sample(self):
            """Randomly sample a batch of experiences from memory.

            Returns
            =======
                tuple (states, actions, rewards, next_states, dones) of tensors on `device`.
                Each field is stacked row-wise with np.vstack; actions are cast to long and
                every other field to float (dones pass through uint8 first).

            Raises
            ======
                ValueError (from random.sample) if fewer than batch_size experiences are stored.
            """
            experiences = random.sample(self.memory, k=self.batch_size)

            def stacked(field):
                # One row per sampled experience for the given namedtuple field.
                # add() only ever stores Experience tuples, so no None filtering is needed.
                return np.vstack([getattr(e, field) for e in experiences])

            states = torch.from_numpy(stacked("state")).float().to(device)
            actions = torch.from_numpy(stacked("action")).long().to(device)
            rewards = torch.from_numpy(stacked("reward")).float().to(device)
            next_states = torch.from_numpy(stacked("next_state")).float().to(device)
            dones = torch.from_numpy(stacked("done").astype(np.uint8)).float().to(device)

            return (states, actions, rewards, next_states, dones)

        def __len__(self):
            """Return the current size of internal memory."""
            return len(self.memory)

    class TutorialAgent_epsilon():
        """DQN agent with epsilon-greedy exploration, experience replay and a target network."""

        def __init__(self, state_size, action_size, seed):
            """Build the local/target Q-networks, the Adam optimizer and the replay memory.

            Params
            ======
                state_size (int): dimension of each state
                action_size (int): number of discrete actions
                seed (int): random seed; applied to Python's `random` and forwarded to the
                    networks and the replay buffer
            """
            # Agent-environment interaction
            self.state_size = state_size
            self.action_size = action_size
            # random.seed() returns None, so seed the generator first and keep the seed itself.
            random.seed(seed)
            self.seed = seed

            # Q-networks: the local one is trained, the target one supplies bootstrap targets.
            self.qnetwork_local = QNetwork1(state_size, action_size, seed).to(device)
            self.qnetwork_target = QNetwork1(state_size, action_size, seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

            # Replay memory
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

            # Step counter used to sync the target network every UPDATE_EVERY steps.
            self.t_step = 0

        def step(self, state, action, reward, next_state, done):
            """Store one transition, learn from a sampled batch, and periodically sync the target network."""
            # Save experience in replay memory.
            self.memory.add(state, action, reward, next_state, done)

            # Learn on every step once at least one full batch is available.
            if len(self.memory) >= BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

            # Hard-copy the local weights into the target network every UPDATE_EVERY steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

        def act(self, state, eps=0.):
            """Return an action for `state` using an epsilon-greedy policy.

            Params
            ======
                state (numpy array): current observation
                eps (float): probability threshold for taking a uniformly random action
            """
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()

            # Epsilon-greedy action selection.
            if random.random() > eps:
                # Computed under no_grad, so the tensor converts to numpy directly.
                return np.argmax(action_values.cpu().numpy())
            else:
                return random.choice(np.arange(self.action_size))

        def learn(self, experiences, gamma):
            """Run one gradient step on the local network from a batch of experiences.

            Params
            ======
                experiences (tuple): (states, actions, rewards, next_states, dones) tensors
                gamma (float): discount factor
            """
            states, actions, rewards, next_states, dones = experiences

            # Max predicted Q value of each next state, taken from the target network.
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

            # Bootstrap target; the (1 - dones) factor removes the bootstrap term where done == 1.
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

            # Q values the local network currently assigns to the actions actually taken.
            Q_expected = self.qnetwork_local(states).gather(1, actions)

            loss = F.mse_loss(Q_expected, Q_targets)

            self.optimizer.zero_grad()
            loss.backward()

            # Clamp every gradient element to [-1, 1] before the optimizer step.
            torch.nn.utils.clip_grad_value_(self.qnetwork_local.parameters(), 1)

            self.optimizer.step()

    ''' Defining DQN Algorithm '''

    # Sizes handed to the agent constructor below. state_shape repeats the assignment made
    # at the top of this loop body; action_shape holds the same value as no_of_actions.
    state_shape = env.observation_space.shape[0]
    action_shape = env.action_space.n


    def dqn_epsilon(agent, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        """Train `agent` on the global `env` with a decaying epsilon-greedy policy.

        Params
        ======
            agent: object exposing act(state, eps) and step(state, action, reward, next_state, done)
            n_episodes (int): number of training episodes
            max_t (int): maximum number of steps per episode
            eps_start (float): initial epsilon
            eps_end (float): lower bound on epsilon
            eps_decay (float): factor applied to epsilon after every episode

        Returns
        =======
            list with the total reward of each episode, in order.
        """
        progress_line = '\rEpisode {}\tAverage Score: {:.2f}'
        # Rolling window of the last 100 episode totals; it only feeds the printed average.
        recent_returns = deque(maxlen=100)
        rewards_list = []
        epsilon = eps_start

        for episode in range(1, n_episodes + 1):
            observation = env.reset()
            episode_return = 0

            for _ in range(max_t):
                chosen_action = agent.act(observation, epsilon)
                next_observation, reward, episode_over, _info = env.step(chosen_action)
                agent.step(observation, chosen_action, reward, next_observation, episode_over)
                observation = next_observation
                episode_return += reward
                if episode_over:
                    break

            recent_returns.append(episode_return)
            rewards_list.append(episode_return)

            # Decay epsilon once per episode, never going below eps_end.
            epsilon = max(eps_end, eps_decay * epsilon)

            # Overwrite the same console line each episode; keep one line per 100 episodes.
            progress = progress_line.format(episode, np.mean(recent_returns))
            print(progress, end="")
            if episode % 100 == 0:
                print(progress)

        return rewards_list

    ''' Trial run to check if algorithm runs and saves the data '''

    # Time one full training run for this experiment.
    begin_time = datetime.datetime.now()

    # A fresh agent per experiment; the loop index doubles as its random seed.
    agent_epsilon = TutorialAgent_epsilon(state_size=state_shape,action_size = action_shape,seed = this_iterator)
    rewards_epsilon = dqn_epsilon(agent_epsilon)

    time_taken = datetime.datetime.now() - begin_time
    # Save the per-episode scores, one .npy file per experiment (numbered from 1).
    # NOTE(review): the path is under /content/drive/MyDrive, presumably a mounted Google Drive;
    # confirm the folder exists before running.
    np.save('/content/drive/MyDrive/Gaurav_Jikooshokai/Acrobot_Type_2_Exp_'+str(this_iterator+1)+'.npy', rewards_epsilon)
    print(time_taken)
    print("============================================================================================")

Experiment  1  Starting
6
3
0
----
[ 0.99962485  0.02738891  0.9989402  -0.04602639 -0.09180529 -0.09669447]
----
0
----
[ 0.9998245   0.01873245  0.995746   -0.09214022  0.00529764 -0.3585254 ]
-1.0
False
{}
----
Episode 100	Average Score: -341.07
Episode 200	Average Score: -150.74
Episode 300	Average Score: -114.72
Episode 400	Average Score: -105.12
Episode 500	Average Score: -90.93
Episode 600	Average Score: -87.90
Episode 700	Average Score: -81.59
Episode 800	Average Score: -83.90
Episode 900	Average Score: -81.31
Episode 1000	Average Score: -81.05
Episode 1100	Average Score: -82.23
Episode 1200	Average Score: -81.97
Episode 1300	Average Score: -80.16
Episode 1400	Average Score: -78.99
Episode 1500	Average Score: -81.80
Episode 1600	Average Score: -77.57
Episode 1700	Average Score: -79.87
Episode 1800	Average Score: -80.80
Episode 1900	Average Score: -75.77
Episode 2000	Average Score: -78.04
0:15:24.559531
Experiment  2  Starting
6
3
1
----
[ 0.9999972   0.00236432  0.9959444   0.0