## Import libraries

In [1]:
!pip install pyvirtualdisplay
import numpy as np
import random
from collections import namedtuple, deque
import gym

Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl.metadata (943 bytes)
Downloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Installing collected packages: pyvirtualdisplay
Successfully installed pyvirtualdisplay-3.0


In [2]:
import torch
from torch.distributions import Categorical
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd

In [3]:
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

In [4]:
import wandb

## Duelling DQN

In [31]:
GAMMA = 0.99            # discount factor; DDQNAgent.step passes it to DDQNAgent.learn for the bootstrapped TD target
class DuellingDQN(nn.Module):
    """Duelling Q-network: a shared two-layer trunk feeding a value head and an advantage head.

    The two heads are recombined as ``Q = V + (A - norm(A))`` where ``norm`` is
    the per-sample mean of the advantages when ``update_type == 1`` and the
    per-sample maximum for any other ``update_type``.
    """

    def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64, update_type=1):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
            update_type (int): 1 -> subtract the mean advantage,
                anything else -> subtract the max advantage
        """
        # Initialise nn.Module first: attaching attributes before the base
        # class has set up its internal registries is fragile.
        super(DuellingDQN, self).__init__()
        self.update_type = update_type

        # Seed before the layers are created so weight initialisation is reproducible.
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.V = nn.Linear(fc2_units, 1)
        self.A = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        """Build a network that maps state -> action values.

        Accepts a batch ``(..., state_size)`` as well as a single unbatched
        state ``(state_size,)``; the action axis is always the last one.
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        V = self.V(x)
        A = self.A(x)

        # Reduce over the last (action) axis so unbatched inputs work too.
        if self.update_type == 1:
            normalization_term = A.mean(dim=-1, keepdim=True)
        else:
            normalization_term, _ = A.max(dim=-1, keepdim=True)

        Q = V + (A - normalization_term)
        return Q

In [32]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
# The Q-networks and the sampled replay batches below are moved to this device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Create an empty buffer and seed Python's ``random`` module.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum number of stored transitions (oldest are evicted)
            batch_size (int): number of transitions returned by ``sample``
            seed (int): value passed to ``random.seed``
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed returns None, so this attribute is always None.
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` transitions without replacement and return them as tensors on ``device``."""
        drawn = random.sample(self.memory, k=self.batch_size)
        kept = [exp for exp in drawn if exp is not None]

        def stacked(field):
            # One row per sampled transition for the requested field.
            return np.vstack([getattr(exp, field) for exp in kept])

        states = torch.from_numpy(stacked("state")).float().to(device)
        actions = torch.from_numpy(stacked("action")).long().to(device)
        rewards = torch.from_numpy(stacked("reward")).float().to(device)
        next_states = torch.from_numpy(stacked("next_state")).float().to(device)
        # Booleans go through uint8 before being turned into 0./1. floats.
        done_flags = stacked("done").astype(np.uint8)
        dones = torch.from_numpy(done_flags).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

cuda:0


In [33]:
class DDQNAgent():
    """Duelling-DQN agent with experience replay, a periodically synced target network and gradient clipping.

    A local network is trained on every environment step (once the replay
    buffer holds at least one batch) and copied into the target network every
    ``update_every`` steps.
    """

    def __init__(self, state_size, action_size, seed, update_type, action_policy, buffer_size, batch_size, lr, update_every):
        """Build the networks, optimizer and replay buffer.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): number of discrete actions
            seed (int): random seed forwarded to the networks and the buffer
            update_type (int): advantage normalisation used by DuellingDQN
            action_policy (str): 'egreedy' or 'softmax'
            buffer_size (int): replay buffer capacity
            batch_size (int): minibatch size
            lr (float): Adam learning rate
            update_every (int): number of steps between target-network syncs
        """
        # Agent / environment interaction settings
        self.state_size = state_size
        self.action_size = action_size
        # random.seed returns None, so this attribute is always None.
        self.seed = random.seed(seed)
        self.update_type = update_type
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.lr = lr
        self.update_every = update_every
        self.action_policy = action_policy

        # Q-networks (same seed, so both start from identical weights)
        self.qnetwork_local = DuellingDQN(state_size, action_size, seed, update_type = update_type).to(device)
        self.qnetwork_target = DuellingDQN(state_size, action_size, seed, update_type = update_type).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

        # Step counter used to sync the target network every `update_every` steps
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition, run a learning update if possible, and sync the target network when due."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # If enough samples are available in memory, get a random subset and learn
        if len(self.memory) >= self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

        # Hard-copy the local weights into the target network every `update_every` steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def act(self, state, param=0.):
        """Choose an action for ``state`` (a numpy array).

        ``param`` is epsilon for the 'egreedy' policy and the temperature for
        the 'softmax' policy. A non-positive temperature is treated as the
        greedy limit instead of dividing by zero.

        Raises
        ======
            ValueError: if ``self.action_policy`` is neither 'egreedy' nor 'softmax'
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        if self.action_policy == 'egreedy':
            # Epsilon-greedy action selection
            if random.random() > param:
                action = np.argmax(action_values.cpu().data.numpy())
            else:
                action = random.choice(np.arange(self.action_size))

        elif self.action_policy == 'softmax':
            # Softmax (Boltzmann) action selection. Clamp the temperature so the
            # default param=0. does not produce NaN probabilities.
            temperature = max(param, 1e-8)
            action_probs = F.softmax(action_values / temperature, dim = 1).cpu().data.numpy().squeeze()
            action = np.random.choice(np.arange(self.action_size), p = action_probs)

        else:
            raise ValueError(
                "Unknown action_policy {!r}; expected 'egreedy' or 'softmax'".format(self.action_policy)
            )

        return action

    def learn(self, experiences, gamma):
        """Run one gradient step on a sampled batch of (s, a, r, s', done) tensors."""
        states, actions, rewards, next_states, dones = experiences

        # Max predicted Q values for the next states, taken from the target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # TD targets; (1 - dones) removes the bootstrap term on terminal transitions
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Q values of the actions actually taken, from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()

        # Gradient clipping: clamp every gradient element to [-1, 1]
        for param in self.qnetwork_local.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)

        self.optimizer.step()

#### Defining DuellingDQN Algorithm

In [34]:

def ddqn(env, agent, n_episodes, max_t, p_start, p_end, p_decay, action_policy):
    """Train ``agent`` on ``env`` and return the list of per-episode scores.

    Params
    ======
        env: environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step()`` returns ``(obs, reward, terminated, truncated, info)``
        agent: object exposing ``act(state, param)`` and
            ``step(state, action, reward, next_state, done)``
        n_episodes (int): number of training episodes
        max_t (int): maximum number of steps per episode
        p_start (float): initial exploration parameter (epsilon or temperature)
        p_end (float): lower bound for the exploration parameter
        p_decay (float): multiplicative decay applied after every episode
        action_policy (str): accepted for interface compatibility; not used here
            (the agent carries its own policy setting)

    A reward curve is plotted before returning.
    """
    # Last 100 scores, used for the running average that is printed
    scores_window = deque(maxlen=100)
    scores = [] # store scores to plot the reward curve

    param = p_start

    for i_episode in range(1, n_episodes+1):
        state = env.reset()[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, param)
            next_state, reward, terminated, truncated, _ = env.step(action)
            # Only a true termination should cut the bootstrap in the TD target,
            # so `terminated` (not `truncated`) is stored as the done flag.
            agent.step(state, action, reward, next_state, terminated)
            state = next_state
            score += reward
            # Stop on time-limit truncation too: stepping an environment after
            # its episode has ended is not valid.
            if terminated or truncated:
                break

        scores_window.append(score)
        scores.append(score) # append scores

        param = max(p_end, p_decay*param)

        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))

    # Plot the reward curve
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.title('Reward Curve')
    plt.show()

    return scores

## Hyperparameter Tuning

In [8]:
# Authenticate with Weights & Biases so the sweeps below can be recorded.
# In the captured output this prompts interactively for an API key.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Bayesian W&B sweep over the Duelling-DQN hyperparameters; each trial is
# scored by the logged 'returns' metric, which the sweep maximizes.
sweep_config = dict(
    program='AE20B034_EE20B092_PA2.ipynb',
    method='bayes',
    name="complete-sweep",
    metric=dict(name='returns', goal='maximize'),
    parameters=dict(
        action_policy=dict(values=['softmax']),
        buffer_size=dict(values=[int(1e5)]),
        batch_size=dict(values=[128]),
        lr=dict(values=[0.01, 0.001, 0.0001]),
        update_every=dict(values=[20]),
        p_start=dict(values=[1.2, 1, 0.9, 0.5, 0.2]),
        p_end=dict(values=[0.005]),
        p_decay=dict(values=[1, 0.995]),
    ),
)

In [None]:
def wandb_exp(env_name, update_type = 2, n_episodes=5000, max_t=1000, seed = 0):
    """Run one W&B sweep trial of the Duelling-DQN experiment and log its mean return.

    Params
    ======
        env_name (str): gym environment id
        update_type (int): advantage normalisation variant forwarded to ``run_exp``
        n_episodes (int): number of training episodes
        max_t (int): maximum steps per episode
        seed (int): random seed forwarded to ``run_exp``

    Returns the mean of the per-episode scores, which is also logged to W&B
    under the key "returns".
    """
    # initializing wandb; the sampled hyperparameters are then in wandb.config
    run = wandb.init()
    config = wandb.config

    # run_exp takes the algorithm name as its second argument and the
    # hyperparameter mapping as its third; the remaining arguments are passed
    # by keyword so they cannot be shifted into the wrong slot.
    scores = run_exp(env_name, 'ddqn', config, update_type, n_episodes=n_episodes, max_t=max_t, seed=seed)
    returns = np.mean(scores)

    wandb.log({"returns": returns})

    return returns

In [None]:
def run_wandb(env_name, update_type, n_episodes, max_t, seed, title):
    """Register a sweep from the module-level ``sweep_config`` under project ``title`` and run 24 trials of it."""

    def _single_trial():
        # Zero-argument wrapper, as required by wandb.agent's `function` hook.
        wandb_exp(env_name, update_type, n_episodes, max_t, seed)

    sweep_handle = wandb.sweep(sweep_config, project=title)
    wandb.agent(sweep_handle, function=_single_trial, count = 24)

In [None]:
# Launch one sweep per (environment, update type) combination.
# Tuple layout: (env_name, update_type, n_episodes, max_t, seed, project title).
for sweep_args in (
    ('Acrobot-v1', 1, 1000, 1000, 7, 'env1_update1'),
    ('Acrobot-v1', 2, 1000, 1000, 7, 'env1_update2'),
    ('CartPole-v1', 1, 1000, 500, 7, 'env2_update1'),
    ('CartPole-v1', 2, 1000, 500, 7, 'env2_update2'),
):
    run_wandb(*sweep_args)

In [None]:
# Duelling-DQN settings for CartPole, compared across both update types
# (run_exp_plot defaults to algo='ddqn').
hyperparams = dict(
    action_policy='softmax',
    buffer_size=int(1e5),
    batch_size=128,
    lr=0.0001,
    update_every=20,
    p_start=1.2,
    p_end=0.005,
    p_decay=0.995,
)

run_exp_plot('CartPole-v1', hyperparams, n_episodes=500, max_t=200)

# REINFORCE

In [5]:
class Policy(nn.Module):
    """Stochastic policy network: two hidden layers with dropout, softmax over the actions."""

    def __init__(self, state_size, action_size, hidden_size, dropout):
        """Create the layers; ``dropout`` is the drop probability shared by both hidden layers."""
        super().__init__()
        self.action_size = action_size
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.dropout = nn.Dropout(p=dropout)
        self.fc2 = nn.Linear(hidden_size, action_size)
        self.f = nn.Linear(hidden_size, hidden_size)
        use_gpu = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_gpu else "cpu")

    def forward(self, state):
        """Map a state tensor to action probabilities (softmax over the last axis)."""
        # Each hidden layer is linear -> dropout -> ReLU.
        hidden = F.relu(self.dropout(self.fc1(state)))
        hidden = F.relu(self.dropout(self.f(hidden)))
        logits = self.fc2(hidden)
        return F.softmax(logits, dim=-1)

    def act(self, state):
        """Sample an action for a numpy ``state``; returns ``(action, log_prob)``.

        The probabilities are computed under ``torch.no_grad()``, so the
        returned log-probability carries no gradient.
        """
        with torch.no_grad():
            batch = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
            dist = Categorical(self.forward(batch).cpu())
            choice = dist.sample()
        return choice.item(), dist.log_prob(choice)


In [6]:
def calculate_returns(rewards, discount_factor):
    """Return the discounted return G_t for every step of one episode.

    ``rewards`` is the per-step reward sequence in time order; the result is a
    list of the same length where element t is
    ``rewards[t] + discount_factor * G_{t+1}`` (with G after the last step = 0).
    An empty ``rewards`` yields an empty list.
    """
    discounted_returns = []
    G = 0
    # Accumulate backwards in time, appending in O(1) per step, then restore
    # chronological order once (prepending would rebuild the list every step).
    for R in reversed(rewards):
        G = R + discount_factor * G
        discounted_returns.append(G)
    discounted_returns.reverse()
    return discounted_returns


In [7]:
def calc_log_prob(model, state, action):
    """Log-probability of ``action`` under the categorical distribution ``model.forward(state)``."""
    action_probs = model.forward(state).cpu()
    chosen = torch.tensor(action)
    return Categorical(action_probs).log_prob(chosen)


In [8]:
class Value(nn.Module):
    """State-value network: two hidden layers with dropout and a single scalar output."""

    def __init__(self, state_size, hidden_size, dropout):
        """Create the layers; ``dropout`` is the drop probability shared by both hidden layers."""
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.dropout = nn.Dropout(p=dropout)
        self.f = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, state):
        """Map a state tensor to its value estimate (last axis has size 1)."""
        # Each hidden layer is linear -> dropout -> ReLU.
        hidden = F.relu(self.dropout(self.fc1(state)))
        hidden = F.relu(self.dropout(self.f(hidden)))
        return self.fc2(hidden)


In [9]:
def train(policy_model, value_model, policy_optimizer, value_optimizer, rewards, states, actions, device, is_baseline, discount_factor=0.99):
    """Apply one REINFORCE update from a single finished episode.

    Params
    ======
        policy_model: network whose ``forward`` returns action probabilities
        value_model: state-value network; only used when ``is_baseline`` is true
        policy_optimizer / value_optimizer: optimizers for the two networks
        rewards (list[float]): per-step rewards of the episode
        states (list[np.ndarray]): per-step observations
        actions (list[int]): per-step actions
        device (torch.device): device on which the computation runs
        is_baseline (bool): subtract the value estimate from the returns and
            also fit the value network to the returns
        discount_factor (float): discount used for the returns (default 0.99)
    """
    # Monte-Carlo returns as a float32 column vector. The dtype is explicit so
    # that numpy float64 rewards cannot produce a float64 target that clashes
    # with the float32 network outputs in the MSE loss.
    discounted_returns = torch.tensor(
        calculate_returns(rewards, discount_factor), dtype=torch.float32, device=device
    ).view(-1, 1)

    # Log-probability of every action that was taken, one forward pass per step.
    log_probs = torch.cat([
        calc_log_prob(
            policy_model,
            torch.from_numpy(state).float().unsqueeze(0).to(device),
            action,
        ).to(device)
        for state, action in zip(states, actions)
    ])

    if is_baseline:
        # Stack into a single array first; building a tensor from a list of
        # ndarrays is slow.
        t_states = torch.as_tensor(np.asarray(states), dtype=torch.float32).to(device)
        values = value_model.forward(t_states)
        # Advantage estimate; detached so the policy loss does not train the critic.
        weights = (discounted_returns - values).detach()
    else:
        weights = discounted_returns

    # Policy-gradient loss: mean over steps of -log pi(a_t | s_t) * weight_t.
    policy_loss = (-log_probs * weights.reshape(-1)).mean()
    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()

    if is_baseline:
        # Fit the value network to the observed returns.
        value_loss = F.mse_loss(values, discounted_returns)
        value_optimizer.zero_grad()
        value_loss.backward()
        value_optimizer.step()

    return


In [10]:
def run(env, n_episodes, update_type, max_t, policy_lr, value_lr, hidden_size, dropout):
    """Train REINFORCE on ``env`` and return the total reward of every episode.

    Params
    ======
        env: environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step()`` returns ``(obs, reward, terminated, truncated, info)``
        n_episodes (int): number of training episodes
        update_type (int): 1 -> plain REINFORCE, any other value -> REINFORCE with a value baseline
        max_t (int): maximum number of steps per episode
        policy_lr (float): Adam learning rate of the policy network
        value_lr (float): Adam learning rate of the value network
        hidden_size (int): width of the hidden layers of both networks
        dropout (float): dropout probability of both networks
    """
    # Total reward of each episode
    rewards = []
    # Determine the device for computation
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Initialize policy and value networks
    policy_model = Policy(env.observation_space.shape[0], env.action_space.n, hidden_size, dropout).to(device)
    value_model = Value(env.observation_space.shape[0], hidden_size, dropout).to(device)
    # Initialize optimizers
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=policy_lr)
    value_optimizer = optim.Adam(value_model.parameters(), lr=value_lr)
    # update_type 1 -> no baseline; anything else is truthy after subtracting 1
    is_baseline = bool(update_type - 1)

    # Run episodes
    for episode in tqdm(range(n_episodes)):
        state = env.reset()[0]
        rewards_episode = []  # Rewards received at each step
        action_episode = []  # Actions taken at each step
        state_episode = []  # States observed at each step
        for t in range(max_t):
            # The log-probability returned by act() carries no gradient; train()
            # recomputes it, so it is discarded here.
            action, _ = policy_model.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            rewards_episode.append(reward)
            state_episode.append(state)
            action_episode.append(action)
            state = next_state
            # End the episode on time-limit truncation as well as termination.
            if terminated or truncated:
                break
        rewards.append(sum(rewards_episode))  # Store total reward for the episode
        # Train the policy and value networks using collected data
        train(policy_model, value_model, policy_optimizer, value_optimizer, rewards_episode, state_episode, action_episode, device, is_baseline)

    return rewards


In [13]:
def run_exp(env_name, algo, hyperparams, update_type=2, n_episodes=500, max_t=200, seed=0):
    """Train one agent on ``env_name`` and return its per-episode scores.

    Params
    ======
        env_name (str): gym environment id
        algo (str): 'ddqn' or 'reinforce'
        hyperparams (mapping): for 'ddqn' the keys action_policy, buffer_size,
            batch_size, lr, update_every, p_start, p_end, p_decay; for
            'reinforce' the keys policy_lr, value_lr, hidden_size, dropout
        update_type (int): algorithm variant (advantage normalisation for
            'ddqn', baseline on/off for 'reinforce')
        n_episodes (int): number of training episodes
        max_t (int): maximum steps per episode
        seed (int): seed applied to Python, numpy, torch and the environment

    Raises
    ======
        ValueError: if ``algo`` is not one of the supported names
    """
    # Fail early and loudly instead of returning a non-list placeholder that
    # callers would go on to slice or average.
    if algo not in ('ddqn', 'reinforce'):
        raise ValueError("Unknown algo {!r}; expected 'ddqn' or 'reinforce'".format(algo))

    # Seed every source of randomness so that runs with different seeds differ
    # only by the seed, for both algorithms.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Create the environment
    env = gym.make(env_name)
    try:
        # Seeds the environment's RNG; the training loops reset again without a
        # seed and continue from this RNG state.
        env.reset(seed=seed)
        env.action_space.seed(seed)

        state_shape = env.observation_space.shape[0]
        action_shape = env.action_space.n

        if algo == 'ddqn':
            # Extract hyperparameters for DDQN
            action_policy = hyperparams['action_policy']
            buffer_size = hyperparams['buffer_size']
            batch_size = hyperparams['batch_size']
            lr = hyperparams['lr']
            update_every = hyperparams['update_every']
            p_start = hyperparams['p_start']
            p_end = hyperparams['p_end']
            p_decay = hyperparams['p_decay']

            # Initialize DDQN agent
            agent = DDQNAgent(state_size=state_shape, action_size=action_shape, seed=seed, update_type=update_type, action_policy=action_policy, buffer_size=buffer_size, batch_size=batch_size, lr=lr, update_every=update_every)

            # Run DDQN training
            scores = ddqn(env, agent, n_episodes=n_episodes, max_t=max_t, p_start=p_start, p_end=p_end, p_decay=p_decay, action_policy=action_policy)

        else:
            # Extract hyperparameters for REINFORCE
            policy_lr = hyperparams['policy_lr']
            value_lr = hyperparams['value_lr']
            hidden_size = hyperparams['hidden_size']
            dropout = hyperparams['dropout']

            # Run REINFORCE training
            scores = run(env, n_episodes, update_type, max_t, policy_lr, value_lr, hidden_size, dropout)
    finally:
        # Release the environment's resources even if training fails.
        env.close()

    return scores


In [14]:
def run_exp_plot(env_name, hyperparams, n_episodes=10000, max_t=200, algo='ddqn'):
    """Train both update types over five seeds each and plot mean +/- std of the returns.

    Params
    ======
        env_name (str): gym environment id
        hyperparams (mapping): hyperparameters forwarded to ``run_exp``
        n_episodes (int): number of training episodes per run
        max_t (int): maximum steps per episode
        algo (str): 'ddqn' or 'reinforce'; also selects the legend labels

    The figure is saved as ``{env_name}_{algo}_scores.png`` and then shown.
    """
    stride = 10  # keep every 10th episode to thin out the curves

    def _collect(update_type):
        # One row per seed, one column per retained episode.
        return np.array([
            run_exp(env_name=env_name, algo=algo, hyperparams=hyperparams, update_type=update_type, n_episodes=n_episodes, max_t=max_t, seed=seed)[::stride]
            for seed in range(5)
        ])

    scores1 = _collect(1)
    scores2 = _collect(2)

    if algo == 'ddqn':
        labels = ('Update Type 1', 'Update Type 2')
    else:
        labels = ('w/o baseline', 'w/ baseline')

    # Plot the mean scores with a +/- one standard deviation band
    plt.subplots(figsize=(10, 6))
    for scores, label in zip((scores1, scores2), labels):
        mean_scores = np.mean(scores, axis=0)
        std_scores = np.std(scores, axis=0)
        # Map each retained sample back to its real episode index so the
        # x-axis matches the 'Episodes' label.
        episodes = np.arange(len(mean_scores)) * stride
        plt.plot(episodes, mean_scores, label=label)
        plt.fill_between(episodes, mean_scores - std_scores, mean_scores + std_scores, alpha=0.2)

    # Add axis labels and a legend
    plt.xlabel('Episodes')
    plt.ylabel('Episodic Returns')
    plt.title(f'Returns vs. Episodes for {env_name} Environment - {algo}')
    plt.legend()

    # Save the plot
    plt.savefig(f'{env_name}_{algo}_scores.png', dpi=300, bbox_inches='tight')

    # Show the plot
    plt.show()


### Hyper parameters

In [None]:
# Authenticate with Weights & Biases so the REINFORCE sweeps below can be recorded.
# In the captured output this prompts interactively for an API key.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

In [None]:
# Bayesian W&B sweep over the REINFORCE hyperparameters; each trial is scored
# by the logged 'returns' metric, which the sweep maximizes.
sweep_config = dict(
    program='AE20B034_EE20B092_PA2.ipynb',
    method='bayes',
    name="complete-sweep",
    metric=dict(name='returns', goal='maximize'),
    parameters=dict(
        policy_lr=dict(values=[0.005, 0.001]),
        value_lr=dict(values=[0.005, 0.001]),
        hidden_size=dict(values=[64, 128, 256]),
        dropout=dict(values=[0.0, 0.25, 0.5]),
    ),
)

In [None]:
def wandb_exp(env_name, update_type = 2, n_episodes=5000, max_t=1000, seed = 0, algo = 'ddqn'):
    """Run one W&B sweep trial with the sampled hyperparameters and log its mean return.

    The trial's hyperparameters are read from ``wandb.config`` after
    ``wandb.init()``; the mean of the per-episode scores is logged under
    "returns" and returned.
    """
    wandb.init()
    episode_scores = run_exp(
        env_name,
        algo,
        wandb.config,
        update_type,
        n_episodes = n_episodes,
        max_t = max_t,
        seed = seed,
    )
    mean_return = np.mean(episode_scores)
    wandb.log({"returns": mean_return})
    return mean_return

In [None]:
def run_wandb(env_name, update_type, n_episodes, max_t, seed, title, algo):
    """Register a sweep from the module-level ``sweep_config`` under project ``title`` and run 24 trials of ``algo``."""

    def _single_trial():
        # Zero-argument wrapper, as required by wandb.agent's `function` hook.
        wandb_exp(env_name, update_type, n_episodes, max_t, seed, algo)

    sweep_handle = wandb.sweep(sweep_config, project=title)
    wandb.agent(sweep_handle, function=_single_trial, count = 24)

In [None]:
# Launch one REINFORCE sweep per (environment, update type) combination.
# Tuple layout: (env_name, update_type, n_episodes, max_t, seed, project title, algo).
for sweep_args in (
    ('Acrobot-v1', 1, 500, 200, 7, 'env1_update1', 'reinforce'),
    ('Acrobot-v1', 2, 500, 200, 7, 'env1_update2', 'reinforce'),
    ('CartPole-v1', 1, 500, 200, 7, 'env2_update1', 'reinforce'),
    ('CartPole-v1', 2, 500, 200, 7, 'env2_update2', 'reinforce'),
):
    run_wandb(*sweep_args)

### Run environments


In [194]:
# Duelling-DQN settings for Acrobot, compared across both update types.
hyperparams = dict(
    action_policy='softmax',
    buffer_size=int(1e5),
    batch_size=128,
    lr=0.0001,
    update_every=20,
    p_start=0.2,
    p_end=0.005,
    p_decay=0.995,
)

run_exp_plot('Acrobot-v1', hyperparams, n_episodes=1000, max_t=200, algo='ddqn')

In [None]:
# Duelling-DQN settings for CartPole with the full 500-step horizon
# (run_exp_plot defaults to algo='ddqn').
hyperparams = dict(
    action_policy='softmax',
    buffer_size=int(1e5),
    batch_size=128,
    lr=0.0001,
    update_every=20,
    p_start=1.2,
    p_end=0.005,
    p_decay=0.995,
)

run_exp_plot('CartPole-v1', hyperparams, n_episodes=500, max_t=500)

In [None]:
# REINFORCE settings for Acrobot, compared with and without the value baseline.
hyperparams = dict(
    policy_lr=1e-3,
    value_lr=1e-3,
    hidden_size=256,
    dropout=0.0,
)

run_exp_plot('Acrobot-v1', hyperparams, n_episodes=500, max_t=500, algo='reinforce')

In [None]:
# REINFORCE settings for CartPole, compared with and without the value baseline.
hyperparams = {
    'policy_lr': 1e-3,
    'value_lr': 1e-3,
    'hidden_size': 128,
    'dropout': 0.25
}

# The registered environment id is 'CartPole-v1' (capital P); ids are
# case-sensitive, so 'Cartpole-v1' cannot be created by gym.make.
run_exp_plot('CartPole-v1', hyperparams, n_episodes = 500, max_t = 500, algo = 'reinforce')