In [1]:
from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random
import torch.nn.functional as F
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
GAMMA=0.99
BATCH_SIZE=512
BUFFER_SIZE=50000
MIN_REPLAY_SIZE=10000
EPSILON_START=1.0
EPSILON_END=0.02
EPSILON_DECAY=150000
MAX_REWARD_REACH = -100
TARGET_UPDATE_FREQ = 1000
LR =1e-5
ENV_Name='Acrobot-v1'
TOTAl_episode =250000

In [3]:
torch.manual_seed(2)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
env = gym.make(ENV_Name)
env.seed(0)
env.env.book_or_nips = 'nips'

  deprecation(
  deprecation(
  deprecation(


In [4]:
class DuelingDQN_Mean(nn.Module):
    def __init__(self, env):
        super(DuelingDQN_Mean, self).__init__()

        in_features = int(np.prod(env.observation_space.shape))

        self.fc1 = nn.Linear(in_features, 128)
        self.fc2 = nn.Linear(128, 128)

        self.fc3 = nn.Linear(128, 128)
        '''Value stream'''
        self.value_stream = nn.Linear(128, 1)

        '''Advantage stream'''
        self.advantage_stream = nn.Linear(128, env.action_space.n)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))


        value = self.value_stream(x)
        advantage = self.advantage_stream(x)

        '''Combining value and advantage streams'''
        q_values = value + (advantage - advantage.mean(dim=1, keepdim=True))
        # q_values = value + (advantage - advantage.max(dim=1, keepdim=True)[0])

        return q_values
    
    def act(self, obs):
        obs_t = torch.as_tensor(obs, dtype=torch.float32)
        q_values = self(obs_t.unsqueeze(0))
        max_q_index = torch.argmax(q_values, dim=1)[0]
        action = max_q_index.cpu().detach().item()
        return action

In [5]:
class DuelingDQN_Max(nn.Module):
    def __init__(self, env):
        super(DuelingDQN_Max, self).__init__()

        in_features = int(np.prod(env.observation_space.shape))

        self.fc1 = nn.Linear(in_features, 128)
        self.fc2 = nn.Linear(128, 128)

        self.fc3 = nn.Linear(128, 128)
        '''Value stream'''
        self.value_stream = nn.Linear(128, 1)

        '''Advantage stream'''
        self.advantage_stream = nn.Linear(128, env.action_space.n)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))


        value = self.value_stream(x)
        advantage = self.advantage_stream(x)

        '''Combining value and advantage streams'''
#         q_values = value + (advantage - advantage.mean(dim=1, keepdim=True))
        q_values = value + (advantage - advantage.max(dim=1, keepdim=True)[0])

        return q_values
    
    def act(self, obs):
        obs_t = torch.as_tensor(obs, dtype=torch.float32)
        q_values = self(obs_t.unsqueeze(0))
        max_q_index = torch.argmax(q_values, dim=1)[0]
        action = max_q_index.cpu().detach().item()
        return action

In [6]:
def train(iteration_no,Mean_Flag = True):
    Reward_max = -1000
    replay_buffer = deque(maxlen=BUFFER_SIZE)
    rew_buffer = deque([0.0], maxlen=100)

    episode_reward = 0
    
    if Mean_Flag:
        online_net = DuelingDQN_Mean(env).to(device)
        target_net = DuelingDQN_Mean(env).to(device)
        
    else :
        online_net = DuelingDQN_Max(env).to(device)
        target_net = DuelingDQN_Max(env).to(device)

    target_net.load_state_dict(online_net.state_dict())

    optimizer = torch.optim.Adam(online_net.parameters(), lr=LR)

    '''Initialize replay buffer with MIN_REPLAY_SIZE in deque '''
    obs = env.reset()
    for _ in range(MIN_REPLAY_SIZE):
        action = env.action_space.sample()

        new_obs, rew, done, _ = env.step(action)
        transition = (obs, action, rew, done, new_obs)
        replay_buffer.append(transition)
        obs = new_obs

        if done:
            obs = env.reset()


    # Main Training Loop
    Average_reward = []
    obs = env.reset()
#     for step in itertools.count():
    for step in range(TOTAl_episode):
        epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])

        rnd_sample = random.random()
        if rnd_sample <= epsilon:
            action = env.action_space.sample()
        else:
            obs = torch.as_tensor(obs, dtype=torch.float32).to(device)
            action = online_net.act(obs)
            obs = obs.cpu().numpy()

        new_obs, rew, done, _ = env.step(action)
        transition = (obs, action, rew, done, new_obs)
        replay_buffer.append(transition)
        obs = new_obs

        episode_reward += rew

        if done:
            obs = env.reset()
            rew_buffer.append(episode_reward)
            episode_reward = 0

        # # If we solved it lets just watch it play, put in last
        if step % 10000 == 0:
            if step !=0:
                print(f'current_Reward_max {Reward_max}')
                print(f'Trained_Reward_max {np.mean(rew_buffer)}')
                if np.mean(rew_buffer) > MAX_REWARD_REACH:
                    if np.mean(rew_buffer) > Reward_max : 
                        print(f'current_Reward_max {Reward_max}')
                        print(f'Trained_Reward_max {np.mean(rew_buffer)}')
                        Reward_max = np.mean(rew_buffer)
    
                        if np.mean(rew_buffer) >= MAX_REWARD_REACH:
    
                            checkpoint = {
                                "ENV_Name":ENV_Name,
                                "state_dict": online_net.state_dict(),
                                "GAMMA":GAMMA,
                                "BATCH_SIZE":BATCH_SIZE,
                                "BUFFER_SIZE":BUFFER_SIZE,
                                "MIN_REPLAY_SIZE":MIN_REPLAY_SIZE,
                                "EPSILON_START":EPSILON_START,
                                "EPSILON_END":EPSILON_END,
                                "EPSILON_DECAY":EPSILON_DECAY,
                                "MAX_REWARD_REACH" :MAX_REWARD_REACH,
                                "TARGET_UPDATE_FREQ" :TARGET_UPDATE_FREQ,
                                "LR" :LR}
                            if Mean_Flag:
                                model_name=f'iteration_{iteration_no}_{ENV_Name}_Qmean_best'
                            else:
                                model_name=f'iteration_{iteration_no}_{ENV_Name}_Qmax_best'
                            save_checkpoint(checkpoint,model_name)

#                 break

        transitions = random.sample(replay_buffer, BATCH_SIZE)

        obses = []
        actions = []
        rews = []
        dones = []
        new_obses = []

        for t in transitions:
            obses.append(t[0])
            actions.append(t[1])
            rews.append(t[2])
            dones.append(t[3])
            new_obses.append(t[4])


        obses = np.array(obses)
        actions = np.array(actions)
        rews = np.array(rews)
        dones = np.array(dones)
        new_obses = np.array(new_obses)

        obses_t = torch.as_tensor(obses, dtype=torch.float32).to(device)
        actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1).to(device)
        rews_t = torch.as_tensor(rews, dtype=torch.float32).unsqueeze(-1).to(device)
        dones_t = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1).to(device)
        new_obses_t = torch.as_tensor(new_obses, dtype=torch.float32).to(device)

        # Compute Targets
        # targets = r + gamma * target q vals * (1 - dones)
        target_q_values = target_net(new_obses_t)
        max_target_q_values = target_q_values.max(dim=1, keepdim=True)[0]

        targets = rews_t + GAMMA * (1 - dones_t) * max_target_q_values

        # Compute Loss
        q_values = online_net(obses_t)
        action_q_values = torch.gather(input=q_values, dim=1, index=actions_t)

        # loss = nn.functional.smooth_l1_loss(action_q_values, targets)
        loss = nn.functional.mse_loss(action_q_values, targets)

        # Gradient Descent
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update Target Net
        if step % TARGET_UPDATE_FREQ == 0:
            target_net.load_state_dict(online_net.state_dict())

        # Logging
        if step % 10000 == 0:
            if step !=0:
                print(f'epsilon -- > {epsilon}')
                print('Step:', step)
                print('Avg Rew:', np.mean(rew_buffer))

            
        if step % 1000 == 0:
            if step !=0:
                Average_reward.append(np.mean(rew_buffer))
    
    checkpoint = {
        "ENV_Name":ENV_Name,
        "state_dict": online_net.state_dict(),
        "GAMMA":GAMMA,
        "BATCH_SIZE":BATCH_SIZE,
        "BUFFER_SIZE":BUFFER_SIZE,
        "MIN_REPLAY_SIZE":MIN_REPLAY_SIZE,
        "EPSILON_START":EPSILON_START,
        "EPSILON_END":EPSILON_END,
        "EPSILON_DECAY":EPSILON_DECAY,
        "MAX_REWARD_REACH" :MAX_REWARD_REACH,
        "TARGET_UPDATE_FREQ" :TARGET_UPDATE_FREQ,
        "LR" :LR}
    if Mean_Flag:
        model_name=f'iteration_{iteration_no}_{ENV_Name}_Qmean_last'
    else:
        model_name=f'iteration_{iteration_no}_{ENV_Name}_Qmax_last'
    save_checkpoint(checkpoint,model_name)
    

    return Average_reward,online_net

def save_checkpoint(state,model_name):
    filename=f'./All_ckpt/{model_name}_checkpoint__.pth'
#     print("=> Saving checkpoint")
    torch.save(state, filename)
    
def View_output(online_net):
    obs = env.reset()
    episode_length = 0
    for i in range(100):
        print(f'episode_length {episode_length}')
        episode_length +=1
        obs = torch.as_tensor(obs, dtype=torch.float32).to(device)
        action = online_net.act(obs)

        obs, _, done, _ = env.step(action)
        env.render()
        if done: 
            env.reset()
            episode_length = 0

In [None]:
Mean_Flag = True
Iterations =5
total_reward_list = []
for k in range(Iterations):
    random_number = np.random.randint(1, 100)
    torch.manual_seed(random_number)
    env.seed(random_number)
    env.reset()
    print(f'\n\n\n  ITERATION   {k} \n\n ' )
    Average_reward,online_net = train(k,Mean_Flag)
#     View_output(online_net)
    Average_reward = np.array(Average_reward)
    total_reward_list.append(Average_reward)
    file_name = f'./npy/Q1_itertaion_{k}_mean_{Mean_Flag}_{ENV_Name}_'

    np.save(f'{file_name}.npy', Average_reward)




  ITERATION   0 

 


  if not isinstance(terminated, (bool, np.bool8)):


current_Reward_max -1000
Trained_Reward_max -476.1904761904762
epsilon -- > 0.9346666666666666
Step: 10000
Avg Rew: -476.1904761904762
current_Reward_max -1000
Trained_Reward_max -486.7560975609756
epsilon -- > 0.8693333333333333
Step: 20000
Avg Rew: -486.7560975609756
current_Reward_max -1000
Trained_Reward_max -458.26153846153846
epsilon -- > 0.804
Step: 30000
Avg Rew: -458.26153846153846
current_Reward_max -1000
Trained_Reward_max -421.3404255319149
epsilon -- > 0.7386666666666667
Step: 40000
Avg Rew: -421.3404255319149
current_Reward_max -1000
Trained_Reward_max -343.64
epsilon -- > 0.6733333333333333
Step: 50000
Avg Rew: -343.64
current_Reward_max -1000
Trained_Reward_max -249.09
epsilon -- > 0.608
Step: 60000
Avg Rew: -249.09
current_Reward_max -1000
Trained_Reward_max -194.37
epsilon -- > 0.5426666666666666
Step: 70000
Avg Rew: -194.37
current_Reward_max -1000
Trained_Reward_max -171.33
epsilon -- > 0.4773333333333334
Step: 80000
Avg Rew: -171.33


In [None]:
# Mean_Flag = True
# mean_Plot_array = []
# for k in range(Iterations):
#     file_name = f'./npy/Q1_itertaion_{k}_mean_{Mean_Flag}_{ENV_Name}_.npy'
#     loaded_array = np.load(file_name)
#     mean_Plot_array.append(loaded_array)

# mean_scrs = np.mean(mean_Plot_array,axis = 0)

# plt.plot(mean_scrs,label='Mean DQN')
# plt.legend()
# plt.ylabel ('Total Reward')
# plt.xlabel('Episodes')
# plt.title(f'Reward curve (Average over {Iterations} runs)')
# plt.savefig('Mean_DQN_CartPole-v0.png')
# plt.show()

In [None]:
# Mean_Flag = False
# max_Plot_array = []
# for k in range(Iterations):
#     file_name = f'./npy/Q1_itertaion_{k}_mean_{Mean_Flag}_{ENV_Name}_.npy'
#     loaded_array = np.load(file_name)
#     max_Plot_array.append(loaded_array)


# max_scrs = np.mean(max_Plot_array,axis = 0)

# plt.plot(max_scrs,label='Max DQN')
# plt.legend()
# plt.ylabel ('Total Reward')
# plt.xlabel('Episodes')
# plt.title(f'Reward curve (Average over {Iterations} runs)')
# plt.savefig('Max_DQN_CartPole-v0.png')
# plt.show()

In [None]:
# Mean_Flag = True
# mean_Plot_array = []
# for k in range(Iterations):
#     file_name = f'./npy/Q1_itertaion_{k}_mean_{Mean_Flag}_{ENV_Name}_.npy'
#     loaded_array = np.load(file_name)
#     mean_Plot_array.append(loaded_array)

# Mean_Flag = False
# max_Plot_array = []
# for k in range(Iterations):
#     file_name = f'./npy/Q1_itertaion_{k}_mean_{Mean_Flag}_{ENV_Name}_.npy'
#     loaded_array = np.load(file_name)
#     max_Plot_array.append(loaded_array)

# mean_scrs = np.mean(mean_Plot_array,axis = 0)
# max_scrs = np.mean(max_Plot_array,axis = 0)

# plt.plot(mean_scrs,label='Mean DQN')
# plt.plot(max_scrs,label='Max DQN')
# plt.legend()
# plt.ylabel ('Total Reward')
# plt.xlabel('Episodes')
# plt.title(f'Reward curve (Average over {Iterations} runs)')
# plt.savefig('DQN_CartPole-v0.png')
# plt.show()

In [None]:

# obs = env.reset()
# episode_length = 0
# for i in range(5000):
#     print(f'episode_length {episode_length}')
#     episode_length +=1
#     obs = torch.as_tensor(obs, dtype=torch.float32).to(device)
#     action = online_net.act(obs)

#     obs, _, done, _ = env.step(action)
#     env.render()
#     if done: 
#         env.reset()
#         episode_length = 0