In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [2]:
import numpy as np
import gym
from tqdm import tqdm
import random as rand
from itertools import count

In [3]:
env = gym.make("CartPole-v0")

In [318]:
class ReplayMemory():
    def __init__(self,capacity):   
        self.capacity = capacity
        self.memory = []
        self.push_count = 0
        
    def push(self, experience):
        if len(self.memory) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[self.push_count%self.capacity] = experience
        self.push_count+=1
    
    def sample(self, batch_size):
        return rand.sample(self.memory,batch_size)
    
    def can_provide_sample(self, batch_size):
        return len(self.memory)>=batch_size
    
    def update_td_error(self, sampled_experiences):
        indexes = []
        for sampled_idx,sampled_exp in enumerate(sampled_experiences):
            for mem_idx, mem_exp in enumerate(self.memory):
                if mem_exp.timestep == sampled_exp.timestep:
                    self.memory[mem_idx] = sampled_exp #update memory
                    break
                     
    def get_memory_values(self):
        return self.memory    

In [269]:
def extract_tensors(experiences):
    batch = Xp(*zip(*experiences))
    state = np.stack(batch.state) #stack
    action = np.stack(batch.action)
    next_state = np.stack(batch.next_state)
    reward = np.stack(batch.reward)
    done = np.stack(batch.done)
    abs_td_error = np.stack(batch.abs_td_error)
    timestep = np.stack(batch.timestep)
    return state,action,next_state,reward,done,abs_td_error

In [317]:
def rebuild_experiences(state, action, next_state, reward, done, abs_error, timestep):
    exp_list = []
    exp = (state, action, next_state, reward, done, abs_error, timestep)
    for e in exp:
        state, action, next_state, reward, is_failure, td_error, t_step = e
        exp_list.append(\
                        Xp(state, action, next_state, reward, is_failure, td_error, t_step))
    return exp_list

In [268]:
from collections import namedtuple
Xp = namedtuple('Experience',
                        ('state', 'action', 'next_state', 'reward', 'done', 'abs_td_error','timestep'))
Xp_points = Xp(5,6,7,8,9,10,11)
Xp_points

Experience(state=5, action=6, next_state=7, reward=8, done=9, abs_td_error=10, timestep=11)

In [19]:
class linearApproximator(nn.Module):
    def __init__(self,state_shape,n_fc1,n_fc2, action_n):
        super(linearApproximator, self).__init__()
        self.input_size = state_shape
        self.n_fc1 = n_fc1
        self.n_fc2 = n_fc2
        self.out = action_n
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.fc1  = nn.Linear(self.input_size,self.n_fc1)
        self.fc2  = nn.Linear(self.n_fc1,self.n_fc2)
        
        self.state_value = nn.Linear(self.n_fc2, 1)
        self.advantage_actions  = nn.Linear(self.n_fc2,self.out)
        
        
        self.to(self.device)
        
    def forward(self, state_shape):
        x = self.fc1(state_shape)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        
        
        actions_ = self.advantage_actions(x)
        actions_ = F.relu(actions_)
        state_value_=self.state_value(x)
        state_value_=F.relu(state_value_)
        state_value_ = state_value_.expand_as(actions_)
        #adabantage function equation pg:311 - 316
        q = state_value_ + (actions_ - actions_.mean().expand_as(actions_))
        return q

In [8]:
def update_networks(online_network, target_network, tau):
    #polyak averaging: pg:319
    for target_weights, online_weights in zip(target_network.parameters(), online_network.parameters()):
        target_weight_update = (1.0 - tau)*target_weights.data
        online_weight_update = tau*online_weights.data
        sum_up = target_weight_update + online_weight_update
        target_weights.data.copy_(sum_up)
    return online_network, target_network

In [105]:
def update_online_model(experience_samples, online_network, target_network, gamma, optimizer,\
                        weighted_importance, indices):
    states, actions, next_states, rewards, done, td_errors, timesteps = extract_tensors(experience_samples)
    
    states = torch.tensor(states).float()
    actions = torch.tensor(actions)
    actions = actions.type(torch.LongTensor)
    actions = actions.unsqueeze(1)
    next_states=torch.tensor(next_states).float()
    rewards = torch.tensor(rewards).float()
    done = torch.tensor(done).float()
    weighted_importance = torch.tensor(weighted_importance).float()
    
    #rearrange to follow indices
    #torch.gather(src, 0, index), gather each one by batch and rearrange batch according to indices
    states = torch.gather(states, 0, indices)
    actions = torch.gather(actions, 0, indices)
    next_states = torch.gather(next_states, 0, indices)
    rewards = torch.gather(rewards, 0, indices)
    done = torch.gather(done, 0, indices)
    
    q_online_next_states = online_network(next_states)#we now take next_states from online network 
    q_online_next_states = q_online_next_states.max(1)[1]#we now take the indices and not the values from online network
    q_online_next_states = q_online_next_states.unsqueeze(1)#iindices
     
    
    q_target_next_states_action = target_network(next_states)
    q_target_next_states_action = q_target_next_states_action.detach()#always make sure detach on target net
    q_target_next_states_action = q_target_next_states_action.gather(1, q_online_next_states)
    #print(q_target_next_states_action.shape)
    done = done.unsqueeze(1)
    
    #print(q_target_next_states_action.shape, done.shape)
    
    q_target_next_states_action *=(1 - done) 
    rewards = rewards.unsqueeze(1)
    q_target = rewards + (gamma*q_target_next_states_action)
    
    
    q_online_state = online_network(states)
    q_online_state = q_online_state.gather(1, actions)
    
    q_online_state*= weighted_importance
    q_target*=weighted_importance
    
    abs_error = abs(q_online_state - q_target)#update experience errors
    
    q_u_loss = torch.nn.SmoothL1Loss()
    q_u_loss = q_u_loss(q_online_state,q_target)
    
    optimizer.zero_grad()
    q_u_loss.backward()
    optimizer.step()
    
    states, actions, next_states, rewards, done, td_errors, timesteps = extract_tensors(experience_samples)
    experiences_rebuilded = rebuild_experiences(states, actions, next_states, rewards, done, abs_error, timesteps)
    
    return online_network, target_network, experiences_rebuilded

In [10]:
def freeze_model(model):
    for param in model.parameters():
        param.requires_grad = False
    return model

In [22]:
def select_action(state, model, epsilon):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    state = torch.tensor(state).float()
    with torch.no_grad():
        q_values= model(state).cpu().detach()
        q_values = q_values.data.numpy().squeeze()
    if np.random.rand() > epsilon:
        action = np.argmax(q_values)
    else:
        action = np.random.randint(len(q_values))
    return action

In [12]:
def epsilon_decay_linear(init_eps, min_eps, decay_ratio, timestep, timestep_max):
    decay_t_step = timestep_max*decay_ratio
    epsilon = 1 - timestep/decay_t_step
    epsilon *= init_eps-min_eps
    epsilon+=min_eps
    epsilon = np.clip(epsilon, min_eps, init_eps)
    return epsilon

In [106]:
def query_error(online_model, state, next_state):
    q_val = online_model(state)
    q_val = q_val.detach().cpu().numpy()#detatch
    q_val = q_val.max()[0]#take values of max
    q_val_nxt = online_model(next_state)
    q_val_nxt = q_val_nxt.detach().cpu().numpy()#detach
    q_val_nxt = q_val_nxt.max()[0]#
    return q_val, q_val_nxt

In [104]:
def prioritize_samples(experience_samples, alpha, beta):
    state,action,next_state,reward,done,abs_td_error = experience_samples
    #rank based
    print("abs tensor shape: ", abs_td_error.shape)
    abs_td_error  = abs_td_error.unsqueeze(1)
    abs_td_error, indices_ = abs_td_error.sort(0, descending=True)#big to small
    
    indices = np.arange(1, len(abs_td_error)+1)
    priorities = 1.0/indices
    priorities = priorities**alpha#scale by alpha
    probabilities = priorities/priorities.sum(pr, axis=0)#sums up to 1(or 0.999999)
    
    assert np.isclose(probabilities.sum(), 1.0)#ensures probs add up to 1
    
    number_of_samples  = len(probabilities)
    weight_importance_ = number_of_samples*probabilities
    weight_importance_ = weight_importance_**-beta
    
    #downscale the weights, max==1, everything else lower
    weight_importance_max = np.max(weight_importance_)
    weight_importance_scaled = weight_importance_/weight_importance_max
    
    return weight_importance_scaled, indices_ #return weight important samples, return indices for re_arranging sampled experiences

In [13]:
def Duelling_DDQN_PER(env,
         gamma=0.9,
         alpha_pr=0.01,
         beta_pr=0.01,
         memory_size = 50000,
         init_epsilon=1.0,
         min_epsilon=0.3,
         epsilon_decay_ratio=0.4,
         tau = 0.3,
         target_update = 30,
         min_sample_size=320,
         batch_size = 64,
         n_ep=20000,
         priority_epsilon_constant = 0.01,
         max_t_steps = 100000):
    
    action_space = env.action_space.n
    observation_space = len(env.reset())
    hidden_1 = 512
    hidden_2 = 128
    
    online_network = linearApproximator(observation_space, hidden_1, hidden_2, action_space)
    target_network = linearApproximator(observation_space, hidden_1, hidden_2, action_space)
    target_network.eval()
    target_network = freeze_model(target_network)
    
    optimizer = torch.optim.RMSprop(online_network.parameters(),lr=0.0007)
    memory = ReplayMemory(memory_size)
    
    t_step = 0 #important
    reward_per_ep = []
    
    for e in tqdm(range(n_ep)):
        state = env.reset()
        reward_accumulated = 0
        while True:
            epsilon = epsilon_decay_linear(init_epsilon, min_epsilon, epsilon_decay_ratio, t_step, max_t_steps)
            action = select_action(state, online_network, epsilon)
            
            next_state, reward, done, info = env.step(action)
            
            q_state_a, q_next_state_a = query_error(online_model, state, next_state)
            #convert them to numpy
            #calculate shock value and append error to replay memory for PER analysis
            td_target = reward + gamma*q_next_state_a
            td_error = abs(td_target - q_state_a)#shock value
            
            reward_accumulated+=reward
            is_truncated = 'TimeLimit.truncated' in info and\
                                info['TimeLimit.truncated']
            is_failure = done and not is_truncated
           
            memory.push(Xp(state, action, next_state, reward, is_failure, td_error, t_step))
            state = next_state
            t_step+=1
            if memory.can_provide_sample(min_sample_size):
                #we only update td_errors for samples used to update the network - meaning batch_size?
                experience_samples = memory.sample(batch_size)
                weighted_importance, indices = prioritize_samples(experience_samples, alpha, beta)
                online_network, target_network, rebuilded_exp = update_online_model(experience_samples, online_network,\
                                                                                    target_network, gamma, optimizer,\
                                                                                    weighted_importance, indices)
                memory.update_td_error(rebuilded_exp)
            if t_step%target_update:
                online_network, target_network = update_networks(online_network, target_network, tau)
            if done == True:
                reward_per_ep.append(reward_accumulated)
                break
            if t_step > max_t_steps:
                return online_network, reward_per_ep
    online_network, reward_per_ep

In [14]:
model, rewards = Duelling_DDQN_PER(env)

  0%|                                                                               | 21/20000 [00:02<33:44,  9.87it/s]


KeyboardInterrupt: 

In [96]:
>>> x = torch.randn(7, 1)
print(x.shape)
x

torch.Size([7, 1])


tensor([[-1.5445],
        [-0.0126],
        [-2.1726],
        [ 0.5998],
        [ 0.3943],
        [-0.5349],
        [-0.5002]])

In [108]:
x_sorted, index = x.sort(0, descending=True)

In [109]:
x_sorted

tensor([[ 0.5998],
        [ 0.3943],
        [-0.0126],
        [-0.5002],
        [-0.5349],
        [-1.5445],
        [-2.1726]])

In [110]:
index

tensor([[3],
        [4],
        [1],
        [6],
        [5],
        [0],
        [2]])

In [99]:
_arr = np.arange(1, len(x_sorted) + 1)
_arr

array([1, 2, 3, 4, 5, 6, 7])

In [100]:
arr2 = 1/_arr
arr2

array([1.        , 0.5       , 0.33333333, 0.25      , 0.2       ,
       0.16666667, 0.14285714])

In [101]:
pr = arr2**2
pr.shape
pr

array([1.        , 0.25      , 0.11111111, 0.0625    , 0.04      ,
       0.02777778, 0.02040816])

In [102]:
pr/=np.sum(pr, axis=0)

In [103]:
pr

array([0.66146445, 0.16536611, 0.07349605, 0.04134153, 0.02645858,
       0.01837401, 0.01349927])

In [90]:
np.max(pr)

0.6614644462860121

In [91]:
pr/np.max(pr)

array([1.        , 0.25      , 0.11111111, 0.0625    , 0.04      ,
       0.02777778, 0.02040816])

In [88]:
pr.sum()

0.9999999999999999

In [170]:
src = torch.arange(1, 8)
src.shape

torch.Size([7])

In [171]:
index = index.squeeze()

In [172]:
index = index.type(torch.LongTensor)

In [173]:
index.shape

torch.Size([7])

In [174]:
src_clone = src.clone()
src_clone

tensor([1, 2, 3, 4, 5, 6, 7])

In [180]:
src

tensor([1, 2, 3, 4, 5, 6, 7])

In [181]:
index#4,5,2,7

tensor([3, 4, 1, 6, 5, 0, 2])

In [186]:
torch.gather(src, 0, index)

tensor([4, 5, 2, 7, 6, 1, 3])

In [192]:
import random
l = np.arange(10)  # example list
random.shuffle(l) # we shuffle the list
print(l) # outputs [4, 1, 5, 0, 6, 7, 9, 2, 8, 3]
index_value = random.sample(list(enumerate(l)), 2)
print(index_value) # outputs [(4, 6), (6, 9)]


[3 5 8 7 6 0 2 4 1 9]
[(6, 2), (2, 8)]


In [189]:
np.arange(10) 

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [272]:
memory = ReplayMemory(10)

In [275]:
for t in range(10):
    memory.push(Xp(t,t,t,t,t,t,t))

In [349]:
s_m = memory.sample(1)
(s_m[0].timestep)

6

In [350]:
t_m = memory.get_memory_values()
t_m

[Experience(state=100, action=100, next_state=100, reward=100, done=100, abs_td_error=100, timestep=100),
 Experience(state=1, action=1, next_state=1, reward=1, done=1, abs_td_error=1, timestep=1),
 Experience(state=2, action=2, next_state=2, reward=2, done=2, abs_td_error=2, timestep=2),
 Experience(state=3, action=3, next_state=3, reward=3, done=3, abs_td_error=3, timestep=3),
 Experience(state=4, action=4, next_state=4, reward=4, done=4, abs_td_error=4, timestep=4),
 Experience(state=5, action=5, next_state=5, reward=5, done=5, abs_td_error=5, timestep=5),
 Experience(state=6, action=6, next_state=6, reward=6, done=6, abs_td_error=6, timestep=6),
 Experience(state=7, action=7, next_state=7, reward=7, done=7, abs_td_error=7, timestep=7),
 Experience(state=8, action=8, next_state=8, reward=8, done=8, abs_td_error=8, timestep=8),
 Experience(state=9, action=9, next_state=9, reward=9, done=9, abs_td_error=9, timestep=9)]

In [351]:
for a,b in enumerate(s_m):
    for x, y in enumerate(t_m):
        if y.timestep == b.timestep:
            t_m[x] = Xp(100,100,100,100,100,100,100)
            print(y[a])

6


In [352]:
t_m

[Experience(state=100, action=100, next_state=100, reward=100, done=100, abs_td_error=100, timestep=100),
 Experience(state=1, action=1, next_state=1, reward=1, done=1, abs_td_error=1, timestep=1),
 Experience(state=2, action=2, next_state=2, reward=2, done=2, abs_td_error=2, timestep=2),
 Experience(state=3, action=3, next_state=3, reward=3, done=3, abs_td_error=3, timestep=3),
 Experience(state=4, action=4, next_state=4, reward=4, done=4, abs_td_error=4, timestep=4),
 Experience(state=5, action=5, next_state=5, reward=5, done=5, abs_td_error=5, timestep=5),
 Experience(state=100, action=100, next_state=100, reward=100, done=100, abs_td_error=100, timestep=100),
 Experience(state=7, action=7, next_state=7, reward=7, done=7, abs_td_error=7, timestep=7),
 Experience(state=8, action=8, next_state=8, reward=8, done=8, abs_td_error=8, timestep=8),
 Experience(state=9, action=9, next_state=9, reward=9, done=9, abs_td_error=9, timestep=9)]

In [308]:
[x for x, y in enumerate(t_m) if y[-1] == s_m[0].timestep]

[3]

In [304]:
#[x for t_step in enumerate(t_m) if y[1] == 7]
for cnt_, exp_ in enumerate(s_m):    
    for cnt_m, exp_m in enumerate(t_m):
        if exp_[-1] == exp_m[-1]:
            exp_m[-2] = 100#update td_error
            break

TypeError: 'Experience' object does not support item assignment

In [295]:
[x for x, y in enumerate(t_m) if y[-1] == s_m[0].timestep]

[1]

In [297]:
t_m[1].timestep

1

In [294]:
for x, y in enumerate(t_m):
    print(y[8])

IndexError: tuple index out of range

In [266]:
import operator
f = operator.itemgetter(0)
map(f, t_m).index(s_m)

AttributeError: 'map' object has no attribute 'index'

In [221]:
np.where(s_m == t_m)

  """Entry point for launching an IPython kernel.


(array([], dtype=int64),)

In [227]:
np.where((t_m == (s_m))).all(axis=1)

  """Entry point for launching an IPython kernel.


AttributeError: 'tuple' object has no attribute 'all'

In [237]:
type(t_m)

list