In [1]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [2]:
!pip install ALE
!pip install gym[atari,accept-rom-license]==0.21.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ALE
  Downloading Ale-0.8.4.tar.gz (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 KB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ALE
  Building wheel for ALE (setup.py) ... [?25l[?25hdone
  Created wheel for ALE: filename=Ale-0.8.4-py3-none-any.whl size=70176 sha256=1b525af65abcc4cfd6eb0d8be8b4c8b75ad824af2659d46d5c3d8c662bc3fadd
  Stored in directory: /root/.cache/pip/wheels/90/6e/89/be043555e2e48a57e1797b91174868898b7545a305178016cb
Successfully built ALE
Installing collected packages: ALE
Successfully installed ALE-0.8.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym[accept-rom-license,atari]==0.21.0
  Downloading gym-0.21.0.tar.gz (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from collections import namedtuple, deque
import numpy as np
import math
from numpy.random import choice
import gym 
import operator
from copy import deepcopy
import torch
import torch.nn as nn
import numpy as np
import cv2
import gym
import gym.spaces
import collections
import argparse
import time
import torch.optim as optim
import pickle

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, experiences_per_sampling, seed, compute_weights):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            experiences_per_sampling (int): number of experiences to sample during a sampling iteration
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.experiences_per_sampling = experiences_per_sampling
        
        self.alpha = 0.5
        self.alpha_decay_rate = 0.99
        self.beta = 0.5
        self.beta_growth_rate = 1.001
        self.seed = random.seed(seed)
        self.compute_weights = compute_weights
        self.experience_count = 0
        
        self.experience = namedtuple("Experience", 
            field_names=["state", "action", "reward", "next_state", "done"])
        self.data = namedtuple("Data", 
            field_names=["priority", "probability", "weight","index"])

        indexes = []
        datas = []
        for i in range(buffer_size):
            indexes.append(i)
            d = self.data(0,0,0,i)
            datas.append(d)
        
        self.memory = {key: self.experience for key in indexes}
        self.memory_data = {key: data for key,data in zip(indexes, datas)}
        self.sampled_batches = []
        self.current_batch = 0
        self.priorities_sum_alpha = 0
        self.priorities_max = 1
        self.weights_max = 1
    
    def update_priorities(self, tds, indices):
        for td, index in zip(tds, indices):
            N = min(self.experience_count, self.buffer_size)

            updated_priority = td
            if updated_priority > self.priorities_max:
                self.priorities_max = updated_priority
            
            if self.compute_weights:
                updated_weight = ((N * updated_priority)**(-self.beta))/self.weights_max
                if updated_weight > self.weights_max:
                    self.weights_max = updated_weight
            else:
                updated_weight = 1

            old_priority = self.memory_data[index].priority
            self.priorities_sum_alpha += updated_priority**self.alpha - old_priority**self.alpha
            updated_probability = td**self.alpha / self.priorities_sum_alpha
            data = self.data(updated_priority, updated_probability, updated_weight, index) 
            self.memory_data[index] = data

    def update_memory_sampling(self):
        """Randomly sample X batches of experiences from memory."""
        # X is the number of steps before updating memory
        self.current_batch = 0
        values = list(self.memory_data.values())
        random_values = random.choices(self.memory_data, 
                                       [data.probability for data in values], 
                                       k=self.experiences_per_sampling)
        self.sampled_batches = [random_values[i:i + self.batch_size] 
                                    for i in range(0, len(random_values), self.batch_size)]

    def update_parameters(self):
        self.alpha *= self.alpha_decay_rate
        self.beta *= self.beta_growth_rate
        if self.beta > 1:
            self.beta = 1
        N = min(self.experience_count, self.buffer_size)
        self.priorities_sum_alpha = 0
        sum_prob_before = 0
        for element in self.memory_data.values():
            sum_prob_before += element.probability
            self.priorities_sum_alpha += element.priority**self.alpha
        sum_prob_after = 0
        for element in self.memory_data.values():
            probability = element.priority**self.alpha / self.priorities_sum_alpha
            sum_prob_after += probability
            weight = 1
            if self.compute_weights:
                weight = ((N *  element.probability)**(-self.beta))/self.weights_max
            d = self.data(element.priority, probability, weight, element.index)
            self.memory_data[element.index] = d
        print("sum_prob before", sum_prob_before)
        print("sum_prob after : ", sum_prob_after)
    
    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.experience_count += 1
        index = self.experience_count % self.buffer_size
        state = state.transpose(2,0,1)
        next_state = next_state.transpose(2,0,1)

        if self.experience_count > self.buffer_size:
            temp = self.memory_data[index]
            self.priorities_sum_alpha -= temp.priority**self.alpha
            if temp.priority == self.priorities_max:
                self.memory_data[index].priority = 0
                self.priorities_max = max(self.memory_data.items(), key=operator.itemgetter(1)).priority
            if self.compute_weights:
                if temp.weight == self.weights_max:
                    self.memory_data[index].weight = 0
                    self.weights_max = max(self.memory_data.items(), key=operator.itemgetter(2)).weight

        priority = self.priorities_max
        weight = self.weights_max
        self.priorities_sum_alpha += priority ** self.alpha
        probability = priority ** self.alpha / self.priorities_sum_alpha
        e = self.experience(state, action, reward, next_state, done)
        self.memory[index] = e
        d = self.data(priority, probability, weight, index)
        self.memory_data[index] = d
            
    def sample(self):
        sampled_batch = self.sampled_batches[self.current_batch]
        self.current_batch += 1
        experiences = []
        weights = []
        indices = []
        
        for data in sampled_batch:
            experiences.append(self.memory.get(data.index))
            weights.append(data.weight)
            indices.append(data.index)

        #print("+++++++++++++++",str(type(experiences[1].state)))

        # a = []
        # for _ in range(9):
        #   b = np.array(experiences[0].state)
        #   b = torch.from_numpy(b)
        #   a.append(b)

        

        

        
        
        
        states = torch.stack([torch.from_numpy(np.array(e.state)) for e in experiences if e is not None]).float().to(device)
        actions = torch.stack([torch.from_numpy(np.array(e.action)) for e in experiences if e is not None]).long().to(device)
        rewards = torch.stack([torch.from_numpy(np.array(e.reward)) for e in experiences if e is not None]).float().to(device)
        next_states = torch.stack([torch.from_numpy(np.array(e.next_state)) for e in experiences if e is not None]).float().to(device)
        dones = torch.stack([torch.from_numpy(np.array(e.done)) for e in experiences if e is not None]).float().to(device)
        # states = torch.from_numpy(
        #     np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        # actions = torch.from_numpy(
        #     np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        # rewards = torch.from_numpy(
        #     np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        # next_states = torch.from_numpy(
        #     np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        # dones = torch.from_numpy(
        #     np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        #print("-------------------",str(states.shape))

        return (states, actions, rewards, next_states, dones, weights, indices)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [6]:
objects = []
with (open("/content/drive/MyDrive/Colab Notebooks/expert_tracefirst_final.pkl", "rb")) as openfile:
    while True:
        try:
            objects.append(pickle.load(openfile))
        except EOFError:
            break

obs = objects[0][0]
act = objects[0][1]
rew = objects[0][2]
next = objects[0][3]
done = objects[0][4]

In [8]:
BUFFER_SIZE = int(1e5)      # replay buffer size
BATCH_SIZE = 64             # minibatch size
GAMMA = 0.99                # discount factor
TAU = 1e-3                  # for soft update of target parameters
LR = 0.001                   # learning rate 
UPDATE_NN_EVERY = 1        # how often to update the network

# prioritized experience replay
env = gym.make("MontezumaRevenge-v4",render_mode="rgb_array")
UPDATE_MEM_EVERY = 20          # how often to update the priorities
UPDATE_MEM_PAR_EVERY = 3000     # how often to update the hyperparameters
EXPERIENCES_PER_SAMPLING = 64
print(EXPERIENCES_PER_SAMPLING)
memory = ReplayBuffer(env.action_space.n, BUFFER_SIZE, BATCH_SIZE, EXPERIENCES_PER_SAMPLING, 0, True)

64


In [9]:
for i in range(len(obs)//3):
  memory.add(obs[i:i+10],act[i:i+10],rew[i:i+10],next[i:i+10],done[i:i+10])
memory.update_memory_sampling()

In [11]:
class DQN(nn.Module):
  def __init__(self, num_actions):
    super(DQN, self).__init__()
    self.layer1 = nn.Sequential(
        nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=0),
        nn.BatchNorm2d(96),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size = 3, stride = 2))
    self.layer2 = nn.Sequential(
        nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
        nn.BatchNorm2d(256),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size = 3, stride = 2))
    self.layer3 = nn.Sequential(
        nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(384),
        nn.ReLU())
    self.layer4 = nn.Sequential(
        nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(384),
        nn.ReLU())
    self.layer5 = nn.Sequential(
        nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(256),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size = 3, stride = 2))
    self.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(3840, 4096),
        nn.ReLU())
    self.fc1 = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(4096, 4096),
        nn.ReLU())
    self.fc2= nn.Sequential(
        nn.Linear(4096, num_actions))

  def forward(self, x):
      out = self.layer1(x)
      out = self.layer2(out)
      out = self.layer3(out)
      out = self.layer4(out)
      out = self.layer5(out)
      out = out.reshape(out.size(0), -1)
      out = self.fc(out)
      out = self.fc1(out)
      out = self.fc2(out)
      return out,F.softmax(out, dim=1)

  def act(self, state):
      q_vals,_ = self.forward(state)
      action = q_vals.argmax(-1)
      return action

In [49]:
learning_rate = 10e-8
weight_decay = 0.002
momentum = 0.9
gamma = 0.99
epsilon = 1.0
tgt_update_freq = 10
epochs = 300

net = DQN(env.action_space.n)
tgt = deepcopy(net)
net.to(device)
tgt.to(device)

optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate, weight_decay = weight_decay, momentum = momentum)  

In [27]:
def preproc_state(state):
  # state = state.transpose(0,3,1,2)
  # state = torch.from_numpy(state)
  return state/255

In [29]:
def calc_loss(size,states,actions,rewards,next_states,dones):
  states = preproc_state(states).to(device)
  actions = actions.to(device)
  rewards = rewards.to(device)
  next_states = preproc_state(next_states).to(device)
  dones = dones.to(device)

  ### DQN loss
  dqn_net_max_act = net.act(next_states)
  dqn_tgt_q_vals,_ = tgt(next_states)
  dqn_tgt_q = dqn_tgt_q_vals.gather(1,dqn_net_max_act.unsqueeze(-1)).squeeze(-1)
  dqn_net_q_vals,_ = net(states)
  dqn_net_q = dqn_net_q_vals.gather(1,actions.unsqueeze(-1)).squeeze(-1)

  # print("Rewards: ",rewards.shape)
  # print("Q next: ", dqn_tgt_q.shape)
  # print("Dones: ",dones.shape)
  # print("Q: ",dqn_net_q.shape)

  pred = rewards + (1-dones)*gamma*dqn_tgt_q
  diff = pred - dqn_net_q
  jdqn = torch.mean(diff**2)
  jdqn /= size
  
  ## Supervised expert loss
  net_q_vals,_ = net(states)
  exp_net_q = net_q_vals.gather(1,actions.unsqueeze(-1)).squeeze(-1)
  net_acts = (torch.tensor(np.ones((size,18)))*10).to(device)
  
  for s in range(size):
    net_acts[s][actions[s]] = 0

  
  net_q = net_q_vals + net_acts
  max_net_q = torch.max(net_q)

  je = torch.mean(max_net_q - exp_net_q)
  je /= size
  
  # print("Exp act: ",actions)
  # print("Array l: ",net_acts)
  # print("Qs:", net_q.shape)
  # print("Q exp: ", exp_net_q.shape)
  


  states.detach()
  actions.detach()
  rewards.detach()
  next_states.detach()
  dones.detach()
  net_acts.detach()
  net_q_vals.detach()
  
  loss_tot = jdqn+0.9*je

  return loss_tot

In [31]:
def expert_train(buffer):
  print("--- Pre-training with expert trace in buffer: start")
  pos = 0

  for elem in range(buffer.__len__()):
      optimizer.zero_grad()
      
      memory.update_memory_sampling()
      states, actions, rewards, next_states, dones,_ ,_ = buffer.sample()
      

      loss = calc_loss(64,states,actions,rewards,next_states,dones)

      loss.backward()
      optimizer.step()
      
      if pos % tgt_update_freq == 0:
        tgt.load_state_dict(net.state_dict())
        print("Updated tgt")

      pos += 1
      
      print("Episode {} | Loss {:.2f}".format(elem,loss.item()))      

In [50]:
expert_train(memory)

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Episode 6605 | Loss 0.16
Episode 6606 | Loss 2.59
Episode 6607 | Loss 0.16
Episode 6608 | Loss 0.16
Episode 6609 | Loss 0.16
Updated tgt
Episode 6610 | Loss 0.16
Episode 6611 | Loss 44.13
Episode 6612 | Loss 0.16
Episode 6613 | Loss 2.63
Episode 6614 | Loss 0.16
Episode 6615 | Loss 0.16
Episode 6616 | Loss 0.16
Episode 6617 | Loss 2.61
Episode 6618 | Loss 0.16
Episode 6619 | Loss 22.18
Updated tgt
Episode 6620 | Loss 22.16
Episode 6621 | Loss 46.73
Episode 6622 | Loss 22.15
Episode 6623 | Loss 0.15
Episode 6624 | Loss 2.61
Episode 6625 | Loss 22.08
Episode 6626 | Loss 2.62
Episode 6627 | Loss 2.60
Episode 6628 | Loss 24.70
Episode 6629 | Loss 22.17
Updated tgt
Episode 6630 | Loss 0.16
Episode 6631 | Loss 22.17
Episode 6632 | Loss 0.16
Episode 6633 | Loss 0.16
Episode 6634 | Loss 0.16
Episode 6635 | Loss 0.16
Episode 6636 | Loss 0.16
Episode 6637 | Loss 2.61
Episode 6638 | Loss 0.16
Episode 6639 | Loss 0.16
Updated tgt
Epis

KeyboardInterrupt: ignored

In [51]:
def train(buffer,env):
  print("--- Pre-training with expert trace in buffer: start")
  pos = 0
  done = False
  epsilon = 1.0
  
  for epoch in range(epochs):
      optimizer.zero_grad()
      s0 = env.reset()

      while not done:
        p = np.random.random()
        if p < epsilon:
            env_act = env.action_space.sample()
            #print("Explore",env_act)
        else:
            s0 = preproc_state(s0).unsqueeze(0).to(device)
            env_act = net(s0)
            #print("Explote",env_act)

        s,r,done,_ = env.step(env_act)
        buffer.add(s0,env_act,r,s,done)
        s0 = s

      epsilon *= 0.7

      memory.update_memory_sampling()
      states, actions, rewards, next_states, dones,_ ,_ = buffer.sample()
      
      states = preproc_state(states).to(device)
      actions = actions.to(device)
      rewards = rewards.to(device)
      next_states = preproc_state(next_states).to(device)
      dones = dones.to(device)

      ### DQN loss
      dqn_net_max_act = net.act(next_states)
      dqn_tgt_q_vals,_ = tgt(next_states)
      dqn_tgt_q = dqn_tgt_q_vals.gather(1,dqn_net_max_act.unsqueeze(-1)).squeeze(-1)
      dqn_net_q_vals,_ = net(states)
      dqn_net_q = dqn_net_q_vals.gather(1,actions.unsqueeze(-1)).squeeze(-1)

      # print("Rewards: ",rewards.shape)
      # print("Q next: ", dqn_tgt_q.shape)
      # print("Dones: ",dones.shape)
      # print("Q: ",dqn_net_q.shape)

      pred = rewards + (1-dones)*gamma*dqn_tgt_q
      diff = pred - dqn_net_q
      jdqn = torch.mean(diff**2)
      jdqn /= 10

      jdqn.backward()
      optimizer.step()
      
      if epoch % tgt_update_freq == 0:
        tgt.load_state_dict(net.state_dict())
        print("Updated tgt")
      
      print("Episode {} | Loss {:.2f}".format(epoch,jdqn.item()))      

In [52]:
train(memory,env)

--- Pre-training with expert trace in buffer: start
Updated tgt
Episode 0 | Loss 140.81
Episode 1 | Loss 0.01
Episode 2 | Loss 0.02
Episode 3 | Loss 0.02
Episode 4 | Loss 0.02
Episode 5 | Loss 15.62
Episode 6 | Loss 0.02
Episode 7 | Loss 15.82
Episode 8 | Loss 0.02
Episode 9 | Loss 140.98
Updated tgt
Episode 10 | Loss 0.02
Episode 11 | Loss 0.02
Episode 12 | Loss 141.73
Episode 13 | Loss 15.78
Episode 14 | Loss 156.75
Episode 15 | Loss 156.62
Episode 16 | Loss 0.01
Episode 17 | Loss 0.02
Episode 18 | Loss 0.02
Episode 19 | Loss 0.02
Updated tgt
Episode 20 | Loss 0.02
Episode 21 | Loss 15.88
Episode 22 | Loss 0.02
Episode 23 | Loss 15.41
Episode 24 | Loss 0.02
Episode 25 | Loss 15.61
Episode 26 | Loss 31.10
Episode 27 | Loss 0.02
Episode 28 | Loss 15.54
Episode 29 | Loss 140.68
Updated tgt
Episode 30 | Loss 140.71
Episode 31 | Loss 140.62
Episode 32 | Loss 0.02
Episode 33 | Loss 140.52
Episode 34 | Loss 0.02
Episode 35 | Loss 0.02
Episode 36 | Loss 0.02
Episode 37 | Loss 140.46
Episode 

In [53]:
net_checkpoint = "/content/drive/MyDrive/Colab Notebooks/RL/Project/bet.pt"
torch.save(net.state_dict(), net_checkpoint)