In [None]:
"""!pip install gym
!apt-get install python-opengl -y
!apt install xvfb -y"""

'!pip install gym\n!apt-get install python-opengl -y\n!apt install xvfb -y'

In [None]:
%matplotlib inline
from gym import wrappers
from time import sleep
import time
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim

#interacting with the file system
from google.colab import files
from google.colab import drive
drive.mount('/content/gdrive')

is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython: from IPython import display
import torch.nn.functional as F
import torchvision.transforms as T

Mounted at /content/gdrive


In [None]:
# google deep mind paper
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        img_height: Height in pixels of each input frame.
        img_width: Width in pixels of each input frame.
        num_frames: Number of stacked frames, used as the input channel count.
        num_actions: Number of Q-values produced per input (one per action).

    Raises:
        ValueError: If the input is too small to survive the three
            convolutions (any resulting spatial dimension would be < 1).
    """

    def __init__(self, img_height, img_width, num_frames=4, num_actions=4):
        super().__init__()
        self.conv1 = nn.Conv2d(num_frames, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        def conv_out(size, kernel, stride):
            # Output length of an unpadded, undilated convolution.
            return (size - kernel) // stride + 1

        # Derive the flattened feature size from the actual frame size rather
        # than hard-coding it; an 84x84 input yields the usual 64 * 7 * 7.
        out_h = conv_out(conv_out(conv_out(img_height, 8, 4), 4, 2), 3, 1)
        out_w = conv_out(conv_out(conv_out(img_width, 8, 4), 4, 2), 3, 1)
        if out_h < 1 or out_w < 1:
            raise ValueError(
                f"Input of size {img_height}x{img_width} is too small for this network"
            )

        self.dense = torch.nn.Linear(64 * out_h * out_w, 512)
        self.out = nn.Linear(512, num_actions)

    def forward(self, t):
        """Map a (batch, num_frames, height, width) tensor to (batch, num_actions) Q-values."""
        t = F.relu(self.conv1(t))
        t = F.relu(self.conv2(t))
        t = F.relu(self.conv3(t))
        t = t.view(t.size(0), -1)  # flatten everything except the batch dimension
        t = F.relu(self.dense(t))
        t = self.out(t)
        return t

In [None]:
# One replay-memory transition. The field order matters: callers build
# instances positionally as (state, action, next_state, reward).
Experience = namedtuple('Experience', ['state', 'action', 'next_state', 'reward'])

In [None]:
class ReplayMemory():
    """Fixed-capacity store of experiences that overwrites the oldest slot when full."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.push_count = 0  # total number of pushes ever made, not the current size

    def push(self, experience):
        """Store an experience, recycling slots in round-robin order once full."""
        slot = self.push_count % self.capacity
        if len(self.memory) >= self.capacity:
            self.memory[slot] = experience
        else:
            self.memory.append(experience)
        self.push_count += 1

    def sample(self, batch_size):
        """Return `batch_size` distinct stored experiences chosen at random."""
        return random.sample(self.memory, k=batch_size)

    def can_provide_sample(self, batch_size):
        """Return True once at least `batch_size` experiences are stored."""
        return not len(self.memory) < batch_size

In [None]:
class EpsilonGreedyStrategy():
    """Exponentially decaying exploration-rate schedule."""

    def __init__(self, start, end, decay):
        self.start = start
        self.end = end
        self.decay = decay

    def get_exploration_rate(self, current_step):
        """Return the exploration rate for `current_step`.

        The rate equals `start` at step 0 and decays exponentially towards `end`.
        """
        span = self.start - self.end
        decay_factor = math.exp(-1. * current_step * self.decay)
        return self.end + span * decay_factor

In [None]:
class Agent():
    """Epsilon-greedy action selector that tracks how many actions it has chosen.

    Args:
        strategy: Object exposing `get_exploration_rate(step)`.
        num_actions: Number of discrete actions to choose from when exploring.
        device: Torch device on which returned action tensors are placed.
    """

    def __init__(self, strategy, num_actions, device):
        self.current_step = 0
        self.strategy = strategy
        self.num_actions = num_actions
        self.device = device

    def select_action(self, state, policy_net):
        """Pick an action for `state` and advance the step counter.

        With probability equal to the current exploration rate a uniformly
        random action is returned; otherwise the action with the highest
        output from `policy_net(state)` is returned.

        Returns:
            A 1-element tensor holding the action index, on `self.device`.
        """
        # Use the strategy owned by this agent; the previous code read a
        # module-level `strategy` global and ignored the constructor argument.
        rate = self.strategy.get_exploration_rate(self.current_step)
        self.current_step += 1

        if rate > random.random():
            action = random.randrange(self.num_actions)
            return torch.tensor([action]).to(self.device) # explore
        else:
            with torch.no_grad():
                return policy_net(state).argmax(dim=1).to(self.device) # exploit

In [None]:
def plot(values, moving_avg_period, time_taken):
    """Draw per-episode points with their moving average and print a summary.

    Args:
        values: Sequence of per-episode scores.
        moving_avg_period: Window length passed to `get_moving_average`.
        time_taken: Sequence of per-episode durations in seconds; only the
            most recent entry is reported.
    """
    # Nothing to draw yet (e.g. training stopped before any episode finished);
    # indexing [-1] below would otherwise raise IndexError.
    if len(values) == 0:
        print("No completed episodes to plot yet.")
        return

    plt.figure(2)
    plt.clf()
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Points')
    plt.plot(values)

    moving_avg = get_moving_average(moving_avg_period, values)
    plt.plot(moving_avg)
    plt.pause(0.001)
    print("Episode", len(values), "\n", \
          moving_avg_period, "episode moving avg:", moving_avg[-1])
    if len(time_taken) > 0:
        print(f"Episode times: {time_taken[-1]} (s)")
    if is_ipython:
        # wait=True defers the clear until the next output arrives.
        display.clear_output(wait=True)

def get_moving_average(period, values):
    """Return the trailing moving average of `values` as a NumPy array.

    The result has the same length as `values`. The first `period - 1`
    entries are zero because no full window exists yet; if there are fewer
    than `period` values, every entry is zero.
    """
    series = torch.tensor(values, dtype=torch.float)
    count_values = len(series)
    if count_values < period:
        return torch.zeros(count_values).numpy()

    # Each row of `windows` is one length-`period` sliding window.
    windows = series.unfold(dimension=0, size=period, step=1)
    averages = windows.mean(dim=1).flatten(start_dim=0)
    padded = torch.cat((torch.zeros(period - 1), averages))
    return padded.numpy()

In [None]:
def extract_tensors(experiences):
    """Collate a list of Experience tuples into four batched tensors.

    Returns:
        Tuple `(states, actions, rewards, next_states)`. Note that this order
        differs from the Experience field order. Each element is the
        `torch.cat` of the corresponding field across all experiences.
    """
    # Transpose: a sequence of Experiences becomes one Experience of sequences.
    batch = Experience._make(zip(*experiences))

    output_order = ('state', 'action', 'reward', 'next_state')
    return tuple(torch.cat(getattr(batch, field)) for field in output_order)

In [None]:
class QValues():
    """Static helpers that compute current and next-state Q-values for a batch."""

    # Retained for backward compatibility with code that reads QValues.device;
    # get_next now allocates on the device of its input instead.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @staticmethod
    def get_current(policy_net, states, actions):
        """Return the Q-value `policy_net` assigns to each taken action.

        Args:
            policy_net: Callable mapping a batch of states to per-action values.
            states: Batch of states.
            actions: 1-D integer tensor with one action index per state.

        Returns:
            Tensor of shape (batch, 1).
        """
        return policy_net(states).gather(dim=1, index=actions.unsqueeze(-1))

    @staticmethod
    def get_next(target_net, next_states):
        """Return the maximum target-network Q-value for each next state.

        A next state whose maximum element equals 0 is treated as terminal
        and is given a value of 0.

        Returns:
            1-D tensor of length batch, on the same device as `next_states`.
        """
        is_final = next_states.flatten(start_dim=1).max(dim=1)[0].eq(0)
        is_non_final = ~is_final

        # Allocate alongside the input so the mask, values and network output
        # all share a device regardless of the class-level default.
        values = torch.zeros(next_states.shape[0], device=next_states.device)

        # Skip the forward pass when every sampled transition is terminal.
        if is_non_final.any():
            with torch.no_grad():  # targets must not carry gradients
                values[is_non_final] = target_net(next_states[is_non_final]).max(dim=1)[0]
        return values

In [None]:
class BreakoutEnvManager():
    """Wraps the Breakout gym environment and exposes its state as torch tensors.

    The raw environment is wrapped with `AtariPreprocessing` and a 4-frame
    `FrameStack`; states are read from the frame-stack buffer and returned as
    float tensors with a leading batch dimension on `device`.
    """

    def __init__(self, device):
        self.device = device
        self.env = gym.make('BreakoutNoFrameskip-v4').unwrapped
        self.env = wrappers.AtariPreprocessing(self.env)
        self.env = wrappers.FrameStack(self.env, num_stack = 4)
        self.env.reset()
        self.current_screen = None
        self.done = False

    def reset(self):
        """Start a new episode and clear per-episode bookkeeping."""
        self.env.reset()
        self.current_screen = None
        # Clear the flag left over from the previous episode; otherwise the
        # first state of the new episode would be reported as terminal.
        self.done = False

    def close(self):
        """Release the underlying environment."""
        self.env.close()

    def render(self, mode='human'):
        """Render the environment using the given gym render mode."""
        return self.env.render(mode)

    def num_actions_available(self):
        """Return the size of the discrete action space."""
        return self.env.action_space.n

    def take_action(self, action):
        """Step the environment with `action` (a 1-element tensor).

        Updates `self.done` and returns the reward as a 1-element tensor on
        `self.device`.
        """
        _, reward, self.done, _ = self.env.step(action.item())
        return torch.tensor([reward], device=self.device)

    def just_starting(self):
        """Return True if no state has been read since the last reset."""
        return self.current_screen is None

    def get_state(self):
        """Return the current stacked-frame state with a batch dimension.

        When the episode has ended an all-zero tensor of the same shape is
        returned instead. `QValues.get_next` recognises terminal next-states
        by exactly this all-zero marker, so without it terminal transitions
        would be bootstrapped as if the episode continued.
        """
        self.current_screen = self.get_processed_screen()
        if self.done:
            return torch.zeros_like(self.current_screen)
        return self.current_screen

    def get_screen_height(self):
        """Return the frame height (dimension 2 of the batched state tensor)."""
        screen = self.get_processed_screen()
        return screen.shape[2]

    def get_screen_width(self):
        """Return the frame width (dimension 3 of the batched state tensor)."""
        screen = self.get_processed_screen()
        return screen.shape[3]

    def get_processed_screen(self):
        """Read the frame-stack buffer and convert it to a batched tensor."""
        # Relies on the FrameStack wrapper exposing its buffer as `frames`;
        # presumably (num_stack, H, W) once converted -- TODO confirm for the
        # installed gym version.
        screen = np.asarray(self.env.frames)
        return self.transform_screen_data(screen)

    def transform_screen_data(self, screen):
        """Convert an array of frames to a float32 tensor scaled by 1/255."""
        # Convert to float, rescale, convert to tensor
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)

        return screen.unsqueeze(0).to(self.device) # add a batch dimension (BCHW)

In [None]:
def save_state(episode, policy_net, target_net, optimizer, agent,
               config_id=None, model_dir="/content/gdrive/My Drive/models"):
    """Write a training checkpoint to disk.

    Two copies are written: `classifier.pt` (always overwritten with the
    latest state) and `classifier_{episode}config{config_id}.pt` (a
    per-episode snapshot).

    Args:
        episode: Episode index stored in the checkpoint and in the snapshot name.
        policy_net: Network whose `state_dict()` is saved as 'policy_state_dict'.
        target_net: Network whose `state_dict()` is saved as 'target_state_dict'.
        optimizer: Optimizer whose `state_dict()` is saved as 'optimizer'.
        agent: Object whose `current_step` is saved as 'agent_current_step'.
        config_id: Identifier embedded in the snapshot file name. When None,
            the module-level `configuration` variable is used.
        model_dir: Directory to write into; created if it does not exist.
    """
    import os  # local import: the notebook's import cell does not provide os

    if config_id is None:
        # Backward-compatible fallback to the notebook-level setting.
        config_id = configuration

    state = {
        'episode': episode,
        'policy_state_dict': policy_net.state_dict(),
        'target_state_dict': target_net.state_dict(),
        'optimizer': optimizer.state_dict(),
        'agent_current_step': agent.current_step
    }

    os.makedirs(model_dir, exist_ok=True)

    # Rolling "latest" checkpoint.
    torch.save(state, os.path.join(model_dir, 'classifier.pt'))

    # Per-episode snapshot so earlier checkpoints remain available.
    snapshot_name = f'classifier_{episode}config{config_id}.pt'
    torch.save(state, os.path.join(model_dir, snapshot_name))

In [None]:
# ---- Hyperparameters ----
batch_size = 256
gamma = 0.999
eps_start = 1
eps_end = 0.01
eps_decay = 0.001
target_update = 10   # sync the target network every this many episodes
memory_size = 100000
lr = 0.001 #
num_episodes = 8500 # run for more episodes for better results
configuration = 1

# ---- Resume from checkpoint ----
# Episode index of the checkpoint to resume from.
last_ep_cp = 6000
model_save_name = f'classifier_{last_ep_cp}config{configuration}.pt'
path = f"/content/gdrive/My Drive/models/{model_save_name}"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# map_location lets a checkpoint written on a GPU runtime load on a CPU one.
checkpoint = torch.load(path, map_location=device) #

em = BreakoutEnvManager(device)

strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)
# The agent derives the current epsilon from its step counter, so restoring
# the counter resumes the exploration schedule where it left off.
agent = Agent(strategy, em.num_actions_available(), device)
agent.current_step = checkpoint['agent_current_step'] #
# NOTE(review): states are stored in replay memory on `device`; with
# memory_size this large that may exhaust GPU memory -- confirm.
memory = ReplayMemory(memory_size)

policy_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)
policy_net.load_state_dict(checkpoint['policy_state_dict']) #
# DQN has no dropout or batch-norm layers, so eval mode does not alter training.
policy_net.eval()

target_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)
# Keep the checkpointed target weights; they were previously overwritten
# with the policy weights immediately after being restored.
target_net.load_state_dict(checkpoint['target_state_dict']) #
target_net.eval()
optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)
optimizer.load_state_dict(checkpoint['optimizer']) #

# ---- Training loop ----
episode_points = []
episode_times = []
try:
    for episode in range(last_ep_cp+1, last_ep_cp+num_episodes):

        if episode % 500 == 0:
            save_state(episode, policy_net, target_net, optimizer, agent)

        start_time = time.time()
        em.reset()
        total_reward = 0
        state = em.get_state()
        render_game = False #(episode > 1) and (episode % 900 == 0)

        for timestep in count():
            if render_game: em.render()
            action = agent.select_action(state, policy_net)
            reward = em.take_action(action)
            # Accumulate a plain Python number so the episode history can be
            # plotted and written with NumPy even when tensors live on the GPU.
            total_reward += reward.item()
            next_state = em.get_state()
            memory.push(Experience(state, action, next_state, reward))
            state = next_state

            if memory.can_provide_sample(batch_size):
                experiences = memory.sample(batch_size)
                states, actions, rewards, next_states = extract_tensors(experiences)

                current_q_values = QValues.get_current(policy_net, states, actions)
                next_q_values = QValues.get_next(target_net, next_states)
                target_q_values = (next_q_values * gamma) + rewards

                loss = F.mse_loss(current_q_values, target_q_values.unsqueeze(1))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if em.done:
                episode_points.append(total_reward)

                end_time = time.time()
                time_elapsed = end_time - start_time
                episode_times.append(time_elapsed)

                if episode %50 == 0:
                    plot(episode_points, 100, episode_times)
                    np.savetxt("config_1_ep_points.csv", episode_points, delimiter =",")
                    np.savetxt("config_1_ep_times.csv", episode_times, delimiter=",")
                break

        if episode % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())
finally:
    # Release the environment even if training is interrupted or fails.
    em.close()

RuntimeError: ignored

In [None]:
# Redraw the score curve with a 100-episode moving average, using the
# episode_points / episode_times lists filled in by the training cell.
plot(episode_points, 100, episode_times)