In [66]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np 
import gym

In [6]:
def preprocess_obs(obs):
    temp = np.copy(obs)
    temp = temp[31:193, 7:152]
    temp = np.dot(temp[...,:3], [0.2989, 0.5870, 0.1140])
    temp = temp / 148.3038
    return temp

In [7]:
env = gym.make("Breakout-v0")

In [15]:
obs = env.reset()
obs = preprocess_obs(obs)
print(obs.shape)
print(type(obs.shape))
print((42,*obs.shape))
print(type(obs))

(162, 145)
<class 'tuple'>
(15, 162, 145)
<class 'numpy.ndarray'>


In [19]:
print(env.action_space.n)

4


In [26]:
obs = env.reset()
obs = preprocess_obs(obs)


In [73]:
class MemoryBuffer:
    def __init__(self,maxlen, n_actions, state_dim):
        self.maxlen = maxlen
        self.n_actions = n_actions
        self.state_memory = np.zeros((maxlen, *state_dim),dtype=np.float32)
        self.new_state_memory = np.zeros((maxlen, *state_dim),dtype=np.float32)
        self.action_memory = np.zeros((maxlen, n_actions),dtype=np.int32)
        self.reward_memory = np.zeros((maxlen),dtype=np.float32)
        self.terminal_memory = np.zeros((maxlen),dtype=np.uint8)
        self.mem_counter = 0
    def store_transition(self,state,action,reward,state_,terminal):
        index = self.mem_counter % self.maxlen
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        actions = np.zeros(self.n_actions)
        actions[action] = 1.0
        self.action_memory[index] = actions
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1 - terminal
        self.mem_counter+=1
    def sample_memory(self,batch_size):
        mem_size = self.mem_counter if self.mem_counter < self.maxlen else self.maxlen
        batch = np.random.choice(mem_size,batch_size,replace=False)
        state_batch = self.state_memory[batch]
        action_batch = self.action_memory[batch]
        reward_batch = self.reward_memory[batch]
        terminal_batch = self.terminal_memory[batch]
        new_state_batch = self.new_state_memory[batch]
        return (state_batch,action_batch,reward_batch,new_state_batch,terminal_batch)

In [74]:
buffer = MemoryBuffer(20,4,(162,145))
current = env.reset()
current = preprocess_obs(current)
for i in range(25):
    new_state, reward, done, info = env.step(0)
    new_state = preprocess_obs(new_state)
    buffer.store_transition(current,0,reward,new_state,done)
print(buffer.mem_counter)
state_batch, action_batch, reward_batch, new_state_batch, terminal_batch = buffer.sample_memory(10) 
print(state_batch.shape)
print(action_batch.shape)
print(reward_batch.shape)
print(new_state_batch.shape)
print(terminal_batch.shape)

25
(10, 162, 145)
(10, 4)
(10,)
(10, 162, 145)
(10,)


In [211]:
class Net(nn.Module): 
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1,32,5)
        self.conv2 = nn.Conv2d(32,64,5)
        self.conv3 = nn.Conv2d(64,128,5)
        self.fc1 = nn.Linear(28672,64)
        self.fc2 = nn.Linear(64,4)
    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2,2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2,2))
        x = F.max_pool2d(F.relu(self.conv3(x)), (2,2))
        x = x.view(-1,28672)
        
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x))
        return x

In [212]:
net = Net()

net.zero_grad()
optimizer = optim.Adam(net.parameters(), lr=0.001)
state_batch_tensor = torch.Tensor(state_batch)
print(state_batch_tensor.shape)
action_batch_tensor = torch.Tensor(action_batch)
state_batch_tensor=state_batch_tensor.view(-1,1,162,145)
output = net.forward(state_batch_tensor)
print(output.shape)
loss = nn.MSELoss()
print(action_batch_tensor.shape)
loss = loss(output,action_batch_tensor)
loss.backward()

torch.Size([10, 162, 145])
torch.Size([10, 4])
torch.Size([10, 4])


  app.launch_new_instance()


In [213]:
env = gym.make("Breakout-v0")
agent = Agent()
scores = []
n_games = 500
score = 0
eps_history = []

for i in range(n_games):
    print(f'episode: {i} score = {score}')
    score = 0
    eps_history.append(agent.epsilon)
    obs = preprocess_obs(env.reset())
    done = False 
    while not done:
        action = agent.choose_action(obs)
        obs_, reward, done, info = env.step(action)
        obs_ = preprocess_obs(obs_)
        score+=reward
        agent.memory.store_transition(obs,action,reward,obs_,done)
        agent.train()
        obs = obs_
    score.append(score)

episode: 0 score = 0
()
(64, 4)


  app.launch_new_instance()


torch.Size([64])
torch.Size([64])
torch.Size([64])


IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [64], [64, 4]

In [209]:
class Agent:
    epsilon = 1
    eps_decay = 0.9998
    eps_min = 0.007
    batch_size = 64
    obs_dim = (162,145)
    gamma = 0.99
    def __init__(self):
        self.model = Net()
        self.optimzer = optim.Adam(self.model.parameters(),lr=0.001)
        self.loss = nn.MSELoss()
        self.memory = MemoryBuffer(20000,4,self.obs_dim)
    def train(self):
        if self.memory.mem_counter > self.batch_size:
            state_batch, action_batch, reward_batch, new_state_batch, terminal_batch = self.memory.sample_memory(self.batch_size)
            
            action_values = np.array(4,dtype=np.uint8)
            action_indices = np.dot(action_batch, action_values)
            
            print(action_values.shape)
            print(action_indices.shape)
            
            state_batch_tensor = torch.from_numpy(state_batch).view(-1,1,*self.obs_dim)
            new_state_batch_tensor = torch.from_numpy(new_state_batch).view(-1,1,*self.obs_dim)
            reward_batch_tensor = torch.from_numpy(reward_batch)
            terminal_batch_tensor = torch.from_numpy(terminal_batch)
            
            q_eval = self.model.forward(state_batch_tensor)
            q_target = self.model.forward(state_batch_tensor)
            q_next = self.model.forward(new_state_batch_tensor)
            
            batch_index = np.arange(self.batch_size, dtype=np.int32)
            
            print(reward_batch_tensor.shape)
            print(terminal_batch_tensor.shape)
            stuff = torch.max(q_next,dim=1)[0]
            print(stuff.shape)
            q_target[batch_index,action_indices] = reward_batch_tensor + self.gamma*torch.max(q_next,dim=1)[0]*terminal_batch_tensor
            self.epsilon = self.epsilon * self.eps_decay if self.epsilon > self.eps_min else self.eps_min
            loss = self.loss(q_target,q_eval)
            loss.backward()
            self.optimizer.step()
    def choose_action(self,obs):
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(4)
        else:
            actions = self.model.forward(obs)
            action = torch.argmax(actions).item()
        return action