<a href="https://colab.research.google.com/github/MoustHolmes/AMAS_Project/blob/Aske/DQN_sol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class DeepQNetwork(nn.Module):
    """Three-layer fully connected network mapping a state to one value per action.

    The network bundles its own Adam optimizer (``optimizer``), an MSE loss
    (``loss``) and the device it lives on (``device``), so callers can train it
    without any extra setup.

    Args:
        lr: Learning rate handed to the Adam optimizer.
        input_dims: Sequence that is unpacked into ``nn.Linear`` as the input
            size of the first layer (e.g. ``[8]``).
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        n_actions: Number of outputs (one per action).
    """

    def __init__(self, lr, input_dims, fc1_dims, fc2_dims,
                 n_actions):
        super().__init__()
        self.input_dims, self.fc1_dims = input_dims, fc1_dims
        self.fc2_dims, self.n_actions = fc2_dims, n_actions

        # Layers are created in input -> output order.
        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()

        # Use the first CUDA device when one is available, otherwise the CPU.
        device_name = 'cuda:0' if T.cuda.is_available() else 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state):
        """Return the raw (un-activated) output of the last layer for ``state``."""
        hidden = F.relu(self.fc1(state))
        return self.fc3(F.relu(self.fc2(hidden)))

class Agent():
    """Epsilon-greedy Q-learning agent with a replay buffer and a target network.

    Two ``DeepQNetwork`` instances are kept: ``Q_eval`` is trained on every
    call to ``learn`` and ``Q_next`` is a periodically refreshed copy used to
    evaluate bootstrap targets. The next action is selected with ``Q_eval``
    and evaluated with ``Q_next`` (the Double-DQN form of the target).

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        lr: Learning rate forwarded to both networks.
        input_dims: Sequence describing the observation shape (e.g. ``[8]``).
        batch_size: Number of transitions sampled per learning step.
        n_actions: Number of discrete actions.
        max_mem_size: Capacity of the replay buffer; oldest entries are
            overwritten once it is full.
        eps_end: Lower bound for ``epsilon``.
        eps_dec: Amount subtracted from ``epsilon`` after each learning step.
    """

    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
            max_mem_size=100000, eps_end=0.05, eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = list(range(n_actions))
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0      # total number of transitions ever stored
        self.iter_cntr = 0     # number of completed learning steps
        self.replace_target = 100  # learning steps between target-network syncs

        self.Q_eval = DeepQNetwork(lr, n_actions=n_actions, input_dims=input_dims,
                                    fc1_dims=256, fc2_dims=256)
        self.Q_next = DeepQNetwork(lr, n_actions=n_actions, input_dims=input_dims,
                                    fc1_dims=256, fc2_dims=256)
        # Start the target network as an exact copy of the online network so
        # the first targets are not produced by unrelated random weights.
        self.Q_next.load_state_dict(self.Q_eval.state_dict())

        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # The builtin ``bool`` is used because the ``np.bool`` alias was
        # deprecated in NumPy 1.20 and is missing from NumPy 1.24-1.26.
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, terminal):
        """Write one transition into the replay buffer, overwriting the oldest slot when full."""
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = terminal

        self.mem_cntr += 1

    def choose_action(self, observation):
        """Return an action for ``observation`` using the epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is drawn from
        ``action_space``; otherwise the arg-max of ``Q_eval`` is returned.
        """
        if np.random.random() > self.epsilon:
            # Cast to float32 so observations of any float dtype (or plain
            # lists) match the dtype of the network weights, and add the
            # leading batch dimension the network is fed with.
            obs = np.asarray(observation, dtype=np.float32)
            state = T.tensor(obs[np.newaxis]).to(self.Q_eval.device)
            # Acting needs no gradients; skip building the autograd graph.
            with T.no_grad():
                actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Does nothing until at least ``batch_size`` transitions are stored.
        After the step, ``epsilon`` is decayed (never below ``eps_min``) and
        every ``replace_target`` steps ``Q_next`` is overwritten with the
        weights of ``Q_eval``.
        """
        if self.mem_cntr < self.batch_size:
            return

        device = self.Q_eval.device
        self.Q_eval.optimizer.zero_grad()

        # Only sample from the part of the buffer that has been filled.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        state_batch = T.tensor(self.state_memory[batch]).to(device)
        new_state_batch = T.tensor(self.new_state_memory[batch]).to(device)
        action_batch = T.tensor(self.action_memory[batch], dtype=T.long).to(device)
        reward_batch = T.tensor(self.reward_memory[batch]).to(device)
        terminal_batch = T.tensor(self.terminal_memory[batch]).to(device)

        # Value of the action that was actually taken in each sampled state.
        q_pred = self.Q_eval.forward(state_batch) \
            .gather(1, action_batch.unsqueeze(1)).squeeze(1)

        # The target is treated as a constant: no gradient is needed through
        # either network here, so no graph is built and no stale gradients
        # accumulate on Q_next.
        with T.no_grad():
            q_next = self.Q_next.forward(new_state_batch)
            max_actions = T.argmax(self.Q_eval.forward(new_state_batch), dim=1)
            next_values = q_next.gather(1, max_actions.unsqueeze(1)).squeeze(1)
            # Terminal transitions have no future return.
            next_values[terminal_batch] = 0.0
            q_target = reward_batch + self.gamma * next_values

        loss = self.Q_eval.loss(q_pred, q_target)
        loss.backward()
        self.Q_eval.optimizer.step()

        self.iter_cntr += 1
        # Clamp so epsilon can never dip below the configured floor.
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

        if self.iter_cntr % self.replace_target == 0:
            self.Q_next.load_state_dict(self.Q_eval.state_dict())

In [2]:
!pip3 install Box2d-py

Collecting Box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/87/34/da5393985c3ff9a76351df6127c275dcb5749ae0abbe8d5210f06d97405d/box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448kB)
[K     |▊                               | 10kB 10.7MB/s eta 0:00:01[K     |█▌                              | 20kB 14.7MB/s eta 0:00:01[K     |██▏                             | 30kB 9.7MB/s eta 0:00:01[K     |███                             | 40kB 8.3MB/s eta 0:00:01[K     |███▋                            | 51kB 4.6MB/s eta 0:00:01[K     |████▍                           | 61kB 5.2MB/s eta 0:00:01[K     |█████▏                          | 71kB 5.2MB/s eta 0:00:01[K     |█████▉                          | 81kB 5.7MB/s eta 0:00:01[K     |██████▋                         | 92kB 5.6MB/s eta 0:00:01[K     |███████▎                        | 102kB 5.8MB/s eta 0:00:01[K     |████████                        | 112kB 5.8MB/s eta 0:00:01[K     |████████▊                       | 1

In [None]:
import gym
import numpy as np

if __name__ == '__main__':
    # Train the agent on LunarLander-v2 and log per-episode progress.
    env = gym.make('LunarLander-v2')
    agent = Agent(lr=0.0005, gamma=0.99, epsilon=1.0, eps_end=0.01,
                  input_dims=[8], n_actions=4, batch_size=64)
    n_games = 600
    scores = []
    eps_history = []

    for i in range(n_games):
        observation = env.reset()
        score = 0
        done = False
        # One episode: act, record the transition, learn, advance the state.
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.store_transition(observation, action, reward,
                                   observation_, done)
            score += reward
            agent.learn()
            observation = observation_

        scores.append(score)
        eps_history.append(agent.epsilon)

        # Running mean over (at most) the last 100 episodes.
        avg_score = np.mean(scores[-100:])

        print('episode ', i, 'score %.2f' % score,
                'average score %.2f' % avg_score,
                'epsilon %.2f' % agent.epsilon)

    # 1-based episode numbers.
    x = list(range(1, n_games + 1))

episode  0 score -102.17 average score -102.17 epsilon 1.00
episode  1 score -412.06 average score -257.12 epsilon 0.95
episode  2 score -236.66 average score -250.30 epsilon 0.90
episode  3 score -130.14 average score -220.26 epsilon 0.86
episode  4 score -124.21 average score -201.05 epsilon 0.82
episode  5 score -183.00 average score -198.04 epsilon 0.74
episode  6 score 41.20 average score -163.86 epsilon 0.70
episode  7 score -388.36 average score -191.92 epsilon 0.62
episode  8 score -23.23 average score -173.18 epsilon 0.56
episode  9 score -144.11 average score -170.27 epsilon 0.49
episode  10 score -120.71 average score -165.77 epsilon 0.45
episode  11 score -107.78 average score -160.94 epsilon 0.40
episode  12 score -186.48 average score -162.90 epsilon 0.35
episode  13 score -101.83 average score -158.54 epsilon 0.31
episode  14 score -179.06 average score -159.91 epsilon 0.24
episode  15 score -387.38 average score -174.12 epsilon 0.19
episode  16 score -172.43 average sco

In [4]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay
from gym.wrappers import Monitor

from pyvirtualdisplay import Display
# Create and start an invisible 1400x900 virtual display via pyvirtualdisplay.
# NOTE(review): presumably required so env.render() / video recording works on
# a headless machine such as Colab — confirm.
display = Display(visible=0, size=(1400, 900))
display.start()
# Yes, I shamelessly borrowed this snippet,
# and no, I have no idea what it does, but it works!
def show_video():
  """Embed the first ``video/*.mp4`` file found as an inline HTML5 video.

  The file is base64-encoded into a ``data:`` URI and displayed through
  IPython. If no matching file exists, "Could not find video" is printed
  instead.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode is all that is needed, and the context manager
  # guarantees the file handle is closed after reading.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return ``env`` wrapped in a gym ``Monitor`` that writes to ``./video`` (``force=True``)."""
  return Monitor(env, './video', force=True)

In [None]:
# Run one monitored episode with the trained agent, then show the video.
env = wrap_env(env=gym.make('LunarLander-v2'))
observation = env.reset()
action_space_size = env.action_space.n
observation_space_size = env.observation_space.shape
state_space_size = 8
final_score = 0
done = False
while not done:
    env.render()

    # your agent goes here
    action = agent.choose_action(observation)

    observation, reward, done, info = env.step(action)
    final_score += reward

env.close()
show_video()

NameError: ignored

In [None]:
# Display the total reward accumulated over the monitored evaluation episode.
final_score

-55.07237619753844