# Laboratorium 4 (4 pkt.)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem wcześniej przygotowanych środowisk: *FrozenLake* i *Pacman* oraz środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [1]:
from collections import deque
import gym
import numpy as np
import random

Dołączenie bibliotek ze środowiskami:

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp /content/drive/MyDrive/ISI/env/FrozenLakeMDP.py .
!cp /content/drive/MyDrive/ISI/env/FrozenLakeMDPExtended.py .
from FrozenLakeMDP import frozenLake
from FrozenLakeMDPExtended import frozenLakeExtended


Dołączenie bibliotek do obsługi sieci neuronowych

In [2]:
%tensorflow_version 1.x
from keras import Model
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import to_categorical

TensorFlow 1.x selected.


Using TensorFlow backend.


## Zadanie 1 - Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenie jest zaimplementowanie algorytmu Deep Q-Network. Wartoscią oczekiwaną sieci jest:
\begin{equation}
        Q(s_t, a_t) = r_{t+1} + \gamma \text{max}_a Q(s_{t + 1}, a)
\end{equation}
</p>

In [None]:
class DQNAgent:
    def __init__(self, action_size, learning_rate, model, get_legal_actions):
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.learning_rate = learning_rate
        self.model = model
        
        self.get_legal_actions = get_legal_actions

    def remember(self, state, action, reward, next_state, done):
        #Function adds information to the memory about last action and its results
        self.memory.append((state, action, reward, next_state, done)) 

    def get_action(self, state):
        possible_actions = self.get_legal_actions(state)  
        epsilon = self.epsilon  
        if random.random()<epsilon:
            chosen_action = random.choice(possible_actions)
        else:   
            chosen_action = self.get_best_action(state) 
        return chosen_action

  
    def get_best_action(self, state):
        best_actions = self.model.predict(state)
        #print("BEST: ",best_actions)
        best_action = np.argmax(best_actions)
        #s 83  'numpy.int64' object is not iterable  max(best_action[0]) 
        #print(best_action)
        return best_action


    def replay(self, batch_size):

      if batch_size >     len(self.memory):
        raise NameError('!!! To BIG batch_size !!! Should be lower than: ',len(self.memory))

      nasz_batch = random.sample(self.memory,batch_size)
      x = []
      y = []
      for sample in nasz_batch:#wy
          state, action, reward, next_state, done = sample
          x.append(state.flatten())
          target = self.model.predict(state)
          #print(target)
          if done:
            target[0][action] = reward
          else:
            target[0][action] = reward + self.gamma * max(self.model.predict(next_state)[0])
          y.append(target.flatten())

      self.model.train_on_batch(np.array(x), np.array(y))
      self.update_epsilon_value()
        
        

    def update_epsilon_value(self):
      temp = self.epsilon*self.epsilon_decay
      if temp < self.epsilon_min:
          self.epsilon = self.epsilon_min
      else:
          self.epsilon = temp

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLake*, warstwa wejściowa powinna mieć tyle neuronów ile jest możlliwych stanów, warstwa wyjściowa tyle neuronów ile jest możliwych akcji do wykonania:

In [None]:
env = frozenLake("8x8")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001
print(state_size)
print(action_size)

64
4


In [None]:
env = frozenLake("8x8")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

model = [
    Dense(16, input_dim=state_size,activation='relu'),
    Dense(32,activation='relu'),
    #Dropout(0.5),
    Dense(16,activation='relu'),
    Dense(action_size)#,activation='softmax'
] 
model = Sequential(model)
model.compile(loss="mean_squared_error",
             optimizer=Adam(lr=learning_rate))

 Czas nauczyć agenta poruszania się po środowisku *FrozenLake*, jako stan przyjmij wektor o liczbie elementów równej liczbie możliwych stanów, z wartością 1 ustawioną w komórce o indeksie równym aktualnemu stanowi, pozostałe elementy mają być wypełnione zerami:
* 1 pkt < 35 epok,
* 0.5 pkt < 60 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [None]:
agent = DQNAgent(action_size, learning_rate, model, env.get_possible_actions)#0.999

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 10000
counter = 0
for e in range(EPISODES):

    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()
    
        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        state = np.array([to_categorical(env_state, num_classes=state_size)])
        
        for time in range(1000):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #
            next_state = np.array([to_categorical(next_state_env, num_classes=state_size)])

            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        if len(agent.memory) > batch_size:
          agent.replay(batch_size)
        
        summary.append(total_reward)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    if np.mean(summary) > 0.9:
        print ("You Win!")
        break


epoch #0	mean reward = 0.000	epsilon = 0.680
epoch #1	mean reward = 0.010	epsilon = 0.615
epoch #2	mean reward = 0.000	epsilon = 0.557
epoch #3	mean reward = 0.010	epsilon = 0.504
epoch #4	mean reward = 0.000	epsilon = 0.456
epoch #5	mean reward = 0.010	epsilon = 0.412
epoch #6	mean reward = 0.010	epsilon = 0.373
epoch #7	mean reward = 0.030	epsilon = 0.338
epoch #8	mean reward = 0.070	epsilon = 0.305
epoch #9	mean reward = 0.100	epsilon = 0.276
epoch #10	mean reward = 0.390	epsilon = 0.250
epoch #11	mean reward = 0.000	epsilon = 0.226
epoch #12	mean reward = 0.030	epsilon = 0.205
epoch #13	mean reward = 0.130	epsilon = 0.185
epoch #14	mean reward = 0.620	epsilon = 0.168
epoch #15	mean reward = 0.350	epsilon = 0.152
epoch #16	mean reward = 0.630	epsilon = 0.137
epoch #17	mean reward = 0.780	epsilon = 0.124
epoch #18	mean reward = 0.910	epsilon = 0.112
You Win!


In [None]:
model = [
    Dense(64, input_dim=state_size,activation='relu'),
    Dense(256,activation='relu'),
    #Dropout(0.5),
    Dense(32,activation='relu'),
    Dense(action_size)#,activation='softmax'
] 
model = Sequential(model)
model.compile(loss="mean_squared_error",
             optimizer=Adam(lr=learning_rate))

In [None]:
agent = DQNAgent(action_size, learning_rate, model, env.get_possible_actions)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 10000
counter = 0
for e in range(EPISODES):

    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()
    
        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        state = np.array([to_categorical(env_state, num_classes=state_size)])
        
        for time in range(1000):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #
            next_state = np.array([to_categorical(next_state_env, num_classes=state_size)])

            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        if len(agent.memory) > batch_size:
          agent.replay(batch_size)
        
        summary.append(total_reward)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    if np.mean(summary) > 0.9:
        print ("You Win!")
        break

epoch #0	mean reward = 0.000	epsilon = 0.679
epoch #1	mean reward = 0.000	epsilon = 0.615
epoch #2	mean reward = 0.000	epsilon = 0.556
epoch #3	mean reward = 0.030	epsilon = 0.503
epoch #4	mean reward = 0.010	epsilon = 0.455
epoch #5	mean reward = 0.000	epsilon = 0.412
epoch #6	mean reward = 0.080	epsilon = 0.373
epoch #7	mean reward = 0.120	epsilon = 0.337
epoch #8	mean reward = 0.070	epsilon = 0.305
epoch #9	mean reward = 0.240	epsilon = 0.276
epoch #10	mean reward = 0.330	epsilon = 0.250
epoch #11	mean reward = 0.190	epsilon = 0.226
epoch #12	mean reward = 0.060	epsilon = 0.204
epoch #13	mean reward = 0.400	epsilon = 0.185
epoch #14	mean reward = 0.220	epsilon = 0.167
epoch #15	mean reward = 0.440	epsilon = 0.151
epoch #16	mean reward = 0.590	epsilon = 0.137
epoch #17	mean reward = 0.410	epsilon = 0.124
epoch #18	mean reward = 0.210	epsilon = 0.112
epoch #19	mean reward = 0.440	epsilon = 0.102
epoch #20	mean reward = 0.270	epsilon = 0.092
epoch #21	mean reward = 0.280	epsilon = 0.08

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLakeExtended*, tym razem stan nie jest określany poprzez pojedynczą liczbę, a przez 3 tablice:
* pierwsza zawierająca informacje o celu,
* druga zawierająca informacje o dziurach,
* trzecia zawierająca informację o położeniu gracza.

In [None]:
env = frozenLakeExtended("4x4")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001
state_size *= 3

model = [
    Dense(16, input_dim=state_size,activation='relu'),
    Dense(32,activation='relu'),
    #Dropout(0.5),
    Dense(16,activation='relu'),
    Dense(action_size)#,activation='softmax'
] 
model = Sequential(model)
model.compile(loss="mean_squared_error",
             optimizer=Adam(lr=learning_rate))

 Czas nauczyć agenta poruszania się po środowisku *FrozenLakeExtended*, jako stan przyjmij wektor składający się ze wszystkich trzech tablic (2 pkt.):

In [None]:
agent = DQNAgent(action_size, learning_rate, model, env.get_possible_actions )

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 2000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()
    
        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        state = np.array([np.array(env_state).flatten()])
        for time in range(1000):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #
            next_state = np.array([np.array(next_state_env).flatten()])
            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        if len(agent.memory) > batch_size:
          agent.replay(batch_size)
        
        summary.append(total_reward)
    if np.mean(summary) > 0.9:
        print ("You Win!")
        break
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))

epoch #0	mean reward = 0.000	epsilon = 0.681
epoch #1	mean reward = 0.000	epsilon = 0.616
epoch #2	mean reward = 0.030	epsilon = 0.557
epoch #3	mean reward = 0.260	epsilon = 0.504
epoch #4	mean reward = 0.150	epsilon = 0.456
epoch #5	mean reward = 0.260	epsilon = 0.413
epoch #6	mean reward = 0.430	epsilon = 0.373
epoch #7	mean reward = 0.560	epsilon = 0.338
epoch #8	mean reward = 0.610	epsilon = 0.306
epoch #9	mean reward = 0.560	epsilon = 0.277
epoch #10	mean reward = 0.590	epsilon = 0.250
epoch #11	mean reward = 0.730	epsilon = 0.226
epoch #12	mean reward = 0.840	epsilon = 0.205
epoch #13	mean reward = 0.770	epsilon = 0.185
epoch #14	mean reward = 0.860	epsilon = 0.168
epoch #15	mean reward = 0.780	epsilon = 0.152
epoch #16	mean reward = 0.640	epsilon = 0.137
epoch #17	mean reward = 0.820	epsilon = 0.124
epoch #18	mean reward = 0.900	epsilon = 0.112
epoch #19	mean reward = 0.870	epsilon = 0.102
epoch #20	mean reward = 0.820	epsilon = 0.092
epoch #21	mean reward = 0.810	epsilon = 0.08

Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPool*](https://gym.openai.com/envs/CartPole-v0/):

In [7]:
env = gym.make("CartPole-v0").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001
# PO 10 rewaerd 61
# model = [
#     Dense(16, input_dim=state_size,activation='relu'),
#     Dense(32,activation='relu'),
#     #Dropout(0.5),
#     Dense(16,activation='relu'),
#     Dense(action_size)#,activation='softmax'
# ] 
# model = Sequential(model)
# model.compile(loss="mean_squared_error",
#              optimizer=Adam(lr=learning_rate))
# PO 6 reaward 97 i waliło oclaba
# model = [
#     Dense(64, input_dim=state_size,activation='relu'),
#     Dense(256,activation='relu'),
#     #Dropout(0.5),
#     Dense(32,activation='relu'),
#     Dense(action_size)#,activation='softmax'
# ] 
# model = Sequential(model)
# model.compile(loss="mean_squared_error",
#              optimizer=Adam(lr=learning_rate))
model = [
    Dense(64, input_dim=state_size,activation='relu'),#ten super, wygrywa w 7 przjsciu
    Dense(128,activation='relu'),
    Dense(256,activation='relu'),
    #Dropout(0.5),
    Dense(64,activation='relu'),
    Dense(16,activation='relu'),
    Dense(action_size)#,activation='softmax'
] 
model = Sequential(model)
model.compile(loss="mean_squared_error",
             optimizer=Adam(lr=learning_rate))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Czas nauczyć agenta gry w środowisku *CartPool*:
* 1 pkt < 10 epok,
* 0.5 pkt < 20 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [None]:
import types#epsilon_decay = 0.999
def get_possible_actions(self, state):
    """ return a tuple of possible actions in a given state """
    return [0,1]
env.get_possible_actions = types.MethodType(get_possible_actions, env)
#, type(env).__name__

agent = DQNAgent(action_size, learning_rate, model, env.get_possible_actions)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()
    
        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        #print(env_state)
        state = np.array([np.array(env_state).flatten()])
        
        for time in range(300):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #
            next_state = np.array([np.array(next_state_env).flatten()])
            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        if len(agent.memory) > batch_size:
          agent.replay(batch_size)
        summary.append(total_reward)
    if np.mean(summary) > 195:
        print ("You Win!")
        break
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))

epoch #0	mean reward = 17.580	epsilon = 0.680
epoch #1	mean reward = 20.050	epsilon = 0.615
epoch #2	mean reward = 27.340	epsilon = 0.557
epoch #3	mean reward = 45.800	epsilon = 0.504
epoch #4	mean reward = 115.620	epsilon = 0.456
epoch #5	mean reward = 175.540	epsilon = 0.412
epoch #6	mean reward = 187.800	epsilon = 0.373
You Win!


In [None]:
import types   #epsilon_decay = 0.99
def get_possible_actions(self, state):
    """ return a tuple of possible actions in a given state """
    return [0,1]
env.get_possible_actions = types.MethodType(get_possible_actions, env)
#, type(env).__name__

agent = DQNAgent(action_size, learning_rate, model, env.get_possible_actions)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()
    
        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        state = np.array([np.array(env_state).flatten()])
        
        for time in range(300):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #
            next_state = np.array([np.array(next_state_env).flatten()])
            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        if len(agent.memory) > batch_size:
          agent.replay(batch_size)
        summary.append(total_reward)
    if np.mean(summary) > 195:
        print ("You Win!")
        break
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))


epoch #0	mean reward = 13.290	epsilon = 0.286
epoch #1	mean reward = 16.020	epsilon = 0.105
epoch #2	mean reward = 50.770	epsilon = 0.038
epoch #3	mean reward = 161.320	epsilon = 0.014
You Win!
