# Laboratorium 5 (4 pkt)

Celem czwartego laboratorium jest zapoznanie się z algorytmami głębokiego uczenia aktywnego oraz ich zaimplementowanie. Zaimplementowane algorytmy będą testowane z wykorzystaniem środowiska *CartPole* z OpenAI Gym.


Dołączenie standardowych bibliotek

In [None]:
from collections import deque
import gym
import numpy as np
import random

Dołączenie bibliotek do obsługi sieci neuronowych

In [None]:
%tensorflow_version 1.x
from keras import Model
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import to_categorical

## Zadanie 1 - Double Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenia jest zaimplementowanie algorytmu Double Deep Q-Network. Wartością oczekiwaną sieci jest:
\begin{equation}
       Q^*(s, a) \approx r + \gamma \max_{a'} Q_{\theta'}(s', a')
\end{equation}
a wagi pomiędzy sieciami wymieniane są co dziesięć aktualizacji wag sieci sterującej poczynaniami agenta ($Q$).
</p>

In [None]:
class DDQNAgent:
    """Epsilon-greedy Q-learning agent with an online and a target network.

    The online network (``model``) selects actions and is the one being
    trained. The target network (``target_model``) supplies the bootstrap
    value ``max_a' Q_target(s', a')`` and is blended towards the online
    network once every ten calls to :meth:`replay`.

    Args:
        state_size: Size of the observation vector (stored, not otherwise used).
        action_size: Number of discrete actions (stored, not otherwise used).
        get_legal_actions: Callable ``state -> sequence of actions`` used when
            a random exploratory action is drawn.
        input_model: Compiled model exposing ``predict``, ``train_on_batch``,
            ``get_weights`` and ``set_weights``.
        target_model: Optional separate model used as the target network.
            When omitted, a structural copy of ``input_model`` is created
            with ``keras.models.clone_model``.
    """

    def __init__(self, state_size, action_size, get_legal_actions, input_model,
                 target_model=None):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer; oldest entries drop out
        self.gamma = 0.95  # discount rate
        self.epsilon = 0.5  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99
        self.learning_rate = 0.001
        self.model = input_model
        if target_model is None:
            # The target network has to be a distinct object. If it were the
            # same object as the online model, every weight sync would be a
            # no-op and the targets would come from the network being trained.
            from keras.models import clone_model
            target_model = clone_model(input_model)
        self.target_model = target_model
        # Start both networks from identical weights (hard copy, no blending).
        self.target_model.set_weights(self.model.get_weights())
        self.replay_counter = 1
        self.get_legal_actions = get_legal_actions

    def remember(self, state, action, reward, next_state, done):
        """Append one transition ``(s, a, r, s', done)`` to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def get_action(self, state):
        """Pick an action epsilon-greedily.

        With probability ``self.epsilon`` a uniformly random legal action is
        returned, otherwise the action chosen by :meth:`get_best_action`.
        """
        possible_actions = self.get_legal_actions(state)
        if random.random() < self.epsilon:
            return random.choice(possible_actions)
        return self.get_best_action(state)

    def get_best_action(self, state):
        """Return the index of the largest output of the online network for ``state``."""
        return np.argmax(self.model.predict(state))

    def replay(self, batch_size):
        """Train the online network on a random minibatch from the buffer.

        For each sampled transition the target for the taken action is
        ``reward`` when the episode ended, otherwise
        ``reward + gamma * max(Q_target(next_state))``; the other outputs keep
        the online network's current predictions. After the training step
        epsilon is decayed, and on every tenth call the target network is
        updated via :meth:`update_weights`.

        Args:
            batch_size: Number of transitions to sample.

        Raises:
            NameError: If ``batch_size`` exceeds the number of stored
                transitions (exception type kept for backward compatibility).
        """
        if batch_size > len(self.memory):
            # One formatted message instead of a (text, number) args tuple.
            raise NameError(
                '!!! To BIG batch_size !!! Should be lower than: {}'.format(
                    len(self.memory)))

        batch = random.sample(self.memory, batch_size)
        states = np.array(
            [np.asarray(state).flatten() for state, _, _, _, _ in batch])
        next_states = np.array(
            [np.asarray(next_state).flatten() for _, _, _, next_state, _ in batch])

        # Two batched forward passes instead of two predict() calls per sample.
        targets = self.model.predict(states)
        next_q = self.target_model.predict(next_states)
        for row, (_, action, reward, _, done) in enumerate(batch):
            if done:
                targets[row][action] = reward
            else:
                targets[row][action] = reward + self.gamma * np.max(next_q[row])

        self.model.train_on_batch(states, targets)
        self.update_epsilon_value()

        # The counter starts at 1, so this branch fires on replay 10, 20, 30, ...
        if self.replay_counter >= 10:
            self.replay_counter = 1
            self.update_weights()
        else:
            self.replay_counter += 1

    def update_epsilon_value(self):
        """Multiply epsilon by ``epsilon_decay``, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_weights(self, t=0.7):
        """Blend the online weights into the target network.

        Each target tensor becomes ``t * online + (1 - t) * target``.

        Args:
            t: Share of the online network's weights in the blend.
        """
        # Read from self.model, not from a module-level ``model`` variable.
        online_weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        blended = [
            t * online + (1 - t) * target
            for online, target in zip(online_weights, target_weights)
        ]
        self.target_model.set_weights(blended)

Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPole*](https://gym.openai.com/envs/CartPole-v0/):

In [None]:
# CartPole-v0 environment; ``.env`` takes the inner environment of the object
# returned by gym.make (presumably to bypass the step-limit wrapper -- confirm).
env = gym.make("CartPole-v0").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001

# Fully connected Q-network: state vector in, one output per action.
# Original author's note: this layout worked very well, winning on the 7th pass.
model = Sequential()
model.add(Dense(64, input_dim=state_size, activation='relu'))
for units in (128, 256, 64, 16):
    model.add(Dense(units, activation='relu'))
# No activation argument on the output layer.
model.add(Dense(action_size))
model.compile(loss="mean_squared_error", optimizer=Adam(lr=learning_rate))

Czas nauczyć agenta gry w środowisku *CartPole*:

In [None]:
import types


def get_possible_actions(self, state):
    """Return the actions available in ``state``; always ``[0, 1]`` here."""
    return [0, 1]


# Bind the helper to the environment instance so it can be called as a method.
env.get_possible_actions = types.MethodType(get_possible_actions, env)

agent = DDQNAgent(state_size, action_size, env.get_possible_actions, model)
agent.epsilon = 0.75  # override the agent's initial exploration rate

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    # One "epoch" is 100 episodes; collect the total reward of each of them.
    summary = []
    for episode in range(100):
        total_reward = 0
        # Shape the observation as a single-row 2-D array for the network.
        state = np.array(env.reset()).reshape(1, -1)

        for step in range(500):
            action = agent.get_action(state)
            observation, reward, done, _ = env.step(action)
            total_reward += reward

            next_state = np.array(observation).reshape(1, -1)
            # Store the transition in the experience memory.
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # One training step per episode, once the memory holds more samples
        # than a single batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        summary.append(total_reward)

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))

    if mean_reward > 195:
        print("You Win!")
        break



epoch #0	mean reward = 11.580	epsilon = 0.010
epoch #1	mean reward = 15.320	epsilon = 0.010
epoch #2	mean reward = 48.640	epsilon = 0.010
epoch #3	mean reward = 135.040	epsilon = 0.010
epoch #4	mean reward = 118.150	epsilon = 0.010
epoch #5	mean reward = 183.130	epsilon = 0.010
epoch #6	mean reward = 195.350	epsilon = 0.010
You Win!


In [None]:
# A SECOND ATTEMPT, this time with epsilon_decay = 0.999

In [None]:
import types


def get_possible_actions(self, state):
    """Return the actions available in ``state``; always ``[0, 1]`` here."""
    return [0, 1]


# Bind the helper to the environment instance so it can be called as a method.
env.get_possible_actions = types.MethodType(get_possible_actions, env)

# A fresh agent is built around the same ``model`` object used by earlier cells.
agent = DDQNAgent(state_size, action_size, env.get_possible_actions, model)
agent.epsilon = 0.75  # override the agent's initial exploration rate

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    # One "epoch" is 100 episodes; collect the total reward of each of them.
    summary = []
    for episode in range(100):
        total_reward = 0
        # Shape the observation as a single-row 2-D array for the network.
        state = np.array(env.reset()).reshape(1, -1)

        for step in range(500):
            action = agent.get_action(state)
            observation, reward, done, _ = env.step(action)
            total_reward += reward

            next_state = np.array(observation).reshape(1, -1)
            # Store the transition in the experience memory.
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # One training step per episode, once the memory holds more samples
        # than a single batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        summary.append(total_reward)

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))

    if mean_reward > 195:
        print("You Win!")
        break


epoch #0	mean reward = 20.960	epsilon = 0.679
epoch #1	mean reward = 30.370	epsilon = 0.615
epoch #2	mean reward = 44.570	epsilon = 0.556
epoch #3	mean reward = 55.390	epsilon = 0.503
epoch #4	mean reward = 89.900	epsilon = 0.455
epoch #5	mean reward = 172.810	epsilon = 0.412
epoch #6	mean reward = 227.530	epsilon = 0.373
You Win!


In [None]:
# Another attempt, this time with epsilon_decay = 0.99.
# NOTE: the decay is not set in this cell; the agent uses whatever value
# DDQNAgent assigns in its constructor.
import types


def get_possible_actions(self, state):
    """Return the actions available in ``state``; always ``[0, 1]`` here."""
    return [0, 1]


# Bind the helper to the environment instance so it can be called as a method.
env.get_possible_actions = types.MethodType(get_possible_actions, env)

agent = DDQNAgent(state_size, action_size, env.get_possible_actions, model)
agent.epsilon = 0.75  # override the agent's initial exploration rate

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    # One "epoch" is 100 episodes; collect the total reward of each of them.
    summary = []
    for episode in range(100):
        total_reward = 0
        # Shape the observation as a single-row 2-D array for the network.
        state = np.array(env.reset()).reshape(1, -1)

        for step in range(500):
            action = agent.get_action(state)
            observation, reward, done, _ = env.step(action)
            total_reward += reward

            next_state = np.array(observation).reshape(1, -1)
            # Store the transition in the experience memory.
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # One training step per episode, once the memory holds more samples
        # than a single batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        summary.append(total_reward)

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))

    if mean_reward > 195:
        print("You Win!")
        break


epoch #0	mean reward = 207.640	epsilon = 0.275
You Win!


In [None]:
# Another attempt with epsilon_decay = 0.99, repeated just to be sure.
# NOTE: the decay is not set in this cell; the agent uses whatever value
# DDQNAgent assigns in its constructor.
import types


def get_possible_actions(self, state):
    """Return the actions available in ``state``; always ``[0, 1]`` here."""
    return [0, 1]


# Bind the helper to the environment instance so it can be called as a method.
env.get_possible_actions = types.MethodType(get_possible_actions, env)

agent = DDQNAgent(state_size, action_size, env.get_possible_actions, model)
agent.epsilon = 0.75  # override the agent's initial exploration rate

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    # One "epoch" is 100 episodes; collect the total reward of each of them.
    summary = []
    for episode in range(100):
        total_reward = 0
        # Shape the observation as a single-row 2-D array for the network.
        state = np.array(env.reset()).reshape(1, -1)

        for step in range(500):
            action = agent.get_action(state)
            observation, reward, done, _ = env.step(action)
            total_reward += reward

            next_state = np.array(observation).reshape(1, -1)
            # Store the transition in the experience memory.
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # One training step per episode, once the memory holds more samples
        # than a single batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        summary.append(total_reward)

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))

    if mean_reward > 195:
        print("You Win!")
        break

epoch #0	mean reward = 126.030	epsilon = 0.277
epoch #1	mean reward = 313.950	epsilon = 0.101
You Win!
