<a href="https://colab.research.google.com/github/LiamFosterLFF/Bath_SWE_CW2/blob/main/RLCW_just_v3_algo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !git clone https://github.com/LiamFosterLFF/blockudoku.git
# print("Done!")

In [None]:
import sys
import os
import numpy as np
import gym


# Extend the import search path before importing the project modules below.
# NOTE(review): "/" resolves to the filesystem root — presumably where the
# blockudoku source files are placed in the runtime; confirm before reuse.
py_file_location = "/"
sys.path.append(os.path.abspath(py_file_location))

import blockudoku as bd
import blockudoku_toy_env as bdtoy

In [None]:
import tensorflow as tf
from tensorflow.python.keras import Sequential, Model
from tensorflow.python.keras.layers import Dense, Input, Flatten
from tensorflow.keras.optimizers import Adam
from collections import deque
import random
import numpy as np
import copy
import pylab as plt
# from google.colab import output
import os.path
from datetime import datetime

class DQNAgent_with_ER_and_FTN_V3:
  """Deep Q-learning agent with experience replay and a fixed target network.

  The networks map a board representation to a single scalar value. An action
  is scored by simulating it on a deep copy of the environment and adding the
  immediate reward to the predicted value of the resulting position.

  Args:
    verbose: 0 = no plot annotation, >= 1 = annotate the reward plot with the
      latest statistics, 2 = additionally clear the cell output and show it.
    state_includes_pieces: when True, the network input is extended with one
      membership flag per entry of ``bd.PIECES``.
    use_toy_env: build the environment from ``bdtoy`` instead of ``bd``.
    clone: optional existing agent whose start time, replay memory and both
      networks are reused (shared by reference, not copied).
  """

  def __init__(self, verbose=2, state_includes_pieces=False, use_toy_env=False, clone=None):
    # Exploration schedule: epsilon is multiplied by epsilon_decay after each
    # episode (once the replay memory is warm) until it reaches epsilon_min.
    self.epsilon = 0.99
    self.epsilon_decay = .99
    self.epsilon_min = .01
    # Learning rate handed to the optimizer.
    self.alpha = 0.15
    # Discount applied to the bootstrapped next-state value.
    self.gamma = 1
    self.inverse_gamma = 1
    # Replay only starts once the memory holds more than d_min transitions;
    # d_max is the memory capacity.
    self.d_min = 1000
    self.d_max = 2000
    # The target network is synchronised every c environment steps.
    self.c = 20
    # Number of transitions sampled (and fitted) per environment step.
    self.no_replays = 10

    # Adjustable parameters
    self.verbose = verbose
    self.start_time = datetime.now()
    self.state_includes_pieces = state_includes_pieces

    self.env = bdtoy.SudokuTetrisGame() if use_toy_env else bd.SudokuTetrisGame()

    self.input_shape = (self.state_action_representation(self.env).size,)

    if clone is not None:
      # Continue from an existing agent: share its memory and networks.
      self.start_time = clone.start_time
      self.d = clone.d
      self.q1 = clone.q1
      self.q2 = clone.q2
    else:
      # Replay memory D with capacity d_max.
      self.d = deque(maxlen=self.d_max)
      # Online value network q1 with random weights.
      self.q1 = self._build_DNN()
      # Target network q2. clone_model only copies the architecture (its
      # weights are freshly initialised), so the weights are copied explicitly.
      self.q2 = tf.keras.models.clone_model(self.q1)
      self._sync_target_network()

  def _sync_target_network(self):
    """Copy the online network's weights into the target network."""
    self.q2.set_weights(self.q1.get_weights())

  def train(self, episodes=1):
    """Run ``episodes`` episodes of epsilon-greedy play with replay updates.

    Each step stores the transition (state, action, shaped reward, next state)
    as deep copies of the environment, replays ``no_replays`` samples once the
    memory is warm, and synchronises the target network every ``c`` steps.
    """
    scores, eps = [], []
    step_count = 1
    for e in range(episodes):
      self.env.reset()
      state = copy.deepcopy(self.env)
      done = False
      total_reward = 0
      while not done:
        action = self.get_epsilon_greedy_action()
        _, reward, done, _ = self.env.step(action)
        # The episode score uses the raw reward; the learner sees a shaped
        # one where any reward below 5 is replaced by a -1 penalty.
        total_reward += reward
        shaped_reward = -1 if reward < 5 else reward

        next_state = copy.deepcopy(self.env)
        self.d.append((state, action, shaped_reward, next_state))

        # Learn from sampled transitions once the memory is warm.
        if len(self.d) > self.d_min:
          self.replay(self.no_replays)

        # Every c steps: target weights <- online weights.
        if step_count % self.c == 0:
          self._sync_target_network()

        step_count += 1
        state = next_state

      # Decay epsilon after each episode (not part of the textbook algorithm),
      # but only once the memory is warm.
      if self.epsilon > self.epsilon_min and len(self.d) > self.d_min:
        self.epsilon *= self.epsilon_decay

      eps.append(e + 1)
      scores.append(total_reward)

      # Optional per-episode reporting:
      # self.plot_print_and_write_to_file(eps, scores)

  def plot_print_and_write_to_file(self, eps, scores):
    """Plot reward per episode, save the plot, and append a CSV row.

    Args:
      eps: episode numbers seen so far (non-empty).
      scores: total reward for each of those episodes.
    """
    x, y = np.array(eps), np.array(scores)

    # Start from an empty figure so repeated calls do not stack lines/text.
    plt.clf()
    plt.plot(x, y, 'b')
    if len(x) >= 2:
      m, b = np.polyfit(x, y, 1)
      plt.plot(x, m*x + b, color='red')
    else:
      # A trend line needs at least two points; report a flat slope until then.
      m = 0.0
    plt.title('Total Reward Per Episode')
    plt.xlabel('Episodes')
    plt.ylabel('Total Reward')

    if self.verbose >= 1:
      plt_text = "Episode # {} Final Score: {}, Memory Length: {}, Epsilon: {:.4f}, Slope: {:.4f}".format(eps[-1], scores[-1], len(self.d), self.epsilon, m)
      plt.text(.5, .05, plt_text)

    # File names are keyed on the agent's start time, so one run (and its
    # clones) keeps writing to the same files.
    current_time = self.start_time.strftime("%a_%d%B%Y-%H:%M:%S")
    png_fname = "DQN_V3_output-" + current_time + ".png"
    plt.savefig('/content/{}'.format(png_fname))

    # Append the latest statistics to a CSV file, writing the header first if
    # the file does not exist yet. (The name previously began with a stray
    # apostrophe.)
    csv_fname = "DQN_V3_data-" + current_time + ".csv"
    write_header = not os.path.exists(csv_fname)
    with open(csv_fname, 'a') as f:
      if write_header:
        f.write("episode_no,final_score,memory_length,epsilon,slope\n")
      f.write("{},{},{},{},{}\n".format(eps[-1], scores[-1], len(self.d), self.epsilon, m))

    if self.verbose == 2:
      # Clearing the cell output is only possible inside Colab; elsewhere the
      # plot is simply shown.
      try:
        from google.colab import output
      except ImportError:
        output = None
      if output is not None:
        output.clear()
      plt.show()

  def get_greedy_action(self):
    """Return the available action with the highest one-step look-ahead value.

    Each action is simulated on a deep copy of the environment; its value is
    the immediate reward plus, unless the simulated step ends the episode, the
    discounted q1 prediction for the resulting position. Returns None when no
    action is available.
    """
    best_action = None
    best_value = -float('inf')
    for a in self.env.get_available_actions():
      env_copy = copy.deepcopy(self.env)
      _, reward, done, _ = env_copy.step(a)
      v = reward
      if not done:
        v = reward + self.gamma * self.predict_q1(env_copy)[0][0]
      if v > best_value:
        best_action, best_value = a, v
    return best_action

  def get_greedy_action2(self):
    """Batched variant of ``get_greedy_action`` (a single predict call).

    Returns None when no action is available.
    """
    action_arr = self.env.get_available_actions()
    if len(action_arr) == 0:
      return None

    not_done, rewards, states = [], [], []
    for a in action_arr:
      env_copy = copy.deepcopy(self.env)
      _, reward, done, _ = env_copy.step(a)
      # Mask that zeroes the bootstrapped value for terminal outcomes.
      not_done.append(0 if done else 1)
      rewards.append(reward)
      states.append(self.state_action_representation(env_copy))
    not_done, rewards = np.array(not_done), np.array(rewards)

    # One row per candidate action: shape (n_actions, n_features).
    next_values = np.reshape(self.q1.predict(np.vstack(states)), rewards.shape)
    # v = r + gamma * (v_next_state if not done else 0)
    v_arr = rewards + self.gamma * not_done * next_values
    return action_arr[np.argmax(v_arr)]

  def get_epsilon_greedy_action(self):
    """Return a random action with probability epsilon, else the greedy one."""
    if self.epsilon > random.random():
      return self.env.sample_action()
    # return self.get_greedy_action2()
    return self.get_greedy_action()

  def replay(self, no_replays):
    """Fit q1 on ``no_replays`` transitions sampled uniformly from memory.

    The target for a sampled transition is its reward, plus the discounted q2
    prediction for the next state unless that state is terminal.
    """
    for _ in range(no_replays):
      state, _action, reward, next_state = random.choice(self.d)
      if next_state.check_if_done():
        yj = reward
      else:
        yj = reward + self.gamma * self.predict_q2(next_state)[0][0]
      # One gradient step on the single (input, target) pair.
      self.q1.fit(self.state_action_representation(state), np.array([yj]).reshape(1, -1), epochs=1, verbose=False)

  def _build_DNN(self):
    """Build and compile the value network: two 32-unit ReLU layers and a linear scalar output."""
    model = Sequential()
    model.add(Input(shape=self.input_shape))
    model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(1, activation='linear', kernel_initializer='he_uniform'))
    # `learning_rate` replaces the deprecated `lr` argument.
    model.compile(loss=tf.keras.losses.Huber(), optimizer=Adam(learning_rate=self.alpha))
    return model

  def predict_q1(self, state):
    """Return the online network's prediction for ``state``."""
    return self.q1.predict(self.state_action_representation(state))

  def predict_q2(self, state):
    """Return the target network's prediction for ``state``."""
    return self.q2.predict(self.state_action_representation(state))

  def state_action_representation(self, state):
    """Encode ``state`` as a (1, n_features) array of 0/1 values.

    The board contributes one flag per square (1 if the square is truthy).
    When ``state_includes_pieces`` is set, one flag per entry of ``bd.PIECES``
    is appended, marking whether that piece is in ``state.current_pieces``.
    """
    features = [1 if square else 0 for square in state.board.flatten()]
    if self.state_includes_pieces:
      # NOTE(review): this always reads bd.PIECES, even when the toy
      # environment is in use — confirm both modules share the piece list.
      pieces_rep = [1 if bd.PIECES[x] in state.current_pieces else 0 for x in range(len(bd.PIECES))]
      features = features + pieces_rep

    return np.array(features).reshape(-1, len(features))

# Create an agent with the default settings and train it for 10,000 episodes.
agent = DQNAgent_with_ER_and_FTN_V3()
agent.train(episodes=10000)