<a href="https://colab.research.google.com/github/MoNoSpaze/RL_frozenlake_superai_ss2/blob/main/FrozenLake_QL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Essentials**

In [None]:
import gym
import random
import numpy as np
import matplotlib as plt
from IPython.display import clear_output
import time

**Agent Definition**

In [None]:
class Q_Agent:
  """Tabular Q-learning agent with epsilon-greedy action selection.

  The agent keeps one Q-value per (state, action) pair in ``self.q_table`` and
  requires an environment whose ``action_space`` and ``observation_space`` both
  expose an integer ``n`` (i.e. discrete spaces).
  """

  def __init__ (self, env, discount_factor, learning_rate, epsilon):
    """Store the hyper-parameters and create a zero-initialised Q-table.

    Args:
      env: environment providing ``action_space.n``, ``action_space.sample()``
        and ``observation_space.n``.
      discount_factor: gamma, the weight given to the best next-state value.
      learning_rate: step size used in the Q-value update.
      epsilon: probability of picking a random action in ``action``.
    """
    self.gamma = discount_factor
    self.lr = learning_rate
    self.epsilon = epsilon
    # Keep the action space on the instance so exploration samples from the
    # environment this agent was built for, not from a global variable that
    # happens to be called `env`.
    self.action_space = env.action_space

    # Create Q-Table
    action_size = env.action_space.n
    state_size = env.observation_space.n
    self.q_table = np.zeros((state_size, action_size))
    print("Q - table")
    print( "-----------------------" )
    print(self.q_table)

  def action(self, state):
    """Pick an action for ``state`` using the epsilon-greedy rule.

    With probability ``self.epsilon`` a random action is sampled from the
    agent's action space; otherwise the action with the highest Q-value for
    ``state`` is returned (np.argmax picks the lowest index on ties).
    """
    if np.random.uniform() < self.epsilon:
      return self.action_space.sample()
    return np.argmax(self.q_table[state, :])

  def update_ql(self, s, a, r, s_):
    """Apply one Q-learning update for the transition (s, a, r, s_).

    Q(s,a) <- Q(s,a) + lr * [ r + gamma * max_a' Q(s',a') - Q(s,a) ]

    Args:
      s: state in which the action was taken.
      a: action taken.
      r: reward received, R(s, a).
      s_: next state s' that was observed.
    """
    td_target = r + self.gamma * np.max( self.q_table[s_, :] )
    td_error = td_target - self.q_table[s, a]
    self.q_table[s, a] = self.q_table[s, a] + self.lr * td_error

**Train the Agent (Update the Q-Table)**

In [None]:
# Train a Q_Agent on FrozenLake with an exponentially decaying exploration rate.
env = gym.make("FrozenLake-v0")

#Epsilon Greedy
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.0005 #epsilon update
agent = Q_Agent ( env, discount_factor = 0.98, learning_rate = 0.05, epsilon = max_epsilon )

total_episodes = 100000
render_every = 1000 # show progress only every N episodes; rendering each one dominates run time
rewards = np.zeros((total_episodes))
for i in range(total_episodes):
  # Epsilon depends only on the episode index, so set it once per episode
  # (before the first action) instead of recomputing it on every step.
  agent.epsilon = min_epsilon + ( max_epsilon - min_epsilon )*np.exp( -decay_rate*i )

  total_rewards = 0
  s = env.reset() #init
  done = False
  while not done:
    a = agent.action(s)
    s_, r, done, info = env.step(a) # Observe next state

    agent.update_ql(s, a, r, s_)

    s = s_
    total_rewards += r

  rewards[i] = total_rewards

  # Always include the last episode so the progress display is cleared
  # right before the final Q-table is printed.
  if i % render_every == 0 or i == total_episodes - 1:
    print('episode :',i)
    env.render()
    clear_output(wait=True)

print("Final Q - table")
print( "-----------------------" )
print(agent.q_table)

Final Q - table
-----------------------
[[0.39642099 0.36486826 0.35869941 0.35863573]
 [0.21697869 0.15487484 0.18073257 0.34227168]
 [0.29717031 0.22256597 0.21824689 0.22190979]
 [0.04406611 0.09105867 0.0375012  0.04655435]
 [0.41451383 0.27839599 0.24479843 0.29607958]
 [0.         0.         0.         0.        ]
 [0.29690204 0.11724545 0.16096211 0.08095514]
 [0.         0.         0.         0.        ]
 [0.27260153 0.30003143 0.28818795 0.45661166]
 [0.41945173 0.54553555 0.38150908 0.39454803]
 [0.51823541 0.41130557 0.38376955 0.23686135]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.28958672 0.50188046 0.66800957 0.53953801]
 [0.66528796 0.83035689 0.69575106 0.66646616]
 [0.         0.         0.         0.        ]]


**Evaluate Model**

In [None]:
# Evaluate the trained agent with a purely greedy policy (no exploration).
total_episodes = 10000
render_episodes = False # set True to render and log every episode (very verbose)
rewards = np.zeros((total_episodes))
agent.epsilon = 0.0

for i in range( total_episodes ):
  total_rewards = 0
  s = env.reset()
  done = False
  while not done:
    a = agent.action(s)
    s, r, done, info = env.step(a)
    total_rewards += r

  rewards[i] = total_rewards

  if render_episodes:
    env.render()
    print( "Episode: " + str(i) + " ---> Reward: " + str(rewards[i]) )

# Report the summary once, after all episodes have run. Printing it inside the
# loop showed a partial sum divided by the full episode count, which is not a
# mean, and flooded the cell output.
print( "Mean Reward: " + str(np.mean(rewards)) )
print( "*" * 20 )
print('Success Percentage = ' , 100*np.sum(rewards)/len(rewards), '%')

env.close()

[1;30;43mเอาต์พุตของการสตรีมมีการตัดเหลือเพียง 5000 บรรทัดสุดท้าย[0m
HFF[41mG[0m
Episode: 9444 ---> Reward: 1.0
Mean Reward: 0.6838000000000002
********************
Success Percentage =  68.38 %
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Episode: 9445 ---> Reward: 1.0
Mean Reward: 0.6839000000000002
********************
Success Percentage =  68.39 %
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Episode: 9446 ---> Reward: 1.0
Mean Reward: 0.6840000000000002
********************
Success Percentage =  68.4 %
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Episode: 9447 ---> Reward: 1.0
Mean Reward: 0.6841000000000002
********************
Success Percentage =  68.41 %
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Episode: 9448 ---> Reward: 1.0
Mean Reward: 0.6842000000000001
********************
Success Percentage =  68.42 %
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Episode: 9449 ---> Reward: 1.0
Mean Reward: 0.6843000000000001
********************
Success Percentage =  68.43 %
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Epis