# Q-Learning vs. SARSA (Cliff Walking)

### Q-Learning Training

In [None]:
from IPython.display import clear_output
import time
import pickle as pkl
import numpy as np
import gym
import random

# Build the Cliff Walking environment used for training.
env = gym.make("CliffWalking-v0")

# Both spaces are discrete, so `.n` gives the number of states / actions.
state_size = env.observation_space.n
action_size = env.action_space.n
print("State space:", state_size)
print("Action space:", action_size)

# Q-table: one row per state, one column per action, all values start at zero.
q_table = np.zeros(shape=(state_size, action_size))

# Hyperparameters
learning_rate = 0.1    # alpha: step size of each Q-value update
discount_rate = 0.99   # gamma: weight given to future rewards
epsilon = 1.0          # exploration rate
decay_rate = 0.0001    # exponent coefficient of the epsilon decay schedule

# Training schedule
num_episodes, max_steps = 2000, 40  # episodes to run, step cap per episode
epsilon_decayed = 1.0               # exploration rate used by the first episode

# Total reward collected in each episode, in episode order.
rewards = []

# Epsilon-greedy action selection over the module-level Q-table.
def policy(state, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Args:
        state: Row index into the module-level ``q_table``.
        epsilon: Threshold compared against a draw from
            ``random.uniform(0, 1)``; a draw below it triggers exploration.

    Returns:
        A random action sampled from ``env.action_space`` when exploring,
        otherwise the index of the largest Q-value in ``q_table[state]``.
    """
    explore = random.uniform(0, 1) < epsilon
    if not explore:
        # Exploit: greedy action with respect to the current Q-values.
        return np.argmax(q_table[state])
    # Explore: pick an action at random.
    return env.action_space.sample()

# Train the agent with tabular Q-learning.
# Each episode runs for at most `max_steps` steps; its total reward is appended
# to `rewards` and the exploration rate is recomputed for the next episode.
for episode in range(num_episodes):
    state, info = env.reset()  # Back to the initial state
    total_reward = 0

    for _ in range(max_steps):
        action = policy(state, epsilon_decayed)
        next_state, reward, done, truncated, info = env.step(action)

        # Q-learning target: reward plus the discounted best value of the next
        # state. A terminal transition has no future return, so do not
        # bootstrap from it. A truncated (time-limited) transition is not
        # terminal and still bootstraps.
        best_next_value = 0.0 if done else np.max(q_table[next_state])
        td_target = reward + discount_rate * best_next_value
        q_table[state, action] += learning_rate * (td_target - q_table[state, action])

        state = next_state
        total_reward += reward

        if done or truncated:
            break

    rewards.append(total_reward)

    # Exponential decay in the episode index. The schedule starts from 1.0 and
    # does not read the `epsilon` hyperparameter.
    # NOTE(review): with decay_rate=0.0001 and 2000 episodes this only falls to
    # about 0.82, so behaviour stays mostly random - confirm that is intended.
    epsilon_decayed = np.exp(-decay_rate * episode)

    # Clear once per episode instead of once per step. With wait=True the old
    # output is only replaced when the next line is printed, so the notebook
    # still shows just the latest episode.
    clear_output(wait=True)
    print(f"Episode {episode + 1}: Total Reward: {total_reward}, Epsilon: {epsilon_decayed}")

# Save the Q-table; the context manager guarantees the file is flushed and closed.
with open("q-learning_q_table.pkl", "wb") as q_table_file:
    pkl.dump(q_table, q_table_file)
print("Training Complete! Q-table saved!")

In [None]:
import matplotlib.pyplot as plt
# Learning curve: total reward of each training episode, in episode order.
ax = plt.gca()
ax.plot(rewards)
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
ax.set_title('Total Rewards over Episodes')
plt.show()

### Q-Learning Testing

In [None]:
from IPython.display import clear_output
from time import sleep
import pickle as pkl
import gym
import numpy as np
import pygame

# Load the Q-table written by the Q-learning training cell.
# NOTE: unpickling can execute arbitrary code - only load files you created.
with open('q-learning_q_table.pkl', 'rb') as file:
    q_table = pkl.load(file)

# Step cap for the demonstration rollout
max_steps = 40

# Watch the trained agent act greedily (no exploration) in a rendered window.
env = gym.make("CliffWalking-v0", render_mode='human')
try:
    state, info = env.reset()
    done = False
    rewards = 0  # Running total of the reward collected in this rollout

    for s in range(max_steps):
        clear_output(wait=True)
        env.render()  # Render the environment in the human mode
        action = np.argmax(q_table[state])  # Greedy action for the current state
        next_state, reward, done, truncated, info = env.step(action)
        rewards += reward

        print(f"Step {s+1}, Total Reward: {rewards}")

        state = next_state

        if done or truncated:
            pygame.quit()
            break
finally:
    # Always release the environment, even if the rollout raised.
    env.close()

### SARSA Testing

In [None]:
from IPython.display import clear_output
from time import sleep
import pickle as pkl
import gym
import numpy as np
import pygame

# Load the SARSA Q-table.
# NOTE(review): presumably written by a SARSA training cell not shown here -
# confirm the file exists before running this cell.
# NOTE: unpickling can execute arbitrary code - only load files you created.
with open('sarsa_q_table.pkl', 'rb') as file:
    q_table = pkl.load(file)

# Step cap for the demonstration rollout
max_steps = 40

# Watch the trained agent act greedily (no exploration) in a rendered window.
env = gym.make("CliffWalking-v0", render_mode='human')
try:
    state, info = env.reset()
    done = False
    rewards = 0  # Running total of the reward collected in this rollout

    for s in range(max_steps):
        clear_output(wait=True)
        env.render()  # Render the environment in the human mode
        action = np.argmax(q_table[state])  # Greedy action for the current state
        next_state, reward, done, truncated, info = env.step(action)
        rewards += reward
        print(f"Step {s+1}, Total Reward: {rewards}")

        state = next_state

        if done or truncated:
            pygame.quit()
            break
finally:
    # Always release the environment, even if the rollout raised.
    env.close()