In [2]:
!pip install gym



In [3]:
import gym
import numpy as np
import random

# Seed the sources of randomness used below so that "Restart Kernel -> Run All"
# reproduces the same Q-table and evaluation score.
SEED = 42
random.seed(SEED)  # drives the epsilon-greedy coin flip (random.uniform)

# Create the FrozenLake environment
env = gym.make("FrozenLake-v1")
env.action_space.seed(SEED)  # drives env.action_space.sample() during exploration
# Seeds the environment's own random number generator; later reset() calls
# without a seed keep using it. Assumes gym >= 0.22, where reset() accepts seed=.
env.reset(seed=SEED)

# Q-learning table: one row per state, one column per action, initialised to 0
action_size = env.action_space.n        # number of available actions
state_size = env.observation_space.n    # number of possible states
q_table = np.zeros((state_size, action_size))

# Hyperparameters
learning_rate = 0.8   # alpha: step size of each Q-value update
discount_rate = 0.95  # gamma: weight given to the next state's best Q-value
epsilon = 1.0         # initial exploration probability (epsilon-greedy policy)
epsilon_min = 0.01    # lower bound for epsilon
epsilon_decay = 0.995 # multiplicative decay applied to epsilon after each episode
episodes = 10000      # number of training episodes
max_steps = 100       # maximum number of steps per episode

# Train the agent with tabular Q-learning
for episode in range(episodes):
    # gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the observation alone. Accept both so `state` is always usable
    # as a row index into q_table.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False

    for step in range(max_steps):
        # Epsilon-greedy action selection
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()       # explore: random action
        else:
            action = np.argmax(q_table[state, :])    # exploit: best known action

        # Take the action. step() returns 5 values (separate terminated and
        # truncated flags) on gym >= 0.26 and 4 values (single done flag) before.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, info = step_result

        # Q-learning update:
        #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        # A state reached on a step that ends the episode is never used as
        # `state` in that episode (the loop breaks first).
        td_target = reward + discount_rate * np.max(q_table[next_state, :])
        q_table[state, action] += learning_rate * (td_target - q_table[state, action])

        state = next_state

        if done:
            break

    # Decay epsilon so later episodes exploit more; clamp so it never drops
    # below epsilon_min (the previous guard allowed one decay step past it).
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Show the Q-table produced by training (header line, then the array)
print("Q-Table setelah latihan:", q_table, sep="\n")

# Evaluate the greedy policy read from the trained Q-table over 100 episodes
total_rewards = 0

for episode in range(100):
    # reset() returns (observation, info) on gym >= 0.26 and the bare
    # observation on older releases; accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False

    for step in range(max_steps):
        action = np.argmax(q_table[state, :])  # always take the best-valued action

        # step() returns 5 values on gym >= 0.26 and 4 values before.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, info = step_result

        total_rewards += reward
        state = next_state

        if done:
            break

print(f"Total reward yang didapat setelah evaluasi: {total_rewards}")
env.close()


  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Q-Table setelah latihan:
[[1.80815631e-01 6.02380502e-02 4.58565037e-02 1.94080776e-02]
 [1.53371939e-03 1.04811473e-02 3.92776990e-05 1.03560110e-01]
 [1.36699239e-01 2.67487223e-02 7.39681381e-03 3.04344732e-02]
 [6.40975688e-04 2.21085549e-04 2.29833106e-03 3.19803545e-02]
 [1.61676527e-01 4.01300994e-02 4.99960011e-04 3.51528742e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.77970352e-02 3.12514334e-04 3.10698117e-04 4.58583250e-06]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [8.93447722e-02 1.49066161e-02 4.16345237e-02 3.01222012e-01]
 [6.47890774e-02 3.93735804e-01 3.04252731e-02 2.87796027e-02]
 [2.14778891e-01 3.29568493e-03 4.90512218e-03 4.38343937e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [9.04354997e-02 6.92300979e-03 6.43922995e-01 9.72392802e-02]
 [1.94752047e-01 9.47849864e-01 2.69408268e-01 2.12100225e-01]
 [0.00000000e+00 0.00000000e+0