In [5]:
%pip install gymnasium

Collecting gymnasium
  Using cached gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Using cached Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
   ----------

In [6]:
import numpy as np
import gymnasium as gym
import tensorflow as tf
from tensorflow import keras
from collections import deque

In [7]:
# HYPERPARAMETERS
GAMMA = 0.99 # discount factor for reward
LEARNING_RATE = 0.001
MEMORY_SIZE = 1000000
BATCH_SIZE = 64
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995 # Decay rate for exploration probability so that the agent exploits more as it gets more experience

In [8]:
class DQNAgent():
    def __init__(self, observation_space, action_space):
        self.exploration_rate = EXPLORATION_MAX
        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.model = self.create_model(observation_space, action_space)

    def create_model(self, observation_space, action_space):
        model = keras.Sequential([
            keras.layers.Dense(24, input_shape=(observation_space,), activation='relu'),
            keras.layers.Dense(24, activation='relu'),
            keras.layers.Dense(action_space, activation='linear')
        ])
        
        model.compile(loss='mse', optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE))
        return model
    
    #Explore / Exploit
    def act(self, state):
        if np.random.rand() < self.exploration_rate:
            return np.random.choice(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def experience_replay(self):
        if len(self.memory) < BATCH_SIZE:
            return
        batch = np.random.choice(len(self.memory), BATCH_SIZE, replace=False)
        
        for idx in batch:
            state, action, reward, next_state, done = self.memory[idx]

            q_update = reward
            if not done:
                q_update = (reward + GAMMA * np.amax(self.model.predict(next_state)[0]))
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
            
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)

In [9]:
# Setting up the environment

env = gym.make("CartPole-v1", render_mode='human')
# OBS 4 ACT 2
OBSERVATION_SPACE = env.observation_space.shape[0]
ACTION_SPACE = env.action_space.n

In [13]:
dqn_agent = DQNAgent(OBSERVATION_SPACE, ACTION_SPACE)
run = 0

for _ in range(100):
    run += 1
    state, _ = env.reset()
    print(state)
    state = np.array(state)
    state = np.reshape(state, [1, OBSERVATION_SPACE])
    step = 0
    while True:
        step += 1
        action = dqn_agent.act(state)
        next_state, reward, done, info, _ = env.step(action)
        reward = reward if not done else -reward
        next_state = np.reshape(next_state, [1, OBSERVATION_SPACE])
        dqn_agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print(f'Run: {run}, exploration: {dqn_agent.exploration_rate}, score: {step}')
            break

        dqn_agent.experience_replay()

env.close()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[ 0.02401312  0.01149488 -0.0071544   0.02796989]
Run: 1, exploration: 1.0, score: 15
[-0.03487497 -0.04574735 -0.04601137 -0.01235008]
Run: 2, exploration: 1.0, score: 36
[-0.00679988 -0.04823601 -0.02007355 -0.02155745]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [12]:
%pip install "gymnasium[classic-control]"

Collecting pygame>=2.1.3 (from gymnasium[classic-control])
  Downloading pygame-2.6.1-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading pygame-2.6.1-cp312-cp312-win_amd64.whl (10.6 MB)
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   -------------------------------------