In [1]:
!pip install tensorflow


Collecting tensorflow
  Using cached tensorflow-2.19.0-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.3.0-py3-none-any.whl.metadata (2.4 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Using cached flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Using cached opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Using cached term

In [8]:
!pip install gymnasium

Collecting gymnasium
  Using cached gymnasium-1.1.1-py3-none-any.whl.metadata (9.4 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Using cached Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Using cached gymnasium-1.1.1-py3-none-any.whl (965 kB)
Using cached Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.1.1


In [9]:
import numpy as np
import random
from collections import deque
import gymnasium as gym
import tensorflow as tf
from tensorflow.keras import Model, layers
import os

In [72]:
# Create the CartPole-v1 environment used for training.
env = gym.make('CartPole-v1')

# Network dimensions are read from the environment's spaces:
# one output per discrete action, one input per observation component.
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

print("State Size: ", state_size)
print("Action Size: ", action_size)

State Size:  4
Action Size:  2


In [77]:
class DQN(Model):
    """Small fully connected Q-network.

    Two 24-unit ReLU hidden layers followed by a linear output layer with
    one unit per action (for CartPole: 4 -> 24 -> 24 -> 2).
    """

    def __init__(self, action_size, **kwargs):
        """Create the three Dense layers.

        Args:
            action_size: number of output units (one Q-value per action).
            **kwargs: forwarded unchanged to the Keras ``Model`` constructor.
        """
        super().__init__(**kwargs)
        self.action_size = action_size
        hidden_units = 24
        self.d1 = layers.Dense(hidden_units, activation='relu', name='d1')
        self.d2 = layers.Dense(hidden_units, activation='relu', name='d2')
        self.d3 = layers.Dense(action_size, activation='linear', name='d3')

    def call(self, x):
        """Forward pass: feed ``x`` through d1, d2 and d3 and return the result."""
        return self.d3(self.d2(self.d1(x)))

    def get_config(self):
        """Return the base config plus ``action_size`` so a saved model can be rebuilt."""
        config = super().get_config()
        config["action_size"] = self.action_size
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the network from a dict produced by ``get_config``."""
        return cls(**config)

In [78]:

# Replay buffer: a bounded deque, so once 2000 transitions are stored
# each new append discards the oldest entry.
memory = deque((), maxlen=2_000)

In [79]:
class Agent:
    """Epsilon-greedy DQN agent with an online network and a target network.

    Transitions are stored in the module-level ``memory`` deque, which is
    therefore shared by every ``Agent`` created in this notebook.
    """

    def __init__(self, state_size, action_size, gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995, learning_rate=0.001):
        """Store the hyperparameters and create the networks and optimizer.

        Args:
            state_size: number of components in one observation.
            action_size: number of discrete actions.
            gamma: discount rate applied to future rewards.
            epsilon: probability of taking a random action in ``act``.
            epsilon_min: stored for the caller's decay schedule.
            epsilon_decay: stored for the caller's decay schedule.
            learning_rate: Adam learning rate.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate

        self.model = self._build_model()
        self.target_model = self._build_model()
        # Both networks are already built here, so this really copies the
        # online weights into the target network.
        self.update_target_model()

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)

    def _build_model(self):
        """Create a DQN and run one dummy forward pass so its weights exist."""
        model = DQN(self.action_size)
        # Subclassed Keras models create their variables on the first call.
        # Without this, get_weights() returns an empty list and the initial
        # target-network sync silently copies nothing.
        model(np.zeros((1, self.state_size), dtype=np.float32))
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the shared replay ``memory``."""
        memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``epsilon`` a random action index in
        ``[0, action_size)`` is returned; otherwise the index of the largest
        Q-value predicted by the online network.

        Args:
            state: a single observation, either flat or already shaped
                ``(1, state_size)``.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # Normalise to one batch row so flat and pre-batched observations
        # both reach the network with the same shape.
        q_values = self.model(np.reshape(state, (1, -1)))
        return int(np.argmax(q_values[0].numpy()))

    def save_model(self, filepath):
        """Save the online network to ``filepath``."""
        self.model.save(filepath)

    def load_model(self, filepath):
        """Load the saved model from ``filepath`` into both networks."""
        self.model = tf.keras.models.load_model(filepath, custom_objects={"DQN": DQN})
        self.target_model = tf.keras.models.load_model(filepath, custom_objects={"DQN": DQN})
        # The previous optimizer may already be bound to the old model's
        # variables; start a fresh one so replay() keeps working after a load.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)

    def replay(self, batch_size):
        """Train the online network on a random minibatch from ``memory``.

        One gradient step is taken per sampled transition. For non-terminal
        transitions the next action is selected by the online network and
        its value is read from the target network.

        Args:
            batch_size: number of transitions to sample.

        Returns:
            The mean squared TD error over the minibatch as a ``float``, or
            ``None`` when there are fewer than ``batch_size`` stored
            transitions (or ``batch_size`` is not positive) and nothing was
            trained.
        """
        if batch_size <= 0 or len(memory) < batch_size:
            return None

        minibatch = random.sample(memory, batch_size)
        losses = []
        for state, action, reward, next_state, done in minibatch:
            # The target is a constant for this update, so compute it outside
            # the gradient tape.
            if done:
                target = reward
            else:
                next_input = np.array([next_state])
                next_action = np.argmax(self.model(next_input)[0].numpy())
                bootstrap = self.target_model(next_input)[0][next_action]
                target = reward + self.gamma * bootstrap

            with tf.GradientTape() as tape:
                q_values = self.model(np.array([state]), training=True)
                loss = tf.square(target - q_values[0][action])

            grads = tape.gradient(loss, self.model.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
            losses.append(float(loss))

        return float(np.mean(losses))

In [81]:
# --- Training schedule ---
# Transitions sampled per replay call, and total number of training episodes.
batch_size, n_episodes = 32, 500
# The target network is refreshed every this many episodes.
update_target_every = 5

# --- Learning ---
# Discount factor for future rewards (between 0 and 1).
gamma = 0.95
# Step size for the neural network weight updates.
learning_rate = 0.001

# --- Exploration ---
# Epsilon starts at `epsilon`, is multiplied by `epsilon_decay` after each
# episode, and is not decayed further once it reaches `epsilon_min`.
epsilon, epsilon_min, epsilon_decay = 1.0, 0.01, 0.995

In [69]:
output_dir = './cartpole_model/'
# Create the checkpoint directory once, before any save is attempted.
os.makedirs(output_dir, exist_ok=True)

# Initialize the Agent
agent = Agent(state_size, action_size, gamma=gamma, epsilon=epsilon, epsilon_min=epsilon_min, epsilon_decay=epsilon_decay, learning_rate=learning_rate)

# Main Script
for e in range(n_episodes):
    state, _ = env.reset()
    total_reward = 0

    # At most 500 steps are taken per episode.
    for time_t in range(500):
        action = agent.act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # Store only true termination in the transition. A time-limit
        # truncation is not a terminal state, so the learning target should
        # still bootstrap from next_state.
        agent.remember(state, action, reward, next_state, terminated)
        state = next_state
        total_reward += reward

        if done:
            # time_t is zero-based, so the number of steps survived is time_t + 1.
            print(f"Episode: {e}/{n_episodes}, Score: {time_t + 1}, Epsilon: {agent.epsilon:.2f}")
            break

    # One replay pass per episode, once enough transitions are stored.
    if len(memory) > batch_size:
        loss = agent.replay(batch_size)

    # Update epsilon, never decaying below the configured minimum
    if agent.epsilon > agent.epsilon_min:
        agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)

    # Update target network
    if e % update_target_every == 0:
        agent.update_target_model()

    # Checkpoint every 100 episodes
    if e % 100 == 0:
        agent.save_model(os.path.join(output_dir, f'model_{e}.keras'))

# Final checkpoint, named after the actual number of episodes trained.
agent.save_model(os.path.join(output_dir, f'model_{n_episodes}.keras'))

Episode: 0/500, Score: 18, Epsilon: 1.00
Episode: 1/500, Score: 9, Epsilon: 0.99
Episode: 2/500, Score: 28, Epsilon: 0.99
Episode: 3/500, Score: 17, Epsilon: 0.99
Episode: 4/500, Score: 15, Epsilon: 0.98
Episode: 5/500, Score: 11, Epsilon: 0.98
Episode: 6/500, Score: 20, Epsilon: 0.97
Episode: 7/500, Score: 21, Epsilon: 0.97
Episode: 8/500, Score: 11, Epsilon: 0.96
Episode: 9/500, Score: 19, Epsilon: 0.96
Episode: 10/500, Score: 9, Epsilon: 0.95
Episode: 11/500, Score: 9, Epsilon: 0.95
Episode: 12/500, Score: 8, Epsilon: 0.94
Episode: 13/500, Score: 18, Epsilon: 0.94
Episode: 14/500, Score: 31, Epsilon: 0.93
Episode: 15/500, Score: 12, Epsilon: 0.93
Episode: 16/500, Score: 14, Epsilon: 0.92
Episode: 17/500, Score: 21, Epsilon: 0.92
Episode: 18/500, Score: 11, Epsilon: 0.91
Episode: 19/500, Score: 36, Epsilon: 0.91
Episode: 20/500, Score: 33, Epsilon: 0.90
Episode: 21/500, Score: 26, Epsilon: 0.90
Episode: 22/500, Score: 14, Epsilon: 0.90
Episode: 23/500, Score: 11, Epsilon: 0.89
Episod

In [82]:
pip install "gymnasium[classic-control]"


Note: you may need to restart the kernel to use updated packages.


In [89]:
def render_episode(agent, model_path, num_episodes=1):
    """Load a saved model into ``agent`` and play rendered CartPole episodes.

    Actions come from ``agent.act``, which is epsilon-greedy; set
    ``agent.epsilon = 0.0`` beforehand for a purely greedy rollout.

    Args:
        agent: object providing ``load_model(path)`` and ``act(state)``.
        model_path: path of the saved model to load.
        num_episodes: number of episodes to play.

    Prints the total reward of each episode.
    """
    agent.load_model(model_path)

    # With render_mode='human' the environment draws itself on reset/step,
    # so no explicit env.render() call is needed.
    env = gym.make('CartPole-v1', render_mode='human')
    try:
        for episode in range(num_episodes):
            # Pass the flat observation, as the training loop does.
            state, _ = env.reset()
            done = False
            total_reward = 0
            while not done:
                action = agent.act(state)
                state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                # The episode ends on failure or on the time limit.
                done = terminated or truncated
            print(f"Episode {episode + 1} reward: {total_reward}")
    finally:
        # Always release the render window, even if an episode raises.
        env.close()


# Checkpoint to replay on screen.
model_path = "./cartpole_model/model_300.keras"

# CartPole dimensions: 4 observation components, 2 actions.
state_size, action_size = 4, 2

# Fresh agent with exploration switched off, so act() always takes the
# network's highest-valued action.
agent = Agent(state_size, action_size)
agent.epsilon = 0.0

render_episode(agent, model_path, num_episodes=5)

Episode 1 reward: 386.0
Episode 2 reward: 386.0
Episode 3 reward: 300.0
Episode 4 reward: 318.0
Episode 5 reward: 500.0
