In [1]:
!pip install -r requirements.txt

[0m

In [2]:
# If the assignment takes too long, you may want to download the code 
# and run it locally to take advantage of your GPU. 

# Package Imports
import numpy as np
import tensorflow as tf
import gymnasium as gym
from collections import deque
import random
import sys

# Refactored into Function for reusability
def train_cartpole_dqn(
    GAMMA=0.99,
    EXPLORATION_MAX=1.0,
    EXPLORATION_MIN=0.01,
    EXPLORATION_DECAY=0.990,
    LEARNING_RATE=0.001,
    BATCH_SIZE=64,
    TRAIN_START=1000,
    MEMORY_SIZE=2000,
    EPISODES=300
):
    # Counters during training
    train_freq = 4
    step_count = 0
    target_update_freq = 10
    # If an error occurs below, it is because the environment is looking for a GPU.

    # Environment setup and Variables
    try:
        env = gym.make("CartPole-v1", render_mode=None)
    except Exception:
        print('Failed to initialize environment! Make sure that gymnasium was installed correctly!')
        sys.exit(1)

    state_shape = env.observation_space.shape[0]
    action_size = env.action_space.n

    # Initialize memory and rewards
    memory = deque(maxlen=MEMORY_SIZE)
    episode_rewards = []

    # DQN Builder
    def build_dqn_model():
        model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(state_shape,)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss='mse')
        return model

    model = build_dqn_model()
    target_model = build_dqn_model()
    target_model.set_weights(model.get_weights())

    # Random Action selection for exploration
    def get_action(state, exploration_rate):
        if np.random.rand() <= exploration_rate:
            return random.randrange(action_size)
        q_values = model.predict(np.array([state]), verbose=0) #Remove verbose=0 to see every prediction display
        return np.argmax(q_values[0])

    # Experience Replay
    def experience_replay():
        if len(memory) < TRAIN_START:
            return
        batch = random.sample(memory, BATCH_SIZE)
        states = np.zeros((BATCH_SIZE, state_shape))
        next_states = np.zeros((BATCH_SIZE, state_shape))
        actions, rewards, dones = [], [], []

        for i, (state, action, reward, next_state, done) in enumerate(batch):
            states[i] = state
            next_states[i] = next_state
            actions.append(action)
            rewards.append(reward)
            dones.append(done)

        target_q = model.predict(states, verbose=0) #Remove verbose=0 to see every prediction display
        next_q = target_model.predict(next_states, verbose=0) #Remove verbose=0 to see every prediction display

        for i in range(BATCH_SIZE):
            if dones[i]:
                target_q[i][actions[i]] = rewards[i]
            else:
                target_q[i][actions[i]] = rewards[i] + GAMMA * np.max(next_q[i])

        model.fit(states, target_q, batch_size=BATCH_SIZE, verbose=0)

    # Main loop
    # Will iterate through based on the number of EPISODES originally listed up top
    for e in range(EPISODES):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            action = get_action(state, EXPLORATION_MAX)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            reward = reward if not done else -10

            memory.append((state, action, reward, next_state, done))
            state = next_state
            total_reward += reward

            step_count += 1
            if step_count % train_freq == 0:
                experience_replay()

        if EXPLORATION_MAX > EXPLORATION_MIN:
            EXPLORATION_MAX *= EXPLORATION_DECAY
            EXPLORATION_MAX = max(EXPLORATION_MIN, EXPLORATION_MAX)

        if e % target_update_freq == 0:
            target_model.set_weights(model.get_weights())

        episode_rewards.append(total_reward)
        # This will display the average reward of every 10 episodes occurring.
        # Comment this section and add the below print method if you want to display every episode
        if e % 10 == 0:
            avg_reward = np.mean(episode_rewards[-10:])
            print(f"Episode {e+1}: Average Reward = {avg_reward:.2f}, Exploration = {EXPLORATION_MAX:.3f}")

        # Logic to complete execution if average is greater than 195
        if len(episode_rewards) >= 100:
            avg_last_hun = np.mean(episode_rewards[-100:])
            if avg_last_hun >= 195:
                print(f"Solved at episode {e}: Average reward over last 100: {avg_last_hun:.2f}")
                break

    # Saving model as needed    
    model.save("Cartpole_model.h5")
    print("Training complete! Model saved as Cartpole_model")

    # Note, if an error displays, it is attempting to connect to the GPU.
    # It will not connect and run on CPU. 
    # Code will take some time to run, which is commonplace for real life models.
    # Make sure your computer does not go to sleep, and take a well-deserved break!
    
    # Return statement for reusability
    return model, episode_rewards

baseline, rewards1 = train_cartpole_dqn()

Episode 1: Average Reward = 45.00, Exploration = 0.990
Episode 11: Average Reward = 12.10, Exploration = 0.895
Episode 21: Average Reward = 11.90, Exploration = 0.810
Episode 31: Average Reward = 8.30, Exploration = 0.732
Episode 41: Average Reward = 4.60, Exploration = 0.662
Episode 51: Average Reward = 4.50, Exploration = 0.599
Episode 61: Average Reward = 22.20, Exploration = 0.542
Episode 71: Average Reward = 19.80, Exploration = 0.490
Episode 81: Average Reward = 24.00, Exploration = 0.443
Episode 91: Average Reward = 54.50, Exploration = 0.401
Episode 101: Average Reward = 101.40, Exploration = 0.362
Episode 111: Average Reward = 152.80, Exploration = 0.328
Episode 121: Average Reward = 145.20, Exploration = 0.296
Episode 131: Average Reward = 140.30, Exploration = 0.268
Episode 141: Average Reward = 141.40, Exploration = 0.242
Episode 151: Average Reward = 138.40, Exploration = 0.219
Episode 161: Average Reward = 125.70, Exploration = 0.198
Episode 171: Average Reward = 138.60, 



Training complete! Model saved as Cartpole_model


# Explain how reinforcement learning concepts apply to the cartpole problem
Reinforcement learning is learning through interactions, analyzing the results, and adjusting its moves to maximize the rewards (Simonini, 2018). In the CartPole problem, the goal of the agent is to balance the pole upright on a moving cart for as long as possible. The action the agent makes is based on the current state variables, which describe how the environment looks at the moment. In this problem, these are the cart’s position, the cart’s current velocity, the angle of the pole, and the speed at which the pole is falling. The possible actions for the agent are to move left or right, and it receives rewards if the pole is balanced. To learn the best way to balance the pole, a Deep Q-Network (DQN) algorithm is used, which is a neural network that learns which actions are best in each state by predicting future rewards (Simonini, 2018).

# Analyze how experience replay is applied to the cartpole problem
Experience replay is a technique used in reinforcement learning algorithms that remembers past experiences and reuses them for training (Beysolow, 2019). In the CartPole problem, each time the agent moves the cart left or right, it saves the experience (state, action, reward, next_state, done) in memory. During training, the agent randomly picks experiences from memory to train its neural network. This helps the agent learn from many different situations, avoid depending only on recent moves, and improve its ability to keep the pole balanced longer.

The discount factor determines how much the agent values future rewards versus immediate rewards. Introducing this variable into the algorithm allows the agent to balance short-term gains with long-term goals, helping it plan actions that maximize total reward over time (Beysolow, 2019).

# Analyze how neural networks are used in deep Q-learning.
In this problem, the neural network has an input layer that takes the current state as input, processes this through two hidden layers with 64 neurons each, and outputs Q-values for each possible action. The agent then chooses the action that corresponds to the higher Q-value because it predicts a greater reward. Using a neural network also makes the Q-learning algorithm more efficient, because it can generalize from past experiences and predict Q-values for states it hasn’t seen before, instead of needing a table of every possible state and action (Beysolow, 2019). The learning rate controls how quickly the network updates its predictions. A high learning rate can make the agent learn faster but may be unstable, while a low learning rate makes learning slower but more stable. As seen below

# References
Beysolow, I. T. (2019). Applied reinforcement learning with python : With openai gym, tensorflow, and keras. Apress L. P..

Simonini, T. (2018, September 3). An introduction to Q-Learning: reinforcement learning. freeCodeCamp. https://www.freecodecamp.org/news/an-introduction-to-q-learning-reinforcement-learning-14ac0b4493cc/



In [3]:
# update exploration factor
explorationTest1, rewards1 = train_cartpole_dqn(EXPLORATION_DECAY = 0.100) # test low

explorationTest2, rewards2 = train_cartpole_dqn(EXPLORATION_DECAY = 0.999) # test high

Episode 1: Average Reward = 8.00, Exploration = 0.100
Episode 11: Average Reward = -1.90, Exploration = 0.010
Episode 21: Average Reward = -1.30, Exploration = 0.010
Episode 31: Average Reward = 0.70, Exploration = 0.010
Episode 41: Average Reward = -1.70, Exploration = 0.010
Episode 51: Average Reward = -0.80, Exploration = 0.010
Episode 61: Average Reward = -1.30, Exploration = 0.010
Episode 71: Average Reward = -0.50, Exploration = 0.010
Episode 81: Average Reward = -1.20, Exploration = 0.010
Episode 91: Average Reward = 0.20, Exploration = 0.010
Episode 101: Average Reward = 10.10, Exploration = 0.010
Episode 111: Average Reward = 8.20, Exploration = 0.010
Episode 121: Average Reward = 3.70, Exploration = 0.010
Episode 131: Average Reward = 2.80, Exploration = 0.010
Episode 141: Average Reward = 4.00, Exploration = 0.010
Episode 151: Average Reward = 3.80, Exploration = 0.010
Episode 161: Average Reward = 7.20, Exploration = 0.010
Episode 171: Average Reward = 28.70, Exploration = 



Training complete! Model saved as Cartpole_model
Episode 1: Average Reward = 12.00, Exploration = 0.999
Episode 11: Average Reward = 7.20, Exploration = 0.989
Episode 21: Average Reward = 19.70, Exploration = 0.979
Episode 31: Average Reward = 7.70, Exploration = 0.969
Episode 41: Average Reward = 18.10, Exploration = 0.960
Episode 51: Average Reward = 14.70, Exploration = 0.950
Episode 61: Average Reward = 17.40, Exploration = 0.941
Episode 71: Average Reward = 12.10, Exploration = 0.931
Episode 81: Average Reward = 11.70, Exploration = 0.922
Episode 91: Average Reward = 8.50, Exploration = 0.913
Episode 101: Average Reward = 9.80, Exploration = 0.904
Episode 111: Average Reward = 18.50, Exploration = 0.895
Episode 121: Average Reward = 22.10, Exploration = 0.886
Episode 131: Average Reward = 24.10, Exploration = 0.877
Episode 141: Average Reward = 22.10, Exploration = 0.868
Episode 151: Average Reward = 19.20, Exploration = 0.860
Episode 161: Average Reward = 14.40, Exploration = 0.8



Training complete! Model saved as Cartpole_model


# Exploration observation
With low exploration (0.1), the agent eventually learns a good strategy and gets very high rewards. With high exploration (0.999), the agent keeps trying many random actions, so rewards stay consistent and never reach the same high values. This shows that too much exploration can prevent the agent from maximizing its performance. Neither test get an average reward of 195 over 100 games.

In [4]:
#update gamma
gammaTest, rewards1 = train_cartpole_dqn(GAMMA=0.0) # test low

gammaTest2, rewards2 = train_cartpole_dqn(GAMMA=0.9)

Episode 1: Average Reward = 26.00, Exploration = 0.990
Episode 11: Average Reward = 6.90, Exploration = 0.895
Episode 21: Average Reward = 18.00, Exploration = 0.810
Episode 31: Average Reward = 5.00, Exploration = 0.732
Episode 41: Average Reward = 4.60, Exploration = 0.662
Episode 51: Average Reward = 8.20, Exploration = 0.599
Episode 61: Average Reward = 5.80, Exploration = 0.542
Episode 71: Average Reward = 2.60, Exploration = 0.490
Episode 81: Average Reward = 1.70, Exploration = 0.443
Episode 91: Average Reward = -0.10, Exploration = 0.401
Episode 101: Average Reward = 0.30, Exploration = 0.362
Episode 111: Average Reward = 1.00, Exploration = 0.328
Episode 121: Average Reward = -0.70, Exploration = 0.296
Episode 131: Average Reward = 8.20, Exploration = 0.268
Episode 141: Average Reward = 9.00, Exploration = 0.242
Episode 151: Average Reward = 15.00, Exploration = 0.219
Episode 161: Average Reward = 6.10, Exploration = 0.198
Episode 171: Average Reward = 15.60, Exploration = 0.1



Training complete! Model saved as Cartpole_model
Episode 1: Average Reward = 18.00, Exploration = 0.990
Episode 11: Average Reward = 10.60, Exploration = 0.895
Episode 21: Average Reward = 5.00, Exploration = 0.810
Episode 31: Average Reward = 8.30, Exploration = 0.732
Episode 41: Average Reward = 3.20, Exploration = 0.662
Episode 51: Average Reward = 5.30, Exploration = 0.599
Episode 61: Average Reward = 1.70, Exploration = 0.542
Episode 71: Average Reward = 0.60, Exploration = 0.490
Episode 81: Average Reward = 3.00, Exploration = 0.443
Episode 91: Average Reward = 12.60, Exploration = 0.401
Episode 101: Average Reward = 12.70, Exploration = 0.362
Episode 111: Average Reward = 28.70, Exploration = 0.328
Episode 121: Average Reward = 54.50, Exploration = 0.296
Episode 131: Average Reward = 102.90, Exploration = 0.268
Episode 141: Average Reward = 349.80, Exploration = 0.242
Episode 151: Average Reward = 295.50, Exploration = 0.219
Episode 161: Average Reward = 430.50, Exploration = 0.



Solved at episode 178: Average reward over last 100: 196.08
Training complete! Model saved as Cartpole_model


# Gamma observation
With low gamma (0), the agent only considers immediate rewards, so it fails to plan for keeping the pole balanced over time. As a result, average rewards remain low and inconsistent. With high gamma (0.9), the agent accounts for future rewards and learned strategies that kept the pole upright longer. This allowed the agent to solve CartPole by episode 178, achieving an average reward of 196 over the last 100 episodes, something it could not do with the lower gamma value.

In [5]:
#update learning rate
learningRate, rewards1 = train_cartpole_dqn(LEARNING_RATE=0.0001) # test low

learningRate2, rewards1 = train_cartpole_dqn(LEARNING_RATE=0.1) # test high

Episode 1: Average Reward = 14.00, Exploration = 0.990
Episode 11: Average Reward = 15.30, Exploration = 0.895
Episode 21: Average Reward = 14.00, Exploration = 0.810
Episode 31: Average Reward = 7.50, Exploration = 0.732
Episode 41: Average Reward = 2.40, Exploration = 0.662
Episode 51: Average Reward = 4.00, Exploration = 0.599
Episode 61: Average Reward = 2.30, Exploration = 0.542
Episode 71: Average Reward = 8.00, Exploration = 0.490
Episode 81: Average Reward = 9.40, Exploration = 0.443
Episode 91: Average Reward = 5.60, Exploration = 0.401
Episode 101: Average Reward = 8.60, Exploration = 0.362
Episode 111: Average Reward = 15.40, Exploration = 0.328
Episode 121: Average Reward = 12.20, Exploration = 0.296
Episode 131: Average Reward = 5.20, Exploration = 0.268
Episode 141: Average Reward = 6.00, Exploration = 0.242
Episode 151: Average Reward = 6.30, Exploration = 0.219
Episode 161: Average Reward = 5.10, Exploration = 0.198
Episode 171: Average Reward = 5.00, Exploration = 0.17



Training complete! Model saved as Cartpole_model
Episode 1: Average Reward = 1.00, Exploration = 0.990
Episode 11: Average Reward = 12.90, Exploration = 0.895
Episode 21: Average Reward = 16.10, Exploration = 0.810
Episode 31: Average Reward = 14.30, Exploration = 0.732
Episode 41: Average Reward = 13.90, Exploration = 0.662
Episode 51: Average Reward = 9.20, Exploration = 0.599
Episode 61: Average Reward = 8.50, Exploration = 0.542
Episode 71: Average Reward = 14.90, Exploration = 0.490
Episode 81: Average Reward = 8.50, Exploration = 0.443
Episode 91: Average Reward = 6.00, Exploration = 0.401
Episode 101: Average Reward = 18.60, Exploration = 0.362
Episode 111: Average Reward = 33.00, Exploration = 0.328
Episode 121: Average Reward = 59.20, Exploration = 0.296
Episode 131: Average Reward = 33.30, Exploration = 0.268
Episode 141: Average Reward = 35.40, Exploration = 0.242
Episode 151: Average Reward = 42.30, Exploration = 0.219
Episode 161: Average Reward = 85.10, Exploration = 0.19



Training complete! Model saved as Cartpole_model


# Learning rate observation
With a very low learning rate (0.0001), the agent learns very slowly, and average rewards remain low or even decline over time, showing it struggles to solve the problem. With a high learning rate (0.1), the agent learns much faster and achieves higher rewards, but its performance is inconsistent. In neither case was the agent able to achieve an average score of 195 over 100 episodes.