In [34]:
import numpy as np
import tensorflow as tf
import gym
import random
from collections import namedtuple,deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input
from tensorflow.keras.optimizers import Adam


In [35]:
def get_experiences(training_data, mini_batch_size):
    """Draw a random mini-batch from the replay buffer as float32 tensors.

    Args:
        training_data: Sequence of experience records exposing ``state``,
            ``action``, ``next_state``, ``reward`` and ``done`` attributes.
        mini_batch_size: Number of records to draw (without replacement).

    Returns:
        Tuple ``(states, actions, next_states, rewards, dones)`` of
        ``tf.float32`` tensors, one entry per sampled record. ``None``
        records in the sample are dropped before conversion.
    """
    sampled = [
        record
        for record in random.sample(training_data, mini_batch_size)
        if record is not None
    ]

    def column(field):
        # Gather one attribute across the whole batch into a NumPy array.
        return np.array([getattr(record, field) for record in sampled])

    def as_float_tensor(values):
        return tf.convert_to_tensor(values, dtype=tf.float32)

    states = as_float_tensor(column("state"))
    actions = as_float_tensor(column("action"))
    next_states = as_float_tensor(column("next_state"))
    rewards = as_float_tensor(column("reward"))
    # Booleans go through uint8 first so the tensor holds 0.0 / 1.0 flags.
    dones = as_float_tensor(column("done").astype(np.uint8))
    return (states, actions, next_states, rewards, dones)

In [36]:
def play_move(q_values, epsilon):
    """Choose an action epsilon-greedily from Q-values predicted for one state.

    Args:
        q_values: Q-values with a leading batch axis of size one; only row 0
            is used. May be a tensor exposing ``.numpy()`` or anything
            ``np.asarray`` accepts.
        epsilon: Exploration threshold. The greedy action is taken when a
            uniform draw from ``[0, 1)`` exceeds ``epsilon``; otherwise a
            uniformly random action is returned.

    Returns:
        int: Index of the chosen action, in ``range(num_actions)``.
    """
    values = q_values.numpy() if hasattr(q_values, "numpy") else np.asarray(q_values)
    action_values = values[0]
    if random.random() > epsilon:
        return int(np.argmax(action_values))
    # Explore: the number of actions comes from the Q-value row rather than a
    # hard-coded 2, and randrange avoids handing a NumPy array to random.choice.
    return random.randrange(len(action_values))

In [37]:
def q_loss(q_network, tq_network, mini_batch, gamma):
    """Mean squared TD error of ``q_network`` on one mini-batch.

    Args:
        q_network: Network being trained; called on the batch of states.
        tq_network: Target network; called on the batch of next states.
        mini_batch: Tuple ``(states, actions, next_states, rewards, dones)``.
        gamma: Discount factor applied to the bootstrapped next-state value.

    Returns:
        Scalar tensor: mean of the squared difference between the Q-value of
        the action taken and ``reward + (1 - done) * gamma * max Q_target``.
    """
    states, actions, next_states, rewards, dones = mini_batch

    # Bootstrapped target: the best next-state value according to the target
    # network, zeroed out for transitions that ended the episode.
    best_next_value = tf.reduce_max(tq_network(next_states), axis=1)
    not_done = 1.0 - dones
    td_targets = rewards + not_done * gamma * best_next_value

    # Pick, for every row, the predicted Q-value of the action actually taken.
    predicted = q_network(states)
    rows = tf.range(tf.shape(predicted)[0])
    cols = tf.cast(actions, tf.int32)
    taken_index = tf.stack([rows, cols], axis=1)
    chosen_value = tf.gather_nd(predicted, taken_index)

    td_error = chosen_value - td_targets
    return tf.reduce_mean(tf.square(td_error))


In [38]:
# Soft update of the target network towards the online Q-network.
def update_networks(q_network, tq_network, tau):
    """Blend the online weights into the target network in place.

    Every target weight array becomes ``(1 - tau) * target + tau * online``.

    Args:
        q_network: Source network; only ``get_weights()`` is called on it.
        tq_network: Target network; read with ``get_weights()`` and
            overwritten with ``set_weights()``.
        tau: Interpolation factor (``1`` copies the online weights, ``0``
            leaves the target unchanged).

    Returns:
        The same ``tq_network`` object, after the update.
    """
    online_weights = q_network.get_weights()
    target_weights = tq_network.get_weights()
    keep = 1 - tau
    blended = []
    for target_w, online_w in zip(target_weights, online_weights):
        blended.append(keep * target_w + tau * online_w)
    tq_network.set_weights(blended)
    return tq_network

In [39]:

def train(q_network, tq_network, gamma, update_interval, mini_batch_size, training_data,
          iterations=10000, e=1, e_min=0.01, e_decay=0.995, tau=1e-3, max_steps=300):
    """Train ``q_network`` with experience replay and a soft-updated target network.

    Relies on module-level names defined elsewhere in the notebook: ``env``
    (the environment), ``optimizer`` and the ``experiences`` namedtuple, plus
    the helpers ``play_move``, ``get_experiences``, ``q_loss`` and
    ``update_networks``.

    Args:
        q_network: Online network; used to pick actions and updated in place.
        tq_network: Target network; soft-updated after every gradient step.
        gamma: Discount factor forwarded to ``q_loss``.
        update_interval: A gradient step is taken on episode steps whose index
            is a multiple of this value.
        mini_batch_size: Number of experiences sampled per gradient step.
        training_data: Replay buffer; every new transition is appended to it.
        iterations: Maximum number of episodes to run.
        e: Initial epsilon for the epsilon-greedy policy.
        e_min: Lower bound for epsilon.
        e_decay: Multiplicative epsilon decay applied after each episode.
        tau: Soft-update factor forwarded to ``update_networks``.
        max_steps: Cap on the number of steps per episode.

    Returns:
        The same ``q_network`` object after training.
    """
    scores = []
    for iteration in range(1, iterations + 1):
        # Start a fresh episode.
        state, _ = env.reset()
        score = 0
        for step in range(max_steps):
            # The network expects a batch axis, so wrap the single state.
            q_values = q_network(np.expand_dims(state, axis=0))
            action = play_move(q_values, e)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            training_data.append(experiences(state, action, next_state, reward, done))
            score += reward

            # Skip the update until the buffer can supply a full mini-batch;
            # sampling more items than it holds would raise ValueError.
            if step % update_interval == 0 and len(training_data) >= mini_batch_size:
                mini_batch = get_experiences(training_data, mini_batch_size)
                with tf.GradientTape() as tape:
                    loss = q_loss(q_network, tq_network, mini_batch, gamma)
                grads = tape.gradient(loss, q_network.trainable_variables)
                optimizer.apply_gradients(zip(grads, q_network.trainable_variables))
                update_networks(q_network, tq_network, tau)

            if done:
                break
            state = next_state

        scores.append(score)
        if iteration % 100 == 0:
            avg_score = np.mean(scores[-100:])
            print(f"average score after {iteration} iterations is", avg_score)
            if avg_score > 195:
                # Report success before stopping early.
                print("cartpole problem solved :)")
                break
        e = max(e * e_decay, e_min)

    return q_network

In [40]:
# Create the Gym environment. `env` is read as a module-level global by the
# random data-collection cell and by train().
env = gym.make("CartPole-v1")

In [41]:
# Seed the replay buffer with transitions gathered by a uniformly random policy.
experiences = namedtuple("experiences", ['state', 'action', 'next_state', 'reward', 'done'])
training_data = deque(maxlen=10000)

episodes = 256
for episode in range(1, episodes + 1):
    state, _ = env.reset()
    done = False
    while not done:
        action = random.choice([0, 1])
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # Record every step of the episode (not just the last one) and advance
        # `state`, so each stored transition pairs a state with its true successor.
        training_data.append(experiences(state, action, next_state, reward, done))
        state = next_state


In [42]:
# Shape of a single observation, read from the first stored experience; it is
# used as the input shape of the networks.
input_shape_q = training_data[0].state.shape

In [43]:

# Initializing our neural networks. All three share one architecture, so it is
# defined once instead of being copy-pasted.
def build_q_network(input_shape, num_actions=2, hidden_units=64):
    """Build a fully connected Q-network.

    Args:
        input_shape: Shape of a single observation (without the batch axis).
        num_actions: Number of output units, one Q-value per action.
        hidden_units: Width of each of the two ReLU hidden layers.

    Returns:
        A freshly initialized ``Sequential`` model with linear outputs.
    """
    return Sequential([
        Input(shape=input_shape),
        tf.keras.layers.Dense(hidden_units, activation='relu'),
        tf.keras.layers.Dense(hidden_units, activation='relu'),
        tf.keras.layers.Dense(num_actions, activation='linear'),
    ])


q_network = build_q_network(input_shape_q)        # online network, updated by train()
tq_network = build_q_network(input_shape_q)       # target network, used for the TD targets
final_q_network = build_q_network(input_shape_q)  # receives the trained weights and is saved
optimizer = Adam(learning_rate=1e-3)


In [44]:
# Start the target network from the same weights as the online Q-network, so
# the two identically shaped models agree before training begins.
tq_network.set_weights(q_network.get_weights())

In [None]:
# Hyperparameters
gamma = 0.98            # discount factor
mini_batch_size = 64    # experiences sampled per gradient step
update_interval = 5     # take a gradient step every this many episode steps
num_p_av = 100          # not referenced by any other cell shown in this notebook
tau = 1e-3              # soft-update factor for the target network
e = 1                   # initial epsilon (exploration rate)
e_min = 0.01            # epsilon floor; matches the value passed to train()
e_decay = 0.995         # multiplicative epsilon decay per episode

In [None]:
# Train the online network using the hyperparameters declared in the
# hyperparameter cell (rather than repeating their values as literals), then
# copy the trained weights into final_q_network.
trained_q_network = train(
    q_network,
    tq_network,
    gamma,
    update_interval,
    mini_batch_size,
    training_data,
    iterations=10000,
    e=e,
    e_min=e_min,
    e_decay=e_decay,
    tau=tau,
)
final_q_network.set_weights(trained_q_network.get_weights())


average score after 100 iterations is 18.68
average score after 200 iterations is 13.62
average score after 300 iterations is 11.23
average score after 400 iterations is 10.38
average score after 500 iterations is 10.2
average score after 600 iterations is 10.51
average score after 700 iterations is 11.3
average score after 800 iterations is 14.24
average score after 900 iterations is 58.71
average score after 1000 iterations is 128.03
average score after 1100 iterations is 189.23
average score after 1200 iterations is 288.29


In [49]:
# Persist the trained network; the path is relative to the working directory.
final_q_network.save("Q_network_cartpole.keras")

# Final Words

Finally, we have our **final_q_network**, which was trained for 1,200 iterations (episodes). Over its last 100 episodes it achieves an average score of **288**, which is well above the threshold of 195 that this notebook uses to consider the CartPole problem solved. Although this is a simple and foundational reinforcement learning problem, it still effectively demonstrates the application of core concepts.

## Summary

We defined the following essential functions:

1. **get_experiences** – Returns a minibatch of experience samples to train the network.  
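2. **play_move** – Chooses an action with an epsilon-greedy policy: the action with the highest predicted Q-value is taken with probability $1 - \epsilon$, and a uniformly random action otherwise.  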
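3. **q_loss** – Calculates the mean squared error between the Q-network's predicted value for the action taken and the TD target $r + \gamma\,(1 - \text{done})\,\max_{a'} Q_{\text{target}}(s', a')$, which is computed with the target Q-network.  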
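4. **update_networks** – Soft-updates the target network: each target weight becomes $(1 - \tau)\,\theta_{\text{target}} + \tau\,\theta_{Q}$. `train` calls it right after each gradient step, i.e. every `update_interval` steps within an episode.  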
5. **train** – Runs the training loop for up to a specified number of iterations (episodes; at most 10,000 in our case, although training stopped after 1,200 once the average score exceeded 195). At each step, it chooses an action based on the current Q-network, stores the resulting experience in the training data, and periodically updates the Q-network so that its predictions move closer to the targets computed from the target Q-network.

## Working and Observations

We first set up the environment and ran 256 episodes with random actions to collect the initial training examples that bootstrap the training process. Then we started training the model.

After experimenting with hyperparameters to speed up convergence, I noticed that the network took nearly 1000 iterations before meaningful learning began due to the initial 256 examples collected via random actions. Since these random examples generally yield low rewards and short episodes, they don’t dramatically impact total training time but do affect early learning speed.

The hyperparameters used are detailed separately. Feel free to modify them to see how they affect the model’s convergence and overall performance.
