In [91]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import random
from tqdm import tqdm 

In [92]:
# Environment setup
env = gym.make('CartPole-v1')
num_actions = env.action_space.n
state_shape = env.observation_space.shape



In [93]:
# Neural network model
model = tf.keras.Sequential([
    layers.Dense(24, activation='relu', input_shape=state_shape),
    layers.Dense(24, activation='relu'),
    layers.Dense(num_actions, activation='linear')
])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='mse')


In [94]:
# Parameters
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 0.995
replay_buffer = []


In [95]:
# Training loop with tqdm progress bar
for episode in tqdm(range(1000), desc="Training Episodes"):
    state = env.reset()

    print(f"Initial state type: {type(state)}, state: {state}")

    if isinstance(state, tuple):
        state = np.array(state[0])
    else:
        state = np.array(state)
    
    print(f"State after conversion to array: {state}, shape: {state.shape}")  # Debugging line

    done = False
    while not done:
        if np.random.rand() < epsilon:
            action = np.random.choice(num_actions)
        else:
            action = np.argmax(model.predict(np.expand_dims(state, axis=0), verbose=0))

        next_state, reward, done, info, _ = env.step(action)
        next_state = np.array(next_state)  
        replay_buffer.append((state, action, reward, next_state, done))

        # Experience replay
        if len(replay_buffer) > 32:
            minibatch = random.sample(replay_buffer, 32)  
            for state_mb, action_mb, reward_mb, next_state_mb, done_mb in minibatch:
                target = reward_mb
                if not done_mb:
                    target += gamma * np.max(model.predict(np.expand_dims(next_state_mb, axis=0), verbose=0))
                target_f = model.predict(np.expand_dims(state_mb, axis=0), verbose=0)
                target_f[0][action_mb] = target
                model.fit(np.expand_dims(state_mb, axis=0), target_f, epochs=1, verbose=0)

        state = next_state

    # Epsilon decay
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

  if not isinstance(terminated, (bool, np.bool8)):


Initial state type: <class 'tuple'>, state: (array([ 0.04459556, -0.0010378 , -0.01434693, -0.03819619], dtype=float32), {})
State after conversion to array: [ 0.04459556 -0.0010378  -0.01434693 -0.03819619], shape: (4,)
Initial state type: <class 'tuple'>, state: (array([ 0.04766874,  0.00661476, -0.03978913,  0.03722138], dtype=float32), {})
State after conversion to array: [ 0.04766874  0.00661476 -0.03978913  0.03722138], shape: (4,)


Training Episodes:   0%|          | 1/1000 [00:38<10:43:20, 38.64s/it]


KeyboardInterrupt: 