In [67]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers


In [68]:
# Create the CartPole environment and read the sizes the networks depend on:
# the observation shape feeds the first layer of each model, and the number
# of discrete actions sizes the policy output.
env = gym.make('CartPole-v1')
state_shape, num_actions = env.observation_space.shape, env.action_space.n


In [69]:
# Actor (policy) network: maps a state to a probability distribution over
# the `num_actions` discrete actions via a softmax output layer, after two
# hidden ReLU layers of 24 units each.
actor = tf.keras.Sequential()
actor.add(layers.Dense(units=24, activation='relu', input_shape=state_shape))
actor.add(layers.Dense(units=24, activation='relu'))
actor.add(layers.Dense(units=num_actions, activation='softmax'))


In [70]:
# Critic (value) network: maps a state to a single unbounded scalar (linear
# output, no activation), after two hidden ReLU layers of 24 units each.
critic = tf.keras.Sequential()
critic.add(layers.Dense(units=24, activation='relu', input_shape=state_shape))
critic.add(layers.Dense(units=24, activation='relu'))
critic.add(layers.Dense(units=1))


In [71]:
# One Adam optimizer per network; the critic uses a learning rate ten times
# larger than the actor's (1e-2 vs 1e-3). Created in actor-then-critic order.
actor_optimizer, critic_optimizer = (
    tf.keras.optimizers.Adam(learning_rate=step_size)
    for step_size in (1e-3, 1e-2)
)


In [None]:
# Training loop: online one-step actor-critic (TD(0)).
# After every environment step the critic is regressed towards the
# bootstrapped target r + gamma * V(s'), and the actor is updated along
# -log pi(a|s) * td_error, with the TD error treated as a constant.
gamma = 0.99      # discount factor
log_eps = 1e-8    # keeps log() finite if a probability underflows to 0
log_every = 50    # number of episodes between progress reports

for episode in range(1000):
    # gym >= 0.26 (and gymnasium) return (observation, info) from reset();
    # older gym versions return the bare observation.
    reset_result = env.reset()
    observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    # The networks need a numeric batch of shape (1, *state_shape); an
    # object-dtype array cannot be converted to a tensor.
    state = np.asarray(observation, dtype=np.float32).reshape((1, *state_shape))

    done = False
    episode_reward = 0.0

    while not done:
        # Sample an action from the current policy.
        action_probs = actor(state)
        action = int(np.random.choice(num_actions, p=action_probs.numpy()[0]))

        # Step the environment, accepting both the 5-tuple (terminated,
        # truncated) and the legacy 4-tuple (done, info) return signatures.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_observation, reward, terminated, truncated, _ = step_result
        else:
            next_observation, reward, step_done, info = step_result
            truncated = bool(info.get('TimeLimit.truncated', False))
            terminated = bool(step_done) and not truncated
        done = bool(terminated or truncated)
        episode_reward += float(reward)

        next_state = np.asarray(next_observation, dtype=np.float32).reshape((1, *state_shape))

        # Critic update (semi-gradient TD): the bootstrapped target is held
        # constant so gradients only flow through V(state). Bootstrapping is
        # cut only on true termination, not on time-limit truncation.
        with tf.GradientTape() as tape:
            value = critic(state)
            next_value = tf.stop_gradient(critic(next_state))
            target = reward + gamma * next_value * (1.0 - float(terminated))
            td_error = target - value
            critic_loss = tf.reduce_mean(tf.square(td_error))

        grads = tape.gradient(critic_loss, critic.trainable_variables)
        critic_optimizer.apply_gradients(zip(grads, critic.trainable_variables))

        # Actor update: policy-gradient step weighted by the TD error, which
        # acts as a fixed advantage estimate (no gradient into the critic).
        advantage = tf.stop_gradient(td_error)
        with tf.GradientTape() as tape:
            action_probs = actor(state)
            log_prob = tf.math.log(action_probs[0, action] + log_eps)
            actor_loss = tf.reduce_mean(-log_prob * advantage)

        grads = tape.gradient(actor_loss, actor.trainable_variables)
        actor_optimizer.apply_gradients(zip(grads, actor.trainable_variables))

        state = next_state  # Move to the next state

    # Periodic progress report instead of one line per episode.
    if (episode + 1) % log_every == 0:
        print(f"Episode {episode + 1}: total reward {episode_reward:.0f}")
