In [1]:
!pip install gym tensorflow




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import gym
import numpy as np
import random
import math # Import the math library
# Initialize environment
env = gym.make("CartPole-v1")

# Seed every source of randomness the training loop draws from, so that
# Restart Kernel -> Run All reproduces the same episode rewards.
SEED = 42
random.seed(SEED)            # drives the epsilon-greedy draw (random.uniform)
env.action_space.seed(SEED)  # drives env.action_space.sample()
env.reset(seed=SEED)         # seeds the environment; later unseeded reset() calls keep using this RNG

# Hyperparameters
learning_rate = 0.1        # step size of each Q-value update
discount_factor = 0.99     # weight of the bootstrapped next-state value
exploration_rate = 1.0     # initial epsilon: start fully random
exploration_decay = 0.995  # epsilon is multiplied by this after every episode
exploration_min = 0.01     # epsilon never decays below this floor
episodes = 1000
max_steps = 100            # per-episode step cap applied by the training loop

# Discretize the state space
# One bin count per observation dimension. Per the Gym CartPole docs the
# observation is [cart position, cart velocity, pole angle, pole angular velocity].
bins = [20, 20, 50, 50]
# Dimensions 0 and 2 take their limits from the observation space; dimensions
# 1 and 3 use hand-picked finite limits (+/-0.5 and +/-50 degrees in radians)
# instead of the observation space's own bounds.
lower_bounds = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -math.radians(50)]
upper_bounds = [env.observation_space.high[0], 0.5, env.observation_space.high[2], math.radians(50)]

# Create a Q-table with zeros: shape is (*bins, number_of_actions).
# NOTE(review): 20*20*50*50 states x 2 actions = 2,000,000 entries, while
# episodes * max_steps allows at most 100,000 updates -- the table is very
# sparsely visited. Consider coarser bins or more episodes if rewards stay flat.
q_table = np.zeros(bins + [env.action_space.n])
# Discretization function for continuous state space
def discretize_state(state, lower=None, upper=None, n_bins=None):
  """Map a continuous observation onto a tuple of integer bin indices.

  Each component is linearly rescaled from ``[lower[i], upper[i]]`` onto
  ``[0, n_bins[i] - 1]``, rounded to the nearest integer, and clamped into
  that range, so values outside the bounds land in the first or last bin.

  Args:
    state: Sequence of numeric observation components.
    lower: Per-dimension lower bounds. Defaults to the notebook-level
      ``lower_bounds``.
    upper: Per-dimension upper bounds. Defaults to the notebook-level
      ``upper_bounds``.
    n_bins: Per-dimension bin counts. Defaults to the notebook-level ``bins``.

  Returns:
    A tuple of ints, one per component of ``state``, usable directly as an
    index into the Q-table.
  """
  # Fall back to the notebook configuration at call time, so existing
  # single-argument callers behave exactly as before.
  lower = lower_bounds if lower is None else lower
  upper = upper_bounds if upper is None else upper
  n_bins = bins if n_bins is None else n_bins

  discretized_state = []
  for i, value in enumerate(state):
    last_bin = n_bins[i] - 1
    # Fraction of the way from the lower to the upper bound (may fall
    # outside [0, 1] when the value exceeds the configured limits).
    scaling = float((value - lower[i]) / (upper[i] - lower[i]))
    index = int(round(last_bin * scaling))
    # Clamp out-of-range values into the outermost bins.
    discretized_state.append(min(last_bin, max(0, index)))
  return tuple(discretized_state)
# Q-learning algorithm
# For each episode: act epsilon-greedily, apply the one-step Q-learning
# update after every transition, then decay epsilon once per episode.
for episode in range(episodes):
  # Reset environment; newer Gym versions return (observation, info).
  state = env.reset()
  if isinstance(state, tuple):
    state = state[0]  # Extract state from tuple if needed
  current_state = discretize_state(state)
  total_reward = 0

  for step in range(max_steps):
    # Exploration vs Exploitation
    if random.uniform(0, 1) < exploration_rate:
      action = env.action_space.sample()  # Explore
    else:
      action = np.argmax(q_table[current_state])  # Exploit best action

    # Perform action and observe the result
    next_state_raw, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated  # Combine both end conditions
    next_state = discretize_state(next_state_raw)
    total_reward += reward

    # Update Q-table.
    # A terminated transition has no future return, so the target is just the
    # reward. Bootstrapping from the terminal observation's bin would leak
    # value into a bin that live states can share. A time-limit truncation
    # is not a true terminal state, so it still bootstraps.
    if terminated:
      td_target = reward
    else:
      td_target = reward + discount_factor * np.max(q_table[next_state])
    state_action = current_state + (action,)
    q_table[state_action] += learning_rate * (td_target - q_table[state_action])

    current_state = next_state
    if done:
      break

  # Decay exploration rate
  exploration_rate = max(exploration_min, exploration_rate * exploration_decay)
  if episode % 100 == 0:
    print(f"Episode: {episode}, Total Reward: {total_reward}")
env.close()

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Total Reward: 36.0
Episode: 100, Total Reward: 24.0
Episode: 200, Total Reward: 9.0
Episode: 300, Total Reward: 26.0
Episode: 400, Total Reward: 11.0
Episode: 500, Total Reward: 12.0
Episode: 600, Total Reward: 27.0
Episode: 700, Total Reward: 14.0
Episode: 800, Total Reward: 12.0
Episode: 900, Total Reward: 33.0
