# Cart Pole Balancing

In [25]:
import gym
import numpy as np
import random
from collections import defaultdict
import time

class QLearningAgent:
    def __init__(self, action_space, state_space, alpha, epsilon, gamma, decay_rate, min_epsilon):
        self.action_space = action_space
        self.state_space = state_space
        self.alpha = alpha
        self.epsilon = epsilon
        self.gamma = gamma
        self.decay_rate = decay_rate
        self.min_epsilon = min_epsilon
        self.q_table = defaultdict(lambda: np.zeros(self.action_space))
        self.all_losses = []
        self.win_rates = []
        self.win_counts = []
        self.all_rewards = []
        self.optimal_policy = {}
  
    def select_action(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(range(self.action_space)) # Exploration
        else:
            return np.argmax(self.q_table[state]) # Exploitation
    
    def update_q_value(self, state, action, reward, next_state):
        best_next_action = np.argmax(self.q_table[next_state])
        TD_target = reward + self.gamma * self.q_table[next_state][best_next_action]
        TD_error = TD_target - self.q_table[state][action]
        self.q_table[state][action] += self.alpha * TD_error
        loss = TD_error**2
        self.all_losses.append(loss)

    def decay_epsilon(self):
        if self.epsilon > self.min_epsilon:
            self.epsilon *= self.decay_rate
        else:
            self.epsilon = self.min_epsilon
            
    def reset(self):
        self.q_table = defaultdict(lambda: np.zeros(self.action_space))
        
    def get_q_table(self):
        return dict(self.q_table)


def get_discrete_state(state, bins):
    discrete_state = [np.digitize(s, bins[i]) for i, s in enumerate(state)]
    return tuple(discrete_state)


env = gym.make("CartPole-v1")

alpha = 0.1
gamma = 0.95
episodes = 60000

epsilon = 1.0
epsilon_decay_value = 0.99995
min_epsilon = 0.01
decay_rate = 0.99

# Define bins for discretizing the state space
n_bins = (6, 12, 6, 12)
bins = [
    np.linspace(-4.8, 4.8, n_bins[0] - 1),
    np.linspace(-4, 4, n_bins[1] - 1),
    np.linspace(-0.418, 0.418, n_bins[2] - 1),
    np.linspace(-4, 4, n_bins[3] - 1)
]

agent = QLearningAgent(
    action_space=env.action_space.n,
    state_space=env.observation_space.shape[0],
    alpha=alpha,
    epsilon=epsilon,
    gamma=gamma,
    decay_rate=epsilon_decay_value,
    min_epsilon=min_epsilon
)

for episode in range(episodes):
    state = env.reset()
    state = get_discrete_state(state, bins)
    done = False
    episode_reward = 0
    
    while not done:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = get_discrete_state(next_state, bins)
        episode_reward += reward
        
        if not done:
            agent.update_q_value(state, action, reward, next_state)
            state = next_state
        else:
            agent.update_q_value(state, action, reward, next_state)
    
    agent.decay_epsilon()

    if episode % 1000 == 0:
        print(f'Episode {episode}: Epsilon = {agent.epsilon:.2f}, Reward = {episode_reward}')

env.close()

# Testing the trained agent
for episode in range(5):
    state = env.reset()
    state = get_discrete_state(state, bins)
    total_reward = 0
    
    for step in range(200):
        env.render()
        action = agent.select_action(state)
        state, reward, done, _ = env.step(action)
        state = get_discrete_state(state, bins)
        total_reward += reward
        if done:
            break

    print(f'Episode {episode + 1}: Total Reward = {total_reward}')
    time.sleep(1)

env.close()


Original state: [ 0.04136096 -0.01405612  0.00830409  0.045968  ]
Discrete state: [3, 5, 3, 6]


ValueError: too many values to unpack (expected 4)