# Continuous Q Learning

In [1]:
import numpy as np 
import time 
import gymnasium as gym

In [3]:
# Create the CartPole environment and request on-screen ("human") rendering.
# NOTE(review): on-screen rendering presumably slows down the long training loop
# further below; consider training without rendering - confirm.
env = gym.make("CartPole-v1", render_mode="human")

In [8]:
# CartPole observations are continuous, so they cannot index a Q-table directly.
# To work around this, every observation dimension is split into bins (classes).

def create_bins(number_of_bins_per_observation=10):
    """Build the bin edges used to discretize each observation dimension.

    Parameters
    ----------
    number_of_bins_per_observation : int, default 10
        Number of evenly spaced edges generated for every observation dimension.

    Returns
    -------
    np.ndarray
        Array with one row of edges per dimension, in the order: cart position,
        cart velocity, pole angle, pole angular velocity.
    """
    # (low, high) interval spanned by the edges of each observation dimension.
    observation_ranges = (
        (-4.8, 4.8),      # cart position
        (-5, 5),          # cart velocity
        (-0.418, 0.418),  # pole angle
        (-5, 5),          # pole angular velocity
    )
    edges_per_dimension = [
        np.linspace(low, high, number_of_bins_per_observation)
        for low, high in observation_ranges
    ]
    return np.array(edges_per_dimension)

In [9]:
# Number of bin edges per observation dimension, and the edges themselves.
BIN_NUM = 10
BINS = create_bins(number_of_bins_per_observation=BIN_NUM)

In [10]:
# Display the bin edges (one row per observation dimension).
BINS

array([[-4.8       , -3.73333333, -2.66666667, -1.6       , -0.53333333,
         0.53333333,  1.6       ,  2.66666667,  3.73333333,  4.8       ],
       [-5.        , -3.88888889, -2.77777778, -1.66666667, -0.55555556,
         0.55555556,  1.66666667,  2.77777778,  3.88888889,  5.        ],
       [-0.418     , -0.32511111, -0.23222222, -0.13933333, -0.04644444,
         0.04644444,  0.13933333,  0.23222222,  0.32511111,  0.418     ],
       [-5.        , -3.88888889, -2.77777778, -1.66666667, -0.55555556,
         0.55555556,  1.66666667,  2.77777778,  3.88888889,  5.        ]])

In [14]:
# Initialize the Q-table with zeros: one axis of length BIN_NUM for each of the
# four observation dimensions, plus a final axis with one entry per action.
q_table_shape = (BIN_NUM,) * 4 + (env.action_space.n,)
q_table = np.zeros(shape=q_table_shape)
q_table.shape

(10, 10, 10, 10, 2)

In [11]:
# Map every continuous observation to the index of the bin it falls into,
# which discretizes the observation space.
def discetize_observation(observations, bins):
    """Convert a continuous observation vector into a tuple of bin indices.

    Parameters
    ----------
    observations : sequence of float
        One value per observation dimension.
    bins : sequence of 1-D arrays
        ``bins[i]`` holds the increasing bin edges for ``observations[i]``.

    Returns
    -------
    tuple of int
        One index per observation, each in ``[0, len(bins[i]) - 1]``, so the
        tuple can index an array whose axis ``i`` has length ``len(bins[i])``.
    """
    bined_observations = []

    for i, observation in enumerate(observations):
        # Use the `bins` argument (the original indexed the builtin `bin`).
        bin_index = np.digitize(observation, bins[i])
        # np.digitize returns len(edges) for values at or above the last edge;
        # clamp so the index never runs past an axis of length len(edges).
        last_index = len(bins[i]) - 1
        bined_observations.append(int(min(bin_index, last_index)))
    return tuple(bined_observations)

In [12]:
# Hyperparameters of the Q-learning algorithm
ALPHA = 0.9      # learning rate
GAMMA = 0.95     # discount factor
EPOCHS = 10_000  # number of games (episodes) the agent will play

# Exploration vs. exploitation parameters
MAX_EPSILON = 1     # maximum probability of exploration
MIN_EPSILON = 0.01  # minimum probability of exploration
DECAY_RATE = 0.001  # exponential decay rate of the exploration probability
epsilon = 1         # current exploration rate

In [15]:
def greedy_action_selection(state, q_table, epsilon):
    """Choose an action with an epsilon-greedy policy.

    Parameters
    ----------
    state : tuple of int
        Discretized state used to index ``q_table``.
    q_table : np.ndarray
        Q-values; ``q_table[state]`` holds one value per action.
    epsilon : float
        Exploration rate: a random action is taken when the uniform draw in
        [0, 1) is not greater than ``epsilon``.

    Returns
    -------
    The index of the best known action (exploit) or a random sample from the
    action space of the module-level ``env`` (explore).
    """
    # Random cutoff deciding between exploration and exploitation.
    # (Named `cutoff` so it does not look like the stdlib `random` module.)
    cutoff = np.random.random()

    if epsilon < cutoff:
        # Exploit: use the best Q(s, a) from the Q-table.
        possible_actions = q_table[state]
        action = np.argmax(possible_actions)
    else:
        # Explore: take a random action (fixed typo: `saple` -> `sample`).
        action = env.action_space.sample()
    return action

In [16]:
def compute_q_value(old_q_value, next_opt_q_value, reward):
    """Apply the Q-learning update rule to a single Q-value.

    Moves ``old_q_value`` towards ``reward + GAMMA * next_opt_q_value`` by a
    fraction ``ALPHA`` (both are module-level constants) and returns the result.
    """
    # Target: immediate reward plus the discounted best value of the next state.
    td_target = reward + GAMMA*next_opt_q_value
    # Error between the target and the current estimate.
    td_error = td_target - old_q_value
    return old_q_value + ALPHA*td_error

In [17]:
def epslon_reduction(epsilon, epoch):
    """Return the exploration rate for the given epoch.

    The rate decays exponentially from ``MAX_EPSILON`` towards ``MIN_EPSILON``
    at speed ``DECAY_RATE`` (module-level constants). The ``epsilon`` argument
    is accepted but not used: the result depends only on ``epoch``.
    """
    exploration_span = MAX_EPSILON - MIN_EPSILON
    decay_factor = np.exp(-DECAY_RATE*epoch)
    return MIN_EPSILON + exploration_span*decay_factor

In [None]:
# Train the agent with tabular Q-learning.
rewards = []  # total reward collected in each episode


for episode in range(EPOCHS):
    initial_state = env.reset()[0]
    discretized_initial_state = discetize_observation(initial_state, BINS)

    terminate = False
    truncate = False
    total_rewards = 0

    # An episode ends when the environment reports termination OR truncation
    # (time limit); stepping past either without a reset is not supported.
    while not (terminate or truncate):
        # Choose an action
        action = greedy_action_selection(discretized_initial_state, q_table, epsilon)
        # Perform the action
        next_state, reward, terminate, truncate, info = env.step(action)
        # Discretize the next state
        discretized_next_state = discetize_observation(next_state, BINS)

        # Retrieve the old Q-value
        old_q_value = q_table[discretized_initial_state + (action,)]

        # Best Q-value reachable from the next state. A terminal state has no
        # future reward, so nothing is bootstrapped from it; a truncated
        # episode could have continued, so its next state is still used.
        if terminate:
            next_opt_q_value = 0.0
        else:
            next_opt_q_value = np.max(q_table[discretized_next_state])

        # Compute the new Q-value
        new_q_value = compute_q_value(old_q_value, next_opt_q_value, reward)

        # Update the Q-table
        q_table[discretized_initial_state + (action,)] = new_q_value
        # The next state becomes the current state
        discretized_initial_state = discretized_next_state

        total_rewards += reward

    # Reduce epsilon (epochs are counted from 1)
    epoch = episode + 1
    epsilon = epslon_reduction(epsilon, epoch)
    rewards.append(total_rewards)

env.close()

In [None]:
# Let the trained agent play one game, always taking the best action in the Q-table.

# The training cell closes `env`, so open a fresh rendering environment here
# instead of reusing the closed one.
play_env = gym.make('CartPole-v1', render_mode='human')
try:
    state = play_env.reset()[0]

    for i in range(100):  # Maximum number of steps the agent is allowed to play
        # No explicit render() call: in "human" mode the environment is
        # rendered as part of reset()/step().
        initial_disc_state = discetize_observation(state, BINS)
        action = np.argmax(q_table[initial_disc_state])
        state, reward, terminate, truncate, info = play_env.step(action)

        # Stop as soon as the episode is over, for either reason.
        if terminate or truncate:
            break
finally:
    # Always release the rendering window, even if a step raises.
    play_env.close()