In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Create class for the Q-table
class QLearningTable:
    """Tabular Q-learning agent whose Q-values live in a pandas DataFrame.

    Rows of ``q_table`` are state labels (added lazily the first time a state
    is seen) and columns are the entries of ``actions``. Every new row starts
    with all Q-values equal to 0.0.
    """

    def __init__(self, actions, learning_rate: float = 0.9,
                 reward_decay: float = 0.9, e_greedy: float = 0.9):
        """Store the hyper-parameters and create an empty Q-table.

        actions:
            Sequence of action labels; used as the columns of the Q-table.
        learning_rate:
            Step size applied to the temporal-difference error.
        reward_decay:
            Discount factor (gamma) applied to the next state's best Q-value.
        e_greedy:
            Probability of picking the greedy action in ``choose_action``;
            the remaining probability is spent on a uniformly random action.
        """
        # List of actions
        self.actions = actions
        # Learning rate
        self.lr = learning_rate
        # Value of gamma
        self.gamma = reward_decay
        # Value of epsilon
        self.epsilon = e_greedy

        # Empty Q-table: one float column per action, no states yet
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    # Function for choosing the action for the agent
    def choose_action(self, observation):
        """Return an action for ``observation`` using an epsilon-greedy rule.

        With probability ``self.epsilon`` the action with the highest Q-value
        is returned (ties are broken at random); otherwise a uniformly random
        action from ``self.actions`` is returned. The state is added to the
        Q-table first if it has not been seen before.
        """
        # Make sure the state has a row in the table
        self.check_state_exist(observation)

        if np.random.uniform() < self.epsilon:
            # Exploit: pick the best known action for this state
            state_action = self.q_table.loc[observation, :]
            # Several actions may share the best value and idxmax returns the
            # first one it meets, so shuffle the order to break ties randomly
            state_action = state_action.reindex(
                np.random.permutation(state_action.index)
            )
            action = state_action.idxmax()
        else:
            # Explore: pick a random action
            action = np.random.choice(self.actions)
        return action

    # Add new states to the Q-table
    def check_state_exist(self, state):
        """Add a zero-initialised row for ``state`` if the table lacks one."""
        if state not in self.q_table.index:
            # Setting with enlargement via .loc; DataFrame.append was removed
            # in pandas 2.0. Floats keep the table's float64 dtype.
            self.q_table.loc[state] = [0.0] * len(self.actions)

    # Function for updating the Q-function value
    def update_Q_value(self, current_state, action, reward, next_state):
        """Apply one Q-learning update and return the new Q-value.

        current_state:
            The state in which the action was performed.
        action:
            The action performed in ``current_state``.
        reward:
            The reward received for performing ``action`` in ``current_state``.
        next_state:
            The state reached after the action; the string ``"terminal"``
            marks the end of an episode.

        Returns the updated value of ``Q(current_state, action)``.
        """
        # Both states need a row: current_state may not have gone through
        # choose_action, and next_state may be visited for the first time
        self.check_state_exist(current_state)
        self.check_state_exist(next_state)

        # Q-value of the current state/action pair before the update
        q_current = self.q_table.loc[current_state, action]

        # Check whether the next state ends the episode
        if next_state != "terminal":
            # TD target: the greedy (max) Q-value of the next state is used
            q_target = reward + self.gamma * self.q_table.loc[next_state, :].max()
        else:
            # No future return after a terminal state
            q_target = reward

        # Move the current Q-value towards the target
        self.q_table.loc[current_state, action] += self.lr * (q_target - q_current)

        return self.q_table.loc[current_state, action]