In [None]:
# Install gym
!pip install gym

In [None]:
# use the library 'gym' for this environment
import gym
import numpy as np

SEED = 42  # fixed seed so a fresh "Restart & Run All" reproduces the same run
np.random.seed(SEED)  # drives Q-table initialisation and the epsilon-greedy draws

env = gym.make('CartPole-v1')
env.action_space.seed(SEED)  # drives env.action_space.sample() during exploration
print(env.action_space)  # a bare expression in the middle of a cell is never displayed
env.reset(seed=SEED)  # last expression of the cell: displays the reset result

In [None]:
# Q table implementation: problem dimensions used to shape the table
# (number of state variables, number of actions)
state_space, action_space = 4, 2

def Qtable(state_space, action_space, bin_size=30):
    """Create a randomly initialised Q table and the bin edges used to index it.

    Args:
        state_space: number of state dimensions of the table.
        action_space: number of actions (size of the last table axis).
        bin_size: number of bin edges per state variable, and also the size
            of each state axis of the table.

    Returns:
        (q_table, bins): ``q_table`` has shape
        ``(bin_size,) * state_space + (action_space,)`` with entries drawn
        uniformly from [-1, 1); ``bins`` is a list of four edge arrays.

    Note:
        Four bin arrays are always built, regardless of ``state_space``.
    """
    # Symmetric (+/-) limit of each observation component.
    limits = (
        4.8,    # cart position --> limits are taken from env page
        4,      # cart velocity
        0.418,  # pole angle (radians)
        4,      # pole angular velocity
    )
    bins = [np.linspace(-limit, limit, bin_size) for limit in limits]

    table_shape = [bin_size] * state_space + [action_space]
    q_table = np.random.uniform(low=-1, high=1, size=table_shape)
    return q_table, bins

def Discrete(state, bins, is_env_reset=False):
    """Discretizes the given continuous input state observation.

    Args:
        state: sequence of continuous observation values, or, when
            ``is_env_reset`` is true, a sequence whose first element is that
            observation.
        bins: one array of increasing bin edges per observation component.
        is_env_reset: set to True when ``state`` is the raw reset result, so
            that only ``state[0]`` is discretized.

    Returns:
        Tuple of non-negative integer bin indices, one per component, each in
        ``[0, len(bins[i]) - 1]``.
    """
    observation = state[0] if is_env_reset else state
    index = []
    for i, value in enumerate(observation):
        edges = bins[i]
        raw = int(np.digitize(value, edges)) - 1
        # np.digitize returns 0 for values below the first edge, so ``raw``
        # would be -1 there. NumPy wraps -1 to the LAST cell, which would make
        # "far below range" share a cell with "far above range". Clamp instead.
        index.append(min(max(raw, 0), len(edges) - 1))
    return tuple(index)

In [None]:
# Q learning function takes the Q table as input
def Q_learning(q_table, bins, episodes=5000, gamma=0.95, lr=0.1, timestep=100, epsilon=0.2):
    """Train ``q_table`` in place with tabular, epsilon-greedy Q-learning.

    Uses the module-level ``env`` and the ``Discrete`` helper to map
    observations onto Q-table indices.

    Args:
        q_table: array indexed by a discretized state tuple plus an action
            index; updated in place.
        bins: per-component bin edges forwarded to ``Discrete``.
        episodes: maximum number of episodes to run.
        gamma: discount factor applied to the best next-state value.
        lr: learning rate of the Q update.
        timestep: print the running average score every ``timestep`` episodes.
        epsilon: probability of taking a random action instead of the greedy one.

    Returns:
        List with the total reward of every episode played. Training stops
        early once the mean of the last 25 episode scores reaches 150.
    """
    steps = 0  # total environment steps taken across all episodes
    curr_score_history = []  # Tracks episode level scores
    for episode_i in range(1, episodes + 1):
        current_state = Discrete(env.reset(), bins, True)
        current_score = 0
        done = False
        while not done:
            if np.random.uniform(0, 1) < epsilon:  # epsilon-greedy approach
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[current_state])

            # env.step returns (obs, reward, terminated, truncated, info).
            # The episode ends on either flag; ignoring ``truncated`` would
            # let an episode run past the environment's time limit.
            observation, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            steps += 1
            next_state = Discrete(observation, bins)
            current_score += reward  # accumulate current_score until finish

            # A true terminal state has no future value, so do not bootstrap
            # from it. A time-limit truncation is not a real terminal state,
            # so bootstrapping stays on in that case.
            max_future_q = 0.0 if terminated else np.max(q_table[next_state])
            current_q = q_table[current_state + (action,)]
            new_q = (1 - lr) * current_q + lr * (reward + gamma * max_future_q)
            q_table[current_state + (action,)] = new_q

            current_state = next_state

        curr_score_history.append(current_score)

        avg_score = np.mean(curr_score_history[-25:])  # average score of last 25 episodes
        if episode_i % timestep == 0:
            print('Average score after {} episodes:- {}'.format(episode_i, avg_score))
        if avg_score >= 150:  # success score is subjective
            print('Problem solved in episode {} with steps {}'.format(episode_i, steps))
            return curr_score_history
    return curr_score_history

In [None]:
# Q-learning: build a fresh Q table from the problem dimensions defined
# earlier (instead of repeating the literals 4 and 2) and train it.
q_table, bins = Qtable(state_space, action_space)
score_history = Q_learning(q_table, bins, episodes=2000, gamma=0.995, lr=0.15)
env.close()

In [None]:
import matplotlib.pyplot as plt

# Learning curve: total reward of each episode, in training order
fig, ax = plt.subplots()
ax.plot(score_history)
ax.set_xlabel('No of episodes')
ax.set_ylabel('Reward')
plt.show()