# CartPole Policy

In [1]:
import gym
import numpy as np
import math
from collections import deque

In [12]:
class QLearning():
    """Tabular Q-learning agent for the gym ``CartPole-v1`` environment.

    Observations are mapped onto a grid of ``buckets`` so that action values
    can be stored in a dense NumPy table indexed by
    ``(bucket_0, ..., bucket_3, action)``.
    """

    def __init__(self, buckets,numberOfEpisodes, desiredScore ,learningRate, epsilon, gamma, div, envSteps=None, quiet=False, monitor=False):
        """Create the environment and an all-zero Q-table.

        Args:
            buckets: Tuple with the number of discrete buckets for each
                observation component.
            numberOfEpisodes: Maximum number of training episodes.
            desiredScore: Mean score over the last 100 episodes at which
                training is considered solved.
            learningRate: Lower bound of the decaying learning rate.
            epsilon: Lower bound of the decaying exploration rate.
            gamma: Discount factor applied to the next state's best value.
            div: Divisor controlling how fast both rates decay per episode.
            envSteps: Optional override for the environment's
                ``_max_episode_steps``.
            quiet: When true, suppress all progress output.
            monitor: Accepted for interface compatibility; currently unused.
        """
        self.buckets = buckets
        self.numberOfEpisodes = numberOfEpisodes
        self.desiredScore = desiredScore
        self.learningRate = learningRate
        self.epsilon = epsilon
        self.gamma = gamma
        self.div = div
        self.quiet = quiet

        self.env = gym.make('CartPole-v1')
        if envSteps is not None:
            self.env._max_episode_steps = envSteps
        # One table axis per observation component plus a final action axis.
        self.q = np.zeros(self.buckets + (self.env.action_space.n,))

    def discretize(self, obs):
        """Map a raw observation to a tuple of bucket indices.

        Each component is rescaled to [0, 1] between its lower and upper
        bound, scaled to ``buckets[i] - 1``, rounded, and clamped into the
        valid index range.
        """
        space = self.env.observation_space
        # Components 1 and 3 use hand-picked bounds instead of the
        # environment's own (presumably because those components are
        # effectively unbounded in CartPole -- TODO confirm).
        upper_bounds = [space.high[0], 0.5, space.high[2], math.radians(50)]
        lower_bounds = [space.low[0], -0.5, space.low[2], -math.radians(50)]

        state = []
        for value, low, high, n_buckets in zip(obs, lower_bounds, upper_bounds, self.buckets):
            # Shift by the lower bound itself so the ratio is also correct
            # for bounds that are not symmetric around zero.
            ratio = (value - low) / (high - low)
            index = int(round((n_buckets - 1) * ratio))
            state.append(min(n_buckets - 1, max(0, index)))
        return tuple(state)

    def action(self, state, epsilon):
        """Pick an action epsilon-greedily for the discretized ``state``."""
        if np.random.random() <= epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.q[state])

    def updateQTable(self, state_old, action, reward, state_new, alpha):
        """Apply one Q-learning update to ``q[state_old][action]`` in place."""
        target = reward + self.gamma * np.max(self.q[state_new])
        self.q[state_old][action] += alpha * (target - self.q[state_old][action])

    def calcEpsilon(self, t, div):
        """Return ``1 - (t + 1) / div`` clamped to ``[self.epsilon, 1]``."""
        return max(self.epsilon, min(1, 1.0 - ((t + 1) / div)))

    def calcLearningRate(self, t, div):
        """Return ``1 - (t + 1) / div`` clamped to ``[self.learningRate, 1]``."""
        return max(self.learningRate, min(1.0, 1.0 - ((t + 1) / div)))

    def runEpisode(self):
        """Train for up to ``self.numberOfEpisodes`` episodes.

        Training stops early once at least 100 episodes have been played and
        the mean score of the last 100 episodes reaches
        ``self.desiredScore``.

        Returns:
            ``i - 100`` where ``i`` is the zero-based index of the episode at
            which the goal was reached, or the index of the last episode
            (``numberOfEpisodes - 1``) when the goal was never reached. The
            value is returned regardless of ``quiet``.
        """
        scores = deque(maxlen=100)
        last_episode = self.numberOfEpisodes - 1
        try:
            # Use the instance configuration rather than same-named globals.
            for i in range(self.numberOfEpisodes):
                s = self.discretize(self.env.reset())
                lr = self.calcLearningRate(i, self.div)
                ep = self.calcEpsilon(i, self.div)
                done = False
                # Start at 0 so the score equals the number of steps taken.
                steps = 0
                while not done:
                    self.env.render()
                    a = self.action(s, ep)
                    obs, r, done, info = self.env.step(a)

                    s1 = self.discretize(obs)
                    self.updateQTable(s, a, r, s1, lr)
                    s = s1
                    steps += 1
                scores.append(steps)
                meanScore = np.mean(scores)
                if meanScore >= self.desiredScore and i >= 100:
                    if not self.quiet:
                        print('Completed {} episodes. Solved after {}'.format(i, i - 100), " score :", meanScore)
                    # Return outside the quiet check so quiet runs also stop.
                    return i - 100

                if i % 100 == 0 and not self.quiet:
                    print('[Episode {}] - Average survival time over last 100 episodes was {}.'.format(i, meanScore))

            if not self.quiet:
                print('Did not solve after {} episodes :'.format(last_episode))
            return last_episode
        finally:
            # Always release the environment, including on early return.
            self.env.close()

In [13]:
# Discretization grid: number of buckets per observation component.
buckets = (1, 1, 6, 12)

# Training budget and the mean score that counts as solved.
numberOfEpisodes = 1000
desiredScore = 200

# Lower bounds for the decaying learning and exploration rates, and the
# divisor that controls how quickly both decay.
learningRate = 0.1
epsilon = 0.1
div = 25

# Discount factor used in the Q-table update.
gamma = 1.0

solver = QLearning(
    buckets=buckets,
    numberOfEpisodes=numberOfEpisodes,
    desiredScore=desiredScore,
    learningRate=learningRate,
    epsilon=epsilon,
    gamma=gamma,
    div=div,
)
solver.runEpisode()

[Episode 0] - Average survival time over last 100 episodes was 14.0.
[Episode 100] - Average survival time over last 100 episodes was 72.5.
[Episode 200] - Average survival time over last 100 episodes was 173.18.
[Episode 300] - Average survival time over last 100 episodes was 175.12.
('Completed 356 episodes. Solved after 256', ' score :', 201.0)


256