# CartPole Reinforcement Learning


In [None]:
import gymnasium as gym
from gym import logger as gymlogger
from gym.wrappers import RecordVideo, RecordEpisodeStatistics

import numpy as np
from tqdm import tqdm

import moviepy.editor

In [5]:
from collections import defaultdict
class CartPoleBot:
  '''
  Tabular Q-learning agent for a CartPole-style environment.

  Observations are discretised with discConv and used as keys into a
  Q-table that maps each discrete state to one value per action. Actions
  are chosen epsilon-greedily, and epsilon is lowered linearly towards
  finalEpsilon by calling decayEpsilon.
  '''

  # Annotations that mention gym are quoted so that they are not evaluated
  # when the class is defined.
  env: "gym.Env"
  learningRate: float
  discountFactor: float

  def __init__(self, env: "gym.Env", learningRate: float,
               initalEpsilon: float, epsilonDecay: float, finalEpsilon: float,
               discountFactor: float):
    '''
    env: environment the agent acts in; only env.action_space is used here.
    learningRate: step size applied to every Q-table update.
    initalEpsilon: starting probability of taking a random action.
    epsilonDecay: amount subtracted from epsilon by each decayEpsilon call.
    finalEpsilon: lower bound that epsilon never drops below.
    discountFactor: weight given to the best future value in update.
    '''
    self.env = env # cartpole environment.

    self.learningRate = learningRate # rate at which we update values in the Q-table

    self.epsilon = initalEpsilon
    self.epsilonDecay = epsilonDecay
    self.finalEpsilon = finalEpsilon

    # Q-table: unseen states start with a zero value for every action.
    self.qTable = defaultdict(lambda: np.zeros(self.env.action_space.n))

    self.discountFactor = discountFactor

  def discConv(self, obs):
    '''
    takes a numpy array representing the simulation state and returns a
    hashable tuple with values "rounded" to the closest chunk

    Each component of obs is replaced by the index np.digitize gives it
    against that component's 10 evenly spaced bin edges, so the result can
    be used as a Q-table key.
    '''
    #DO NOT CHANGE.
    posSpace = np.linspace(-2.4, 2.4, 10)
    velSpace = np.linspace(-4, 4, 10)
    angSpace = np.linspace(-.2095, .2095, 10)
    angVSpace = np.linspace(-4, 4, 10)
    lTodArray = [posSpace, velSpace, angSpace, angVSpace]
    tR = []
    for i in range(len(obs)):
      tR += [np.digitize(obs[i], lTodArray[i])]

    return(tuple(tR))

  def getAction(self, observation):
    '''
    Epsilon-greedy action selection.

    Draws a uniform random number in [0, 1). With probability epsilon the
    action is sampled at random from the action space (exploration);
    otherwise the action with the highest Q-value for the discretised
    observation is returned (exploitation).
    '''
    state = self.discConv(observation)

    # Explore when the draw falls below epsilon, so a large epsilon means
    # mostly random actions and a small epsilon means mostly greedy ones.
    if np.random.rand() < self.epsilon:
      return self.env.action_space.sample()
    return int(np.argmax(self.qTable[state]))

  def update(self, pastObv, action, reward, terminated, currObv):
    '''
    adjusting our q values based on how good/bad the action was

    pastObv: State of the simulation before we took an action.
    action: action we took
    reward: the reward given to us by the environment
    terminated: whether the simulation ended or not because we failed
    currObv: state of the simulation after the action from getAction was taken.
    '''
    pastState = self.discConv(pastObv)
    currState = self.discConv(currObv)

    # A terminal state has no future, so its bootstrap value is zero.
    futureValue = 0 if terminated else np.max(self.qTable[currState])

    temporalDiff = (reward + self.discountFactor * futureValue
                    - self.qTable[pastState][action])

    self.qTable[pastState][action] += self.learningRate * temporalDiff

  def decayEpsilon(self):
    '''
    used to decay epsilon overtime

    Subtracts epsilonDecay from epsilon, never letting it drop below
    finalEpsilon.
    '''
    self.epsilon = max(self.finalEpsilon, self.epsilon - self.epsilonDecay)



In [None]:
# Wrap CartPole so that every 5000th episode is recorded as a video under /content.
# NOTE(review): `new_step_api` looks specific to the gym 0.25.x RecordVideo
# wrapper - confirm it is accepted by the installed gym version.
env = RecordVideo(
  gym.make("CartPole-v1", render_mode = "rgb_array"),
  "/content",
  episode_trigger= lambda x: (x%5000 == 0),
  new_step_api= True,
)

# "Example" hyperparameters - not perfect and could use fine-tuning in the future.
learningRate = 0.05
nEps = 60_000
startEpsilon = 1.0
# Decay step handed to the agent, sized so that a linear decay from
# startEpsilon would reach zero after half of the episodes (1/30000 here).
epsilonDecay = startEpsilon / (nEps / 2)
finalEpsilon = 0.1
discountFactor = 0.95

balanceAgent = CartPoleBot(env, learningRate, startEpsilon, epsilonDecay, finalEpsilon, discountFactor)

# try/finally so the environment (and its video recorder) is closed even if
# the long training run is interrupted or raises.
try:
  for i in tqdm(range(nEps)):
    observation, info = env.reset() # reset the environment at the start of every episode

    done = False
    while not done:
      action = balanceAgent.getAction(observation)
      newObv, reward, terminated, truncated, info = env.step(action)

      # NOTE(review): the return value is discarded and RecordVideo presumably
      # captures frames on its own - confirm whether this call is still needed.
      if i % 5000 == 0:
        env.render()

      # Learn from the transition; only `terminated` (not `truncated`) is
      # passed on to the agent's update.
      balanceAgent.update(observation, action, reward, terminated, newObv)

      done = terminated or truncated
      observation = newObv

    balanceAgent.decayEpsilon() # called once after every episode
finally:
  env.close()


In [None]:
# Display the most recent recording instead of a hard-coded episode number.
# The training loop runs episodes 0..nEps-1 and records only multiples of
# 5000, so (assuming the wrapper names files by episode index) a file for
# episode 60000 is never written.
from pathlib import Path

recordings = sorted(
  Path("/content").glob("rl-video-episode-*.mp4"),
  key=lambda p: int(p.stem.rsplit("-", 1)[-1]),  # order by episode number
)
if not recordings:
  raise FileNotFoundError("no rl-video-episode-*.mp4 recordings found in /content")
moviepy.editor.ipython_display(str(recordings[-1]))