In [1]:
#https://youtube.com/watch?v=qhRNvCVVJaA
#https://www.youtube.com/watch?v=mo96Nqlo1L8
#https://www.youtube.com/watch?v=HGeI30uATws
#https://www.datamachinist.com/reinforcement-learning/part-6-q-learning-for-continuous-state-problems/


In [3]:

import gym 
import math
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer


class QLearnCartPoleSolver():
    """Tabular Q-learning agent for a gym CartPole environment.

    Only the last two components of the 4-element observation (named ``angle``
    and ``angle_velocity`` in ``discretize_state``) are used. They are binned
    into ``buckets`` uniform intervals and the resulting pair of bin indices
    addresses a Q-table of shape ``buckets + (n_actions,)``.

    Attributes set by this class:
        Q_Values: the Q-table, initialised to zeros.
        epsilon / learning_rate: current exploration rate and step size;
            initialised to their episode-0 values and refreshed every
            training episode.
        scores: list of total rewards per episode from the latest ``train()``.
    """

    def __init__(self, env, buckets=(6, 12), episodes=100, epsilon_decay_rate=0.1,
                 decay=24, max_steps=100, batch_size=64, min_lr=0.1, discount=1.0,
                 min_epsilon=0.1):
        """Store hyper-parameters and allocate the Q-table.

        Args:
            env: environment exposing ``action_space.n``, ``observation_space``
                ``low``/``high`` arrays, ``reset``, ``step``, ``render``, ``close``.
            buckets: number of bins for (angle, angle_velocity).
            episodes: number of training episodes run by ``train()``.
            epsilon_decay_rate: stored only; not used by any method of this class.
            decay: divisor in the logarithmic epsilon / learning-rate schedule.
            max_steps: stored only; not used by any method of this class.
            batch_size: stored only; not used by any method of this class.
            min_lr: lower bound of the learning rate.
            discount: discount factor applied to the bootstrapped next-state value.
            min_epsilon: lower bound of the exploration rate.
        """
        self.env = env
        self.action_size = self.env.action_space.n
        self.discount = discount
        self.buckets = buckets
        self.min_lr = min_lr
        self.min_epsilon = min_epsilon
        self.episodes = episodes
        self.decay = decay
        self.epsilon_decay_rate = epsilon_decay_rate
        self.max_steps = max_steps
        self.batch_size = batch_size
        self.Q_Values = np.zeros(self.buckets + (self.action_size,))
        # Angle limits come from the environment; the angular velocity is
        # clamped to +/- 50 degrees (in radians) for binning purposes.
        self.upper_bounds = [
            self.env.observation_space.high[2], math.radians(50)]
        self.lower_bounds = [
            self.env.observation_space.low[2], -math.radians(50)]
        # Give the schedule-driven attributes a value up front so action()
        # and updated_q_value() are usable before train() has been called.
        self.epsilon = self.get_epsilon(0)
        self.learning_rate = self.get_learning_rate(0)
        self.scores = []

    def _log_decay(self, t, floor):
        """Shared schedule: ``1 - log10((t + 1) / decay)`` clamped to ``[floor, 1]``."""
        return max(floor, min(1., 1. - math.log10((t + 1) / self.decay)))

    def get_epsilon(self, t):
        """Return the exploration rate for episode index ``t`` (never below ``min_epsilon``)."""
        return self._log_decay(t, self.min_epsilon)

    def get_learning_rate(self, t):
        """Return the learning rate for episode index ``t`` (never below ``min_lr``)."""
        return self._log_decay(t, self.min_lr)

    def action(self, state):
        """Epsilon-greedy choice: a random action with probability ``epsilon``, else the greedy one."""
        return self.env.action_space.sample() if np.random.random() <= self.epsilon else np.argmax(self.Q_Values[state])

    def updated_q_value(self, state, action, reward, new_state):
        """Return the increment to add to ``Q_Values[state][action]``.

        Despite the name this is the learning-rate-scaled temporal-difference
        error, not the new Q-value itself; the caller adds it in place.
        """
        return (self.learning_rate * (reward + self.discount * np.max(self.Q_Values[new_state]) - self.Q_Values[state][action]))

    def _get_discretizer(self):
        """Return the fitted KBinsDiscretizer, building it on first use.

        The bounds and bucket counts do not change after construction, so the
        estimator is fitted once and cached instead of being refitted on every
        environment step.
        """
        est = getattr(self, "_discretizer", None)
        if est is None:
            est = KBinsDiscretizer(n_bins=self.buckets,
                                   encode='ordinal', strategy='uniform')
            est.fit([self.lower_bounds, self.upper_bounds])
            self._discretizer = est
        return est

    def discretize_state(self, state):
        """Map a 4-element observation to a tuple of (angle_bin, angle_velocity_bin) ints."""
        _, _, angle, angle_velocity = state
        est = self._get_discretizer()
        return tuple(map(int, est.transform([[angle, angle_velocity]])[0]))

    def _reset_env(self):
        """Reset the environment and return only the observation.

        Accepts both reset conventions: a bare observation, or an
        ``(observation, info_dict)`` pair.
        """
        result = self.env.reset()
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
            return result[0]
        return result

    def _step_env(self, action):
        """Step the environment and return ``(observation, reward, done)``.

        Accepts both the 4-tuple ``(obs, reward, done, info)`` and the 5-tuple
        ``(obs, reward, terminated, truncated, info)`` step conventions.
        """
        result = self.env.step(action)
        if len(result) == 5:
            observation, reward, terminated, truncated, _ = result
            return observation, reward, bool(terminated or truncated)
        observation, reward, done, _ = result
        return observation, reward, done

    def train(self):
        """Run ``episodes`` episodes of Q-learning, printing each episode's score.

        Epsilon and the learning rate are recomputed from the episode index at
        the start of every episode. The per-episode scores are kept in
        ``self.scores`` once training finishes.
        """
        scores = []
        for episode in range(self.episodes):
            self.learning_rate = self.get_learning_rate(episode)
            self.epsilon = self.get_epsilon(episode)
            state = self.discretize_state(self._reset_env())
            done = False
            reward_current_ep = 0
            while not done:
                action = self.action(state)
                observation, reward, done = self._step_env(action)
                next_state = self.discretize_state(observation)
                self.Q_Values[state][action] += self.updated_q_value(state, action, reward, next_state)
                state = next_state
                reward_current_ep += reward
            scores.append(reward_current_ep)
            print(f"{scores[episode]}  score for ep {episode+1}")
        print('Finished training!')
        self.scores = scores

    def run(self):
        """Play one rendered episode with the greedy policy and print its score.

        Actions are always the argmax of the Q-table: evaluation must not
        explore, and must not depend on ``epsilon`` having been set by
        ``train()``. The environment is closed even if the episode raises.
        """
        done = False
        current_state = self.discretize_state(self._reset_env())
        score = 0
        try:
            while not done:
                self.env.render()
                action = int(np.argmax(self.Q_Values[current_state]))
                observation, reward, done = self._step_env(action)
                current_state = self.discretize_state(observation)
                score += reward
            print(f"score {score}")
        finally:
            self.env.close()

# Build the CartPole-v0 environment, wrap it in the Q-learning solver and
# train for 200 episodes (train() prints one score line per episode).
env = gym.make("CartPole-v0")
model = QLearnCartPoleSolver(env=env, episodes=200)
model.train()

[-0.01020568 -0.01720916 -0.02911566 -0.00277115]
(2, 6)
14.0  score for ep 1
[ 0.01630077  0.02926186  0.00213988 -0.03506989]
(3, 5)
25.0  score for ep 2
[-0.01410539  0.03318221 -0.04330508 -0.01566053]
(2, 5)
30.0  score for ep 3
[-0.01424017  0.03050586  0.03095977  0.03401666]
(3, 6)
15.0  score for ep 4
[-0.02286044 -0.0320023   0.01693328 -0.0462225 ]
(2, 5)
57.0  score for ep 5
[ 0.04574633 -0.00859948 -0.01423842 -0.03772189]
(3, 5)
91.0  score for ep 6
[-0.04518612  0.01934939 -0.03375217  0.0233156 ]
(3, 5)
14.0  score for ep 7
[ 0.04924644 -0.00618811  0.00912105  0.00464763]
(2, 6)
23.0  score for ep 8
[ 0.04062912 -0.04972913  0.04796371  0.00341724]
(3, 5)
14.0  score for ep 9
[ 0.00227795 -0.02198212  0.03001086  0.0203116 ]
(2, 6)
16.0  score for ep 10
[-0.01066782 -0.01173394 -0.04169104  0.02327296]
(2, 5)
18.0  score for ep 11
[-0.00212682  0.0404127   0.03814228 -0.02790559]
(3, 6)
25.0  score for ep 12
[-0.03047789 -0.01602721 -0.0036897   0.03104759]
(3, 5)
51.0

In [7]:
# Play one rendered episode with the trained model; run() prints the episode score.
model.run()

score 200.0
