In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym
# from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

In [2]:
# Roll out CartPole with a uniformly random policy for 200 steps, keeping one
# rendered RGB frame per step in `frames`.
frames = []
env = gym.make('CartPole-v0')
try:
    observation = env.reset()

    for step in range(0, 200):
        frames.append(env.render(mode='rgb_array'))
        action = np.random.choice(2)  # pick one of the two actions at random
        observation, reward, done, info = env.step(action)
        if done:
            # gym leaves step() undefined once an episode has finished, so
            # start a fresh episode instead of stepping the terminated one.
            observation = env.reset()
finally:
    # Release the environment (and its render window) even if a step fails.
    env.close()



In [3]:
# Environment id handed to gym.make below.
ENV = 'CartPole-v0'
# Number of bins each observation component is split into by digitize_state.
NUM_DIZITIZED=6
env = gym.make(ENV)
# Reset the environment and keep the observation it returns.
observation = env.reset()

In [4]:
def bins(clip_min, clip_max, num):
    """Return the interior edges that split [clip_min, clip_max] into `num` bins.

    Args:
        clip_min: lower end of the range.
        clip_max: upper end of the range.
        num: number of equal-width bins.

    Returns:
        A NumPy array of ``num - 1`` edges; the two outer endpoints are dropped.
    """
    all_edges = np.linspace(clip_min, clip_max, num + 1)
    interior_edges = all_edges[1:-1]
    return interior_edges

In [5]:
def digitize_state(observation):
    """Map a 4-component observation to a single integer state index.

    Each component is binned into NUM_DIZITIZED levels over a fixed range, and
    the four levels are combined as digits of a base-NUM_DIZITIZED number
    (cart position is the least significant digit).

    Args:
        observation: sequence of exactly four values, unpacked as
            (cart_pos, cart_v, pole_angle, pole_v).

    Returns:
        The combined state index.
    """
    cart_pos, cart_v, pole_angle, pole_v = observation
    # (value, lower bound, upper bound) for each component, in digit order.
    components = (
        (cart_pos, -2.4, 2.4),
        (cart_v, -3.0, 3.0),
        (pole_angle, -0.5, 0.5),
        (pole_v, -2.0, 2.0),
    )
    state_index = 0
    for digit, (value, low, high) in enumerate(components):
        level = np.digitize(value, bins=bins(low, high, NUM_DIZITIZED))
        state_index += level * (NUM_DIZITIZED ** digit)
    return state_index

In [6]:
# 3.4 Q-learning

In [7]:
# Environment id handed to gym.make.
ENV = 'CartPole-v0'
# Number of bins per observation component when discretising the state.
NUM_DIZITIZED = 6
# Discount factor applied to the best next-state Q-value in the Q-table update.
GAMMA=0.99
# Learning rate (step size) of the Q-table update.
ETA=0.5
# Upper bound on the number of steps simulated per episode.
MAX_STEPS = 200
# Upper bound on the number of training episodes.
NUM_EPISODES=1000

In [8]:
class Agent:
    """Thin facade over a Brain: forwards learning updates and action queries."""

    def __init__(self, num_states, num_actions):
        """Create the underlying Brain.

        Args:
            num_states: number of observation components.
            num_actions: number of discrete actions.
        """
        self.brain = Brain(num_states, num_actions)

    def update_Q_function(self, observation, action, reward, observation_next):
        """Pass one transition to the brain so it can update its Q-table."""
        brain = self.brain
        brain.update_Q_table(observation, action, reward, observation_next)

    def get_action(self, observation, step):
        """Return the brain's chosen action for `observation`.

        `step` is forwarded unchanged as the second argument of
        ``Brain.decide_action``.
        """
        return self.brain.decide_action(observation, step)
    
    
class Brain:
    """Tabular Q-learning core: owns the Q-table and the epsilon-greedy policy."""

    def __init__(self, num_states, num_actions):
        """Allocate a Q-table with uniform random values in [0, 1).

        Args:
            num_states: number of observation components; the table gets
                ``NUM_DIZITIZED ** num_states`` rows.
            num_actions: number of discrete actions (table columns).
        """
        self.num_actions = num_actions
        table_shape = (NUM_DIZITIZED**num_states, num_actions)
        self.q_table = np.random.uniform(low=0, high=1, size=table_shape)

    def bins(self, clip_min, clip_max, num):
        """Return the ``num - 1`` interior edges splitting the range into `num` bins."""
        all_edges = np.linspace(clip_min, clip_max, num + 1)
        return all_edges[1:-1]

    def digitize_state(self, observation):
        """Convert a 4-component observation into a single Q-table row index.

        Each component is binned into NUM_DIZITIZED levels and the levels are
        combined as digits of a base-NUM_DIZITIZED number, with cart position
        as the least significant digit.
        """
        cart_pos, cart_v, pole_angle, pole_v = observation
        # (value, lower bound, upper bound) for each component, in digit order.
        components = (
            (cart_pos, -2.4, 2.4),
            (cart_v, -3.0, 3.0),
            (pole_angle, -0.5, 0.5),
            (pole_v, -2.0, 2.0),
        )
        row = 0
        for digit, (value, low, high) in enumerate(components):
            level = np.digitize(value, bins=self.bins(low, high, NUM_DIZITIZED))
            row += level * (NUM_DIZITIZED**digit)
        return row

    def update_Q_table(self, observation, action, reward, observation_next):
        """Apply one Q-learning update for the given transition.

        Moves ``Q[state, action]`` toward ``reward + GAMMA * max_a Q[next, a]``
        by a fraction ETA of the difference.
        """
        state = self.digitize_state(observation)
        state_next = self.digitize_state(observation_next)
        best_next_q = max(self.q_table[state_next])
        current_q = self.q_table[state, action]
        td_error = reward + GAMMA * best_next_q - current_q
        self.q_table[state, action] = current_q + ETA * td_error

    def decide_action(self, observation, episode):
        """Pick an action epsilon-greedily.

        The exploration rate is ``0.5 / (episode + 1)``, so it shrinks as the
        episode index grows.
        """
        state = self.digitize_state(observation)
        epsilon = 0.5 * (1 / (episode + 1))

        # Explore with probability epsilon, otherwise take the greedy action.
        if np.random.uniform(0, 1) < epsilon:
            return np.random.choice(self.num_actions)
        return np.argmax(self.q_table[state])

In [9]:
class Environment:
    """Runs the Q-learning training loop on the gym environment named by ENV.

    Attributes:
        env: the environment created with ``gym.make(ENV)``.
        agent: the Agent that chooses actions and learns from transitions.
        frames: RGB frames rendered during the final demonstration episode of
            the most recent ``run()`` call; empty if that episode never ran.
    """

    def __init__(self):
        self.env = gym.make(ENV)
        num_states = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.n
        self.agent = Agent(num_states, num_actions)
        self.frames = []

    def run(self):
        """Train for up to NUM_EPISODES episodes, then record one final episode.

        Rewards are shaped: -1 when an episode ends before step index 195,
        +1 when it ends at or after it, and 0 on every non-terminal step.
        After 10 consecutive successful episodes, one more episode is run with
        every step rendered into ``self.frames``, and the loop stops.

        Returns None so the frames are not echoed as notebook cell output;
        read them from ``self.frames``. The environment is closed when this
        method exits, including when an exception is raised.
        """
        complete_episodes = 0  # current streak of consecutive successes
        is_episode_final = False
        self.frames = []
        try:
            for episode in range(NUM_EPISODES):
                observation = self.env.reset()
                for step in range(MAX_STEPS):
                    if is_episode_final:
                        # Only the demonstration episode is rendered.
                        self.frames.append(self.env.render(mode='rgb_array'))
                    action = self.agent.get_action(observation, episode)
                    # The environment's own reward is ignored; see shaping below.
                    observation_next, _, done, _ = self.env.step(action)
                    if done:
                        if step < 195:
                            # Ended early: penalise and reset the streak.
                            reward = -1
                            complete_episodes = 0
                        else:
                            reward = 1
                            complete_episodes += 1
                    else:
                        reward = 0
                    self.agent.update_Q_function(
                        observation, action, reward, observation_next)
                    observation = observation_next

                    if done:
                        print(f'{episode} episode: finished after {step+1} time steps')
                        break

                if is_episode_final:
                    break

                if complete_episodes >= 10:
                    print('10回成功')
                    is_episode_final = True
        finally:
            # Always release the environment, even if an episode raised.
            self.env.close()

In [10]:
# Build the environment/agent pair and run training; run() prints one line
# per finished episode.
cartpole_env = Environment()
cartpole_env.run()

0 episode: finished after 18 time steps
1 episode: finished after 13 time steps
2 episode: finished after 17 time steps
3 episode: finished after 14 time steps
4 episode: finished after 8 time steps
5 episode: finished after 13 time steps
6 episode: finished after 14 time steps
7 episode: finished after 24 time steps
8 episode: finished after 14 time steps
9 episode: finished after 35 time steps
10 episode: finished after 28 time steps
11 episode: finished after 16 time steps
12 episode: finished after 27 time steps
13 episode: finished after 36 time steps
14 episode: finished after 47 time steps
15 episode: finished after 81 time steps
16 episode: finished after 70 time steps
17 episode: finished after 80 time steps
18 episode: finished after 26 time steps
19 episode: finished after 39 time steps
20 episode: finished after 32 time steps
21 episode: finished after 30 time steps
22 episode: finished after 106 time steps
23 episode: finished after 60 time steps
24 episode: finished after