In [2]:
# coding:utf-8
# [0]ライブラリのインポート
import gym  #倒立振子(cartpole)の実行環境
from gym import wrappers  #gymの画像保存
import numpy as np
import time


# [1]Q関数を離散化して定義する関数　------------
# 観測した状態を離散値にデジタル変換する
def bins(clip_min, clip_max, num):
    """Return the interior bin edges that split a range into ``num`` bins.

    The closed interval [clip_min, clip_max] is divided into ``num``
    equal-width bins; the two outer edges are dropped so the result can be
    passed straight to ``np.digitize`` (values outside the range then fall
    into the first or last bin).

    Args:
        clip_min: Lower end of the range.
        clip_max: Upper end of the range.
        num: Number of bins.

    Returns:
        A 1-D array of ``num - 1`` interior edges.
    """
    all_edges = np.linspace(clip_min, clip_max, num + 1)
    interior_edges = all_edges[1:-1]
    return interior_edges

# 各値を離散値に変換
def digitize_state(observation):
    """Map a 4-element continuous observation to a single discrete state index.

    Each component is binned into ``num_dizitized`` buckets over a fixed
    range, and the four bucket indices are combined as digits of a
    base-``num_dizitized`` number (first component is the least
    significant digit).

    Args:
        observation: Sequence of exactly four values
            (cart position, cart velocity, pole angle, pole velocity).

    Returns:
        Integer state index in ``[0, num_dizitized**4)``.
    """
    cart_pos, cart_v, pole_angle, pole_v = observation
    # (value, lower bound, upper bound) for every observed quantity.
    clipped_ranges = (
        (cart_pos, -2.4, 2.4),
        (cart_v, -3.0, 3.0),
        (pole_angle, -0.5, 0.5),
        (pole_v, -2.0, 2.0),
    )
    state_index = 0
    place_value = 1  # num_dizitized ** position
    for value, low, high in clipped_ranges:
        bucket = np.digitize(value, bins=bins(low, high, num_dizitized))
        state_index += bucket * place_value
        place_value *= num_dizitized
    return state_index


# [2]行動a(t)を求める関数 -------------------------------------
def get_action(next_state, episode):
    """Choose the next action with an epsilon-greedy policy.

    The exploration rate decays as ``0.5 / (episode + 1)``, so the policy
    gradually shifts towards always taking the greedy action.

    Args:
        next_state: Discrete state index used to look up ``q_table``.
        episode: Zero-based episode number (controls the decay).

    Returns:
        The selected action: a random pick from ``[0, 1]`` when exploring,
        otherwise the argmax of the Q-values for ``next_state``.
    """
    epsilon = 0.5 * (1 / (episode + 1))
    # Draw once; explore only when the draw falls strictly below epsilon.
    explore = np.random.uniform(0, 1) < epsilon
    if explore:
        return np.random.choice([0, 1])
    return np.argmax(q_table[next_state])


# [3]Qテーブルを更新する関数 -------------------------------------
def update_Qtable(q_table, state, action, reward, next_state,
                  gamma=0.99, alpha=0.5):
    """Apply one tabular Q-learning update and return the table.

    Implements
    ``Q[s, a] <- (1 - alpha) * Q[s, a] + alpha * (r + gamma * max_a' Q[s', a'])``.

    Args:
        q_table: 2-D NumPy array indexed as ``[state, action]``; it is
            modified in place.
        state: Index of the state the action was taken in.
        action: Index of the action that was taken.
        reward: Reward received for the transition.
        next_state: Index of the resulting state.
        gamma: Discount factor (defaults to the original 0.99).
        alpha: Learning rate (defaults to the original 0.5).

    Returns:
        The same ``q_table`` object, after the update.
    """
    # Take the max over the whole action row instead of hard-coding
    # columns 0 and 1, so tables with more than two actions work too.
    next_max_q = q_table[next_state].max()
    td_target = reward + gamma * next_max_q
    q_table[state, action] = (
        (1 - alpha) * q_table[state, action] + alpha * td_target
    )
    return q_table

# [4]. メイン関数開始 パラメータ設定--------------------------------------------------------
# Environment, hyper-parameters and bookkeeping state shared by the
# functions above and the training loop below.
env = gym.make('CartPole-v0')
max_number_of_steps = 200  # upper bound on steps per episode
num_consecutive_iterations = 100  # window size (in episodes) for the running mean reward
num_episodes = 2000  # total number of episodes to run
goal_average_reward = 195  # training counts as done once the windowed mean reaches this
# Discretize each of the 4 state variables into num_dizitized bins and
# build the Q table with num_dizitized**4 rows and one column per action.
num_dizitized = 6  # number of bins per state variable
q_table = np.random.uniform(
    low=-1, high=1, size=(num_dizitized**4, env.action_space.n))

total_reward_vec = np.zeros(num_consecutive_iterations)  # rewards of the most recent episodes
final_x = np.zeros((num_episodes, 1))  # cart x position at episode end, recorded only after learning
islearned = 0  # flag: set to 1 once the goal average reward has been reached
isrender = 0  # flag: set to 1 on first success (guards the optional video Monitor wrap)


# [5] メインルーチン--------------------------------------------------
# Training loop: run episodes, update the Q table after every step, and
# switch to rendering once the running mean reward reaches the goal.
# NOTE(review): assumes the classic Gym API where env.reset() returns only
# the observation and env.step() returns a 4-tuple -- confirm against the
# installed gym version.
for episode in range(num_episodes):  # repeat for the configured number of episodes
    # Reset the environment and pick the greedy action for the initial state
    observation = env.reset()
    state = digitize_state(observation)
    action = np.argmax(q_table[state])
    episode_reward = 0

    for t in range(max_number_of_steps):  # loop over the steps of one episode
        if islearned == 1:  # once learning has finished, render the cart-pole
            env.render()
            time.sleep(0.1)
            print (observation[0])  # print the cart's x position

        # Execute action a_t to obtain s_{t+1}, r_t, etc.
        observation, reward, done, info = env.step(action)

        # Override the environment reward with a shaped reward
        if done:
            if t < 195:
                reward = -200  # penalty for ending the episode early (pole fell)
            else:
                reward = 1  # no penalty when the episode ends at step index 195 or later
        else:
            reward = 1  # +1 for every step survived

        episode_reward += reward  # accumulate the episode reward

        # Compute the discrete state s_{t+1} and update the Q table
        # (the update runs on every step, including after islearned is set)
        next_state = digitize_state(observation)  # discretize the observation at t+1
        q_table = update_Qtable(q_table, state, action, reward, next_state)

        # Choose the next action a_{t+1}
        action = get_action(next_state, episode)    # a_{t+1}

        state = next_state

        # End-of-episode handling
        if done:
            # The mean printed here is taken before this episode's reward
            # is pushed into the window on the next statement.
            print('%d Episode finished after %f time steps / mean %f' %
                  (episode, t + 1, total_reward_vec.mean()))
            total_reward_vec = np.hstack((total_reward_vec[1:],
                                          episode_reward))  # drop the oldest reward, append the newest
            if islearned == 1:  # after learning, store the final x position
                final_x[episode, 0] = observation[0]
            break

    if (total_reward_vec.mean() >=
            goal_average_reward):  # success when the mean over the recent window reaches the goal
        print('Episode %d train agent successfuly!' % episode)
        islearned = 1
        #np.savetxt('learned_Q_table.csv',q_table, delimiter=",") # uncomment to save the Q table
        if isrender == 0:
            #env = wrappers.Monitor(env, './movie/cartpole-experiment-1') # uncomment to record video
            isrender = 1
    # To see the behaviour after only 10 episodes, uncomment the lines below
    #if episode>10:
    #    if isrender == 0:
    #        env = wrappers.Monitor(env, './movie/cartpole-experiment-1') # uncomment to record video
    #        isrender = 1
    #    islearned=1;

# Save the recorded final cart positions only if learning succeeded
if islearned:
    np.savetxt('final_x.csv', final_x, delimiter=",")

  and should_run_async(code)


0 Episode finished after 23.000000 time steps / mean 0.000000
1 Episode finished after 10.000000 time steps / mean -1.780000
2 Episode finished after 68.000000 time steps / mean -3.690000
3 Episode finished after 12.000000 time steps / mean -5.020000
4 Episode finished after 167.000000 time steps / mean -6.910000
5 Episode finished after 77.000000 time steps / mean -7.250000
6 Episode finished after 107.000000 time steps / mean -8.490000
7 Episode finished after 15.000000 time steps / mean -9.430000
8 Episode finished after 88.000000 time steps / mean -11.290000
9 Episode finished after 32.000000 time steps / mean -12.420000
10 Episode finished after 11.000000 time steps / mean -14.110000
11 Episode finished after 103.000000 time steps / mean -16.010000
12 Episode finished after 23.000000 time steps / mean -16.990000
13 Episode finished after 32.000000 time steps / mean -18.770000
14 Episode finished after 16.000000 time steps / mean -20.460000
15 Episode finished after 20.000000 time 

If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


0.050421763
0.05821054
0.069895
0.07767369
0.081551194
0.08153071
0.08541842
0.08540944
0.08930991
0.08931514
0.093231216
0.0932537
0.08938445
0.08942763
0.08557752
0.08563862
0.08180516
0.081881896
0.07806304
0.07815354
0.07434763
0.07445041
0.07065613
0.07077002
0.06698635
0.06711048
0.0633367
0.06347046
0.059706096
0.059849124
0.056093946
0.05624613
0.052500144
0.0526616
0.04892503
0.049096104
0.04536942
0.045550697
0.04183461
0.04202692
0.038322397
0.038526833
0.034835126
0.035053074
0.031375732
0.031608902
0.027947817
0.028198268
0.02455572
0.02482592
0.021204619
0.01369365
0.010096817
0.0026086713
-0.0009665799
-0.008434458
-0.019794647
-0.03504863
-0.04639489
-0.06164249
-0.0729893
-0.0804399
-0.08399733
-0.09146703
-0.09504433
-0.10253469
-0.10613371
-0.11364681
-0.1172699
-0.12480835
-0.12845844
-0.12822218
-0.124099635
-0.11608905
-0.111991
-0.10399693
-0.09990866
-0.09972188
-0.10343376
-0.11104315
-0.12255063
-0.13015361
-0.13385586
-0.13365956
-0.1295652
-0.12157152
-0.117

KeyboardInterrupt: 