<a href="https://colab.research.google.com/github/Mustafa-Dara-Ozevin/Reinforcement_Learning_with_gym/blob/master/CartPoleREINFORCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install box2d-py
!pip install Box2D
!pip install tensorboardx
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!pip install ptan
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K     |▊                               | 10kB 13.8MB/s eta 0:00:01[K     |█▌                              | 20kB 4.8MB/s eta 0:00:01[K     |██▏                             | 30kB 6.2MB/s eta 0:00:01[K     |███                             | 40kB 6.3MB/s eta 0:00:01[K     |███▋                            | 51kB 4.9MB/s eta 0:00:01[K     |████▍                           | 61kB 5.7MB/s eta 0:00:01[K     |█████▏                          | 71kB 6.2MB/s eta 0:00:01[K     |█████▉                          | 81kB 6.0MB/s eta 0:00:01[K     |██████▋                         | 92kB 6.5MB/s eta 0:00:01[K     |███████▎                        | 102kB 7.0MB/s eta 0:00:01[K     |████████                        | 112kB 7.0MB/s eta 0:00:01[K     |████████▊                       | 12

In [10]:
import torch
import ptan
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
 
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
 
from IPython import display as ipythondisplay
import collections
import time
import numpy as np
from tensorboardX import SummaryWriter
 
%load_ext tensorboard
 
# Gym environment id handed to gym.make() in the training cell.
ENV_NAME = 'CartPole-v0'
# Discount factor: used by calc_qvals() and passed to the ptan experience source.
GAMMA = 0.99
# Learning rate given to the Adam optimizer.
LEARNING_RATE = 0.01
# Number of finished episodes collected before each gradient update.
EPISODES_TO_TRAIN = 4
# The training loop renders the env and calls show_video() whenever
# step_idx is a multiple of this value.
SHOW_EVERY = 10_000

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [11]:
from pyvirtualdisplay import Display
# Create and start a non-visible (visible=0) 1400x900 virtual display.
# NOTE(review): presumably needed so env.render() works on a headless VM — confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

def show_video():
  """Embed a recorded video from ``video/`` in the notebook output.

  Searches ``video/*.mp4`` relative to the current working directory. If at
  least one file matches, the first path returned by ``glob`` is read,
  base64-encoded and displayed inline as an HTML ``<video>`` element.
  Otherwise the message "Could not find video" is printed.
  """
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode is sufficient, and the context manager guarantees
    # the file handle is closed once the bytes have been read.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else:
    print("Could not find video")
    

def wrap_env(env):
  """Return ``env`` wrapped in a ``Monitor`` configured with ``'./video'`` and ``force=True``."""
  return Monitor(env, './video', force=True)



In [13]:
class PGN(nn.Module):
  """Policy network: a one-hidden-layer MLP producing one raw score per action.

  No softmax is applied inside the module; ``forward`` returns the output of
  the final linear layer unchanged.

  Args:
    input_size: number of input features per observation.
    n_actions: number of outputs (one per action).
    hidden_size: width of the hidden layer. Defaults to 256.
  """

  def __init__(self, input_size, n_actions, hidden_size=256):
    super(PGN, self).__init__()
    self.net = nn.Sequential(
        nn.Linear(input_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, n_actions)
    )

  def forward(self, x):
    """Run ``x`` through the network and return the resulting scores."""
    return self.net(x)


def calc_qvals(rewards, gamma=None):
  """Compute the discounted return for every step of one episode.

  For step ``t`` the value is
  ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.

  Args:
    rewards: sequence of per-step rewards in chronological order.
    gamma: discount factor. When ``None`` (the default) the module-level
      ``GAMMA`` constant is used.

  Returns:
    A list with one discounted return per entry of ``rewards``, in the same
    order. An empty input yields an empty list.
  """
  if gamma is None:
    gamma = GAMMA
  res = []
  sum_r = 0.0
  # Walk the episode backwards so each return is derived from the next one.
  for r in reversed(rewards):
    sum_r = sum_r * gamma + r
    res.append(sum_r)
  res.reverse()
  return res

if __name__ == '__main__':
    # REINFORCE training loop: collect EPISODES_TO_TRAIN complete episodes,
    # then take one policy-gradient step on the whole batch. Stops once the
    # mean reward over the last 100 episodes exceeds 195.
    env = wrap_env(gym.make(ENV_NAME))

    writer = SummaryWriter(comment='-CartPole-reinforce')
    net = PGN(env.observation_space.shape[0], env.action_space.n)

    # The network outputs raw scores, so the agent is asked to apply softmax.
    agent = ptan.agent.PolicyAgent(net, preprocessor=ptan.agent.float32_preprocessor,
                                    apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA)

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards = []
    step_idx = 0
    done_episodes = 0

    batch_episodes = 0
    batch_states, batch_actions, batch_qvals = [], [], []
    cur_rewards = []

    # try/finally so the writer and the env are closed even when the loop is
    # interrupted or raises.
    try:
        for step_idx, exp in enumerate(exp_source):
            batch_states.append(exp.state)
            batch_actions.append(int(exp.action))
            cur_rewards.append(exp.reward)

            if step_idx % SHOW_EVERY == 0:
                env.render()
                show_video()

            # The loop treats last_state being None as the end of an episode:
            # turn the episode's rewards into discounted returns.
            if exp.last_state is None:
                batch_qvals.extend(calc_qvals(cur_rewards))
                cur_rewards.clear()
                batch_episodes += 1

            # handle new rewards
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                done_episodes += 1
                reward = new_rewards[0]
                total_rewards.append(reward)
                mean_rewards = float(np.mean(total_rewards[-100:]))
                print("%d: reward: %6.2f, mean_100: %6.2f, episodes: %d" % (
                    step_idx, reward, mean_rewards, done_episodes))
                writer.add_scalar("reward", reward, step_idx)
                writer.add_scalar("reward_100", mean_rewards, step_idx)
                writer.add_scalar("episodes", done_episodes, step_idx)
                if mean_rewards > 195:
                    print("Solved in %d steps and %d episodes!" % (step_idx, done_episodes))
                    break

            # Keep collecting until enough complete episodes are in the batch.
            if batch_episodes < EPISODES_TO_TRAIN:
                continue

            optimizer.zero_grad()
            # Stack the observations into a single ndarray first: building a
            # tensor straight from a list of ndarrays is very slow.
            states_v = torch.FloatTensor(np.array(batch_states))
            batch_actions_t = torch.LongTensor(batch_actions)
            batch_qvals_v = torch.FloatTensor(batch_qvals)

            logits_v = net(states_v)
            log_prob_v = F.log_softmax(logits_v, dim=1)
            # Log-probability of each action actually taken, weighted by its
            # discounted return; the loss is the negated mean.
            log_prob_actions_v = batch_qvals_v * log_prob_v[range(len(batch_states)), batch_actions_t]
            loss_v = -log_prob_actions_v.mean()

            loss_v.backward()
            optimizer.step()

            # Start a fresh batch.
            batch_episodes = 0
            batch_states.clear()
            batch_actions.clear()
            batch_qvals.clear()
    finally:
        writer.close()
        env.close()




10: reward:  10.00, mean_100:  10.00, episodes: 1
25: reward:  15.00, mean_100:  12.50, episodes: 2
55: reward:  30.00, mean_100:  18.33, episodes: 3
68: reward:  13.00, mean_100:  17.00, episodes: 4
86: reward:  18.00, mean_100:  17.20, episodes: 5
99: reward:  13.00, mean_100:  16.50, episodes: 6
112: reward:  13.00, mean_100:  16.00, episodes: 7
138: reward:  26.00, mean_100:  17.25, episodes: 8
156: reward:  18.00, mean_100:  17.33, episodes: 9
179: reward:  23.00, mean_100:  17.90, episodes: 10
193: reward:  14.00, mean_100:  17.55, episodes: 11
205: reward:  12.00, mean_100:  17.08, episodes: 12
231: reward:  26.00, mean_100:  17.77, episodes: 13
252: reward:  21.00, mean_100:  18.00, episodes: 14
281: reward:  29.00, mean_100:  18.73, episodes: 15
295: reward:  14.00, mean_100:  18.44, episodes: 16
364: reward:  69.00, mean_100:  21.41, episodes: 17
382: reward:  18.00, mean_100:  21.22, episodes: 18
416: reward:  34.00, mean_100:  21.89, episodes: 19
430: reward:  14.00, mean_1

10111: reward: 200.00, mean_100:  96.81, episodes: 120
10264: reward: 153.00, mean_100:  97.84, episodes: 121
10434: reward: 170.00, mean_100:  99.43, episodes: 122
10596: reward: 162.00, mean_100: 100.57, episodes: 123
10767: reward: 171.00, mean_100: 101.99, episodes: 124
10876: reward: 109.00, mean_100: 102.80, episodes: 125
10999: reward: 123.00, mean_100: 103.60, episodes: 126
11109: reward: 110.00, mean_100: 103.75, episodes: 127
11238: reward: 129.00, mean_100: 104.55, episodes: 128
11344: reward: 106.00, mean_100: 105.16, episodes: 129
11458: reward: 114.00, mean_100: 106.15, episodes: 130
11518: reward:  60.00, mean_100: 105.60, episodes: 131
11628: reward: 110.00, mean_100: 105.33, episodes: 132
11706: reward:  78.00, mean_100: 105.61, episodes: 133
11792: reward:  86.00, mean_100: 105.72, episodes: 134
11900: reward: 108.00, mean_100: 106.37, episodes: 135
11967: reward:  67.00, mean_100: 106.33, episodes: 136
12013: reward:  46.00, mean_100: 106.21, episodes: 137
12100: rew

20051: reward: 200.00, mean_100: 131.14, episodes: 202
20251: reward: 200.00, mean_100: 132.30, episodes: 203
20451: reward: 200.00, mean_100: 133.33, episodes: 204
20651: reward: 200.00, mean_100: 134.02, episodes: 205
20851: reward: 200.00, mean_100: 134.59, episodes: 206
21051: reward: 200.00, mean_100: 134.67, episodes: 207
21251: reward: 200.00, mean_100: 135.29, episodes: 208
21439: reward: 188.00, mean_100: 135.17, episodes: 209
21639: reward: 200.00, mean_100: 135.17, episodes: 210
21839: reward: 200.00, mean_100: 135.17, episodes: 211
22039: reward: 200.00, mean_100: 135.17, episodes: 212
22239: reward: 200.00, mean_100: 135.17, episodes: 213
22439: reward: 200.00, mean_100: 135.17, episodes: 214
22639: reward: 200.00, mean_100: 135.17, episodes: 215
22839: reward: 200.00, mean_100: 135.17, episodes: 216
23039: reward: 200.00, mean_100: 135.28, episodes: 217
23239: reward: 200.00, mean_100: 135.28, episodes: 218
23439: reward: 200.00, mean_100: 135.28, episodes: 219
23639: rew

30039: reward: 200.00, mean_100: 171.19, episodes: 252
30239: reward: 200.00, mean_100: 172.66, episodes: 253
30439: reward: 200.00, mean_100: 173.88, episodes: 254
30639: reward: 200.00, mean_100: 175.19, episodes: 255
30839: reward: 200.00, mean_100: 176.19, episodes: 256
31039: reward: 200.00, mean_100: 177.24, episodes: 257
31239: reward: 200.00, mean_100: 178.49, episodes: 258
31439: reward: 200.00, mean_100: 179.67, episodes: 259
31639: reward: 200.00, mean_100: 181.08, episodes: 260
31779: reward: 140.00, mean_100: 181.72, episodes: 261
31908: reward: 129.00, mean_100: 182.15, episodes: 262
32061: reward: 153.00, mean_100: 182.91, episodes: 263
32261: reward: 200.00, mean_100: 184.07, episodes: 264
32373: reward: 112.00, mean_100: 184.13, episodes: 265
32497: reward: 124.00, mean_100: 184.64, episodes: 266
32616: reward: 119.00, mean_100: 184.65, episodes: 267
32747: reward: 131.00, mean_100: 184.87, episodes: 268
32864: reward: 117.00, mean_100: 184.90, episodes: 269
32964: rew

40025: reward: 120.00, mean_100:  52.33, episodes: 402
40131: reward: 106.00, mean_100:  52.98, episodes: 403
40243: reward: 112.00, mean_100:  53.67, episodes: 404
40366: reward: 123.00, mean_100:  54.54, episodes: 405
40492: reward: 126.00, mean_100:  55.34, episodes: 406
40611: reward: 119.00, mean_100:  56.21, episodes: 407
40741: reward: 130.00, mean_100:  57.30, episodes: 408
40864: reward: 123.00, mean_100:  58.07, episodes: 409
40997: reward: 133.00, mean_100:  59.11, episodes: 410
41115: reward: 118.00, mean_100:  60.05, episodes: 411
41263: reward: 148.00, mean_100:  61.25, episodes: 412
41390: reward: 127.00, mean_100:  62.15, episodes: 413
41517: reward: 127.00, mean_100:  63.13, episodes: 414
41670: reward: 153.00, mean_100:  64.39, episodes: 415
41817: reward: 147.00, mean_100:  65.51, episodes: 416
41957: reward: 140.00, mean_100:  66.65, episodes: 417
42089: reward: 132.00, mean_100:  67.74, episodes: 418
42208: reward: 119.00, mean_100:  68.49, episodes: 419
42331: rew

50158: reward: 180.00, mean_100: 126.37, episodes: 472
50358: reward: 200.00, mean_100: 127.94, episodes: 473
50524: reward: 166.00, mean_100: 129.04, episodes: 474
50693: reward: 169.00, mean_100: 130.38, episodes: 475
50876: reward: 183.00, mean_100: 131.79, episodes: 476
51076: reward: 200.00, mean_100: 133.27, episodes: 477
51276: reward: 200.00, mean_100: 134.66, episodes: 478
51476: reward: 200.00, mean_100: 136.17, episodes: 479
51676: reward: 200.00, mean_100: 137.80, episodes: 480
51874: reward: 198.00, mean_100: 139.00, episodes: 481
52074: reward: 200.00, mean_100: 140.26, episodes: 482
52274: reward: 200.00, mean_100: 141.70, episodes: 483
52471: reward: 197.00, mean_100: 143.21, episodes: 484
52671: reward: 200.00, mean_100: 144.26, episodes: 485
52871: reward: 200.00, mean_100: 145.53, episodes: 486
53071: reward: 200.00, mean_100: 147.12, episodes: 487
53271: reward: 200.00, mean_100: 148.37, episodes: 488
53471: reward: 200.00, mean_100: 149.25, episodes: 489
53671: rew

60071: reward: 200.00, mean_100: 174.83, episodes: 522
60271: reward: 200.00, mean_100: 175.42, episodes: 523
60471: reward: 200.00, mean_100: 176.03, episodes: 524
60671: reward: 200.00, mean_100: 176.75, episodes: 525
60871: reward: 200.00, mean_100: 177.47, episodes: 526
61071: reward: 200.00, mean_100: 178.06, episodes: 527
61271: reward: 200.00, mean_100: 178.76, episodes: 528
61471: reward: 200.00, mean_100: 179.43, episodes: 529
61671: reward: 200.00, mean_100: 180.02, episodes: 530
61871: reward: 200.00, mean_100: 180.76, episodes: 531
62071: reward: 200.00, mean_100: 181.34, episodes: 532
62271: reward: 200.00, mean_100: 181.99, episodes: 533
62471: reward: 200.00, mean_100: 182.71, episodes: 534
62671: reward: 200.00, mean_100: 183.36, episodes: 535
62871: reward: 200.00, mean_100: 183.87, episodes: 536
63071: reward: 200.00, mean_100: 184.38, episodes: 537
63271: reward: 200.00, mean_100: 184.74, episodes: 538
63471: reward: 200.00, mean_100: 185.34, episodes: 539
63671: rew

In [None]:
# Launch TensorBoard inline, pointed at the directory named below.
# NOTE(review): presumably "./runs" is where the SummaryWriter from the
# training cell writes its event files — confirm.
logs_base_dir = "./runs"
%tensorboard --logdir {logs_base_dir}

In [None]:
# Show the observation space of `env`; requires the training cell, which
# creates `env`, to have been run first.
print(env.observation_space)

In [9]:
# Display a recorded video, if one exists, using the helper defined earlier
# in the notebook.
show_video()