# Install dependencies

In [None]:
# System packages installed via apt-get (presumably for headless rendering and
# video encoding -- confirm); stdout and stderr are discarded, so a failed
# install is silent.
!apt-get install -y xvfb python3-opengl ffmpeg > /dev/null 2>&1
# Gym with its classic_control extra. The version is not pinned.
# NOTE(review): later cells unpack five values from env.step(), which looks like
# the gym >= 0.26 API -- consider pinning a matching version.
!pip install gym[classic_control]
# cmake via apt-get; output is discarded.
!apt-get install cmake > /dev/null 2>&1
# Upgrade setuptools; stderr is merged into stdout, so this output stays visible.
!pip install --upgrade setuptools 2>&1
# Remaining Python packages (gym is listed a second time here); all output is discarded.
!pip install ez_setup gym pyvirtualdisplay tensorflow > /dev/null 2>&1

The action is an ndarray with shape (1,) which can take values {0, 1} indicating pushing the cart to the left or right, respectively. Note that the velocity that is reduced or increased by the applied force is not fixed and it depends on the angle the pole is pointing. The center of gravity of the pole varies the amount of energy needed to move the cart underneath it.

In [None]:
import gym
from gym import logger as gymlogger
from gym.wrappers import RecordVideo
gymlogger.set_level(40) #error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

def show_video():
  """Embed the first MP4 found in the ``video/`` directory in the notebook output.

  The file is read, base64-encoded and inlined as a ``data:`` URI inside an
  HTML ``<video>`` element, so playback does not need a separate file server.
  Prints "Could not find video" when ``video/`` contains no ``.mp4`` file.
  """
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode is sufficient, and the context manager guarantees
    # the file handle is closed once the bytes are in memory.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    # The template must contain the <video> element with a {0} placeholder for
    # the base64 payload; an empty template would render nothing at all.
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else:
    print("Could not find video")

# Load cartpole env

In [None]:
# Create the CartPole-v1 environment used by the exploratory cells below.
env = gym.make("CartPole-v1")

Check that the action space contains exactly 2 valid discrete actions (0: push the cart to the left, 1: push the cart to the right).

In [None]:
# Show the environment's action space.
print(env.action_space)

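Display the observation space. Each observation is an array of four values, in this order:

- `observation[0]`: cart position
- `observation[1]`: cart velocity
- `observation[2]`: pole angle
- `observation[3]`: pole angular velocity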

In [None]:
# Show the environment's observation space.
print(env.observation_space)

In [None]:
# With the five-value step API used in this notebook (gym >= 0.26), reset()
# returns an (observation, info) pair. Unpack it so `observation` holds only
# the state array rather than the whole tuple.
observation, info = env.reset()
print("Initial observations:", observation)

In [None]:
# env.step returns (observation, reward, terminated, truncated, info) in that
# order: `terminated` means the episode ended by failure/success, `truncated`
# means it was cut off by the time limit. The round is over if either is set.
observation, reward, terminated, truncated, info = env.step(0)
done = terminated or truncated

# Unpack the four state components so each label prints its own value.
position, velocity, angle, angular_velocity = observation
# Use an explicit precision: a bare width such as `:2f` never pads these values
# and leaves the number of decimals at the default.
print(f"""Position: {position:.4f},
Velocity: {velocity:.4f},
Angle: {angle:.4f},
Angular velocity: {angular_velocity:.4f}""")
print("Reward for this step:", reward)
print("Is this round done?", done)

A simple episode in which the policy always chooses action 0 (push the cart to the left), played until the episode ends.

In [None]:
# reset() returns (observation, info); keep only the state array in `observation`.
observation, info = env.reset()
cumulative_reward = 0
done = False
count = 0  # number of steps taken in this episode
while not done:
    count += 1
    # Step order is (observation, reward, terminated, truncated, info).
    observation, reward, terminated, truncated, info = env.step(0)
    # Stop as soon as the pole falls / cart leaves the track (terminated) or the
    # time limit is hit (truncated); stepping a finished episode is undefined.
    done = terminated or truncated
    cumulative_reward += reward

print(count)
# Unpack the final state so each label prints its own component.
position, velocity, angle, angular_velocity = observation
print(f"""Position: {position:.4f},
Velocity: {velocity:.4f},
Angle: {angle:.4f},
Angular velocity: {angular_velocity:.4f}""")
print("Cumulative reward for this round:", cumulative_reward)

# Task 1

Development of an RL agent. Demonstrate the correctness of the implementation by sampling a random state from the cart pole environment, inputting to the agent, and outputting a chosen action. Print the values of the state and chosen action in Jupyter notebook.

In [None]:
import gym
import numpy as np
class CartpoleWorld():
    """Thin wrapper around the gym ``CartPole-v1`` environment.

    Stores the most recent observation, reward and end-of-episode flags
    returned by the wrapped environment so an agent can query them between
    steps.
    """

    def __init__(self) -> None:
        """Create the environment and start the first episode."""
        self.__env = gym.make("CartPole-v1")
        self.reset()

    def reset(self) -> np.ndarray:
        """Start a new episode and return its initial observation.

        Also clears the stored reward and end-of-episode flags, so the
        accessors are valid before the first call to ``update_world``.
        """
        # reset() returns (observation, info); only the observation is kept.
        self.__observation, _ = self.__env.reset()
        self.__reward = 0.0
        self.__terminated = False
        self.__truncated = False
        return self.__observation

    def get_observation(self) -> np.ndarray:
        """Return the most recent observation."""
        return self.__observation

    def update_world(self, action):
        """Apply ``action`` to the environment and return the step reward."""
        # Step order is (observation, reward, terminated, truncated, info).
        (self.__observation,
         self.__reward,
         self.__terminated,
         self.__truncated,
         _) = self.__env.step(action)
        return self.__reward

    def isEnd(self) -> bool:
        """Return True once the current episode has terminated or been truncated."""
        return bool(self.__terminated or self.__truncated)

    def get_reward(self):
        """Return the reward of the most recent step (0.0 right after a reset)."""
        return self.__reward

In [None]:
class RLAgent():
    """Base agent that acts in a ``CartpoleWorld`` and accumulates its reward.

    ``get_optimal_action`` is the policy hook; it has no implementation here
    and must be overridden (or filled in) before ``move`` can be used.
    """

    def __init__(self, env: "CartpoleWorld") -> None:
        self.__env = env
        self.__total_reward = 0

    def get_optimal_action(self):
        """Return the action to take in the current state.

        Raises:
            NotImplementedError: always, until a policy is provided. Failing
                here is clearer than passing ``None`` on to the environment.
        """
        raise NotImplementedError("RLAgent.get_optimal_action has no policy yet")

    def get_total_reward(self):
        """Return the reward accumulated over all moves made so far."""
        return self.__total_reward

    def move(self):
        """Take one step in the world using the current policy.

        Raises:
            RuntimeError: if the episode has already ended.
        """
        # Refuse to act only when the episode is over.
        if self.__env.isEnd():
            raise RuntimeError("Episode already terminated")
        action = self.get_optimal_action()
        reward = self.__env.update_world(action)
        # update reward
        self.__total_reward += reward
    

# Instantiate the agent on a newly constructed CartpoleWorld.
agent = RLAgent(CartpoleWorld())

# Task 2:

Demonstrate the effectiveness of the RL agent. Run for 100 episodes (reset the environment at the beginning of each episode) and plot the cumulative reward against all episodes in Jupyter. Print the average reward over the 100 episodes. The average reward should be larger than 195.

# Task 3:
Render one episode played by the developed RL agent on Jupyter. Please refer to the sample code link for rendering code

# Task 4:

Format the Jupyter notebook by including step-by-step instruction and explanation, such that the notebook is easy to follow and run (refer to the tutorial section in the sample notebook). Include text explanation to demonstrate the originality of your implementation and your understanding of the code. For example, for each task, explain your approach and analyze the output; if you improve an existing approach, explain your improvements.