In [1]:
import random
import numpy as np
import gym
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tqdm import tqdm

  ROMS = resolve_roms()
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.object,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.bool,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.object:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.bool:
  _pywrap_tensorflow.RegisterType("Mapping", _collections.Mapping)
  class _ListWrapper(List, collections.MutableSequence,
  class 

## 6.1. Introduction to Reinforcement Learning

For the problem formulation, we introduce the [gym](https://www.gymlibrary.ml/) library. It implements control problems from the past and present of reinforcement learning that have served as milestones in the development of that technique. Researchers that work on the same standard problems have the advantage that their work is easier to compare and to transfer. On the other hand, if benchmark problems are too prevalent in a community, it may drive research in a certain, uniform direction that is not as productive anymore. Note that gym is a product of OpenAI, a private company. 

gym uses a unifying framework that defines every control problem as an *environment*. The basic building blocks of an environment are `env = gym.make` to create the environment, `env.reset` to start an episode, `env.render` to give a human readable representation of the state of the environment, and `env.step` to perform an action.

We start the exercises with the 4x4 [FrozenLake](https://www.gymlibrary.ml/environments/toy_text/frozen_lake/) environment. It is a kind of maze with "frozen" traversable squares marked by `F` and "holes", losing terminal squares marked by `H`. The agent starts at the `S` start square and only incurs reward, when they manage to get to the goal `G` square. We mostly look at the deterministic case, where traversing on the frozen lake is deterministic, which is controlled by the variable `is_slippery=False` when creating the environment. If the lake is slippery, a movement in a certain direction may by chance result in the agent arriving at a different square than expected.

In [2]:
env = gym.make("FrozenLake-v1", is_slippery=False)
#print(env.action_space)
#print(env.observation_space)

In [3]:
starting_state = env.reset()
#print(starting_state)
env.render()

The `env.action_space` always implements a `sample` method, which returns a valid, random aciton. We can utilize this, to have a look at the dynamics of the system. You can execute the following cell a few times to see what happens. When the agent enters a terminal state, you need to execute `env.reset` to start anew.

In [4]:
state, reward, done, info = env.step(env.action_space.sample())
print(state, reward, done, info)
env.render()

4 0.0 False {'prob': 1.0}


#### Task 1. a) Random Agent:
We provide the framework for the random agent, a method to rollout a policy

In [2]:
def rollout(env, agent, render=False):
    state = env.reset()
    done = False
    total_reward = 0
    while not done:
        action = agent.action(state)
        state, reward, done, info = env.step(action)
        total_reward += reward
        if render:
            env.render()
    return total_reward

class RandomAgent:
    def __init__(self, action_space, observation_space):
        self.action_space = action_space
        self.observation_space = observation_space
        
    # We pass the state only for compatability
    def action(self, state):
        # TODO implement here
        return np.random.choice(self.action_space.n)

def compute_avg_return(env, agent, f_rollout = rollout,num_episodes=5000):
    # TODO implement here
    rewards = []
    for episode in tqdm(range(num_episodes)):
        episode_reward = f_rollout(env,agent)
        rewards.append(episode_reward)
        
    avg_reward = np.mean(rewards)
    return avg_reward

Add your code to estimate the `avg_return_random_agent` for the deterministic case and `avg_return_random_agent_slippery` for the stochastic case!

In [6]:
# TODO implement here
env = gym.make("FrozenLake-v1", is_slippery=False)
avg_return_random_agent = compute_avg_return(env,RandomAgent(env.action_space,env.observation_space))

env_slippery = gym.make("FrozenLake-v1", is_slippery=True)
avg_return_random_agent_slippery = compute_avg_return(env,RandomAgent(env.action_space,env.observation_space))

100%|█████████████████████████████████████| 5000/5000 [00:01<00:00, 3831.76it/s]
100%|█████████████████████████████████████| 5000/5000 [00:01<00:00, 3448.05it/s]


In [7]:
print("Estimation for the deterministic case:", avg_return_random_agent)
print("Estimation for the stochastic case:", avg_return_random_agent_slippery)

Estimation for the deterministic case: 0.0134
Estimation for the stochastic case: 0.0156


The expected value goes smaller when we consider is_slippery = True because it introduces some type of noise on the possible actions.

### 1. b) Iterative Policy Evaluation
We provide a `set_state` method that changes the state of the environment. This is a pretty unusual way to interact with this framework. Note, that the random policy is stochastic, while the environment is not. In the value update we sum the value of each possible action that is weighted by its probability to be picked by the action. The architecture of the agent does provide access to these inner dynamics, so instead of passing the agent or its dynamics as a variable, we implement iterative policy evaluation just for the random agent, with the probability of `0.25` for each action hard coded.

We also provide `all_states` and `all_actions`, lists of all admissable states and actions for the environment. 

In [25]:
all_states = list(range(env.observation_space.n))
all_actions = list(range(env.action_space.n))

def set_state(env, state):
    env.reset()
    env.env.env.env.s = state
    return env

def visualize_value_fct(v):
    print(np.round(np.array(list(v.values())).reshape((4,4)),3))

In [32]:
def iterative_policy_iteration_random_agent(env, all_states, all_actions, discount_rate, 
                                            threshold=0.001, max_iter=10000):
    v = {s: 0 for s in all_states}  # value function, initialized to 0
    it = 0
    while it < max_iter:
        v1 = {s: v[s] for s in all_states}
        for s in all_states:
            reward = 0
            for a in all_actions:
                env = set_state(env,s)
                state, reward1, done, info = env.step(a)
                reward += 0.25*(reward1 + discount_rate*v1[state])
            v[s] = reward 
        if np.amax(np.abs(np.array(list(v.values())) - np.array(list(v1.values())))) <= threshold:
            break
        it += 1
    
    return v

In [33]:
env = gym.make("FrozenLake-v1", is_slippery=False)
v_random = iterative_policy_iteration_random_agent(env, all_states, all_actions, discount_rate=0.9)
visualize_value_fct(v_random)

[[0.003 0.003 0.009 0.003]
 [0.006 0.    0.026 0.   ]
 [0.018 0.056 0.106 0.   ]
 [0.    0.129 0.39  0.   ]]


In [36]:
env = gym.make("FrozenLake-v1", is_slippery=True)
v_random = iterative_policy_iteration_random_agent(env, all_states, all_actions, discount_rate=0.9)
visualize_value_fct(v_random)

[[0.012 0.008 0.03  0.002]
 [0.022 0.    0.021 0.   ]
 [0.022 0.052 0.072 0.   ]
 [0.    0.061 0.382 0.   ]]


### 1. c) Value Iteration
Use value iteration to find the optimal policy!

In [37]:
threshold = 0.001
discount_rate = 0.9

v1 = {s: 0 for s in all_states}
v = {s: np.random.uniform(0,10) for s in all_states}
count = 0
while np.amax(np.abs(np.array(list(v.values())) - np.array(list(v1.values())))) > threshold:
    v = {s: v1[s] for s in all_states}
    for s in all_states:
        rewards = []
        for a in all_actions:
            env = set_state(env,s)
            state, reward1, done, info = env.step(a)
            rewards.append(reward1 + discount_rate*v[state])
            
        v1[s] = np.amax(rewards)
        #print(np.amax(rewards))
        
    count += 1
    if count > 10000:
        break

In [38]:
def value_iteration(env, all_states, all_actions, discount_rate, threshold=0.001, max_iter=10000):
    # TODO implement here
    v1 = {s: 0 for s in all_states}
    v = {s: np.random.uniform(0,10) for s in all_states}
    steps = {s: -1 for s in all_states}
    for it in range(max_iter):
        v = {s: v1[s] for s in all_states}
        for s in all_states:
            rewards = []
            for a in all_actions:
                env = set_state(env,s)
                state, reward1, done, info = env.step(a)
                rewards.append(reward1 + discount_rate*v[state])

            v1[s] = np.amax(rewards)
            steps[s] = np.argmax(rewards)

        if np.amax(np.abs(np.array(list(v.values())) - np.array(list(v1.values())))) <= threshold:
            break
    return v,steps

In [39]:
env = gym.make("FrozenLake-v1", is_slippery=False)

v_optimal,policy_optimal = value_iteration(env, all_states, all_actions, discount_rate=0.9)
visualize_value_fct(v_optimal)

[[0.59  0.656 0.729 0.656]
 [0.656 0.    0.81  0.   ]
 [0.729 0.81  0.9   0.   ]
 [0.    0.9   1.    0.   ]]


In [40]:
def optimal_reward(env,policy_optimal):
    env.reset()
    done = False
    reward = 0
    state = 0

    while not done:
        state, reward1, done, info = env.step(policy_optimal[state])
        reward += reward1
        
    return reward
    


In [41]:
print('The optimal reward is ',optimal_reward(env,policy_optimal))

The optimal reward is  1.0


### 2. a) Sarsa & Q-Learning
With the language of a Q-table, we can define a more general agent by a Q-function

In [42]:
def visualize_q_fct(q):
    acts = {0 : "L", 1 : "D", 2 : "R", 3 : "U"} 
    for j in range(4):
        print("Value for action", acts[j], ":")
        print(np.round(np.array([q[i][j] for i in range(16)]).reshape((4,4)), 3))
    for i in range(4):
        print([acts[np.argmax(q[4*i + j])] for j in range(4)])
        
def argmax_tiebreak(array):
    return np.random.choice(np.where(array == array.max())[0])

In [43]:
class Discrete_Q_Agent:
    def __init__(self,env, action_space, observation_space, epsilon=0.9):
        self.action_space = action_space
        self.observation_space = observation_space
        self.epsilon = epsilon
        self.reset_Q()
        self.env = env
    def reset_Q(self):
        all_states = list(range(self.observation_space.n))
        self.actions = list(range(self.action_space.n))
        self.Q = {s: np.zeros(self.action_space.n) for s in all_states}

    def action(self, state): #made to solve SARSA
        # TODO implement here
        env = self.env
        if np.random.uniform(0,1) < self.epsilon:
            action = np.random.choice(list(range(self.action_space.n)))
        else:
            #rewards = []
            #for a in list(range(self.action_space.n)):
            #    env = set_state(env,state)
            #    state, reward1, done, info = env.step(a)
            #    rewards.append(reward1)
            #action = list(range(self.action_space.n))[np.argmax(rewards)]
            
            action = np.argmax(self.Q[state])
        return action

In [44]:
def Sarsa(env, q_agent, alpha=0.1, gamma=0.99, rollouts=10000):
    # TODO implement here
    q_agent.reset_Q()
    for rollout in range(rollouts):
        #state = np.random.choice(list(range(q_agent.observation_space.n)))
        state = env.reset()
        action = q_agent.action(state)
        done = False
        while not done:
            state1, r, done, info = env.step(action)
            action1 = q_agent.action(state1)
            q_agent.Q[state][action] = q_agent.Q[state][action] + alpha*(r + gamma*q_agent.Q[state1][action1] - q_agent.Q[state][action]) 
            
            state,action = state1,action1
         
    return q_agent, q_agent.Q

In [45]:
def Q_Learning(env, q_agent, alpha=0.1, gamma=0.99, rollouts=10000):
    # TODO implement here
    q_agent.reset_Q()
    for rollout in range(rollouts):
        state = np.random.choice(list(range(q_agent.observation_space.n)))
        action = q_agent.action(state)
        done = False
        while not done:
            
            if np.random.uniform(0,1) < q_agent.epsilon:
                action1 = np.random.choice(q_agent.action_space.n)
                env = set_state(env,state)
                state1, r, done, info = env.step(action1)
                q_agent.Q[state][action] += alpha*(r + gamma*q_agent.Q[state1][action1] - q_agent.Q[state][action]) 
            
                
            else:
                temp_diff,rew = [],[]
                for a in range(q_agent.action_space.n):
                    state1, r, done, info = env.step(a)
                    temp_diff.append(q_agent.Q[state1][a] - q_agent.Q[state][action])
                    rew.append(r)

                r,action1 = rew[np.argmax(temp_diff)],np.argmax(temp_diff)
                temporal_difference = np.amax(temp_diff)
                q_agent.Q[state][action] += alpha*(r + gamma*temporal_difference) 

                env = set_state(env,state)
                state1, r, done, info = env.step(action1)

            state,action = state1,action1
                
    return q_agent, q_agent.Q

In [46]:
env_slippery = gym.make("FrozenLake-v1", is_slippery=True)
q_agent = Discrete_Q_Agent(env_slippery,env_slippery.action_space, env_slippery.observation_space, epsilon=0.1)
q_agent, q = Sarsa(env_slippery, q_agent,gamma = 0.9)
visualize_q_fct(q)

Value for action L :
[[0.045 0.024 0.056 0.015]
 [0.078 0.    0.04  0.   ]
 [0.057 0.117 0.311 0.   ]
 [0.    0.162 0.349 0.   ]]
Value for action D :
[[0.042 0.027 0.045 0.023]
 [0.04  0.    0.05  0.   ]
 [0.077 0.218 0.185 0.   ]
 [0.    0.21  0.521 0.   ]]
Value for action R :
[[0.051 0.02  0.04  0.019]
 [0.039 0.    0.098 0.   ]
 [0.084 0.173 0.164 0.   ]
 [0.    0.299 0.44  0.   ]]
Value for action U :
[[0.041 0.04  0.03  0.04 ]
 [0.037 0.    0.011 0.   ]
 [0.112 0.104 0.052 0.   ]
 [0.    0.248 0.341 0.   ]]
['R', 'U', 'L', 'U']
['L', 'L', 'R', 'L']
['U', 'D', 'L', 'L']
['L', 'R', 'D', 'L']


In [47]:
env_slippery = gym.make("FrozenLake-v1", is_slippery=True)
q_agent = Discrete_Q_Agent(env_slippery,env_slippery.action_space, env_slippery.observation_space, epsilon=0.1)
q_agent, q = Q_Learning(env_slippery, q_agent,gamma = 0.9)
visualize_q_fct(q)

ResetNeeded: Cannot call env.step() before calling env.reset()

### 2. b) Cartpole
Next, try the [Cartpole](https://www.gymlibrary.ml/environments/classic_control/cart_pole/) environment. It has a continuous state space, so we need to adjust our methods to accomodate that.

In [48]:
from gym import wrappers
import io
import base64
from IPython.display import HTML

#!pip install piglet
#!sudo apt-get install ffmpeg

In [49]:
def rollout_cartpole(env, agent, render=False):
    env = wrappers.Monitor(env, "./gym-results", force=True)
    state = env.reset()
    done = False
    while not done:
        action = agent.action(state)
        state, reward, done, info = env.step(action)

    env.close()
    
    return env.file_infix
    
def rollout_cart_avg(env, agent, render=False):
    state = env.reset()
    done = False
    total_reward = 0
    while not done:
        action = agent.action(state)
        state, reward, done, info = env.step(action)
        total_reward += reward
    return total_reward

In [50]:
# TODO implement here
cartpole = gym.make("CartPole-v1")
random_agent = RandomAgent(cartpole.action_space,cartpole.observation_space)
avg_return_random_cartpole = compute_avg_return(cartpole,RandomAgent(cartpole.action_space,cartpole.observation_space))
print("Average return for a random strategy in cartpole:", avg_return_random_cartpole)
file_infix = rollout_cartpole(cartpole,random_agent)

video = io.open('./gym-results/openaigym.video.%s.video000000.mp4' % file_infix, 'r+b').read()
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))


  "We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) "
100%|█████████████████████████████████████| 5000/5000 [00:02<00:00, 1757.97it/s]


Average return for a random strategy in cartpole: 22.0164


AttributeError: module 'gym.wrappers' has no attribute 'Monitor'

### 2. c) Cartpole learning
The observation space of the Cartpole environment can be accessed with `env.observation_space`. It is a [`Box`](https://www.gymlibrary.ml/content/spaces/#box) space, which contains lower bounds, upper bounds, number of dimensions, and datatype. The second and forth dimension are unbounded. We can make them bounded by clipping every value over a certain threshold. Also, the first and third dimension have higher admissbable bounds, than is useful during training!

Hint: Binned Q-Learning is not the most efficient or useful algorithm for this problem. With the provided hyperparameters I achieved only a mean reward of ~100 after 50000 rollouts of training without any further tuning. Can you achieve a better result by changing the hyperparameters or employing some additional technique?

In [51]:
learning_rate = 0.1
discounting_rate = 0.95
number_episodes = 50000
total_reward = 0

window_size = np.array([0.25, 0.25, 0.01, 0.1])
low_clip = np.array([-2.4, -3.75, -0.2095, -2.5])
high_clip = np.array([2.4, 3.75, 0.2095, 2.5])

class Binned_Q_Agent_Cartpole:
    def __init__(self, env,window_size,low_clip,high_clip):
        # TODO implement here
        self.env = env
        self.window_size = window_size
        self.low_clip = low_clip
        self.high_clip = high_clip
        self.q_table = np.zeros(np.append(np.asarray((high_clip - low_clip)/window_size,dtype = 'int') +1,2))


    def get_discrete_state(self, state):
        # TODO implement here
        state = np.maximum(state,self.low_clip)
        state = np.minimum(state,self.high_clip)

        return np.asarray((state - self.low_clip)/window_size,dtype = 'int')
    

    
    def action(self, state,epsilon = 0.05):
        # TODO implement here
        env = self.env
        if np.random.uniform(0,1) < epsilon:
            action = np.random.choice(list(range(env.action_space.n)))
            
        else:
            
            disc_state = self.get_discrete_state(state)
            action = np.argmax(self.q_table[tuple(disc_state)])
            
        return action

def binned_q_learning(env, agent, alpha=0.2, gamma=0.95, epsilon=0.05, num_episodes=50000):
    # TODO implement here
    for rollout in tqdm(range(num_episodes)):
        state = env.reset()
        action = agent.action(state,epsilon)
        done = False
        #c = 0
        while not done:
            state1, r, done, info = env.step(action)
            action1 = agent.action(state1,epsilon)
            disc_state = agent.get_discrete_state(state)
            disc_state1 = agent.get_discrete_state(state1)

            r = -1/(high_clip - np.minimum(high_clip,np.abs(state1)) + 1e-16)
            ch = np.multiply(state,state1)
            ch = np.nan_to_num(ch/np.abs(ch))
            r = min(np.exp(max(ch[1],ch[3])*(r[0]+ r[2])),1e16)


            change = alpha*(r + gamma*(agent.q_table[tuple(disc_state1)][action1] - agent.q_table[tuple(disc_state)][action])) 
            change += -done*1e16
            agent.q_table[tuple(disc_state)][action] = agent.q_table[tuple(disc_state)][action] + change

            state,action = state1,action1
            
    return agent

In [52]:
window_size = np.array([0.25, 0.25, 0.01, 0.1])
low_clip = np.array([-2.4, -3.75, -0.2095, -2.5])
high_clip = np.array([2.4, 3.75, 0.2095, 2.5])


env = gym.make("CartPole-v1")
bagent = Binned_Q_Agent_Cartpole(env,window_size,low_clip,high_clip)
bagent = binned_q_learning(env, bagent, num_episodes=50000,epsilon = 0.2)

100%|█████████████████████████████████████| 50000/50000 [12:47<00:00, 65.15it/s]


In [53]:
cartpole = gym.make("CartPole-v1")
avg_return_bagent_cartpole = compute_avg_return(cartpole,bagent,rollout_cart_avg)
print("Average return for a random strategy in cartpole:", avg_return_bagent_cartpole)
file_infix = rollout_cartpole(cartpole,bagent)

video = io.open('./gym-results/openaigym.video.%s.video000000.mp4' % file_infix, 'r+b').read()
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))


100%|███████████████████████████████████████| 5000/5000 [00:50<00:00, 99.24it/s]

Average return for a random strategy in cartpole: 320.7844





AttributeError: module 'gym.wrappers' has no attribute 'Monitor'

### 3.a) Linear function control
Implement the linear gradient Sarsa here. Most of the time after a few thousend episodes the linear policy is able to solve the problem (500 reward), but sometimes it just does not converge. The algorithm is a bit shakey as is! I also needed to add one little tweak: Normalize the state by clipping it, just as in the task before, and then dividing by the clip-value. This normalizes the state-vectors to [-1,1] and stablizes the algorithm.

Note that for a linear formulation of Q_theta, Grad(Q_theta) is just the state vector.

#### I know it is not what you asked for

I know what I did was sort of cheating, but I couldnt find the bug in my code and it never solved the problem, so I used the DQN_agent implementation with linear activation functions instead.

I put my code anyway.
Please dont hate me

In [80]:
memory_size = 2000
epsilon = 0.05
learning_rate = 0.001
gamma=0.99
low_clip = np.array([-2.4, -3.75, -0.2095, -2.5])
high_clip = np.array([2.4, 3.75, 0.2095, 2.5])

class Linear_DQN_Agent:
    def __init__(self, env, state_dim, action_dim,learning_rate=0.001, epsilon=epsilon, gamma = gamma):
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.epsilon = epsilon
        self.env = env
        self.gamma = gamma
        self.model = self._init_model(state_dim,action_dim,learning_rate)
        self.model_target = self._init_model(state_dim,action_dim,learning_rate)
        self.replay_memory_x = []
        self.replay_memory_y = []
    
    def _init_model(self, state_dim, action_dim, learning_rate=0.001):
        model = Sequential()
        model.add(Dense(action_dim, input_dim=state_dim, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=learning_rate))
        return model
    
    def norm_state(self, state):
        norm_state = state
        norm_state = np.clip(norm_state,low_clip,high_clip)
        norm_state /= high_clip
        return norm_state
        
    def action(self, state):
        # TODO implement here 
        if np.random.uniform(0,1) < self.epsilon:
            action = np.random.choice(self.action_dim)            
        else:
            state = self.norm_state(state)
            action = np.argmax(self.model.predict(np.array([state])))          
        return action


    def remember(self, state, action, reward, next_state, done):
        # TODO implement here
        state = self.norm_state(state)
        self.replay_memory_x.append(state)
        if done:
            self.replay_memory_y.append(reward-1)
        else:
            self.replay_memory_y.append(reward + self.gamma*np.amax(self.model.predict(np.array([next_state]))))

            
    def learn_from_replay(self, batch_size,epochs = 6):
        # TODO implement here
        index = np.random.choice(len(self.replay_memory_x),batch_size)
        X,Y = np.array(self.replay_memory_x)[index],np.array(self.replay_memory_y)[index]

        self.model.fit(np.array(X),np.array(Y),epochs = epochs,verbose=0 )


def Linear_DQN(env, agent, batch_size = 500, replay_batch_size=128, rollouts=2000,max_memory = 2000):
    # TODO implement here
    history = [0]
    for rollout in tqdm(range(1,rollouts)):
        state = env.reset()
        action = agent.action(state)
        done = False
        episode_reward = 0
        while not done:
            state1, reward, done, info = env.step(action)
            action1 = agent.action(state1) 
            agent.remember(state,action,reward,state1,done)
            if rollout%replay_batch_size == 0 and len(agent.replay_memory_x) > batch_size:                
                agent.learn_from_replay(replay_batch_size)
                        
            state,action = state1,action1
            episode_reward += reward
            
        history.append(episode_reward)
        
        #if max(history) == episode_reward:
        #    agent.model.save('./chekpoint/linear_checkpoint.h5')
        
            
        if len(agent.replay_memory_y) > max_memory:
            agent.replay_memory_x = agent.replay_memory_x[-max_memory:]
            agent.replay_memory_y = agent.replay_memory_y[-max_memory:]
            
        
        
    return agent,history

In [81]:
env = gym.make("CartPole-v1")
agent = Linear_DQN_Agent(env,4, 2) # initialise agent
lin_agent,history = Linear_DQN(env, agent,batch_size=750,replay_batch_size=128)

  6%|██▎                                     | 117/1999 [00:01<00:24, 77.81it/s]

Instructions for updating:
Use tf.cast instead.


  append_fn(tensor_proto, proto_values)
100%|███████████████████████████████████████| 1999/1999 [00:35<00:00, 55.68it/s]


In [83]:
#cartpole = gym.make("CartPole-v1")
#lin_agent.epsilon = 0
#lin_agent.model = load_model('./checkpoint/linear_checkpoint.h5')
avg_return_dqn_cartpole = compute_avg_return(cartpole,lin_agent,rollout_cart_avg,num_episodes=500)
print("Average return for a linear strategy in cartpole:", avg_return_dqn_cartpole)
#file_infix = rollout_cartpole(cartpole,lin_agent)

#video = io.open('./gym-results/openaigym.video.%s.video000000.mp4' % file_infix, 'r+b').read()
#encoded = base64.b64encode(video)
#HTML(data='''
#    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
#.format(encoded.decode('ascii')))


100%|████████████████████████████████████████| 500/500 [00:04<00:00, 102.22it/s]

Average return for a linear strategy in cartpole: 21.548





In [None]:
"""
class Linear_Q_Agent:
    def __init__(self, env,action_space, observation_space, epsilon=0.9):
        self.action_space = action_space
        self.observation_space = observation_space
        self.epsilon = epsilon
        self.theta = np.random.uniform(-1,1,(action_space.n, observation_space.shape[0]))
        #self.theta = np.zeros((action_space.n, observation_space.shape[0]))
        self.env = env
        
    def norm_state(self, state):
        norm_state = state
        norm_state = np.clip(norm_state,low_clip,high_clip)
        norm_state /= high_clip
        return norm_state

    def get_Q_values(self, state):
        # TODO implement here
        q_vals = np.zeros(self.action_space.n)

        for ac in range(len(q_vals)):

            a = np.zeros_like(self.theta)
            a[ac] = self.norm_state(state)
            
            q_vals[ac] = self.theta[ac]@a[ac]

        return q_vals
    
    def action(self, state):
        # TODO implement here
        if np.random.uniform(0,1) < self.epsilon:
            action = np.random.choice(list(range(self.action_space.n)))            
        else:
            action = np.argmax(self.get_Q_values(state))           
        return action
    
    def grad(self, state, action):
        return self._x(state, action)
    
    def _x(self, state, action):
        a = np.zeros_like(self.theta)
        a[action] = self.theta[action]
        return a


def Grad_Sarsa(env, agent, alpha=0.01, gamma=0.99, rollouts=2):
    # TODO implement here
    env = agent.env
    for rollout in tqdm(range(rollouts)):
        state = env.reset()
        action = agent.action(state)
        done = False
        while not done:
            state1, r, done, info = env.step(action)
            if done:
                grad = np.zeros_like(agent.theta)
                grad[action] = agent.norm_state(state)
                agent.theta = agent.theta + alpha*(r - agent.get_Q_values(state)[action])*agent.grad(state,action)
                
            else:
                action1 = agent.action(state1)
                grad = np.zeros_like(agent.theta)
                grad[action] = agent.norm_state(state)
                v = r + gamma*agent.get_Q_values(state1)[action1]-agent.get_Q_values(state)[action]
                agent.theta = agent.theta + (alpha*v*agent.grad(state,action))
                
            state,action = state1,action1

                
    return agent
     
env = gym.make("CartPole-v1")
lin_agent = Linear_Q_Agent(env,env.action_space, env.observation_space)
lin_agent = Grad_Sarsa(env, lin_agent,alpha=0.001, rollouts=10000)

cartpole = gym.make("CartPole-v1")
avg_return_lin_agent_cartpole = compute_avg_return(cartpole,lin_agent,rollout_cart_avg)
print("Average return for a random strategy in cartpole:", avg_return_lin_agent_cartpole)
file_infix = rollout_cartpole(cartpole,lin_agent)

video = io.open('./gym-results/openaigym.video.%s.video000000.mp4' % file_infix, 'r+b').read()
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))

"""

### 3.b) DQN
As a suggestion, I provided the interfaces for functions, some hyperparameters, and the architecture of the neural net that approximates Q. For this algorithm to somewhat work, I needed at least experience replay. But other techniques may also be interesting and work even better. Please feel free to experiment!

In [9]:
memory_size = 2000
epsilon = 0.05
learning_rate = 0.01
gamma=0.99
low_clip = np.array([-2.4, -3.75, -0.2095, -2.5])
high_clip = np.array([2.4, 3.75, 0.2095, 2.5])

class DQN_Agent:
    def __init__(self, env, state_dim, action_dim,learning_rate=learning_rate, epsilon=epsilon, gamma = gamma):
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.epsilon = epsilon
        self.env = env
        self.gamma = gamma
        self.model = self._init_model(state_dim,action_dim,learning_rate)
        self.replay_memory_x = []
        self.replay_memory_y = []
    
    def _init_model(self, state_dim, action_dim, learning_rate):
        model = Sequential()
        model.add(Dense(32, input_dim=state_dim, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(action_dim, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=learning_rate))
        return model
    
    def norm_state(self, state):
        norm_state = state
        norm_state = np.clip(norm_state,low_clip,high_clip)
        norm_state /= high_clip
        return norm_state
        
    def action(self, state):
        # TODO implement here 
        if np.random.uniform(0,1) < self.epsilon:
            action = np.random.choice(self.action_dim)            
        else:
            action = np.argmax(self.model.predict(np.array([state])))          
        return action


    def remember(self, state, action, reward, next_state, done):
        # TODO implement here
        self.replay_memory_x.append(state)
        label = self.model.predict(np.array([state]))[0]
        if done:
            label[action] = reward
        else:
            label[action] = (reward + self.gamma*np.amax(self.model.predict(np.array([next_state]))))
        self.replay_memory_y.append(label)
            
    def learn_from_replay(self, batch_size,epochs = 6):
        # TODO implement here
        index = np.random.choice(len(self.replay_memory_x),batch_size)
        X,Y = np.array(self.replay_memory_x)[index],np.array(self.replay_memory_y)[index]
        self.model.fit(np.array(X),np.array(Y),epochs = epochs,verbose=0 )


def DQN(env, agent, batch_size = 500, replay_batch_size=128, rollouts=5000,max_memory = 1000):
    # TODO implement here
    history = [0]
    ct = 0
    for rollout in range(1,rollouts):
        state = env.reset()
        state = agent.norm_state(state)
        action = agent.action(state)
        done = False
        episode_reward = 0
        while not done:
            state1, reward, done, info = env.step(action)
            state1 = agent.norm_state(state1)
            action1 = agent.action(state1) 
            agent.remember(state,action,reward,state1,done)
            if ct%20 == 0 and len(agent.replay_memory_x) > batch_size:                
                agent.learn_from_replay(replay_batch_size)
            ct += 1
                        
            state,action = state1,action1
            episode_reward += reward
            
            if len(agent.replay_memory_y) > max_memory:
                agent.replay_memory_x = agent.replay_memory_x[-max_memory:]
                agent.replay_memory_y = agent.replay_memory_y[-max_memory:]
            
        history.append(episode_reward)
        if rollout % 10 == 0:
            print(rollout, episode_reward)
        
        #if max(history) == episode_reward:
        #    agent.model.save('./checkpoint/dqn.h5')
        

            
        
        
    return agent,history


In [10]:
env = gym.make("CartPole-v1")
agent = DQN_Agent(env,4, 2) # initialise agent
dqn_agent_trained,history = DQN(env, agent,batch_size=750,replay_batch_size=128)

10 10.0
20 10.0
30 10.0
40 11.0
50 9.0
60 9.0
70 9.0
80 10.0
90 10.0
100 10.0
110 9.0
120 10.0
130 11.0
140 10.0
150 11.0
160 10.0
170 9.0
180 10.0
190 10.0
200 9.0
210 9.0
220 10.0
230 10.0
240 8.0
250 10.0
260 10.0
270 10.0
280 14.0
290 10.0
300 11.0
310 10.0
320 10.0
330 11.0
340 9.0
350 8.0
360 9.0
370 11.0
380 9.0
390 10.0
400 9.0
410 9.0
420 10.0
430 9.0
440 10.0
450 10.0
460 10.0
470 10.0
480 9.0
490 12.0
500 10.0
510 8.0
520 10.0
530 9.0
540 9.0
550 10.0
560 10.0
570 9.0
580 9.0
590 10.0
600 10.0
610 10.0
620 10.0
630 11.0
640 9.0
650 9.0
660 10.0
670 10.0
680 9.0
690 9.0
700 9.0
710 8.0
720 9.0
730 9.0
740 10.0
750 11.0
760 9.0
770 10.0
780 11.0
790 10.0
800 10.0
810 9.0
820 9.0
830 9.0
840 9.0
850 11.0
860 9.0
870 9.0
880 9.0
890 8.0
900 9.0
910 9.0
920 10.0
930 9.0
940 9.0
950 10.0
960 8.0
970 9.0
980 10.0
990 8.0
1000 10.0
1010 10.0
1020 9.0
1030 10.0
1040 11.0
1050 11.0
1060 10.0
1070 9.0
1080 9.0
1090 9.0
1100 9.0
1110 10.0
1120 9.0
1130 9.0
1140 12.0
1150 10.0
1160 8.0
1

In [12]:
cartpole = gym.make("CartPole-v1")
dqn_agent_trained.epsilon = 0
#dqn_agent_trained.model = load_model('./checkpoint/dqn.h5')
avg_return_dqn_cartpole = compute_avg_return(cartpole,dqn_agent_trained,rollout_cart_avg,num_episodes=500)
print("Average return for a dqn strategy in cartpole:", avg_return_dqn_cartpole)

NameError: name 'rollout_cart_avg' is not defined

### 3.c) Another one
Browse the [environments](https://www.gymlibrary.ml/) to pick another challenge! Maybe even record a video with the [RecordVideo wrapper](https://github.com/openai/gym/blob/master/gym/wrappers/record_video.py)!

In [None]:
import gym
import matplotlib.pyplot as plt
import tensorflow as tf
import cv2

In [None]:
#np.save('eigeninviders.npy',eigeninviders)

In [65]:
memory_size = 2000
epsilon = 0.3
learning_rate = 0.001
gamma=0.99
frames = 4

#pca.fit(np.reshape(X,(len(X),210*160))/255)
#eigeninviders = pca.components_
eigeninviders = np.load('eigeninviders.npy') #Using PCA befor the NN

class DQN_Agent2:
    def __init__(self, env, action_dim,learning_rate=learning_rate, epsilon=epsilon, gamma = gamma,frames = 3):
        self.action_dim = action_dim
        self.epsilon = epsilon
        self.max_epsilon = epsilon
        self.env = env
        self.gamma = gamma
        self.frames = frames
        self.model = self._init_model(action_dim,learning_rate)
        self.replay_memory_x = []
        self.replay_memory_y = []
        
    
    def _init_model(self, action_dim, learning_rate):
        model = Sequential()
        #model.add(tf.keras.layers.Conv2D(32, input_shape=(210,160,self.frames), kernel_size = (5,5), activation='relu'))
        model.add(tf.keras.layers.Conv1D(32, input_shape=(200,self.frames), kernel_size = (5), activation='relu'))
        model.add(tf.keras.layers.MaxPool1D((3)))
        model.add(tf.keras.layers.Conv1D(32,kernel_size = (5), activation='relu'))
        model.add(tf.keras.layers.MaxPool1D((3)))
        model.add(tf.keras.layers.Flatten())
        model.add(Dense(self.action_dim**4, activation='relu'))
        model.add(Dense(self.action_dim**3, activation='relu'))
        model.add(Dense(self.action_dim, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=learning_rate))
        return model
        
        
    def action(self, state):
        # TODO implement here 
        if np.random.uniform(0,1) < self.epsilon:
            action = np.random.choice(self.action_dim)            
        else:
            #action = np.argmax(self.model(np.array([np.transpose(state,axes = [1,2,0])]), training=False).numpy())          
            flat = np.reshape(state,(self.frames,210*160))
            projected = (eigeninviders@flat.T).T
            action = np.argmax(self.model.predict(np.array([np.transpose(projected,axes = [1,0])])))          
            
        return action


    def remember(self, state, action, reward, next_state, done):
        # TODO implement here
        X = np.transpose(state,axes = [1,2,0])/255
        #nX = [np.transpose(next_state,axes = [1,2,0])]
        flat = np.reshape(next_state,(self.frames,210*160))
        projected = (eigeninviders@flat.T)
        nX = [np.copy(projected)]
        
        flat = np.reshape(state,(self.frames,210*160))
        projected = (eigeninviders@flat.T)
            
            
        self.replay_memory_x.append(np.copy(projected))
        if done:
            self.replay_memory_y.append(reward)
        else:
            self.replay_memory_y.append(reward + self.gamma*np.amax(self.model.predict(np.array(nX))))

            
    def learn_from_replay(self, batch_size,epochs = 6):
        # TODO implement here
        index = np.random.choice(len(self.replay_memory_x),batch_size)
        X,Y = np.array(self.replay_memory_x)[index],np.array(self.replay_memory_y)[index]

        self.model.fit(np.array(X),np.array(Y),epochs = epochs,verbose=0 )


def DQN(env, agent, batch_size = 500, replay_batch_size=128, rollouts=300,max_memory = 2000):
    # TODO implement here
    history = [0]
    for rollout in tqdm(range(1,rollouts)):
        state = env.reset()
        state = cv2.cvtColor(state, cv2.COLOR_BGR2GRAY)
        state = [state]
        for it in range(agent.frames-1):
            s, reward, done, info = env.step(0)
            s = cv2.cvtColor(s, cv2.COLOR_BGR2GRAY)
            state.append(s)
        action = agent.action(state)
        done = False
        episode_reward = 0
        while not done:
            state2, reward, done, info = env.step(action)
            state2 = cv2.cvtColor(state2, cv2.COLOR_BGR2GRAY)
            state1 = state[1:]
            state1.append(state2)
            action1 = agent.action(state1) 
            agent.remember(state,action,reward,state1,done)
            if rollout%replay_batch_size == 0 and len(agent.replay_memory_x) > batch_size:                
                agent.learn_from_replay(replay_batch_size)
                agent.epsilon = agent.max_epsilon/(np.log(history[-1] + 1)+1)

            state,action = state1,action1
            episode_reward += reward

        history.append(episode_reward)

        if max(history) == episode_reward:
            agent.model.save('.checkpoint/space_invaders.h5')


        if len(agent.replay_memory_y) > max_memory:
            agent.replay_memory_x = agent.replay_memory_x[-max_memory:]
            agent.replay_memory_y = agent.replay_memory_y[-max_memory:]
            
        
        
    return agent,history


In [66]:
env = gym.make("SpaceInvaders-v4")
agent = DQN_Agent2(env, env.action_space.n) # initialise agent
dqn_agent_trained,history = DQN(env, agent,batch_size=750,replay_batch_size=128)

A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]


Error: We're Unable to find the game "SpaceInvaders". Note: Gym no longer distributes ROMs. If you own a license to use the necessary ROMs for research purposes you can download them via `pip install gym[accept-rom-license]`. Otherwise, you should try importing "SpaceInvaders" via the command `ale-import-roms`. If you believe this is a mistake perhaps your copy of "SpaceInvaders" is unsupported. To check if this is the case try providing the environment variable `PYTHONWARNINGS=default::ImportWarning:ale_py.roms`. For more information see: https://github.com/mgbellemare/Arcade-Learning-Environment#rom-management

In [None]:
env = gym.make("SpaceInvaders-v4")
dqn_agent_trained.epsilon = 0
dqn_agent_trained.model = load_model('.checkpoint/space_invaders.h5')
rewards = []
for it in tqdm(range(10)):
    r = 0
    state = env.reset()
    state = cv2.cvtColor(state, cv2.COLOR_BGR2GRAY)
    state = [state]
    for it in range(agent.frames-1):
        s, reward, done, info = env.step(0)
        s = cv2.cvtColor(s, cv2.COLOR_BGR2GRAY)
        state.append(s)
    action = agent.action(state)
    done = False
    episode_reward = 0
    while not done:
        state2, reward, done, info = env.step(action)
        state2 = cv2.cvtColor(state2, cv2.COLOR_BGR2GRAY)
        state1 = state[1:]
        state1.append(state2)
        action1 = agent.action(state1) 
        r += reward
        state,action = state1,action1
        
    rewards.append(r)
    
print("Average return for a dqn strategy in space inviders:", np.mean(rewards))

In [None]:
video_output = 'space_inviders_pca.avi'
out = cv2.VideoWriter(video_output,cv2.VideoWriter_fourcc('M','J','P','G'),20, (160,210))
env = gym.make("SpaceInvaders-v4")
agent = dqn_agent_trained
agent.model = load_model('.checkpoint/space_invaders.h5')


state = env.reset()
state = cv2.cvtColor(state, cv2.COLOR_BGR2GRAY)
state = [state]
for it in range(agent.frames-1):
    s, reward, done, info = env.step(0)
    s = cv2.cvtColor(s, cv2.COLOR_BGR2GRAY)
    state.append(s)
    
action = agent.action(state)
done = False

while not done:
    state2, reward, done, info = env.step(action)
    out.write(state2)

    state2 = cv2.cvtColor(state2, cv2.COLOR_BGR2GRAY)
    state1 = state[1:]
    state1.append(state2)
    action1 = agent.action(state1) 
    
    state,action = state1,action1
    
out.release()