In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Sanity check: play one CartPole episode with uniformly random actions and
# report the total reward collected.
env = gym.make('CartPole-v1', render_mode="human")

rewards = 0
done = False   # episode terminated (failure state reached)
trunc = False  # episode truncated (step limit reached)

try:
    state, info = env.reset()
    while not done and not trunc:
        action = env.action_space.sample()
        new_state, reward, done, trunc, info = env.step(action)
        # Some wrappers hand the reward back as a 1-element array; unwrap it.
        if isinstance(reward, np.ndarray): reward = reward[0]
        rewards += reward
        # No explicit env.render(): with render_mode="human" the environment
        # already draws a frame on every reset()/step().
finally:
    # Always release the render window, even if the rollout raised.
    env.close()
print(rewards)

13.0


In [3]:
import os

# Select the GPU *before* TensorFlow is imported: CUDA reads
# CUDA_VISIBLE_DEVICES when it initialises, so changing it after the
# device list has been queried has no effect on this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import tensorflow as tf
# Show which GPUs TensorFlow can see (empty list means CPU only).
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
import os
# Restrict CUDA to device index 0 for this process.
# NOTE(review): TensorFlow is imported and queried for GPUs in an earlier cell.
# CUDA_VISIBLE_DEVICES is normally read when CUDA initialises, so this
# assignment presumably only takes effect if it runs before that point —
# confirm the cell ordering.
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Use GPU 0

In [5]:
# Neural network 
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Training environment (no rendering); its spaces determine the network shape.
env = gym.make('CartPole-v1')
input_dim = env.observation_space.shape[0]
n_actions = env.action_space.n
print(input_dim, n_actions)

# Q-network: observation vector in, one linear output per action.
model = Sequential()
model.add(Dense(64, input_dim=input_dim, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(n_actions, activation='linear'))
model.compile(loss='mse', optimizer=Adam())
model.summary()

4 2
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                320       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 2)                 66        
                                                                 
Total params: 2,466
Trainable params: 2,466
Non-trainable params: 0
_________________________________________________________________


In [6]:
def replay(replay_memory, model, gamma, minibatch_size=32):
    """Run one Q-learning update on a random minibatch of stored experiences.

    Parameters
    ----------
    replay_memory : sequence of dict
        Experiences with keys ``'s'`` (state), ``'a'`` (integer action index),
        ``'r'`` (reward), ``'sprime'`` (next state) and ``'done'`` (bool).
    model : object
        Q-network exposing ``predict(x, verbose=0)`` and
        ``fit(x, y, epochs=1, verbose=0)``.
    gamma : float
        Discount factor applied to the bootstrapped next-state value.
    minibatch_size : int, default 32
        Number of experiences drawn (uniformly, with replacement).

    Returns
    -------
    model
        The same model object, after the fit step. If ``replay_memory`` is
        empty there is nothing to learn from and the model is returned untouched.
    """
    if len(replay_memory) == 0:
        return model

    # Sample *indices* rather than passing the buffer to np.random.choice:
    # choice() would first convert the whole buffer into an object array on
    # every call, which is O(len(replay_memory)) per training step.
    picks = np.random.randint(len(replay_memory), size=minibatch_size)
    minibatch = [replay_memory[i] for i in picks]

    # Split the sampled <s, a, r, s', done> records into per-field arrays.
    s_l = np.array([exp['s'] for exp in minibatch])
    a_l = np.array([exp['a'] for exp in minibatch])
    r_l = np.array([exp['r'] for exp in minibatch])
    sprime_l = np.array([exp['sprime'] for exp in minibatch])
    done_l = np.array([exp['done'] for exp in minibatch], dtype=bool)

    # q(s', a') for every action a'; its max is the bootstrap value.
    qvals_sprime_l = model.predict(sprime_l, verbose=0)
    # q(s, a) for every action a; used as the regression target for the
    # actions we did NOT take, so their loss contribution is zero.
    target_f = model.predict(s_l, verbose=0)

    # Terminal transitions use the raw reward; others add the discounted
    # best next-state value.
    bootstrap = gamma * np.max(qvals_sprime_l, axis=1)
    targets = np.where(done_l, r_l, r_l + bootstrap)
    # Overwrite only the entry of the action actually taken in each row.
    target_f[np.arange(len(minibatch)), a_l] = targets

    model.fit(s_l, target_f, epochs=1, verbose=0)
    return model

In [7]:
# --- Hyperparameters -------------------------------------------------------
EPISODES = 150
gamma = 0.99           # discount factor for future rewards
epsilon = 1            # probability of taking a random action (starts fully random)
epsilon_min = 0.01     # exploration never drops below this
decay_rate = 0.01      # exponential decay rate of epsilon per episode
minibatch_size = 32    # experiences sampled per replay() update
mem_max_size = 100000  # maximum number of experiences kept in replay memory

reward_list = []    # total reward of each episode
replay_memory = []  # stored experiences: s, a, r, s', done

# --- Training: epsilon-greedy DQN with one replay update per env step ------
for episode in range(EPISODES):
    rewards = 0
    done = False
    trunc = False
    state, info = env.reset()

    while not done and not trunc:
        # Epsilon-greedy action selection.
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            qvals_s = model.predict(state.reshape(1,-1), verbose=0)
            action = np.argmax(qvals_s)

        new_state, reward, done, trunc, info = env.step(action)
        # Some wrappers hand the reward back as a 1-element array; unwrap it.
        if isinstance(reward, np.ndarray): reward = reward[0]
        rewards += reward

        # Evict the oldest experience *before* appending so the buffer never
        # holds more than mem_max_size entries.
        if len(replay_memory) >= mem_max_size: replay_memory.pop(0)
        # Only `done` (termination) is stored: a time-limit truncation is not
        # a terminal state, so replay() still bootstraps from s' in that case.
        replay_memory.append({"s":state,
                              "a":action,
                              "r":reward,
                              "sprime":new_state,
                              "done":done})
        model = replay(replay_memory, model = model, gamma = gamma, minibatch_size = minibatch_size)
        state = new_state

    print("Reward episode",episode,":",rewards)
    reward_list.append(rewards)
    # Exponentially decay exploration, clamped at the floor.
    epsilon = max(epsilon_min, np.exp(-decay_rate*episode))

print("Complete Training!")

Reward episode 0 : 12.0
Reward episode 1 : 15.0
Reward episode 2 : 36.0
Reward episode 3 : 42.0


In [None]:
# Play one rendered episode with the trained model and print its total reward.
# NOTE(review): this run still explores with the leftover `epsilon` from
# training and keeps updating `model` via replay(); it is therefore not a pure
# greedy evaluation — confirm whether that is intended.
env = gym.make('CartPole-v1', render_mode="human")
rewards = 0
done = False
trunc = False

try:
    # Fixed seed so the rendered episode starts from the same initial state.
    state, info = env.reset(seed=1)

    while not done and not trunc:
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            qvals_s = model.predict(state.reshape(1,-1), verbose=0)
            action = np.argmax(qvals_s)

        new_state, reward, done, trunc, info = env.step(action)
        # Some wrappers hand the reward back as a 1-element array; unwrap it.
        if isinstance(reward, np.ndarray): reward = reward[0]
        rewards += reward

        model = replay(replay_memory, model = model, gamma = gamma, minibatch_size = minibatch_size)
        state = new_state
finally:
    # Always release the render window, even if the episode raised.
    env.close()
print(rewards)

500.0
