In [2]:
import gym
from collections import deque
import random
import numpy as np
import copy

In [97]:
# create environment and preview observation and action space
env = gym.make('CartPole-v1')
obs_space = env.observation_space
act_space = env.action_space
print(env)
print(f"Action Space: {act_space}\nObservation Space: {obs_space}\n[cart position, cart velocity, pole angle, pole angular velocity]")

<TimeLimit<OrderEnforcing<PassiveEnvChecker<CartPoleEnv<CartPole-v1>>>>>
Action Space: Discrete(2)
Observation Space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
[cart position, cart velocity, pole angle, pole angular velocity]


In [99]:
# test random step
obs = env.reset()
print(f"Initial observation: {obs}")
action = env.action_space.sample()
# print(env.step(action))
new_obs, reward, done, _, _ = env.step(action)
print(f"New observation: {new_obs}\nReward: {reward}\nTerminated: {done}")

Initial observation: (array([ 0.00796497,  0.0228937 , -0.03278474, -0.00565631], dtype=float32), {})
New observation: [ 0.00842284  0.21847013 -0.03289786 -0.3085003 ]
Reward: 1.0
Terminated: False


In [100]:
# function to build replay buffer
def experience_buffer(action):
    currentState = env.reset()
    obs,reward,done,_,_ = env.step(action)
    experience = currentState,action,reward,obs.tolist(),done
    return experience

In [101]:
# test replay buffer
buff_max = 100000
replay = deque(maxlen=buff_max)

for i in range(buff_max):
    a = env.action_space.sample()
    e = experience_buffer(a)
    replay.append(e)

In [102]:
# verify replay buffer by previewing 9999th items
print(f"state: {replay[9999][0]}")
print(f"\naction: {replay[9999][1]}")
print(f"\nreward: {replay[9999][2]}")
print(f"\nnew state: {replay[9999][3]}")
print(f"\nisterminal: {replay[9999][4]}")

state: (array([-0.01835129, -0.04710228,  0.03518173, -0.02295367], dtype=float32), {})

action: 0

reward: 1.0

new state: [-0.01929333247244358, -0.2427106499671936, 0.03472265601158142, 0.2806186079978943]

isterminal: False


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
import torchvision.transforms as transforms
from torchsummary import summary
from torch.optim import Adam,SGD
import warnings
warnings.filterwarnings('ignore')
torch.manual_seed(42)

<torch._C.Generator at 0x7fe2c68200b0>

In [6]:
# create DQN
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(4,16)
        self.fc2 = nn.Linear(16,16)
        self.fc3 = nn.Linear(16,2)
    
    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [148]:
# DQNet and target net
DQNet = Net()
TNet = copy.deepcopy(DQNet)
TNet.eval()
summary(DQNet,(1,1,4))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1             [-1, 1, 1, 16]              80
            Linear-2             [-1, 1, 1, 16]             272
            Linear-3              [-1, 1, 1, 2]              34
Total params: 386
Trainable params: 386
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [149]:
# test tensor to pass through networks
test = torch.tensor([0.5, 0.5, 0.5, 0.5])
test

tensor([0.5000, 0.5000, 0.5000, 0.5000])

In [150]:
# verify output from networks
print(DQNet(test))
print(TNet(test))

tensor([-0.0659,  0.0028], grad_fn=<AddBackward0>)
tensor([-0.0659,  0.0028], grad_fn=<AddBackward0>)


In [151]:
# Initialize parameters for network

lr = 0.001
optimizer = Adam(params=DQNet.parameters(), lr=lr)
eps_start = 1
eps_end = 0.0001
eps_decay = 0.9999
loss_fn = nn.MSELoss()
episodes = 1500
mini_batch_size = 128

loss_value = 0
loss_history = []
update_freq = 25 # Updating target network
gamma = 0.93
scores = []
rewards = 0

In [152]:
# Training function

def training(Qnet,t_net,replay_memory,optimizer,loss_fn,mini_batch_size=32):
    
    # Sample random observations
    observations = random.choices(replay_memory,k=mini_batch_size)
        
    Qnet.train()
    
    for epochs in range(1): 
        
        # Loop over sampled observations
        for observation in observations:
            state = torch.from_numpy(np.array(observation[0])).float()

            # Predict Q-value at time t and next action
            q_values = Qnet(state)
            #expected_value = q_values.detach().numpy()[int(observation[2])]
            expected_value = q_values[int(observation[1])]
            
            
            #expected_value = torch.tensor(expected_value,requires_grad=True)

            done = observation[4]
            
            # Determine Q-value at time t+1
            next_state = torch.from_numpy(np.array(observation[3])).float() 
            next_q_values = t_net(next_state)          
            next_action = torch.argmax(next_q_values)
            next_q_value = next_q_values[next_action]       

            # Bellman's eqution for current state
            if done:
                target_value = torch.tensor(observation[2])   # If episode is done the target value is the reward
                
            else:
                target_value = observation[2] + (gamma * next_q_value)   # Add direct reward to obtain target value
                
            # Compute loss value
            loss = loss_fn(expected_value, target_value)
            loss_value = loss.item()

            # Back prop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            

    return Qnet,loss_value

In [153]:
# Q explore / exploit loop

eps = eps_start
replay = []

for i in range(episodes):   
    s = env.reset()[0]
    # print(s)
    s = torch.from_numpy(s)
    done=False
    rewards = 0
    t_step = 0
    
    while not done: 
            DQNet.eval()
            
            # Predict Q-value at time t
            q_values = DQNet(s)
            
            # Take action based on DQNet prediction or random
            if np.random.random() < eps:  
                a = env.action_space.sample()
                new_state,reward,done,_,_ = env.step(a)
                
            else:
                a = torch.argmax(q_values)
                new_state,reward,done,_,_ = env.step(a.item())
                
            # Gather new experience and append to repaly buffer   
            new_experience = s.tolist(),a,reward,new_state.tolist(),done 
            replay.append(new_experience)
            
            # Limit repaly buffer to 100000
            if len(replay) >100000:
                replay.pop(0)
                
            # Accumulate rewards
            rewards += reward
             
            s = torch.from_numpy(new_state) # Next state becomes current state             
            
            # Swap weights from DQNet
            if i % update_freq == 0:
                targetNet = copy.deepcopy(DQNet)
                targetNet.eval()
                
            # Don't let the episode run more than 500 time steps
            t_step += 1
            if t_step > 500:
                #print('here')
                break
            
            # Train the DQNet that approximates q(s,a), using the replay memory
            if len(replay) > 10000:
                DQNet,loss = training(DQNet,targetNet,replay,optimizer,loss_fn,mini_batch_size)
                loss_history.append(loss)
                # decrease the epsilon
                eps = max(eps*eps_decay,eps_end)
                
    scores.append(rewards)
    if rewards >= 475 and i > 1000:
        torch.save(DQNet,'cartpoleDQN.pth')
        torch.save(DQNet.state_dict(), 'cartpoleDQN_parameters.pth')
        print(i)
        
    print(f"Episode{i}/{episodes} Rewards:{rewards} Buffer Length:{len(replay)} and eps:{eps}")
    #print(rewards,end = " ")

Episode0/1500 Rewards:27.0 Buffer Length:27 and eps:1
Episode1/1500 Rewards:46.0 Buffer Length:73 and eps:1
Episode2/1500 Rewards:31.0 Buffer Length:104 and eps:1
Episode3/1500 Rewards:35.0 Buffer Length:139 and eps:1
Episode4/1500 Rewards:16.0 Buffer Length:155 and eps:1
Episode5/1500 Rewards:18.0 Buffer Length:173 and eps:1
Episode6/1500 Rewards:13.0 Buffer Length:186 and eps:1
Episode7/1500 Rewards:21.0 Buffer Length:207 and eps:1
Episode8/1500 Rewards:15.0 Buffer Length:222 and eps:1
Episode9/1500 Rewards:11.0 Buffer Length:233 and eps:1
Episode10/1500 Rewards:15.0 Buffer Length:248 and eps:1
Episode11/1500 Rewards:21.0 Buffer Length:269 and eps:1
Episode12/1500 Rewards:15.0 Buffer Length:284 and eps:1
Episode13/1500 Rewards:25.0 Buffer Length:309 and eps:1
Episode14/1500 Rewards:16.0 Buffer Length:325 and eps:1
Episode15/1500 Rewards:16.0 Buffer Length:341 and eps:1
Episode16/1500 Rewards:14.0 Buffer Length:355 and eps:1
Episode17/1500 Rewards:18.0 Buffer Length:373 and eps:1
Epis

KeyboardInterrupt: 