In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import imageio
import tqdm
from collections import deque

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.distributions import Categorical

import matplotlib.pyplot as plt
import numpy as np

In [2]:
# --- training schedule -------------------------------------------------
NUM_EPISODES = 10_000   # number of episodes the training loop iterates over
MAX_STEPS = 200         # cap on environment steps taken per episode
RECENT_EPS = 20         # window of the running score average / print cadence
SOLVED_SCORE = 0        # running average at which training stops early

# --- learning hyper-parameters -----------------------------------------
GAMMA = 0.99            # discount factor
HIDDEN_UNITS = 256      # width of the hidden layer in both networks
LR_ACTOR = 5e-4         # Adam learning rate for the actor
LR_CRITIC = 5e-4        # Adam learning rate for the critic

# --- compute device ----------------------------------------------------
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(DEVICE)

cpu


In [3]:
class Actor(nn.Module):
    """Policy network: maps an observation to a probability for each action.

    Architecture: Linear(observation_space -> hidden_units) -> ReLU ->
    Linear(hidden_units -> action_space) -> softmax.
    """

    def __init__(self, observation_space, hidden_units, action_space):
        """
        Args:
            observation_space: number of input features per observation.
            hidden_units: width of the single hidden layer.
            action_space: number of discrete actions (size of the output).
        """
        super(Actor, self).__init__()
        self.input_layer = nn.Linear(observation_space, hidden_units)
        self.output_layer = nn.Linear(hidden_units, action_space)

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: float tensor whose last dimension has ``observation_space``
                features; may be batched ``(batch, features)`` or a single
                unbatched ``(features,)`` observation.

        Returns:
            Tensor shaped like ``x`` with the last dimension replaced by
            ``action_space``; values along that dimension sum to 1.
        """
        x = self.input_layer(x)
        x = F.relu(x)
        actions = self.output_layer(x)
        # Normalise over the action dimension (always the last one) so that
        # unbatched inputs work as well as batched ones.
        action_probs = F.softmax(actions, dim=-1)
        return action_probs

In [4]:
class Critic(nn.Module):
    """State-value network: one hidden ReLU layer followed by a scalar head."""

    def __init__(self, observation_space, hidden_units):
        """Build the two linear layers.

        Args:
            observation_space: number of input features per observation.
            hidden_units: width of the hidden layer.
        """
        super().__init__()
        self.input_layer = nn.Linear(in_features=observation_space, out_features=hidden_units)
        self.output_layer = nn.Linear(in_features=hidden_units, out_features=1)

    def forward(self, x):
        """Return the estimated value of ``x`` with a trailing dimension of size 1."""
        hidden = F.relu(self.input_layer(x))
        return self.output_layer(hidden)

In [5]:
# Create the Pong environment with RAM observations.
env = gym.make("Pong-ramDeterministic-v4", obs_type="ram")

# Network sizes: input/output widths come from the environment's spaces,
# the hidden width from the config cell.
hidden_units = HIDDEN_UNITS
action_space = env.action_space.n
observation_space = env.observation_space.shape[0]

# Policy (actor) and state-value (critic) networks, placed on DEVICE.
# The actor is built first, as before, so parameter initialisation draws
# from the RNG in the same order.
actor = Actor(observation_space, hidden_units, action_space)
actor = actor.to(DEVICE)
critic = Critic(observation_space, hidden_units)
critic = critic.to(DEVICE)

# One Adam optimizer per network, each with its own learning rate.
actor_optimizer = optim.Adam(actor.parameters(), lr=LR_ACTOR)
critic_optimizer = optim.Adam(critic.parameters(), lr=LR_CRITIC)

In [6]:
def select_action(actor, state):
    ''' Selects an action given current state using the actor network.

    Args:
        actor: policy network. It is called on a float tensor of shape
            (1, n_features) and must return the action probabilities for
            that single state.
        state: one observation, as a NumPy array (or anything else
            ``torch.as_tensor`` accepts).

    Returns:
        Tuple ``(action, log_prob)``: the sampled action as a Python int and
        the log-probability of that action as a tensor that is still attached
        to the actor's autograd graph, so it can be used in a policy loss.
    '''

    #run on the device the actor lives on; fall back to the global DEVICE
    #only when the actor exposes no parameters to read a device from
    first_param = next(iter(actor.parameters()), None) if hasattr(actor, "parameters") else None
    device = first_param.device if first_param is not None else DEVICE

    #convert state to float tensor on that device, add a batch dimension
    state = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

    #use network to predict action probabilities
    action_probs = actor(state)

    #sample an action using the probability distribution
    m = Categorical(action_probs)
    action = m.sample()

    #return the action index together with its log probability
    return action.item(), m.log_prob(action)

In [7]:
#track the score of every episode
scores = []

#track the most recent scores for the running average
recent_scores = deque(maxlen = RECENT_EPS)

#run episodes
for episode in tqdm.trange(NUM_EPISODES):
    
    #init variables
    state, _ = env.reset()
    done = False
    score = 0
    #I holds GAMMA**step and scales both losses for the current step
    I = 1
    
    #run episode, update online after every single transition
    #NOTE(review): the recorded run shows the average score stuck at -4.00;
    #the MAX_STEPS cap may be ending episodes long before the game itself
    #terminates -- confirm and consider raising it.
    for step in range(MAX_STEPS):
        
        #get action and log probability
        action, lp = select_action(actor, state)
        
        #step with action; Gymnasium reports natural termination and
        #time-limit truncation as two separate flags
        new_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        
        #update episode score
        score += reward
        
        #get state value of current state (this is what the critic learns)
        state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)
        state_val = critic(state_tensor)
        
        #build the one-step TD target as a constant: no gradient may flow
        #through the bootstrapped next-state value
        with torch.no_grad():
            if terminated:
                #a terminal state has value 0; truncation still bootstraps
                new_state_val = torch.zeros_like(state_val)
            else:
                new_state_tensor = torch.from_numpy(new_state).float().unsqueeze(0).to(DEVICE)
                new_state_val = critic(new_state_tensor)
            td_target = float(reward) + GAMMA * new_state_val
        
        #calculate value function loss with smooth L1 (Huber): prediction
        #first, fixed target second
        val_loss = F.smooth_l1_loss(state_val, td_target)
        val_loss *= I
        
        #calculate policy loss; the advantage is a plain float, so the
        #actor's graph never touches the critic
        advantage = (td_target - state_val).item()
        actor_loss = -lp * advantage
        actor_loss *= I
        
        #Backpropagate actor (its graph is independent of the critic's, so
        #nothing needs to be retained)
        actor_optimizer.zero_grad()
        actor_loss.backward()
        actor_optimizer.step()
        
        #Backpropagate critic
        critic_optimizer.zero_grad()
        val_loss.backward()
        critic_optimizer.step()
        
        #stop on termination or truncation; the env must be reset first
        if done:
            break
            
        #move into new state, discount I
        state = new_state
        I *= GAMMA
    
    #append episode score 
    scores.append(score)
    recent_scores.append(score)

    #print episode stats
    if episode % RECENT_EPS == 0:
        print('Episode {}\tAverage Score: {:.2f}'.format(episode, np.array(recent_scores).mean()))
    
    #early stopping if we meet solved score goal
    if np.array(recent_scores).mean() >= SOLVED_SCORE:
        break

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 1/10000 [00:01<3:14:11,  1.17s/it]

Episode 0	Average Score: -4.00


  0%|          | 21/10000 [00:37<5:14:55,  1.89s/it]

Episode 20	Average Score: -4.00


  0%|          | 41/10000 [01:18<5:24:38,  1.96s/it]

Episode 40	Average Score: -4.00


  1%|          | 61/10000 [01:58<5:47:50,  2.10s/it]

Episode 60	Average Score: -4.00


  1%|          | 81/10000 [02:34<5:33:55,  2.02s/it]

Episode 80	Average Score: -4.00


  1%|          | 101/10000 [03:16<5:55:58,  2.16s/it]

Episode 100	Average Score: -4.00


  1%|          | 121/10000 [03:58<5:46:02,  2.10s/it]

Episode 120	Average Score: -4.00


  1%|▏         | 141/10000 [04:26<3:46:10,  1.38s/it]

Episode 140	Average Score: -4.00


  2%|▏         | 161/10000 [04:53<3:25:25,  1.25s/it]

Episode 160	Average Score: -4.00


  2%|▏         | 181/10000 [05:35<5:36:29,  2.06s/it]

Episode 180	Average Score: -4.00


  2%|▏         | 201/10000 [06:03<3:09:53,  1.16s/it]

Episode 200	Average Score: -4.00


  2%|▏         | 221/10000 [06:31<4:19:02,  1.59s/it]

Episode 220	Average Score: -4.00


  2%|▏         | 241/10000 [06:55<3:21:08,  1.24s/it]

Episode 240	Average Score: -4.00


  3%|▎         | 261/10000 [07:20<3:33:40,  1.32s/it]

Episode 260	Average Score: -4.00


  3%|▎         | 281/10000 [07:50<5:12:02,  1.93s/it]

Episode 280	Average Score: -4.00


  3%|▎         | 301/10000 [08:19<3:14:28,  1.20s/it]

Episode 300	Average Score: -4.00


  3%|▎         | 321/10000 [09:03<5:36:42,  2.09s/it]

Episode 320	Average Score: -4.00


  3%|▎         | 341/10000 [09:29<3:14:51,  1.21s/it]

Episode 340	Average Score: -4.00


  4%|▎         | 361/10000 [09:56<3:22:10,  1.26s/it]

Episode 360	Average Score: -4.00


  4%|▍         | 381/10000 [10:28<3:20:17,  1.25s/it]

Episode 380	Average Score: -4.00


  4%|▍         | 401/10000 [10:53<3:29:56,  1.31s/it]

Episode 400	Average Score: -4.00


  4%|▍         | 421/10000 [11:26<5:36:28,  2.11s/it]

Episode 420	Average Score: -4.00


  4%|▍         | 441/10000 [12:12<7:44:56,  2.92s/it]

Episode 440	Average Score: -4.00


  5%|▍         | 461/10000 [13:03<4:10:42,  1.58s/it] 

Episode 460	Average Score: -4.00


  5%|▍         | 481/10000 [13:32<3:46:19,  1.43s/it]

Episode 480	Average Score: -4.00


  5%|▌         | 501/10000 [13:59<3:23:09,  1.28s/it]

Episode 500	Average Score: -4.00


  5%|▌         | 521/10000 [14:28<4:18:19,  1.64s/it]

Episode 520	Average Score: -4.00


  5%|▌         | 541/10000 [14:56<3:46:44,  1.44s/it]

Episode 540	Average Score: -4.00


  6%|▌         | 561/10000 [15:25<3:54:49,  1.49s/it]

Episode 560	Average Score: -4.00


  6%|▌         | 581/10000 [15:55<3:51:31,  1.47s/it]

Episode 580	Average Score: -4.00


  6%|▌         | 601/10000 [16:24<3:40:14,  1.41s/it]

Episode 600	Average Score: -4.00


  6%|▌         | 621/10000 [16:57<4:17:34,  1.65s/it]

Episode 620	Average Score: -4.00


  6%|▋         | 641/10000 [17:26<3:28:30,  1.34s/it]

Episode 640	Average Score: -4.00


  7%|▋         | 661/10000 [17:54<3:51:32,  1.49s/it]

Episode 660	Average Score: -4.00


  7%|▋         | 679/10000 [18:32<4:14:33,  1.64s/it]


KeyboardInterrupt: 