## Imports

In [1]:
import torch
from torch import nn
from torchvision import transforms as T
from PIL import Image
import numpy as np
from pathlib import Path
from collections import deque
import random, datetime, os, copy

import matplotlib.pyplot as plt
import matplotlib.animation as animation

import gym
from gym.spaces import Box
from gym.wrappers import FrameStack

from nes_py.wrappers import JoypadSpace

import gym_super_mario_bros

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")

env = JoypadSpace(env,[["right"],["right","A"]])
env.reset()

next_state,reward,done,info = env.step(action=0)
print(f"{next_state.shape}, \n {reward},\n {done},\n {info}")

(240, 256, 3), 
 0,
 False,
 {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'y_pos': 79}


In [4]:
plt.show()

# Wrappers
## To customize env as per agent required input

In [5]:
#SkipFrame is a custom wrapper that inherits from gym.Wrapper and implements 
#the step() function. Because consecutive frames don’t vary much, we can skip 
#n-intermediate frames without losing much information. The n-th frame aggregates 
#rewards accumulated over each skipped frame.

class SkipFrame(gym.Wrapper):
    def __init__(self,env,skip):
        super().__init__(env)
        self._skip = skip # number of frames to be skipped
        
    def step(self,action):
        total_reward = 0.0
        done = False
        for i in range(self._skip):
            obs ,reward, done , info = self.env.step(action)
            total_reward += reward
            if done:
                break
                
        return obs, total_reward,done,info

In [6]:
#useful wrapperclass to conver rgb to grayscale image
class GrayScaleObservation(gym.ObservationWrapper):
    def __init__(self,env):
        super().__init__(env)
        obs_shape = self.observation_space.shape[:2]
        
    def permute_orientation(self,observation):
        observation = np.transpose(observation,(2,0,1))
        observation = torch.tensor(observation.copy(),dtype=torch.float)
        return observation
    
    def observation(self,observation):
        observation = self.permute_orientation(observation)
        transform = T.Grayscale()
        observation = transform(observation)
        return observation        

In [7]:
#ResizeObservation downsamples each observation into a square image. New size: [1, 84, 84]

class ResizeObservation(gym.ObservationWrapper):
    
    def __init__(self,env,shape):
        super().__init__(env)
        if isinstance(shape,int): # if single value provided as shape --> obviously square frame
            self.shape = (shape,shape)
        else:
            self.shape = tuple(shape)
            
        obs_shape = self.shape + self.observation_space.shape[2:]#last channel of observation frame img
        self.observaion_space = Box(low=0,high=255,shape=obs_shape,dtype = np.uint8)
        #...............................................^   
        
    def observation(self,observation):
        transforms = T.Compose([T.Resize(self.shape),T.Normalize(0,255)])
        observation = transforms(observation).squeeze(0) # (1,x,x) -> (x,x)
        return observation

In [8]:
'''
FrameStack is a wrapper that allows us to squash consecutive frames of the 
environment into a single observation point to feed to our learning model. 
This way, we can identify if Mario was landing or jumping based on the 
direction of his movement in the previous several frames.
'''

'\nFrameStack is a wrapper that allows us to squash consecutive frames of the \nenvironment into a single observation point to feed to our learning model. \nThis way, we can identify if Mario was landing or jumping based on the \ndirection of his movement in the previous several frames.\n'

In [9]:
# apply wrapper to the env
env = SkipFrame(env,skip = 4)
env = GrayScaleObservation(env)
env = ResizeObservation(env,shape=84)
env = FrameStack(env,num_stack=4)

In [10]:
'''
After applying the above wrappers to the environment, the final wrapped state 
consists of 4 gray-scaled consecutive frames stacked together, as shown above 
in the image on the left. Each time Mario makes an action, the environment 
responds with a state of this structure. The structure is represented by a 
3-D array of size [4, 84, 84]
'''

'\nAfter applying the above wrappers to the environment, the final wrapped state \nconsists of 4 gray-scaled consecutive frames stacked together, as shown above \nin the image on the left. Each time Mario makes an action, the environment \nresponds with a state of this structure. The structure is represented by a \n3-D array of size [4, 84, 84]\n'

# Agent

### Our Mario should be able to
<li><B><I>Act </I></B></li>
<li><B><I>Remember </I></B></li>
<li><B><I>Learn </I></B></li>

    class Mario:
        def __init__():
            pass
        
        def act(self,state):
            # act on the basis of given state
            pass
        
        def cache(self,experience):
            # record a new experience in memory
            pass
        
        def recall(self):
            # samples experience from memory
            pass
        
        def learn(self):
            # update Q function with a batch of new expeirences
            pass

In [11]:
class Mario:
    
    def __init__(self,state_dim,action_dim,save_dir):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.save_dir = save_dir
        self.use_cuda = torch.cuda.is_available()
        #Mario net -> for learning
        self.net = MarioNet(self.state_dim,self.action_dim).float()
        
        if self.use_cuda:
            self.net = self.net.to(device="cuda")
            
        self.exploration_rate = 1
        self.exploration_rate_decay = 0.99999975
        self.exploration_rate_min = 0.1
        self.curr_step = 0
        
        self.save_every = 10
        
        self.memory = deque(maxlen=100000)
        self.batch_size = 32
        
    def act(self,state):
        # Given a state , choose an epsilon greedy action and update value of step
        
        #inputs:
        #state(LazyFrame):A single observation of the current state, dimensio = (state_dim)
        
        #outputs:
        #action_idx(int):an integer representing which action mario will perform
        
        #EXPLORE
        if np.random.rand()<self.exploration_rate:
            action_idx = np.random.randint(self.action_dim) # parameter passed is lower bound
        
        #EXPLOIT
        else:
            state = state.__array__() #we can get a copy of an array that doesn’t change the 
                                    #data element of original array if we change any element in the new one
            
            if self.use_cuda:
                state = torch.tensor(state).cuda()
            else:
                state = torch.tensor(state)
            
            state = state.unsqueeze(0)
            action_values = self.net(state,model="online")
            action_idx = torch.argmax(action_values,axis=1).item() # .item() method to
                        #get a python int from a torch tensor containing single value
                
        #decrease exploration rate
        self.exploration_rate += self.exploration_rate_decay
        self.exploration_rate = max(self.exploration_rate_min,self.exploration_rate)
        
        #increment step
        self.curr_step+=1
        
        return action_idx
    
    
    #following two functions are for Mario's memory
    
    def cache(self, state, next_state, action, reward, done):
        '''
        Each time Mario performs an action, he stores the experience to his memory. His 
        experience includes the current state, action performed, reward from the action, 
        the next state, and whether the game is done
        
        saves experience to self.memory
        
        Inputs:
        state (LazyFrame),
        next_state (LazyFrame),
        action (int),
        reward (float),
        done(boolean)
        '''
        print(type(state))
        state = state.__array__()
        next_state = next_state.__array__()
        
        
        state = torch.tensor([state]).to(device)
        next_state = torch.tensor([next_state]).to(device)
        action = torch.tensor([action]).to(device)
        done = torch.tensor([done]).to(device)

        self.memory.append((state,next_state,action,reward,done,))
        
    def recall(self):
        # retrieve a batch of experience from memory
        
        batch = random.sample(self.memory, self.batch_size)
        state, next_state, action, reward, done = map(torch.tensor, zip(*batch))
        state = torch.stack(state)
        next_state = torch.stack(next_state)
        action = torch.stack(action)
        reward = torch.stack(reward)
        done = torch.stack(done)
        return state, next_stack, action.squeeze() ,reward.squeeze(), done.squeeze()

Mario uses the DDQN algorithm under the hood. DDQN uses two ConvNets - Qonline and Qtarget- that independently approximate the optimal action-value function.

In our implementation, we share feature generator features across Qonline and Qtarget, but maintain separate FC classifiers for each. θtarget (the parameters of Qtarget) is frozen to prevent updation by backprop. Instead, it is periodically synced with θonline (more on this later).

In [12]:
class MarioNet(nn.Module):
    '''
    input -> 3 x (conv2d + relu) -> flatten -> 2 x (dense + relu) -> output
    '''
    
    def __init__(self,input_dim,output_dim):
        super().__init__()
        c,h,w = input_dim
        
        if h != 84:
            raise ValueError(f"Expecting input height : 84, got : {h}")
            
        if w != 84:
            raise ValueError(f"Expecting input height : 84, got : {h}")
            
        self.online = nn.Sequential(
            nn.Conv2d(in_channels=c, out_channels=32, kernel_size = 8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64 , kernel_size = 4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64 , kernel_size = 4, stride=2),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136,512),
            nn.ReLU(),
            nn.Linear(512, output_dim),
        )
        
        self.target = copy.deepcopy(self.online)
        
        for p in self.target.parameters():
            p.requires_grad = False
            
            
    def forward(self,input,model):
        if model == 'online':
            return self.online(input)
        elif model== 'target':
            return self.target(input)
        else:
            raise TypeError("Invalid model argument")

Two values are involved in learning:

### TD Estimate - 
###### Predicted optimal Q∗ for a given state s

    TDe=Q∗online(s,a)

### TD Target - 
###### Aggregation of current reward and the estimated Q∗ in the next state s′

    a′=argmaxaQonline(s′,a)

    TDt=r+γQ∗target(s′,a′)

Because we don’t know what next action a′ will be, we use the action a′ maximizes Qonline in the next state s′.

Notice we use the <B>@torch.no_grad()</B> decorator on td_target() to disable gradient calculations here (because we don’t need to backpropagate on θtarget).

In [13]:
class Mario(Mario):
    
    def __init__(self, state_dim, action_dim, save_dir):
        super().__init__(state_dim,action_dim,save_dir)
        self.gamma = 0.9
        
    def td_estimate(self, state, action):
        current_Q = self.net(state,model="online")[
            np.arange(0,self.batch_size), action
        ]
        
        # Q_online(s,a)
        return current_Q
    
    @torch.no_grad()
    def td_target(self, reward, next_state, done):
        
        next_state_Q = self.net(next_state,model="online")
        best_action = self.argmax(next_state_Q, axis=1)
        
        next_Q = self.net(next_state,model = 'target')[
            np.arange(0, self.batch_size), best_action
        ]
        
        return (reward + (1 - done.float()) * self.gamma * next_Q).float()

## Updating the model
As Mario samples inputs from his replay buffer, we compute TDt and TDe and backpropagate this loss down Qonline to update its parameters θonline (α is the learning rate lr passed to the optimizer)

    θonline←θonline+α∇(TDe−TDt)
    
θtarget does not update through backpropagation. Instead, we periodically copy θonline to θtarget

    θtarget←θonline

In [14]:
class Mario(Mario):
    
    def __init__(self, state_dim, action_dim, save_dir):
        super().__init__(state_dim, action_dim, save_dir)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025)
        self.loss_fn = torch.nn.SmoothL1Loss()
        
    def update_Q_online(self, td_estimate, td_target):
        loss = self.loss_fn(td_estimate, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        return loss.item()
    
    def sync_Q_target(self):
        # copy online model into target model
        self.net.target.load_state_dict(self.net.online.state_dict())
    
    # save checkpoint
    def save(self):
        save_path = (
            self.save_dir / f"mario_net_{int(self.curr_step//self.save_every)}.chkpt"
        )
        torch.save(
            dict(model=self.net.state_dict(),exploration_rate=self.exploration_rate),
            save_path,
        )
        print(f"MarioNet saved to {save_path} at step {self.curr_step}")
        

In [15]:
class Mario(Mario):
    def __init__(self, state_dim, action_dim, save_dir):
        super().__init__(state_dim, action_dim, save_dir)
        self.burnin = 1e4       # min. experiences before training
        self.learn_every = 3    # no. of experiences between updates to Q_online
        self.sync_every = 1e4   # no. of experiences between Q_target & Q_online sync
        
    def learn(self):
        
        if self.curr_step % self.sync_every == 0:
            self.sync_Q_target()            
        if self.curr_step % self.save_every == 0:
            self.save()
        if self.curr_step <self.burnin:
            return None, None
        if self.curr_step % self.learn_every != 0:
            return None, None
        
        # Sample from memory
        state, next_state, action, reward, done = self.recall()
        
        #get td estimate and target
        td_est = self.td_estimate(state, action)
        td_tgt = self.td_target(reward, next_state, done)
        
        loss = self.update_Q_online(td_est, td_tgt)
        
        return (td_est.mean().item(), loss)

## Logging

In [16]:
import numpy as np
import time, datetime
import matplotlib.pyplot as plt

class MetricLogger:
    
    def __init__(self,save_dir):
        self.save_log = save_dir/ "log"
        
        with open(self.save_log,"w") as f:
            f.write(
                f"{'Episode':>0}{'Step':>0}{'Epsilon':>10}{'MeanReward':>15}"
                f"{'MeaningLength':>15}{'MeanLoss':>15}{'MeanQValue':>15}"
                f"{'TimeDelta':>15}{'Time':>20}\n"
            )
        self.ep_rewards_plot = save_dir / "reward_plot.jpg"
        self.ep_length_plot = save_dir / "length_plot.jpg"
        self.ep_avg_losses_plot = save_dir / "loss_plot.jpg"
        self.ep_avg_qs_plot = save_dir / "q_plot.jpg"
        
        # History metrics
        self.ep_rewards = []
        self.ep_lengths = []
        self.ep_avg_losses = []
        self.ep_avg_qs = []
        
        # Moving averages, added for every call to record()
        self.moving_avg_ep_rewards = []
        self.moving_avg_ep_length = []
        self.moving_avg_ep_avg_losses = []
        self.moving_avg_ep_avg_qs = []
        
        # current episode metric
        self.init_episode()
        
        #Timing
        self.record_time = time.time()
     
    def init_episode(self):
        self.curr_ep_reward = 0.0
        self.curr_ep_length = 0
        self.curr_ep_loss = 0.0
        self.curr_ep_q = 0.0
        self.curr_ep_loss_length = 0
    
    def log_step(self, reward, loss, q):
        self.curr_ep_reward += reward
        self.curr_ep_length += 1
        
        if loss:
            self.curr_ep_loss += loss
            self.curr_ep_q += q
            self.curr_ep_loss_length += 1
            
    def log_episode(self):
        "Mark end of episode"
        self.ep_rewards.append(self.curr_ep_reward)
        self.ep_lengths.append(self.curr_ep_length)
        
        if self.curr_ep_loss_length == 0:
            ep_avg_loss = 0
            ep_avg_q = 0
            
        else:
            ep_avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5)
            ep_avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5)
        
        self.ep_avg_losses.append(ep_avg_loss)
        self.ep_avg_qs.append(ep_avg_q)
        
        self.init_episode()
        
    def record(self, episode, epsilon, step):
        mean_ep_rewards = np.round(np.mean(self.ep_rewards[-100:]),3)
        mean_ep_length = np.round(np.mean(self.ep_lengths[-100:]),3)
        mean_ep_loss = np.round(np.mean(self.ep_avg_losses[-100:]),3)        
        mean_ep_q = np.round(np.mean(self.ep_avg_qs[-100:]),3)
        
        self.moving_avg_ep_rewards.append(mean_ep_rewards)
        self.moving_avg_ep_length.append(mean_ep_length)
        self.moving_avg_ep_avg_losses.append(mean_ep_loss)
        self.moving_avg_ep_avg_qs.append(mean_ep_q)
        
        last_record_time = self.record_time
        self.record_time = time.time()
        time_since_last_record = np.round(self.record_time - last_record_time, 3)
        
        print(
            f"Episode : {episode}\n",
            f"Step : {step}\n",
            f"Epsilon : {epsilon}\n",
            f"Mean Reward : {mean_ep_rewards}\n",
            f"Mean Length : {mean_ep_length}\n",
            f"Mean Loss : {mean_ep_loss}\n",
            f"Mean Q Value : {mean_ep_q}\n",
            f"Time Delta {time_since_last_record}\n",
            f"Time {datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}"
        )
          
        #'''
        with open(self.save_log,"a") as f:
            f.write(
                f"{episode:8d}{step:8d}{epsilon:10.3f}\n"
                f"{mean_ep_rewards:15.3f} {mean_ep_length:15.3f} {mean_ep_loss:15.3f} {mean_ep_q:15.3f}\n"
                f"{time_since_last_record:15.3f}\n"
                f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'):>20}\n"
            )
        #''' 
        for metric in ["ep_rewards","ep_length","ep_avg_losses","ep_avg_qs"]:
            plt.plot(getattr(self,f"moving_avg_{metric}"))
            plt.savefig(getattr(self, f"{metric}_plot"))
            plt.clf()

In [17]:
use_cuda = torch.cuda.is_available()
print(f"Using CUDA : {use_cuda}")
print()

fig = plt.figure()
f = r"./mario.gif"
save_dir = Path("checkpoints") / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
save_dir.mkdir(parents=True)

mario = Mario(state_dim = (4,84, 84), action_dim=env.action_space.n, save_dir=save_dir)

logger = MetricLogger(save_dir)

episodes = 10
vid = []
for e in range(episodes):
    
    state = env.reset()
    
    # Play the Game
    while 1:
        action = mario.act(state)
        next_state, reward, done, info = env.step(action)
        img = plt.imshow(env.render(mode='rgb_array'),animated=True)
        
        if e == 8:
            vid.append([img])
        img
        
        mario.cache(state, next_state, action, reward, done)
        
        #learn
        q, loss = mario.learn()
        
        #logging
        logger.log_step(reward, loss, q)
        
        #update state
        state = next_state
        
        #check if end of game
        if done or info["flag_get"]:
            break
            
            
    logger.log_episode()
    
    if e%9 == 0:
        logger.record(episode=e, epsilon=mario.exploration_rate, step=mario.curr_step)
        
ani = animation.ArtistAnimation(fig, vid, interval=1000, blit=True,repeat_delay=1000)

 
writergif = animation.PillowWriter(fps=30) 
ani.save(f, writer=writergif)

Using CUDA : True

<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
MarioNet saved to checkpoints\2021-01-10T20-00-47\mario_net_1.chkpt at step 10
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_s

  done = torch.tensor([done]).to(device)


<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
MarioNet saved to checkpoints\2021-01-10T20-00-47\mario_net_17.chkpt at step 170
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
MarioNet saved to checkpoints\2021-01-10T20-00-47\mario_net_18.chkpt at step 180
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gym.wrappers.frame_stack.LazyFrames'>
<class 'gy

<Figure size 432x288 with 0 Axes>

In [18]:
#torch.cuda.empty_cache()

In [19]:
#torch.cuda.memory_allocated(device)

In [30]:
#type(vid[0][0])

matplotlib.image.AxesImage

In [31]:
len(vid)

282

In [41]:
fi = plt.figure()
#anim = animation.ArtistAnimation(fig, vid, interval=1000, blit=True,repeat_delay=1000)
x = vid[0][0]
x
plt.show()

<Figure size 432x288 with 0 Axes>

In [38]:
plt.show()

In [42]:
v = [vid[i][0] for i in range(len(vid))]

In [47]:
v = np.array(v[0])

In [12]:
import matplotlib
anim = matplotlib.animation.ArtistAnimation(fig, a, interval=1000, blit=True,repeat_delay=1000)

NameError: name 'animation' is not defined

In [46]:
anim

<matplotlib.animation.ArtistAnimation at 0x19358833040>

In [49]:
v = np.array(vid)

In [None]:
np.save('./arr.np',vid)

In [11]:
import numpy as np
a = np.load('arr.npy',allow_pickle=True)

In [9]:
a = list(a[0])

In [10]:
a

[<matplotlib.image.AxesImage at 0x195033fda00>]