In [1]:
#!apt-get install swig
#!pip install gymnasium[box2d]
import gymnasium as gym
#--#
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import IPython
#--#
import collections
import random
#--#
import torch

In [2]:
def show(ims,jump=10):
    """Display a sequence of frames as an interactive JavaScript animation.

    Parameters
    ----------
    ims : sliceable sequence of images
        Frames accepted by ``Axes.imshow`` (here: the arrays returned by
        ``env.render()``).
    jump : int, default 10
        Only every ``jump``-th frame is kept, which keeps the embedded
        animation small.

    Returns
    -------
    None
        The animation is rendered through IPython's display machinery.
    """
    frames = ims[::jump]
    # Instantiate Figure directly (instead of plt.figure()) so the figure is
    # not registered with pyplot.
    fig = plt.Figure()
    ax = fig.subplots()
    def update(i):
        # Clear before drawing: without this every frame would stack one more
        # image artist on the axes, so memory use and draw time would grow
        # with the frame index.
        ax.clear()
        ax.imshow(frames[i])
    ani = FuncAnimation(fig,update,frames=len(frames))
    # Use the explicit IPython API rather than the bare `display` name, which
    # only exists when IPython injects it into the namespace.
    IPython.display.display(IPython.display.HTML(ani.to_jshtml()))

In [3]:
class AgentRandom:
    """Baseline agent: samples actions uniformly at random and records transitions.

    The caller writes the latest transition into ``state``, ``action``,
    ``reward``, ``next_state`` and ``terminated``; ``save_experience`` then
    copies those values (as tensors) into bounded replay buffers.
    """
    def __init__(self):
        #--# define spaces
        self.action_space = gym.spaces.Discrete(4)
        # Slots for the most recent transition (filled in by the caller).
        self.state = None             ## np.array of length 8
        self.action = None            ## int, one of 0, 1, 2, 3
        self.reward = None            ## float
        self.next_state = None        ## np.array
        self.terminated = None        ## bool
        # Replay buffers: every element is a tensor; once `buffer_size`
        # entries are stored, the oldest entry is dropped on each append.
        self.buffer_size = 5000
        for buffer_name in ("states", "actions", "rewards", "next_states", "terminations"):
            setattr(self, buffer_name, collections.deque(maxlen=self.buffer_size))
        # Total number of saved transitions (keeps growing after the buffers are full).
        self.n_experiences = 0

    def act(self):
        """Store a uniformly sampled action in ``self.action``."""
        self.action = self.action_space.sample()

    def learn(self):
        """Do nothing; the random agent has no parameters to update."""
        pass

    def save_experience(self):
        """Append the current transition to the replay buffers as tensors."""
        pairs = (
            (self.states, self.state),
            (self.actions, self.action),
            (self.rewards, self.reward),
            (self.next_states, self.next_state),
            (self.terminations, self.terminated),
        )
        for buffer, value in pairs:
            buffer.append(torch.tensor(value))
        #--#
        self.n_experiences += 1

In [4]:
class Agent(AgentRandom):
    """Epsilon-greedy Q-learning agent backed by a small fully connected network.

    Attributes
    ----------
    eps : float
        Probability of taking a random action in ``act``; with the initial
        value 0 the agent always acts greedily.
    q_net : torch.nn.Sequential
        Maps an 8-dimensional state to 4 action values.
    optimizr : torch.optim.Adam
        Optimizer over ``q_net`` parameters.
    batch_size : int
        Number of transitions sampled per ``learn`` call.
    """
    def __init__(self):
        super().__init__()
        self.eps = 0
        self.q_net = torch.nn.Sequential(
            torch.nn.Linear(8,256),
            torch.nn.ReLU(),
            torch.nn.Linear(256,128),
            torch.nn.ReLU(),
            torch.nn.Linear(128,64),
            torch.nn.ReLU(),
            torch.nn.Linear(64,4)
        )
        self.optimizr = torch.optim.Adam(self.q_net.parameters(),lr=0.0001)
        self.batch_size = 64

    def act(self):
        """Choose ``self.action``: random with probability ``eps``, else the argmax of ``q_net``."""
        if random.random() < self.eps:
            self.action = self.action_space.sample()
        else:
            s = torch.tensor(self.state)
            # Inference only: no autograd graph is needed to pick an action.
            with torch.no_grad():
                self.action = self.q_net(s).argmax().item()

    def learn(self):
        """Run one gradient step on a random minibatch from the replay buffers.

        Does nothing until at least ``batch_size`` transitions have been saved.
        The loss is the mean squared difference between Q(s, a) and the
        one-step target ``r`` (terminal transition) or
        ``r + 0.99 * max_a' Q(s', a')`` (otherwise).
        """
        if self.n_experiences < self.batch_size:
            return
        memory = list(zip(
            self.states,
            self.actions,
            self.rewards,
            self.next_states,
            self.terminations
        ))
        minibatch = random.sample(memory,self.batch_size)
        ## step 1~2
        loss = 0
        for s,a,r,ss,tmd in minibatch:
            # step1: q_hat
            q_hat = self.q_net(s)[a]
            # step2: compute the loss.
            # Use the termination flag stored with THIS transition, not the
            # agent's current `self.terminated`, which describes only the
            # latest environment step.
            if tmd:
                q = r
            else:
                # The bootstrap target is treated as a constant, so keep it
                # out of the autograd graph.
                with torch.no_grad():
                    future = self.q_net(ss).max()
                q = r + 0.99 * future  # 0.99: discount factor
            loss = loss + (q_hat-q)**2
        loss = loss / self.batch_size
        # step3
        loss.backward()
        # step4
        self.optimizr.step()
        self.optimizr.zero_grad()

In [15]:
# Play one episode with a pre-trained Q-network and record every rendered frame.
env = gym.make("LunarLander-v3",render_mode = 'rgb_array')
player_dummy = Agent()  # Agent starts with eps = 0, so every action is the greedy argmax
#player_dummy.q_net = player.q_net # alternative: hand over the trained network from `player` directly
# map_location="cpu": the checkpoint loads even if it was saved from a GPU.
# weights_only=True: unpickling is restricted to tensors and plain containers,
# which is all a state_dict needs.
player_dummy.q_net.load_state_dict(
    torch.load("2025q_net_600.pth", map_location="cpu", weights_only=True)
)
player_dummy.state, _ = env.reset()
score = 0
ims = []
ims.append(env.render())
for t in range(1001):  # hard cap on the number of steps
    player_dummy.act()
    player_dummy.next_state, player_dummy.reward, player_dummy.terminated, player_dummy.truncated, _  = env.step(player_dummy.action)
    score = score + player_dummy.reward
    ims.append(env.render())
    player_dummy.state = player_dummy.next_state
    # Stop as soon as the environment reports termination or truncation.
    if player_dummy.terminated or player_dummy.truncated:
        break

In [None]:
# Animate the frames recorded during the episode (show() keeps every 10th frame by default).
show(ims)

In [None]:
# Sum of the rewards collected over the episode above.
score