<a href="https://colab.research.google.com/github/ImaginationX4/Path_to_MARL/blob/master/Human_level_control_through_deep_reinforcement_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gymnasium[classic-control]

Collecting gymnasium[classic-control]
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium[classic-control])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [23]:
import gymnasium as gym
import numpy as np
from collections import deque
import random
import torch
from torch import nn
import torch.nn.functional as F

class DQN(nn.Module):
    """Small fully connected Q-network: state encoding -> one Q-value per action.

    Args:
        state_size: Number of input features (length of the state encoding).
        action_size: Number of outputs, one per discrete action.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        # Attribute names are unchanged so previously saved state dicts still load.
        self.layer_int = nn.Linear(state_size, 64)
        self.layer_out = nn.Linear(64, action_size)

    def forward(self, inputs):
        """Return the Q-values for ``inputs``.

        Args:
            inputs: Tensor whose last dimension has size ``state_size``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            size ``action_size``. The result is attached to the autograd graph.
        """
        # A non-linearity is required between the layers: two stacked Linear
        # layers without one are equivalent to a single affine map, which makes
        # the 64-unit hidden layer pointless.
        hidden = F.relu(self.layer_int(inputs))
        return self.layer_out(hidden)
def state_to_dqn_input(state: int, num_states: int) -> torch.Tensor:
    """One-hot encode a discrete state index for the Q-network.

    Args:
        state: Index of the position to set to 1.
        num_states: Length of the returned vector.

    Returns:
        A 1-D tensor of length ``num_states`` that is 0 everywhere except for
        a 1 at index ``state``.
    """
    one_hot = torch.zeros(num_states)
    one_hot[state] = 1
    return one_hot

In [19]:
# Device-agnostic setup: use CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# model_1 is built without .to(device), so the expression below reports the
# device its parameters were created on, not the one selected above.
model_1 = DQN(16, 4)
next(model_1.parameters()).device

Using device: cuda


device(type='cpu')

In [50]:
# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------
EPOCH = 10000               # number of training episodes
learning_rate_a = 0.001     # Adam learning rate
discount_factor_g = 0.9     # discount factor (gamma) used in the TD target
network_sync_rate = 10      # copy train_model -> target_model every N episodes
mini_batch_size = 32        # transitions sampled per optimisation step
replay_memory_size = 100    # capacity of the experience replay buffer
epsilon = 1                 # initial exploration rate; decays linearly to 0
SEED = 42

if mini_batch_size > replay_memory_size:
    raise ValueError("mini_batch_size must not exceed replay_memory_size")

# Seed every source of randomness used below so a re-run is reproducible.
random.seed(SEED)
torch.manual_seed(SEED)

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env = gym.make("FrozenLake-v1", is_slippery=False)
env.action_space.seed(SEED)  # action_space.sample() has its own RNG
# Read the sizes from the environment instead of hard-coding 16 and 4.
num_states = int(env.observation_space.n)
num_actions = int(env.action_space.n)

# ---------------------------------------------------------------------------
# Warm-up: fill the replay buffer (s, a, r, s', terminated) with random play
# ---------------------------------------------------------------------------
memory = deque([], maxlen=replay_memory_size)
reward_count = 0
# Seed the environment once; later resets continue the same RNG stream.
observation, info = env.reset(seed=SEED)
# Keep playing until at least one reward has been seen AND there are enough
# transitions to draw a full mini-batch (random.sample raises otherwise).
while reward_count == 0 or len(memory) < mini_batch_size:
    action = env.action_space.sample()
    new_observation, reward, terminated, truncated, info = env.step(action)
    observation_t = state_to_dqn_input(observation, num_states)
    new_observation_t = state_to_dqn_input(new_observation, num_states)
    memory.append((observation_t, action, reward, new_observation_t, terminated))
    observation = new_observation
    reward_count += reward
    if terminated or truncated:
        observation, info = env.reset()

# ---------------------------------------------------------------------------
# Networks, loss, optimizer
# ---------------------------------------------------------------------------
# Smooth L1 (Huber) is the loss actually optimised below.
loss_fn = nn.SmoothL1Loss()
train_model = DQN(num_states, num_actions).to(device)
target_model = DQN(num_states, num_actions).to(device)
target_model.load_state_dict(train_model.state_dict())
optimizer = torch.optim.Adam(train_model.parameters(), lr=learning_rate_a)

# ---------------------------------------------------------------------------
# Training loop: one episode of epsilon-greedy play, then one gradient step
# ---------------------------------------------------------------------------
for i in range(EPOCH):
    # No seed here: re-seeding on every reset would restart the environment's
    # RNG each episode.
    observation, info = env.reset()
    terminated = False
    truncated = False
    while not (terminated or truncated):
        observation_t = state_to_dqn_input(observation, num_states)
        if random.random() < epsilon:
            # explore: random action (0=left, 1=down, 2=right, 3=up)
            action = env.action_space.sample()
        else:
            # exploit: greedy action under the current online network
            with torch.no_grad():
                action = train_model(observation_t.to(device)).argmax().item()
        new_observation, reward, terminated, truncated, info = env.step(action)
        new_observation_t = state_to_dqn_input(new_observation, num_states)
        # Only `terminated` is stored: a time-limit truncation must still
        # bootstrap from the next state.
        memory.append((observation_t, action, reward, new_observation_t, terminated))
        observation = new_observation

    # Progress marker only: printed every 1000 episodes whether or not the
    # episode reached the goal.
    if i % 1000 == 0:
        print('got the gift!'+f'{i} times')

    # Sample a mini-batch and assemble it into batched tensors on `device`.
    mini_batch = random.sample(memory, mini_batch_size)
    obs_batch, action_batch, reward_batch, new_obs_batch, terminated_batch = zip(*mini_batch)

    obs_t = torch.stack(obs_batch).to(device)
    action_t = torch.as_tensor(np.asarray(action_batch), dtype=torch.int64).view(-1, 1).to(device)
    reward_t = torch.as_tensor(np.asarray(reward_batch), dtype=torch.float32).view(-1, 1).to(device)
    new_obs_t = torch.stack(new_obs_batch).to(device)
    terminated_t = torch.as_tensor(np.asarray(terminated_batch), dtype=torch.float32).view(-1, 1).to(device)

    # TD target: r + gamma * max_a' Q_target(s', a'), with no bootstrap on
    # terminal transitions. The target is a constant for the update, so it is
    # computed without building an autograd graph through target_model.
    with torch.no_grad():
        max_q_action = target_model(new_obs_t).max(dim=1, keepdim=True)[0]
        target = reward_t + (1 - terminated_t) * discount_factor_g * max_q_action

    # Q(s, a) of the actions that were actually taken.
    q_values = train_model(obs_t).gather(dim=-1, index=action_t)
    loss = loss_fn(q_values, target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Linear epsilon decay, reaching 0 after EPOCH episodes.
    epsilon = max(epsilon - 1/EPOCH, 0)
    if i % network_sync_rate == 0:
        target_model.load_state_dict(train_model.state_dict())

env.close()
torch.save(train_model.state_dict(), "frozen_lake_dql.pt")

got the gift!0 times
got the gift!1000 times
got the gift!2000 times
got the gift!3000 times
got the gift!4000 times
got the gift!5000 times
got the gift!6000 times
got the gift!7000 times
got the gift!8000 times
got the gift!9000 times


In [52]:
# Inspect the trained network: Q-values for state 2, one-hot encoded over 16 states.
state_2_input = state_to_dqn_input(2, 16).to(device)
train_model(state_2_input)

tensor([0.5110, 0.7287, 0.6055, 0.6838], device='cuda:0',
       grad_fn=<ViewBackward0>)