In [1]:
import random
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from torchmetrics.classification import MulticlassAccuracy

In [2]:
episodes = 100

capacity = 10000
sample_size = 10

learning_rate = .00001
discount_factor = 0.9
interpolation_parameter = 1e-3

epsilon_start = 1.0
epsilon_end = 0.1
epsilon_decay_rate = 1.2 / episodes

In [3]:
env = gym.make("MountainCar-v0", render_mode=None)

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

In [4]:
class Network(nn.Module):
    def __init__(self, state_size, action_size, seed=42):
        super(Network, self).__init__()
        self.seed = seed
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128,636)
        self.fc3 = nn.Linear(636,64)
        self.fc4 = nn.Linear(64, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)

In [5]:
class Memory(object):
    def __init__(self, capacity):
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
        self.capacity = capacity
        self.memory = []

    def push(self, event):
        self.memory.append(event)
        if len(self.memory) > self.capacity:
            del self.memory[0]

    def sample(self, sample_size):
        experiences = random.sample(self.memory, sample_size)
        states = torch.from_numpy(np.vstack([e[0] for e in experiences if e is not None])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e[1] for e in experiences if e is not None])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e[2] for e in experiences if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e[3] for e in experiences if e is not None])).float().to(self.device)
        terminations = torch.from_numpy(np.vstack([e[4] for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
        return states, next_states, actions, rewards, terminations

In [6]:
class Agent():
    def __init__(self, state_size, action_size):
        self.device = torch.device("cuda:0" if torch.cuda.is_available else "cpu")
        self.state_size = state_size
        self.action_size = action_size
        self.local_network = Network(state_size, action_size).to(self.device)
        self.target_network = Network(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.local_network.parameters(), lr = learning_rate)
        self.metric = MulticlassAccuracy(num_classes=3).to(self.device)
        self.memory = Memory(capacity)
        self.t_step = 0
        self.l_step = 0

    def step(self, state, action, reward, next_state, terminated):
        # print(f'Step / action: {action}')
        self.memory.push((state,action,reward,next_state,terminated))
        self.t_step = (self.t_step + 1) % 4
        if self.t_step == 0:
            if len(self.memory.memory) > sample_size:
                experiences = self.memory.sample(sample_size)
                self.learn(experiences, discount_factor)

    def act(self, state, epsilon):
        st = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.local_network.eval()
        with torch.no_grad():
            action = self.local_network(st)
        self.local_network.train()
        if random.random() > epsilon:
            return np.argmax(action.cpu().data.numpy())
        else:
            if state[1] > 0:
                action = random.choices(np.arange(self.action_size),weights=(20,20,60))
            else:
                action = random.choices(np.arange(self.action_size),weights=(60,20,20))
            return action[0]

    def learn(self, experiences, discount_factor):
        states, next_states, actions, rewards, terminations = experiences
        # print(self.local_network(states))
        acc = self.metric(self.local_network(states), actions.squeeze(1))
        next_q_targets = self.target_network(next_states).detach().max(1)[0].unsqueeze(1)
        q_targets = rewards + discount_factor * next_q_targets * (1 - terminations)
        q_expected = self.local_network(states).gather(1, actions)
        loss = F.mse_loss(q_expected, q_targets)
        self.l_step = (self.l_step + 1) % 200
        # if self.l_step == 0:
        #     print(f'loss: {loss}\tacc: {acc}')
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.local_network, self.target_network, interpolation_parameter)

    def soft_update(self, local_model, target_model, interpolation_parameter):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(interpolation_parameter * local_param.data + (1.0 - interpolation_parameter) * target_param.data)

In [7]:
agent = Agent(state_size,action_size)

In [8]:
epsilon = epsilon_start
best = float('-inf')

for e in range(1, episodes+1):
    terminated = False
    state,_ = env.reset()
    score = 1000

    while(not terminated):
        action = agent.act(state, epsilon)
        next_state, reward, terminated,truncated,_ = env.step(action)

        if terminated:
            reward = 10
        elif next_state[1] > 0:
            if next_state[0] < 0:
                reward = (next_state[0]*-1)*next_state[1]
        elif next_state[1] < 0 and next_state[0] < 0:
                reward = ((next_state[0]*1)*next_state[1])*0.1
        else:
            reward = reward - next_state[0]*next_state[1]
        # print(f'Loop / action: {action}')
        agent.step(state, action, reward, next_state, terminated)
        state = next_state
        score += reward
    epsilon = max(0.05, epsilon - epsilon_decay_rate)
    best = max(best, score)
    if e % 2 == 0:
        print(f'Episode: {e}\tScore: {score}')
env.close()

Episode: 2	Score: 953.8598583008422
Episode: 4	Score: 985.9849558475122
Episode: 6	Score: 953.2682145058009
Episode: 8	Score: 994.3378058873279
Episode: 10	Score: 966.7072192783382
Episode: 12	Score: 988.4017707415697
Episode: 14	Score: 981.736214700563
Episode: 16	Score: 953.0991145397824
Episode: 18	Score: 853.695515802464
Episode: 20	Score: 957.056935982786
Episode: 22	Score: 976.7143522051604
Episode: 24	Score: 792.2814127446566
Episode: 26	Score: 967.7142790482221
Episode: 28	Score: 963.6294477028154
Episode: 30	Score: 957.8737676487752
Episode: 32	Score: 971.1937443595363
Episode: 34	Score: 952.965719769904
Episode: 36	Score: 955.1630993163004
Episode: 38	Score: 953.2228266322848
Episode: 40	Score: 869.112974784525
Episode: 42	Score: 428.71844374840816
Episode: 44	Score: 898.591174559242
Episode: 46	Score: 920.4568498815001
Episode: 48	Score: 955.900820320147
Episode: 50	Score: 991.4098621727777
Episode: 52	Score: 839.2752478824414
Episode: 54	Score: 965.9971650014763
Episode: 56

In [9]:
env = gym.make("MountainCar-v0", render_mode='human')

for i in range(1,5):
    terminated = False
    state,_ = env.reset()
    score = 1000

    while(not terminated):
        action = agent.act(state, .1)
        next_state, reward, terminated,_,_ = env.step(action)
        # agent.step(state, action, reward, next_state, terminated)

        state = next_state
        score += reward
    print(f'Episode:{i}\tScore: {score}')
env.close()

Episode:1	Score: 835.0
Episode:2	Score: 691.0
Episode:3	Score: 833.0
Episode:4	Score: 529.0
