In [1]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

In [2]:
class MountainCarTrain:
    def __init__(self,env):
        self.env = env
        self.gamma = 0.99

        self.input_dim = self.env.observation_space.shape[0]
        self.output_dim = self.env.action_space.n

        self.epsilon = 1
        self.epsilon_decay = 0.05
        self.epsilon_min = 0.01

        self.learingRate = 0.001

        self.replayBuffer = deque(maxlen=10000)

        self.n_episodes = 1000

        self.n_steps = 200

        self.batch_size = 32

        self.trainNetwork = self.createNetwork()
        self.targetNetwork = self.createNetwork()

        self.targetNetwork.load_state_dict(self.trainNetwork.state_dict())

        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.trainNetwork.parameters(), lr=self.learingRate)

    def createNetwork(self):
        network = nn.Sequential(
            nn.Linear(self.input_dim, 24),
            nn.ReLU(),
            nn.Linear(24, 48),
            nn.ReLU(),
            nn.Linear(48, self.output_dim)
        )
        return network
    
    def getAction(self, states):
        self.epsilon = max(self.epsilon_min, self.epsilon)

        if states.ndim == 1:
            if np.random.rand() < self.epsilon:
                return self.env.action_space.sample()
            else:
                return torch.argmax(self.trainNetwork(torch.tensor(states).float())).item()
            
        else:
            randoms = np.random.rand(states.shape[0])
            actions = np.zeros(states.shape[0])

            actions[randoms < self.epsilon] = self.env.action_space.sample()
            actions[randoms >= self.epsilon] = torch.argmax(self.trainNetwork(states[torch.tensor(randoms >= self.epsilon)]).float()).item()

            return actions
        
    def train(self):
        if len(self.replayBuffer) < self.batch_size:
            return

        minibatch = random.sample(self.replayBuffer, self.batch_size)

        states = []
        next_states = []
        rewards = []
        dones = []
        actions = []

        for state, action, reward, next_state, done in minibatch:
            states.append(state)
            next_states.append(next_state)
            rewards.append(reward)
            dones.append(done)
            actions.append(action)

        states = np.array(states, dtype=np.float32).reshape(self.batch_size, self.input_dim)
        next_states = np.array(next_states, dtype=np.float32).reshape(self.batch_size, self.input_dim)
        actions = np.array(actions, dtype=np.float32).reshape(self.batch_size, 1)
        rewards = np.array(rewards, dtype=np.float32).reshape(self.batch_size)
        dones = np.array(dones, dtype=np.float32).reshape(self.batch_size)


        states = torch.tensor(states)
        next_states = torch.tensor(next_states)
        actions = torch.tensor(actions)
        rewards = torch.tensor(rewards)
        dones = torch.tensor(dones)

        # Q(s, a) with epsilon greedy usw getAction and make action into an int64 tensor
        q_values = self.trainNetwork(states).gather(1, actions.long()).reshape(self.batch_size)

        # Q(s', a') with max a' of Q(s', a')
        next_q_values = torch.max(self.targetNetwork(next_states), 1)[0].detach()

        # Q(s, a) = r + gamma * Q(s', a')
        target_q_values = rewards + self.gamma * next_q_values * (1 - dones)

        loss = self.criterion(q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def originalTry(self, state, episode):
        cumulative_reward = 0

        max_position = -np.inf

        for step in range(self.n_steps):
            # if state is a tuple, take it's first element
            if isinstance(state, tuple):
                state = state[0]

            action = self.getAction(state)
            
            if episode % 50 == 0:
                self.env.render()

            next_state, reward, terminated, truncated, _ = self.env.step(action)

            done = terminated or truncated

            # keep track of max position
            if next_state[0] > max_position:
                max_position = next_state[0]

            # # give reward for reaching a new maximum position
            # reward = next_state[0] + 0.5

            self.replayBuffer.append((state, action, reward, next_state, done))

            self.train()

            cumulative_reward += reward

            if done:
                print(f"Episode {episode} finished after {step} steps with cumulative reward {cumulative_reward}")
                break

            state = next_state

        self.targetNetwork.load_state_dict(self.trainNetwork.state_dict())

        self.epsilon -= self.epsilon_decay

    def trainModel(self):
        for episode in range(self.n_episodes):
            state = self.env.reset()
            self.originalTry(state, episode)

In [3]:
env = gym.make('MountainCar-v0', render_mode=None)
agent = MountainCarTrain(env)

In [4]:
agent.trainModel()

  gym.logger.warn(


Episode 0 finished after 199 steps with cumulative reward -200.0
Episode 1 finished after 199 steps with cumulative reward -200.0
Episode 2 finished after 199 steps with cumulative reward -200.0
Episode 3 finished after 199 steps with cumulative reward -200.0
Episode 4 finished after 199 steps with cumulative reward -200.0
Episode 5 finished after 199 steps with cumulative reward -200.0
Episode 6 finished after 199 steps with cumulative reward -200.0
Episode 7 finished after 199 steps with cumulative reward -200.0
Episode 8 finished after 199 steps with cumulative reward -200.0
Episode 9 finished after 199 steps with cumulative reward -200.0
Episode 10 finished after 199 steps with cumulative reward -200.0
Episode 11 finished after 199 steps with cumulative reward -200.0
Episode 12 finished after 199 steps with cumulative reward -200.0
Episode 13 finished after 199 steps with cumulative reward -200.0
Episode 14 finished after 199 steps with cumulative reward -200.0
Episode 15 finished 

In [6]:
env_test = gym.make('MountainCar-v0', render_mode=None) # render_mode='human' for visualization

seeds = [42, 43, 44, 45, 46, 47, 48, 49, 50, 51]

for seed in seeds:
    observation, _ = env_test.reset(seed=42)
    total_reward = 0
    
    for i in range(200):
        env_test.render()
        q_values = agent.targetNetwork(torch.tensor(observation).float())
        # print(q_values)
        action = torch.argmax(q_values).item()
        observation, reward, terminated, truncated, info = env_test.step(action)
        total_reward += reward
        # print(i, observation, reward, terminated, truncated, info, action)
        if terminated or truncated:
            break

    print(f"Episode with seed={seed} finished after {i} steps and {total_reward} reward")

env_test.close()

Episode with seed=42 finished after 92 steps and -93.0 reward
Episode with seed=43 finished after 92 steps and -93.0 reward
Episode with seed=44 finished after 92 steps and -93.0 reward
Episode with seed=45 finished after 92 steps and -93.0 reward
Episode with seed=46 finished after 92 steps and -93.0 reward
Episode with seed=47 finished after 92 steps and -93.0 reward
Episode with seed=48 finished after 92 steps and -93.0 reward
Episode with seed=49 finished after 92 steps and -93.0 reward
Episode with seed=50 finished after 92 steps and -93.0 reward
Episode with seed=51 finished after 92 steps and -93.0 reward
