In [7]:
import os
import numpy as np
import gym
from gym import spaces
from math import sqrt
import torch
import torch.nn as nn
import torch.optim as optim

# Constants
SPACE_WIDTH = 100  # Half-width of the space [-SPACE_WIDTH, SPACE_WIDTH]
RESET_SPACE_WIDTH = SPACE_WIDTH - 2

# Define the Cube Environment
class CubeEnv(gym.Env):
    def __init__(self):
        super(CubeEnv, self).__init__()
        
        self.times_in_origin = 0
        
        # Define the state and action space
        self.state_space = spaces.Box(low=-SPACE_WIDTH, high=SPACE_WIDTH, shape=(3,), dtype=np.int32)
        self.action_space = spaces.Discrete(5)  # Four possible moves plus 'stay'
        
        # Initialize state
        self.state = np.array([np.random.randint(-SPACE_WIDTH, SPACE_WIDTH + 1), 
                               np.random.randint(-SPACE_WIDTH, SPACE_WIDTH + 1), 
                               0])
        
    def step(self, action):
        # Apply action to the state
        if action == 0:  
            self.state[0] += 1
        elif action == 1:  
            self.state[0] -= 1
        elif action == 2:  
            self.state[1] += 1
        elif action == 3:  
            self.state[1] -= 1
        # Action 4 is 'stay', do nothing
        
        # Clip the state to be within the state space
        self.state = np.clip(self.state, -SPACE_WIDTH, SPACE_WIDTH-1)
        
        # Calculate reward
        distance = 100 * self.state[0]**2 + self.state[1]**2  # + self.state[2]**2
        reward = -distance
        
        # Check if done
        done = self.is_done(self.state)
        
        return self.state, reward, done, {}

    def reset(self):
        # Reset the state to a random location on the Z=0 surface
        self.state = np.array([np.random.randint(-RESET_SPACE_WIDTH, RESET_SPACE_WIDTH + 1), 
                               np.random.randint(-RESET_SPACE_WIDTH, RESET_SPACE_WIDTH + 1), 
                               0])
        self.times_in_origin = 0
        return self.state

    def render(self, mode='human'):
        pass
    
    def is_done(self, state):
        in_origin = np.array_equal(state, np.array([0, 0, 0]))
        if in_origin: 
            self.times_in_origin += 1
            return self.times_in_origin > 20
        else:
            self.times_in_origin = 0
            return False


class QNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 10)
        self.fc2 = nn.Linear(10, 10)
        self.fc3 = nn.Linear(10, output_dim)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return x

# Define the Deep Q-Learning Agent
class DeepQLearningAgent:
    def __init__(self, env, initial_alpha=0.15, gamma=0.7, initial_epsilon=0.6, 
                 min_alpha=0.1, min_epsilon=0.15, alpha_decay=0.999, epsilon_decay=0.999):
        self.env = env
        self.alpha = initial_alpha
        self.gamma = gamma
        self.epsilon = initial_epsilon
        self.min_alpha = min_alpha
        self.min_epsilon = min_epsilon
        self.alpha_decay = alpha_decay
        self.epsilon_decay = epsilon_decay
        
        # Initialize Q-Network
        self.q_network = QNetwork(3, 5)  # State space: 3 dimensions, Action space: 5 actions
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=self.alpha)
        self.criterion = nn.MSELoss()

    def choose_action(self, state):
        if np.random.uniform(0, 1) < self.epsilon:
            return self.env.action_space.sample()
        else:
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            with torch.no_grad():
                q_values = self.q_network(state_tensor)
            q_values = q_values.numpy().flatten()  # Convert to numpy array and flatten it
            max_q_value = np.max(q_values)  # Find the maximum Q-value
            max_actions = np.where(q_values == max_q_value)[0]  # Get indices of actions with max Q-value
            return np.random.choice(max_actions)  # Randomly select one of these actions
        
        
    def learn(self, state, action, reward, next_state):
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
        action_tensor = torch.tensor([action])
        reward_tensor = torch.tensor([reward])
        
        current_q = self.q_network(state_tensor).gather(1, action_tensor.unsqueeze(0)).squeeze(0)
        next_q = self.q_network(next_state_tensor).max(1)[0]
        target_q = reward_tensor + self.gamma * next_q
        
        loss = self.criterion(current_q, target_q.detach())
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def decay_parameters(self):
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)
        self.alpha = max(self.min_alpha, self.alpha * self.alpha_decay)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.alpha


def save_model(agent, filename):
    torch.save(agent.q_network.state_dict(), filename)
    print(f"Model saved to {filename}")


def load_model(agent, filename):
    agent.q_network.load_state_dict(torch.load(filename))
    agent.q_network.eval()  # Set the network to evaluation mode
    print(f"Model loaded from {filename}")


def print_weights(agent):
    state_dict = agent.q_network.state_dict()
    for key, value in state_dict.items():
        print(f"{key}: {value}")


# Training the agent
MAX_ITEREATIONS_PER_EPISODE = 2500
MODEL_FILE_NAME = "origins_models/q_network"

should_save_model = True
should_load_model = False

env = CubeEnv()
agent = DeepQLearningAgent(env, min_epsilon=0.01, initial_epsilon=0.8)

if should_load_model:
    load_model(agent, MODEL_FILE_NAME)

for episode in range(30001):
    state = env.reset()
    for iteration in range(MAX_ITEREATIONS_PER_EPISODE):
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.learn(state, action, reward, next_state)
        state = next_state
        if done:
            break
    
    agent.decay_parameters()
    
    if should_save_model and episode % 100 == 0:
        save_model(agent, MODEL_FILE_NAME + f"-{episode}.pth")
    
    print(f"Episode {episode + 1} finished")

print("Training completed")

Model saved to origins_models/q_network-0.pth
Episode 1 finished
Episode 2 finished
Episode 3 finished
Episode 4 finished
Episode 5 finished
Episode 6 finished
Episode 7 finished
Episode 8 finished
Episode 9 finished
Episode 10 finished
Episode 11 finished
Episode 12 finished
Episode 13 finished
Episode 14 finished
Episode 15 finished
Episode 16 finished
Episode 17 finished
Episode 18 finished
Episode 19 finished
Episode 20 finished
Episode 21 finished
Episode 22 finished
Episode 23 finished
Episode 24 finished
Episode 25 finished
Episode 26 finished
Episode 27 finished
Episode 28 finished
Episode 29 finished
Episode 30 finished
Episode 31 finished
Episode 32 finished
Episode 33 finished
Episode 34 finished
Episode 35 finished
Episode 36 finished
Episode 37 finished
Episode 38 finished
Episode 39 finished
Episode 40 finished
Episode 41 finished
Episode 42 finished
Episode 43 finished
Episode 44 finished
Episode 45 finished
Episode 46 finished
Episode 47 finished
Episode 48 finished
Epi

KeyboardInterrupt: 