In [1]:
%pip install tqdm gymnasium numpy tensorflow stable-baselines3

Collecting tqdm
  Downloading tqdm-4.67.0-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.6/78.6 kB 4.3 MB/s eta 0:00:00
Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
     -------------------------------------- 958.1/958.1 kB 7.6 MB/s eta 0:00:00
Collecting numpy
  Downloading numpy-2.1.3-cp310-cp310-win_amd64.whl (12.9 MB)
     ---------------------------------------- 12.9/12.9 MB 7.2 MB/s eta 0:00:00
Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-win_amd64.whl (7.5 kB)
Collecting stable-baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
     -------------------------------------- 182.3/182.3 kB 5.5 MB/s eta 0:00:00
Collecting farama-notifications>=0.0.1
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting cloudpickle>=1.2.0
  Downloading cloudpickle-3.1.0-py3-none-any.whl (22 kB)
Collecting tensorflow-intel==2.18.0
  Downloading tensorflow_intel-2.18.0-cp310-cp


[notice] A new release of pip available: 22.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from tqdm import tqdm
import gymnasium as gym
import gym_rubiks
from gym_rubiks.rubiks_gym_agent import RubiksAgent
from rubiks import RubiksCube

import numpy as np

# stable-baselines3 Reinforcement Learning agent
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

import tensorboard

In [7]:
def make_rubik_env(n_shuffle, max_episode_steps=10):
    """Create a step-limited "RubiksCube-v0" environment.

    Args:
        n_shuffle: Forwarded unchanged to the environment's ``n_shuffle``
            argument (presumably the number of scramble moves applied to the
            cube -- confirm against gym_rubiks).
        max_episode_steps: Step cap enforced by the ``TimeLimit`` wrapper.

    Returns:
        The freshly made environment wrapped in ``gym.wrappers.TimeLimit``.
    """
    # A new RubiksCube instance is built on every call, so environments
    # produced by repeated calls do not share a cube object.
    return gym.wrappers.TimeLimit(
        gym.make("RubiksCube-v0", cube=RubiksCube(), n_shuffle=n_shuffle),
        max_episode_steps=max_episode_steps,
    )

In [12]:
# Curriculum stage 1: train a fresh PPO agent with n_shuffle=1.
# Training runs on a vectorized environment made of 16 copies; every copy comes
# from make_rubik_env, which applies the 10-step TimeLimit wrapper.
vec_env = make_vec_env(lambda: make_rubik_env(1, 10), n_envs=16)
model = PPO("MlpPolicy", vec_env, verbose=0, tensorboard_log="./RubiksLog")

# Train without a progress bar; metrics are logged for TensorBoard under ./RubiksLog.
# tb_log_name is the run name inside that directory (not a path), so it carries
# no "./" prefix.
model.learn(total_timesteps=int(6e4), progress_bar=False, tb_log_name="RubiksLog", log_interval=1)

# Save the agent so the next stage can resume from this checkpoint, then drop
# the in-memory copy.
model.save("RubiksCube")
del model

In [11]:
# Curriculum stage 2: resume from the stage-1 checkpoint and continue training
# with n_shuffle=2 on a vectorized environment of 16 copies (10-step limit each).
vec_env = make_vec_env(lambda: make_rubik_env(2, 10), n_envs=16)
model = PPO.load("RubiksCube", vec_env, tensorboard_log="./RubiksLog")

# tb_log_name is the run name inside ./RubiksLog (not a path), so it carries
# no "./" prefix.
model.learn(total_timesteps=int(6e5), progress_bar=False, tb_log_name="RubiksLog_2", log_interval=1)

# Save the agent under a stage-specific name, then drop the in-memory copy.
model.save("RubiksCube_2")
del model

In [26]:
# Curriculum stage 3: resume from the stage-2 checkpoint and continue training
# with n_shuffle=3 on a vectorized environment of 16 copies (10-step limit each).
vec_env = make_vec_env(lambda: make_rubik_env(3, 10), n_envs=16)
model = PPO.load("RubiksCube_2", vec_env, tensorboard_log="./RubiksLog")

# tb_log_name is the run name inside ./RubiksLog (not a path), so it carries
# no "./" prefix.
model.learn(total_timesteps=int(2e6), progress_bar=False, tb_log_name="RubiksLog_3", log_interval=1)

# Save the agent under a stage-specific name, then drop the in-memory copy.
model.save("RubiksCube_3")
del model

In [50]:
# Curriculum stage 4: continue training with n_shuffle=4 on a vectorized
# environment of 16 copies (10-step limit each).
vec_env = make_vec_env(lambda: make_rubik_env(4, 10), n_envs=16)

# NOTE(review): no cell in this notebook saves "RubiksCube_4_1"; it was
# presumably written by an earlier run of this stage. Prefer it when it is on
# disk, otherwise resume from the stage-3 checkpoint so a fresh
# Restart & Run All does not stop here with a missing-file error. Starting from
# stage 3 means one stage-4 round instead of two, so downstream evaluation
# numbers may differ from the recorded ones.
try:
    model = PPO.load("RubiksCube_4_1", vec_env, tensorboard_log="./RubiksLog")
except FileNotFoundError:
    model = PPO.load("RubiksCube_3", vec_env, tensorboard_log="./RubiksLog")

# tb_log_name is the run name inside ./RubiksLog (not a path), so it carries
# no "./" prefix.
model.learn(total_timesteps=int(3e6), progress_bar=False, tb_log_name="RubiksLog_4_2", log_interval=1)

# Save the agent under a stage-specific name, then drop the in-memory copy.
model.save("RubiksCube_4_2")
del model

In [None]:
# Curriculum stage 5: resume from the stage-4 checkpoint and continue training
# with n_shuffle=5 on a vectorized environment of 16 copies (10-step limit each).
vec_env = make_vec_env(lambda: make_rubik_env(5, 10), n_envs=16)
model = PPO.load("RubiksCube_4_2", vec_env, tensorboard_log="./RubiksLog")

# This is by far the largest step budget in the notebook (5e7 timesteps).
# tb_log_name is the run name inside ./RubiksLog (not a path), so it carries
# no "./" prefix.
model.learn(total_timesteps=int(5e7), progress_bar=False, tb_log_name="RubiksLog_5", log_interval=1)

# Save the agent under a stage-specific name, then drop the in-memory copy.
model.save("RubiksCube_5")
del model

In [56]:
# Evaluation: load the stage-4 checkpoint against an n_shuffle=4 environment
# (no TimeLimit wrapper here; the step cap is applied by the rollout itself).
env = gym.make("RubiksCube-v0", cube=RubiksCube(), n_shuffle=4)
model = PPO.load("RubiksCube_4_2", env=env)

# The model hands back the vectorized environment it is attached to; all
# resets and steps below go through it.
vec_env = model.get_env()


def solved_within(step_budget):
    """Roll out one episode with deterministic actions.

    Returns True as soon as the first environment reports a reward equal to
    100 (presumably the "solved" reward -- confirm against gym_rubiks), and
    False if that does not happen within ``step_budget`` steps.
    """
    obs = vec_env.reset()
    for _ in range(step_budget):
        action, _ = model.predict(obs, deterministic=True)
        obs, rewards, _, _ = vec_env.step(action)
        if rewards[0] == 100:
            return True
    return False


repeats = 1000
# Despite its name this is a count of solved episodes out of `repeats`.
solved_ratio = sum(solved_within(10) for _ in range(repeats))
print(f"Solved ratio: {solved_ratio}/{repeats}")

Solved ratio: 949/1000
