In [1]:
from deepsudoku import REPO_PATH, DIFFICULTIES
import os
os.chdir(REPO_PATH)

import tensorflow as tf
import numpy as np
from deepsudoku.reinforcement_learning.ppo import *

import gymnasium as gym
from deepsudoku.verify_sudoku import *

2023-08-25 17:41:13.235447: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Sudoku Environment avaible at gymnasium as 'Sudoku-v0'.


In [2]:
import random

class PPO_MultiDiscrete_Cartpole_Wrapper:
    """Rollout collector running vectorized CartPole-v1 behind a multi-discrete action interface.

    The policy handed to `sample` / `collect_trajectories` is called on the
    batched observations and must return log-probabilities indexed as
    [environment][action-space][choice]. CartPole has a single action space,
    so only column 0 of the sampled actions is sent to the environment.
    """

    def __init__(self, NUM_ENVS, **env_kwargs):
        """Create and reset the vectorized environments.

        Args:
            NUM_ENVS: number of parallel CartPole-v1 copies.
            **env_kwargs: extra keyword arguments forwarded to
                `gym.vector.make` (previously accepted but silently ignored).
        """
        self.num_envs = NUM_ENVS
        # We use vectorized environments (Implementation Detail 1)
        self.envs = gym.vector.make('CartPole-v1', num_envs=NUM_ENVS, **env_kwargs)
        self.current_state, _ = self.envs.reset()

    def sample(self, model):
        """Sample one action per environment from `model` and advance every environment one step.

        Args:
            model: callable mapping the batched observations to log-probabilities
                indexed as [environment][action-space][choice].

        Returns:
            Tuple `(old_observation, action, reward, new_observation, done, log_prob)`.
            `action` has one column per action space, `log_prob` is the summed
            log-probability of the sampled action of each environment, and
            `done` is True where the episode ended by termination OR truncation.
        """
        old_observation = self.current_state
        # The model outputs log-probabilities (not Q-values); exponentiate to get
        # the categorical distribution we sample from.
        all_log_probs = model(old_observation)
        probs = np.exp(all_log_probs)
        # Implementing Multi-Discrete Action spaces:
        # probs = [environments][action-spaces][probabilities]
        # We need to sample for each environment from all action-spaces
        # NOTE: This assumes that all action-spaces look like [0,1,2,3,...]
        # In particular, for the sudoku case, the third action space represents the number we want to fill in
        # This number will be one less than the actual digit that gets filled in
        # This is accounted for and implemented like this in our custom environment
        action = np.array([
            [np.random.choice(np.arange(len(space_probs)), p=space_probs) for space_probs in env_probs]
            for env_probs in probs
        ])

        # Pick the log-probability of each sampled choice, then sum over the
        # action spaces to get the joint log-probability per environment.
        log_prob = tf.gather(all_log_probs, action, batch_dims=2)
        log_prob = tf.reduce_sum(log_prob, axis=-1).numpy()

        # CartPole only has the first action space, hence column 0.
        new_observation, reward, terminated, truncated, _ = self.envs.step(action[:, 0])
        # The time limit ends (truncates) an episode as well, and the vector
        # env has then already reset that sub-environment, so a truncation
        # must be reported as an episode boundary too.
        done = np.logical_or(terminated, truncated)

        self.current_state = new_observation  # update current state after environment did step
        return (old_observation, action, reward, new_observation, done, log_prob)

    def collect_trajectories(self, model, length):
        """Roll all environments forward and gather the transitions into arrays.

        At least one step is always taken, even if `length` is smaller than 1.

        Args:
            model: policy passed through to `sample`.
            length: number of steps to collect per environment.

        Returns:
            `(data, new_obs)` where `data` maps "observations", "actions",
            "rewards", "terminateds" and "log_prob" to arrays whose axis 0 is
            the environment and axis 1 the time step (also for a single step),
            and `new_obs` is the observation returned by the last step.
        """
        observations, actions, rewards, dones, log_probs = [], [], [], [], []
        new_obs = None
        # Buffer the steps in lists and stack once at the end instead of
        # re-copying every array on every step.
        for _ in range(max(1, length)):
            old_obs, act, rew, new_obs, done, log_prob = self.sample(model)
            observations.append(old_obs)
            actions.append(act)
            rewards.append(rew)
            dones.append(done)
            log_probs.append(log_prob)

        data = {"observations": np.stack(observations, axis=1),
                "actions": np.stack(actions, axis=1),
                "rewards": np.stack(rewards, axis=1),
                "terminateds": np.stack(dones, axis=1),
                "log_prob": np.stack(log_probs, axis=1)}
        return data, new_obs



In [3]:
# Rollout collector over five CartPole-v1 copies that are stepped in parallel.
envs = PPO_MultiDiscrete_Cartpole_Wrapper(NUM_ENVS=5)

In [4]:
# Create models

# Implementation Detail 2: Orthogonal Initialization of hidden weights and constant initialization of biases and output weights
# Biases need no explicit initializer here: the Dense layers below keep Keras' default bias initializer.
# Each factory returns a fresh initializer object, so no initializer instance is shared between layers.
def hidden_ortho_init():
    """Orthogonal initializer (gain sqrt(2)) for hidden layers."""
    return tf.keras.initializers.Orthogonal(gain=np.sqrt(2))


def pol_out_ortho_init():
    """Orthogonal initializer (gain 0.01) for the policy output layer."""
    return tf.keras.initializers.Orthogonal(gain=0.01)


def val_out_ortho_init():
    """Orthogonal initializer (gain 1) for the value output layer."""
    return tf.keras.initializers.Orthogonal(gain=1)

def create_policy_model():
    """Build the policy network "pi" for CartPole.

    Maps a 4-dimensional observation to log-probabilities over the 2 actions.
    The output is reshaped to (1, 2), i.e. [action-space][choice] with a single
    action space, matching the multi-discrete layout the sampler indexes.

    Returns:
        An uncompiled `tf.keras.Model` named "pi".
    """
    # `(4)` is just the int 4; the shape must be a tuple, hence `(4,)`.
    inputs = tf.keras.Input(shape=(4,))
    x = tf.keras.layers.Flatten()(inputs)
    x = tf.keras.layers.Dense(128, activation="tanh", kernel_initializer=hidden_ortho_init())(x)
    x = tf.keras.layers.Dense(128, activation="tanh", kernel_initializer=hidden_ortho_init())(x)
    # log_softmax: the model emits log-probabilities, which the sampler exponentiates.
    x = tf.keras.layers.Dense(2, activation="log_softmax", kernel_initializer=pol_out_ortho_init())(x)
    outputs = tf.keras.layers.Reshape((1, 2))(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs, name="pi")
    return model

def create_value_function_model():
    """Build the state-value network "V" for CartPole.

    Maps a 4-dimensional observation to a single linear output through one
    32-unit ReLU hidden layer.

    Returns:
        An uncompiled `tf.keras.Model` named "V".
    """
    # `(4)` is just the int 4; the shape must be a tuple, hence `(4,)`.
    inputs = tf.keras.Input(shape=(4,))
    x = tf.keras.layers.Flatten()(inputs)
    x = tf.keras.layers.Dense(32, activation="relu", kernel_initializer=hidden_ortho_init())(x)
    # NOTE: a second, identical 32-unit ReLU layer used to sit here but was disabled.
    outputs = tf.keras.layers.Dense(1, activation="linear", kernel_initializer=val_out_ortho_init())(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs, name="V")
    return model


In [5]:
# Run PPO on the CartPole collector with freshly built networks; the two
# returned objects are bound to `pi` and `V`.
# The per-epoch learning-rate decay is 0, so the rate stays at its start value.
pi, V = PPO(
    envs,
    create_policy_model(),
    create_value_function_model(),
    TRAIN_EPOCHS=1000,
    LEARNING_RATE_DECAY_PER_EPOCH=0,
    LEARNING_RATE_START=0.001,
)

2023-08-25 17:41:18.637277: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


epoch:  0  ; KL:  0  ; LR:  0.001  ; MR:  0
Collection
Tapework
epoch:  1  ; KL:  0.0018327674  ; LR:  0.001  ; MR:  1.0
Collection
Tapework
epoch:  2  ; KL:  0.004429827  ; LR:  0.001  ; MR:  1.0
Collection
Tapework
epoch:  3  ; KL:  0.0020111117  ; LR:  0.001  ; MR:  1.0
Collection
Tapework
epoch:  4  ; KL:  0.0027630597  ; LR:  0.001  ; MR:  1.0
Collection
Tapework
epoch:  5  ; KL:  0.00218419  ; LR:  0.001  ; MR:  1.0
Collection
Tapework
epoch:  6  ; KL:  0.0030257185  ; LR:  0.001  ; MR:  1.0
Collection
Tapework
epoch:  7  ; KL:  0.0028139409  ; LR:  0.001  ; MR:  1.0
Collection
Tapework
epoch:  8  ; KL:  0.0015623911  ; LR:  0.001  ; MR:  1.0
Collection
Tapework
epoch:  9  ; KL:  0.002095902  ; LR:  0.001  ; MR:  1.0
Collection
Tapework
epoch:  10  ; KL:  0.0007709952  ; LR:  0.001  ; MR:  1.0
Collection
Tapework
epoch:  11  ; KL:  0.00084699807  ; LR:  0.001  ; MR:  1.0
Collection
Tapework
epoch:  12  ; KL:  0.0022357416  ; LR:  0.001  ; MR:  1.0
Collection
Tapework
epoch:  13  

In [6]:

# Single (non-vectorized) CartPole-v1 instance created with render_mode='human',
# used to run the policy outside of the training collector.
test_env = gym.make('CartPole-v1', render_mode='human')
# reset() returns (observation, info); `obs` is the starting observation for the rollout loop below.
obs, inf = test_env.reset()

In [14]:
# Starts a fresh episode by hand. NOTE(review): this cell was executed out of order
# (execution count 14); on a clean top-to-bottom run it only repeats the reset
# already done when `test_env` was created.
obs, inf = test_env.reset()

In [7]:
# Run the policy greedily for 1000 environment steps.
for _step in range(1000):
    # Add a batch axis; the policy output is indexed [batch][action-space][choice].
    qs = pi(tf.expand_dims(obs, 0))
    # Greedy action: the most likely choice of every action space.
    act = [np.argmax(subspace) for subspace in qs[0]]
    obs, reward, terminated, truncated, _ = test_env.step(act[0])
    # An episode is over on termination OR on truncation (CartPole-v1's time
    # limit); in both cases the environment has to be reset before stepping on.
    if terminated or truncated:
        obs, _ = test_env.reset()

: 