In [5]:
!pip install stable_baselines3[extra] -q
!pip install pyglet==1.5.27 -q
!pip install -U bposd -q

In [6]:
# Make the common library for CPC codes (under ./src) importable.
import os
import sys

# TODO: let's do something better here, like refactoring the common parts and
# the different learning-mechanism parts into a proper package.
_src_dir = os.path.join(os.getcwd(), "src")
# Guard against duplicates so re-running this cell does not keep growing sys.path.
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

## Setup the RL Env

In [9]:
import os
from stable_baselines3.common.env_checker import check_env
import gym
from gym import spaces
import numpy as np
import torch
import utils
from global_params import params
from scoring import score_dataset
from CPC import cpc_code, generate_random as gen_random_cpc


def flatten(l):
    """Return one flat list holding the items of every sub-iterable in ``l``, in order."""
    merged = []
    for chunk in l:
        merged.extend(chunk)
    return merged


"""
Some quick thoughts:
-- Should we start with a specific code each time or always a new random code?
"""


class SwapLDPCEnv(gym.Env):
    """Gym environment whose actions flip single entries of a CPC code's matrices.

    The environment holds three matrices (``m_b``, ``m_p``, ``m_c``) produced by
    ``gen_random_cpc.random_cpc()``. Every step flips one entry of one matrix,
    rebuilds the code with ``cpc_code.get_classical_code_cpc`` and scores it
    with ``score_dataset.run_decoder``; the decoder's success rate is the reward.

    :param target_succ_rate: success rate at or above which an episode is
        reported as done.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, target_succ_rate=0.99):
        super(SwapLDPCEnv, self).__init__()

        self.target_succ_rate = target_succ_rate
        _, self.m_b, self.m_p, self.m_c = gen_random_cpc.random_cpc()

        # Each action is a triple (selector, row, col):
        #   selector -- which matrix to modify: 0 -> m_b, 1 -> m_p, 2 -> m_c
        #   row      -- first index of the entry to flip, in [0, n_data_qubits)
        #   col      -- second index of the entry to flip, in [0, n_check_qubits)
        # When m_c is selected, a row index >= n_check_qubits is rejected in
        # ``step`` with a reward of -1.
        self.action_space = spaces.MultiDiscrete([
            3,
            params['n_data_qubits'],
            params['n_check_qubits'],
        ])
        self.last_fer = 0
        self.n_steps = 0

        self.n_qubits = params['n_data_qubits'] + params['n_check_qubits']
        flattened_pc_size = 2 * self.n_qubits * params['n_check_qubits']

        # The first n_qubits entries of an observation are the per-qubit failure
        # probabilities; the remainder is the flattened parity check matrix.
        # TODO: THIS ALLOWS US TO TRAIN FOR "ADAPTIVE NOISE!!" (i.e. lets decrease connections...)
        # NOTE(review): assumes get_classical_code_cpc returns exactly
        # flattened_pc_size entries -- confirm against cpc_code.
        self.observation_space = spaces.Box(low=0.0, high=1.0,
                                            shape=(self.n_qubits + flattened_pc_size,),
                                            dtype=np.float32)

    def _sample_p_fails(self):
        """Draw one uniform error rate and broadcast it to every qubit."""
        rate = np.random.uniform(low=params['constant_error_rate_lower'],
                                 high=params['constant_error_rate_upper'])
        return np.ones(self.n_qubits) * rate

    def _current_code_pc(self):
        """Build the code from the current ``m_b``, ``m_p`` and ``m_c``."""
        return cpc_code.get_classical_code_cpc(self.m_b, self.m_p, self.m_c)

    @staticmethod
    def _make_obs(p_fails, code_pc):
        """Concatenate failure probabilities and the flattened code as float32."""
        flattened = np.array(code_pc).astype(np.float32).flatten()
        return np.concatenate((p_fails, flattened)).astype(np.float32)

    def step(self, action):
        """Flip the matrix entry chosen by ``action`` and score the resulting code.

        :param action: indexable triple ``(selector, row, col)`` as described by
            ``action_space``.
        :return: ``(obs, reward, done, info)``. ``reward`` is the decoder's
            success rate, or -1 when ``m_c`` is selected with an out-of-range
            row. ``done`` is True once the success rate reaches
            ``target_succ_rate``.
        :raises ValueError: if ``selector`` is not 0, 1 or 2.
        """
        p_fails = self._sample_p_fails()
        selector, row, col = int(action[0]), int(action[1]), int(action[2])

        matrices = {0: self.m_b, 1: self.m_p, 2: self.m_c}
        if selector not in matrices:
            # Raising a plain string is itself a TypeError in Python 3.
            raise ValueError("Undefined selector action: {}".format(selector))

        if selector == 2 and row >= params['n_check_qubits']:
            # Invalid entry for m_c: leave the code untouched and penalise.
            return self._make_obs(p_fails, self._current_code_pc()), -1.0, False, {}

        target = matrices[selector]
        target[row, col] = 1 - target[row, col]

        code_pc = self._current_code_pc()
        # TODO: p_fail??
        succ_rate = score_dataset.run_decoder(code_pc, p_fails)

        self.last_fer = 1 - succ_rate

        # TODO: scaling?
        # Coerce to builtin types so the gym API checks see a float and a bool.
        reward = float(succ_rate)
        done = bool(succ_rate >= self.target_succ_rate)

        # Update global parameters
        self.n_steps += 1

        return self._make_obs(p_fails, code_pc), reward, done, {}

    def reset(self):
        """Start a new episode from a freshly generated random code.

        :return: the initial observation only (reward, done and info are not
            part of the return value).
        """
        p_fails = self._sample_p_fails()
        _, self.m_b, self.m_p, self.m_c = gen_random_cpc.random_cpc()
        return self._make_obs(p_fails, self._current_code_pc())

    def render(self, mode='console'):
        """Rendering is not implemented; this is a no-op."""
        pass

    def close(self):
        """No resources to release; this is a no-op."""
        pass


# Instantiate the environment once and run stable-baselines3's env checker on it
# (warn=True also prints its non-fatal warnings).
env = SwapLDPCEnv()
check_env(env, warn=True)


## Begin Training

In [12]:
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.env_util import make_vec_env

# Build a fresh environment and validate it before handing it to the learner.
base_env = SwapLDPCEnv()
model_type = "PPO"
check_env(base_env, warn=True)
# f-string so the algorithm name is actually substituted into the log path
# (previously the directory was literally named "{model_type}-tensorboard").
tf_logs = f"./logs/{model_type}-tensorboard"

# Wrap the single environment in a vectorised env (n_envs=1). A separate name is
# used for the raw env so the factory lambda does not close over a variable that
# is rebound on the same line.
env = make_vec_env(lambda: base_env, n_envs=1)
loading_saved = False
model = None
if not loading_saved:
    model = PPO("MlpPolicy", env=env, tensorboard_log=tf_logs)
else:
    # TODO!
    model = PPO.load(utils.get_best_scoring_model_path_rl(), env=env, print_system_info=True)

In [18]:
## Setup the callbacks
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback, CallbackList, BaseCallback
import numpy as np
import json
import tensorflow as tf

class TensorboardCallback(BaseCallback):
    """
    Callback that records the environment's latest frame error rate in TensorBoard.
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.is_tb_set = False
        # Summaries are written to the notebook-level `tf_logs` directory.
        self.writer = tf.summary.create_file_writer(tf_logs) ## TODO?

    def _on_step(self) -> bool:
        # The first (and only) wrapped environment exposes the values to log.
        wrapped_env = self.model.get_env().envs[0]
        latest_fer = wrapped_env.last_fer
        with self.writer.as_default():
            tf.summary.scalar('Frame Error Rate', latest_fer, step=wrapped_env.n_steps)
            self.writer.flush()
        # Returning True tells the training loop to continue.
        return True


# From https://stable-baselines.readthedocs.io/en/master/guide/examples.html
class SaveModelOnTraining(BaseCallback):
    """
    Callback that saves the model every ``check_freq`` calls of ``_on_step``.

    ``check_freq`` is read from ``params['rl_save_model_freq']`` and the save
    path from ``utils.get_most_recent_model_path_rl()``. Alongside each save, a
    JSON file recording the step count is written to
    ``utils.get_most_recent_model_path_rl_info()``.

    :param verbose: (int) Verbosity level passed on to ``BaseCallback``.
    """
    def __init__(self, verbose=0):
        super(SaveModelOnTraining, self).__init__(verbose)
        self.check_freq = params['rl_save_model_freq']
        self.save_path = utils.get_most_recent_model_path_rl()
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Create folder if needed
        # NOTE(review): this creates a directory at the save path itself, not at
        # its parent -- confirm that is what utils' path layout expects.
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            self.model.save(self.save_path)
            # Both values must be passed to format(); passing only one raised
            # IndexError for the second placeholder.
            print("Saving new model to {} for step {}".format(self.save_path, self.n_calls))
            with open(utils.get_most_recent_model_path_rl_info(), 'w') as f:
                data = {
                    "n_steps": self.n_calls,
                    # "last_fer":
                }
                json.dump(data, f)
        # Always continue training.
        return True


# Callbacks passed to model.learn(). Only the TensorBoard logger is registered;
# SaveModelOnTraining is defined above but is not added to this list.
callback_list = CallbackList([TensorboardCallback()])

In [19]:
# Train in rounds, saving the most recent model after every round.
for round_idx in range(25):
    # Reset the timestep counter only on the first round; later rounds continue
    # it, so logging forms one continuous run instead of restarting each time.
    model.learn(total_timesteps=10_000, callback=callback_list,
                reset_num_timesteps=(round_idx == 0))
    model.save(utils.get_most_recent_model_path_rl())