In [9]:
# !pip install stable_baselines3[extra] -q
# !pip install pyglet==1.5.27 -q
# !pip install -U bposd -q

In [10]:
# Append the common library for CPC codes
import os
import sys
# TODO: let's do something better here, like refactoring the common parts and
# the parts specific to each learning mechanism into a proper package.

# `!export PATH=...` only changes the environment of a throwaway subshell, so it
# has no lasting effect. Extend the kernel's own PATH instead; later `!` commands
# and subprocesses inherit it. Guarded so re-running the cell adds nothing twice.
_local_bin = os.path.expanduser("~/.local/bin")
_path_entries = [p for p in os.environ.get("PATH", "").split(os.pathsep) if p]
if _local_bin not in _path_entries:
    os.environ["PATH"] = os.pathsep.join(_path_entries + [_local_bin])

# Make the project-local modules under ./src importable (idempotent on re-run).
_src_dir = os.path.join(os.getcwd(), "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [11]:
# Sanity check: run the aff3ct binary with no arguments to confirm it is on PATH (it prints its usage text).
!aff3ct

Usage: /home/lev/.local/share/aff3ct/build/bin/aff3ct-3.0.2 -C <text> [optional args...]

[1m[3m[35mSimulation parameter(s):[0m
[1m[31m{R} [0m[1m--sim-cde-type, -C[0m[37m <text:including set={BCH|LDPC|POLAR|POLAR_MK|RA|REP|RS|RSC|RSC_DB|TPC|TURBO|TURBO_DB|UNCODED}>[0m
      Select the channel code family to simulate. 
    [1m--sim-prec, -p    [0m[37m <integer:including set={8|16|32|64}>[0m
      Specify the representation of the real numbers in the receiver part of the 
      chain. 
    [1m--sim-type        [0m[37m <text:including set={BFER|BFERI}>[0m
      Select the type of simulation (or communication chain skeleton). 

[1m[3m[35mOther parameter(s):[0m
    [1m--Help, -H        [0m
      Print the help like with the '--help, -h' parameter plus advanced 
      arguments (denoted as '{A}'). 
    [1m--help, -h        [0m
      Print the help with all the required (denoted as '{R}') and optional 
      arguments. The latter change depending on the selected sim

## Set up the RL Env

In [12]:
import os
import copy
from stable_baselines3.common.env_checker import check_env
import gym
from gym import spaces
import numpy as np
import torch
import utils
from global_params import params
from scoring import score_dataset
from CPC import cpc_code, generate_random as gen_random_cpc


def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one list (one level deep)."""
    flat = []
    for group in l:
        flat.extend(group)
    return flat


"""
Some quick thoughts:
-- Should we start with a specific code each time or always a new random code?
"""


class SwapLDPCEnv(gym.Env):
    """Gym environment in which each action flips one entry of a CPC code matrix.

    The code is held as the three matrices ``m_b``, ``m_p`` and ``m_c`` returned
    by ``gen_random_cpc.random_cpc()``. The observation is the matrix produced by
    ``cpc_code.get_classical_code_cpc`` from those three matrices.

    Reward per step:
      * ``1``  if the decoder success rate improved by more than ``reward_step``
        over the previous measurement, otherwise ``0``;
      * ``-1`` if the action selects ``m_c`` with a row index that is out of
        range for it (the matrices are left untouched).

    An episode ends when the success rate reaches ``target_succ_rate`` or after
    ``max_run_len`` steps.

    :param target_succ_rate: success rate at which the episode is considered solved.
    """

    def __init__(self, target_succ_rate=0.99):
        super().__init__()

        self.target_succ_rate = target_succ_rate
        _, m_b, m_p, m_c = gen_random_cpc.random_cpc()
        self.m_b = m_b
        self.m_p = m_p
        self.m_c = m_c
        # Pristine copies so that reset() can restore the starting code.
        self.original_m_b = copy.deepcopy(m_b)
        self.original_m_p = copy.deepcopy(m_p)
        self.original_m_c = copy.deepcopy(m_c)

        # action = [matrix selector, row index, column index]
        self.action_space = spaces.MultiDiscrete([
            # Which matrix to edit: 0 -> m_b, 1 -> m_p, 2 -> m_c.
            3,
            # Row index. For m_c, rows >= n_check_qubits are rejected with a
            # reward of -1 (presumably m_c has only n_check_qubits rows -- confirm).
            params['n_data_qubits'],
            # Column index.
            params['n_check_qubits'],
        ])
        self.last_fer = 0          # frame error rate (1 - success rate) of the latest scored step
        self.n_steps = 0           # total steps taken over the lifetime of this env
        self.current_run_len = 0   # steps taken in the current episode

        self.n_qubits = n_qubits = (params['n_data_qubits'] +
                                    params['n_check_qubits']) * 2

        # The first n qubits represent the noise distribution
        # TODO: THIS ALLOWS US TO TRAIN FOR "ADAPTIVE NOISE!!" (i.e. lets decrease connections...)
        # The quantum parity check matrix
        self.observation_space = spaces.Box(low=0, high=1,
                                            shape=(params['n_check_qubits'], n_qubits), dtype=np.uint8)

        # TODO: WHAT NUMBER???
        self.max_run_len = 150
        # Minimum success-rate improvement that earns a reward of 1.
        self.reward_step = 0.008

        # Baseline success rate of the starting code; the first step's reward is relative to it.
        self.last_wsr = self._score(self._current_code())

    def _current_code(self):
        """Return the classical code built from the current m_b, m_p and m_c."""
        return cpc_code.get_classical_code_cpc(self.m_b, self.m_p, self.m_c)

    def _sample_p_fails(self):
        """Draw one error rate uniformly from the configured range and use it for every qubit."""
        rate = np.random.uniform(low=params['constant_error_rate_lower'],
                                 high=params['constant_error_rate_upper'])
        return np.ones(self.n_qubits) * rate

    def _score(self, code_pc):
        """Run the decoder on ``code_pc`` with freshly sampled failure probabilities."""
        # TODO: p_fail??
        return score_dataset.run_decoder(code_pc, self._sample_p_fails())

    def _advance_step_counters(self):
        """Count one step; return True when the episode has reached ``max_run_len``."""
        self.n_steps += 1
        self.current_run_len += 1
        if self.current_run_len >= self.max_run_len:
            self.current_run_len = 0
            return True
        return False

    def step(self, action):
        """Flip the selected matrix entry, re-score the code and return the transition.

        :param action: ``[selector, row, col]`` as described by ``action_space``.
        :return: ``(obs, reward, done, info)`` in the classic 4-tuple gym format.
        :raises ValueError: if ``selector`` is not 0, 1 or 2.
        """
        selector, row, col = action[0], action[1], action[2]

        if selector == 0:
            self.m_b[row, col] = 1 - self.m_b[row, col]
        elif selector == 1:
            self.m_p[row, col] = 1 - self.m_p[row, col]
        elif selector == 2:
            if row >= params['n_check_qubits']:
                # Invalid row for m_c: penalise without modifying the code. The step
                # still counts, so a stream of invalid actions cannot keep an episode
                # alive past max_run_len.
                truncated = self._advance_step_counters()
                return self._current_code(), -1, truncated, {}
            self.m_c[row, col] = 1 - self.m_c[row, col]
        else:
            # Raising a plain string is itself a TypeError in Python 3.
            raise ValueError("Undefined selector action")

        code_pc = self._current_code()
        succ_rate = self._score(code_pc)
        self.last_fer = 1 - succ_rate

        # TODO: scaling?
        reward = 1 if succ_rate > self.last_wsr + self.reward_step else 0
        self.last_wsr = succ_rate

        truncated = self._advance_step_counters()
        done = bool(succ_rate >= self.target_succ_rate) or truncated
        return code_pc, reward, done, {}

    def reset(self):
        """Restore the starting code and per-episode state; return the initial observation."""
        self.m_b = copy.deepcopy(self.original_m_b)
        self.m_p = copy.deepcopy(self.original_m_p)
        self.m_c = copy.deepcopy(self.original_m_c)
        # A solved episode would otherwise leave a stale step count behind and
        # truncate the next episode early.
        self.current_run_len = 0
        code_pc = self._current_code()
        # Re-baseline so the first reward of the new episode is measured against
        # the starting code rather than the last code of the previous episode.
        self.last_wsr = self._score(code_pc)
        return code_pc

    def render(self, mode='console'):
        """Rendering is not implemented."""
        pass

    def close(self):
        """Nothing to release."""
        pass


# Build a single environment and let stable-baselines3 validate it before training (warn=True also reports non-fatal issues).
env = SwapLDPCEnv()
check_env(env, warn=True)


## Begin Training

In [13]:
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
# from stable_baselines3.common import set_random_seed, make_vec_env

model_type = "PPO"
check_env(env, warn=True)
# Must be an f-string: without the prefix the logs are written to a directory
# literally named "{model_type}-tensorboard".
tf_logs = f"./logs/{model_type}-tensorboard"

def make_env(rank, seed=0):
    """
    Build a zero-argument factory that creates one seeded ``SwapLDPCEnv``.

    The factory, not an environment, is returned so that the environment can be
    constructed lazily by whoever calls it (e.g. inside a worker process).

    :param rank: (int) index of the subprocess; added to ``seed`` so that each
        worker is seeded differently
    :param seed: (int) the base seed for the RNG
    :return: callable taking no arguments and returning the new environment
    """
    def _build_worker_env():
        worker_env = SwapLDPCEnv()
        worker_env.seed(seed + rank)
        return worker_env

    return _build_worker_env

num_cpu = 4

# One SwapLDPCEnv per subprocess; each worker is seeded by its rank.
# (Single-process alternative: make_vec_env(lambda: env, n_envs=1).)
env = SubprocVecEnv([make_env(worker_rank) for worker_rank in range(num_cpu)])

loading_saved = False

if loading_saved:
    # TODO!
    model = PPO.load(utils.get_most_recent_model_path_rl(), env=env, print_system_info=True)
else:
    model = PPO("MlpPolicy", env=env, tensorboard_log=tf_logs)

In [14]:
## Setup the callbacks
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback, CallbackList, BaseCallback
import numpy as np
import json
import tensorflow as tf

class TensorboardCallback(BaseCallback):
    """
    Custom callback that logs the mean frame error rate of the training
    environments to TensorBoard on every step.

    :param verbose: (int) verbosity level forwarded to ``BaseCallback``.
    :param log_dir: (str) directory for the TensorFlow summary writer. Defaults
        to the notebook-level ``tf_logs`` so that ``TensorboardCallback()``
        keeps working unchanged.
    """
    def __init__(self, verbose=0, log_dir=None):
        self.is_tb_set = False
        super().__init__(verbose)
        # Resolve the default at call time so the callback follows the current tf_logs.
        self.writer = tf.summary.create_file_writer(
            tf_logs if log_dir is None else log_dir)  ## TODO?

    def _on_step(self) -> bool:
        """Write the averaged ``last_fer`` of all sub-environments; always continue training."""
        vec_env = self.model.get_env()
        # With no indices, get_attr returns the attribute from every
        # sub-environment, so this no longer depends on the global `num_cpu`.
        fers = vec_env.get_attr("last_fer")
        step_counts = vec_env.get_attr("n_steps")

        fer = sum(fers) / len(fers)
        # The x-axis is the mean number of steps taken across the sub-environments.
        n_steps_avg = int(sum(step_counts) / len(step_counts))

        with self.writer.as_default():
            tf.summary.scalar('Frame Error Rate', fer, step=n_steps_avg)
            self.writer.flush()
        return True


# From https://stable-baselines.readthedocs.io/en/master/guide/examples.html
class SaveModelOnTraining(BaseCallback):
    """
    Callback that saves the model every ``params['rl_save_model_freq']`` calls
    and records the current call count in a JSON side file.

    The save location comes from ``utils.get_most_recent_model_path_rl()`` and
    the JSON location from ``utils.get_most_recent_model_path_rl_info()``.

    :param verbose: (int) verbosity level forwarded to ``BaseCallback``.
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.check_freq = params['rl_save_model_freq']
        self.save_path = utils.get_most_recent_model_path_rl()
        self.best_mean_reward = -np.inf  # currently unused

    def _init_callback(self) -> None:
        # Create folder if needed
        # NOTE(review): this creates a directory at the same path that is later
        # passed to model.save(); confirm that this is the intended layout.
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """Save the model and its metadata on every ``check_freq``-th call; always continue training."""
        if self.n_calls % self.check_freq == 0:
            self.model.save(self.save_path)
            # Both values must be passed to format(); previously n_calls sat
            # outside the call and the two-placeholder template raised IndexError.
            print("Saving new model to {} for step {}".format(self.save_path, self.n_calls))
            with open(utils.get_most_recent_model_path_rl_info(), 'w') as f:
                data = {
                    "n_steps": self.n_calls,
                    # "last_fer":
                }
                json.dump(data, f)
        return True


# TODO: save everything else (e.g. approx KL) for the TensorBoard logs as well.
# Only the TensorBoard logger is active for now.
callback_list = CallbackList(
    callbacks=[TensorboardCallback()],
)

In [15]:
# Short training run of 10,000 timesteps with the callbacks configured in the notebook.
model.learn(total_timesteps=10_000, callback=callback_list)
# Longer schedule (disabled): train in 25 chunks of 100,000 timesteps and save after each chunk.
# for i in range(25):
# 	model.learn(total_timesteps=100_000, callback=callback_list)
# 	model.save(utils.get_most_recent_model_path_rl())

(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!
(WW) Found in Alist file connections of degree 0!


In [None]:
# Persist the trained model at the path reported by utils.get_most_recent_model_path_rl().
model.save(utils.get_most_recent_model_path_rl())