In [36]:
%load_ext autoreload
%autoreload 2

import numpy as np
import os
import qiskit
from gymnasium import spaces
from stable_baselines3 import PPO, A2C, DQN, TD3
from stable_baselines3.common.env_util import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_checker import check_env

from qiskit.quantum_info import random_density_matrix, random_statevector, DensityMatrix
from adaptive_qst.plotting import PlotOneQubit
from adaptive_qst.max_info import Posterior, HiddenState
from adaptive_qst.rl_qst import AQSTEnv
import matplotlib.pyplot as plt
from numpy import pi
from qiskit.quantum_info import state_fidelity

from numpy import sqrt
from numpy.linalg import cholesky

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Environment Test

In [10]:
# Build an environment with AQSTEnv's default constructor arguments.
# AQSTEnv is already imported in the notebook's top import cell, so the
# duplicate import that used to live in this cell has been dropped.
env = AQSTEnv()

In [14]:
# Run stable-baselines3's environment checker (stable_baselines3.common.env_checker)
# on `env`, with warnings enabled via warn=True.
check_env(env, warn=True)

In [20]:
# Manual smoke test: build a fresh environment, print its spaces, one sampled
# action and the hidden state, then drive it for `n_steps` random steps.
env = AQSTEnv()
obs, _ = env.reset()
n_steps = 20
print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())
print(env.hidden_state.hidden_state)

for step in range(n_steps):
    print(f"Step {step + 1}")

    # Sample a new random action on every step. `action` was previously never
    # assigned in this cell, so it only ran when a stale value happened to be
    # left in the kernel (NameError after Restart & Run All).
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    print("obs=", obs[:10], "reward=", reward)

    # Do not keep stepping a finished episode: start a new one instead.
    if terminated or truncated:
        obs, _ = env.reset()

Box(-1.0, 1.0, (120,), float32)
Box(0.0, 3.1415927, (2,), float32)
[1.7335824 1.6154512]
[[ 0.82832257-3.58733815e-18j -0.19614041+2.58214613e-01j]
 [-0.19614041-2.58214613e-01j  0.17167743+3.58733815e-18j]]
Step 1
obs= [0.6144465  0.7037841  0.34106225 0.43639508 0.549855   0.35525948
 0.25307056 0.4547204  0.8907877  0.05525317] reward= 0.7590576576174881
Step 2
obs= [0.6144465  0.7037841  0.34106225 0.43639508 0.549855   0.35525948
 0.25307056 0.4547204  0.8907877  0.05525317] reward= 0.7814775300678963
Step 3
obs= [0.6144465  0.7037841  0.34106225 0.43639508 0.549855   0.35525948
 0.25307056 0.4547204  0.8907877  0.05525317] reward= 0.7935629417016612
Step 4
obs= [0.6144465  0.7037841  0.34106225 0.43639508 0.549855   0.35525948
 0.25307056 0.4547204  0.8907877  0.05525317] reward= 0.7996067658445755
Step 5
obs= [0.6144465  0.7037841  0.34106225 0.43639508 0.549855   0.35525948
 0.25307056 0.4547204  0.8907877  0.05525317] reward= 0.8024857775532155
Step 6
obs= [0.6144465  0.703784

## Training the Model: Run 1
- Reward: $-\ln(1 - \text{fidelity})$
- Episodes of 100 measurements each
- Policy network: the default MLP architecture

In [40]:
# Run 1: train an A2C agent with the "MlpPolicy" policy on 100-measurement
# episodes, logging to TensorBoard under `tb_log_dir`.
base_env = AQSTEnv(n_measurements = 100)
# Keep a handle on the environment's hidden state before the env is wrapped.
hidden_state = base_env.hidden_state
# The factory closes over `base_env`, not `env`: `env` is rebound to the
# vectorised wrapper on this very line, so a lambda capturing `env` would
# return the wrapper itself if it were ever called after the assignment.
env = DummyVecEnv([lambda: Monitor(base_env)])

tb_log_dir = "tb_logs/1_qubit_aqst"

model = A2C("MlpPolicy", env, tensorboard_log = tb_log_dir)
model.learn(50_000, tb_log_name="A2C")

<stable_baselines3.a2c.a2c.A2C at 0x19afc9ca880>

In [42]:
# Persist the Run 1 model. `save_dir` is created on demand and is reused by
# the later checkpoint cells.
save_dir = "models/rl_qst"
os.makedirs(save_dir, exist_ok=True)

run_1_checkpoint = f"{save_dir}/default_mlp_50_000"
model.save(run_1_checkpoint)

In [48]:
# Do some more training, starting from a saved checkpoint.
#
# Resume from the "_1" checkpoint when an earlier continuation round has
# already written it; otherwise fall back to the initial 50k-step model. The
# "_1" file is only produced by the save cell that follows this one, so
# loading it unconditionally fails on a fresh Restart & Run All.
initial_checkpoint = f"{save_dir}/default_mlp_50_000"
continued_checkpoint = f"{save_dir}/default_mlp_50_000_1"
# stable-baselines3 appends ".zip" when saving to a path without an extension.
if os.path.exists(f"{continued_checkpoint}.zip"):
    resume_from = continued_checkpoint
else:
    resume_from = initial_checkpoint
loaded_model = A2C.load(resume_from)

# Fresh 100-measurement environment for the additional training. The raw env
# has its own name so the factory lambda does not capture `env`, which is
# rebound to the vectorised wrapper on the next line.
continued_env = AQSTEnv(n_measurements = 100)
env = DummyVecEnv([lambda: Monitor(continued_env)])

loaded_model.set_env(env)
loaded_model.learn(50_000, tb_log_name="A2C")

<stable_baselines3.a2c.a2c.A2C at 0x19afcc55c70>

In [47]:
# Write `loaded_model` to the "default_mlp_50_000_1" checkpoint inside `save_dir`.
loaded_model.save(f"{save_dir}/default_mlp_50_000_1")