In [19]:
import matplotlib.pyplot as plt
import numpy as np

from freq_stacking_LFM_ACF_utils import *

from scipy.fft import fft, ifft, fftfreq, fftshift
import scipy.signal

from tensorforce.environments import Environment
from tensorforce.agents import Agent

# Define Constant parameters

In [20]:
# min and max RF center freqs in Hz
fc_min = 500e6
fc_max = 1.48e9

delta_coeff = 0.01

# min and max Bws in Hz
Bmin = 1e6
Bmax = 20e6
delta_B = 1e6


max_delay_time = 5e-6
delay_time = 0.0

# chirp rate
chirp_rate = 50e6/10e-6


num_subpulses = 50

# maximum episode length
max_episode_timesteps = 3000

training_num = 200 # how many trajectories 



# Derived Parameters

In [21]:
Fs_baseband = Bmax  # baseband samp freq in Hz
Fs_SWW_max = fc_max + Bmax / 2 - (fc_min - Bmax / 2)

# time window size
max_subpulse_duration = Bmax / chirp_rate
time_window_size = 10 * max_subpulse_duration + max_delay_time


# compute the state vector size
LFM_rx_subpulses, LFM_ref_subpulses = generate_LFM_rx_ref_subpulses_for_ACF(
    BW_RF_array=np.array([20e6]),
    chirp_rate=chirp_rate,
    time_window_size=time_window_size,
    Fs_baseband=Fs_baseband
)

N_max = compute_Nup_f(LFM_rx_subpulses[0], Fs_baseband, Fs_SWW_max) # the state vector size

# num_subpulses = int( Fs_SWW_max / Bmax * 1.2) # number of subpulses: may be a little bit larger than the none-overlapping case


# Radar Environment Setup

In [22]:
class ACF_Env(Environment):

    ####################################################################
    # Required methods defs
    ####################################################################
    def __init__(
        self,
        num_subpulses,
        fc_min,
        fc_max,
        delta_coeff,
        Bmin,
        Bmax,
        delta_B,
        chirp_rate,
        time_window_size,
        Fs_baseband,
    ):
        super().__init__()
        
        self.num_subpulses = num_subpulses
        self.fc_min = fc_min
        self.fc_max = fc_max
   
        self.Bmin = Bmin
        self.Bmax = Bmax
        self.delta_B = delta_B

        self.delta_coeff = delta_coeff
    
        self.chirp_rate = chirp_rate

        self.time_window_size = time_window_size
        self.Fs_baseband = Fs_baseband

        self._current_best_sww_performance = 0 # the ISLR or PSL, i.e. the criteria of the waveform

    def states(self):
        return dict(
            type="float", shape=(2, self.num_subpulses)
        )  # the first row is the RF center freqs, and the second row is the BWs;
        # the actions are normalized to [0, 1]

    def actions(self):
        return dict(
            type="float", shape=(2, self.num_subpulses), min_value=-1, max_value=1
        )  


    def reset(self):
        """Reset state."""
        # state = np.random.random(size=(1,))
        self.timestep = 0
        self.current_state = np.zeros((2, self.num_subpulses))
        self.current_state[0] = 1
        self.current_state[1] =self.Bmax

        return self.current_state

    def execute(self, actions):
        """[summary] Executes the given action(s) and advances the environment by one step.

        The execute method implements the "simulator": how the environment reacts to an action
            1. Increment timestamp
            2. Update the current state: next_state <-- f(current_state, actions) (implement state transition)
            3. Compute the reward accociated with the new state

        returns state, terminal, reward
        """

        # increment timestep
        self.timestep += 1

        # compute the next ACF and next reward
        next_state, next_reward = self.compute_next_state_and_reward(actions)

        # update the current ACF and reward
        self.current_state = next_state
        reward = next_reward

        terminal = False  # maybe implement like this: if converge, terminal = True

        return self.current_state, terminal, reward

    ####################################################################
    # Helper functions
    ####################################################################

    # helper function that implements the env model/simulator
    def compute_next_state_and_reward(self, actions):
        """[summary] this method implements how env reacts to action:
            action --> state

            and also computes the reward associated with the state

        Args:
            actions ([type]): [description] the normalized actions

        Returns:
            [type]: [description] the the next_state and next_reward
        """
        
        next_state = np.zeros_like(actions)
        next_state[0] = self.current_state[0] + self.delta_coeff * actions[0] # compute center freqs
        next_state[1] = self.current_state[1] + self.delta_B * actions[1] # compute BWs

        # make sure all center freqs and BWs are within range
        next_state[0, next_state[0] > 1] = 1
        next_state[0, next_state[0] < 0] = 0
        next_state[1, next_state[1] > self.Bmax] = self.Bmax
        next_state[1, next_state[1] < self.Bmin] = self.Bmin


        # the following code computes the waveform performance associated with the next_state
        fc_RF_freqs = self.fc_min + np.arange(self.num_subpulses) * self.Bmin  # the first row is the RF center freqs
        BW_RF_array = next_state[1]  # the second row is the BWs
        

        # compute the time domain subpulses
        LFM_rx_subpulses, LFM_ref_subpulses = generate_LFM_rx_ref_subpulses_for_ACF(
            BW_RF_array, self.chirp_rate, self.time_window_size, self.Fs_baseband
        )

        # multiply coeff
        LFM_ref_subpulses =  np.reshape(next_state[0],(-1, 1)) * LFM_ref_subpulses

        # filter BWs
        Bs_array = BW_RF_array

        # apply freq. stacking and get ACF
        _, d_t = freq_stacking(
            LFM_rx_subpulses,
            LFM_ref_subpulses,
            fc_RF_freqs,
            BW_RF_array,
            Bs_array,
            self.Fs_baseband,
        )

        # compute ACF
        ACF = np.abs(d_t) / np.max(np.abs(d_t))

        # compute the sww_performance associated with this ACF
        sww_performance = -int_sidelobe_ratio(ACF)
        reward =  sww_performance - self._current_best_sww_performance

        # update the max score
        if sww_performance > self._current_best_sww_performance:
            self._current_best_sww_performance = sww_performance
            self._current_best_state = next_state
        
        if sww_performance > 22.9:
            print(f"-ISLR = {sww_performance}")

        return next_state, reward

    ####################################################################
    # Optional methods defs
    ####################################################################

    # Optional, should only be defined if environment has a natural maximum
    # episode length
    def max_episode_timesteps(self):
        return super().max_episode_timesteps()

    # Optional
    def close(self):
        super().close()


In [23]:
# create an ACF_env


my_ACF_Env = Environment.create(
    environment=ACF_Env,
    max_episode_timesteps=max_episode_timesteps,
    num_subpulses = num_subpulses,
    fc_min=fc_min,
    fc_max=fc_max,
    delta_coeff = delta_coeff,
    Bmin=Bmin,
    Bmax=Bmax,
    delta_B = delta_B,
    chirp_rate=chirp_rate,
    time_window_size=time_window_size,
    Fs_baseband=Fs_baseband,
)


# Agent Setup

 Here we configure a type of agent to learn against this environment. There are many agent configurations to choose from, which we will not cover here. We will not discuss what type of agent to choose here -- we will just take a basic agent to train.

In [24]:
agent = Agent.create(
    agent='tensorforce', environment=my_ACF_Env, update=64,
    optimizer=dict(optimizer='adam', learning_rate=1e-3),
    objective='policy_gradient', reward_estimation=dict(horizon=1)
)



# Check: Untrained Agent Performance
The agent just initializes a policy and use that policy

# Train the agent

In [25]:
# Train for 200 episodes

for _ in range(training_num):
    states = my_ACF_Env.reset()
    terminal = False
    print(f"{_} iteration")
    
    while not terminal:
        actions = agent.act(states=states)
        states, terminal, reward = my_ACF_Env.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)
      
        #print(f"time = {time}")

0 iteration
-ISLR = 54.53568054170657
-ISLR = 51.657462982210376
-ISLR = 49.55371269864188
-ISLR = 46.6386797123332
-ISLR = 44.86439533153104
-ISLR = 44.188668059695445
-ISLR = 43.62064754520127
-ISLR = 43.60980839591982
-ISLR = 43.993212686727915
-ISLR = 43.604953780742505
-ISLR = 42.249796667335985
-ISLR = 40.10739637914084
-ISLR = 38.338123754339684
-ISLR = 37.79869629670011
-ISLR = 37.83713961603546
-ISLR = 38.23538937890892
-ISLR = 38.79824180849489
-ISLR = 41.418511970280704
-ISLR = 42.14389121961097
-ISLR = 42.53546505170135
-ISLR = 41.938861874105314
-ISLR = 41.59437032167163
-ISLR = 40.81761643148015
-ISLR = 40.505847238489
-ISLR = 40.419828319173504
-ISLR = 40.844809191781685
-ISLR = 41.393729050039084
-ISLR = 42.57147274732192
-ISLR = 44.334683344823524
-ISLR = 44.13504471708152
-ISLR = 43.45194111909096
-ISLR = 43.42546125094442
-ISLR = 43.24162714317788
-ISLR = 42.74556788494969
-ISLR = 41.67047509630286
-ISLR = 41.387139106420854
-ISLR = 41.29833505172832
-ISLR = 41.81515

InvalidArgumentError:   : Tensor had NaN values
	 [[node agent/VerifyFinite_1/CheckNumerics (defined at /home/cs229/anaconda3/lib/python3.8/site-packages/tensorforce/core/utils/tensor_spec.py:319) ]] [Op:__inference_act_21669]

Errors may have originated from an input operation.
Input Source operations connected to node agent/VerifyFinite_1/CheckNumerics:
 agent/assert_equal_5/Assert/AssertGuard (defined at /home/cs229/anaconda3/lib/python3.8/site-packages/tensorforce/core/utils/tensor_spec.py:312)	
 agent/StatefulPartitionedCall (defined at /home/cs229/anaconda3/lib/python3.8/site-packages/tensorforce/core/module.py:136)

Function call stack:
act


In [26]:
my_ACF_Env._current_best_sww_performance

54.53568054170657

In [27]:
my_ACF_Env._current_best_state

array([[9.9e-01, 9.9e-01, 9.9e-01, 9.9e-01, 9.9e-01, 9.9e-01, 1.0e+00,
        1.0e+00, 9.9e-01, 9.9e-01, 1.0e+00, 1.0e+00, 9.9e-01, 1.0e+00,
        1.0e+00, 1.0e+00, 9.9e-01, 1.0e+00, 9.9e-01, 1.0e+00, 9.9e-01,
        9.9e-01, 1.0e+00, 9.9e-01, 1.0e+00, 9.9e-01, 1.0e+00, 9.9e-01,
        9.9e-01, 9.9e-01, 9.9e-01, 9.9e-01, 1.0e+00, 1.0e+00, 9.9e-01,
        1.0e+00, 9.9e-01, 9.9e-01, 9.9e-01, 1.0e+00, 1.0e+00, 1.0e+00,
        9.9e-01, 1.0e+00, 9.9e-01, 1.0e+00, 1.0e+00, 9.9e-01, 1.0e+00,
        1.0e+00],
       [2.0e+07, 1.9e+07, 2.0e+07, 1.9e+07, 2.0e+07, 2.0e+07, 2.0e+07,
        1.9e+07, 2.0e+07, 2.0e+07, 2.0e+07, 2.0e+07, 1.9e+07, 1.9e+07,
        2.0e+07, 1.9e+07, 2.0e+07, 1.9e+07, 1.9e+07, 1.9e+07, 2.0e+07,
        2.0e+07, 1.9e+07, 2.0e+07, 1.9e+07, 2.0e+07, 2.0e+07, 1.9e+07,
        2.0e+07, 1.9e+07, 2.0e+07, 1.9e+07, 2.0e+07, 2.0e+07, 1.9e+07,
        1.9e+07, 1.9e+07, 1.9e+07, 1.9e+07, 1.9e+07, 1.9e+07, 2.0e+07,
        2.0e+07, 2.0e+07, 2.0e+07, 1.9e+07, 1.9e+07, 1.9e+0

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline


plt.plot(reward_list)

plt.show()

NameError: name 'reward_list' is not defined

In [None]:
num_subpulses, np.max(reward_list)

# Trained Agent Performance