In [1]:
"""
    The goal of this experiment will be to instanciate a reinforcement learning approach RNN on a quantum-setup. We will use continuous generated real data
    as interactions with the agent, and train the agent accordingly to the env. 
    Think of this as like having a car learning to traverse a road, first, we give it a set environment, and the car runs on it will access to 
    3 controls, if it is able to run, we can give it rewards, and if not we punish it and let it screw around with its controls.
    Inspiration: https://www.youtube.com/watch?v=cUojVsCJ51I
"""

'\n    The goal of this experiment will be to instanciate a reinforcement learning approach RNN on a quantum-setup. We will use continuous generated real data\n    as interactions with the agent, and train the agent accordingly to the env. \n    Think of this as like having a car learning to traverse a road, first, we give it a set environment, and the car runs on it will access to \n    3 controls, if it is able to run, we can give it rewards, and if not we punish it and let it screw around with its controls.\n    Inspiration: https://www.youtube.com/watch?v=cUojVsCJ51I\n'

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
import random
import math
from torch.utils.tensorboard import SummaryWriter
from collections import deque, namedtuple
import time

In [2]:
import time 
def start_time():
    return time.time()

def elapsed(a):
    return time.time()-a

In [4]:
input_feature_dim = 3  # Each input element is a 1x3 vector
embed_size = 128
target_dim = 3
block_size = 100
num_heads = 32
max_iters = 1200
batch_size = 32
eval_iters = 200
eval_interval = 10
num_layers=12

def get_batch3(split):
    # Select the correct data split
    if split == 'train':
        a, b, max_index = x_train, y_train, int(length_read * 0.9) - block_size - 1
    else:  # split == 'test'
        a, b, max_index = x_test, y_test, length_read - (int(length_read * 0.9) + block_size + 1)

    # Generate random indices for batch selection, ensuring they're within bounds
    ix = torch.randint(0, max_index, (batch_size,))
    # Initialize lists to hold the batches
    x_batch = []
    y_batch = []

    for i in ix:
        try:
            # Extract sequences from 'a' and 'b' and the corresponding target from 'b'
            seq_A = torch.tensor(a.iloc[i.item():i.item() + block_size+1].astype(np.float32).values, dtype=torch.float32)
            seq_B = torch.tensor(b.iloc[i.item():i.item() + block_size].astype(np.float32).values, dtype=torch.float32)
            target = torch.tensor(b.iloc[i.item() + block_size].astype(np.float32).values, dtype=torch.float32)

            seq = torch.cat((seq_A, seq_B), dim=0)
            x_batch.append(seq)
            y_batch.append(target)
        except IndexError as e:
            print(f"IndexError for index {i.item()}: {str(e)}")
            print(f"Attempting to access index [{i.item()}:{i.item() + block_size}] in 'a' with shape {a.shape}")
            print(f"Attempting to access index {i.item() + block_size} in 'b' with shape {b.shape}")
            # Optionally, break or continue depending on desired behavior on error
            break  # or continue

    if not x_batch or not y_batch:
        print("Error: Batch could not be created due to index issues.")
        return None, None

    # Stack the collected sequences and targets into tensors
    xstack = torch.stack(x_batch)
    ystack = torch.stack(y_batch)

    return xstack, ystack


class SelfAttention(nn.Module):
    def __init__(self, embed_size):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size

        self.keys = nn.Linear(embed_size, embed_size, bias=False)
        self.queries = nn.Linear(embed_size, embed_size, bias=False)
        self.values = nn.Linear(embed_size, embed_size, bias=False)

    def forward(self, x):
        K = self.keys(x)
        Q = self.queries(x)
        V = self.values(x)

        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / self.embed_size ** 0.5
        attention = torch.softmax(attention_scores, dim=-1)

        attended = torch.matmul(attention, V)
        return attended

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_size = embed_size
        self.num_heads = num_heads

        assert embed_size % num_heads == 0

        self.head_dim = embed_size // num_heads

        self.keys = nn.Linear(embed_size, embed_size, bias=False)
        self.queries = nn.Linear(embed_size, embed_size, bias=False)
        self.values = nn.Linear(embed_size, embed_size, bias=False)

        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, x):
        batch_size, seq_length, _ = x.shape
        keys = self.keys(x).view(batch_size, seq_length, self.num_heads, self.head_dim)
        queries = self.queries(x).view(batch_size, seq_length, self.num_heads, self.head_dim)
        values = self.values(x).view(batch_size, seq_length, self.num_heads, self.head_dim)

        attention_scores = torch.einsum("bnqh,bnkh->bnqk", [queries, keys]) / (self.head_dim ** 0.5)
        attention = torch.softmax(attention_scores, dim=-1)

        attended = torch.einsum("bnqk,bnkv->bnqv", [attention, values]).reshape(batch_size, seq_length, self.embed_size)

        output = self.fc_out(attended)
        return output

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch3(split)
            X, Y = X.to(device), Y.to(device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train
    del X, Y
    return out

class TransformerBlock(nn.Module):
    def __init__(self, embed_size, num_heads):
        super(TransformerBlock, self).__init__()
        self.norm1 = nn.LayerNorm(embed_size)
        self.attention = MultiHeadAttention(embed_size, num_heads)
        self.dropout1 = nn.Dropout(0.1)

        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, 2 * embed_size),
            nn.ReLU(),
            nn.Linear(2 * embed_size, embed_size),
        )
        self.dropout2 = nn.Dropout(0.1)

    def forward(self, value):
        x = self.norm1(value)
        attention_output = self.attention(x)
        x = value + self.dropout1(attention_output)  # Residual connection and dropout after attention
        x = self.norm2(x)
        feed_forward_output = self.feed_forward(x)
        out = value + self.dropout2(feed_forward_output)  # Residual connection and dropout after FFN
        return out

# Positional Encoding in Encoder class should be moved to the device
class Encoder(nn.Module):
    def __init__(self, input_feature_dim, embed_size, num_heads, num_layers, seq_length):
        super(Encoder, self).__init__()
        self.input_fc = nn.Linear(input_feature_dim, embed_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, seq_length, embed_size)).to(device)
        self.layers = nn.ModuleList([
            TransformerBlock(embed_size, num_heads) for _ in range(num_layers)])
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.input_fc(x)) + self.positional_encoding
        for layer in self.layers:
            x = layer(x)
        return x
    
    def to_cpu(self):
        # Move the entire model to CPU
        self.input_fc.to('cpu')
        self.positional_encoding.data = self.positional_encoding.data.cpu()
        for layer in self.layers:
            layer.to('cpu')
        self.relu.to('cpu')
        torch.cuda.empty_cache()

class EncoderDecoderModelWithMultiHeadAttention(nn.Module):
    def __init__(self, input_feature_dim, embed_size, target_dim, seq_length, num_heads, num_layers):
        super(EncoderDecoderModelWithMultiHeadAttention, self).__init__()
        self.encoder = Encoder(input_feature_dim, embed_size, num_heads, num_layers, seq_length)
        self.decoder = nn.Sequential(
            nn.Linear(embed_size, target_dim),
        )

    def forward(self, x, targets):
        encoded = self.encoder(x)
        encoded_pooled = torch.mean(encoded, dim=1)
        decoded = self.decoder(encoded_pooled)
        
        if targets is not None:
            loss = criterion(decoded, targets)  
            return decoded, loss


        return decoded, None

    def to_cpu(self):
        self.encoder.to_cpu()
        for layer in self.decoder:
            layer.to('cpu')
        torch.cuda.empty_cache()

In [5]:
model_path = "C:/Users/yueze/Desktop/trained_model.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EncoderDecoderModelWithMultiHeadAttention(input_feature_dim, embed_size, target_dim, block_size+1, num_heads, num_layers)
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
print("Model loaded from", model_path)

Model loaded from C:/Users/yueze/Desktop/trained_model.pth


In [6]:
import copy
class DummyHardware:
    """Simulates hardware behavior for the environment."""
    @staticmethod
    def implement(actions,t, measurement_1350, measurement_1550):
        # Apply action effects with cosine function
        measurement_1350 += actions * 0.3 * np.cos(actions)+ np.sin(t)
        # Wrap around if the value exceeds 2.2 or goes below -2.2
        
        while np.any(measurement_1350 > 2.2) or np.any(measurement_1350 < -2.2):
            measurement_1350 = np.where(measurement_1350 > 2.2, measurement_1350 - 4.4, measurement_1350)
            measurement_1350 = np.where(measurement_1350 < -2.2, measurement_1350 + 4.4, measurement_1350)
        
        # Apply action to 1550 nm state with a multiplier of 1
        measurement_1550 += actions * np.cos(3 * actions) + np.sin(3 * t)
        
        while np.any(measurement_1550 > 2.2) or np.any(measurement_1550 < -2.2):
            measurement_1550 = np.where(measurement_1550 > 2.2, measurement_1550 - 4.4, measurement_1550)
            measurement_1550 = np.where(measurement_1550 < -2.2, measurement_1550 + 4.4, measurement_1550)    
        return measurement_1350, measurement_1550    


    
    @staticmethod
    def drift(t, measurement_1350, measurement_1550):
        # Apply action effects with cosine function
        measurement_1350 += np.sin(t)
        while np.any(measurement_1350 > 2.2) or np.any(measurement_1350 < -2.2):
            measurement_1350 = np.where(measurement_1350 > 2.2, measurement_1350 - 4.4, measurement_1350)
            measurement_1350 = np.where(measurement_1350 < -2.2, measurement_1350 + 4.4, measurement_1350)
        measurement_1550 += np.sin(3 * t)
        while np.any(measurement_1550 > 2.2) or np.any(measurement_1550 < -2.2):
            measurement_1550 = np.where(measurement_1550 > 2.2, measurement_1550 - 4.4, measurement_1550)
            measurement_1550 = np.where(measurement_1550 < -2.2, measurement_1550 + 4.4, measurement_1550)
   
        return measurement_1350, measurement_1550

    @staticmethod
    def measure(wavelength, t, initial_state_1550=None):
        # Simulate a hardware measurement with random values within a range
        if wavelength == 1350:
            return env.optimal_state + np.sin(t)
        elif wavelength == 1550 and initial_state_1550 is not None:
            return initial_state_1550 + np.sin(3 * t)



class Env:
    """Wrapper for interacting with the environment."""
    def __init__(self):
        self.optimal_state = np.array([0, 1, 2.2])
        self.df_1550 = []
        self.t = 0
        #these two variables are for after the action executes
        self.measured_1550 = None
        self.measured_1350= None
        #these two variables are for after the action executes, and we actually measure. (the dummy environment needs manually updating, in real case,
        #these two variables should be the same.
        self.drifted_1550= None
        self.drifted_1350=None
        
    def reset(self):
        self.t = 0  # Reset time
        action = np.array([0,0,0])
        # Reset the measurement at 1350 nm perfectly to the optimal state
        self.measured_1350 = np.array(DummyHardware.measure(1350,self.t))
        self.data_to_correct_for = self.optimal_state - self.measured_1350
        action = self.data_to_correct_for  # Directly use the correction values as actions
        
        # Reset the measurement at 1550 nm to some random position
        self.measured_1550 = np.random.uniform(-2, 2, 3)
        
        while len(self.df_1550) < 100:
            measured_1350=copy.deepcopy(self.measured_1350)
            measured_1550=copy.deepcopy(self.measured_1550)
            self.t += 1  # Increment time
            # Implement the action in the hardware with effects on states
            self.measured_1350, self.measured_1550 = DummyHardware.implement(action,self.t ,measured_1350, measured_1550)  
            # Insert new measurement into the list
            self.df_1550.append(copy.deepcopy(self.measured_1550))
        if len(self.df_1550) > 100:
            self.df_1550.pop(0)  # Remove the oldest element
        stacked_df = np.stack(self.df_1550)
        
        # Ensure optimal_state is reshaped correctly
        optimal_state_reshaped = self.optimal_state.reshape(1, -1)
        
        # Concatenate optimal_state to the end
        stacked_df_with_optimal = np.concatenate((stacked_df, optimal_state_reshaped), axis=0)
        
        # Convert the final array to a torch tensor
        tensor_df = torch.tensor(stacked_df_with_optimal, dtype=torch.float32).to(device)
        
        # Get the output from the model
        output, _ = model(tensor_df, None)
        
        # Next state is the latest 100 measurements of 1550 concatenated with the output
        next_state_measurements = np.array(self.df_1550)
        output_numpy = output.cpu().detach().numpy().reshape(1, -1)
        
        # Concatenate output and optimal state with the measurements
        next_state = np.concatenate((next_state_measurements, output_numpy, optimal_state_reshaped), axis=0)
        return next_state

    def step(self, action):
        measured_1350=copy.deepcopy(self.measured_1350)
        measured_1550=copy.deepcopy(self.measured_1550)
        self.t += 1  # Increment time
            # Implement the action in the hardware with effects on states
        self.measured_1350, self.measured_1550 = DummyHardware.implement(action,self.t ,measured_1350, measured_1550)
            # Insert new measurement into the list
        self.df_1550.append(copy.deepcopy(self.measured_1550))
        if len(self.df_1550) > 100:
            self.df_1550.pop(0)  # Remove the oldest element
        stacked_df = np.stack(self.df_1550)
        
        # Ensure optimal_state is reshaped correctly
        optimal_state_reshaped = self.optimal_state.reshape(1, -1)
        
        # Concatenate optimal_state to the end
        stacked_df_with_optimal = np.concatenate((stacked_df, optimal_state_reshaped), axis=0)
        
        # Convert the final array to a torch tensor
        tensor_df = torch.tensor(stacked_df_with_optimal, dtype=torch.float32).to(device)
        
        # Get the output from the model
        output, _ = model(tensor_df, None)
        
        # Next state is the latest 100 measurements of 1550 concatenated with the output
        next_state_measurements = np.array(self.df_1550)
        output_numpy = output.cpu().detach().numpy().reshape(1, -1)
        
        # Concatenate output and optimal state with the measurements
        next_state = np.concatenate((next_state_measurements, output_numpy, optimal_state_reshaped), axis=0)
        
        # Reward is the negative mean squared error between new_measurement_1350 and optimal_state
        reward = -np.mean((self.measured_1350 - self.optimal_state) ** 2)
        
        return next_state, reward

# Example usage
env = Env()
state = env.reset()
print(f"Initial state: {state}")

action = np.array([1, -0.5, 2])  # Example action
# Simulate a step
next_state, reward = env.step(action)
print(f"Next state: {next_state}")
print(f"Next state size: {next_state.size}")
print(f"Reward: {reward}")

Initial state: [[-1.09914520e-01 -1.59248028e+00  1.01984249e+00]
 [-3.89330018e-01 -1.87189578e+00  7.40426997e-01]
 [ 2.27884673e-02 -1.45977730e+00  1.15254548e+00]
 [-5.13784451e-01 -1.99635022e+00  6.15972564e-01]
 [ 1.36503389e-01 -1.34606237e+00  1.26626040e+00]
 [-6.14483857e-01 -2.09704962e+00  5.15273157e-01]
 [ 2.22171781e-01 -1.26039398e+00  1.35192880e+00]
 [-6.83406581e-01 -2.16597235e+00  4.46350434e-01]
 [ 2.72969348e-01 -1.20959642e+00  1.40272636e+00]
 [-7.15062277e-01 -2.19762804e+00  4.14694738e-01]
 [ 2.84849584e-01 -1.19771618e+00  1.41460660e+00]
 [-7.06929270e-01 -2.18949503e+00  4.22827745e-01]
 [ 2.56866116e-01 -1.22569965e+00  1.38662313e+00]
 [-6.59655431e-01 -2.14222120e+00  4.70101583e-01]
 [ 1.91248093e-01 -1.29131767e+00  1.32100511e+00]
 [-5.77006568e-01 -2.05957233e+00  5.52750446e-01]
 [ 9.32226076e-02 -1.38934316e+00  1.22297962e+00]
 [-4.65566441e-01 -1.94813221e+00  6.64190573e-01]
 [-2.94016860e-02 -1.51196745e+00  1.10035533e+00]
 [-3.34212307e-0

In [9]:
class env2():
    """"This will be the wrapper for interacting with the environment"""
    def reset():
        """
        TODO:
        pseudocode: 
            optimal_state = [a,b,c]
            Take measurement result of 1350 (at the output of the fibre) [a',b',c']
            Data to correct for = [a-a',b-b',c-c']
            action = (sign(a-a'),abs(a-a'))x3
            hardware.implement(Data to correct for )
            Take measurement result of 1550 [d',e',f']
            return ([d',e',f'])
        """
        example_output_state = np.array([0,0,0])
        return (example_output_state)
        
    def step(action):
        """
        TODO:
        *action is a 6 valued vector containing 3 pairs of ((0,1), int) tuples
        Since this step is also responsible for spawning the state, we wil also add in the 
        GPT- suggestion rates here.




        
        harware.implement (action)
        sleep(1000)
        new_1350 = hardware.measure (1350)
        new_1550 - hardware.measure (1550)
        df.insert(new_1350, new_1550)
        next_state = df[-100:]
        reward = = MSE(new_1350,optimal_state)
        
        """
        return (next_state,reward)
  

In [16]:
class IQN(nn.Module):
    def __init__(self, state_size, action_size, layer_size, n_step, seed, layer_type="ff"):
        super(IQN, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.input_shape = state_size
        self.action_size = action_size
        self.K = 1  # Set K to 1 for single step
        self.N = 1  # Set N to 1 for single step
        self.n_cos = 64
        self.layer_size = layer_size
        self.pis = torch.FloatTensor([np.pi * i for i in range(self.n_cos)]).view(1, 1, self.n_cos).to(device)

        self.head = nn.Linear(self.input_shape[0], layer_size)  # Could be a CNN
        self.cos_embedding = nn.Linear(self.n_cos, layer_size)
        self.ff_1 = nn.Linear(layer_size, layer_size)
        self.ff_2 = nn.Linear(layer_size, action_size)

    def calc_cos(self, batch_size, n_tau=1):
        """Calculate the cosine values for the number of tau samples."""
        taus = torch.rand(batch_size, n_tau).to(device).unsqueeze(-1)  # (batch_size, n_tau, 1)
        cos = torch.cos(taus * self.pis)
        assert cos.shape == (batch_size, n_tau, self.n_cos), "cos shape is incorrect"
        return cos, taus

    def forward(self, input, num_tau=1):
        """Quantile calculation depending on the number of tau."""
        print(f"Input shape: {input.shape}")
        batch_size = input.shape[0]
        print(f"Batch size: {batch_size}")

        x = torch.relu(self.head(input))
        print(f"After head and ReLU, x shape: {x.shape}")

        cos, taus = self.calc_cos(batch_size, num_tau)
        print(f"Cos shape: {cos.shape}, Taus shape: {taus.shape}")

        cos = cos.view(batch_size * num_tau, self.n_cos)
        print(f"Reshaped cos: {cos.shape}")

        cos_x = torch.relu(self.cos_embedding(cos)).view(batch_size, num_tau, self.layer_size)
        print(f"After cos embedding and ReLU, cos_x shape: {cos_x.shape}")

        # Adjust x to match cos_x shape
        x = (x.unsqueeze(1) * cos_x).view(batch_size * num_tau, self.layer_size)
        print(f"After element-wise multiplication and reshaping, x shape: {x.shape}")

        x = torch.relu(self.ff_1(x))
        print(f"After first feed-forward layer and ReLU, x shape: {x.shape}")

        out = self.ff_2(x)
        print(f"After second feed-forward layer, out shape: {out.shape}")

        final_out = out.view(batch_size, num_tau, self.action_size)
        print(f"Final output shape: {final_out.shape}")

        return final_out, taus

    def get_action(self, inputs):
        quantiles, _ = self.forward(inputs, self.K)
        actions = quantiles.mean(dim=1)
        return actions

In [8]:
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, batch_size, device, seed, gamma, n_step=1):
        """Initialize a ReplayBuffer object.
        Params
        ======
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.device = device
        self.memory = deque(maxlen=buffer_size)  
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)
        self.gamma = gamma
        self.n_step = n_step
        self.n_step_buffer = deque(maxlen=self.n_step)
    
    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        #print("before:", state,action,reward,next_state, done)
        self.n_step_buffer.append((state, action, reward, next_state, done))
        if len(self.n_step_buffer) == self.n_step:
            state, action, reward, next_state, done = self.calc_multistep_return()
            #print("after:",state,action,reward,next_state, done)
            e = self.experience(state, action, reward, next_state, done)
            self.memory.append(e)
    
    def calc_multistep_return(self):
        Return = 0
        for idx in range(self.n_step):
            Return += self.gamma**idx * self.n_step_buffer[idx][2]
        
        return self.n_step_buffer[0][0], self.n_step_buffer[0][1], Return, self.n_step_buffer[-1][3], self.n_step_buffer[-1][4]
        
    
    
    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.stack([e.state for e in experiences if e is not None])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(np.stack([e.next_state for e in experiences if e is not None])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
  
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [9]:
"""TODO:
Here, the following code for the agent are directly copied from the paper,
Since no testing has been made, modifications to be seen.
"""      

class DQN_Agent():
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 layer_size,
                 n_step,
                 BATCH_SIZE,
                 BUFFER_SIZE,
                 LR,
                 TAU,
                 GAMMA,
                 UPDATE_EVERY,
                 device,
                 seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            layer_size (int): size of the hidden layer
            BATCH_SIZE (int): size of the training batch
            BUFFER_SIZE (int): size of the replay memory
            LR (float): learning rate
            TAU (float): tau for soft updating the network weights
            GAMMA (float): discount factor
            UPDATE_EVERY (int): update frequency
            device (str): device that is used for the compute
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.device = device
        self.TAU = TAU
        self.GAMMA = GAMMA
        self.UPDATE_EVERY = UPDATE_EVERY
        self.BATCH_SIZE = BATCH_SIZE
        self.Q_updates = 0
        self.n_step = n_step

        self.action_step = 4
        self.last_action = None

        # IQN-Network
        self.qnetwork_local = IQN(state_size, action_size,layer_size, n_step, seed).to(device)
        self.qnetwork_target = IQN(state_size, action_size,layer_size, n_step, seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        print(self.qnetwork_local)
        
        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device, seed, self.GAMMA, n_step)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done, writer):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                loss = self.learn(experiences)
                self.Q_updates += 1
                writer.add_scalar("Q_loss", loss, self.Q_updates)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy. Acting only every 4 frames!
        
        Params
        ======
            frame: to adjust epsilon
            state (array_like): current state
            
        """

        if self.action_step == 4:
            state = np.array(state)

            state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local.get_action(state)
            self.qnetwork_local.train()

            # Epsilon-greedy action selection
            if random.random() > eps: # select greedy action if random number is higher than epsilon or noisy network is used!
                action = np.argmax(action_values.cpu().data.numpy())
                self.last_action = action
                return action
            else:
                action = random.choice(np.arange(self.action_size))
                self.last_action = action 
                return action

        else:
            self.action_step += 1
            return self.last_action


    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        self.optimizer.zero_grad()
        states, actions, rewards, next_states, dones = experiences
        # Get max predicted Q values (for next states) from target model
        Q_targets_next, _ = self.qnetwork_target(next_states)
        Q_targets_next = Q_targets_next.detach().max(2)[0].unsqueeze(1) # (batch_size, 1, N)
        
        # Compute Q targets for current states 
        Q_targets = rewards.unsqueeze(-1) + (self.GAMMA**self.n_step * Q_targets_next * (1. - dones.unsqueeze(-1)))
        # Get expected Q values from local model
        Q_expected, taus = self.qnetwork_local(states)
        Q_expected = Q_expected.gather(2, actions.unsqueeze(-1).expand(self.BATCH_SIZE, 102, 1))

        # Quantile Huber loss
        td_error = Q_targets - Q_expected
        assert td_error.shape == (self.BATCH_SIZE, 102, 3), "wrong td error shape"
        huber_l = calculate_huber_loss(td_error, 1.0)
        
        quantil_l = abs(taus -(td_error.detach() < 0).float()) * huber_l / 1.0
        
        loss = quantil_l.sum(dim=1).mean(dim=1) # , keepdim=True if per weights get multipl
        loss = loss.mean()


        # Minimize the loss
        loss.backward()
        #clip_grad_norm_(self.qnetwork_local.parameters(),1)
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)
        return loss.detach().cpu().numpy()            

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.TAU*local_param.data + (1.0-self.TAU)*target_param.data)


def calculate_huber_loss(td_errors, k=1.0):
    """
    Calculate huber loss element-wisely depending on kappa k.
    """
    loss = torch.where(td_errors.abs() <= k, 0.5 * td_errors.pow(2), k * (td_errors.abs() - 0.5 * k))
    assert loss.shape == (td_errors.shape[0],102, 3), "huber loss has wrong shape"
    return loss
    
            
def eval_runs(eps, frame):
    """
    Makes an evaluation run with the current epsilon
    """
    env = gym.make("CartPole-v0")
    reward_batch = []
    for i in range(5):
        state = env.reset()
        rewards = 0
        while True:
            action = agent.act(state, eps)
            state, reward, done, _ = env.step(action)
            rewards += reward
            if done:
                break
        reward_batch.append(rewards)
        
    writer.add_scalar("Reward", np.mean(reward_batch), frame)

In [10]:
def evaluate(eps):
    """tells you how good is the current policy is performing"""
    reward_batch = []
    state = env_reset()
    rewards=0
    a = start_time()
    while  (elapsed(a)<300):#todo :
        action = agent.act(state,0.001,eval=True)
        next_state, reward = env.step(action,optimal_state)
        rewards+=reward
    reward_batch.append(rewards)

In [11]:
def run(interaction_time = 10000,reset_time = 300,eps_fixed=False, end_exploration_time = 3e5, min_eps=0.01, eval_every=300, eval_runs=1, worker=1,optimal_state=[0,0,0] ): 
    """Params
    interaction_time = each agent in one epoch takes 300 (seconds) to complete
    epsilon: the "fuck around" parameter; how much does the model weights fluctuate with each observation
    end_exploration_time = how long til we stop annealing 
    min_eps = smallest learning rate we will have
    eval_every = The frequency of evaluation runs. The agent's performance will be evaluated every eval_every seconds.
    eval_runs = The number of evaluation runs to perform each time the agent is evaluated. This helps in assessing the agent's performance more robustly by averaging the results over multiple runs.
    worker = The worker multiplier used for distributed training setups. Due to having only one computer setup, this parameter will always be 1.
    optimal_state = the state we want to practice convergence on, we will use this as a reference for reward giving
    """
    
    scores = []   
    if eps_fixed:
        eps = 0
    else:
        eps = 1
    eps_start = 1
    d_eps = eps_start - min_eps
    i_episode = 1
    state = env.reset()
    score = 0      
    done=False
    for time in range (1, interaction_time+1):
        if (time%reset_time==0 ):
            done = True
        action = agent.act(state,eps)
        next_state, reward = env.step(action)
        agent.step(state,action,reward,next_state,done,writer)
        if eps_fixed == False:
            eps = max(eps_start - ((time*d_eps)/end_exploration_time), min_eps)
        
        if done:
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            writer.add_scalar("Average100", np.mean(scores_window), frame)
            output_history.append(np.mean(scores_window))
            print('\rEpisode {}\tFrame {}\tAverage Score: {:.2f}'.format(i_episode, frame, np.mean(scores_window)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tFrame {}\tAverage Score: {:.2f}'.format(i_episode, frame, np.mean(scores_window)))
            i_episode += 1
            state = env.reset()
            score = 0
            done = False

    return output_history

In [12]:
#TODO: create a dummy hardware-interaction layer that just returns gibberish and see how 
#we can added/interface with the system.

In [17]:
if __name__ == "__main__":
    writer = SummaryWriter("runs/"+"IQN_CP_5")
    action_size = 3
    state_size = (3,102)
    layer_size=256
    seed = 5
    batch=8
    BUFFER_SIZE = 100000
    BATCH_SIZE=1
    GAMMA = 0.99
    TAU = 1e-2
    LR = 1e-3
    UPDATE_EVERY = 1
    n_step = 1
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using ", device)

    agent = DQN_Agent(state_size=state_size,    
                        action_size=action_size,
                        layer_size=256,
                        n_step=n_step,
                        BATCH_SIZE=BATCH_SIZE, 
                        BUFFER_SIZE=BUFFER_SIZE, 
                        LR=LR, 
                        TAU=TAU, 
                        GAMMA=GAMMA, 
                        UPDATE_EVERY=UPDATE_EVERY, 
                        device=device, 
                        seed=seed)

    eps_fixed = False

    t0 = time.time()
    final_average100 = run(interaction_time = 10000,reset_time = 300,eps_fixed=False, end_exploration_time = 3e5, min_eps=0.01, eval_every=300, eval_runs=1, worker=1,optimal_state=[0,0,0])
    t1 = time.time()

Using  cuda:0
IQN(
  (head): Linear(in_features=3, out_features=256, bias=True)
  (cos_embedding): Linear(in_features=64, out_features=256, bias=True)
  (ff_1): Linear(in_features=256, out_features=256, bias=True)
  (ff_2): Linear(in_features=256, out_features=3, bias=True)
)
Input shape: torch.Size([1, 102, 3])
Batch size: 1
After head and ReLU, x shape: torch.Size([1, 102, 256])
Cos shape: torch.Size([1, 1, 64]), Taus shape: torch.Size([1, 1, 1])
Reshaped cos: torch.Size([1, 64])
After cos embedding and ReLU, cos_x shape: torch.Size([1, 1, 256])


RuntimeError: shape '[1, 256]' is invalid for input of size 26112

In [None]:
size = (3, 4)
tensor = torch.empty(size)  # Creates a tensor of shape (3, 4) with default dtype and device

In [21]:
# Example usage
env = Env()
state = env.reset()
print(f"Initial state: {state}")

action = [(1, 1), (-1, 0.5), (1, 2)]  # Example action

# Calculate and print action size
action_size = len(action)
print(f"Action size: {action_size}")

# Simulate a few steps until we have 100 points
for _ in range(105):  # We loop more times to ensure we pass the 100 points threshold
    next_state, reward = env.step(action)
    if isinstance(next_state, np.ndarray):  # Ensure next_state is ready
        state_size = next_state.shape
        print(f"State size: {state_size}")
    print(f"Reward: {reward}")

Initial state: [0.40467017 0.76137428 1.02188813]
Action size: 3
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Reward: 0
Rewar