# Library Imports

In [16]:
import numpy as np                  # Numpy numerical library
import pandas as pd                 # Pandas for dataframes manipulation
import tensorflow as tf             # TensorFlow for neural networks and deep learning APIs
import matplotlib.pyplot as plt     # Plotting
%matplotlib inline                  # Magic formula for using plots with jupyter notebooks

# Dataset Preparation

In [17]:
BATCH_SIZE = 4              # Default batch size


class DataLoader:
    """Load a CSV of samples and split it into batches of state transitions.

    Column layout read from the CSV: the first column is skipped (IDs), the
    last column is the action label, and every column in between is a state
    feature. Consecutive rows are paired as (S(t), a*(t), S(t+1)).
    """

    def __init__(self, csv_filepath, batch_size=BATCH_SIZE):
        """Read the CSV, split features from labels and prepare the batches.

        Args:
            csv_filepath: Passed straight to ``pandas.read_csv``.
            batch_size: Number of transitions per batch (must be >= 1).
        """
        self.df_samples = pd.read_csv(csv_filepath)                 # Create a pandas dataframe
        self.numpy_samples = self.df_samples.to_numpy()

        # State features: everything except the first (ID) and last (label) column
        self.states_features = self.numpy_samples[:, 1:-1]
        self.feature_dim = self.states_features.shape[1]
        # Action labels, kept 2D (n_rows, 1) so they can be stacked next to the features
        self.actions_labels = self.numpy_samples[:, -1].reshape(-1, 1)
        # Number of action classes (+1 because labels start at zero); cast to a plain
        # int so the value can be used directly as a size/dimension
        self.actions_classes = int(np.amax(self.actions_labels)) + 1
        self.batches = self.prepare_batches(batch_size)

    def prepare_batches(self, batch_size):
        """Pair consecutive rows into transitions and cut them into batches.

        Also sets ``self.n_samples``, ``self.batch_size`` and ``self.n_batches``.

        Args:
            batch_size: Number of transitions per batch (must be >= 1).

        Returns:
            A list of ``(states_t, actions_star_t, states_t_plus_one)`` tuples of
            2D numpy arrays. Batches do not overlap; the last one may be shorter.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        states_t = self.states_features[:-1, :]           # Considered as S(t)
        states_t_plus_one = self.states_features[1:, :]   # Considered as S(t+1)
        actions_star_t = self.actions_labels[:-1, :]      # Considered as a*(t)

        self.n_samples = states_t.shape[0]
        self.batch_size = batch_size
        # Integer ceiling division, so a trailing partial batch is counted
        self.n_batches = (self.n_samples + batch_size - 1) // batch_size

        batches = []
        for start in range(0, self.n_samples, batch_size):
            end = start + batch_size                      # Exclusive end: no overlap with the next batch
            batches.append((
                states_t[start:end],
                actions_star_t[start:end],
                states_t_plus_one[start:end],
            ))
        return batches

    def tupelize(self, array):
        """Zip the entries of the first column of ``array`` with the rows of ``array.T``.

        NOTE(review): this pairs each element of column 0 with a whole column of
        ``array``, which may not be the intended "list of row tuples" - confirm
        against the caller before relying on it.
        """
        list_of_tuples = list(zip(array.T[0], array.T))
        return list_of_tuples

    def get_unique_rows(self):
        """Return (and cache in ``self.unique_rows``) the unique state rows as a numpy array."""
        self.unique_rows = np.unique(self.states_features, axis=0)
        return self.unique_rows

    def get_dataframe(self):
        """Return the pandas dataframe that was read from the CSV."""
        return self.df_samples

# Build the loader from "new_data.csv" using the default batch size
data = DataLoader("new_data.csv")


# Model and Classes Definition

In [6]:
Q_OUT_DIM = 1               # The network outputs a single value for a (state, action) input
LEARNING_RATE = 0.001       # Gradient-descent learning rate
REPLAY_MEMORY_SIZE = 50     # Size for RL replay memory
UPDATE_TARGET_EVERY = 5     # Number of batches between target-network weight syncs


# Creating our main class for our DQN
class DeepQNet:
    """Deep Q-network that scores a (state, action) input with a single value.

    Holds a main model and a target model with the same architecture; the
    target model's weights are periodically copied from the main model.
    """

    def __init__(self, dataset):
        """Build the main/target models and the action set.

        Args:
            dataset: Object exposing ``feature_dim``, ``actions_classes`` and
                ``batches`` (the latter is read in ``train``).
        """
        self.data = dataset                                         # Storing the data in our QNet
        self.input_dim = dataset.feature_dim + 1                    # State feature dim + 1 (for the action)
        self.output_dim = Q_OUT_DIM

        self.model = self.create_model()                            # Main model that gets trained every step
        self.model.summary()                                        # Printing model details
        self.target_model = self.create_model()                     # Target model we predict against each step
        self.target_model.set_weights(self.model.get_weights())     # To make all the initial weights the same

        # The action is an input of the network, so the action set must enumerate
        # the dataset's action classes, not the (single-unit) output dimension.
        self.action_set = list(range(int(dataset.actions_classes)))

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0

    def create_model(self):
        """Create and compile the feed-forward network (3 relu dense layers).

        Returns:
            A compiled ``tf.keras.Sequential`` model taking ``input_dim`` features
            and producing ``output_dim`` values.
        """
        model = tf.keras.Sequential()
        # Keras expects the input shape as a tuple (without the batch dimension)
        model.add(tf.keras.layers.Input(shape=(self.input_dim,)))
        model.add(tf.keras.layers.Dense(128, activation="relu"))
        model.add(tf.keras.layers.Dense(128, activation="relu"))
        model.add(tf.keras.layers.Dense(self.output_dim, activation="relu"))   # Output is value function
        # `learning_rate` replaces the deprecated `lr` keyword of the optimizer
        model.compile(
            loss="mse",
            optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
            metrics=['accuracy'],
        )
        return model

    def get_reward(self, a_predicted, a_label):
        """Return 1 when the predicted action equals the label, otherwise 0."""
        return 1 if a_predicted == a_label else 0

    def greedy(self, epsilon, action_values):
        """Select an action index with the epsilon-greedy policy.

        Args:
            epsilon: Probability of taking a random (exploration) action.
            action_values: Sequence of values, one per action.

        Returns:
            The index of the selected action.
        """
        p = np.random.uniform(low=0.0, high=1.0)

        if p < epsilon:             # Explore with probability epsilon
            return np.random.randint(low=0, high=len(action_values))

        return np.argmax(action_values)     # Otherwise take the greedy action

    def train(self):
        """Iterate over the dataset batches and sync the target network periodically.

        Each batch is expected to unpack into
        ``(current_states, optimal_actions, next_states)``.
        """
        batches = self.data.batches             # Get the batches
        self.target_update_counter = 0          # The update counter

        for current_states, optimal_actions, next_states in batches:
            estimated_qs_list = []
            for action in self.action_set:                  # Iterating and calculating the value for each action
                action_vector = np.full((current_states.shape[0], 1), fill_value=action)
                input_vector = np.hstack((current_states, action_vector))
                estimated_qs_list.append(self.target_model.predict(input_vector))

            # TODO: the learning step that fits self.model from estimated_qs_list,
            # optimal_actions and next_states is not implemented yet.

            # Count this batch; without the increment the sync below is unreachable
            self.target_update_counter += 1

            # If counter reaches set value, update target network with weights of main network
            if self.target_update_counter >= UPDATE_TARGET_EVERY:
                self.target_model.set_weights(self.model.get_weights())
                self.target_update_counter = 0
                

              

# Instantiate the DQN on the loaded dataset (the constructor also prints the model summary)
dq_net = DeepQNet(dataset= data)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               3584      
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 1032      
Total params: 21,128
Trainable params: 21,128
Non-trainable params: 0
_________________________________________________________________
[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]


# Model Training

# Model Testing and Evaluation