# Library Imports

In [1]:
import numpy as np                  # Numpy numerical library
import pandas as pd                 # Pandas for dataframes manipulation
import tensorflow as tf             # TensorFlow for neural networks and deep learning APIs
import matplotlib.pyplot as plt     # Plotting
%matplotlib inline                  

# Dataset Preparation

In [2]:
BATCH_SIZE = 256                # Default batch size
TRAIN_SPLIT_PERCENT = 0.9       # 90% of the data for training, 10% for testing    

class DataLoader:
    # Take in a csv file and extracts features and labels
    def __init__(self, csv_filepath, batch_size):                          
        
        self.df_samples = pd.read_csv(csv_filepath)                 # Create a pandas dataframe
        self.numpy_samples = self.df_samples.to_numpy()
        
        self.states_features = self.numpy_samples[:, 1:self.numpy_samples.shape[1]-1]               # Take the feature values for states, also ignore first column for IDs  
        self.feature_dim = self.states_features.shape[1]                                        
        self.actions_labels = self.numpy_samples[:, -1].reshape(-1, 1)                              # The action labels separated from the labels
        self.actions_classes = np.amax(self.actions_labels) + 1                                     # Number of different action classes to set the output layer dimensions (+1 bec starts at zero)
        self.train_batches, self.test_batches = self.prepare_batches(batch_size, train_split_percent= TRAIN_SPLIT_PERCENT)
        print("Dataset successfully loaded with {} training batches, and {} testing batches with {} batch size.".format(
            len(self.train_batches), len(self.test_batches), self.batch_size
        ))
        
    def prepare_batches(self, batch_size, train_split_percent):
        states_t = self.states_features[:-1, :].copy()           # Considered as S(t)
        states_t_plus_one = self.states_features[1:, :].copy()   # Considered as S(t+1)
        actions_star_t = self.actions_labels[:-1, :].copy()      # Considered as a*(t)

        whole_generic_samples = np.hstack((states_t, actions_star_t, states_t_plus_one))            # Stack the whole dataset as described in the paper

        # np.random.shuffle(whole_generic_samples)          # Shuffle the dataset
        
        self.n_samples = whole_generic_samples.shape[0]
        self.batch_size = batch_size
        self.n_batches = np.ceil(self.n_samples / self.batch_size).astype(np.uint32)

        train_batches = []        # Empty list to hold the batches of whole data
        test_batches = []

        # Prepare the data into batches
        for i in range(self.n_batches):
            start = i * batch_size
            end = (i + 1) * batch_size
            curr_batch = whole_generic_samples[start:end, :]
            if (i / self.n_batches) < train_split_percent:    
                train_batches.append(curr_batch)
            else:
                test_batches.append(curr_batch)
        
        self.n_train_batches = len(train_batches)
        self.n_test_batches = len(test_batches)
        return train_batches, test_batches
    
    # Function that takes in the 2D arrays of data and converts lo lists of tuples to be compatible with looping while training
    # TODO: Enhancing this function
    def tupelize(self, array):
        list_of_tuples = list(zip(array.T[0], array.T))
        return list_of_tuples 

    # Function to get the unique rows representing unique states, returns a numpy array of rows
    def get_unique_rows(self):
        self.unique_rows = np.unique(self.states_features, axis = 0)
        return self.unique_rows

    # Get the pandas dataframe for the data, returns a pandas dataframe
    def get_dataframe(self):    
        return self.df_samples
        

# Model and Classes Definition

In [3]:
Q_OUT_DIM = 1               # Output dimension
LEARNING_RATE = 0.001       # Gradient-descent learning rate
UPDATE_TARGET_EVERY = 5     # Iterations before updating the secondary model     
EPSILON = 0.8               # Epsilon value for the epsilon greedy policy selection
LAMBDA = 0.001              # Discount factor for loss calculation
EPOCHS = 2                  # Number of training epochs

# Creating our main class for our DQN
class DeepQNet:
    
    def __init__(self, dataset):
        self.data = dataset                                         # Storing the data in our QNet            
        self.input_dim = dataset.feature_dim + 1                    # State feature dim + 1 (for actions)
        self.output_dim = Q_OUT_DIM                               
        
        self.model = self.create_model()                            # Main model that gets trained every step 
        self.target_model = self.create_model()                     # Target model we predict against each step
        self.target_model.set_weights(self.model.get_weights())     # To make all the initial weights the same

        # Defining the action-set
        self.action_set = np.arange(self.data.actions_classes).astype(np.uint16).tolist() # List of all possible actions [0, 1, ... actions_classes]

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0
    
    def create_model(self):
        # Definition of the neural network architecture mentioned in the paper (3 relu feedforward layers)
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Input(self.input_dim))                        # Input dimension of the state-vector
        model.add(tf.keras.layers.Dense(128, activation= "relu"))
        model.add(tf.keras.layers.Dense(128, activation= "relu"))
        model.add(tf.keras.layers.Dense(self.output_dim, activation= "relu"))       # Output is value function
        model.compile(loss="mse", optimizer=tf.optimizers.Adam(lr= LEARNING_RATE), metrics=['accuracy'])
        return model

    # Prints the model details
    def summary(self):
        self.model.summary()

    def get_reward(self, predicted_actions, optimal_actions):
        predicted_actions = np.asarray(predicted_actions).reshape(-1)
        optimal_actions = np.asarray(optimal_actions).reshape(-1)
        reward_vector = np.equal(predicted_actions, optimal_actions).astype(np.uint32)
        return reward_vector

    # Function to implement the epsilon-greedy policy selection, returns the index of the selected action
    def greedy(self, actions_values_vec, epsilon):
        num_in_curr_batch = actions_values_vec.shape[1]
        selections = []
        for i in range(num_in_curr_batch):
            p = np.random.uniform(low=0.0, high=1.0)
            if p < epsilon:
                curr_actions_values = actions_values_vec[:, i].reshape(-1)
                selections.append(np.argmax(curr_actions_values))
            else:
                random_selection = np.random.randint(low=0, high=self.data.actions_classes)
                selections.append(random_selection)
        
        selections = np.asarray(selections).reshape(-1, 1)

        return selections

    # Function to process the batch and split the S(t), a*(t), and S(t+1)
    def process_batch(self, batch):
        current_states = batch[:, :self.data.feature_dim]
        optimal_actions = batch[:, self.data.feature_dim]
        next_states = batch[:, self.data.feature_dim+1 :]
        return current_states, optimal_actions, next_states

    def get_input_vector(self, current_states, action):
        action_vector = np.full((current_states.shape[0], 1), fill_value= action)
        input_vector =  np.hstack((current_states, action_vector))
        return input_vector

    # Function to get the vectorized output of the batched-samples
    def get_batch_vector_out(self, lst):
        nparr = np.asarray(lst).squeeze() 
        if nparr.ndim < 2:
            nparr = np.reshape(nparr, (nparr.shape[0], 1))
        return nparr

    def train(self):
        
        batches = self.data.train_batches               # Get the batches
        self.target_update_counter = 0                  # The update counter 

        for epoch in range(EPOCHS):

            for batch_idx, batch in enumerate(batches):                   # Looping over the batches
                current_states, optimal_actions, next_states = self.process_batch(batch)

                # Prediction on S(t) and all actions
                estimated_qs_t_list = []
                for action in self.action_set:                  # Iterating and calculating the value for each action
                    input_vector = self.get_input_vector(current_states, action)
                    estimated_qs_t_list.append(self.target_model.predict(input_vector))
                    print(np.unique(self.target_model.predict(input_vector)))

                estimated_qs_vec_t = self.get_batch_vector_out(estimated_qs_t_list)
                predicted_actions_t = self.greedy(estimated_qs_vec_t, epsilon= EPSILON)
                rewards_t = self.get_reward(predicted_actions_t, optimal_actions)
            

                # Prediction on S(t+1) and all actions
                estimated_qs_t_plus_one_list = []
                for action in self.action_set:                  # Iterating and calculating the value for each action
                    input_vector = self.get_input_vector(next_states, action)
                    estimated_qs_t_plus_one_list.append(self.target_model.predict(input_vector))
                
                estimated_qs_vec_t_plus_one = self.get_batch_vector_out(estimated_qs_t_plus_one_list)
                predicted_actions_tplus_one = self.greedy(estimated_qs_vec_t_plus_one, epsilon= 1.0)     # Taking the always argmax (epsilon = 1.0)
                
                # Prediction with S(t+1) and a_cap(t+1)
                input_vector = np.hstack((next_states, predicted_actions_tplus_one))
                q_cap_t_plus_one = self.target_model.predict(input_vector).reshape(-1)
                #print(q_cap_t_plus_one)
                
                # Calculation of qref
                qref = rewards_t + LAMBDA * q_cap_t_plus_one
                
                input_train_vector = np.hstack((current_states, predicted_actions_t))
                loss, accuracy = self.model.train_on_batch(input_train_vector, qref, reset_metrics= False)
                
                # print(" -------------------------------------------------- ")
                # print("In epoch {}/{} epochs, batch {}/{} batches:".format(epoch, EPOCHS, batch_idx, self.data.n_train_batches))
                # print("Accuracy: {}".format(accuracy))
                # print("Loss: {}".format(loss))
                # print(" -------------------------------------------------- ")
                
                # If counter reaches set value, update target network with weights of main network
                if self.target_update_counter > UPDATE_TARGET_EVERY:
                    self.target_model.set_weights(self.model.get_weights())
                    self.target_update_counter = 0
                
                self.target_update_counter += 1
    
    def test(self):
        batches = self.data.test_batches
        
        accuracy = 0

        for batch_idx, batch in enumerate(batches):                             # Looping over the batches
            current_states, optimal_actions, _ = self.process_batch(batch)      # Get the data from the batch
            estimated_qs_list = []

            for action in self.action_set:                                      # Iterating and predicting the value for each action
                input_vector = self.get_input_vector(current_states, action)        
                estimated_qs_list.append(self.model.predict(input_vector))

            estimated_qs_vec = self.get_batch_vector_out(estimated_qs_list).squeeze()     # Get the vectorized output
            predicted_actions = self.greedy(estimated_qs_vec, epsilon= 1.0).squeeze()     # Since we are testing so we need no exploration, we are only greedy now (eps=1.0)

            break

            curr_batch_accuracy = np.mean(np.equal(predicted_actions, optimal_actions).astype(np.uint32))
            accuracy += curr_batch_accuracy / len(batches)
        
        
        print("Finished testing on the testing dataset with accuracy {}".format(accuracy))
        return accuracy

    #TODO: single predict function
    #def predict(self, state):
    

# Model Training

In [4]:
# Training our model
data = DataLoader("new_data.csv", batch_size= BATCH_SIZE)           # Importing the dataset using our dataloader
dq_net = DeepQNet(dataset= data)                                    # Creating our DQNet
dq_net.summary()                                                    # Printing the model contents
dq_net.train()                                                      # Calling the train function

Dataset successfully loaded with 342 training batches, and 37 testing batches with 256 batch size.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               3584      
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 20,225
Trainable params: 20,225
Non-trainable params: 0
_________________________________________________________________
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.

KeyboardInterrupt: 

# Model Testing and Evaluation

In [None]:
dq_net.test()

# Saving The Model (Checkpoint)