# Sequential Split tutorial

### Vivado container bug fix

In [None]:
import os

# Container workaround for Vivado (see the section title): force the system libudev
# to be preloaded. NOTE(review): presumably avoids a Vivado crash in the container - confirm.
os.environ['LD_PRELOAD'] = '/usr/lib/x86_64-linux-gnu/libudev.so.1'

# Put the Vivado executables first on PATH. XILINX_VIVADO must already be set,
# otherwise this line raises KeyError.
os.environ['PATH'] = f"{os.environ['XILINX_VIVADO']}/bin:{os.environ['PATH']}"

# Uncomment and fill in to point at a license server ('port@host') or a license file path.
#os.environ['LM_LICENSE_FILE'] = 'XXXX@your.xilinx.licence.server' or filepath

### Requirements

In [None]:
!pip install tensorflow==2.15
!pip install rule4ml

### Control Variables

In [None]:
# Toggle between the two ways of obtaining `model`:
#   False -> build and train the example MNIST model in this notebook
#   True  -> skip training and load a saved model in the "Load the model" cell
LOAD = False

## MNIST Example Model

### Load the MNIST dataset

In [None]:
import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from PIL import Image

# Load the MNIST train/test splits through Keras.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# One-hot encode the integer class labels.
train_labels, test_labels = to_categorical(train_labels), to_categorical(test_labels)

# Resize every image to 28x28 with PIL, then flatten it into a 1-D vector
# (28 * 28 = 784 values, matching the input_shape of the example model).
train_images = np.array(
    [np.array(Image.fromarray(picture).resize((28, 28))).ravel() for picture in train_images]
)
test_images = np.array(
    [np.array(Image.fromarray(picture).resize((28, 28))).ravel() for picture in test_images]
)

# Scale pixel values from [0, 255] to [0, 1] as float32.
train_images, test_images = (
    images.astype('float32') / 255 for images in (train_images, test_images)
)

# Persist the prepared arrays so later cells can reload them from disk.
for filename, array in (
    ('X_train_val.npy', train_images),
    ('X_test.npy', test_images),
    ('y_train_val.npy', train_labels),
    ('y_test.npy', test_labels),
):
    np.save(filename, array)

### Create and train the model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.regularizers import l1
from keras.optimizers import Adam

# Build and train the example classifier, unless a saved model is going to be loaded instead.
if not LOAD:
    # Three fully connected layers (16 -> 32 -> 10 units) followed by a softmax.
    # Every Dense layer uses LeCun-uniform initialisation and an L1 kernel penalty.
    model = Sequential(
        [
            Dense(
                16,
                input_shape=(784,),
                name='fc1',
                kernel_initializer='lecun_uniform',
                kernel_regularizer=l1(0.0001),
            ),
            Dense(
                32,
                name='dense',
                kernel_initializer='lecun_uniform',
                kernel_regularizer=l1(0.0001),
            ),
            Dense(
                10,
                name='output',
                kernel_initializer='lecun_uniform',
                kernel_regularizer=l1(0.0001),
            ),
            Activation(activation='softmax', name='softmax'),
        ]
    )

    # Labels are one-hot encoded, hence 'categorical_crossentropy'
    # (integer labels would need 'sparse_categorical_crossentropy').
    model.compile(
        optimizer=Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Fit for 10 epochs, holding out 20% of the training data for validation.
    model.fit(
        train_images,
        train_labels,
        validation_split=0.2,
        batch_size=32,
        epochs=10,
    )

### Load the model

In [None]:
from tensorflow.keras.models import load_model
from qkeras.utils import _add_supported_quantized_objects

# Custom-object registry handed to Keras when deserialising the model; it is filled
# in place by QKeras' `_add_supported_quantized_objects`.
co = dict()
_add_supported_quantized_objects(co)

# Only read a model from disk when the LOAD toggle is on.
# Replace the placeholder path with the location of your own .h5 file.
if LOAD:
    model = load_model(
        'your/model/path.h5',
        custom_objects=co,
    )


## Helper Functions

### rule4ml function

In [None]:
import itertools
from rule4ml.models.estimators import MultiModelEstimator
def _default_hls_configs():
    """Build the default configuration sweep used when no ``hls_config`` is supplied.

    One entry is produced for every combination of reuse factor (1, 2, 4, ..., 256),
    strategy ("Latency" or "Resource") and precision (``ap_fixed<W,W//2>`` for total
    widths W from 2 to 32), all targeting the "pynq-z2" board.
    """
    reuse_factor_choices = [1, 2, 4, 8, 16, 32, 64, 128, 256]
    strategy_choices = ["Latency", "Resource"]
    precision_choices = [f'ap_fixed<{bit},{bit//2}>' for bit in range(2, 33)]
    return [
        {
            "model": {
                "precision": precision,
                "reuse_factor": reuse_factor,
                "strategy": strategy,
            },
            "board": "pynq-z2",
        }
        for reuse_factor, strategy, precision in itertools.product(
            reuse_factor_choices, strategy_choices, precision_choices
        )
    ]


def predict_nb_nodes(model_to_predict, hls_config=None, sort=['FPGAs', 'nb_bits'], order=[True, False]):
    """Pick an HLS configuration for a model from rule4ml resource estimates.

    Args:
        model_to_predict: Model handed unchanged to ``MultiModelEstimator.predict``.
        hls_config: Configurations to evaluate. When ``None``, the default sweep
            from ``_default_hls_configs`` is used.
        sort: Column label(s) passed as ``by`` to ``DataFrame.sort_values``. The
            default ranks by number of FPGAs first, then by bit width.
        order: ``ascending`` flag(s) matching ``sort``. The default prefers the
            fewest FPGAs, then the widest precision.
            (The two list defaults are only read, never mutated.)

    Returns:
        ``(strategy, nb_bits, reuse_factor, nb_fpga)`` taken from the first row after
        sorting; ``nb_bits`` and ``nb_fpga`` are cast with ``.astype(int)``.
        If the estimator returns an empty DataFrame, the tuple
        ``("No Predictions Found.", None, None, None)`` is returned instead.
    """
    # Identity check: `== None` would invoke __eq__ on whatever object was passed in.
    if hls_config is None:
        hls_config = _default_hls_configs()

    # Load default estimator
    estimator = MultiModelEstimator()
    estimator.load_default_models()
    # MultiModelEstimator predictions are formatted as a DataFrame
    prediction_df = estimator.predict(model_to_predict, hls_config)

    if prediction_df.empty:
        return "No Predictions Found.", None, None, None

    # Devices needed = floor(worst resource utilisation / 100) + 1.
    worst_utilisation = prediction_df[['DSP (%)', 'FF (%)', 'LUT (%)', 'BRAM (%)']].max(axis=1)
    prediction_df['FPGAs'] = (worst_utilisation // 100) + 1

    # Total bit width W parsed out of the 'ap_fixed<W,I>' precision string.
    prediction_df['nb_bits'] = prediction_df['Precision'].str.extract(r'ap_fixed<(\d+),')[0].astype(int)

    # Best candidate is the first row under the requested ordering.
    best_row = prediction_df.sort_values(by=sort, ascending=order).iloc[0]
    strategy, nb_bits, reuse_factor, nb_fpga = best_row[['Strategy', 'nb_bits', 'Reuse Factor', 'FPGAs']]

    return strategy, nb_bits.astype(int), reuse_factor, nb_fpga.astype(int)

### Possible Combinations of Split Function 

In [None]:
from itertools import combinations

def generate_possible_splits(layers, num_splits):
    """Enumerate every way to cut an ordered list of layer names into consecutive groups.

    Args:
        layers: Ordered sequence of layer names.
        num_splits: Number of cut points; each result has ``num_splits + 1`` groups.

    Returns:
        A list of splits, each split being a list of slices of ``layers``.
        A cut is rejected when it would place an activation layer at the start of a
        group while the layer just before it is not an activation. An empty list is
        returned when ``num_splits`` is below 1 or not smaller than ``len(layers)``.
    """
    total = len(layers)
    if not (1 <= num_splits < total):
        return []

    # Name fragments (lower case) that mark a layer as an activation; extend as needed.
    activation_markers = ('activation', 'relu', 'softmax')

    def is_activation_layer(layer_name):
        lowered = layer_name.lower()
        return any(marker in lowered for marker in activation_markers)

    def detaches_activation(cut):
        # True when cutting before `cut` separates an activation from the layer feeding it.
        return is_activation_layer(layers[cut]) and not is_activation_layer(layers[cut - 1])

    accepted = []
    for cuts in combinations(range(1, total), num_splits):
        if any(detaches_activation(cut) for cut in cuts):
            continue
        # Consecutive boundary pairs delimit each group: [0, c1), [c1, c2), ..., [ck, total).
        bounds = (0, *cuts, total)
        accepted.append([layers[start:stop] for start, stop in zip(bounds, bounds[1:])])

    return accepted

# Example usage
# layers = [layer.name for layer in model.layers]
# possible_splits = generate_possible_splits(layers, 1)

# for split in possible_splits:
#     print(split)

### Sequential Split Function

In [None]:
from tensorflow import keras

def sequential_splits(model, layer_names):
    """Build one ``keras.Sequential`` sub-model per group of layer names.

    Args:
        model: Source model; layers are fetched from it with ``model.get_layer(name)``.
        layer_names: Iterable of groups, each an ordered list of layer names.

    Returns:
        A list of ``keras.Sequential`` models, one per group, in order. Every group
        except the first is prefixed with an ``InputLayer`` whose shape is the
        ``input_shape`` of the group's first layer without its leading dimension.
        The first group gets no extra input layer.
    """
    sub_models = []
    for group_index, group in enumerate(layer_names):
        stack = [model.get_layer(layer_name) for layer_name in group]
        if group_index > 0 and stack:
            # Later groups need an explicit entry point matching their first layer's input.
            entry_shape = stack[0].input_shape[1:]
            stack.insert(0, keras.layers.InputLayer(entry_shape))
        sub_models.append(keras.Sequential(stack))
    return sub_models

# Example usage:
# possible_sub_models = []
# for split in possible_splits:
#     possible_sub_models.append(sequential_splits(model, split))

# print('number of sub_models ', len(possible_sub_models))

## End to End for Sequential Split

### rule4ml Estimations

In [None]:
def _print_split_report(header, strategy, reuse_factor, nb_bits, nb_fpga):
    """Print a header line followed by the parameters rule4ml selected."""
    print(header)
    print(f'Strategy : {strategy}')
    print(f'Reuse Factor : {reuse_factor}')
    print(f'Precision : ap_fixed<{nb_bits},{nb_bits//2}>')
    print(f'Number of nodes: {nb_fpga}')


sequential = False
parallel = False

# GET SEQUENTIAL SPLITS
# rule4ml estimate for the whole model: more than one FPGA means it has to be cut
# into consecutive sub-models.
strategy, nb_bits, reuse_factor, nb_fpga = predict_nb_nodes(model)

# predict_nb_nodes returns (message, None, None, None) when rule4ml yields no rows.
# Fail with a clear message instead of a TypeError on `None > 1`.
if nb_fpga is None:
    raise RuntimeError(f'rule4ml returned no prediction for the full model: {strategy}')

if nb_fpga > 1:
    sequential = True
    _print_split_report(
        'Sequential split needed using the following parameters...',
        strategy=strategy,
        reuse_factor=reuse_factor,
        nb_bits=nb_bits,
        nb_fpga=nb_fpga,
    )


# GET PARALLEL SPLITS
# Estimate each layer on its own: a single layer needing more than one FPGA cannot
# be handled by cutting between layers.
for layer in model.layers:
    # Sequential takes a list of layers.
    sub_model = Sequential([layer])
    sub_model.build(input_shape=layer.input_shape)
    s, b, r, f = predict_nb_nodes(sub_model)
    if f is None:
        print(f'No rule4ml prediction available for layer {layer.name}; skipping it.')
        continue
    if f > 1:
        parallel = True
        _print_split_report(
            f'Parallel split needed for layer {layer.name} using the following parameters...',
            strategy=s,
            reuse_factor=r,
            nb_bits=b,
            nb_fpga=f,
        )

# No split needed
if not sequential and not parallel:
    _print_split_report(
        'No need for a split using the following parameters...',
        strategy=strategy,
        reuse_factor=reuse_factor,
        nb_bits=nb_bits,
        nb_fpga=nb_fpga,
    )

### Generate the Possible Split Models

In [None]:

# Get the possible splits: N FPGAs means N - 1 cut points between layers.
layers = [layer.name for layer in model.layers]
num_splits = nb_fpga.astype(int)-1

possible_splits = generate_possible_splits(layers, num_splits)

# Split the model according to the possible splits (one list of sub-models per split).
possible_models = [sequential_splits(model, split) for split in possible_splits]

# Test accuracy before conversion (should be the same as the original model).
print('Performance of original model before hls4ml...')
model.evaluate(np.load('X_test.npy'), np.load('y_test.npy'))
y_test = test_labels
for i, split_model in enumerate(possible_models):
    print(f'Performance of split model #{i} before hls4ml...')
    # Feed the test set through the sub-models in order, each consuming the previous
    # one's output. Named `activations` rather than `input`, which would shadow the
    # builtin for the rest of the kernel session.
    activations = test_images
    for sub_model in split_model:
        activations = sub_model.predict(activations)
    predicted_classes = activations.argmax(axis=1)
    print(f'Accuracy of model #{i} for {len(predicted_classes)} samples is {(np.sum(predicted_classes == y_test.argmax(axis=1))/len(predicted_classes))*100}%')
    

### HLS Conversion for a PYNQ-Z2

In [None]:
# convert to hls4ml
# -REMOVE LAST PROJECTs- #
# !rm -rf model_split*
# --------------------- #
import hls4ml

# Convert every sub-model of every candidate split with hls4ml, applying the strategy,
# precision and reuse factor selected by rule4ml.
possible_hls_models = []
for j, split_model in enumerate(possible_models):
    split_hls_model = []
    sub_configs = []
    for i, sub_model in enumerate(split_model):
        # Model-level configuration, overridden with the rule4ml choices.
        config = hls4ml.utils.config_from_keras_model(sub_model, granularity='model')
        config['Model']['Strategy'] = strategy
        config['Model']['Precision'] = f'ap_fixed<{nb_bits},{nb_bits//2}>'
        config['Model']['ReuseFactor'] = reuse_factor
        sub_configs.append(config)

        # Each sub-model gets its own project directory: model_split_<j>/sub_model_<i>.
        hls_sub_model = hls4ml.converters.convert_from_keras_model(
            sub_model,
            hls_config=config,
            output_dir=f'model_split_{j}/sub_model_{i}',
            backend='VivadoAccelerator',
            board='pynq-z2',
            part='xc7z020clg400-1',
        )
        split_hls_model.append(hls_sub_model)
        hls_sub_model.compile()
    possible_hls_models.append(split_hls_model)


### Test the accuracy of the models with the HLS conversion

In [None]:
# Test accuracy of the hls models
y_test = test_labels
for i, split_hls_model in enumerate(possible_hls_models):
    print(f'Performance of split model #{i} after hls4ml...')
    # Start from a C-contiguous copy of the test set and feed it through the sub-models
    # in order. Named `activations` rather than `input`, which would shadow the builtin
    # for the rest of the kernel session.
    activations = np.ascontiguousarray(test_images)
    for sub_hls_model in split_hls_model:
        activations = sub_hls_model.predict(activations)
    predicted_classes = activations.argmax(axis=1)
    print(f'Accuracy of model #{i} for {len(predicted_classes)} samples is {(np.sum(predicted_classes == y_test.argmax(axis=1))/len(predicted_classes))*100}%')

### Bonus: build the bitstreams in parallel for one model

In [None]:
import multiprocessing

# Pick which candidate split to build; index 0 (the first one) is the default.
sub_hls_models = possible_hls_models[0]


def build(i_build):
    """Run the hls4ml build for the sub-model at position ``i_build``.

    C simulation is skipped; export and bitfile generation are enabled.
    """
    sub_hls_models[i_build].build(csim=False, export=True, bitfile=True)


# Launch one build task per sub-model on a process pool so the Vivado runs overlap.
with multiprocessing.Pool() as pool:
    worker_pids = [child.pid for child in multiprocessing.active_children()]
    print(f'PIDs of worker processes: {worker_pids}')
    pool.map(build, range(len(sub_hls_models)))