Help me write some HLS source code to create an FPGA hardware accelerator for a streaming-based linear layer. Please use the Vitis HLS libraries, including `hls_math.h`, `hls_stream.h`, and `ap_fixed.h`. The inputs to the kernel are one stream for the input vector and two fixed arrays for the weights and bias. Everything uses the fixed-point type `ap_fixed<32,10>`.
Sure, here is sample code for an FPGA hardware accelerator implementing a streaming-based linear layer using the Vitis HLS libraries:
```c
#include "hls_math.h"
#include "hls_stream.h"
#include "ap_fixed.h"

// Define the data types
typedef ap_fixed<32,10> fixed_type;
typedef hls::stream<fixed_type> stream_type;

// Define the linear layer function
void linear_layer(stream_type& input_stream, fixed_type weight[], fixed_type bias[], stream_type& output_stream) {
    // Set the number of input and output values
    const int N = 1024;

    // Buffer the streamed input vector so every output can reuse it
    fixed_type input_buf[N];
    for(int j = 0; j < N; j++) {
        input_buf[j] = input_stream.read();
    }

    // Perform the linear layer computation: each output is the dot product
    // of one weight row with the input vector, plus a bias, through tanh
    for(int i = 0; i < N; i++) {
        fixed_type weighted_sum = 0;
        for(int j = 0; j < N; j++) {
            weighted_sum += input_buf[j] * weight[i * N + j];
        }
        fixed_type output_value = hls::tanh(weighted_sum + bias[i]);
        output_stream.write(output_value);
    }
}

// Top-level function for the accelerator
void linear_accelerator(stream_type& input_stream, fixed_type weight[], fixed_type bias[], stream_type& output_stream) {
    #pragma HLS INTERFACE axis port=input_stream
    #pragma HLS INTERFACE axis port=output_stream
    #pragma HLS INTERFACE m_axi port=weight offset=slave bundle=gmem
    #pragma HLS INTERFACE m_axi port=bias offset=slave bundle=gmem

    // Call the linear layer function
    linear_layer(input_stream, weight, bias, output_stream);
}
```
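To check the kernel in C simulation before synthesis, you can drive it with a small testbench along these lines. This is only a minimal sketch: the `N = 1024` sizing mirrors the kernel, and the constant weight, bias, and input values are illustrative placeholders rather than real model parameters.

```c
#include <iostream>
#include "hls_stream.h"
#include "ap_fixed.h"

typedef ap_fixed<32,10> fixed_type;
typedef hls::stream<fixed_type> stream_type;

// Prototype of the kernel under test
void linear_accelerator(stream_type& input_stream, fixed_type weight[], fixed_type bias[], stream_type& output_stream);

int main() {
    const int N = 1024;
    static fixed_type weight[N * N];   // static: too large for the stack
    static fixed_type bias[N];
    stream_type input_stream, output_stream;

    // Fill with illustrative constants and stream in a vector of ones
    for(int i = 0; i < N * N; i++) weight[i] = 0.01;
    for(int i = 0; i < N; i++)     bias[i]   = 0.5;
    for(int i = 0; i < N; i++)     input_stream.write(1.0);

    // Run the accelerator once
    linear_accelerator(input_stream, weight, bias, output_stream);

    // Print the first few outputs as a quick sanity check
    for(int i = 0; i < 4; i++) {
        std::cout << output_stream.read().to_double() << std::endl;
    }
    return 0;
}
```

With these values each dot product is 1024 × 0.01 = 10.24, so after adding the bias and applying tanh every output should come out very close to 1.0.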
Rewrite the code to use an 8-bit fixed-point type and add a ReLU nonlinearity after the linear layer.
Here's the modified code, which uses an 8-bit fixed-point type and adds a ReLU nonlinearity after the linear layer:

```c
#include "hls_math.h"
#include "hls_stream.h"
#include "ap_fixed.h"

// Define the data types
typedef ap_fixed<8,2> fixed_type;
typedef hls::stream<fixed_type> stream_type;

// Define the linear layer function
void linear_layer(stream_type& input_stream, fixed_type weight[], fixed_type bias[], stream_type& output_stream) {
    // Set the number of input and output values
    const int N = 1024;

    // Buffer the streamed input vector so every output can reuse it
    fixed_type input_buf[N];
    for(int j = 0; j < N; j++) {
        input_buf[j] = input_stream.read();
    }

    // Perform the linear layer computation: each output is the dot product
    // of one weight row with the input vector
    for(int i = 0; i < N; i++) {
        fixed_type weighted_sum = 0;
        for(int j = 0; j < N; j++) {
            weighted_sum += input_buf[j] * weight[i * N + j];
        }
        fixed_type output_value = weighted_sum + bias[i];
        output_stream.write(output_value);
    }
}

// Define the ReLU function
void relu(stream_type& input_stream, stream_type& output_stream) {
    const int N = 1024;
    for(int i = 0; i < N; i++) {
        fixed_type input_value = input_stream.read();
        fixed_type output_value = (input_value > fixed_type(0)) ? input_value : fixed_type(0);
        output_stream.write(output_value);
    }
}

// Top-level function for the accelerator
void linear_accelerator(stream_type& input_stream, fixed_type weight[], fixed_type bias[], stream_type& output_stream) {
    #pragma HLS INTERFACE axis port=input_stream
    #pragma HLS INTERFACE axis port=output_stream
    #pragma HLS INTERFACE m_axi port=weight offset=slave bundle=gmem
    #pragma HLS INTERFACE m_axi port=bias offset=slave bundle=gmem

    #pragma HLS DATAFLOW

    // Intermediate stream connecting the linear layer to the ReLU stage
    stream_type linear_output_stream;

    // Call the linear layer function
    linear_layer(input_stream, weight, bias, linear_output_stream);

    // Apply the ReLU nonlinearity and write directly to the output stream
    relu(linear_output_stream, output_stream);
}
```

In this modified code, the fixed-point type is `ap_fixed<8,2>`, which means 8 bits in total, with 2 of them forming the integer part (including the sign bit) and the remaining 6 bits holding the fractional part. The ReLU function takes the output of the linear layer and applies the ReLU nonlinearity (if the input value is less than 0, the output is set to 0; otherwise it passes through unchanged). Finally, in the top-level function, the linear layer and ReLU stages are composed under a DATAFLOW pragma so they run as a pipeline, with the ReLU stage writing directly to the output stream.
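If it helps to see what this narrow format can actually represent, here is a small standalone snippet (purely illustrative, not part of the kernel) that prints a few values after quantization to `ap_fixed<8,2>`; with 6 fractional bits the representable values run from -2.0 to 1.984375 in steps of 1/64 = 0.015625.

```c
#include <iostream>
#include "ap_fixed.h"

typedef ap_fixed<8,2> fixed_type;

int main() {
    // 8 bits total: 2 integer bits (including sign) and 6 fractional bits
    double samples[6] = {0.1, 0.5, 1.3, 2.5, -0.07, -3.0};
    for(int i = 0; i < 6; i++) {
        fixed_type q = samples[i];  // out-of-range values wrap under the default AP_WRAP mode
        std::cout << samples[i] << " -> " << q.to_double() << std::endl;
    }
    return 0;
}
```

Because the default overflow mode is AP_WRAP, values outside the range wrap around (for example, 2.5 comes back as -1.5), so weights and activations need to be scaled into range, or a saturating variant such as `ap_fixed<8,2,AP_RND,AP_SAT>` can be used instead.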