Basic Examples With Poorly Drawn Diagrams

Floating Point Adder

Here is the PipelineC code that describes a pipelined floating point adder:

float main(float x, float y)
{
   return x + y;
}

diagram1

Binary Adder Tree

Here is the PipelineC code that describes a pipelined binary tree summation of 8 values by instantiating 7 floating point adders:

float main(float x0, float x1, float x2, float x3,
           float x4, float x5, float x6, float x7)
{
   // First layer of 4 sums in parallel
   float sum0;
   float sum1;
   float sum2;
   float sum3;
   sum0 = x0 + x1;
   sum1 = x2 + x3;
   sum2 = x4 + x5;
   sum3 = x6 + x7;
   
   // Next layer of two sums in parallel
   float sum4;
   float sum5;
   sum4 = sum0 + sum1;
   sum5 = sum2 + sum3;
   
   // Final layer of a single sum
   float sum6;
   sum6 = sum4 + sum5;
   
   return sum6;
}

diagram2

Low latency, high resource usage matrix multiply

This code describes a pipelined floating point matrix multiplier by instantiating::

N^3 floating point multipliers
N^2 binary tree summations of N elements ('float_array_sumN') (of which each is is 2^(log2(N)+1)-1 floating point adders )

// Lowest latency, most resource usage
// Resource usage grows O(N^3)

#include "uintN_t.h"
#define N 2
#define float_array_sumN float_array_sum2 // Built in function
#define iter_t uint1_t

typedef struct an_array_t
{
	float a[N][N];
} an_array_t;


an_array_t main(float mat1[N][N], float mat2[N][N])
{
    an_array_t res;
    
    iter_t i;
    iter_t j;
    iter_t k;
    for (i = 0; i < N; i = i + 1) 
    { 
        for (j = 0; j < N; j = j + 1) 
        { 
            float res_k[N];
            for (k = 0; k < N; k = k + 1)
            {
                res_k[k] = mat1[i][k] * mat2[k][j]; 
            }
	    res.a[i][j] =  float_array_sumN(res_k);
        } 
    }
    
    return res;
}

The diagram I tried to hand draw for this was embarrassing. Use your imagination for now.

High latency, low resource usage matrix multiply

This code describes two floating point units, a multiplier and an adder. The multiply+add computation takes M total clock cycles. Every M clock cycles the i,j,k iterators are incremented to unroll the loop. Latency = N^3 * M clock cycles.

// High latency, low resource usage matrix multiply
#include "uintN_t.h"

// Matrix dimension
#define N 2
#define N_MINUS_1 1 // To not need an extra bit in iter_t
#define iter_t uint1_t

// Type holding result matrix
typedef struct result_matrix_t
{
	float mat[N][N];
	uint1_t done;
} result_matrix_t;

// Indicates if volatile stateful variables are valid
volatile uint1_t volatiles_valid;
// Volatile stateful variables
// i,j,k iterators
volatile iter_t i;
volatile iter_t j;
volatile iter_t k;
// Result matrix
volatile float mat[N][N];

// An unrolled version of the i,j,k matrix multiply nested loops
/*
for (i = 0; i < N; i++) 
{ 
	for (j = 0; j < N; j++) 
	{  
		for (k = 0; k < N; k++) 
			res[i][j] += mat1[i][k] * mat2[k][j]; 
	} 
}
*/
// Signal the start of the operation with 'start'
// return value '.done' signals the completion of the operation
// Both input matrices are assumed to have data from start->done
result_matrix_t main(
	 uint1_t start,
	 float mat1[N][N], float mat2[N][N])
{       
	// Reset everything if this is the start
	if(start)
	{
		// Validate the volatiles as being reset to 0
		volatiles_valid = 1;
		// Clear the matrix and iterators
		i = 0;
		j = 0;
		k = 0;
		for (i = 0; i < N; i = i + 1) 
		{ 
			for (j = 0; j < N; j = j + 1) 
			{ 
				mat[i][j] = 0;
			}
		}
	}
	
	// Default return accumulated result and not done yet
	result_matrix_t rv;
	rv.mat = mat;
	rv.done = 0;

	// Accumulate into volatile state matrix
	// (only if the volatile stateful variables have valid data)
	if(volatiles_valid)
	{		
		// This is like unrolling to a 
		// single iteration of the inner most loop
		
		// Do the math for this iteration
		// Two separate floating point operatations
		// Multiply
		float product;
		product = mat1[i][k] * mat2[k][j];
		// Accumulate
		mat[i][j] = mat[i][j] + product;

		// Done?
		if(
			(i == N_MINUS_1) &
			(j == N_MINUS_1) &
			(k == N_MINUS_1)
		)
		{
			// Multiply is done now
			rv.mat = mat;
			rv.done = 1;
			// Invalidate state, wait for start again
			volatiles_valid = 0;
		}
		
		// Increment iterators unless going back to start
		// Increment k every time
		// k
		if(k == N_MINUS_1)
		{
			// End of k loop, reset
			k = 0;
		}
		else
		{
			// Increment k
			k = k + 1;
		}
		
		// j
		// Increment j when k is done
		if(k == 0) // Was N-1
		{
			if(j == N_MINUS_1) 
			{
				// End of j loop, reset
				j = 0;
			}
			else
			{
				// Increment j
				j = j + 1;
			}
		}
		
		// i
		// Increment i when j and k are done
		if(
			(j == 0) & // Was N-1
			(k == 0) // Was N-1
		)
		{
			if (i == N_MINUS_1)
			{
				// End of i loop, reset
				i = 0;
			}
			else
			{
				// Increment i
				i = i + 1;
			}
		}		
			
	}

    return rv;
}

Diagram: TBD

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Basic Examples With Poorly Drawn Diagrams

Table of Contents

Floating Point Adder

Binary Adder Tree

Low latency, high resource usage matrix multiply

High latency, low resource usage matrix multiply

Clone this wiki locally