Skip to content

Basic Examples With Poorly Drawn Diagrams

JulianKemmerer edited this page May 30, 2022 · 22 revisions
Table of Contents

Floating Point Adder

Here is the PipelineC code that describes a pipelined floating point adder:

float main(float x, float y)
{
   return x + y;
}

diagram1

Binary Adder Tree

Here is the PipelineC code that describes a pipelined binary tree summation of 8 values by instantiating 7 floating point adders:

float main(float x0, float x1, float x2, float x3,
           float x4, float x5, float x6, float x7)
{
   // First layer of 4 sums in parallel
   float sum0;
   float sum1;
   float sum2;
   float sum3;
   sum0 = x0 + x1;
   sum1 = x2 + x3;
   sum2 = x4 + x5;
   sum3 = x6 + x7;
   
   // Next layer of two sums in parallel
   float sum4;
   float sum5;
   sum4 = sum0 + sum1;
   sum5 = sum2 + sum3;
   
   // Final layer of a single sum
   float sum6;
   sum6 = sum4 + sum5;
   
   return sum6;
}

diagram2

Low latency, high resource usage matrix multiply

This code describes a pipelined floating point matrix multiplier by instantiating::

  • N^3 floating point multipliers
  • N^2 binary tree summations of N elements ('float_array_sumN') (of which each is is 2^(log2(N)+1)-1 floating point adders )
// Lowest latency, most resource usage
// Resource usage grows O(N^3)

#include "uintN_t.h"
#define N 2
#define float_array_sumN float_array_sum2 // Built in function
#define iter_t uint1_t

typedef struct an_array_t
{
	float a[N][N];
} an_array_t;


an_array_t main(float mat1[N][N], float mat2[N][N])
{
    an_array_t res;
    
    iter_t i;
    iter_t j;
    iter_t k;
    for (i = 0; i < N; i = i + 1) 
    { 
        for (j = 0; j < N; j = j + 1) 
        { 
            float res_k[N];
            for (k = 0; k < N; k = k + 1)
            {
                res_k[k] = mat1[i][k] * mat2[k][j]; 
            }
	    res.a[i][j] =  float_array_sumN(res_k);
        } 
    }
    
    return res;
}

The diagram I tried to hand draw for this was embarrassing. Use your imagination for now.

High latency, low resource usage matrix multiply

This code describes two floating point units, a multiplier and an adder. The multiply+add computation takes M total clock cycles. Every M clock cycles the i,j,k iterators are incremented to unroll the loop. Latency = N^3 * M clock cycles.

// High latency, low resource usage matrix multiply
#include "uintN_t.h"

// Matrix dimension
#define N 2
#define N_MINUS_1 1 // To not need an extra bit in iter_t
#define iter_t uint1_t

// Type holding result matrix
typedef struct result_matrix_t
{
	float mat[N][N];
	uint1_t done;
} result_matrix_t;

// Indicates if volatile stateful variables are valid
volatile uint1_t volatiles_valid;
// Volatile stateful variables
// i,j,k iterators
volatile iter_t i;
volatile iter_t j;
volatile iter_t k;
// Result matrix
volatile float mat[N][N];

// An unrolled version of the i,j,k matrix multiply nested loops
/*
for (i = 0; i < N; i++) 
{ 
	for (j = 0; j < N; j++) 
	{  
		for (k = 0; k < N; k++) 
			res[i][j] += mat1[i][k] * mat2[k][j]; 
	} 
}
*/
// Signal the start of the operation with 'start'
// return value '.done' signals the completion of the operation
// Both input matrices are assumed to have data from start->done
result_matrix_t main(
	 uint1_t start,
	 float mat1[N][N], float mat2[N][N])
{       
	// Reset everything if this is the start
	if(start)
	{
		// Validate the volatiles as being reset to 0
		volatiles_valid = 1;
		// Clear the matrix and iterators
		i = 0;
		j = 0;
		k = 0;
		for (i = 0; i < N; i = i + 1) 
		{ 
			for (j = 0; j < N; j = j + 1) 
			{ 
				mat[i][j] = 0;
			}
		}
	}
	
	// Default return accumulated result and not done yet
	result_matrix_t rv;
	rv.mat = mat;
	rv.done = 0;

	// Accumulate into volatile state matrix
	// (only if the volatile stateful variables have valid data)
	if(volatiles_valid)
	{		
		// This is like unrolling to a 
		// single iteration of the inner most loop
		
		// Do the math for this iteration
		// Two separate floating point operatations
		// Multiply
		float product;
		product = mat1[i][k] * mat2[k][j];
		// Accumulate
		mat[i][j] = mat[i][j] + product;

		// Done?
		if(
			(i == N_MINUS_1) &
			(j == N_MINUS_1) &
			(k == N_MINUS_1)
		)
		{
			// Multiply is done now
			rv.mat = mat;
			rv.done = 1;
			// Invalidate state, wait for start again
			volatiles_valid = 0;
		}
		
		// Increment iterators unless going back to start
		// Increment k every time
		// k
		if(k == N_MINUS_1)
		{
			// End of k loop, reset
			k = 0;
		}
		else
		{
			// Increment k
			k = k + 1;
		}
		
		// j
		// Increment j when k is done
		if(k == 0) // Was N-1
		{
			if(j == N_MINUS_1) 
			{
				// End of j loop, reset
				j = 0;
			}
			else
			{
				// Increment j
				j = j + 1;
			}
		}
		
		// i
		// Increment i when j and k are done
		if(
			(j == 0) & // Was N-1
			(k == 0) // Was N-1
		)
		{
			if (i == N_MINUS_1)
			{
				// End of i loop, reset
				i = 0;
			}
			else
			{
				// Increment i
				i = i + 1;
			}
		}		
			
	}

    return rv;
}

Diagram: TBD