In [1]:
from numpy import binary_repr
import numpy as np
import scipy
import logging
from inst import inst
import random
import sys
import os
import math
import time
from buffer_size import buffer_size
import shutil
from tqdm import tqdm

# python Dram.py Cora 2 2 512 "ram_test_re.txt"    

# Seed every random source the notebook draws from so a fresh
# "Restart & Run All" regenerates identical tiles:
#   - NumPy's global RNG produces the dense tiles (np.random.randint)
#   - the stdlib `random` module produces the sparse tile values (random.randint)
np.random.seed(0)
random.seed(0)

# Module-level debug switch.
DEBUG = False

class data:
    """Serialises integer data into the bit-string format written to the memory image.

    Every integer is rendered with ``numpy.binary_repr(value, 32)`` and the
    resulting string is reversed, so the least-significant bit comes first.
    Per the NumPy documentation, negative values are rendered in two's
    complement, and values that do not fit in 32 bits yield a longer string.
    """

    def __init__(self):
        logging.debug("Data generator created")

    def matrixToBinary(self, A):
        """Return the row-major concatenation of the reversed 32-bit strings of a 2-D input.

        Args:
            A: iterable of rows, each an iterable of integers
               (e.g. a 2-D numpy integer array or a list of lists).

        Returns:
            str: one reversed ``binary_repr(value, 32)`` per element, rows first;
            the empty string when ``A`` has no elements.
        """
        # str.join builds the result in a single pass instead of growing a string
        # element by element.
        return ''.join(binary_repr(value, 32)[::-1] for row in A for value in row)

    def arrayToBinary(self, A):
        """Return the concatenation of the reversed 32-bit strings of a 1-D input.

        Args:
            A: iterable of integers.

        Returns:
            str: one reversed ``binary_repr(value, 32)`` per element, in order;
            the empty string when ``A`` is empty.
        """
        return ''.join(binary_repr(value, 32)[::-1] for value in A)
    


    # def randSP(self,spDim, denDim):
    #     I = np.random.randint(2, size = spDim)
    #     print(f"\nSparse matrix is = \n{I}")
    #     Sparse = csr_matrix(I)
    #     val = Sparse.data
    #     print(f"\nVal is =\n {val}")
    #     col = Sparse.indices
    #     print(f"\nCol_Idx  is = \n{col}")
    #     row = Sparse.indptr
    #     print(f"\nRow_ptr is = \n{row}")
    #     W = np.random.randint(100, size = denDim)
    #     print(f"\nDense matrix is = \n{W}")
    #     O = np.matmul(I,W)
    #     print(f"\nOutput matrix is = \n{O}")
    #     return ((val,col,row), I, W, O)

In [2]:
# bench_name = 'NELL'
# dense_col = 32
# tile_dense_col = 8
# tile_spRow = 16384
# tile_spCol = 1024
# nPEs = 16
# tile = True

# Run configuration. Every value below is read by the later cells.
bench_name = 'NELL_x2'       # benchmark name; selects sample_graphs/<bench_name>_rowptr.txt / _colidx.txt and names the output folders
dense_col = 186              # column count of the dense matrix; padded up to a multiple of tile_dense_col in the next cell
tile_dense_col = 8           # columns per dense tile (and per output tile)
tile_spRow = 14392           # rows per sparse tile; clamped to the number of sparse rows in the next cell
tile_spCol = 32              # columns per sparse tile, which is also the row count of each dense tile
nPEs = 16                    # number of PEs in the accelerator
nDenseGroups = 5             # number of dense tiles multiplied at once per computation
tile = True                  # when True, the sparse matrix is split into tiles and written to <bench_name>_tiles/
calculateCycles = False      # when True, the generation loop also writes cycle estimates to <bench_name>_results/
generateInstructions = True  # when True, the generation loop computes reference outputs and emits instructions


In [3]:

# sup_bens = {}
# sup_bens_f = open("ben_dim.txt")
# for ben in sup_bens_f:
#     (key, val) = ben.split()
#     sup_bens[key] = val
# if bench_name in sup_bens:
#     dense_row = int(sup_bens[bench_name])
# else:
#     print("Bench is not supported yet.")
#     exit()

# Pad dense columns to a multiple of the tile columns
# Pad dense columns to a multiple of the tile columns
if dense_col % tile_dense_col != 0:
    dense_col = dense_col + tile_dense_col - dense_col%tile_dense_col

rowptr_path = "sample_graphs"+"/"+bench_name+"_rowptr.txt"
colidx_path = "sample_graphs"+"/"+bench_name+"_colidx.txt"

# Scan the column-index file (one integer per line) for the largest column
# index and the number of stored values. The file is streamed line by line so
# the whole list of lines is never held in memory, and each line is parsed once.
maxCol = 0
numValues = 0
with open(colidx_path,'r') as colF:
    for line in colF:
        colValue = int(line)
        if colValue > maxCol:
            maxCol = colValue
        numValues = numValues + 1
numSpCols = maxCol + 1
dense_row = numSpCols   # the dense operand has one row per sparse column

# The last line of the row-pointer file holds the total number of nonzeros.
# Seek backwards from the end of the file to the start of that line.
with open(rowptr_path, 'rb') as f:
    try:  # catch OSError in case of a one line file
        f.seek(-2, os.SEEK_END)
        while f.read(1) != b'\n':
            f.seek(-2, os.SEEK_CUR)
    except OSError:
        f.seek(0)
    nnz = int(f.readline().decode())

# Count the sparse rows as (number of newlines - 1), reading in 1 MiB chunks.
# The handle is opened in a `with` block so it is closed once the count is done.
numSpRows = -1
buf_size = 1024 * 1024
with open(rowptr_path, 'rb') as f:
    read_f = f.raw.read
    buf = read_f(buf_size)
    while buf:
        numSpRows += buf.count(b'\n')
        buf = read_f(buf_size)

# A tile can never be larger than the matrix it is cut from.
if tile_spRow > numSpRows:
    tile_spRow = numSpRows
if tile_spCol > dense_row:
    tile_spCol = dense_row

# Tile grid of the sparse matrix
spTilesPerRow = math.ceil(numSpCols/tile_spCol)
spTilesPerCol = math.ceil(numSpRows/tile_spRow)
numSpTiles = spTilesPerRow*spTilesPerCol

# Tile grid of the dense matrix (its rows are tiled like the sparse columns)
denTilesPerRow = math.ceil(dense_col/tile_dense_col)
denTilesPerCol = math.ceil(dense_row/tile_spCol)
numDenTiles = denTilesPerCol*denTilesPerRow

# Multiple dense tiles are multiplied at once
adjustedDenTilesPerRow = math.ceil(denTilesPerRow/nDenseGroups)

print('num sparse tiles = ' + str(numSpTiles))
print('num nonzero elements  = ' + str(nnz))
print('num sparse rows  = ' + str(numSpRows))
print('num sparse cols  = ' + str(numSpCols))
print('num dense tiles = ' + str(numDenTiles))
print('denTilePerRow = ' + str(denTilesPerRow))
print('Adjusted denTilePerRow = ' + str(adjustedDenTilesPerRow))
print('denTilePerCol = ' + str(denTilesPerCol))
print('spTilesPerCol = ' + str(spTilesPerCol))
print('spTilesPerRow = ' + str(spTilesPerRow))

num sparse tiles = 10
num nonzero elements  = 3635988
num sparse rows  = 65755
num sparse cols  = 64
num dense tiles = 48
denTilePerRow = 24
Adjusted denTilePerRow = 5
denTilePerCol = 2
spTilesPerCol = 5
spTilesPerRow = 2


In [4]:
# ---------------------------------------------------------------------------
# Tile the sparse matrix and write each tile to the memory image.
#
# Memory layout: the instruction region comes first, then the data region
# starting at data_start_addr. For each sparse tile the image holds
# row pointers, column indices, values (in that order), each padded to a
# 64-byte boundary.
# ---------------------------------------------------------------------------

block_size = 32                                  # bits per stored word (matches the 32-bit strings produced by `data`)
block_size_bytes = int(block_size/8)
num_multiplications = numSpTiles * math.ceil(dense_col/tile_dense_col)
max_inst = 7*num_multiplications                 # instruction budget: 7 per tile multiplication
max_inst_bits = max_inst * 256                   # 256 bits are reserved per instruction
data_start_addr = int(max_inst_bits/8)+1024
if data_start_addr % 64 != 0:
    data_start_addr = data_start_addr + 64-(data_start_addr%64)


def _appendAligned(outF, bits, addr, alignment=64):
    """Write `bits` at byte address `addr`, zero-pad to `alignment`, return the next free address.

    Args:
        outF: text file receiving the '0'/'1' characters.
        bits: bit string to write; 8 characters represent one byte.
        addr: byte address at which `bits` starts.
        alignment: byte boundary the returned address is rounded up to.

    Returns:
        int: first byte address after the data and its zero padding.
    """
    outF.write(bits)
    addr = addr + len(bits)//8
    padBytes = -addr % alignment
    if padBytes:
        outF.write('0'*8*padBytes)
    return addr + padBytes


# Store start addresses for each tile
rowPtrStart = [0]*numSpTiles
colIdxStart = [0]*numSpTiles
valStart = [0]*numSpTiles
currAddr = data_start_addr
maxSparseTileAddrRange = 0   # largest (row ptr start -> end of values) span of any tile, in bytes
tileNum = 0
maxValues = 0                # largest padded value count of any tile

os.makedirs(bench_name+'_tiles', exist_ok=True)
os.makedirs(bench_name+'_results', exist_ok=True)

# dataF is intentionally left open: the following cells keep appending the
# dense tiles and the expected outputs to it and close it when they are done.
dataF = open(bench_name+'_tiles/data.txt', 'w')
dataGen = data()

# The input files and the per-tile statistics file are only needed in this
# cell, so they are closed as soon as the tiling is finished.
with open(rowptr_path,'r') as rowF, open(colidx_path,'r') as colF, \
        open(bench_name+'_results/nnz.csv','w') as tileNNZf:
    tileNNZf.write('Tile,Number of Values,Number of Values After Padding\n')

    rowPtrEnd = int(rowF.readline())

    if tile:
        # Every tile's value count is padded to a multiple of this
        padUnit = tile_dense_col*nPEs

        # Tile the sparse matrix, one row of tiles (spTilesPerRow tiles) per iteration
        while rowPtrEnd < numValues:
            # One CSR triple per tile in the current row of tiles
            tileCols = [ [] for _ in range(spTilesPerRow) ]
            tileVals = [ [] for _ in range(spTilesPerRow) ]
            tilePtrs = [ [0] for _ in range(spTilesPerRow) ]

            for i in range(tile_spRow):
                if rowPtrEnd >= numValues:
                    break
                rowPtrEndPrev = rowPtrEnd
                rowPtrEnd = int(rowF.readline())

                # Route every value of this matrix row to the tile owning its column
                for j in range(rowPtrEnd-rowPtrEndPrev):
                    colidx = int(colF.readline())
                    tileCols[colidx//tile_spCol].append(colidx%tile_spCol)
                    tileVals[colidx//tile_spCol].append(random.randint(0,255))

                for j in range(spTilesPerRow):
                    tilePtrs[j].append(len(tileCols[j]))

            # Pad with zeros so that the nnz per PE is a multiple of the number of dense columns.
            # The zeros go to the front of the tile, so every row pointer except the
            # leading 0 is shifted by the padding amount.
            for i in range(spTilesPerRow):
                if len(tileCols[i]) == 0:
                    padding = padUnit
                else:
                    padding = -len(tileCols[i]) % padUnit

                if padding:
                    tileCols[i] = [0]*padding + tileCols[i]
                    tileVals[i] = [0]*padding + tileVals[i]
                    tilePtrs[i] = tilePtrs[i][:1] + [ptr + padding for ptr in tilePtrs[i][1:]]

            # Pad tile row ptrs to be the correct size
            # NOTE(review): the CSV header says "Number of Values", but the two numbers
            # written are the row-pointer list lengths before/after padding — confirm
            # which statistic is intended.
            for i in range(spTilesPerRow):
                tileNNZf.write(str(tileNum+i)+','+str(len(tilePtrs[i]))+',')
                while len(tilePtrs[i]) < tile_spRow + 1:
                    tilePtrs[i].append(tilePtrs[i][-1])
                tileNNZf.write(str(len(tilePtrs[i]))+'\n')

            # Loop-invariant for the write loop below, so it is computed once per row of tiles
            maxValues = max(maxValues, max(len(vals) for vals in tileVals))

            for i in range(spTilesPerRow):
                tileIdx = tileNum + i

                np.savetxt((bench_name+'_tiles/'+'colIdx_tile_'+str(tileIdx)+'.txt'),tileCols[i])
                np.savetxt((bench_name+'_tiles/'+'rowPtr_tile_'+str(tileIdx)+'.txt'),tilePtrs[i])
                np.savetxt((bench_name+'_tiles/'+'val_tile_'+str(tileIdx)+'.txt'),tileVals[i])

                rowBin = dataGen.arrayToBinary(tilePtrs[i])
                colBin = dataGen.arrayToBinary(tileCols[i])
                valBin = dataGen.arrayToBinary(tileVals[i])

                rowPtrStart[tileIdx] = currAddr
                currAddr = _appendAligned(dataF, rowBin, currAddr)

                colIdxStart[tileIdx] = currAddr
                currAddr = _appendAligned(dataF, colBin, currAddr)

                valStart[tileIdx] = currAddr
                currAddr = _appendAligned(dataF, valBin, currAddr)

                # Span of THIS tile: the start addresses must be indexed with the
                # global tile number, not the position inside the current row of tiles.
                maxSparseTileAddrRange = max(maxSparseTileAddrRange,
                                             valStart[tileIdx]+len(valBin)//8-rowPtrStart[tileIdx])

            tileNum = tileNum + spTilesPerRow

for i in colIdxStart:
    print(i)

DEBUG:root:Data generator created


112384
3353600
6596864
9837056
13078272
16322560
19562752
22803968
26044160
27912192


In [5]:
# Generate the tiled dense matrix
# Build the dense operand tile by tile: draw a random (tile_spCol x tile_dense_col)
# tile, keep a text copy for the reference computation, append its bit image to
# the memory image and remember the byte address where it starts.
denseTileAddr = []
for i in range(numDenTiles):
    denseTileAddr.append(currAddr)

    denseTile = np.random.randint(255, size = (tile_spCol, tile_dense_col))
    np.savetxt((bench_name+'_tiles/'+'den_tile_'+str(i)+'.txt'),denseTile)

    denBin = dataGen.matrixToBinary(denseTile)
    dataF.write(denBin)
    currAddr += len(denBin)//8

    # Zero-fill up to the next 64-byte boundary
    tailPad = -currAddr % 64
    if tailPad:
        dataF.write('0'*8*tailPad)
        currAddr += tailPad

# The dense SRAM offset is the largest sparse-tile span rounded up to a multiple of 64
denSramAddr = maxSparseTileAddrRange + (-maxSparseTileAddrRange % 64)

# Address right after the last dense tile
outputAddr = currAddr

In [6]:
# ---------------------------------------------------------------------------
# State for the instruction / reference-output generation loop.
# ---------------------------------------------------------------------------
instGen = inst()

# Tile indices used by the most recently emitted computation (-1 = none yet).
# The loop compares against them to decide which loads to emit.
spTileLoaded = -1
denTileLoaded = -1
outTileLoaded = -1
pSumLoaded = True

instructions = open(bench_name+'_tiles/'+'instructions.txt','w')

instCount = 0          # number of instructions emitted so far
numComputations = 0    # number of (sparse tile, dense tile group) multiplications processed

totalCompute = 0       # running sums of the per-computation cycle estimates
totalAgg = 0

# Generate the instructions and the outputs
numOutputTiles = spTilesPerCol*denTilesPerRow
partialSumOutputNum = [-1]*numOutputTiles        # per output tile: index of the latest output_matrix_<n>.txt, -1 if none
partialSumAddr = [currAddr]*numOutputTiles       # per output tile: address of its latest output in the memory image
psumSramAddr = denSramAddr + tile_dense_col*tile_spCol*block_size_bytes   # directly after one dense tile
if psumSramAddr % 64 != 0:
    psumSramAddr = psumSramAddr + 64-(psumSramAddr%64)

if calculateCycles:
    cycleF = open(bench_name+'_results/cycles.csv','w')
    cycleF.write("Computation #,Compute Cycles,Combine Cycles\n")
    emptyRowF = open(bench_name+'_results/empty_row_counts.csv','w')
    # One header column per PE; follows nPEs instead of a fixed 16 columns
    emptyRowF.write(','.join('PE' + str(pe) for pe in range(nPEs)) + '\n')
    print("Computation #\tCompute Cycles\tCombine Cycles\n")

# Main generation loop. One iteration = one "computation": a sparse tile multiplied
# with a group of up to nDenseGroups dense tiles. Depending on the flags, each
# iteration (a) computes and stores the reference outputs and emits the
# load / spMM / store instructions, and/or (b) estimates the cycle counts.
for k in range(adjustedDenTilesPerRow): # dense tile col idx
    for j in (range(spTilesPerRow)): # sparse tile col idx, dense tile row idx
        for i in (range(spTilesPerCol)): # sparse tile row idx

            # Linear tile indices (row-major within their respective tile grids)
            # NOTE(review): k is used unscaled here while the g-loops below add
            # g in [0, nDenseGroups) on top of it, so consecutive k values address
            # overlapping dense/output tiles and columns beyond
            # adjustedDenTilesPerRow + nDenseGroups - 2 are never reached.
            # A stride of nDenseGroups may have been intended — confirm.
            spTile = i*spTilesPerRow + j
            denTile = j*denTilesPerRow + k
            outTile = i*denTilesPerRow + k
            # print('spTile = ' + str(spTile))
            # print('rowPtrStart[spTile] = ' + str(rowPtrStart[spTile]))


#             print('Computation #' + str(numComputations))
#             print('i,j,k = ' + str(i)+','+str(j)+','+str(k))
#             print('Sparse Tile = ' + str(spTile))
#             print('Dense Tile = ' + str(denTile))
#             print('Output Tile = ' + str(outTile) + '\n')

            # Reload the CSR arrays of the sparse tile from the text files written by the tiling cell
            spRow = np.loadtxt(bench_name+'_tiles/'+'rowPtr_tile_'+str(spTile)+'.txt').astype(np.int64)
            spCol = np.loadtxt(bench_name+'_tiles/'+'colIdx_tile_'+str(spTile)+'.txt').astype(np.int64)
            spVal = np.loadtxt(bench_name+'_tiles/'+'val_tile_'+str(spTile)+'.txt').astype(np.int64)
            
            if generateInstructions:
                # Rebuild the tile as a (tile_spRow x tile_spCol) CSR array and densify it
                # for the reference matrix multiplication
                sparse = scipy.sparse.csr_array((spVal,spCol,spRow),(tile_spRow, tile_spCol))
                sparseAsDense = sparse.todense()

                # Reference result for every dense tile of the group
                for g in range(nDenseGroups):
                    
                    # NOTE(review): unlike the load loops further down, there is no
                    # (g + denTile) < numDenTiles check here — confirm den_tile_<denTile+g>.txt
                    # always exists and that outTile stays inside partialSumOutputNum.
                    outTile = i*denTilesPerRow + k + g
                    den = np.loadtxt(bench_name+'_tiles/'+'den_tile_'+str(denTile+g)+'.txt').astype(np.int64)
                    outputMatrix = np.matmul(sparseAsDense,den)
                    # Accumulate onto the previously stored result of this output tile, if any
                    if partialSumOutputNum[outTile] != -1:
                        partialSumMatrix = np.loadtxt(bench_name+'_results/'+'output_matrix_'+str(partialSumOutputNum[outTile])+'.txt').astype(np.int64)
                        outputMatrix = outputMatrix + partialSumMatrix

                    np.savetxt((bench_name+'_results/'+'output_matrix_'+str(numComputations*nDenseGroups+g)+'.txt'), outputMatrix)

                    # Remember which file / address now holds the latest result for this output tile
                    partialSumOutputNum[outTile] = numComputations*nDenseGroups+g
                    partialSumAddr[outTile] = currAddr


                    # Append the expected output to the memory image, padded to 64 bytes
                    outBin = dataGen.matrixToBinary(outputMatrix)
                    dataF.write(outBin)
                    currAddr = currAddr + int(len(outBin)/8)
                    while currAddr % 64 != 0:
                        currAddr = currAddr + 1
                        dataF.write('0'*8)
                    
                    
                # np.savetxt((bench_name+'_results/'+'sparse_matrix_as_dense_'+str(spTile)+'.txt'), sparseAsDense)

                # NOTE(review): at this point outTile is the value left by the last g
                # iteration, and the g-loop above has just assigned a non-negative number
                # to partialSumOutputNum[outTile]; so with nDenseGroups >= 1 this expression
                # is always False and the psum loads below are always emitted. The same
                # applies to add_partial_sum in the spMM call. Confirm this is intended.
                pSumLoaded = partialSumOutputNum[outTile] == -1
#                 pSumLoaded = (outTile == outTileLoaded) or (partialSumOutputNum[outTile] == -1)

                instr=''

                # Load the sparse tile unless the previous computation used the same one.
                # SRAM offsets are the DRAM offsets relative to the tile's row-pointer start.
                if spTile != spTileLoaded:
                    instr = instr + instGen.load(xsize = spRow.size, id = 'row', dram_offset = rowPtrStart[spTile], sram_offset = 0)
                    instr = instr + instGen.load(xsize = spCol.size, id = 'col', dram_offset = colIdxStart[spTile], sram_offset = colIdxStart[spTile]-rowPtrStart[spTile])
                    instr = instr + instGen.load(xsize = spVal.size, id = 'val', dram_offset = valStart[spTile], sram_offset = valStart[spTile]-rowPtrStart[spTile], final_load = int((denTile == denTileLoaded) and (pSumLoaded)))
                    instCount = instCount + 3

                # Load the dense tiles of the group unless the same group start is already loaded
                if denTile != denTileLoaded:
                    
                    for g in range(nDenseGroups):
                        if (g + denTile) < numDenTiles:
                            lastDenTile = (g == nDenseGroups - 1) or (g + denTile == numDenTiles-1)
                            instr = instr + instGen.load(xsize = den.size, id = 'den', dram_offset = denseTileAddr[denTile+g], sram_offset = 0, final_load = pSumLoaded and lastDenTile, denGroup = g)
                            instCount = instCount + 1

                # Load the partial sums for every dense tile of the group
                # NOTE(review): outTile already includes the last g of the reference loop,
                # so partialSumAddr[outTile+g] is offset by nDenseGroups-1 relative to
                # i*denTilesPerRow + k + g — confirm.
                if not pSumLoaded: 
                    for g in range(nDenseGroups):
                        if (g + denTile) < numDenTiles:
                            lastTile = (g == nDenseGroups - 1) or (g + denTile == numDenTiles-1)
                            instr = instr + instGen.load(xsize = outputMatrix.size, id = 'psum', dram_offset = partialSumAddr[outTile+g], sram_offset = 0, final_load = lastTile, denGroup = g)
                            instCount = instCount + 1

                # One spMM followed by one store per computation; the store always targets
                # outputAddr (the address right after the dense tiles).
                instr = instr + instGen.spMM(sram_offset_col = colIdxStart[spTile]-rowPtrStart[spTile], sram_offset_ptr = 0, sram_offset_den = denSramAddr, sram_offset_val = valStart[spTile]-rowPtrStart[spTile], den_size = den.size, col_size = spCol.size, row_size = spRow.size, pr_valid = 1, sram_offset_partial_sum = psumSramAddr, add_partial_sum = (partialSumOutputNum[outTile] != -1), scratchpad_n_global_buffer = (outTile == outTileLoaded), nnz_per_group = int(spCol.size/nPEs), dense_loaded = (denTile == denTileLoaded))
                instr = instr + instGen.store(xsize = outputMatrix.size, dram_offset = outputAddr, sram_offset = 0)
                instCount = instCount + 2

                instructions.write(instr)

        


                # Remember what this computation left loaded for the next iteration
                spTileLoaded = spTile
                denTileLoaded = denTile
                outTileLoaded = outTile
            
            if calculateCycles:
                
                # Cycle estimate for this computation. The tile's values are split into
                # nPEs equal chunks of nnzPerPE values; breakPoint is the row-pointer value
                # at which the current chunk ends.
                # NOTE(review): the queue/stall/aggregation model below presumably mirrors
                # the PE pipeline of the accelerator — verify against the hardware design.
                emptyRowCounter = 0
                maxEmptyRow = 0
                nnzPerPE = int(len(spVal)/nPEs)
                breakPoint = nnzPerPE
                prevBreakPoint = 0
                cyclesToFillPipeline = 7
                prevRowWasBreakPoint = False
                lastWasPR = False
                queueDepth = 0
                stallCycles = 0
                maxStallCycles = 0
                lastIterationWasPrBreak = False
                
                aggCycles = 2*nPEs-1
                
                
                
                # Walk the row pointers; spRow[m] - spRow[m-1] is the nnz count of row m-1
                for m in range(1,spRow.size):
                    
                    # NOTE(review): lastIterationWasPrBreak is reset to False at the end of
                    # every iteration, including the one that sets it to True, so this
                    # branch is never taken as written — confirm.
                    if lastIterationWasPrBreak:
                        queueDepth = (spRow[m] - prevBreakPoint)*2
                    else:
                        if queueDepth == 0:
                            queueDepth = (spRow[m] - spRow[m-1])*2
                        else:
                            queueDepth = queueDepth - 1 + (spRow[m] - spRow[m-1])*2
                            
                        if queueDepth == 0:
                            stallCycles = stallCycles + 1
                    
                    # Row ends exactly on a chunk boundary (but not on the very last value)
                    if spRow[m] == breakPoint and breakPoint != len(spVal):
                        if lastWasPR:
                            aggCycles = aggCycles - 2
                        else:
                            aggCycles = aggCycles - 1
                        maxEmptyRow = max(maxEmptyRow,emptyRowCounter)
                        
                        emptyRowF.write(str(emptyRowCounter)+',')
                        emptyRowCounter = 0
                        
                        # D1 latency
                        if m+1 < spRow.size and spRow[m+1] < breakPoint:
                            if m+2 < spRow.size and spRow[m+2] < breakPoint:
                                emptyRowCounter = 2
                            else:
                                emptyRowCounter = 1
                        prevBreakPoint = breakPoint
                        breakPoint = breakPoint + nnzPerPE
                        
                        maxStallCycles = max(maxStallCycles, stallCycles)
                        stallCycles = 0
                        queueDepth = 0
                        
                        lastWasPR = False
                        
                    # Row crosses the chunk boundary
                    elif spRow[m] > breakPoint:
                        maxEmptyRow = max(maxEmptyRow,emptyRowCounter)
                        emptyRowF.write(str(emptyRowCounter)+',')
                        emptyRowCounter = 0
                        prevBreakPoint = breakPoint
                        breakPoint = breakPoint + nnzPerPE
                            
                        # The row also crosses the following chunk boundary
                        if (spRow[m] > breakPoint):
                            aggCycles = aggCycles - 1
                            
                        lastWasPR = True
                        lastIterationWasPrBreak = True
                        
                        maxStallCycles = max(maxStallCycles, stallCycles)
                        stallCycles = 0
                        queueDepth = 0
                    
                    # Equal consecutive row pointers mean the row holds no values
                    if spRow[m] == spRow[m-1]:
                        emptyRowCounter = emptyRowCounter + 1
                    
                    
                    lastIterationWasPrBreak = False
                # Flush the counter of the final chunk and end this computation's CSV row
                emptyRowF.write(str(emptyRowCounter)+',')
                emptyRowF.write('\n')
                        
    
                    
    
                
                maxEmptyRow = max(maxEmptyRow,emptyRowCounter)
                
                # Pipeline fill + 2 cycles per value of a chunk + the worst stall count of any chunk
                computeCycles = cyclesToFillPipeline+nnzPerPE*2+maxStallCycles
                
                cycleF.write(str(numComputations+1)+','+str(computeCycles)+','+str(aggCycles)+'\n')
                print(str(numComputations+1)+'\t'+str(computeCycles)+',\t'+str(aggCycles))
            
                totalCompute = totalCompute + computeCycles
                totalAgg = totalAgg + aggCycles

            numComputations = numComputations + 1

# Size the on-chip buffers for this tiling configuration and write the report
# to a CSV named after the configuration.
buffSize = buffer_size()
(globalSize,localSize) = buffSize.set(tile_spRow,tile_spCol,tile_dense_col,maxValues,nPEs)
buffSize.create_csv(bench_name+'_'+str(dense_col)+'_'+str(tile_dense_col)+'_'+str(tile_spRow)+'_'+str(tile_spCol)+'_'+str(nPEs)+'.csv')
buffSize.report_csv()
del buffSize

if calculateCycles:
    # Totals of the per-computation estimates, followed by the buffer sizes.
    # NOTE(review): dividing by 2**23 yields MB only if the sizes are in bits — confirm
    # against buffer_size.set().
    cycleF.write('Total,'+str(totalCompute)+','+str(totalAgg)+'\n')
    cycleF.write('\nGlobal Buffer Size (MB),Local Buffer Size (MB)\n')
    cycleF.write(str(globalSize/(2**23))+','+str(localSize/(2**23))+'\n')
    cycleF.close()
    # The empty-row statistics file is opened together with cycleF; close it here as
    # well so its contents are flushed to disk.
    emptyRowF.close()
    print('Total\t'+str(totalCompute)+'\t'+str(totalAgg))
    print('\nGlobal Buffer Size (MB)\t\tLocal Buffer Size (MB)')
    print(str(globalSize/(2**23))+'\t\t'+str(localSize/(2**23)))

dataF.close()

# Disabled: padding of the instruction stream up to data_start_addr.
# If this is re-enabled it must run before instructions.close() below.
# if instCount%2 != 0:
#     instCount = instCount + 1
#     instructions.write('0'*256)

# print('\nInstruction Count = ' + str(instCount))

# for i in range(data_start_addr-instCount*32):
#     instructions.write('0'*8)

# Close the instruction file so everything written by the generation loop is
# flushed to instructions.txt before anything reads it back.
instructions.close()

DEBUG:root:Instruction generator created


In [7]:
# Save the output metadata
# Values written to metaData.txt, one per line, in this fixed order
metaDataFields = [
    outputAddr,        # Where to store the result data for comparison
    tile_spRow,        # Number of rows in the output matrices
    tile_dense_col,    # Number of columns in the output matrices
    nPEs,              # Number of PEs in the accelerator
    numComputations,   # Number of multiplications
    instCount,         # Number of instructions
]
with open(bench_name+'_results/'+'metaData.txt','w') as metaDataF:
    metaDataF.write(''.join(str(field)+'\n' for field in metaDataFields))

# # Merge RAM files
# with open(bench_name+'_results/'+'ram.txt','wb') as wfd:
#     for f in [bench_name+'_tiles/instructions.txt',bench_name+'_tiles/data.txt']:
#         with open(f,'rb') as fd:
#             shutil.copyfileobj(fd, wfd)
