From 8b6c8bf6c729964162f1880e1c62ee2781f8e262 Mon Sep 17 00:00:00 2001 From: Steven Dalton Date: Thu, 11 Apr 2013 23:06:27 -0500 Subject: [PATCH 1/2] Ran dos2unix --- test/test_coo_spmv.cu | 2944 ++++++++++++++++----------------- test/test_coo_spmv_double.cu | 2954 +++++++++++++++++----------------- 2 files changed, 2949 insertions(+), 2949 deletions(-) diff --git a/test/test_coo_spmv.cu b/test/test_coo_spmv.cu index ad28957bac..24f0573a8c 100644 --- a/test/test_coo_spmv.cu +++ b/test/test_coo_spmv.cu @@ -1,1477 +1,1477 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/****************************************************************************** - * Experimental reduce-value-by-row COO implementation of SPMV - ******************************************************************************/ - -// Ensure printing of CUDA runtime errors to console -#define CUB_STDERR - -#include -#include -#include -#include -#include - -#include - -#include "test_util.h" -#include "cub.cuh" - -using namespace cub; -using namespace std; - - -//--------------------------------------------------------------------- -// Globals, constants and typedefs -//--------------------------------------------------------------------- - -bool g_verbose = false; -int g_iterations = 1; - - -//--------------------------------------------------------------------- -// Graph building types and utilities -//--------------------------------------------------------------------- - -/** +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Experimental reduce-value-by-row COO implementation of SPMV + ******************************************************************************/ + +// Ensure printing of CUDA runtime errors to console +#define CUB_STDERR + +#include +#include +#include +#include +#include + +#include + +#include "test_util.h" +#include "cub.cuh" + +using namespace cub; +using namespace std; + + +//--------------------------------------------------------------------- +// Globals, constants and typedefs +//--------------------------------------------------------------------- + +bool g_verbose = false; +int g_iterations = 1; + + +//--------------------------------------------------------------------- +// Graph building types and utilities +//--------------------------------------------------------------------- + +/** * COO graph type. A COO graph is just a vector of edge tuples. - */ -template -struct CooGraph -{ - /** + */ +template +struct CooGraph +{ + /** * COO edge tuple. (A COO graph is just a vector of these.) - */ - struct CooTuple - { - VertexId row; - VertexId col; - Value val; - - CooTuple() {} - CooTuple(VertexId row, VertexId col) : row(row), col(col) {} - CooTuple(VertexId row, VertexId col, Value val) : row(row), col(col), val(val) {} - }; - - /** - * Comparator for sorting COO sparse format edges - */ - static bool CooTupleCompare (const CooTuple &elem1, const CooTuple &elem2) - { - if (elem1.row < elem2.row) - { - return true; - } - else if ((elem1.row == elem2.row) && (elem1.col < elem2.col)) - { - return true; - } - - return false; - } - - - /** - * Fields - */ - int row_dim; // Num rows - int col_dim; // Num cols - vector coo_tuples; // Non-zero entries - - - /** - * CooGraph ostream operator - */ - friend std::ostream& operator<<(std::ostream& os, const CooGraph& coo_graph) - { - os << "Sparse COO (" << coo_graph.row_dim << " rows, " << coo_graph.col_dim << " cols, " << coo_graph.coo_tuples.size() << " nonzeros):\n"; - os << "Ordinal, Row, Col, Val\n"; - for (int i = 0; i < coo_graph.coo_tuples.size(); i++) - { - os << i << ',' << coo_graph.coo_tuples[i].row << ',' << coo_graph.coo_tuples[i].col << ',' << coo_graph.coo_tuples[i].val << "\n"; - } - return os; - } - - /** - * Update graph dims based upon COO tuples - */ - void UpdateDims() - { - row_dim = -1; - col_dim = -1; - - for (int i = 0; i < coo_tuples.size(); i++) - { - row_dim = CUB_MAX(row_dim, coo_tuples[i].row); - col_dim = CUB_MAX(col_dim, coo_tuples[i].col); - } - - row_dim++; - col_dim++; - } - - /** - * Builds a METIS COO sparse from the given file. - */ - int InitMetis(const string &metis_filename) - { - coo_tuples.clear(); - - // Read from file - FILE *f_in = fopen(metis_filename.c_str(), "r"); - if (!f_in) return -1; - - int edges_read = -1; - int edges = 0; - - char line[1024]; - - while(true) - { - if (fscanf(f_in, "%[^\n]\n", line) <= 0) - { - break; - } - if (line[0] == '%') - { - // Comment - } - else if (edges_read == -1) - { - // Problem description - long long ll_nodes_x, ll_nodes_y, ll_edges; - if (sscanf(line, "%lld %lld %lld", &ll_nodes_x, &ll_nodes_y, &ll_edges) != 3) - { - fprintf(stderr, "Error parsing MARKET graph: invalid problem description\n"); - return -1; - } - - if (ll_nodes_x != ll_nodes_y) - { - fprintf(stderr, "Error parsing MARKET graph: not square (%lld, %lld)\n", ll_nodes_x, ll_nodes_y); - return -1; - } - - edges = ll_edges; - - printf(" (%lld nodes, %lld directed edges)... ", - (unsigned long long) ll_nodes_x, - (unsigned long long) ll_edges); - fflush(stdout); - - // Allocate coo graph - coo_tuples.reserve(edges); - edges_read++; - } - else - { - if (edges_read >= edges) - { - fprintf(stderr, "Error parsing MARKET graph: encountered more than %d edges\n", edges); - fclose(f_in); - return -1; - } - - long long ll_row, ll_col; - float val; - if (sscanf(line, "%lld %lld %f", &ll_col, &ll_row, &val) != 3) - { - fprintf(stderr, "Error parsing MARKET graph: badly formed edge\n", edges); - fclose(f_in); - return -1; - } - - ll_row -= 1; - ll_col -= 1; - - coo_tuples.push_back(CooTuple(ll_row, ll_col, val)); // zero-based array - edges_read++; - } - } - - if (edges_read != edges) - { - fprintf(stderr, "Error parsing MARKET graph: only %d/%d edges read\n", edges_read, edges); - fclose(f_in); - return -1; - } - - // Sort by rows, then columns, update dims - std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); - UpdateDims(); - - fclose(f_in); - - return 0; - } - - - /** - * Builds a wheel COO sparse graph having spokes spokes. - */ - int InitWheel(VertexId spokes) - { - VertexId edges = spokes + (spokes - 1); - - coo_tuples.clear(); - coo_tuples.reserve(edges); - - // Add spoke edges - for (VertexId i = 0; i < spokes; i++) - { - coo_tuples.push_back(CooTuple(0, i + 1)); - } - - // Add rim - for (VertexId i = 0; i < spokes; i++) - { - VertexId dest = (i + 1) % spokes; - coo_tuples.push_back(CooTuple(i + 1, dest + 1)); - } - - // Sort by rows, then columns, update dims - std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); - UpdateDims(); - - // Assign arbitrary values to graph vertices - for (int i = 0; i < coo_tuples.size(); i++) - { - coo_tuples[i].val = i % 21; - } - - return 0; - } - - - /** - * Builds a square 3D grid COO sparse graph. Interior nodes have degree 7 (including - * a self-loop). Values are unintialized, coo_tuples are sorted. - */ - int InitGrid3d(VertexId width) - { - VertexId interior_nodes = (width - 2) * (width - 2) * (width - 2); - VertexId face_nodes = (width - 2) * (width - 2) * 6; - VertexId edge_nodes = (width - 2) * 12; - VertexId corner_nodes = 8; - VertexId nodes = width * width * width; - VertexId edges = (interior_nodes * 6) + (face_nodes * 5) + (edge_nodes * 4) + (corner_nodes * 3) + nodes; - - coo_tuples.clear(); - coo_tuples.reserve(edges); - - for (VertexId i = 0; i < width; i++) { - for (VertexId j = 0; j < width; j++) { - for (VertexId k = 0; k < width; k++) { - - VertexId me = (i * width * width) + (j * width) + k; - - VertexId neighbor = (i * width * width) + (j * width) + (k - 1); - if (k - 1 >= 0) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = (i * width * width) + (j * width) + (k + 1); - if (k + 1 < width) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = (i * width * width) + ((j - 1) * width) + k; - if (j - 1 >= 0) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = (i * width * width) + ((j + 1) * width) + k; - if (j + 1 < width) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = ((i - 1) * width * width) + (j * width) + k; - if (i - 1 >= 0) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = ((i + 1) * width * width) + (j * width) + k; - if (i + 1 < width) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = me; - coo_tuples.push_back(CooTuple(me, neighbor)); - } - } - } - - // Sort by rows, then columns, update dims - std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); - UpdateDims(); - - // Assign arbitrary values to graph vertices - for (int i = 0; i < coo_tuples.size(); i++) - { - coo_tuples[i].val = i % 21; - } - - return 0; - } - - - /** - * Builds a square 2D grid CSR graph. Interior nodes have degree 5 (including - * a self-loop) - * - * Returns 0 on success, 1 on failure. - */ - int InitGrid2d(VertexId width, bool self) - { - VertexId interior_nodes = (width - 2) * (width - 2); - VertexId edge_nodes = (width - 2) * 4; - VertexId corner_nodes = 4; - VertexId edges = (interior_nodes * 4) + (edge_nodes * 3) + (corner_nodes * 2); - - if (self) edges += edges; - - coo_tuples.clear(); - coo_tuples.reserve(edges); - - for (VertexId j = 0; j < width; j++) { - for (VertexId k = 0; k < width; k++) { - - VertexId me = (j * width) + k; - - VertexId neighbor = (j * width) + (k - 1); - if (k - 1 >= 0) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = (j * width) + (k + 1); - if (k + 1 < width) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = ((j - 1) * width) + k; - if (j - 1 >= 0) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = ((j + 1) * width) + k; - if (j + 1 < width) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - if (self) - { - neighbor = me; - coo_tuples.push_back(CooTuple(me, neighbor)); - } - } - } - - // Sort by rows, then columns, update dims - std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); - UpdateDims(); - - // Assign arbitrary values to graph vertices - for (int i = 0; i < coo_tuples.size(); i++) - { - coo_tuples[i].val = i % 21; - } - - return 0; - } -}; - - - - -//--------------------------------------------------------------------- -// GPU types and device functions -//--------------------------------------------------------------------- - - -/// Pairing of dot product partial sums and corresponding row-id -template -struct PartialSum -{ - Value partial; /// PartialSum sum - VertexId row; /// Row-id - - /// Tags indicating this structure provides overloaded ThreadLoad and ThreadStore operations - typedef void ThreadLoadTag; - typedef void ThreadStoreTag; - - /// ThreadLoad (simply defer to loading individual items) - template - __device__ __forceinline__ void ThreadLoad(PartialSum *ptr) - { - partial = cub::ThreadLoad(&(ptr->partial)); - row = cub::ThreadLoad(&(ptr->row)); - } - - /// ThreadStore (simply defer to storing individual items) - template - __device__ __forceinline__ void ThreadStore(PartialSum *ptr) const - { - // Always write partial first - cub::ThreadStore(&(ptr->partial), partial); - cub::ThreadStore(&(ptr->row), row); - } - -}; - - -/// Templated Texture reference type for multiplicand vector -template -struct TexVector -{ - // Texture reference type - typedef texture TexRef; - - static TexRef ref; - - /** - * Bind textures - */ - static void BindTexture(void *d_in, int elements) - { - cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); - if (d_in) - { - size_t offset; - size_t bytes = sizeof(Value) * elements; - CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes)); - } - } - - /** - * Unbind textures - */ - static void UnbindTexture() - { - CubDebugExit(cudaUnbindTexture(ref)); - } -}; - -// Texture reference definitions -template -typename TexVector::TexRef TexVector::ref = 0; - - -/// Reduce-by-row scan operator -struct ReduceByKeyOp -{ - template - __device__ __forceinline__ PartialSum operator()( - const PartialSum &first, - const PartialSum &second) - { - PartialSum retval; - - retval.partial = (second.row != first.row) ? - second.partial : - first.partial + second.partial; - - retval.row = second.row; - return retval; - } -}; - - -/// Min-row reduction operator -struct MinRowOp -{ - template - __device__ __forceinline__ PartialSum operator()( - const PartialSum &first, - const PartialSum &second) - { - return (first.row < second.row) ? - first : - second; - } -}; - - -// Callback functor for waiting on the previous threadblock to compute its partial sum (the prefix for this threadblock) -template -struct BlockPrefixOp -{ - PartialSum prefix; - - /** - * Threadblock-wide prefix callback functor called by thread-0 in BlockScan::ExclusiveScan(). - * Returns the threadblock-wide prefix to apply to all scan inputs. - */ - __device__ __forceinline__ PartialSum operator()( - const PartialSum &block_aggregate) ///< The aggregate sum of the local prefix sum inputs - { - ReduceByKeyOp scan_op; - - PartialSum retval = prefix; - prefix = scan_op(prefix, block_aggregate); - return retval; - } -}; - - -/// Functor for detecting row discontinuities. -struct NewRowOp -{ - /// Returns true if row_b is the start of a new row - template - __device__ __forceinline__ bool operator()( - const VertexId& row_a, - const VertexId& row_b) - { - return (row_a != row_b); - } -}; - - -/** + */ + struct CooTuple + { + VertexId row; + VertexId col; + Value val; + + CooTuple() {} + CooTuple(VertexId row, VertexId col) : row(row), col(col) {} + CooTuple(VertexId row, VertexId col, Value val) : row(row), col(col), val(val) {} + }; + + /** + * Comparator for sorting COO sparse format edges + */ + static bool CooTupleCompare (const CooTuple &elem1, const CooTuple &elem2) + { + if (elem1.row < elem2.row) + { + return true; + } + else if ((elem1.row == elem2.row) && (elem1.col < elem2.col)) + { + return true; + } + + return false; + } + + + /** + * Fields + */ + int row_dim; // Num rows + int col_dim; // Num cols + vector coo_tuples; // Non-zero entries + + + /** + * CooGraph ostream operator + */ + friend std::ostream& operator<<(std::ostream& os, const CooGraph& coo_graph) + { + os << "Sparse COO (" << coo_graph.row_dim << " rows, " << coo_graph.col_dim << " cols, " << coo_graph.coo_tuples.size() << " nonzeros):\n"; + os << "Ordinal, Row, Col, Val\n"; + for (int i = 0; i < coo_graph.coo_tuples.size(); i++) + { + os << i << ',' << coo_graph.coo_tuples[i].row << ',' << coo_graph.coo_tuples[i].col << ',' << coo_graph.coo_tuples[i].val << "\n"; + } + return os; + } + + /** + * Update graph dims based upon COO tuples + */ + void UpdateDims() + { + row_dim = -1; + col_dim = -1; + + for (int i = 0; i < coo_tuples.size(); i++) + { + row_dim = CUB_MAX(row_dim, coo_tuples[i].row); + col_dim = CUB_MAX(col_dim, coo_tuples[i].col); + } + + row_dim++; + col_dim++; + } + + /** + * Builds a METIS COO sparse from the given file. + */ + int InitMetis(const string &metis_filename) + { + coo_tuples.clear(); + + // Read from file + FILE *f_in = fopen(metis_filename.c_str(), "r"); + if (!f_in) return -1; + + int edges_read = -1; + int edges = 0; + + char line[1024]; + + while(true) + { + if (fscanf(f_in, "%[^\n]\n", line) <= 0) + { + break; + } + if (line[0] == '%') + { + // Comment + } + else if (edges_read == -1) + { + // Problem description + long long ll_nodes_x, ll_nodes_y, ll_edges; + if (sscanf(line, "%lld %lld %lld", &ll_nodes_x, &ll_nodes_y, &ll_edges) != 3) + { + fprintf(stderr, "Error parsing MARKET graph: invalid problem description\n"); + return -1; + } + + if (ll_nodes_x != ll_nodes_y) + { + fprintf(stderr, "Error parsing MARKET graph: not square (%lld, %lld)\n", ll_nodes_x, ll_nodes_y); + return -1; + } + + edges = ll_edges; + + printf(" (%lld nodes, %lld directed edges)... ", + (unsigned long long) ll_nodes_x, + (unsigned long long) ll_edges); + fflush(stdout); + + // Allocate coo graph + coo_tuples.reserve(edges); + edges_read++; + } + else + { + if (edges_read >= edges) + { + fprintf(stderr, "Error parsing MARKET graph: encountered more than %d edges\n", edges); + fclose(f_in); + return -1; + } + + long long ll_row, ll_col; + float val; + if (sscanf(line, "%lld %lld %f", &ll_col, &ll_row, &val) != 3) + { + fprintf(stderr, "Error parsing MARKET graph: badly formed edge\n", edges); + fclose(f_in); + return -1; + } + + ll_row -= 1; + ll_col -= 1; + + coo_tuples.push_back(CooTuple(ll_row, ll_col, val)); // zero-based array + edges_read++; + } + } + + if (edges_read != edges) + { + fprintf(stderr, "Error parsing MARKET graph: only %d/%d edges read\n", edges_read, edges); + fclose(f_in); + return -1; + } + + // Sort by rows, then columns, update dims + std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); + UpdateDims(); + + fclose(f_in); + + return 0; + } + + + /** + * Builds a wheel COO sparse graph having spokes spokes. + */ + int InitWheel(VertexId spokes) + { + VertexId edges = spokes + (spokes - 1); + + coo_tuples.clear(); + coo_tuples.reserve(edges); + + // Add spoke edges + for (VertexId i = 0; i < spokes; i++) + { + coo_tuples.push_back(CooTuple(0, i + 1)); + } + + // Add rim + for (VertexId i = 0; i < spokes; i++) + { + VertexId dest = (i + 1) % spokes; + coo_tuples.push_back(CooTuple(i + 1, dest + 1)); + } + + // Sort by rows, then columns, update dims + std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); + UpdateDims(); + + // Assign arbitrary values to graph vertices + for (int i = 0; i < coo_tuples.size(); i++) + { + coo_tuples[i].val = i % 21; + } + + return 0; + } + + + /** + * Builds a square 3D grid COO sparse graph. Interior nodes have degree 7 (including + * a self-loop). Values are unintialized, coo_tuples are sorted. + */ + int InitGrid3d(VertexId width) + { + VertexId interior_nodes = (width - 2) * (width - 2) * (width - 2); + VertexId face_nodes = (width - 2) * (width - 2) * 6; + VertexId edge_nodes = (width - 2) * 12; + VertexId corner_nodes = 8; + VertexId nodes = width * width * width; + VertexId edges = (interior_nodes * 6) + (face_nodes * 5) + (edge_nodes * 4) + (corner_nodes * 3) + nodes; + + coo_tuples.clear(); + coo_tuples.reserve(edges); + + for (VertexId i = 0; i < width; i++) { + for (VertexId j = 0; j < width; j++) { + for (VertexId k = 0; k < width; k++) { + + VertexId me = (i * width * width) + (j * width) + k; + + VertexId neighbor = (i * width * width) + (j * width) + (k - 1); + if (k - 1 >= 0) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = (i * width * width) + (j * width) + (k + 1); + if (k + 1 < width) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = (i * width * width) + ((j - 1) * width) + k; + if (j - 1 >= 0) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = (i * width * width) + ((j + 1) * width) + k; + if (j + 1 < width) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = ((i - 1) * width * width) + (j * width) + k; + if (i - 1 >= 0) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = ((i + 1) * width * width) + (j * width) + k; + if (i + 1 < width) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = me; + coo_tuples.push_back(CooTuple(me, neighbor)); + } + } + } + + // Sort by rows, then columns, update dims + std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); + UpdateDims(); + + // Assign arbitrary values to graph vertices + for (int i = 0; i < coo_tuples.size(); i++) + { + coo_tuples[i].val = i % 21; + } + + return 0; + } + + + /** + * Builds a square 2D grid CSR graph. Interior nodes have degree 5 (including + * a self-loop) + * + * Returns 0 on success, 1 on failure. + */ + int InitGrid2d(VertexId width, bool self) + { + VertexId interior_nodes = (width - 2) * (width - 2); + VertexId edge_nodes = (width - 2) * 4; + VertexId corner_nodes = 4; + VertexId edges = (interior_nodes * 4) + (edge_nodes * 3) + (corner_nodes * 2); + + if (self) edges += edges; + + coo_tuples.clear(); + coo_tuples.reserve(edges); + + for (VertexId j = 0; j < width; j++) { + for (VertexId k = 0; k < width; k++) { + + VertexId me = (j * width) + k; + + VertexId neighbor = (j * width) + (k - 1); + if (k - 1 >= 0) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = (j * width) + (k + 1); + if (k + 1 < width) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = ((j - 1) * width) + k; + if (j - 1 >= 0) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = ((j + 1) * width) + k; + if (j + 1 < width) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + if (self) + { + neighbor = me; + coo_tuples.push_back(CooTuple(me, neighbor)); + } + } + } + + // Sort by rows, then columns, update dims + std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); + UpdateDims(); + + // Assign arbitrary values to graph vertices + for (int i = 0; i < coo_tuples.size(); i++) + { + coo_tuples[i].val = i % 21; + } + + return 0; + } +}; + + + + +//--------------------------------------------------------------------- +// GPU types and device functions +//--------------------------------------------------------------------- + + +/// Pairing of dot product partial sums and corresponding row-id +template +struct PartialSum +{ + Value partial; /// PartialSum sum + VertexId row; /// Row-id + + /// Tags indicating this structure provides overloaded ThreadLoad and ThreadStore operations + typedef void ThreadLoadTag; + typedef void ThreadStoreTag; + + /// ThreadLoad (simply defer to loading individual items) + template + __device__ __forceinline__ void ThreadLoad(PartialSum *ptr) + { + partial = cub::ThreadLoad(&(ptr->partial)); + row = cub::ThreadLoad(&(ptr->row)); + } + + /// ThreadStore (simply defer to storing individual items) + template + __device__ __forceinline__ void ThreadStore(PartialSum *ptr) const + { + // Always write partial first + cub::ThreadStore(&(ptr->partial), partial); + cub::ThreadStore(&(ptr->row), row); + } + +}; + + +/// Templated Texture reference type for multiplicand vector +template +struct TexVector +{ + // Texture reference type + typedef texture TexRef; + + static TexRef ref; + + /** + * Bind textures + */ + static void BindTexture(void *d_in, int elements) + { + cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); + if (d_in) + { + size_t offset; + size_t bytes = sizeof(Value) * elements; + CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes)); + } + } + + /** + * Unbind textures + */ + static void UnbindTexture() + { + CubDebugExit(cudaUnbindTexture(ref)); + } +}; + +// Texture reference definitions +template +typename TexVector::TexRef TexVector::ref = 0; + + +/// Reduce-by-row scan operator +struct ReduceByKeyOp +{ + template + __device__ __forceinline__ PartialSum operator()( + const PartialSum &first, + const PartialSum &second) + { + PartialSum retval; + + retval.partial = (second.row != first.row) ? + second.partial : + first.partial + second.partial; + + retval.row = second.row; + return retval; + } +}; + + +/// Min-row reduction operator +struct MinRowOp +{ + template + __device__ __forceinline__ PartialSum operator()( + const PartialSum &first, + const PartialSum &second) + { + return (first.row < second.row) ? + first : + second; + } +}; + + +// Callback functor for waiting on the previous threadblock to compute its partial sum (the prefix for this threadblock) +template +struct BlockPrefixOp +{ + PartialSum prefix; + + /** + * Threadblock-wide prefix callback functor called by thread-0 in BlockScan::ExclusiveScan(). + * Returns the threadblock-wide prefix to apply to all scan inputs. + */ + __device__ __forceinline__ PartialSum operator()( + const PartialSum &block_aggregate) ///< The aggregate sum of the local prefix sum inputs + { + ReduceByKeyOp scan_op; + + PartialSum retval = prefix; + prefix = scan_op(prefix, block_aggregate); + return retval; + } +}; + + +/// Functor for detecting row discontinuities. +struct NewRowOp +{ + /// Returns true if row_b is the start of a new row + template + __device__ __forceinline__ bool operator()( + const VertexId& row_a, + const VertexId& row_b) + { + return (row_a != row_b); + } +}; + + +/** * Threadblock abstraction for processing sparse SPMV tiles - */ -template < - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - typename VertexId, - typename Value> -struct SpmvBlock -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Constants - enum - { - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - // Head flag type - typedef int HeadFlag; - - // Dot product partial sum type - typedef PartialSum PartialSum; - - // Parameterized CUB types for the parallel execution context - typedef BlockScan BlockScan; - typedef BlockExchange BlockExchangeRows; - typedef BlockExchange BlockExchangeValues; - typedef BlockDiscontinuity BlockDiscontinuity; - typedef BlockReduce BlockReduce; - - // Shared memory type for this threadblock - struct SmemStorage - { - union - { - typename BlockExchangeRows::SmemStorage exchange_rows; // Smem needed for striped->blocked transpose - typename BlockExchangeValues::SmemStorage exchange_values; // Smem needed for striped->blocked transpose - struct { - typename BlockScan::SmemStorage scan; // Smem needed for reduce-value-by-row scan - typename BlockDiscontinuity::SmemStorage discontinuity; // Smem needed for head-flagging - }; - typename BlockReduce::SmemStorage reduce; // Smem needed for min-finding reduction - }; - - VertexId last_block_row; - VertexId prev_tile_row; - PartialSum identity; - PartialSum first_scatter; - }; - - //--------------------------------------------------------------------- - // Operations - //--------------------------------------------------------------------- - - /** + */ +template < + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + typename VertexId, + typename Value> +struct SpmvBlock +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Constants + enum + { + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Head flag type + typedef int HeadFlag; + + // Dot product partial sum type + typedef PartialSum PartialSum; + + // Parameterized CUB types for the parallel execution context + typedef BlockScan BlockScan; + typedef BlockExchange BlockExchangeRows; + typedef BlockExchange BlockExchangeValues; + typedef BlockDiscontinuity BlockDiscontinuity; + typedef BlockReduce BlockReduce; + + // Shared memory type for this threadblock + struct SmemStorage + { + union + { + typename BlockExchangeRows::SmemStorage exchange_rows; // Smem needed for striped->blocked transpose + typename BlockExchangeValues::SmemStorage exchange_values; // Smem needed for striped->blocked transpose + struct { + typename BlockScan::SmemStorage scan; // Smem needed for reduce-value-by-row scan + typename BlockDiscontinuity::SmemStorage discontinuity; // Smem needed for head-flagging + }; + typename BlockReduce::SmemStorage reduce; // Smem needed for min-finding reduction + }; + + VertexId last_block_row; + VertexId prev_tile_row; + PartialSum identity; + PartialSum first_scatter; + }; + + //--------------------------------------------------------------------- + // Operations + //--------------------------------------------------------------------- + + /** * Processes a COO input tile of edges, outputting dot products for each row - */ - __device__ __forceinline__ - static void ProcessTile( - SmemStorage &smem_storage, - BlockPrefixOp &carry, - VertexId* d_rows, - VertexId* d_columns, - Value* d_values, - Value* d_vector, - Value* d_result, - int block_offset, - int guarded_items = 0) - { - VertexId columns[ITEMS_PER_THREAD]; - VertexId rows[ITEMS_PER_THREAD]; - Value values[ITEMS_PER_THREAD]; - PartialSum partial_sums[ITEMS_PER_THREAD]; - HeadFlag head_flags[ITEMS_PER_THREAD]; - - // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values) - if (guarded_items) - { - // This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last - // vertex for out-of-bound items, but zero-valued - BlockLoadDirectStriped(d_columns + block_offset, guarded_items, VertexId(0), columns, BLOCK_THREADS); - BlockLoadDirectStriped(d_values + block_offset, guarded_items, Value(0), values, BLOCK_THREADS); - } - else - { - // Unguarded loads - BlockLoadDirectStriped(d_columns + block_offset, columns, BLOCK_THREADS); - BlockLoadDirectStriped(d_values + block_offset, values, BLOCK_THREADS); - } - - // Fence to prevent hoisting any dependent code below into the loads above - __syncthreads(); - - // Load the referenced values from x and compute the dot product partials sums - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - values[ITEM] *= tex1Dfetch(TexVector::ref, columns[ITEM]); - } - - // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values) - if (guarded_items) - { - // This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last - // vertex for out-of-bound items, but zero-valued - BlockLoadDirectStriped(d_rows + block_offset, guarded_items, smem_storage.last_block_row, rows, BLOCK_THREADS); - } - else - { - // Unguarded loads - BlockLoadDirectStriped(d_rows + block_offset, rows, BLOCK_THREADS); - } - - // Transpose from threadblock-striped to threadblock-blocked arrangement - BlockExchangeValues::StripedToBlocked(smem_storage.exchange_values, values); - - // Barrier for smem reuse and coherence - __syncthreads(); - - // Transpose from threadblock-striped to threadblock-blocked arrangement - BlockExchangeRows::StripedToBlocked(smem_storage.exchange_rows, rows); - - // Barrier for smem reuse and coherence - __syncthreads(); - - // Flag row heads by looking for discontinuities - BlockDiscontinuity::Flag( - smem_storage.discontinuity, - rows, // Original row ids - smem_storage.prev_tile_row, // Last row id from previous threadblock - NewRowOp(), // Functor for detecting start of new rows - head_flags); // (Out) Head flags - - // Assemble - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - partial_sums[ITEM].partial = values[ITEM]; - partial_sums[ITEM].row = rows[ITEM]; - } - - - // Compute the exclusive scan of partial_sums - PartialSum block_aggregate; // Threadblock-wide aggregate in thread0 (unused) - BlockScan::ExclusiveScan( - smem_storage.scan, - partial_sums, - partial_sums, // (Out) - smem_storage.identity, - ReduceByKeyOp(), - block_aggregate, // (Out) - carry); // (In-out) - - // Store the last row in the tile (for computing head flags in the next tile) - if (threadIdx.x == BLOCK_THREADS - 1) - { - smem_storage.prev_tile_row = rows[ITEMS_PER_THREAD - 1]; - } - - // Pull the last row ID out of smem - VertexId last_block_row = smem_storage.last_block_row; - - // Scatter an accumulated dot product if it is the head of a valid row - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (head_flags[ITEM]) - { - // Scatter - d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial; - } - else - { - // Otherwise reset the row ID to the last row in the threadblock's range - partial_sums[ITEM].row = last_block_row; - } - } - - // Find the first-scattered dot product if not yet set - if (smem_storage.first_scatter.row == last_block_row) - { - // Barrier for smem reuse and coherence - __syncthreads(); - - PartialSum candidate = BlockReduce::Reduce( - smem_storage.reduce, - partial_sums, - MinRowOp()); - - // Stash the first-scattered dot product in smem - if (threadIdx.x == 0) - { - smem_storage.first_scatter = candidate; - } - } - - } -}; - - -/** - * COO SpMV kernel - */ -template < - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - int BLOCK_OCCUPANCY, - typename VertexId, - typename Value> -__launch_bounds__ (BLOCK_THREADS, BLOCK_OCCUPANCY) -__global__ void CooKernel( - GridEvenShare block_progress, - PartialSum *d_block_partials, - VertexId *d_rows, - VertexId *d_columns, - Value *d_values, - Value *d_vector, - Value *d_result) -{ - // Constants - enum - { - TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD - }; - - // SpMV threadblock tile-processing abstraction - typedef SpmvBlock SpmvBlock; - - // Shared memory - __shared__ typename SpmvBlock::SmemStorage smem_storage; - - // Initialize threadblock even-share to tell us where to start and stop our tile-processing - block_progress.Init(); - - // Stateful prefix carryover from one tile to the next - BlockPrefixOp > carry; - - // Initialize scalar shared memory values - if (threadIdx.x == 0) - { - VertexId first_block_row = d_rows[block_progress.block_offset]; - VertexId last_block_row = d_rows[block_progress.block_oob - 1]; - - // Initialize carry to identity - carry.prefix.row = first_block_row; - carry.prefix.partial = Value(0); - smem_storage.identity = carry.prefix; - - smem_storage.first_scatter.row = last_block_row; - smem_storage.first_scatter.partial = Value(0); - smem_storage.last_block_row = last_block_row; - smem_storage.prev_tile_row = (blockIdx.x == 0) ? - first_block_row : - d_rows[block_progress.block_offset - 1]; - } - - // Process full tiles - while (block_progress.block_offset <= block_progress.block_oob - TILE_SIZE) - { - SpmvBlock::ProcessTile( - smem_storage, - carry, - d_rows, - d_columns, - d_values, - d_vector, - d_result, - block_progress.block_offset); - - block_progress.block_offset += TILE_SIZE; - } - - // Process final partial tile (if present) - int guarded_items = block_progress.block_oob - block_progress.block_offset; - if (guarded_items) - { - SpmvBlock::ProcessTile( - smem_storage, - carry, - d_rows, - d_columns, - d_values, - d_vector, - d_result, - block_progress.block_offset, - guarded_items); - } - - if (threadIdx.x == 0) - { - if (gridDim.x == 1) - { - // Scatter the final aggregate (this kernel contains only 1 threadblock) - d_result[carry.prefix.row] = carry.prefix.partial; - } - else - { - // Write out threadblock first-item and carry aggregate - - // Unweight the first-output if it's the same row as the carry aggregate - PartialSum first_scatter = smem_storage.first_scatter; - if (first_scatter.row == carry.prefix.row) first_scatter.partial = Value(0); - - d_block_partials[blockIdx.x * 2] = first_scatter; - d_block_partials[(blockIdx.x * 2) + 1] = carry.prefix; - } - } -} - - -/** - * Threadblock abstraction for processing sparse SPMV tiles - */ -template < - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - typename VertexId, - typename Value> -struct FinalizeSpmvBlock -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Constants - enum - { - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - // Head flag type - typedef int HeadFlag; - - // Dot product partial sum type - typedef PartialSum PartialSum; - - // Parameterized CUB types for the parallel execution context - typedef BlockScan BlockScan; - typedef BlockDiscontinuity BlockDiscontinuity; - - // Shared memory type for this threadblock - struct SmemStorage - { - union - { - typename BlockScan::SmemStorage scan; // Smem needed for reduce-value-by-row scan - typename BlockDiscontinuity::SmemStorage discontinuity; // Smem needed for head-flagging - }; - - VertexId last_block_row; - VertexId prev_tile_row; - PartialSum identity; - }; - - //--------------------------------------------------------------------- - // Operations - //--------------------------------------------------------------------- - - /** - * Processes a COO input tile of edges, outputting dot products for each row - */ - __device__ __forceinline__ - static void ProcessTile( - SmemStorage &smem_storage, - BlockPrefixOp &carry, - PartialSum *d_block_partials, - Value *d_result, - int block_offset, - int guarded_items = 0) - { - VertexId rows[ITEMS_PER_THREAD]; - PartialSum partial_sums[ITEMS_PER_THREAD]; - HeadFlag head_flags[ITEMS_PER_THREAD]; - - // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values) - if (guarded_items) - { - // This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last - // vertex for out-of-bound items, but zero-valued - PartialSum default_sum; - default_sum.row = smem_storage.last_block_row; - default_sum.partial = Value(0); - - BlockLoadDirect(d_block_partials + block_offset, guarded_items, default_sum, partial_sums); - } - else - { - // Unguarded loads - BlockLoadDirect(d_block_partials + block_offset, partial_sums); - } - - // Fence to prevent hoisting any dependent code below into the loads above - __syncthreads(); - - // Copy out row IDs for row-head flagging - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - rows[ITEM] = partial_sums[ITEM].row; - } - - // Flag row heads by looking for discontinuities - BlockDiscontinuity::Flag( - smem_storage.discontinuity, - rows, // Original row ids - smem_storage.prev_tile_row, // Last row id from previous threadblock - NewRowOp(), // Functor for detecting start of new rows - head_flags); // (Out) Head flags - - // Barrier for smem reuse and coherence - __syncthreads(); - - // Store the last row in the tile (for computing head flags in the next tile) - if (threadIdx.x == BLOCK_THREADS - 1) - { - smem_storage.prev_tile_row = rows[ITEMS_PER_THREAD - 1]; - } - - // Compute the exclusive scan of partial_sums - PartialSum block_aggregate; // Threadblock-wide aggregate in thread0 (unused) - BlockScan::ExclusiveScan( - smem_storage.scan, - partial_sums, - partial_sums, // (Out) - smem_storage.identity, - ReduceByKeyOp(), - block_aggregate, // (Out) - carry); // (In-out) - - // Scatter an accumulated dot product if it is the head of a valid row - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (head_flags[ITEM]) - { - // Scatter - d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial; - } - } - - } -}; - - -/** - * COO Finalize kernel. - */ -template < - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - typename VertexId, - typename Value> -__launch_bounds__ (BLOCK_THREADS, 1) -__global__ void CooFinalizeKernel( - PartialSum *d_block_partials, - int finalize_partials, - Value *d_result) -{ - // Constants - enum - { - TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD - }; - - // SpMV threadblock tile-processing abstraction - typedef FinalizeSpmvBlock FinalizeSpmvBlock; - - // Shared memory - __shared__ typename FinalizeSpmvBlock::SmemStorage smem_storage; - - // Stateful prefix carryover from one tile to the next - BlockPrefixOp > carry; - - // Initialize scalar shared memory values - if (threadIdx.x == 0) - { - VertexId first_block_row = d_block_partials[0].row; - VertexId last_block_row = d_block_partials[finalize_partials - 1].row; - - // Initialize carry to identity - carry.prefix.row = first_block_row; - carry.prefix.partial = Value(0); - smem_storage.identity = carry.prefix; - - smem_storage.last_block_row = last_block_row; - smem_storage.prev_tile_row = first_block_row; - } - - // Barrier for smem coherence - __syncthreads(); - - // Process full tiles - int block_offset = 0; - while (block_offset <= finalize_partials - TILE_SIZE) - { - FinalizeSpmvBlock::ProcessTile( - smem_storage, - carry, - d_block_partials, - d_result, - block_offset); - - // Barrier for smem reuse and coherence - __syncthreads(); - - block_offset += TILE_SIZE; - } - - // Process final partial tile (if present) - int guarded_items = finalize_partials - block_offset; - if (guarded_items) - { - FinalizeSpmvBlock::ProcessTile( - smem_storage, - carry, - d_block_partials, - d_result, - block_offset, - guarded_items); - } - - // Scatter the final aggregate (this kernel contains only 1 threadblock) - if (threadIdx.x == 0) - { - d_result[carry.prefix.row] = carry.prefix.partial; - } -} - - - - - - -//--------------------------------------------------------------------- -// Host subroutines -//--------------------------------------------------------------------- - -/** - * Compares the equivalence of two arrays - */ -template -int CompareResults(float* computed, float* reference, SizeT len) -{ - int retval = 0; - for (SizeT i = 0; i < len; i++) - { - if (computed[i] > 0.0) - { - if ((computed[i] * 1.001 < reference[i]) || - (computed[i] * 0.999 > reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - else - { - if ((computed[i] * 1.001 > reference[i]) || - (computed[i] * 0.999 < reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - } - - if (!retval) printf("CORRECT"); - return retval; -} - -/** - * Compares the equivalence of two arrays - */ -template -int CompareResults(double* computed, double* reference, SizeT len) -{ - int retval = 0; - for (SizeT i = 0; i < len; i++) - { - if (computed[i] > 0.0) - { - if ((computed[i] * 1.001 < reference[i]) || - (computed[i] * 0.999 > reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - else - { - if ((computed[i] * 1.001 > reference[i]) || - (computed[i] * 0.999 < reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - } - - if (!retval) printf("CORRECT"); - return retval; -} - - -/** + */ + __device__ __forceinline__ + static void ProcessTile( + SmemStorage &smem_storage, + BlockPrefixOp &carry, + VertexId* d_rows, + VertexId* d_columns, + Value* d_values, + Value* d_vector, + Value* d_result, + int block_offset, + int guarded_items = 0) + { + VertexId columns[ITEMS_PER_THREAD]; + VertexId rows[ITEMS_PER_THREAD]; + Value values[ITEMS_PER_THREAD]; + PartialSum partial_sums[ITEMS_PER_THREAD]; + HeadFlag head_flags[ITEMS_PER_THREAD]; + + // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values) + if (guarded_items) + { + // This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last + // vertex for out-of-bound items, but zero-valued + BlockLoadDirectStriped(d_columns + block_offset, guarded_items, VertexId(0), columns, BLOCK_THREADS); + BlockLoadDirectStriped(d_values + block_offset, guarded_items, Value(0), values, BLOCK_THREADS); + } + else + { + // Unguarded loads + BlockLoadDirectStriped(d_columns + block_offset, columns, BLOCK_THREADS); + BlockLoadDirectStriped(d_values + block_offset, values, BLOCK_THREADS); + } + + // Fence to prevent hoisting any dependent code below into the loads above + __syncthreads(); + + // Load the referenced values from x and compute the dot product partials sums + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + values[ITEM] *= tex1Dfetch(TexVector::ref, columns[ITEM]); + } + + // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values) + if (guarded_items) + { + // This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last + // vertex for out-of-bound items, but zero-valued + BlockLoadDirectStriped(d_rows + block_offset, guarded_items, smem_storage.last_block_row, rows, BLOCK_THREADS); + } + else + { + // Unguarded loads + BlockLoadDirectStriped(d_rows + block_offset, rows, BLOCK_THREADS); + } + + // Transpose from threadblock-striped to threadblock-blocked arrangement + BlockExchangeValues::StripedToBlocked(smem_storage.exchange_values, values); + + // Barrier for smem reuse and coherence + __syncthreads(); + + // Transpose from threadblock-striped to threadblock-blocked arrangement + BlockExchangeRows::StripedToBlocked(smem_storage.exchange_rows, rows); + + // Barrier for smem reuse and coherence + __syncthreads(); + + // Flag row heads by looking for discontinuities + BlockDiscontinuity::Flag( + smem_storage.discontinuity, + rows, // Original row ids + smem_storage.prev_tile_row, // Last row id from previous threadblock + NewRowOp(), // Functor for detecting start of new rows + head_flags); // (Out) Head flags + + // Assemble + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + partial_sums[ITEM].partial = values[ITEM]; + partial_sums[ITEM].row = rows[ITEM]; + } + + + // Compute the exclusive scan of partial_sums + PartialSum block_aggregate; // Threadblock-wide aggregate in thread0 (unused) + BlockScan::ExclusiveScan( + smem_storage.scan, + partial_sums, + partial_sums, // (Out) + smem_storage.identity, + ReduceByKeyOp(), + block_aggregate, // (Out) + carry); // (In-out) + + // Store the last row in the tile (for computing head flags in the next tile) + if (threadIdx.x == BLOCK_THREADS - 1) + { + smem_storage.prev_tile_row = rows[ITEMS_PER_THREAD - 1]; + } + + // Pull the last row ID out of smem + VertexId last_block_row = smem_storage.last_block_row; + + // Scatter an accumulated dot product if it is the head of a valid row + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (head_flags[ITEM]) + { + // Scatter + d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial; + } + else + { + // Otherwise reset the row ID to the last row in the threadblock's range + partial_sums[ITEM].row = last_block_row; + } + } + + // Find the first-scattered dot product if not yet set + if (smem_storage.first_scatter.row == last_block_row) + { + // Barrier for smem reuse and coherence + __syncthreads(); + + PartialSum candidate = BlockReduce::Reduce( + smem_storage.reduce, + partial_sums, + MinRowOp()); + + // Stash the first-scattered dot product in smem + if (threadIdx.x == 0) + { + smem_storage.first_scatter = candidate; + } + } + + } +}; + + +/** + * COO SpMV kernel + */ +template < + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + int BLOCK_OCCUPANCY, + typename VertexId, + typename Value> +__launch_bounds__ (BLOCK_THREADS, BLOCK_OCCUPANCY) +__global__ void CooKernel( + GridEvenShare block_progress, + PartialSum *d_block_partials, + VertexId *d_rows, + VertexId *d_columns, + Value *d_values, + Value *d_vector, + Value *d_result) +{ + // Constants + enum + { + TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD + }; + + // SpMV threadblock tile-processing abstraction + typedef SpmvBlock SpmvBlock; + + // Shared memory + __shared__ typename SpmvBlock::SmemStorage smem_storage; + + // Initialize threadblock even-share to tell us where to start and stop our tile-processing + block_progress.Init(); + + // Stateful prefix carryover from one tile to the next + BlockPrefixOp > carry; + + // Initialize scalar shared memory values + if (threadIdx.x == 0) + { + VertexId first_block_row = d_rows[block_progress.block_offset]; + VertexId last_block_row = d_rows[block_progress.block_oob - 1]; + + // Initialize carry to identity + carry.prefix.row = first_block_row; + carry.prefix.partial = Value(0); + smem_storage.identity = carry.prefix; + + smem_storage.first_scatter.row = last_block_row; + smem_storage.first_scatter.partial = Value(0); + smem_storage.last_block_row = last_block_row; + smem_storage.prev_tile_row = (blockIdx.x == 0) ? + first_block_row : + d_rows[block_progress.block_offset - 1]; + } + + // Process full tiles + while (block_progress.block_offset <= block_progress.block_oob - TILE_SIZE) + { + SpmvBlock::ProcessTile( + smem_storage, + carry, + d_rows, + d_columns, + d_values, + d_vector, + d_result, + block_progress.block_offset); + + block_progress.block_offset += TILE_SIZE; + } + + // Process final partial tile (if present) + int guarded_items = block_progress.block_oob - block_progress.block_offset; + if (guarded_items) + { + SpmvBlock::ProcessTile( + smem_storage, + carry, + d_rows, + d_columns, + d_values, + d_vector, + d_result, + block_progress.block_offset, + guarded_items); + } + + if (threadIdx.x == 0) + { + if (gridDim.x == 1) + { + // Scatter the final aggregate (this kernel contains only 1 threadblock) + d_result[carry.prefix.row] = carry.prefix.partial; + } + else + { + // Write out threadblock first-item and carry aggregate + + // Unweight the first-output if it's the same row as the carry aggregate + PartialSum first_scatter = smem_storage.first_scatter; + if (first_scatter.row == carry.prefix.row) first_scatter.partial = Value(0); + + d_block_partials[blockIdx.x * 2] = first_scatter; + d_block_partials[(blockIdx.x * 2) + 1] = carry.prefix; + } + } +} + + +/** + * Threadblock abstraction for processing sparse SPMV tiles + */ +template < + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + typename VertexId, + typename Value> +struct FinalizeSpmvBlock +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Constants + enum + { + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Head flag type + typedef int HeadFlag; + + // Dot product partial sum type + typedef PartialSum PartialSum; + + // Parameterized CUB types for the parallel execution context + typedef BlockScan BlockScan; + typedef BlockDiscontinuity BlockDiscontinuity; + + // Shared memory type for this threadblock + struct SmemStorage + { + union + { + typename BlockScan::SmemStorage scan; // Smem needed for reduce-value-by-row scan + typename BlockDiscontinuity::SmemStorage discontinuity; // Smem needed for head-flagging + }; + + VertexId last_block_row; + VertexId prev_tile_row; + PartialSum identity; + }; + + //--------------------------------------------------------------------- + // Operations + //--------------------------------------------------------------------- + + /** + * Processes a COO input tile of edges, outputting dot products for each row + */ + __device__ __forceinline__ + static void ProcessTile( + SmemStorage &smem_storage, + BlockPrefixOp &carry, + PartialSum *d_block_partials, + Value *d_result, + int block_offset, + int guarded_items = 0) + { + VertexId rows[ITEMS_PER_THREAD]; + PartialSum partial_sums[ITEMS_PER_THREAD]; + HeadFlag head_flags[ITEMS_PER_THREAD]; + + // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values) + if (guarded_items) + { + // This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last + // vertex for out-of-bound items, but zero-valued + PartialSum default_sum; + default_sum.row = smem_storage.last_block_row; + default_sum.partial = Value(0); + + BlockLoadDirect(d_block_partials + block_offset, guarded_items, default_sum, partial_sums); + } + else + { + // Unguarded loads + BlockLoadDirect(d_block_partials + block_offset, partial_sums); + } + + // Fence to prevent hoisting any dependent code below into the loads above + __syncthreads(); + + // Copy out row IDs for row-head flagging + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + rows[ITEM] = partial_sums[ITEM].row; + } + + // Flag row heads by looking for discontinuities + BlockDiscontinuity::Flag( + smem_storage.discontinuity, + rows, // Original row ids + smem_storage.prev_tile_row, // Last row id from previous threadblock + NewRowOp(), // Functor for detecting start of new rows + head_flags); // (Out) Head flags + + // Barrier for smem reuse and coherence + __syncthreads(); + + // Store the last row in the tile (for computing head flags in the next tile) + if (threadIdx.x == BLOCK_THREADS - 1) + { + smem_storage.prev_tile_row = rows[ITEMS_PER_THREAD - 1]; + } + + // Compute the exclusive scan of partial_sums + PartialSum block_aggregate; // Threadblock-wide aggregate in thread0 (unused) + BlockScan::ExclusiveScan( + smem_storage.scan, + partial_sums, + partial_sums, // (Out) + smem_storage.identity, + ReduceByKeyOp(), + block_aggregate, // (Out) + carry); // (In-out) + + // Scatter an accumulated dot product if it is the head of a valid row + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (head_flags[ITEM]) + { + // Scatter + d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial; + } + } + + } +}; + + +/** + * COO Finalize kernel. + */ +template < + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + typename VertexId, + typename Value> +__launch_bounds__ (BLOCK_THREADS, 1) +__global__ void CooFinalizeKernel( + PartialSum *d_block_partials, + int finalize_partials, + Value *d_result) +{ + // Constants + enum + { + TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD + }; + + // SpMV threadblock tile-processing abstraction + typedef FinalizeSpmvBlock FinalizeSpmvBlock; + + // Shared memory + __shared__ typename FinalizeSpmvBlock::SmemStorage smem_storage; + + // Stateful prefix carryover from one tile to the next + BlockPrefixOp > carry; + + // Initialize scalar shared memory values + if (threadIdx.x == 0) + { + VertexId first_block_row = d_block_partials[0].row; + VertexId last_block_row = d_block_partials[finalize_partials - 1].row; + + // Initialize carry to identity + carry.prefix.row = first_block_row; + carry.prefix.partial = Value(0); + smem_storage.identity = carry.prefix; + + smem_storage.last_block_row = last_block_row; + smem_storage.prev_tile_row = first_block_row; + } + + // Barrier for smem coherence + __syncthreads(); + + // Process full tiles + int block_offset = 0; + while (block_offset <= finalize_partials - TILE_SIZE) + { + FinalizeSpmvBlock::ProcessTile( + smem_storage, + carry, + d_block_partials, + d_result, + block_offset); + + // Barrier for smem reuse and coherence + __syncthreads(); + + block_offset += TILE_SIZE; + } + + // Process final partial tile (if present) + int guarded_items = finalize_partials - block_offset; + if (guarded_items) + { + FinalizeSpmvBlock::ProcessTile( + smem_storage, + carry, + d_block_partials, + d_result, + block_offset, + guarded_items); + } + + // Scatter the final aggregate (this kernel contains only 1 threadblock) + if (threadIdx.x == 0) + { + d_result[carry.prefix.row] = carry.prefix.partial; + } +} + + + + + + +//--------------------------------------------------------------------- +// Host subroutines +//--------------------------------------------------------------------- + +/** + * Compares the equivalence of two arrays + */ +template +int CompareResults(float* computed, float* reference, SizeT len) +{ + int retval = 0; + for (SizeT i = 0; i < len; i++) + { + if (computed[i] > 0.0) + { + if ((computed[i] * 1.001 < reference[i]) || + (computed[i] * 0.999 > reference[i])) + { + printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); + return 1; + } + } + else + { + if ((computed[i] * 1.001 > reference[i]) || + (computed[i] * 0.999 < reference[i])) + { + printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); + return 1; + } + } + } + + if (!retval) printf("CORRECT"); + return retval; +} + +/** + * Compares the equivalence of two arrays + */ +template +int CompareResults(double* computed, double* reference, SizeT len) +{ + int retval = 0; + for (SizeT i = 0; i < len; i++) + { + if (computed[i] > 0.0) + { + if ((computed[i] * 1.001 < reference[i]) || + (computed[i] * 0.999 > reference[i])) + { + printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); + return 1; + } + } + else + { + if ((computed[i] * 1.001 > reference[i]) || + (computed[i] * 0.999 < reference[i])) + { + printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); + return 1; + } + } + } + + if (!retval) printf("CORRECT"); + return retval; +} + + +/** * Simple test of device - */ -template < - int COO_BLOCK_THREADS, - int COO_ITEMS_PER_THREAD, - int COO_BLOCK_OCCUPANCY, - int FINALIZE_BLOCK_THREADS, - int FINALIZE_ITEMS_PER_THREAD, - typename VertexId, - typename Value> -void TestDevice( - CooGraph& coo_graph, - Value* h_vector, - Value* h_reference) -{ - typedef PartialSum PartialSum; - - const int COO_TILE_SIZE = COO_BLOCK_THREADS * COO_ITEMS_PER_THREAD; - - if (g_iterations <= 0) return; - - - // SOA device storage - VertexId *d_rows; // SOA graph row coordinates - VertexId *d_columns; // SOA graph col coordinates - Value *d_values; // SOA graph values - Value *d_vector; // Vector multiplicand - Value *d_result; // Output row - PartialSum *d_block_partials; // Temporary storage for communicating dot product partials between threadblocks - - // Create SOA version of coo_graph on host - int num_edges = coo_graph.coo_tuples.size(); - VertexId *h_rows = new VertexId[num_edges]; - VertexId *h_columns = new VertexId[num_edges]; - Value *h_values = new Value[num_edges]; - for (int i = 0; i < num_edges; i++) - { - h_rows[i] = coo_graph.coo_tuples[i].row; - h_columns[i] = coo_graph.coo_tuples[i].col; - h_values[i] = coo_graph.coo_tuples[i].val; - } - - // Get CUDA properties - CudaProps cuda_props; - CubDebugExit(cuda_props.Init()); - - // Get kernel properties - KernelProps coo_kernel_props; - KernelProps finalize_kernel_props; - CubDebugExit(coo_kernel_props.Init( - CooKernel, - COO_BLOCK_THREADS, - cuda_props)); - CubDebugExit(finalize_kernel_props.Init( - CooFinalizeKernel, - FINALIZE_BLOCK_THREADS, - cuda_props)); - - // Determine launch configuration from kernel properties - int coo_grid_size = coo_kernel_props.OversubscribedGridSize(COO_TILE_SIZE, num_edges); - int finalize_partials = coo_grid_size * 2; - - // Allocate COO device arrays - CubDebugExit(DeviceAllocate((void**)&d_rows, sizeof(VertexId) * num_edges)); - CubDebugExit(DeviceAllocate((void**)&d_columns, sizeof(VertexId) * num_edges)); - CubDebugExit(DeviceAllocate((void**)&d_values, sizeof(Value) * num_edges)); - CubDebugExit(DeviceAllocate((void**)&d_vector, sizeof(Value) * coo_graph.col_dim)); - CubDebugExit(DeviceAllocate((void**)&d_result, sizeof(Value) * coo_graph.row_dim)); - CubDebugExit(DeviceAllocate((void**)&d_block_partials, sizeof(PartialSum) * finalize_partials)); - - // Copy host arrays to device - CubDebugExit(cudaMemcpy(d_rows, h_rows, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice)); - CubDebugExit(cudaMemcpy(d_columns, h_columns, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice)); - CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_edges, cudaMemcpyHostToDevice)); - CubDebugExit(cudaMemcpy(d_vector, h_vector, sizeof(Value) * coo_graph.col_dim, cudaMemcpyHostToDevice)); - - // Bind textures - TexVector::BindTexture(d_vector, coo_graph.col_dim); - - // Construct an even-share work distribution - GridEvenShare block_progress(num_edges, coo_grid_size, COO_TILE_SIZE); - - // Print debug info - printf("CooKernel<%d, %d><<<%d, %d>>>(...), Max SM occupancy: %d\n", - COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_kernel_props.max_block_occupancy); - if (coo_grid_size > 1) - { - printf("CooFinalizeKernel<<<1, %d>>>(...), Max SM occupancy: %d\n", - FINALIZE_BLOCK_THREADS, finalize_kernel_props.max_block_occupancy); - } - fflush(stdout); - - // Run kernel - GpuTimer gpu_timer; - float elapsed_millis = 0.0; - for (int i = 0; i < g_iterations; i++) - { - gpu_timer.Start(); - - // Initialize output - CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value))); - - // Run the COO kernel - CooKernel<<>>( - block_progress, - d_block_partials, - d_rows, - d_columns, - d_values, - d_vector, - d_result); - - if (coo_grid_size > 1) - { - // Run the COO finalize kernel - CooFinalizeKernel<<<1, FINALIZE_BLOCK_THREADS>>>( - d_block_partials, - finalize_partials, - d_result); - } - - gpu_timer.Stop(); - elapsed_millis += gpu_timer.ElapsedMillis(); - } - - // Force any kernel stdio to screen - CubDebugExit(cudaThreadSynchronize()); - fflush(stdout); - - // Display timing - float avg_elapsed = elapsed_millis / g_iterations; - int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim); - printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n", - g_iterations, - avg_elapsed, - total_bytes / avg_elapsed / 1000.0 / 1000.0, - num_edges * 2 / avg_elapsed / 1000.0 / 1000.0); - - // Check results - AssertEquals(0, CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, g_verbose, g_verbose)); - - // Cleanup - TexVector::UnbindTexture(); - CubDebugExit(DeviceFree(d_block_partials)); - CubDebugExit(DeviceFree(d_rows)); - CubDebugExit(DeviceFree(d_columns)); - CubDebugExit(DeviceFree(d_values)); - CubDebugExit(DeviceFree(d_vector)); - CubDebugExit(DeviceFree(d_result)); - delete[] h_rows; - delete[] h_columns; - delete[] h_values; -} - - -/** - * Compute reference answer on CPU - */ -template -void ComputeReference( - CooGraph& coo_graph, - Value* h_vector, - Value* h_reference) -{ - for (VertexId i = 0; i < coo_graph.row_dim; i++) - { - h_reference[i] = 0.0; - } - - for (VertexId i = 0; i < coo_graph.coo_tuples.size(); i++) - { - h_reference[coo_graph.coo_tuples[i].row] += - coo_graph.coo_tuples[i].val * - h_vector[coo_graph.coo_tuples[i].col]; - } -} - - -/** - * Assign arbitrary values to vector items - */ -template -void AssignVectorValues(Value *vector, int col_dim) -{ - for (int i = 0; i < col_dim; i++) - { - vector[i] = 1.0; - } -} - - -/** - * Main - */ -int main(int argc, char** argv) -{ - // Graph of uint32s as vertex ids, floats as values - typedef int VertexId; - typedef float Value; - - // Initialize command line - CommandLineArgs args(argc, argv); - g_verbose = args.CheckCmdLineFlag("v"); - args.GetCmdLineArgument("i", g_iterations); - - // Print usage - if (args.CheckCmdLineFlag("help")) - { - printf("%s\n [--device=] [--v] [--iterations=] [--grid-size=]\n" - "\t--type=wheel --spokes=\n" - "\t--type=grid2d --width= [--7pt]\n" - "\t--type=grid3d --width=\n" - "\t--type=market --file=\n" - "\n", argv[0]); - exit(0); - } - - // Initialize device - CubDebugExit(args.DeviceInit()); - - // Get graph type - string type; - args.GetCmdLineArgument("type", type); - - // Generate graph structure - - CpuTimer timer; - timer.Start(); - CooGraph coo_graph; - if (type == string("grid2d")) - { - VertexId width; - args.GetCmdLineArgument("width", width); - bool self = args.CheckCmdLineFlag("7pt"); - printf("Generating %s grid2d width(%d)... ", (self) ? "7-pt" : "6-pt", width); fflush(stdout); - if (coo_graph.InitGrid2d(width, self)) exit(1); - } else if (type == string("grid3d")) - { - VertexId width; - args.GetCmdLineArgument("width", width); - printf("Generating grid3d width(%d)... ", width); fflush(stdout); - if (coo_graph.InitGrid3d(width)) exit(1); - } - else if (type == string("wheel")) - { - VertexId spokes; - args.GetCmdLineArgument("spokes", spokes); - printf("Generating wheel spokes(%d)... ", spokes); fflush(stdout); - if (coo_graph.InitWheel(spokes)) exit(1); - } - else if (type == string("market")) - { - string filename; - args.GetCmdLineArgument("file", filename); - printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout); - if (coo_graph.InitMetis(filename)) exit(1); - } - else - { - printf("Unsupported graph type\n"); - exit(1); - } - timer.Stop(); - printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n", - timer.ElapsedMillis() / 1000.0, - coo_graph.coo_tuples.size(), - coo_graph.row_dim, - coo_graph.col_dim); - fflush(stdout); - - if (g_verbose) - { - cout << coo_graph << "\n"; - } - - // Create vector - Value *h_vector = new Value[coo_graph.col_dim]; - AssignVectorValues(h_vector, coo_graph.col_dim); - if (g_verbose) - { - printf("Vector[%d]: ", coo_graph.col_dim); - DisplayResults(h_vector, coo_graph.col_dim); - printf("\n\n"); - } - - // Compute reference answer - Value *h_reference = new Value[coo_graph.row_dim]; - ComputeReference(coo_graph, h_vector, h_reference); - if (g_verbose) - { - printf("Results[%d]: ", coo_graph.row_dim); - DisplayResults(h_reference, coo_graph.row_dim); - printf("\n\n"); - } - - // Run GPU version - TestDevice< - 128, - 10, - 6, - 256, - 4>(coo_graph, h_vector, h_reference); - - // Cleanup - delete[] h_vector; - delete[] h_reference; - - return 0; -} - - - + */ +template < + int COO_BLOCK_THREADS, + int COO_ITEMS_PER_THREAD, + int COO_BLOCK_OCCUPANCY, + int FINALIZE_BLOCK_THREADS, + int FINALIZE_ITEMS_PER_THREAD, + typename VertexId, + typename Value> +void TestDevice( + CooGraph& coo_graph, + Value* h_vector, + Value* h_reference) +{ + typedef PartialSum PartialSum; + + const int COO_TILE_SIZE = COO_BLOCK_THREADS * COO_ITEMS_PER_THREAD; + + if (g_iterations <= 0) return; + + + // SOA device storage + VertexId *d_rows; // SOA graph row coordinates + VertexId *d_columns; // SOA graph col coordinates + Value *d_values; // SOA graph values + Value *d_vector; // Vector multiplicand + Value *d_result; // Output row + PartialSum *d_block_partials; // Temporary storage for communicating dot product partials between threadblocks + + // Create SOA version of coo_graph on host + int num_edges = coo_graph.coo_tuples.size(); + VertexId *h_rows = new VertexId[num_edges]; + VertexId *h_columns = new VertexId[num_edges]; + Value *h_values = new Value[num_edges]; + for (int i = 0; i < num_edges; i++) + { + h_rows[i] = coo_graph.coo_tuples[i].row; + h_columns[i] = coo_graph.coo_tuples[i].col; + h_values[i] = coo_graph.coo_tuples[i].val; + } + + // Get CUDA properties + CudaProps cuda_props; + CubDebugExit(cuda_props.Init()); + + // Get kernel properties + KernelProps coo_kernel_props; + KernelProps finalize_kernel_props; + CubDebugExit(coo_kernel_props.Init( + CooKernel, + COO_BLOCK_THREADS, + cuda_props)); + CubDebugExit(finalize_kernel_props.Init( + CooFinalizeKernel, + FINALIZE_BLOCK_THREADS, + cuda_props)); + + // Determine launch configuration from kernel properties + int coo_grid_size = coo_kernel_props.OversubscribedGridSize(COO_TILE_SIZE, num_edges); + int finalize_partials = coo_grid_size * 2; + + // Allocate COO device arrays + CubDebugExit(DeviceAllocate((void**)&d_rows, sizeof(VertexId) * num_edges)); + CubDebugExit(DeviceAllocate((void**)&d_columns, sizeof(VertexId) * num_edges)); + CubDebugExit(DeviceAllocate((void**)&d_values, sizeof(Value) * num_edges)); + CubDebugExit(DeviceAllocate((void**)&d_vector, sizeof(Value) * coo_graph.col_dim)); + CubDebugExit(DeviceAllocate((void**)&d_result, sizeof(Value) * coo_graph.row_dim)); + CubDebugExit(DeviceAllocate((void**)&d_block_partials, sizeof(PartialSum) * finalize_partials)); + + // Copy host arrays to device + CubDebugExit(cudaMemcpy(d_rows, h_rows, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice)); + CubDebugExit(cudaMemcpy(d_columns, h_columns, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice)); + CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_edges, cudaMemcpyHostToDevice)); + CubDebugExit(cudaMemcpy(d_vector, h_vector, sizeof(Value) * coo_graph.col_dim, cudaMemcpyHostToDevice)); + + // Bind textures + TexVector::BindTexture(d_vector, coo_graph.col_dim); + + // Construct an even-share work distribution + GridEvenShare block_progress(num_edges, coo_grid_size, COO_TILE_SIZE); + + // Print debug info + printf("CooKernel<%d, %d><<<%d, %d>>>(...), Max SM occupancy: %d\n", + COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_kernel_props.max_block_occupancy); + if (coo_grid_size > 1) + { + printf("CooFinalizeKernel<<<1, %d>>>(...), Max SM occupancy: %d\n", + FINALIZE_BLOCK_THREADS, finalize_kernel_props.max_block_occupancy); + } + fflush(stdout); + + // Run kernel + GpuTimer gpu_timer; + float elapsed_millis = 0.0; + for (int i = 0; i < g_iterations; i++) + { + gpu_timer.Start(); + + // Initialize output + CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value))); + + // Run the COO kernel + CooKernel<<>>( + block_progress, + d_block_partials, + d_rows, + d_columns, + d_values, + d_vector, + d_result); + + if (coo_grid_size > 1) + { + // Run the COO finalize kernel + CooFinalizeKernel<<<1, FINALIZE_BLOCK_THREADS>>>( + d_block_partials, + finalize_partials, + d_result); + } + + gpu_timer.Stop(); + elapsed_millis += gpu_timer.ElapsedMillis(); + } + + // Force any kernel stdio to screen + CubDebugExit(cudaThreadSynchronize()); + fflush(stdout); + + // Display timing + float avg_elapsed = elapsed_millis / g_iterations; + int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim); + printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n", + g_iterations, + avg_elapsed, + total_bytes / avg_elapsed / 1000.0 / 1000.0, + num_edges * 2 / avg_elapsed / 1000.0 / 1000.0); + + // Check results + AssertEquals(0, CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, g_verbose, g_verbose)); + + // Cleanup + TexVector::UnbindTexture(); + CubDebugExit(DeviceFree(d_block_partials)); + CubDebugExit(DeviceFree(d_rows)); + CubDebugExit(DeviceFree(d_columns)); + CubDebugExit(DeviceFree(d_values)); + CubDebugExit(DeviceFree(d_vector)); + CubDebugExit(DeviceFree(d_result)); + delete[] h_rows; + delete[] h_columns; + delete[] h_values; +} + + +/** + * Compute reference answer on CPU + */ +template +void ComputeReference( + CooGraph& coo_graph, + Value* h_vector, + Value* h_reference) +{ + for (VertexId i = 0; i < coo_graph.row_dim; i++) + { + h_reference[i] = 0.0; + } + + for (VertexId i = 0; i < coo_graph.coo_tuples.size(); i++) + { + h_reference[coo_graph.coo_tuples[i].row] += + coo_graph.coo_tuples[i].val * + h_vector[coo_graph.coo_tuples[i].col]; + } +} + + +/** + * Assign arbitrary values to vector items + */ +template +void AssignVectorValues(Value *vector, int col_dim) +{ + for (int i = 0; i < col_dim; i++) + { + vector[i] = 1.0; + } +} + + +/** + * Main + */ +int main(int argc, char** argv) +{ + // Graph of uint32s as vertex ids, floats as values + typedef int VertexId; + typedef float Value; + + // Initialize command line + CommandLineArgs args(argc, argv); + g_verbose = args.CheckCmdLineFlag("v"); + args.GetCmdLineArgument("i", g_iterations); + + // Print usage + if (args.CheckCmdLineFlag("help")) + { + printf("%s\n [--device=] [--v] [--iterations=] [--grid-size=]\n" + "\t--type=wheel --spokes=\n" + "\t--type=grid2d --width= [--7pt]\n" + "\t--type=grid3d --width=\n" + "\t--type=market --file=\n" + "\n", argv[0]); + exit(0); + } + + // Initialize device + CubDebugExit(args.DeviceInit()); + + // Get graph type + string type; + args.GetCmdLineArgument("type", type); + + // Generate graph structure + + CpuTimer timer; + timer.Start(); + CooGraph coo_graph; + if (type == string("grid2d")) + { + VertexId width; + args.GetCmdLineArgument("width", width); + bool self = args.CheckCmdLineFlag("7pt"); + printf("Generating %s grid2d width(%d)... ", (self) ? "7-pt" : "6-pt", width); fflush(stdout); + if (coo_graph.InitGrid2d(width, self)) exit(1); + } else if (type == string("grid3d")) + { + VertexId width; + args.GetCmdLineArgument("width", width); + printf("Generating grid3d width(%d)... ", width); fflush(stdout); + if (coo_graph.InitGrid3d(width)) exit(1); + } + else if (type == string("wheel")) + { + VertexId spokes; + args.GetCmdLineArgument("spokes", spokes); + printf("Generating wheel spokes(%d)... ", spokes); fflush(stdout); + if (coo_graph.InitWheel(spokes)) exit(1); + } + else if (type == string("market")) + { + string filename; + args.GetCmdLineArgument("file", filename); + printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout); + if (coo_graph.InitMetis(filename)) exit(1); + } + else + { + printf("Unsupported graph type\n"); + exit(1); + } + timer.Stop(); + printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n", + timer.ElapsedMillis() / 1000.0, + coo_graph.coo_tuples.size(), + coo_graph.row_dim, + coo_graph.col_dim); + fflush(stdout); + + if (g_verbose) + { + cout << coo_graph << "\n"; + } + + // Create vector + Value *h_vector = new Value[coo_graph.col_dim]; + AssignVectorValues(h_vector, coo_graph.col_dim); + if (g_verbose) + { + printf("Vector[%d]: ", coo_graph.col_dim); + DisplayResults(h_vector, coo_graph.col_dim); + printf("\n\n"); + } + + // Compute reference answer + Value *h_reference = new Value[coo_graph.row_dim]; + ComputeReference(coo_graph, h_vector, h_reference); + if (g_verbose) + { + printf("Results[%d]: ", coo_graph.row_dim); + DisplayResults(h_reference, coo_graph.row_dim); + printf("\n\n"); + } + + // Run GPU version + TestDevice< + 128, + 10, + 6, + 256, + 4>(coo_graph, h_vector, h_reference); + + // Cleanup + delete[] h_vector; + delete[] h_reference; + + return 0; +} + + + diff --git a/test/test_coo_spmv_double.cu b/test/test_coo_spmv_double.cu index 664ef88635..3743fe27db 100644 --- a/test/test_coo_spmv_double.cu +++ b/test/test_coo_spmv_double.cu @@ -1,1481 +1,1481 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/****************************************************************************** - * Experimental reduce-value-by-row COO implementation of SPMV - ******************************************************************************/ - -// Ensure printing of CUDA runtime errors to console -#define CUB_STDERR - -#include -#include -#include -#include -#include -#include - -#include "cub.cuh" - -using namespace cub; -using namespace std; - - -//--------------------------------------------------------------------- -// Globals, constants and typedefs -//--------------------------------------------------------------------- - -bool g_verbose = false; -int g_iterations = 1; - - -//--------------------------------------------------------------------- -// Graph building types and utilities -//--------------------------------------------------------------------- - -/** +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Experimental reduce-value-by-row COO implementation of SPMV + ******************************************************************************/ + +// Ensure printing of CUDA runtime errors to console +#define CUB_STDERR + +#include +#include +#include +#include +#include +#include + +#include "cub.cuh" + +using namespace cub; +using namespace std; + + +//--------------------------------------------------------------------- +// Globals, constants and typedefs +//--------------------------------------------------------------------- + +bool g_verbose = false; +int g_iterations = 1; + + +//--------------------------------------------------------------------- +// Graph building types and utilities +//--------------------------------------------------------------------- + +/** * COO graph type. A COO graph is just a vector of edge tuples. - */ -template -struct CooGraph -{ - /** + */ +template +struct CooGraph +{ + /** * COO edge tuple. (A COO graph is just a vector of these.) - */ - struct CooTuple - { - VertexId row; - VertexId col; - Value val; - - CooTuple() {} - CooTuple(VertexId row, VertexId col) : row(row), col(col) {} - CooTuple(VertexId row, VertexId col, Value val) : row(row), col(col), val(val) {} - }; - - /** - * Comparator for sorting COO sparse format edges - */ - static bool CooTupleCompare (const CooTuple &elem1, const CooTuple &elem2) - { - if (elem1.row < elem2.row) - { - return true; - } - else if ((elem1.row == elem2.row) && (elem1.col < elem2.col)) - { - return true; - } - - return false; - } - - - /** - * Fields - */ - int row_dim; // Num rows - int col_dim; // Num cols - vector coo_tuples; // Non-zero entries - - - /** - * CooGraph ostream operator - */ - friend std::ostream& operator<<(std::ostream& os, const CooGraph& coo_graph) - { - os << "Sparse COO (" << coo_graph.row_dim << " rows, " << coo_graph.col_dim << " cols, " << coo_graph.coo_tuples.size() << " nonzeros):\n"; - os << "Ordinal, Row, Col, Val\n"; - for (int i = 0; i < coo_graph.coo_tuples.size(); i++) - { - os << i << ',' << coo_graph.coo_tuples[i].row << ',' << coo_graph.coo_tuples[i].col << ',' << coo_graph.coo_tuples[i].val << "\n"; - } - return os; - } - - /** - * Update graph dims based upon COO tuples - */ - void UpdateDims() - { - row_dim = -1; - col_dim = -1; - - for (int i = 0; i < coo_tuples.size(); i++) - { - row_dim = CUB_MAX(row_dim, coo_tuples[i].row); - col_dim = CUB_MAX(col_dim, coo_tuples[i].col); - } - - row_dim++; - col_dim++; - } - - /** - * Builds a METIS COO sparse from the given file. - */ - int InitMetis(const string &metis_filename) - { - coo_tuples.clear(); - - // Read from file - FILE *f_in = fopen(metis_filename.c_str(), "r"); - if (!f_in) return -1; - - int edges_read = -1; - int edges = 0; - - char line[1024]; - - while(true) - { - if (fscanf(f_in, "%[^\n]\n", line) <= 0) - { - break; - } - if (line[0] == '%') - { - // Comment - } - else if (edges_read == -1) - { - // Problem description - long long ll_nodes_x, ll_nodes_y, ll_edges; - if (sscanf(line, "%lld %lld %lld", &ll_nodes_x, &ll_nodes_y, &ll_edges) != 3) - { - fprintf(stderr, "Error parsing MARKET graph: invalid problem description\n"); - return -1; - } - - if (ll_nodes_x != ll_nodes_y) - { - fprintf(stderr, "Error parsing MARKET graph: not square (%lld, %lld)\n", ll_nodes_x, ll_nodes_y); - return -1; - } - - edges = ll_edges; - - printf(" (%lld nodes, %lld directed edges)... ", - (unsigned long long) ll_nodes_x, - (unsigned long long) ll_edges); - fflush(stdout); - - // Allocate coo graph - coo_tuples.reserve(edges); - edges_read++; - } - else - { - if (edges_read >= edges) - { - fprintf(stderr, "Error parsing MARKET graph: encountered more than %d edges\n", edges); - fclose(f_in); - return -1; - } - - long long ll_row, ll_col; - float val; - if (sscanf(line, "%lld %lld %f", &ll_col, &ll_row, &val) != 3) - { - fprintf(stderr, "Error parsing MARKET graph: badly formed edge\n", edges); - fclose(f_in); - return -1; - } - - ll_row -= 1; - ll_col -= 1; - - coo_tuples.push_back(CooTuple(ll_row, ll_col, val)); // zero-based array - edges_read++; - } - } - - if (edges_read != edges) - { - fprintf(stderr, "Error parsing MARKET graph: only %d/%d edges read\n", edges_read, edges); - fclose(f_in); - return -1; - } - - // Sort by rows, then columns, update dims - std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); - UpdateDims(); - - fclose(f_in); - - return 0; - } - - - /** - * Builds a wheel COO sparse graph having spokes spokes. - */ - int InitWheel(VertexId spokes) - { - VertexId edges = spokes + (spokes - 1); - - coo_tuples.clear(); - coo_tuples.reserve(edges); - - // Add spoke edges - for (VertexId i = 0; i < spokes; i++) - { - coo_tuples.push_back(CooTuple(0, i + 1)); - } - - // Add rim - for (VertexId i = 0; i < spokes; i++) - { - VertexId dest = (i + 1) % spokes; - coo_tuples.push_back(CooTuple(i + 1, dest + 1)); - } - - // Sort by rows, then columns, update dims - std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); - UpdateDims(); - - // Assign arbitrary values to graph vertices - for (int i = 0; i < coo_tuples.size(); i++) - { - coo_tuples[i].val = i % 21; - } - - return 0; - } - - - /** - * Builds a square 3D grid COO sparse graph. Interior nodes have degree 7 (including - * a self-loop). Values are unintialized, coo_tuples are sorted. - */ - int InitGrid3d(VertexId width) - { - VertexId interior_nodes = (width - 2) * (width - 2) * (width - 2); - VertexId face_nodes = (width - 2) * (width - 2) * 6; - VertexId edge_nodes = (width - 2) * 12; - VertexId corner_nodes = 8; - VertexId nodes = width * width * width; - VertexId edges = (interior_nodes * 6) + (face_nodes * 5) + (edge_nodes * 4) + (corner_nodes * 3) + nodes; - - coo_tuples.clear(); - coo_tuples.reserve(edges); - - for (VertexId i = 0; i < width; i++) { - for (VertexId j = 0; j < width; j++) { - for (VertexId k = 0; k < width; k++) { - - VertexId me = (i * width * width) + (j * width) + k; - - VertexId neighbor = (i * width * width) + (j * width) + (k - 1); - if (k - 1 >= 0) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = (i * width * width) + (j * width) + (k + 1); - if (k + 1 < width) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = (i * width * width) + ((j - 1) * width) + k; - if (j - 1 >= 0) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = (i * width * width) + ((j + 1) * width) + k; - if (j + 1 < width) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = ((i - 1) * width * width) + (j * width) + k; - if (i - 1 >= 0) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = ((i + 1) * width * width) + (j * width) + k; - if (i + 1 < width) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = me; - coo_tuples.push_back(CooTuple(me, neighbor)); - } - } - } - - // Sort by rows, then columns, update dims - std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); - UpdateDims(); - - // Assign arbitrary values to graph vertices - for (int i = 0; i < coo_tuples.size(); i++) - { - coo_tuples[i].val = i % 21; - } - - return 0; - } - - - /** - * Builds a square 2D grid CSR graph. Interior nodes have degree 5 (including - * a self-loop) - * - * Returns 0 on success, 1 on failure. - */ - int InitGrid2d(VertexId width, bool self) - { - VertexId interior_nodes = (width - 2) * (width - 2); - VertexId edge_nodes = (width - 2) * 4; - VertexId corner_nodes = 4; - VertexId edges = (interior_nodes * 4) + (edge_nodes * 3) + (corner_nodes * 2); - - if (self) edges += edges; - - coo_tuples.clear(); - coo_tuples.reserve(edges); - - for (VertexId j = 0; j < width; j++) { - for (VertexId k = 0; k < width; k++) { - - VertexId me = (j * width) + k; - - VertexId neighbor = (j * width) + (k - 1); - if (k - 1 >= 0) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = (j * width) + (k + 1); - if (k + 1 < width) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = ((j - 1) * width) + k; - if (j - 1 >= 0) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - neighbor = ((j + 1) * width) + k; - if (j + 1 < width) { - coo_tuples.push_back(CooTuple(me, neighbor)); - } - - if (self) - { - neighbor = me; - coo_tuples.push_back(CooTuple(me, neighbor)); - } - } - } - - // Sort by rows, then columns, update dims - std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); - UpdateDims(); - - // Assign arbitrary values to graph vertices - for (int i = 0; i < coo_tuples.size(); i++) - { - coo_tuples[i].val = i % 21; - } - - return 0; - } -}; - - - - -//--------------------------------------------------------------------- -// GPU types and device functions -//--------------------------------------------------------------------- - - -/// Pairing of dot product partial sums and corresponding row-id -template -struct PartialSum -{ - double partial; /// PartialSum sum - int row; /// Row-id - int padding; - - /// Tags indicating this structure provides overloaded ThreadLoad and ThreadStore operations - typedef void ThreadLoadTag; - typedef void ThreadStoreTag; - - /// ThreadLoad (simply defer to loading individual items) - template - __device__ __forceinline__ void ThreadLoad(PartialSum *ptr) - { - partial = cub::ThreadLoad(&(ptr->partial)); - row = cub::ThreadLoad(&(ptr->row)); - } - - /// ThreadStore (simply defer to storing individual items) - template - __device__ __forceinline__ void ThreadStore(PartialSum *ptr) const - { - // Always write partial first - cub::ThreadStore(&(ptr->partial), partial); - cub::ThreadStore(&(ptr->row), row); - } - -}; - - -/// Templated Texture reference type for multiplicand vector -template -struct TexVector -{ - // Texture reference type - typedef texture TexRef; - - static TexRef ref; - - /** - * Bind textures - */ - static void BindTexture(void *d_in, int elements) - { - cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); - if (d_in) - { - size_t offset; - size_t bytes = sizeof(Value) * elements; - CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes)); - } - } - - /** - * Unbind textures - */ - static void UnbindTexture() - { - CubDebugExit(cudaUnbindTexture(ref)); - } -}; - -// Texture reference definitions -template -typename TexVector::TexRef TexVector::ref = 0; - - -/// Reduce-by-row scan operator -struct ReduceByKeyOp -{ - template - __device__ __forceinline__ PartialSum operator()( - const PartialSum &first, - const PartialSum &second) - { - PartialSum retval; - - retval.partial = (second.row != first.row) ? - second.partial : - first.partial + second.partial; - - retval.row = second.row; - return retval; - } -}; - - -/// Min-row reduction operator -struct MinRowOp -{ - template - __device__ __forceinline__ PartialSum operator()( - const PartialSum &first, - const PartialSum &second) - { - return (first.row < second.row) ? - first : - second; - } -}; - - -// Callback functor for waiting on the previous threadblock to compute its partial sum (the prefix for this threadblock) -template -struct BlockPrefixOp -{ - PartialSum prefix; - - /** - * Threadblock-wide prefix callback functor called by thread-0 in BlockScan::ExclusiveScan(). - * Returns the threadblock-wide prefix to apply to all scan inputs. - */ - __device__ __forceinline__ PartialSum operator()( - const PartialSum &block_aggregate) ///< The aggregate sum of the local prefix sum inputs - { - ReduceByKeyOp scan_op; - - PartialSum retval = prefix; - prefix = scan_op(prefix, block_aggregate); - return retval; - } -}; - - -/// Functor for detecting row discontinuities. -struct NewRowOp -{ - /// Returns true if row_b is the start of a new row - template - __device__ __forceinline__ bool operator()( - const VertexId& row_a, - const VertexId& row_b) - { - return (row_a != row_b); - } -}; - - -/** + */ + struct CooTuple + { + VertexId row; + VertexId col; + Value val; + + CooTuple() {} + CooTuple(VertexId row, VertexId col) : row(row), col(col) {} + CooTuple(VertexId row, VertexId col, Value val) : row(row), col(col), val(val) {} + }; + + /** + * Comparator for sorting COO sparse format edges + */ + static bool CooTupleCompare (const CooTuple &elem1, const CooTuple &elem2) + { + if (elem1.row < elem2.row) + { + return true; + } + else if ((elem1.row == elem2.row) && (elem1.col < elem2.col)) + { + return true; + } + + return false; + } + + + /** + * Fields + */ + int row_dim; // Num rows + int col_dim; // Num cols + vector coo_tuples; // Non-zero entries + + + /** + * CooGraph ostream operator + */ + friend std::ostream& operator<<(std::ostream& os, const CooGraph& coo_graph) + { + os << "Sparse COO (" << coo_graph.row_dim << " rows, " << coo_graph.col_dim << " cols, " << coo_graph.coo_tuples.size() << " nonzeros):\n"; + os << "Ordinal, Row, Col, Val\n"; + for (int i = 0; i < coo_graph.coo_tuples.size(); i++) + { + os << i << ',' << coo_graph.coo_tuples[i].row << ',' << coo_graph.coo_tuples[i].col << ',' << coo_graph.coo_tuples[i].val << "\n"; + } + return os; + } + + /** + * Update graph dims based upon COO tuples + */ + void UpdateDims() + { + row_dim = -1; + col_dim = -1; + + for (int i = 0; i < coo_tuples.size(); i++) + { + row_dim = CUB_MAX(row_dim, coo_tuples[i].row); + col_dim = CUB_MAX(col_dim, coo_tuples[i].col); + } + + row_dim++; + col_dim++; + } + + /** + * Builds a METIS COO sparse from the given file. + */ + int InitMetis(const string &metis_filename) + { + coo_tuples.clear(); + + // Read from file + FILE *f_in = fopen(metis_filename.c_str(), "r"); + if (!f_in) return -1; + + int edges_read = -1; + int edges = 0; + + char line[1024]; + + while(true) + { + if (fscanf(f_in, "%[^\n]\n", line) <= 0) + { + break; + } + if (line[0] == '%') + { + // Comment + } + else if (edges_read == -1) + { + // Problem description + long long ll_nodes_x, ll_nodes_y, ll_edges; + if (sscanf(line, "%lld %lld %lld", &ll_nodes_x, &ll_nodes_y, &ll_edges) != 3) + { + fprintf(stderr, "Error parsing MARKET graph: invalid problem description\n"); + return -1; + } + + if (ll_nodes_x != ll_nodes_y) + { + fprintf(stderr, "Error parsing MARKET graph: not square (%lld, %lld)\n", ll_nodes_x, ll_nodes_y); + return -1; + } + + edges = ll_edges; + + printf(" (%lld nodes, %lld directed edges)... ", + (unsigned long long) ll_nodes_x, + (unsigned long long) ll_edges); + fflush(stdout); + + // Allocate coo graph + coo_tuples.reserve(edges); + edges_read++; + } + else + { + if (edges_read >= edges) + { + fprintf(stderr, "Error parsing MARKET graph: encountered more than %d edges\n", edges); + fclose(f_in); + return -1; + } + + long long ll_row, ll_col; + float val; + if (sscanf(line, "%lld %lld %f", &ll_col, &ll_row, &val) != 3) + { + fprintf(stderr, "Error parsing MARKET graph: badly formed edge\n", edges); + fclose(f_in); + return -1; + } + + ll_row -= 1; + ll_col -= 1; + + coo_tuples.push_back(CooTuple(ll_row, ll_col, val)); // zero-based array + edges_read++; + } + } + + if (edges_read != edges) + { + fprintf(stderr, "Error parsing MARKET graph: only %d/%d edges read\n", edges_read, edges); + fclose(f_in); + return -1; + } + + // Sort by rows, then columns, update dims + std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); + UpdateDims(); + + fclose(f_in); + + return 0; + } + + + /** + * Builds a wheel COO sparse graph having spokes spokes. + */ + int InitWheel(VertexId spokes) + { + VertexId edges = spokes + (spokes - 1); + + coo_tuples.clear(); + coo_tuples.reserve(edges); + + // Add spoke edges + for (VertexId i = 0; i < spokes; i++) + { + coo_tuples.push_back(CooTuple(0, i + 1)); + } + + // Add rim + for (VertexId i = 0; i < spokes; i++) + { + VertexId dest = (i + 1) % spokes; + coo_tuples.push_back(CooTuple(i + 1, dest + 1)); + } + + // Sort by rows, then columns, update dims + std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); + UpdateDims(); + + // Assign arbitrary values to graph vertices + for (int i = 0; i < coo_tuples.size(); i++) + { + coo_tuples[i].val = i % 21; + } + + return 0; + } + + + /** + * Builds a square 3D grid COO sparse graph. Interior nodes have degree 7 (including + * a self-loop). Values are unintialized, coo_tuples are sorted. + */ + int InitGrid3d(VertexId width) + { + VertexId interior_nodes = (width - 2) * (width - 2) * (width - 2); + VertexId face_nodes = (width - 2) * (width - 2) * 6; + VertexId edge_nodes = (width - 2) * 12; + VertexId corner_nodes = 8; + VertexId nodes = width * width * width; + VertexId edges = (interior_nodes * 6) + (face_nodes * 5) + (edge_nodes * 4) + (corner_nodes * 3) + nodes; + + coo_tuples.clear(); + coo_tuples.reserve(edges); + + for (VertexId i = 0; i < width; i++) { + for (VertexId j = 0; j < width; j++) { + for (VertexId k = 0; k < width; k++) { + + VertexId me = (i * width * width) + (j * width) + k; + + VertexId neighbor = (i * width * width) + (j * width) + (k - 1); + if (k - 1 >= 0) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = (i * width * width) + (j * width) + (k + 1); + if (k + 1 < width) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = (i * width * width) + ((j - 1) * width) + k; + if (j - 1 >= 0) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = (i * width * width) + ((j + 1) * width) + k; + if (j + 1 < width) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = ((i - 1) * width * width) + (j * width) + k; + if (i - 1 >= 0) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = ((i + 1) * width * width) + (j * width) + k; + if (i + 1 < width) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = me; + coo_tuples.push_back(CooTuple(me, neighbor)); + } + } + } + + // Sort by rows, then columns, update dims + std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); + UpdateDims(); + + // Assign arbitrary values to graph vertices + for (int i = 0; i < coo_tuples.size(); i++) + { + coo_tuples[i].val = i % 21; + } + + return 0; + } + + + /** + * Builds a square 2D grid CSR graph. Interior nodes have degree 5 (including + * a self-loop) + * + * Returns 0 on success, 1 on failure. + */ + int InitGrid2d(VertexId width, bool self) + { + VertexId interior_nodes = (width - 2) * (width - 2); + VertexId edge_nodes = (width - 2) * 4; + VertexId corner_nodes = 4; + VertexId edges = (interior_nodes * 4) + (edge_nodes * 3) + (corner_nodes * 2); + + if (self) edges += edges; + + coo_tuples.clear(); + coo_tuples.reserve(edges); + + for (VertexId j = 0; j < width; j++) { + for (VertexId k = 0; k < width; k++) { + + VertexId me = (j * width) + k; + + VertexId neighbor = (j * width) + (k - 1); + if (k - 1 >= 0) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = (j * width) + (k + 1); + if (k + 1 < width) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = ((j - 1) * width) + k; + if (j - 1 >= 0) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + neighbor = ((j + 1) * width) + k; + if (j + 1 < width) { + coo_tuples.push_back(CooTuple(me, neighbor)); + } + + if (self) + { + neighbor = me; + coo_tuples.push_back(CooTuple(me, neighbor)); + } + } + } + + // Sort by rows, then columns, update dims + std::stable_sort(coo_tuples.begin(), coo_tuples.end(), CooTupleCompare); + UpdateDims(); + + // Assign arbitrary values to graph vertices + for (int i = 0; i < coo_tuples.size(); i++) + { + coo_tuples[i].val = i % 21; + } + + return 0; + } +}; + + + + +//--------------------------------------------------------------------- +// GPU types and device functions +//--------------------------------------------------------------------- + + +/// Pairing of dot product partial sums and corresponding row-id +template +struct PartialSum +{ + double partial; /// PartialSum sum + int row; /// Row-id + int padding; + + /// Tags indicating this structure provides overloaded ThreadLoad and ThreadStore operations + typedef void ThreadLoadTag; + typedef void ThreadStoreTag; + + /// ThreadLoad (simply defer to loading individual items) + template + __device__ __forceinline__ void ThreadLoad(PartialSum *ptr) + { + partial = cub::ThreadLoad(&(ptr->partial)); + row = cub::ThreadLoad(&(ptr->row)); + } + + /// ThreadStore (simply defer to storing individual items) + template + __device__ __forceinline__ void ThreadStore(PartialSum *ptr) const + { + // Always write partial first + cub::ThreadStore(&(ptr->partial), partial); + cub::ThreadStore(&(ptr->row), row); + } + +}; + + +/// Templated Texture reference type for multiplicand vector +template +struct TexVector +{ + // Texture reference type + typedef texture TexRef; + + static TexRef ref; + + /** + * Bind textures + */ + static void BindTexture(void *d_in, int elements) + { + cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); + if (d_in) + { + size_t offset; + size_t bytes = sizeof(Value) * elements; + CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes)); + } + } + + /** + * Unbind textures + */ + static void UnbindTexture() + { + CubDebugExit(cudaUnbindTexture(ref)); + } +}; + +// Texture reference definitions +template +typename TexVector::TexRef TexVector::ref = 0; + + +/// Reduce-by-row scan operator +struct ReduceByKeyOp +{ + template + __device__ __forceinline__ PartialSum operator()( + const PartialSum &first, + const PartialSum &second) + { + PartialSum retval; + + retval.partial = (second.row != first.row) ? + second.partial : + first.partial + second.partial; + + retval.row = second.row; + return retval; + } +}; + + +/// Min-row reduction operator +struct MinRowOp +{ + template + __device__ __forceinline__ PartialSum operator()( + const PartialSum &first, + const PartialSum &second) + { + return (first.row < second.row) ? + first : + second; + } +}; + + +// Callback functor for waiting on the previous threadblock to compute its partial sum (the prefix for this threadblock) +template +struct BlockPrefixOp +{ + PartialSum prefix; + + /** + * Threadblock-wide prefix callback functor called by thread-0 in BlockScan::ExclusiveScan(). + * Returns the threadblock-wide prefix to apply to all scan inputs. + */ + __device__ __forceinline__ PartialSum operator()( + const PartialSum &block_aggregate) ///< The aggregate sum of the local prefix sum inputs + { + ReduceByKeyOp scan_op; + + PartialSum retval = prefix; + prefix = scan_op(prefix, block_aggregate); + return retval; + } +}; + + +/// Functor for detecting row discontinuities. +struct NewRowOp +{ + /// Returns true if row_b is the start of a new row + template + __device__ __forceinline__ bool operator()( + const VertexId& row_a, + const VertexId& row_b) + { + return (row_a != row_b); + } +}; + + +/** * Threadblock abstraction for processing sparse SPMV tiles - */ -template < - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - typename VertexId, - typename Value> -struct SpmvBlock -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Constants - enum - { - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - // Head flag type - typedef int HeadFlag; - - // Dot product partial sum type - typedef PartialSum PartialSum; - - // Parameterized CUB types for the parallel execution context - typedef BlockScan BlockScan; - typedef BlockExchange BlockExchangeRows; - typedef BlockExchange BlockExchangeValues; - typedef BlockDiscontinuity BlockDiscontinuity; - typedef BlockReduce BlockReduce; - - // Shared memory type for this threadblock - struct SmemStorage - { - union - { - typename BlockExchangeRows::SmemStorage exchange_rows; // Smem needed for striped->blocked transpose - typename BlockExchangeValues::SmemStorage exchange_values; // Smem needed for striped->blocked transpose - struct - { - typename BlockDiscontinuity::SmemStorage discontinuity; // Smem needed for head-flagging - typename BlockScan::SmemStorage scan; // Smem needed for reduce-value-by-row scan - }; - typename BlockReduce::SmemStorage reduce; // Smem needed for min-finding reduction - }; - - VertexId last_block_row; - VertexId prev_tile_row; - PartialSum identity; - PartialSum first_scatter; - }; - - //--------------------------------------------------------------------- - // Operations - //--------------------------------------------------------------------- - - /** - * Processes a COO input tile of edges, outputting dot products for each row - */ - __device__ __forceinline__ - static void ProcessTile( - SmemStorage &smem_storage, - BlockPrefixOp &carry, - VertexId* d_rows, - VertexId* d_columns, - Value* d_values, - Value* d_vector, - Value* d_result, - int block_offset, - int guarded_items = 0) - { - VertexId columns[ITEMS_PER_THREAD]; - VertexId rows[ITEMS_PER_THREAD]; - Value values[ITEMS_PER_THREAD]; - PartialSum partial_sums[ITEMS_PER_THREAD]; - HeadFlag head_flags[ITEMS_PER_THREAD]; - - // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values) - if (guarded_items) - { - // This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last - // vertex for out-of-bound items, but zero-valued - BlockLoadDirectStriped(d_columns + block_offset, guarded_items, VertexId(0), columns, BLOCK_THREADS); - BlockLoadDirectStriped(d_values + block_offset, guarded_items, Value(0), values, BLOCK_THREADS); - } - else - { - // Unguarded loads - BlockLoadDirectStriped(d_columns + block_offset, columns, BLOCK_THREADS); - BlockLoadDirectStriped(d_values + block_offset, values, BLOCK_THREADS); - } - - // Fence to prevent hoisting any dependent code below into the loads above - __syncthreads(); - - // Load the referenced values from x and compute the dot product partials sums - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - Value vec_item; - uint2 *v = reinterpret_cast(&vec_item); - *v= tex1Dfetch(TexVector::ref, columns[ITEM]); - values[ITEM] *= vec_item; - } - - // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values) - if (guarded_items) - { - // This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last - // vertex for out-of-bound items, but zero-valued - BlockLoadDirectStriped(d_rows + block_offset, guarded_items, smem_storage.last_block_row, rows, BLOCK_THREADS); - } - else - { - // Unguarded loads - BlockLoadDirectStriped(d_rows + block_offset, rows, BLOCK_THREADS); - } - - // Transpose from threadblock-striped to threadblock-blocked arrangement - BlockExchangeValues::StripedToBlocked(smem_storage.exchange_values, values); - - // Barrier for smem reuse and coherence - __syncthreads(); - - // Transpose from threadblock-striped to threadblock-blocked arrangement - BlockExchangeRows::StripedToBlocked(smem_storage.exchange_rows, rows); - - // Barrier for smem reuse and coherence - __syncthreads(); - - // Flag row heads by looking for discontinuities - BlockDiscontinuity::Flag( - smem_storage.discontinuity, - rows, // Original row ids - smem_storage.prev_tile_row, // Last row id from previous threadblock - NewRowOp(), // Functor for detecting start of new rows - head_flags); // (Out) Head flags - - // Assemble - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - partial_sums[ITEM].partial = values[ITEM]; - partial_sums[ITEM].row = rows[ITEM]; - } - - // Compute the exclusive scan of partial_sums - PartialSum block_aggregate; // Threadblock-wide aggregate in thread0 (unused) - BlockScan::ExclusiveScan( - smem_storage.scan, - partial_sums, - partial_sums, // (Out) - smem_storage.identity, - ReduceByKeyOp(), - block_aggregate, // (Out) - carry); // (In-out) - - // Store the last row in the tile (for computing head flags in the next tile) - if (threadIdx.x == BLOCK_THREADS - 1) - { - smem_storage.prev_tile_row = rows[ITEMS_PER_THREAD - 1]; - } - - // Pull the last row ID out of smem - VertexId last_block_row = smem_storage.last_block_row; - - // Scatter an accumulated dot product if it is the head of a valid row - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (head_flags[ITEM]) - { - // Scatter - d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial; - } - else - { - // Otherwise reset the row ID to the last row in the threadblock's range - partial_sums[ITEM].row = last_block_row; - } - } - - // Find the first-scattered dot product if not yet set - if (smem_storage.first_scatter.row == last_block_row) - { - // Barrier for smem reuse and coherence - __syncthreads(); - - PartialSum candidate = BlockReduce::Reduce( - smem_storage.reduce, - partial_sums, - MinRowOp()); - - // Stash the first-scattered dot product in smem - if (threadIdx.x == 0) - { - smem_storage.first_scatter = candidate; - } - } - - } -}; - - -/** - * COO SpMV kernel - */ -template < - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - int BLOCK_OCCUPANCY, - typename VertexId, - typename Value> -__launch_bounds__ (BLOCK_THREADS, BLOCK_OCCUPANCY) -__global__ void CooKernel( - GridEvenShare block_progress, - PartialSum *d_block_partials, - VertexId *d_rows, - VertexId *d_columns, - Value *d_values, - Value *d_vector, - Value *d_result) -{ - // Constants - enum - { - TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD - }; - - // SpMV threadblock tile-processing abstraction - typedef SpmvBlock SpmvBlock; - - // Shared memory - __shared__ typename SpmvBlock::SmemStorage smem_storage; - - // Initialize threadblock even-share to tell us where to start and stop our tile-processing - block_progress.Init(); - - // Stateful prefix carryover from one tile to the next - BlockPrefixOp > carry; - - // Initialize scalar shared memory values - if (threadIdx.x == 0) - { - VertexId first_block_row = d_rows[block_progress.block_offset]; - VertexId last_block_row = d_rows[block_progress.block_oob - 1]; - - // Initialize carry to identity - carry.prefix.row = first_block_row; - carry.prefix.partial = Value(0); - smem_storage.identity = carry.prefix; - - smem_storage.first_scatter.row = last_block_row; - smem_storage.first_scatter.partial = Value(0); - smem_storage.last_block_row = last_block_row; - smem_storage.prev_tile_row = (blockIdx.x == 0) ? - first_block_row : - d_rows[block_progress.block_offset - 1]; - } - - // Process full tiles - while (block_progress.block_offset <= block_progress.block_oob - TILE_SIZE) - { - SpmvBlock::ProcessTile( - smem_storage, - carry, - d_rows, - d_columns, - d_values, - d_vector, - d_result, - block_progress.block_offset); - - block_progress.block_offset += TILE_SIZE; - } - - // Process final partial tile (if present) - int guarded_items = block_progress.block_oob - block_progress.block_offset; - if (guarded_items) - { - SpmvBlock::ProcessTile( - smem_storage, - carry, - d_rows, - d_columns, - d_values, - d_vector, - d_result, - block_progress.block_offset, - guarded_items); - } - - if (threadIdx.x == 0) - { - if (gridDim.x == 1) - { - // Scatter the final aggregate (this kernel contains only 1 threadblock) - d_result[carry.prefix.row] = carry.prefix.partial; - } - else - { - // Write out threadblock first-item and carry aggregate - - // Unweight the first-output if it's the same row as the carry aggregate - PartialSum first_scatter = smem_storage.first_scatter; - if (first_scatter.row == carry.prefix.row) first_scatter.partial = Value(0); - - d_block_partials[blockIdx.x * 2] = first_scatter; - d_block_partials[(blockIdx.x * 2) + 1] = carry.prefix; - } - } -} - - -/** - * Threadblock abstraction for processing sparse SPMV tiles - */ -template < - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - typename VertexId, - typename Value> -struct FinalizeSpmvBlock -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Constants - enum - { - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - // Head flag type - typedef int HeadFlag; - - // Dot product partial sum type - typedef PartialSum PartialSum; - - // Parameterized CUB types for the parallel execution context - typedef BlockScan BlockScan; - typedef BlockDiscontinuity BlockDiscontinuity; - - // Shared memory type for this threadblock - struct SmemStorage - { - union - { - typename BlockScan::SmemStorage scan; // Smem needed for reduce-value-by-row scan - typename BlockDiscontinuity::SmemStorage discontinuity; // Smem needed for head-flagging - }; - - VertexId last_block_row; - VertexId prev_tile_row; - PartialSum identity; - }; - - //--------------------------------------------------------------------- - // Operations - //--------------------------------------------------------------------- - - /** - * Processes a COO input tile of edges, outputting dot products for each row - */ - __device__ __forceinline__ - static void ProcessTile( - SmemStorage &smem_storage, - BlockPrefixOp &carry, - PartialSum *d_block_partials, - Value *d_result, - int block_offset, - int guarded_items = 0) - { - VertexId rows[ITEMS_PER_THREAD]; - PartialSum partial_sums[ITEMS_PER_THREAD]; - HeadFlag head_flags[ITEMS_PER_THREAD]; - - // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values) - if (guarded_items) - { - // This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last - // vertex for out-of-bound items, but zero-valued - PartialSum default_sum; - default_sum.row = smem_storage.last_block_row; - default_sum.partial = Value(0); - - BlockLoadDirect(d_block_partials + block_offset, guarded_items, default_sum, partial_sums); - } - else - { - // Unguarded loads - BlockLoadDirect(d_block_partials + block_offset, partial_sums); - } - - // Fence to prevent hoisting any dependent code below into the loads above -// __threadfence_block(); - - // Copy out row IDs for row-head flagging - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - rows[ITEM] = partial_sums[ITEM].row; - } - - // Flag row heads by looking for discontinuities - BlockDiscontinuity::Flag( - smem_storage.discontinuity, - rows, // Original row ids - smem_storage.prev_tile_row, // Last row id from previous threadblock - NewRowOp(), // Functor for detecting start of new rows - head_flags); // (Out) Head flags - - // Barrier for smem reuse and coherence - __syncthreads(); - - // Store the last row in the tile (for computing head flags in the next tile) - if (threadIdx.x == BLOCK_THREADS - 1) - { - smem_storage.prev_tile_row = rows[ITEMS_PER_THREAD - 1]; - } - - // Compute the exclusive scan of partial_sums - PartialSum block_aggregate; // Threadblock-wide aggregate in thread0 (unused) - BlockScan::ExclusiveScan( - smem_storage.scan, - partial_sums, - partial_sums, // (Out) - smem_storage.identity, - ReduceByKeyOp(), - block_aggregate, // (Out) - carry); // (In-out) - - // Scatter an accumulated dot product if it is the head of a valid row - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (head_flags[ITEM]) - { - // Scatter - d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial; - } - } - - } -}; - - -/** - * COO Finalize kernel. - */ -template < - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - typename VertexId, - typename Value> -__launch_bounds__ (BLOCK_THREADS, 1) -__global__ void CooFinalizeKernel( - PartialSum *d_block_partials, - int finalize_partials, - Value *d_result) -{ - // Constants - enum - { - TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD - }; - - // SpMV threadblock tile-processing abstraction - typedef FinalizeSpmvBlock FinalizeSpmvBlock; - - // Shared memory - __shared__ typename FinalizeSpmvBlock::SmemStorage smem_storage; - - // Stateful prefix carryover from one tile to the next - BlockPrefixOp > carry; - - // Initialize scalar shared memory values - if (threadIdx.x == 0) - { - VertexId first_block_row = d_block_partials[0].row; - VertexId last_block_row = d_block_partials[finalize_partials - 1].row; - - // Initialize carry to identity - carry.prefix.row = first_block_row; - carry.prefix.partial = Value(0); - smem_storage.identity = carry.prefix; - - smem_storage.last_block_row = last_block_row; - smem_storage.prev_tile_row = first_block_row; - } - - // Barrier for smem coherence - __syncthreads(); - - // Process full tiles - int block_offset = 0; - while (block_offset <= finalize_partials - TILE_SIZE) - { - FinalizeSpmvBlock::ProcessTile( - smem_storage, - carry, - d_block_partials, - d_result, - block_offset); - - // Barrier for smem reuse and coherence - __syncthreads(); - - block_offset += TILE_SIZE; - } - - // Process final partial tile (if present) - int guarded_items = finalize_partials - block_offset; - if (guarded_items) - { - FinalizeSpmvBlock::ProcessTile( - smem_storage, - carry, - d_block_partials, - d_result, - block_offset, - guarded_items); - } - - // Scatter the final aggregate (this kernel contains only 1 threadblock) - if (threadIdx.x == 0) - { - d_result[carry.prefix.row] = carry.prefix.partial; - } -} - - - - - - -//--------------------------------------------------------------------- -// Host subroutines -//--------------------------------------------------------------------- - -/** - * Compares the equivalence of two arrays - */ -template -int CompareResults(float* computed, float* reference, SizeT len) -{ - int retval = 0; - for (SizeT i = 0; i < len; i++) - { - if (computed[i] > 0.0) - { - if ((computed[i] * 1.001 < reference[i]) || - (computed[i] * 0.999 > reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - else - { - if ((computed[i] * 1.001 > reference[i]) || - (computed[i] * 0.999 < reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - } - - if (!retval) printf("CORRECT"); - return retval; -} - -/** - * Compares the equivalence of two arrays - */ -template -int CompareResults(double* computed, double* reference, SizeT len) -{ - int retval = 0; - for (SizeT i = 0; i < len; i++) - { - if (computed[i] > 0.0) - { - if ((computed[i] * 1.001 < reference[i]) || - (computed[i] * 0.999 > reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - else - { - if ((computed[i] * 1.001 > reference[i]) || - (computed[i] * 0.999 < reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - } - - if (!retval) printf("CORRECT"); - return retval; -} - - -/** + */ +template < + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + typename VertexId, + typename Value> +struct SpmvBlock +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Constants + enum + { + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Head flag type + typedef int HeadFlag; + + // Dot product partial sum type + typedef PartialSum PartialSum; + + // Parameterized CUB types for the parallel execution context + typedef BlockScan BlockScan; + typedef BlockExchange BlockExchangeRows; + typedef BlockExchange BlockExchangeValues; + typedef BlockDiscontinuity BlockDiscontinuity; + typedef BlockReduce BlockReduce; + + // Shared memory type for this threadblock + struct SmemStorage + { + union + { + typename BlockExchangeRows::SmemStorage exchange_rows; // Smem needed for striped->blocked transpose + typename BlockExchangeValues::SmemStorage exchange_values; // Smem needed for striped->blocked transpose + struct + { + typename BlockDiscontinuity::SmemStorage discontinuity; // Smem needed for head-flagging + typename BlockScan::SmemStorage scan; // Smem needed for reduce-value-by-row scan + }; + typename BlockReduce::SmemStorage reduce; // Smem needed for min-finding reduction + }; + + VertexId last_block_row; + VertexId prev_tile_row; + PartialSum identity; + PartialSum first_scatter; + }; + + //--------------------------------------------------------------------- + // Operations + //--------------------------------------------------------------------- + + /** + * Processes a COO input tile of edges, outputting dot products for each row + */ + __device__ __forceinline__ + static void ProcessTile( + SmemStorage &smem_storage, + BlockPrefixOp &carry, + VertexId* d_rows, + VertexId* d_columns, + Value* d_values, + Value* d_vector, + Value* d_result, + int block_offset, + int guarded_items = 0) + { + VertexId columns[ITEMS_PER_THREAD]; + VertexId rows[ITEMS_PER_THREAD]; + Value values[ITEMS_PER_THREAD]; + PartialSum partial_sums[ITEMS_PER_THREAD]; + HeadFlag head_flags[ITEMS_PER_THREAD]; + + // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values) + if (guarded_items) + { + // This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last + // vertex for out-of-bound items, but zero-valued + BlockLoadDirectStriped(d_columns + block_offset, guarded_items, VertexId(0), columns, BLOCK_THREADS); + BlockLoadDirectStriped(d_values + block_offset, guarded_items, Value(0), values, BLOCK_THREADS); + } + else + { + // Unguarded loads + BlockLoadDirectStriped(d_columns + block_offset, columns, BLOCK_THREADS); + BlockLoadDirectStriped(d_values + block_offset, values, BLOCK_THREADS); + } + + // Fence to prevent hoisting any dependent code below into the loads above + __syncthreads(); + + // Load the referenced values from x and compute the dot product partials sums + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + Value vec_item; + uint2 *v = reinterpret_cast(&vec_item); + *v= tex1Dfetch(TexVector::ref, columns[ITEM]); + values[ITEM] *= vec_item; + } + + // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values) + if (guarded_items) + { + // This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last + // vertex for out-of-bound items, but zero-valued + BlockLoadDirectStriped(d_rows + block_offset, guarded_items, smem_storage.last_block_row, rows, BLOCK_THREADS); + } + else + { + // Unguarded loads + BlockLoadDirectStriped(d_rows + block_offset, rows, BLOCK_THREADS); + } + + // Transpose from threadblock-striped to threadblock-blocked arrangement + BlockExchangeValues::StripedToBlocked(smem_storage.exchange_values, values); + + // Barrier for smem reuse and coherence + __syncthreads(); + + // Transpose from threadblock-striped to threadblock-blocked arrangement + BlockExchangeRows::StripedToBlocked(smem_storage.exchange_rows, rows); + + // Barrier for smem reuse and coherence + __syncthreads(); + + // Flag row heads by looking for discontinuities + BlockDiscontinuity::Flag( + smem_storage.discontinuity, + rows, // Original row ids + smem_storage.prev_tile_row, // Last row id from previous threadblock + NewRowOp(), // Functor for detecting start of new rows + head_flags); // (Out) Head flags + + // Assemble + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + partial_sums[ITEM].partial = values[ITEM]; + partial_sums[ITEM].row = rows[ITEM]; + } + + // Compute the exclusive scan of partial_sums + PartialSum block_aggregate; // Threadblock-wide aggregate in thread0 (unused) + BlockScan::ExclusiveScan( + smem_storage.scan, + partial_sums, + partial_sums, // (Out) + smem_storage.identity, + ReduceByKeyOp(), + block_aggregate, // (Out) + carry); // (In-out) + + // Store the last row in the tile (for computing head flags in the next tile) + if (threadIdx.x == BLOCK_THREADS - 1) + { + smem_storage.prev_tile_row = rows[ITEMS_PER_THREAD - 1]; + } + + // Pull the last row ID out of smem + VertexId last_block_row = smem_storage.last_block_row; + + // Scatter an accumulated dot product if it is the head of a valid row + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (head_flags[ITEM]) + { + // Scatter + d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial; + } + else + { + // Otherwise reset the row ID to the last row in the threadblock's range + partial_sums[ITEM].row = last_block_row; + } + } + + // Find the first-scattered dot product if not yet set + if (smem_storage.first_scatter.row == last_block_row) + { + // Barrier for smem reuse and coherence + __syncthreads(); + + PartialSum candidate = BlockReduce::Reduce( + smem_storage.reduce, + partial_sums, + MinRowOp()); + + // Stash the first-scattered dot product in smem + if (threadIdx.x == 0) + { + smem_storage.first_scatter = candidate; + } + } + + } +}; + + +/** + * COO SpMV kernel + */ +template < + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + int BLOCK_OCCUPANCY, + typename VertexId, + typename Value> +__launch_bounds__ (BLOCK_THREADS, BLOCK_OCCUPANCY) +__global__ void CooKernel( + GridEvenShare block_progress, + PartialSum *d_block_partials, + VertexId *d_rows, + VertexId *d_columns, + Value *d_values, + Value *d_vector, + Value *d_result) +{ + // Constants + enum + { + TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD + }; + + // SpMV threadblock tile-processing abstraction + typedef SpmvBlock SpmvBlock; + + // Shared memory + __shared__ typename SpmvBlock::SmemStorage smem_storage; + + // Initialize threadblock even-share to tell us where to start and stop our tile-processing + block_progress.Init(); + + // Stateful prefix carryover from one tile to the next + BlockPrefixOp > carry; + + // Initialize scalar shared memory values + if (threadIdx.x == 0) + { + VertexId first_block_row = d_rows[block_progress.block_offset]; + VertexId last_block_row = d_rows[block_progress.block_oob - 1]; + + // Initialize carry to identity + carry.prefix.row = first_block_row; + carry.prefix.partial = Value(0); + smem_storage.identity = carry.prefix; + + smem_storage.first_scatter.row = last_block_row; + smem_storage.first_scatter.partial = Value(0); + smem_storage.last_block_row = last_block_row; + smem_storage.prev_tile_row = (blockIdx.x == 0) ? + first_block_row : + d_rows[block_progress.block_offset - 1]; + } + + // Process full tiles + while (block_progress.block_offset <= block_progress.block_oob - TILE_SIZE) + { + SpmvBlock::ProcessTile( + smem_storage, + carry, + d_rows, + d_columns, + d_values, + d_vector, + d_result, + block_progress.block_offset); + + block_progress.block_offset += TILE_SIZE; + } + + // Process final partial tile (if present) + int guarded_items = block_progress.block_oob - block_progress.block_offset; + if (guarded_items) + { + SpmvBlock::ProcessTile( + smem_storage, + carry, + d_rows, + d_columns, + d_values, + d_vector, + d_result, + block_progress.block_offset, + guarded_items); + } + + if (threadIdx.x == 0) + { + if (gridDim.x == 1) + { + // Scatter the final aggregate (this kernel contains only 1 threadblock) + d_result[carry.prefix.row] = carry.prefix.partial; + } + else + { + // Write out threadblock first-item and carry aggregate + + // Unweight the first-output if it's the same row as the carry aggregate + PartialSum first_scatter = smem_storage.first_scatter; + if (first_scatter.row == carry.prefix.row) first_scatter.partial = Value(0); + + d_block_partials[blockIdx.x * 2] = first_scatter; + d_block_partials[(blockIdx.x * 2) + 1] = carry.prefix; + } + } +} + + +/** + * Threadblock abstraction for processing sparse SPMV tiles + */ +template < + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + typename VertexId, + typename Value> +struct FinalizeSpmvBlock +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Constants + enum + { + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Head flag type + typedef int HeadFlag; + + // Dot product partial sum type + typedef PartialSum PartialSum; + + // Parameterized CUB types for the parallel execution context + typedef BlockScan BlockScan; + typedef BlockDiscontinuity BlockDiscontinuity; + + // Shared memory type for this threadblock + struct SmemStorage + { + union + { + typename BlockScan::SmemStorage scan; // Smem needed for reduce-value-by-row scan + typename BlockDiscontinuity::SmemStorage discontinuity; // Smem needed for head-flagging + }; + + VertexId last_block_row; + VertexId prev_tile_row; + PartialSum identity; + }; + + //--------------------------------------------------------------------- + // Operations + //--------------------------------------------------------------------- + + /** + * Processes a COO input tile of edges, outputting dot products for each row + */ + __device__ __forceinline__ + static void ProcessTile( + SmemStorage &smem_storage, + BlockPrefixOp &carry, + PartialSum *d_block_partials, + Value *d_result, + int block_offset, + int guarded_items = 0) + { + VertexId rows[ITEMS_PER_THREAD]; + PartialSum partial_sums[ITEMS_PER_THREAD]; + HeadFlag head_flags[ITEMS_PER_THREAD]; + + // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values) + if (guarded_items) + { + // This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last + // vertex for out-of-bound items, but zero-valued + PartialSum default_sum; + default_sum.row = smem_storage.last_block_row; + default_sum.partial = Value(0); + + BlockLoadDirect(d_block_partials + block_offset, guarded_items, default_sum, partial_sums); + } + else + { + // Unguarded loads + BlockLoadDirect(d_block_partials + block_offset, partial_sums); + } + + // Fence to prevent hoisting any dependent code below into the loads above +// __threadfence_block(); + + // Copy out row IDs for row-head flagging + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + rows[ITEM] = partial_sums[ITEM].row; + } + + // Flag row heads by looking for discontinuities + BlockDiscontinuity::Flag( + smem_storage.discontinuity, + rows, // Original row ids + smem_storage.prev_tile_row, // Last row id from previous threadblock + NewRowOp(), // Functor for detecting start of new rows + head_flags); // (Out) Head flags + + // Barrier for smem reuse and coherence + __syncthreads(); + + // Store the last row in the tile (for computing head flags in the next tile) + if (threadIdx.x == BLOCK_THREADS - 1) + { + smem_storage.prev_tile_row = rows[ITEMS_PER_THREAD - 1]; + } + + // Compute the exclusive scan of partial_sums + PartialSum block_aggregate; // Threadblock-wide aggregate in thread0 (unused) + BlockScan::ExclusiveScan( + smem_storage.scan, + partial_sums, + partial_sums, // (Out) + smem_storage.identity, + ReduceByKeyOp(), + block_aggregate, // (Out) + carry); // (In-out) + + // Scatter an accumulated dot product if it is the head of a valid row + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (head_flags[ITEM]) + { + // Scatter + d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial; + } + } + + } +}; + + +/** + * COO Finalize kernel. + */ +template < + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + typename VertexId, + typename Value> +__launch_bounds__ (BLOCK_THREADS, 1) +__global__ void CooFinalizeKernel( + PartialSum *d_block_partials, + int finalize_partials, + Value *d_result) +{ + // Constants + enum + { + TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD + }; + + // SpMV threadblock tile-processing abstraction + typedef FinalizeSpmvBlock FinalizeSpmvBlock; + + // Shared memory + __shared__ typename FinalizeSpmvBlock::SmemStorage smem_storage; + + // Stateful prefix carryover from one tile to the next + BlockPrefixOp > carry; + + // Initialize scalar shared memory values + if (threadIdx.x == 0) + { + VertexId first_block_row = d_block_partials[0].row; + VertexId last_block_row = d_block_partials[finalize_partials - 1].row; + + // Initialize carry to identity + carry.prefix.row = first_block_row; + carry.prefix.partial = Value(0); + smem_storage.identity = carry.prefix; + + smem_storage.last_block_row = last_block_row; + smem_storage.prev_tile_row = first_block_row; + } + + // Barrier for smem coherence + __syncthreads(); + + // Process full tiles + int block_offset = 0; + while (block_offset <= finalize_partials - TILE_SIZE) + { + FinalizeSpmvBlock::ProcessTile( + smem_storage, + carry, + d_block_partials, + d_result, + block_offset); + + // Barrier for smem reuse and coherence + __syncthreads(); + + block_offset += TILE_SIZE; + } + + // Process final partial tile (if present) + int guarded_items = finalize_partials - block_offset; + if (guarded_items) + { + FinalizeSpmvBlock::ProcessTile( + smem_storage, + carry, + d_block_partials, + d_result, + block_offset, + guarded_items); + } + + // Scatter the final aggregate (this kernel contains only 1 threadblock) + if (threadIdx.x == 0) + { + d_result[carry.prefix.row] = carry.prefix.partial; + } +} + + + + + + +//--------------------------------------------------------------------- +// Host subroutines +//--------------------------------------------------------------------- + +/** + * Compares the equivalence of two arrays + */ +template +int CompareResults(float* computed, float* reference, SizeT len) +{ + int retval = 0; + for (SizeT i = 0; i < len; i++) + { + if (computed[i] > 0.0) + { + if ((computed[i] * 1.001 < reference[i]) || + (computed[i] * 0.999 > reference[i])) + { + printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); + return 1; + } + } + else + { + if ((computed[i] * 1.001 > reference[i]) || + (computed[i] * 0.999 < reference[i])) + { + printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); + return 1; + } + } + } + + if (!retval) printf("CORRECT"); + return retval; +} + +/** + * Compares the equivalence of two arrays + */ +template +int CompareResults(double* computed, double* reference, SizeT len) +{ + int retval = 0; + for (SizeT i = 0; i < len; i++) + { + if (computed[i] > 0.0) + { + if ((computed[i] * 1.001 < reference[i]) || + (computed[i] * 0.999 > reference[i])) + { + printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); + return 1; + } + } + else + { + if ((computed[i] * 1.001 > reference[i]) || + (computed[i] * 0.999 < reference[i])) + { + printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); + return 1; + } + } + } + + if (!retval) printf("CORRECT"); + return retval; +} + + +/** * Simple test of device - */ -template < - int COO_BLOCK_THREADS, - int COO_ITEMS_PER_THREAD, - int COO_BLOCK_OCCUPANCY, - int FINALIZE_BLOCK_THREADS, - int FINALIZE_ITEMS_PER_THREAD, - typename VertexId, - typename Value> -void TestDevice( - CooGraph& coo_graph, - Value* h_vector, - Value* h_reference) -{ - typedef PartialSum PartialSum; - - const int COO_TILE_SIZE = COO_BLOCK_THREADS * COO_ITEMS_PER_THREAD; - - if (g_iterations <= 0) return; - - - // SOA device storage - VertexId *d_rows; // SOA graph row coordinates - VertexId *d_columns; // SOA graph col coordinates - Value *d_values; // SOA graph values - Value *d_vector; // Vector multiplicand - Value *d_result; // Output row - PartialSum *d_block_partials; // Temporary storage for communicating dot product partials between threadblocks - - // Create SOA version of coo_graph on host - int num_edges = coo_graph.coo_tuples.size(); - VertexId *h_rows = new VertexId[num_edges]; - VertexId *h_columns = new VertexId[num_edges]; - Value *h_values = new Value[num_edges]; - for (int i = 0; i < num_edges; i++) - { - h_rows[i] = coo_graph.coo_tuples[i].row; - h_columns[i] = coo_graph.coo_tuples[i].col; - h_values[i] = coo_graph.coo_tuples[i].val; - } - - // Get CUDA properties - CudaProps cuda_props; - CubDebugExit(cuda_props.Init()); - - // Get kernel properties - KernelProps coo_kernel_props; - KernelProps finalize_kernel_props; - CubDebugExit(coo_kernel_props.Init( - CooKernel, - COO_BLOCK_THREADS, - cuda_props)); - CubDebugExit(finalize_kernel_props.Init( - CooFinalizeKernel, - FINALIZE_BLOCK_THREADS, - cuda_props)); - - // Determine launch configuration from kernel properties - int coo_grid_size = coo_kernel_props.OversubscribedGridSize(COO_TILE_SIZE, num_edges); - int finalize_partials = coo_grid_size * 2; - - // Allocate COO device arrays - CubDebugExit(DeviceAllocate((void**)&d_rows, sizeof(VertexId) * num_edges)); - CubDebugExit(DeviceAllocate((void**)&d_columns, sizeof(VertexId) * num_edges)); - CubDebugExit(DeviceAllocate((void**)&d_values, sizeof(Value) * num_edges)); - CubDebugExit(DeviceAllocate((void**)&d_vector, sizeof(Value) * coo_graph.col_dim)); - CubDebugExit(DeviceAllocate((void**)&d_result, sizeof(Value) * coo_graph.row_dim)); - CubDebugExit(DeviceAllocate((void**)&d_block_partials, sizeof(PartialSum) * finalize_partials)); - - // Copy host arrays to device - CubDebugExit(cudaMemcpy(d_rows, h_rows, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice)); - CubDebugExit(cudaMemcpy(d_columns, h_columns, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice)); - CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_edges, cudaMemcpyHostToDevice)); - CubDebugExit(cudaMemcpy(d_vector, h_vector, sizeof(Value) * coo_graph.col_dim, cudaMemcpyHostToDevice)); - - // Bind textures -// TexVector::BindTexture(d_vector, coo_graph.col_dim); - TexVector::BindTexture((uint2 *) d_vector, coo_graph.col_dim); - - // Construct an even-share work distribution - GridEvenShare block_progress(num_edges, coo_grid_size, COO_TILE_SIZE); - - // Print debug info - printf("CooKernel<%d, %d><<<%d, %d>>>(...), Max SM occupancy: %d\n", - COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_kernel_props.max_block_occupancy); - if (coo_grid_size > 1) - { - printf("CooFinalizeKernel<<<1, %d>>>(...), Max SM occupancy: %d\n", - FINALIZE_BLOCK_THREADS, finalize_kernel_props.max_block_occupancy); - } - fflush(stdout); - - // Run kernel - GpuTimer gpu_timer; - float elapsed_millis = 0.0; - for (int i = 0; i < g_iterations; i++) - { - gpu_timer.Start(); - - // Initialize output - CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value))); - - // Run the COO kernel - CooKernel<<>>( - block_progress, - d_block_partials, - d_rows, - d_columns, - d_values, - d_vector, - d_result); - - if (coo_grid_size > 1) - { - // Run the COO finalize kernel - CooFinalizeKernel<<<1, FINALIZE_BLOCK_THREADS>>>( - d_block_partials, - finalize_partials, - d_result); - } - - gpu_timer.Stop(); - elapsed_millis += gpu_timer.ElapsedMillis(); - } - - // Force any kernel stdio to screen - CubDebugExit(cudaThreadSynchronize()); - fflush(stdout); - - // Display timing - float avg_elapsed = elapsed_millis / g_iterations; - int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim); - printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n", - g_iterations, - avg_elapsed, - total_bytes / avg_elapsed / 1000.0 / 1000.0, - num_edges * 2 / avg_elapsed / 1000.0 / 1000.0); - - // Check results - AssertEquals(0, CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, g_verbose, g_verbose)); - - // Cleanup -// TexVector::UnbindTexture(); - TexVector::UnbindTexture(); - CubDebugExit(DeviceFree(d_block_partials)); - CubDebugExit(DeviceFree(d_rows)); - CubDebugExit(DeviceFree(d_columns)); - CubDebugExit(DeviceFree(d_values)); - CubDebugExit(DeviceFree(d_vector)); - CubDebugExit(DeviceFree(d_result)); - delete[] h_rows; - delete[] h_columns; - delete[] h_values; -} - - -/** - * Compute reference answer on CPU - */ -template -void ComputeReference( - CooGraph& coo_graph, - Value* h_vector, - Value* h_reference) -{ - for (VertexId i = 0; i < coo_graph.row_dim; i++) - { - h_reference[i] = 0.0; - } - - for (VertexId i = 0; i < coo_graph.coo_tuples.size(); i++) - { - h_reference[coo_graph.coo_tuples[i].row] += - coo_graph.coo_tuples[i].val * - h_vector[coo_graph.coo_tuples[i].col]; - } -} - - -/** - * Assign arbitrary values to vector items - */ -template -void AssignVectorValues(Value *vector, int col_dim) -{ - for (int i = 0; i < col_dim; i++) - { - vector[i] = 1.0; - } -} - - -/** - * Main - */ -int main(int argc, char** argv) -{ - // Graph of uint32s as vertex ids, floats as values - typedef int VertexId; - typedef double Value; - - // Initialize command line - CommandLineArgs args(argc, argv); - g_verbose = args.CheckCmdLineFlag("v"); - args.GetCmdLineArgument("i", g_iterations); - - // Print usage - if (args.CheckCmdLineFlag("help")) - { - printf("%s\n [--device=] [--v] [--iterations=] [--grid-size=]\n" - "\t--type=wheel --spokes=\n" - "\t--type=grid2d --width= [--7pt]\n" - "\t--type=grid3d --width=\n" - "\t--type=market --file=\n" - "\n", argv[0]); - exit(0); - } - - // Initialize device - CubDebugExit(args.DeviceInit()); - - // Get graph type - string type; - args.GetCmdLineArgument("type", type); - - // Generate graph structure - - CpuTimer timer; - timer.Start(); - CooGraph coo_graph; - if (type == string("grid2d")) - { - VertexId width; - args.GetCmdLineArgument("width", width); - bool self = args.CheckCmdLineFlag("7pt"); - printf("Generating %s grid2d width(%d)... ", (self) ? "7-pt" : "6-pt", width); fflush(stdout); - if (coo_graph.InitGrid2d(width, self)) exit(1); - } else if (type == string("grid3d")) - { - VertexId width; - args.GetCmdLineArgument("width", width); - printf("Generating grid3d width(%d)... ", width); fflush(stdout); - if (coo_graph.InitGrid3d(width)) exit(1); - } - else if (type == string("wheel")) - { - VertexId spokes; - args.GetCmdLineArgument("spokes", spokes); - printf("Generating wheel spokes(%d)... ", spokes); fflush(stdout); - if (coo_graph.InitWheel(spokes)) exit(1); - } - else if (type == string("market")) - { - string filename; - args.GetCmdLineArgument("file", filename); - printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout); - if (coo_graph.InitMetis(filename)) exit(1); - } - else - { - printf("Unsupported graph type\n"); - exit(1); - } - timer.Stop(); - printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n", - timer.ElapsedMillis() / 1000.0, - coo_graph.coo_tuples.size(), - coo_graph.row_dim, - coo_graph.col_dim); - fflush(stdout); - - if (g_verbose) - { - cout << coo_graph << "\n"; - } - - // Create vector - Value *h_vector = new Value[coo_graph.col_dim]; - AssignVectorValues(h_vector, coo_graph.col_dim); - if (g_verbose) - { - printf("Vector[%d]: ", coo_graph.col_dim); - DisplayResults(h_vector, coo_graph.col_dim); - printf("\n\n"); - } - - // Compute reference answer - Value *h_reference = new Value[coo_graph.row_dim]; - ComputeReference(coo_graph, h_vector, h_reference); - if (g_verbose) - { - printf("Results[%d]: ", coo_graph.row_dim); - DisplayResults(h_reference, coo_graph.row_dim); - printf("\n\n"); - } - - // Run GPU version - TestDevice< - 128, - 7, - 5, - 256, - 4>(coo_graph, h_vector, h_reference); - - // Cleanup - delete[] h_vector; - delete[] h_reference; - - return 0; -} - - - + */ +template < + int COO_BLOCK_THREADS, + int COO_ITEMS_PER_THREAD, + int COO_BLOCK_OCCUPANCY, + int FINALIZE_BLOCK_THREADS, + int FINALIZE_ITEMS_PER_THREAD, + typename VertexId, + typename Value> +void TestDevice( + CooGraph& coo_graph, + Value* h_vector, + Value* h_reference) +{ + typedef PartialSum PartialSum; + + const int COO_TILE_SIZE = COO_BLOCK_THREADS * COO_ITEMS_PER_THREAD; + + if (g_iterations <= 0) return; + + + // SOA device storage + VertexId *d_rows; // SOA graph row coordinates + VertexId *d_columns; // SOA graph col coordinates + Value *d_values; // SOA graph values + Value *d_vector; // Vector multiplicand + Value *d_result; // Output row + PartialSum *d_block_partials; // Temporary storage for communicating dot product partials between threadblocks + + // Create SOA version of coo_graph on host + int num_edges = coo_graph.coo_tuples.size(); + VertexId *h_rows = new VertexId[num_edges]; + VertexId *h_columns = new VertexId[num_edges]; + Value *h_values = new Value[num_edges]; + for (int i = 0; i < num_edges; i++) + { + h_rows[i] = coo_graph.coo_tuples[i].row; + h_columns[i] = coo_graph.coo_tuples[i].col; + h_values[i] = coo_graph.coo_tuples[i].val; + } + + // Get CUDA properties + CudaProps cuda_props; + CubDebugExit(cuda_props.Init()); + + // Get kernel properties + KernelProps coo_kernel_props; + KernelProps finalize_kernel_props; + CubDebugExit(coo_kernel_props.Init( + CooKernel, + COO_BLOCK_THREADS, + cuda_props)); + CubDebugExit(finalize_kernel_props.Init( + CooFinalizeKernel, + FINALIZE_BLOCK_THREADS, + cuda_props)); + + // Determine launch configuration from kernel properties + int coo_grid_size = coo_kernel_props.OversubscribedGridSize(COO_TILE_SIZE, num_edges); + int finalize_partials = coo_grid_size * 2; + + // Allocate COO device arrays + CubDebugExit(DeviceAllocate((void**)&d_rows, sizeof(VertexId) * num_edges)); + CubDebugExit(DeviceAllocate((void**)&d_columns, sizeof(VertexId) * num_edges)); + CubDebugExit(DeviceAllocate((void**)&d_values, sizeof(Value) * num_edges)); + CubDebugExit(DeviceAllocate((void**)&d_vector, sizeof(Value) * coo_graph.col_dim)); + CubDebugExit(DeviceAllocate((void**)&d_result, sizeof(Value) * coo_graph.row_dim)); + CubDebugExit(DeviceAllocate((void**)&d_block_partials, sizeof(PartialSum) * finalize_partials)); + + // Copy host arrays to device + CubDebugExit(cudaMemcpy(d_rows, h_rows, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice)); + CubDebugExit(cudaMemcpy(d_columns, h_columns, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice)); + CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_edges, cudaMemcpyHostToDevice)); + CubDebugExit(cudaMemcpy(d_vector, h_vector, sizeof(Value) * coo_graph.col_dim, cudaMemcpyHostToDevice)); + + // Bind textures +// TexVector::BindTexture(d_vector, coo_graph.col_dim); + TexVector::BindTexture((uint2 *) d_vector, coo_graph.col_dim); + + // Construct an even-share work distribution + GridEvenShare block_progress(num_edges, coo_grid_size, COO_TILE_SIZE); + + // Print debug info + printf("CooKernel<%d, %d><<<%d, %d>>>(...), Max SM occupancy: %d\n", + COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_kernel_props.max_block_occupancy); + if (coo_grid_size > 1) + { + printf("CooFinalizeKernel<<<1, %d>>>(...), Max SM occupancy: %d\n", + FINALIZE_BLOCK_THREADS, finalize_kernel_props.max_block_occupancy); + } + fflush(stdout); + + // Run kernel + GpuTimer gpu_timer; + float elapsed_millis = 0.0; + for (int i = 0; i < g_iterations; i++) + { + gpu_timer.Start(); + + // Initialize output + CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value))); + + // Run the COO kernel + CooKernel<<>>( + block_progress, + d_block_partials, + d_rows, + d_columns, + d_values, + d_vector, + d_result); + + if (coo_grid_size > 1) + { + // Run the COO finalize kernel + CooFinalizeKernel<<<1, FINALIZE_BLOCK_THREADS>>>( + d_block_partials, + finalize_partials, + d_result); + } + + gpu_timer.Stop(); + elapsed_millis += gpu_timer.ElapsedMillis(); + } + + // Force any kernel stdio to screen + CubDebugExit(cudaThreadSynchronize()); + fflush(stdout); + + // Display timing + float avg_elapsed = elapsed_millis / g_iterations; + int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim); + printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n", + g_iterations, + avg_elapsed, + total_bytes / avg_elapsed / 1000.0 / 1000.0, + num_edges * 2 / avg_elapsed / 1000.0 / 1000.0); + + // Check results + AssertEquals(0, CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, g_verbose, g_verbose)); + + // Cleanup +// TexVector::UnbindTexture(); + TexVector::UnbindTexture(); + CubDebugExit(DeviceFree(d_block_partials)); + CubDebugExit(DeviceFree(d_rows)); + CubDebugExit(DeviceFree(d_columns)); + CubDebugExit(DeviceFree(d_values)); + CubDebugExit(DeviceFree(d_vector)); + CubDebugExit(DeviceFree(d_result)); + delete[] h_rows; + delete[] h_columns; + delete[] h_values; +} + + +/** + * Compute reference answer on CPU + */ +template +void ComputeReference( + CooGraph& coo_graph, + Value* h_vector, + Value* h_reference) +{ + for (VertexId i = 0; i < coo_graph.row_dim; i++) + { + h_reference[i] = 0.0; + } + + for (VertexId i = 0; i < coo_graph.coo_tuples.size(); i++) + { + h_reference[coo_graph.coo_tuples[i].row] += + coo_graph.coo_tuples[i].val * + h_vector[coo_graph.coo_tuples[i].col]; + } +} + + +/** + * Assign arbitrary values to vector items + */ +template +void AssignVectorValues(Value *vector, int col_dim) +{ + for (int i = 0; i < col_dim; i++) + { + vector[i] = 1.0; + } +} + + +/** + * Main + */ +int main(int argc, char** argv) +{ + // Graph of uint32s as vertex ids, floats as values + typedef int VertexId; + typedef double Value; + + // Initialize command line + CommandLineArgs args(argc, argv); + g_verbose = args.CheckCmdLineFlag("v"); + args.GetCmdLineArgument("i", g_iterations); + + // Print usage + if (args.CheckCmdLineFlag("help")) + { + printf("%s\n [--device=] [--v] [--iterations=] [--grid-size=]\n" + "\t--type=wheel --spokes=\n" + "\t--type=grid2d --width= [--7pt]\n" + "\t--type=grid3d --width=\n" + "\t--type=market --file=\n" + "\n", argv[0]); + exit(0); + } + + // Initialize device + CubDebugExit(args.DeviceInit()); + + // Get graph type + string type; + args.GetCmdLineArgument("type", type); + + // Generate graph structure + + CpuTimer timer; + timer.Start(); + CooGraph coo_graph; + if (type == string("grid2d")) + { + VertexId width; + args.GetCmdLineArgument("width", width); + bool self = args.CheckCmdLineFlag("7pt"); + printf("Generating %s grid2d width(%d)... ", (self) ? "7-pt" : "6-pt", width); fflush(stdout); + if (coo_graph.InitGrid2d(width, self)) exit(1); + } else if (type == string("grid3d")) + { + VertexId width; + args.GetCmdLineArgument("width", width); + printf("Generating grid3d width(%d)... ", width); fflush(stdout); + if (coo_graph.InitGrid3d(width)) exit(1); + } + else if (type == string("wheel")) + { + VertexId spokes; + args.GetCmdLineArgument("spokes", spokes); + printf("Generating wheel spokes(%d)... ", spokes); fflush(stdout); + if (coo_graph.InitWheel(spokes)) exit(1); + } + else if (type == string("market")) + { + string filename; + args.GetCmdLineArgument("file", filename); + printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout); + if (coo_graph.InitMetis(filename)) exit(1); + } + else + { + printf("Unsupported graph type\n"); + exit(1); + } + timer.Stop(); + printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n", + timer.ElapsedMillis() / 1000.0, + coo_graph.coo_tuples.size(), + coo_graph.row_dim, + coo_graph.col_dim); + fflush(stdout); + + if (g_verbose) + { + cout << coo_graph << "\n"; + } + + // Create vector + Value *h_vector = new Value[coo_graph.col_dim]; + AssignVectorValues(h_vector, coo_graph.col_dim); + if (g_verbose) + { + printf("Vector[%d]: ", coo_graph.col_dim); + DisplayResults(h_vector, coo_graph.col_dim); + printf("\n\n"); + } + + // Compute reference answer + Value *h_reference = new Value[coo_graph.row_dim]; + ComputeReference(coo_graph, h_vector, h_reference); + if (g_verbose) + { + printf("Results[%d]: ", coo_graph.row_dim); + DisplayResults(h_reference, coo_graph.row_dim); + printf("\n\n"); + } + + // Run GPU version + TestDevice< + 128, + 7, + 5, + 256, + 4>(coo_graph, h_vector, h_reference); + + // Cleanup + delete[] h_vector; + delete[] h_reference; + + return 0; +} + + + From 0c117a9fda5a21dd0114ae54ff324957979f2183 Mon Sep 17 00:00:00 2001 From: Steven Dalton Date: Thu, 11 Apr 2013 23:22:19 -0500 Subject: [PATCH 2/2] Reworked floating point array equality tests --- test/test_coo_spmv.cu | 69 +--------------------------------- test/test_coo_spmv_double.cu | 73 +----------------------------------- test/test_util.h | 43 +++++++++++++++++++++ 3 files changed, 45 insertions(+), 140 deletions(-) diff --git a/test/test_coo_spmv.cu b/test/test_coo_spmv.cu index 24f0573a8c..38f491d6ff 100644 --- a/test/test_coo_spmv.cu +++ b/test/test_coo_spmv.cu @@ -1095,73 +1095,6 @@ __global__ void CooFinalizeKernel( // Host subroutines //--------------------------------------------------------------------- -/** - * Compares the equivalence of two arrays - */ -template -int CompareResults(float* computed, float* reference, SizeT len) -{ - int retval = 0; - for (SizeT i = 0; i < len; i++) - { - if (computed[i] > 0.0) - { - if ((computed[i] * 1.001 < reference[i]) || - (computed[i] * 0.999 > reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - else - { - if ((computed[i] * 1.001 > reference[i]) || - (computed[i] * 0.999 < reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - } - - if (!retval) printf("CORRECT"); - return retval; -} - -/** - * Compares the equivalence of two arrays - */ -template -int CompareResults(double* computed, double* reference, SizeT len) -{ - int retval = 0; - for (SizeT i = 0; i < len; i++) - { - if (computed[i] > 0.0) - { - if ((computed[i] * 1.001 < reference[i]) || - (computed[i] * 0.999 > reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - else - { - if ((computed[i] * 1.001 > reference[i]) || - (computed[i] * 0.999 < reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - } - - if (!retval) printf("CORRECT"); - return retval; -} - - /** * Simple test of device */ @@ -1426,7 +1359,7 @@ int main(int argc, char** argv) exit(1); } timer.Stop(); - printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n", + printf("Done (%.3fs). %lu non-zeros, %d rows, %d columns\n", timer.ElapsedMillis() / 1000.0, coo_graph.coo_tuples.size(), coo_graph.row_dim, diff --git a/test/test_coo_spmv_double.cu b/test/test_coo_spmv_double.cu index 3743fe27db..145128e5cf 100644 --- a/test/test_coo_spmv_double.cu +++ b/test/test_coo_spmv_double.cu @@ -1093,77 +1093,6 @@ __global__ void CooFinalizeKernel( -//--------------------------------------------------------------------- -// Host subroutines -//--------------------------------------------------------------------- - -/** - * Compares the equivalence of two arrays - */ -template -int CompareResults(float* computed, float* reference, SizeT len) -{ - int retval = 0; - for (SizeT i = 0; i < len; i++) - { - if (computed[i] > 0.0) - { - if ((computed[i] * 1.001 < reference[i]) || - (computed[i] * 0.999 > reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - else - { - if ((computed[i] * 1.001 > reference[i]) || - (computed[i] * 0.999 < reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - } - - if (!retval) printf("CORRECT"); - return retval; -} - -/** - * Compares the equivalence of two arrays - */ -template -int CompareResults(double* computed, double* reference, SizeT len) -{ - int retval = 0; - for (SizeT i = 0; i < len; i++) - { - if (computed[i] > 0.0) - { - if ((computed[i] * 1.001 < reference[i]) || - (computed[i] * 0.999 > reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - else - { - if ((computed[i] * 1.001 > reference[i]) || - (computed[i] * 0.999 < reference[i])) - { - printf("INCORRECT ([%d]: %f != %f) ", i, computed[i], reference[i]); - return 1; - } - } - } - - if (!retval) printf("CORRECT"); - return retval; -} - - /** * Simple test of device */ @@ -1430,7 +1359,7 @@ int main(int argc, char** argv) exit(1); } timer.Stop(); - printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n", + printf("Done (%.3fs). %lu non-zeros, %d rows, %d columns\n", timer.ElapsedMillis() / 1000.0, coo_graph.coo_tuples.size(), coo_graph.row_dim, diff --git a/test/test_util.h b/test/test_util.h index 0f6775c4c6..5b10891514 100644 --- a/test/test_util.h +++ b/test/test_util.h @@ -45,6 +45,7 @@ #include #include #include +#include #include "cub.cuh" @@ -692,6 +693,48 @@ int CompareResults(T* computed, S* reference, SizeT len) return 0; } +/** + * Compares the equivalence of two arrays + */ +template +int CompareResults(float* computed, float* reference, SizeT len) +{ + int retval = 0; + for (SizeT i = 0; i < len; i++) + { + float difference = std::abs(computed[i]-reference[i]); + if(difference > std::numeric_limits::epsilon()) + { + printf("INCORRECT ([%lu,%e]: %f != %f) \n", i, difference, computed[i], reference[i]); + return 1; + } + } + + if (!retval) printf("CORRECT\n"); + return retval; +} + +/** + * Compares the equivalence of two arrays + */ +template +int CompareResults(double* computed, double* reference, SizeT len) +{ + int retval = 0; + for (SizeT i = 0; i < len; i++) + { + double difference = std::abs(computed[i]-reference[i]); + if(difference > std::numeric_limits::epsilon()) + { + printf("INCORRECT ([%lu,%le]: %lf != %lf) \n", i, difference, computed[i], reference[i]); + return 1; + } + } + + if (!retval) printf("CORRECT\n"); + return retval; +} + /** * Verify the contents of a device array match those