Merged
Changes from all commits
50 commits
ce9f994
Initial rewrite to support N-dimensional tensors
stijnh Jun 20, 2023
1a5fd74
Tests pass for new tensor implementation
stijnh Jun 22, 2023
9f4a8e6
Various features added for tensor support
stijnh Jun 27, 2023
1812de6
Rename aliases in prelude
stijnh Jun 27, 2023
3c63bbc
Add missing include in complex.h
stijnh Jul 17, 2023
8fb8b94
Allow implicit conversion of `tensor<T, 1>` to type `T`
stijnh Jul 17, 2023
bdfed80
Rename aliases in prelude
stijnh Jul 17, 2023
96f18a6
Add `dot` function
stijnh Jul 17, 2023
a7489e4
Add ternary `where` function
stijnh Jul 24, 2023
eacf584
Simplify code for `kernel_float::apply_impl`
stijnh Jul 24, 2023
5475ee6
Added `fast_*` functions for fast math
stijnh Jul 25, 2023
df48350
Add fma function
stijnh Jul 25, 2023
f905f62
Add `kconstant` type
stijnh Jul 25, 2023
0cf2670
Remove support for N-d tensors and only support 1D vectors
stijnh Jul 25, 2023
e503d23
Cleanup of vector constructors
stijnh Aug 1, 2023
3b34b7e
Write more documentation
stijnh Aug 1, 2023
2405950
Small fixes and changes
stijnh Aug 8, 2023
2ce8b7d
Add cast specialization for `constant<T>`
stijnh Aug 8, 2023
b012dbe
Add `for_each` and `range` functions
stijnh Aug 14, 2023
b3e93f1
Add cross product function
stijnh Aug 14, 2023
cc83e92
Improve how the dot product is computed for 16-bit floats
stijnh Aug 14, 2023
d9efc31
Fix compilation error
stijnh Aug 14, 2023
3c3059a
Add functions to calculate magnitude
stijnh Aug 14, 2023
9b71242
Fix several compilation issues
stijnh Aug 14, 2023
e1568c0
Wrote documentation for many functions
stijnh Aug 15, 2023
35d5532
Add functions `concat` and `select`
stijnh Aug 15, 2023
2acc262
Change how is implemented
stijnh Aug 15, 2023
fdbb671
Add missing INLINEs for literal operators
stijnh Aug 16, 2023
67e7c5e
Remove `constant<T>` operator overloads since they are ambiguous
stijnh Aug 16, 2023
b236a52
Add vector conversion for `T[N]`
stijnh Aug 16, 2023
2551fb2
Add `into_vector` to documentation
stijnh Aug 16, 2023
df42b93
Add template deduction guides
stijnh Aug 16, 2023
8ab491f
Fix several issues in `complex.h`
stijnh Aug 24, 2023
be214a2
Fallback to using fp32 for fp16 operations that are not supported (e.…
stijnh Aug 24, 2023
903d677
Promote `half` + `bfloat16` to `float`
stijnh Aug 28, 2023
31ffbb7
Promote `constant<T>` + `constant<R>` to `constant<promote_t<T, R>>`
stijnh Aug 28, 2023
1212e8f
Add deduction guides for type aliases in prelude
stijnh Aug 28, 2023
cc846b6
Update single include
stijnh Aug 28, 2023
64f2190
Rewrite test framework
stijnh Sep 18, 2023
90372b2
Small bug fixes
stijnh Sep 18, 2023
ebd0967
Rename several helper structs from `X_helper` to `X_impl`
stijnh Sep 19, 2023
7acff4c
Use raw pointers in `apply_impl` and `reduce_impl`
stijnh Sep 19, 2023
da0a46b
Add tests for reductions
stijnh Sep 19, 2023
3f3edaa
Rewrite `magnitude_impl` and `dot_impl` to take direct pointers inste…
stijnh Sep 19, 2023
07af0ad
Rename `into_vector_traits` to `into_vector_impl`
stijnh Sep 20, 2023
c0939d0
Fix incorrect definition of FMA
stijnh Sep 21, 2023
6a7bd2e
Add operator overloads for constants
stijnh Sep 21, 2023
227f987
Add more tests
stijnh Sep 21, 2023
4c74866
Add license boilerplate to single-header include
stijnh Sep 21, 2023
46d598c
Add GitHub workflow to run unit tests
stijnh Sep 21, 2023
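
Taken together, these commits replace the early N-dimensional tensor design with a single 1-D vector type plus element-wise helpers. As a rough illustration of how the headline additions fit together, here is a short CUDA sketch (not code from this PR; the exact signatures are assumptions based on the commit titles above):

#include "kernel_float.h"
namespace kf = kernel_float;

// Sketch only: signatures and broadcast rules are assumed, not taken from the diff.
__device__ float demo(kf::vec<float, 4> a, kf::vec<float, 4> b) {
    // fma (df48350): element-wise fused multiply-add.
    kf::vec<float, 4> t = kf::fma(a, b, a);

    // Ternary where (a7489e4): take from t where the condition holds, from b elsewhere.
    kf::vec<float, 4> u = kf::where(t < b, t, b);

    // dot (96f18a6): reduce two vectors to their inner product.
    return kf::dot(u, a);
}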
47 changes: 47 additions & 0 deletions .github/workflows/cmake-action.yml
@@ -0,0 +1,47 @@
name: CMake

on:
  workflow_call:
    inputs:
      cuda-version:
        required: true
        type: string

env:
  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
  BUILD_TYPE: Debug

jobs:
  build:
    # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
    # You can convert this to a matrix build if you need cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
    runs-on: ubuntu-latest

    steps:
      - uses: Jimver/cuda-toolkit@v0.2.11
        id: cuda-toolkit
        with:
          method: network
          sub-packages: '["nvcc"]'
          cuda: ${{ inputs.cuda-version }}

      - uses: actions/checkout@v3
        with:
          submodules: 'true'

      - name: Configure CMake
        # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
        # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
        run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DKERNEL_FLOAT_BUILD_TEST=1 -DKERNEL_FLOAT_BUILD_EXAMPLE=1

      - name: Build
        # Build your program with the given configuration
        run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}

      - name: Test
        working-directory: ${{github.workspace}}/build
        # Execute tests defined by the CMake configuration.
        # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
        run: ./tests/kernel_float_tests --durations=yes --success --verbosity=high ~[GPU]
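
Note: the trailing `~[GPU]` appears to be Catch2 tag-filter syntax that excludes tests tagged `[GPU]`; presumably this is what lets the suite run on GitHub's GPU-less hosted runners, with nvcc installed only for compilation.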

28 changes: 28 additions & 0 deletions .github/workflows/cmake.yml
@@ -0,0 +1,28 @@
name: CMake

on:
  push:
  pull_request:
    branches: [ "main" ]

env:
  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
  BUILD_TYPE: Debug

jobs:
  build-cuda:
    uses: ./.github/workflows/cmake-action.yml
    with:
      cuda-version: "12.2.0"

  build-cuda-11-7:
    needs: build-cuda
    uses: ./.github/workflows/cmake-action.yml
    with:
      cuda-version: "11.7.0"

  build-cuda-12-0:
    needs: build-cuda
    uses: ./.github/workflows/cmake-action.yml
    with:
      cuda-version: "12.0.0"
21 changes: 20 additions & 1 deletion combine.py
@@ -2,6 +2,24 @@
import subprocess
from datetime import datetime

license_boilerplate = """/*
 * Kernel Float: Header-only library for vector types and reduced precision floating-point math.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

"""

directory = "include/kernel_float"
contents = dict()

@@ -28,7 +46,8 @@
    except Exception as e:
        print(f"warning: {e}")

output = "\n".join([
output = license_boilerplate
output += "\n".join([
    "//" + "=" * 80,
    "// this file has been auto-generated, do not modify its contents!",
    f"// date: {date}",
4 changes: 3 additions & 1 deletion docs/api.rst
@@ -3,8 +3,10 @@ API Reference
.. toctree::
   api/types.rst
   api/primitives.rst
   api/generation.rst
   api/unary_operators.rst
   api/binary_operators.rst
   api/reductions.rst
   api/shuffling.rst
   api/mathematical.rst
   api/conditional.rst

65 changes: 35 additions & 30 deletions docs/build_api.py
@@ -65,51 +65,51 @@ def build_index_page(groups):

    return filename

aliases = []
for ty in ["vec", "float", "double", "half", "bfloat16x", ""]:
    if ty != "vec":
        aliases.append(f"{ty}X")

aliases = ["scalar", "vec"]
for ty in ["vec"]:
    for i in range(2, 8 + 1):
        aliases.append(f"{ty}{i}")

groups = {
    "Types": [
        ("vector", "vector", "struct"),
        ("Aliases", [
            "unaligned_vec",
            "vec",
        ] + aliases,
        "typedef"),
        ("Aliases", aliases, "typedef"),
    ],
    "Primitives": [
        ("range", "range()"),
        ("range", "range(F)"),
        "map",
        "reduce",
        "zip",
        "zip_common",
        "cast",
        "broadcast",
        "resize",
        "for_each",
    ],
    "Shuffling": [
        "convert",
        "make_vec",
        "into_vector",
        "concat",
        "swizzle",
        "first",
        "last",
        "reversed",
        "rotate_left",
        "rotate_right",
        "select",
        "for_each",
    ],
    "Unary Operators": [
    "Generation": [
        "range",
        "range_like",
        "each_index",
        "fill",
        "fill_like",
        "zeros",
        "zeros_like",
        "ones",
        "ones_like",
    ],
    "Shuffling": [
        # "concat",
        # "swizzle",
        # "first",
        # "last",
        # "reversed",
        # "rotate_left",
        # "rotate_right",
    ],
    "Unary Operators": [
        "negate",
        "bit_not",
        "logical_not",
@@ -135,21 +135,21 @@ def build_index_page(groups):
        ("min", "min(L&&, R&&)"),
        "nextafter",
        "modf",
        "pow",
        ("pow", "pow(L&&, R&&)"),
        "remainder",
        #"rhypot",
    ],
    "Reductions": [
        "sum",
        ("max", "max(V&&)"),
        ("min", "min(V&&)"),
        ("max", "max(const V&)"),
        ("min", "min(const V&)"),
        "product",
        "all",
        "any",
        "count",
    ],
    "Mathematical": [
        "abs",
        ("abs", "abs(const V&)"),
        "acos",
        "acosh",
        "asin",
@@ -166,22 +166,22 @@ def build_index_page(groups):
        "erfcinv",
        "erfcx",
        "erfinv",
        "exp",
        ("exp", "exp(const V&)"),
        "exp10",
        "exp2",
        "fabs",
        "floor",
        "ilogb",
        "lgamma",
        "log",
        ("log", "log(const V&)"),
        "log10",
        "logb",
        "nearbyint",
        "normcdf",
        "rcbrt",
        "sin",
        "sinh",
        "sqrt",
        ("sqrt", "sqrt(const V&)"),
        "tan",
        "tanh",
        "tgamma",
@@ -193,6 +193,11 @@ def build_index_page(groups):
        "isinf",
        "isnan",
    ],
    "Conditional": [
        ("where", "where(const C&, const L&, const R&)"),
        ("where", "where(const C&, const L&)"),
        ("where", "where(const C&)"),
    ]
}

build_index_page(groups)
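
The new Conditional group documents three `where` overloads. A hedged CUDA sketch of how they are presumably meant to be used (the behaviour of the one- and two-argument forms is inferred from the overload list, not from the merged documentation):

#include "kernel_float.h"
namespace kf = kernel_float;

__device__ kf::vec<float, 4> where_demo(kf::vec<float, 4> a, kf::vec<float, 4> b) {
    kf::vec<bool, 4> c = a < b;                 // element-wise comparison
    kf::vec<float, 4> r3 = kf::where(c, a, b);  // a where c holds, b elsewhere
    kf::vec<float, 4> r2 = kf::where(c, a);     // assumed: a where c holds, zero elsewhere
    auto r1 = kf::where(c);                     // assumed: ones where c holds, zeros elsewhere
    return r3 + r2 + kf::cast<float>(r1);       // combined so every overload is exercised
}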
22 changes: 8 additions & 14 deletions examples/vector_add/main.cu
@@ -4,9 +4,7 @@
#include <vector>

#include "kernel_float.h"
namespace kf = kernel_float;

using x = kf::half;
using namespace kernel_float::prelude;

void cuda_check(cudaError_t code) {
    if (code != cudaSuccess) {
@@ -15,11 +13,7 @@ void cuda_check(cudaError_t code) {
}

template<int N>
__global__ void my_kernel(
    int length,
    const kf::unaligned_vec<__half, N>* input,
    double constant,
    kf::unaligned_vec<float, N>* output) {
__global__ void my_kernel(int length, const khalf<N>* input, double constant, kfloat<N>* output) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i * N < length) {
@@ -30,24 +24,24 @@ __global__ void my_kernel(
template<int items_per_thread>
void run_kernel(int n) {
    double constant = 1.0;
    std::vector<__half> input(n);
    std::vector<half> input(n);
    std::vector<float> output_expected(n);
    std::vector<float> output_result;

    // Generate input data
    for (int i = 0; i < n; i++) {
        input[i] = __half(i);
        input[i] = half(i);
        output_expected[i] = float(i + constant);
    }

    // Allocate device memory
    kf::unaligned_vec<__half, items_per_thread>* input_dev;
    kf::unaligned_vec<float, items_per_thread>* output_dev;
    cuda_check(cudaMalloc(&input_dev, sizeof(__half) * n));
    khalf<items_per_thread>* input_dev;
    kfloat<items_per_thread>* output_dev;
    cuda_check(cudaMalloc(&input_dev, sizeof(half) * n));
    cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));

    // Copy device memory
    cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(__half) * n, cudaMemcpyDefault));
    cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(half) * n, cudaMemcpyDefault));

    // Launch kernel!
    int block_size = 256;
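
For context, a self-contained sketch in the style of the rewritten example. The prelude alias expansions (`khalf<N>` as a vector of `half` lanes, `kfloat<N>` as a vector of `float` lanes) and the availability of `kf::cast` are assumptions based on this diff, not definitions it contains:

#include "kernel_float.h"
namespace kf = kernel_float;
using namespace kernel_float::prelude;

// Hypothetical mini-kernel: widen half-precision lanes to float and add a scalar.
template<int N>
__global__ void add_constant(int length, const khalf<N>* input, float constant, kfloat<N>* output) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i * N < length) {
        output[i] = kf::cast<float>(input[i]) + constant;
    }
}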
12 changes: 6 additions & 6 deletions include/kernel_float.h
@@ -1,18 +1,18 @@
#ifndef KERNEL_FLOAT_H
#define KERNEL_FLOAT_H

#include "kernel_float/base.h"
#include "kernel_float/bf16.h"
#include "kernel_float/binops.h"
#include "kernel_float/cast.h"
#include "kernel_float/conversion.h"
#include "kernel_float/fp16.h"
#include "kernel_float/fp8.h"
#include "kernel_float/interface.h"
#include "kernel_float/iterate.h"
#include "kernel_float/macros.h"
#include "kernel_float/meta.h"
#include "kernel_float/prelude.h"
#include "kernel_float/reduce.h"
#include "kernel_float/storage.h"
#include "kernel_float/swizzle.h"
#include "kernel_float/triops.h"
#include "kernel_float/unops.h"
#include "kernel_float/vector.h"

#endif
#endif