diff --git a/.gitignore b/.gitignore index c7f37fb..ab0657e 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ docs/build Manifest*.toml .*-bbook/ +!.gpu_raytracing_tutorial-bbook diff --git a/Project.toml b/Project.toml index a1bfc2b..72bebff 100644 --- a/Project.toml +++ b/Project.toml @@ -20,7 +20,7 @@ RaycoreMakieExt = "Makie" [compat] julia = "1.10" GeometryBasics = "0.5" -KernelAbstractions = "0.9" +KernelAbstractions = "0.9, 0.10" LinearAlgebra = "1" Random = "1" StaticArrays = "1.9.7" diff --git a/README.md b/README.md index 1cb845b..785851c 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ using Raycore, GeometryBasics, LinearAlgebra sphere = Tesselation(Sphere(Point3f(0, 0, 2), 1.0f0), 20) # Build BVH acceleration structure -bvh = BVHAccel([sphere]) +bvh = BVH([sphere]) # Cast rays and find intersections ray = Ray(o=Point3f(0, 0, 0), d=Vec3f(0, 0, 1)) diff --git a/docs/examples.jl b/docs/examples.jl index fd3ca4b..6dcae50 100644 --- a/docs/examples.jl +++ b/docs/examples.jl @@ -14,7 +14,7 @@ begin l = 0.5 floor = Rect3f(-l, -l, -0.01, 2l, 2l, 0.01) cat = load(Makie.assetpath("cat.obj")) - bvh = Raycore.BVHAccel([s1, s2, s3, s4, cat]); + bvh = Raycore.BVH([s1, s2, s3, s4, cat]); world_mesh = GeometryBasics.Mesh(bvh) f, ax, pl = Makie.mesh(world_mesh; color=:teal) center!(ax.scene) diff --git a/docs/make.jl b/docs/make.jl index 9cbb5d6..e405a22 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -18,6 +18,7 @@ makedocs(; "Examples" => [ "BVH Hit Tests" => "bvh_hit_tests.md", "Ray Tracing Tutorial" => "raytracing_tutorial.md", + "GPU Ray Tracing Tutorial" => "gpu_raytracing.md", "View Factors and More" => "viewfactors.md", ], ], diff --git a/docs/src/.gpu_raytracing_tutorial-bbook/data/gpu-benchmarks.png b/docs/src/.gpu_raytracing_tutorial-bbook/data/gpu-benchmarks.png new file mode 100644 index 0000000..f55ab7f Binary files /dev/null and b/docs/src/.gpu_raytracing_tutorial-bbook/data/gpu-benchmarks.png differ diff --git a/docs/src/.gpu_raytracing_tutorial-bbook/meta.toml b/docs/src/.gpu_raytracing_tutorial-bbook/meta.toml new file mode 100644 index 0000000..9f7a875 --- /dev/null +++ b/docs/src/.gpu_raytracing_tutorial-bbook/meta.toml @@ -0,0 +1 @@ +version = "0.1.0" diff --git a/docs/src/bvh_hit_tests_content.md b/docs/src/bvh_hit_tests_content.md index 04711b2..b6ab3b6 100644 --- a/docs/src/bvh_hit_tests_content.md +++ b/docs/src/bvh_hit_tests_content.md @@ -17,7 +17,7 @@ function create_test_scene() sphere2 = Tesselation(Sphere(Point3f(0, 0, 3), 1.0f0), 20) # Middle sphere3 = Tesselation(Sphere(Point3f(0, 0, 1), 1.0f0), 20) # Closest - bvh = Raycore.BVHAccel([sphere1, sphere2, sphere3]) + bvh = Raycore.BVH([sphere1, sphere2, sphere3]) return bvh end @@ -89,7 +89,7 @@ for i in 1:30 push!(complex_spheres, Tesselation(Sphere(Point3f(x, y, z), r), 8)) end -complex_bvh = Raycore.BVHAccel(complex_spheres) +complex_bvh = Raycore.BVH(complex_spheres) # Test rays to find cases where any_hit differs from closest_hit test_rays = map(rand(Point2f, 200)) do p p = (p .* 14f0) .- 8f0 diff --git a/docs/src/gpu_raytracing.md b/docs/src/gpu_raytracing.md new file mode 100644 index 0000000..74136ff --- /dev/null +++ b/docs/src/gpu_raytracing.md @@ -0,0 +1,14 @@ +# GPU Ray Tracing with Raycore + +```@setup gpu_raytracing +using Bonito, BonitoBook, Raycore +book_app = App() do + path = normpath(joinpath(dirname(pathof(Raycore)), "..", "docs", "src", "gpu_raytracing_tutorial.md")) + BonitoBook.InlineBook(path) +end +Bonito.Page() +``` + +```@example gpu_raytracing +book_app # hide +``` diff 
--git a/docs/src/gpu_raytracing_tutorial.md b/docs/src/gpu_raytracing_tutorial.md new file mode 100644 index 0000000..be3ab77 --- /dev/null +++ b/docs/src/gpu_raytracing_tutorial.md @@ -0,0 +1,280 @@ +# Ray Tracing on the GPU + +In this tutorial, we'll take the ray tracer from the previous tutorial and port it to the GPU using **KernelAbstractions.jl** and a GPU backend of choice (CUDA.jl, AMDGPU.jl, OpenCL.jl, oneAPI.jl, or Metal.jl). We'll explore three different kernel implementations, each with different optimization strategies, and benchmark their performance against each other. + +By the end, you'll understand how to write efficient GPU kernels for ray tracing and the tradeoffs between different approaches! + +## Setup + +```julia (editor=true, logging=false, output=true) +using Raycore, GeometryBasics, LinearAlgebra +using Colors, ImageShow +using WGLMakie +using KernelAbstractions +using BenchmarkTools +``` +To run things on the GPU with KernelAbstractions, you need to choose the correct package for your GPU and set the array type we use from there on. + +```julia (editor=true, logging=false, output=true) +#using CUDA; GArray = CuArray; # For NVIDIA GPUs +#using AMDGPU; GArray = ROCArray; # for AMD GPUs +#using Metal; GArray = MtlArray; # for Apple hardware +#using oneAPI; GArray = oneArray; # for Intel GPUs +# OpenCL with the pocl backend should work for most CPUs and some GPUs, but might not be as fast. +# using pocl_jll, OpenCL; GArray = CLArray; +GArray = Array # For the tutorial to run on CI we just use the CPU +``` +**Ready for GPU!** We have: + + * `Raycore` for fast ray-triangle intersections + * `KernelAbstractions` for portable GPU kernels (works with CUDA, AMD, Metal, oneAPI, and OpenCL) + * `BenchmarkTools` for performance comparison + +## Part 1: Scene Setup (Same as CPU Tutorial) + +Let's use the exact same scene as the CPU tutorial - the Makie cat with room geometry: + +```julia (editor=true, logging=false, output=true) +# Load and prepare the cat model +include("raytracing-core.jl") +bvh, ctx = example_scene() +# We have a Makie extension for plotting the scene graph +f, ax, pl = plot(bvh; axis=(; show_axis=false)) +f +``` +```julia (editor=true, logging=false, output=true) +cam = cameracontrols(ax.scene) +cam.eyeposition[] = [0, 1.0, -4] +cam.lookat[] = [0, 0, 2] +cam.upvector[] = [0.0, 1, 0.0] +cam.fov[] = 45.0 +``` +## Part 2: GPU Kernel Version 1 - Basic Naive Approach + +The simplest GPU kernel - one thread per pixel: + +```julia (editor=true, logging=false, output=true) +import KernelAbstractions as KA + +# Basic kernel: one thread per pixel, straightforward implementation +@kernel function raytrace_kernel_v1!( + img, @Const(bvh), @Const(ctx), + camera_pos, focal_length, aspect, sky_color, ::Val{NSamples} +) where {NSamples} + # Get pixel coordinates + idx = @index(Global, Linear) + height, width = size(img) + # Convert linear index to 2D coordinates + x = ((idx - 1) % width) + 1 + y = ((idx - 1) ÷ width) + 1 + if x <= width && y <= height + # Generate camera ray and calculate a simple light model + color = Vec3f(0) + for i in 1:NSamples + color = color .+ sample_light(bvh, ctx, width, height, camera_pos, focal_length, aspect, x, y, sky_color) + end + @inbounds img[y, x] = to_rgb(color ./ NSamples) + end +end +``` +The `trace_gpu` function is a universal launcher that works with any of our kernels.
It handles the backend-specific setup automatically using **KernelAbstractions.jl**: + +```julia (editor=true, logging=false, output=true) +function trace_gpu(kernel, img, bvh, ctx; + camera_pos=Point3f(0, -0.9, -2.5), fov=45.0f0, + sky_color=RGB{Float32}(0.5f0,0.7f0,1.0f0), + samples_per_pixel=4, + ndrange=length(img), tilesize=nothing + ) + height, width = size(img) + aspect = Float32(width / height) + focal_length = 1.0f0 / tan(deg2rad(fov / 2)) + + # KernelAbstractions automatically detects the backend (CPU/GPU) from the array type + backend = KA.get_backend(img) + + # Create the kernel with or without tilesize (for workgroup configuration) + kernel! = isnothing(tilesize) ? kernel(backend) : kernel(backend, tilesize) + + kernel!(img, bvh, ctx, camera_pos, focal_length, aspect, sky_color, Val(samples_per_pixel), ndrange=ndrange) + + # Ensure GPU computation completes before returning + KA.synchronize(backend) + return img +end +``` +**Key KernelAbstractions.jl concepts:** + + * **Backend detection**: `get_backend(array)` automatically determines if we're using CPU, AMD GPU, NVIDIA GPU, etc. + * **Kernel compilation**: `kernel(backend)` compiles the kernel for the specific backend + * **Workgroup configuration**: Optional `tilesize` parameter controls thread organization + * **Thread indexing**: Inside kernels, use `@index(Global, Linear)` or `@index(Global, Cartesian)` to get thread IDs + * **Synchronization**: `synchronize(backend)` ensures all GPU work completes before continuing + +Let's test kernel v1 on the CPU (yes, KernelAbstractions kernels also run on plain `Array`s): + +```julia (editor=true, logging=false, output=true) +img = fill(RGBf(0, 0, 0), 400, 720) +bench_kernel_cpu_v1 = @benchmark trace_gpu(raytrace_kernel_v1!, img, bvh, ctx) +img +``` +To run things on the GPU, we simply convert the arrays to the GPU backend array type. `to_gpu` is a helper in Raycore to convert nested structs correctly for the kernel. It doesn't do anything special beyond converting nested structs of arrays into device arrays; for plain arrays, `GArray(array)` is enough. + +```julia (editor=true, logging=false, output=true) +using Raycore: to_gpu +img = fill(RGBf(0, 0, 0), 400, 720) +img_gpu = GArray(img); +bvh_gpu = to_gpu(GArray, bvh); +ctx_gpu = to_gpu(GArray, ctx); +bench_kernel_v1 = @benchmark trace_gpu(raytrace_kernel_v1!, img_gpu, bvh_gpu, ctx_gpu) +# Bring back to CPU to display image +Array(img_gpu) +``` +**First GPU render!** This is the simplest approach - one thread per pixel with no optimization. + +## Part 3: Optimized Kernel - Loop Unrolling + +Loop overhead is significant on GPUs!
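The key ingredient in the next kernel is that the sample count arrives as a compile-time constant (the `::Val{NSamples}` argument), so an `ntuple` over the samples can be fully unrolled by the compiler. As a standalone illustration of that mechanism (a toy sketch, not part of the tutorial code):

```julia
# Toy sketch: because `N` is a type parameter, the compiler knows it at compile
# time and can fully unroll the `ntuple` call instead of emitting a runtime loop.
function sum_unrolled(f, ::Val{N}) where {N}
    vals = ntuple(f, Val(N))   # (f(1), f(2), ..., f(N)), generated without a loop
    return sum(vals)
end

sum_unrolled(i -> Float32(i)^2, Val(4))  # 1 + 4 + 9 + 16 = 30.0f0
```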
Manually unrolling the sampling loop eliminates this overhead: + +```julia (editor=true, logging=false, output=true) +# Optimized kernel: Unrolled sampling loop +@kernel function raytrace_kernel_unrolled!( + img, @Const(bvh), @Const(ctx), + camera_pos, focal_length, aspect, sky_color, ::Val{NSamples} + ) where {NSamples} + idx = @index(Global, Linear) + height, width = size(img) + x = ((idx - 1) % width) + 1 + y = ((idx - 1) ÷ width) + 1 + if x <= width && y <= height + # ntuple with compile-time constant for unrolling + samples = ntuple(NSamples) do i + sample_light(bvh, ctx, width, height, + camera_pos, focal_length, aspect, + x, y, sky_color + ) + end + color = mean(samples) + @inbounds img[y, x] = to_rgb(color) + end +end + +bench_kernel_unrolled = @benchmark trace_gpu(raytrace_kernel_unrolled!, img_gpu, bvh_gpu, ctx_gpu) +Array(img_gpu) +``` + * This eliminates branch overhead from loop conditions + * Reduces register pressure + * Better instruction-level parallelism + * **1.39x faster than baseline!** + +## Part 4: Tiled Kernel with Optimized Tile Size + +The tile size dramatically affects performance. Let's use the optimal size discovered through benchmarking: + +```julia (editor=true, logging=false, output=true) +# Tiled kernel with optimized tile size +@kernel function raytrace_kernel_tiled!( + img, bvh, ctx, + camera_pos, focal_length, aspect, sky_color, ::Val{NSamples} +) where {NSamples} + # Get tile and local coordinates + _tile_xy = @index(Group, Cartesian) + _local_xy = @index(Local, Cartesian) + _groupsize = @groupsize() + + # Direct tuple unpacking is faster than Vec construction + tile_x, tile_y = Tuple(_tile_xy) + local_x, local_y = Tuple(_local_xy) + group_w, group_h = _groupsize + + # Compute global pixel coordinates + x = (tile_x - 1) * group_w + local_x + y = (tile_y - 1) * group_h + local_y + + height, width = size(img) + if x <= width && y <= height + samples = ntuple(NSamples) do i + sample_light(bvh, ctx, width, height, + camera_pos, focal_length, aspect, + x, y, sky_color + ) + end + color = mean(samples) + @inbounds img[y, x] = to_rgb(color) + end +end +bench_kernel_tiled_32_16 = @benchmark trace_gpu( + $raytrace_kernel_tiled!, $img_gpu, $bvh_gpu, $ctx_gpu; + ndrange=size($img_gpu), tilesize=(32,16)) + +# Benchmark two more important tile sizes for comparison +bench_kernel_tiled_32_32 = @benchmark trace_gpu( + $raytrace_kernel_tiled!, $img_gpu, $bvh_gpu, $ctx_gpu; + ndrange=size($img_gpu), tilesize=(32,32)) + +bench_kernel_tiled_8_8 = @benchmark trace_gpu( + $raytrace_kernel_tiled!, $img_gpu, $bvh_gpu, $ctx_gpu; + ndrange=size($img_gpu), tilesize=(8,8)) + +# Use optimal tile size: (32, 16) - discovered through benchmarking! +Array(img_gpu) +``` +**Tile size matters!** With `(32, 16)` tiles, this kernel is **1.22x faster** than baseline. With poor tile sizes like `(8, 8)`, it can be **2.5x slower**! + +## Part 5: Wavefront Path Tracing + +The wavefront approach reorganizes ray tracing to minimize thread divergence by grouping similar work together. Instead of each thread handling an entire pixel's path, we separate the work into stages. 
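Concretely, the finished renderer in `wavefront-renderer.jl` (added in this PR) runs one KernelAbstractions kernel per stage, all operating on flat structure-of-arrays work queues. The outline below only summarizes its `render!` function; see the file itself for the full implementation:

```julia
# Stage outline of `render!` in wavefront-renderer.jl:
#   1. generate_primary_rays!        # one jittered camera ray per pixel sample
#   2. intersect_primary_rays!       # closest_hit for every primary ray
#   3. generate_shadow_rays!         # one shadow ray per hit and per light
#   4. test_shadow_rays!             # any_hit visibility test towards each light
#   5. shade_primary_hits!           # ambient + diffuse shading using the shadow results
#   6. generate_reflection_rays!     # only metallic materials spawn reflection rays
#   7. intersect_reflection_rays!    # closest_hit for the reflection rays
#   8. shade_reflections_and_blend!  # blend the reflection color by the material's metallic value
#   9. accumulate_final! / finalize_image!  # average the samples into the framebuffer
```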
Discussing the exact implementation is outside the scope of this tutorial, so we only include the finished renderer here: + +```julia (editor=true, logging=false, output=true) +include("wavefront-renderer.jl") +``` +Let's benchmark the wavefront renderer on both CPU and GPU: + +```julia (editor=true, logging=false, output=true) +# CPU benchmark +renderer_cpu = WavefrontRenderer(img, bvh, ctx) +bench_wavefront_cpu = @benchmark render!($renderer_cpu) + +# GPU benchmark +renderer_gpu = to_gpu(GArray, renderer_cpu) +bench_wavefront_gpu = @benchmark render!($renderer_gpu) + +renderer_gpu = to_gpu(GArray, WavefrontRenderer(img, bvh, ctx; samples_per_pixel=16)) +render!(renderer_gpu) +# Display result +Array(renderer_gpu.framebuffer) +``` +**Wavefront benefits:** + + * Reduces thread divergence by grouping similar work + * Better memory access patterns + * Scales well with scene complexity + * Enables advanced features like path tracing + +## Part 6: Comprehensive Performance Benchmarks + +Now let's compare all kernels including the wavefront renderer: + +```julia (editor=true, logging=false, output=true) +benchmarks = [ + bench_kernel_v1, bench_kernel_cpu_v1, bench_kernel_unrolled, + bench_kernel_tiled_8_8, bench_kernel_tiled_32_16, bench_kernel_tiled_32_32, + bench_wavefront_cpu, bench_wavefront_gpu +] +labels = [ + "Baseline\n(gpu)", "Baseline\n(cpu)", "Unrolled", + "Tiled\n(8×8)", "Tiled\n(32×16)", "Tiled\n(32×32)", + "Wavefront\n(cpu)", "Wavefront\n(gpu)" +] + +fig, times, speedups = plot_kernel_benchmarks(benchmarks, labels) +# save(data"gpu-benchmarks.png", fig; backend=Main.GLMakie) +# We use the Fig from my local run, since on CI this won't show the differences +DOM.img(src=Asset(data"gpu-benchmarks.png"), width="700px") +``` +### Next Steps + + * Add **adaptive sampling** (more samples only where needed) + * Explore **shared memory** optimizations for BVH traversal + * Implement **streaming multisampling** across frames + diff --git a/docs/src/index.md b/docs/src/index.md index 96d8d98..8d7ed2a 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -27,6 +27,14 @@ Build a complete ray tracer from scratch with shadows, materials, reflections, a [Ray Tracing with Raycore](@ref) +### GPU Ray Tracing Tutorial + +Port the ray tracer to the GPU with KernelAbstractions.jl. Learn about kernel optimization, loop unrolling, tiling, and wavefront rendering. + +![GPU Ray Tracing](.gpu_raytracing_tutorial-bbook/data/gpu-benchmarks.png) + +[GPU Ray Tracing with Raycore](@ref) + ### View Factors Analysis Calculate view factors, illumination, and centroids for radiosity and thermal analysis. 
diff --git a/docs/src/raytracing-core.jl b/docs/src/raytracing-core.jl new file mode 100644 index 0000000..40e047d --- /dev/null +++ b/docs/src/raytracing-core.jl @@ -0,0 +1,312 @@ +using Raycore, GeometryBasics, LinearAlgebra +using Colors, ImageShow +import KernelAbstractions as KA +function compute_normal(triangle, bary_coords) + v0, v1, v2 = Raycore.normals(triangle) + u, v, w = bary_coords[1], bary_coords[2], bary_coords[3] + return Vec3f(normalize(v0 * u + v1 * v + v2 * w)) +end + +# Generate camera ray for a pixel with optional jitter +function camera_ray(x, y, width, height, camera_pos, focal_length, aspect; jitter=Vec2f(0)) + ndc_x = (2.0f0 * (Float32(x) - 0.5f0 + jitter[1]) / Float32(width) - 1.0f0) * aspect + ndc_y = 1.0f0 - 2.0f0 * (Float32(y) - 0.5f0 + jitter[2]) / Float32(height) + direction = normalize(Vec3f(ndc_x, ndc_y, focal_length)) + return Raycore.Ray(o=camera_pos, d=direction) +end + +# Convert between color representations +to_vec3f(c::RGB) = Vec3f(c.r, c.g, c.b) +to_rgb(v::Vec3f) = RGB{Float32}(v...) + +struct PointLight + position::Point3f + intensity::Float32 + color::RGB{Float32} +end + +struct Material + base_color::RGB{Float32} + metallic::Float32 + roughness::Float32 +end + +struct RenderContext{L<:AbstractVector{PointLight},M<:AbstractVector{Material}} + lights::L + materials::M + ambient::Float32 +end + +function Raycore.to_gpu(Arr, ctx::RenderContext) + return RenderContext(to_gpu(Arr, ctx.lights), to_gpu(Arr, ctx.materials), ctx.ambient) +end + +function render_context() + # Create lights and materials + lights = [ + PointLight(Point3f(3, 4, -2), 50.0f0, RGB(1.0f0, 0.9f0, 0.8f0)), + PointLight(Point3f(-3, 2, 0), 20.0f0, RGB(0.7f0, 0.8f0, 1.0f0)), + PointLight(Point3f(0, 5, 5), 15.0f0, RGB(1.0f0, 1.0f0, 1.0f0)) + ] + + materials = [ + Material(RGB(0.8f0, 0.6f0, 0.4f0), 0.0f0, 0.8f0), # cat + Material(RGB(0.3f0, 0.5f0, 0.3f0), 0.0f0, 0.9f0), # floor + Material(RGB(0.8f0, 0.6f0, 0.5f0), 0.8f0, 0.05f0), # back wall + Material(RGB(0.7f0, 0.7f0, 0.8f0), 0.0f0, 0.8f0), # left wall + Material(RGB(0.9f0, 0.9f0, 0.9f0), 0.8f0, 0.02f0), # sphere1 - metallic + Material(RGB(0.3f0, 0.6f0, 0.9f0), 0.5f0, 0.3f0), # sphere2 - semi-metallic + ] + + return RenderContext(lights, materials, 0.1f0) +end + +function compute_light( + bvh, point, normal, light_pos, light_intensity, light_color; shadow_samples=1) + + light_vec = light_pos - point + light_dist = norm(light_vec) + light_dir = light_vec / light_dist + + diffuse = max(0.0f0, dot(normal, light_dir)) + + # Shadow testing with optional soft shadows + shadow_factor = 0.0f0 + light_radius = 0.2f0 # Size of area light for soft shadows + + for _ in 1:shadow_samples + # For shadow_samples=1, this is just the light position (hard shadow) + # For shadow_samples>1, we sample random points on a disk (soft shadow) + if shadow_samples > 1 + # Random point on disk perpendicular to light direction + offset = (rand(Vec3f) .* 2.0f0 .- 1.0f0) * light_radius + offset = offset - light_dir * dot(offset, light_dir) + shadow_target = light_pos + offset + else + shadow_target = light_pos + end + + shadow_vec = shadow_target - point + shadow_dist = norm(shadow_vec) + shadow_dir = normalize(shadow_vec) + + shadow_ray = Raycore.Ray(o=point + normal * 0.001f0, d=shadow_dir) + shadow_hit, _, hit_dist, _ = Raycore.any_hit(bvh, shadow_ray) + + if !shadow_hit || hit_dist >= shadow_dist + shadow_factor += 1.0f0 + end + end + shadow_factor /= shadow_samples + + # Compute final light contribution + attenuation = light_intensity / (light_dist * 
light_dist) + return to_vec3f(light_color) * (diffuse * attenuation * shadow_factor) +end + +function shadow_kernel(bvh, ctx, tri, dist, bary, ray; shadow_samples=1) + hit_point = ray.o + ray.d * dist + normal = compute_normal(tri, bary) + # Single point light + light_pos = Point3f(3, 4, -2) + light_intensity = 50.0f0 + light_color = RGB{Float32}(1.0f0, 0.9f0, 0.8f0) + # Hard shadows (shadow_samples=1) + light_contrib = compute_light( + bvh, hit_point, normal, light_pos, light_intensity, light_color; + shadow_samples=shadow_samples + ) + ambient = 0.1f0 + + brightness = ambient .+ light_contrib + return to_rgb(brightness) +end + +function compute_multi_light(bvh, ctx, point, normal, mat; shadow_samples=1) + base_color = to_vec3f(mat.base_color) + total_color = base_color * ctx.ambient + + for light in ctx.lights + light_contrib = compute_light(bvh, point, normal, light.position, light.intensity, light.color, shadow_samples=shadow_samples) + total_color += base_color .* light_contrib + end + + return total_color +end +function reflective_kernel(bvh, ctx, tri, dist, bary, ray, sky_color, shadow_samples=8) + hit_point = ray.o + ray.d * dist + normal = compute_normal(tri, bary) + mat = ctx.materials[tri.material_idx] + + # Direct lighting with soft shadows + direct_color = compute_multi_light(bvh, ctx, hit_point, normal, mat, shadow_samples=shadow_samples) + + # Reflections for metallic surfaces + if mat.metallic > 0.0f0 + wo = -ray.d + reflect_dir = Raycore.reflect(wo, normal) + + # Optional roughness + if mat.roughness > 0.0f0 + offset = (rand(Vec3f) .* 2.0f0 .- 1.0f0) * mat.roughness + reflect_dir = normalize(reflect_dir + offset) + end + + # Cast reflection ray + reflect_ray = Raycore.Ray(o=hit_point + normal * 0.001f0, d=reflect_dir) + refl_hit, refl_tri, refl_dist, refl_bary = Raycore.closest_hit(bvh, reflect_ray) + + reflection_color = if refl_hit + refl_point = reflect_ray.o + reflect_ray.d * refl_dist + refl_normal = compute_normal(refl_tri, refl_bary) + refl_mat = ctx.materials[refl_tri.material_idx] + compute_multi_light(bvh, ctx, refl_point, refl_normal, refl_mat, shadow_samples=shadow_samples) + else + to_vec3f(sky_color) + end + + direct_color = direct_color * (1.0f0 - mat.metallic) + reflection_color * mat.metallic + end + + return to_rgb(direct_color) +end + +function example_scene() + cat_mesh = Makie.loadasset("cat.obj") + angle = deg2rad(150f0) + rotation = Makie.Quaternionf(0, sin(angle/2), 0, cos(angle/2)) + rotated_coords = [rotation * Point3f(v) for v in coordinates(cat_mesh)] + + # Get bounding box and translate cat to sit on the floor + cat_bbox = Rect3f(rotated_coords) + floor_y = -1.5f0 + cat_offset = Vec3f(0, floor_y - cat_bbox.origin[2], 0) + + cat_mesh = GeometryBasics.normal_mesh( + [v + cat_offset for v in rotated_coords], + faces(cat_mesh) + ) + + # Create a simple room: floor, back wall, and side wall + floor = normal_mesh(Rect3f(Vec3f(-5, -1.5, -2), Vec3f(10, 0.01, 10))) + back_wall = normal_mesh(Rect3f(Vec3f(-5, -1.5, 8), Vec3f(10, 5, 0.01))) + left_wall = normal_mesh(Rect3f(Vec3f(-5, -1.5, -2), Vec3f(0.01, 5, 10))) + + # Add a couple of spheres for visual interest + sphere1 = Tesselation(Sphere(Point3f(-2, -1.5 + 0.8, 2), 0.8f0), 64) + sphere2 = Tesselation(Sphere(Point3f(2, -1.5 + 0.6, 1), 0.6f0), 64) + + # Build our BVH acceleration structure + scene_geometry = [cat_mesh, floor, back_wall, left_wall, sphere1, sphere2] + bvh = Raycore.BVH(scene_geometry) + return bvh, render_context() +end + +function sample_light(bvh, ctx, width, height, camera_pos, 
focal_length, aspect, x, y, sky_color) + jitter = rand(Vec2f) + ray = camera_ray(x, y, width, height, camera_pos, focal_length, aspect; jitter) + hit_found, tri, dist, bary = Raycore.closest_hit(bvh, ray) + if hit_found + color = reflective_kernel(bvh, ctx, tri, dist, bary, ray, RGB(0.5f0, 0.7f0, 1.0f0), 8) + return to_vec3f(color) + end + return to_vec3f(sky_color) +end + + +import KernelAbstractions as KA +# Basic kernel: one thread per pixel, straightforward implementation +@kernel function raytrace_kernel_v1!( + img, bvh, ctx, + width, height, camera_pos, focal_length, aspect, sky_color + ) + # Get pixel coordinates + idx = @index(Global, Linear) + # Convert linear index to 2D coordinates + x = ((idx - 1) % width) + 1 + y = ((idx - 1) ÷ width) + 1 + if x <= width && y <= height + # Generate camera ray + color = Vec3f(0, 0, 0) + for i in 1:10 + color = color .+ sample_light(bvh, ctx, width, height, camera_pos, focal_length, aspect, x, y, sky_color) + end + img[y, x] = (to_rgb(color) ./ 10f0) + end +end + + +# New launcher: array-based (backend-agnostic) tracer for kernel v1 +# This accepts the image and all scene arrays on the caller side. The caller may pass +# CPU arrays (Array) or GPU arrays (ROCArray) — KernelAbstractions will pick the right backend +# based on the `img` array backend. +function trace_gpu_v1(kernel, img, bvh, ctx; + camera_pos=Point3f(0, -0.9, -2.5), fov=45.0f0, + sky_color=RGB{Float32}(0.5f0, 0.7f0, 1.0f0), + ndrange=length(img) + ) + height, width = size(img) + aspect = Float32(width / height) + focal_length = 1.0f0 / tan(deg2rad(fov / 2)) + + backend = KA.get_backend(img) + kernel! = kernel(backend) + + kernel!( + img, bvh, ctx, + Int32(width), Int32(height), + camera_pos, focal_length, aspect, + sky_color, + ndrange=ndrange + ) + KA.synchronize(backend) + return img +end +# Helper function to plot kernel benchmark comparisons +function plot_kernel_benchmarks(benchmarks, labels; title="GPU Kernel Performance") + # Extract median times in milliseconds + times = [median(b.times) / 1e6 for b in benchmarks] + + # Sort by performance (fastest first) + sorted_indices = sortperm(times) + sorted_times = times[sorted_indices] + sorted_labels = labels[sorted_indices] + + # Generate colors for bars based on number of benchmarks + n = length(benchmarks) + colors = Makie.resample_cmap(:viridis, n) + + # Create performance visualization + fig = Figure(size=(700, 400)) + + # Single plot: Execution time comparison (sorted by performance) + ax = Axis(fig[1, 1], + title=title, + xlabel="Kernel Configuration", + ylabel="Time (ms)", + xticks=(1:length(sorted_labels), sorted_labels)) + + barplot!(ax, 1:length(sorted_times), sorted_times, color=colors[sorted_indices]) + + # Add value labels + for (i, t) in enumerate(sorted_times) + text!(ax, i, t, text="$(round(t, digits=1))ms", + align=(:center, :bottom), fontsize=12) + end + + # Highlight the winner (fastest) + scatter!(ax, [1], [sorted_times[1]], + color=:gold, markersize=30, marker=:star5) + + return fig, times, sorted_indices +end + +# using Raycore: to_gpu +# bvh, ctx = example_scene() +# img = fill(RGBf(0, 0, 0), 512, 512) +# pres = [] +# img_gpu = ROCArray(img); +# bvh_gpu = to_gpu(ROCArray, bvh; preserve=pres); +# ctx_gpu = to_gpu(ROCArray, ctx; preserve=pres); +# img_v1 = trace_gpu_v1(raytrace_kernel_v1!, img_gpu, bvh_gpu, ctx_gpu); +# Array(img_v1) diff --git a/docs/src/raytracing_tutorial_content.md b/docs/src/raytracing_tutorial_content.md index b60298b..2d26f40 100644 --- a/docs/src/raytracing_tutorial_content.md +++ 
b/docs/src/raytracing_tutorial_content.md @@ -1,15 +1,12 @@ -# Ray Tracing with Raycore: Building a Real Ray Tracer +# Ray Tracing in One Hour -In this tutorial, we'll build a simple but complete ray tracer from scratch using Raycore. We'll start with the absolute basics and progressively add features until we have a ray tracer that produces beautiful images with shadows, materials, and reflections. - -By the end, you'll have a working ray tracer that renders at interactive speeds! +Analogous to the famous [Ray Tracing in One Weekend](https://raytracing.github.io/), this tutorial lets Raycore do the hard work of fast ray-triangle intersection, giving us a high-performing ray tracer in much less time. We'll start with the absolute basics and progressively add features until we have a ray tracer that produces beautiful images with shadows, materials, and reflections. ## Setup ```julia (editor=true, logging=false, output=true) using Raycore, GeometryBasics, LinearAlgebra -using Colors, ImageShow -using Makie # For loading assets +using Colors, ImageShow, WGLMakie using BenchmarkTools ``` **Ready to go!** We have: @@ -50,10 +47,20 @@ sphere2 = Tesselation(Sphere(Point3f(2, -1.5 + 0.6, 1), 0.6f0), 64) # Build our BVH acceleration structure scene_geometry = [cat_mesh, floor, back_wall, left_wall, sphere1, sphere2] -bvh = Raycore.BVHAccel(scene_geometry) +bvh = Raycore.BVH(scene_geometry) +f, ax, pl = plot(bvh; axis=(; show_axis=false)) ``` -**Scene created!** Cat model, room geometry, decorative spheres, and BVH for fast ray traversal. +Set the camera to a better viewpoint: +```julia (editor=true, logging=false, output=true) +cam = cameracontrols(ax.scene) +cam.eyeposition[] = [0, 1.0, -4] +cam.lookat[] = [0, 0, 2] +cam.upvector[] = [0.0, 1, 0.0] +cam.fov[] = 45.0 +update_cam!(ax.scene, cam) +nothing +``` ## Part 2: Helper Functions - Building Blocks Let's define reusable helper functions we'll use throughout: @@ -80,6 +87,8 @@ to_rgb(v::Vec3f) = RGB{Float32}(v...) ``` ## Part 3: The Simplest Ray Tracer - Depth Visualization +We use one main function to shoot rays for each pixel. For simplicity, it already includes multisampling and simple multithreading, so we get smoother images and faster rendering times throughout the tutorial. See the GPU tutorial for how to further improve the performance of this simple, not yet optimal kernel. + ```julia (editor=true, logging=false, output=true) function trace(f, bvh; width=700, height=300, camera_pos=Point3f(0, -0.9, -2.5), fov=45.0f0, @@ -92,12 +101,11 @@ function trace(f, bvh; width=700, height=300, Threads.@threads for y in 1:height for x in 1:width color_sum = Vec3f(0) - for _ in 1:samples jitter = samples > 1 ? rand(Vec2f) : Vec2f(0) + # Calculate the ray shooting from the camera pixel into the scene ray = camera_ray(x, y, width, height, camera_pos, focal_length, aspect; jitter) hit_found, triangle, distance, bary_coords = Raycore.closest_hit(bvh, ray) - color = if hit_found to_vec3f(f(bvh, ctx, triangle, distance, bary_coords, ray)) else @@ -109,7 +117,6 @@ function trace(f, bvh; width=700, height=300, img[y, x] = to_rgb(color_sum / samples) end end - return img end @@ -338,6 +345,8 @@ end tone_mapping(img, a=0.38, y=1.0) ``` +For performance, type stability is a must! We can use JET to check whether a function is completely type stable, which the Raycore test suite also does for all functions.
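To make "type stable" concrete, here is a toy example that is not part of the tutorial code: a function whose return type depends on a runtime branch infers to a `Union`, which is exactly the kind of instability we want to avoid in hot kernels. JET's `@test_opt` (used below) catches the runtime dispatch such instabilities cause, across the whole call tree:

```julia
# Toy example (not from the tutorial code): the return type depends on a runtime branch.
unstable(x) = x > 0 ? 1.0 : 0      # may return Float64 or Int
stable(x)   = x > 0 ? 1.0 : 0.0    # always returns Float64

Base.return_types(unstable, (Float64,))  # [Union{Float64, Int64}]
Base.return_types(stable, (Float64,))    # [Float64]
```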
+ ```julia (editor=true, logging=false, output=true) using JET @@ -346,10 +355,10 @@ test_ray = camera_ray(350, 150, 700, 300, Point3f(0, -0.9, -2.5), 1.0f0 / tan(de hit_found, test_tri, test_dist, test_bary = Raycore.closest_hit(bvh, test_ray) # Check kernel type stability (filter to Main module to ignore Base internals) -@test_opt target_modules=(Main,) depth_kernel(bvh, ctx, test_tri, test_dist, test_bary, test_ray) -@test_opt target_modules=(Main,) shadow_kernel(bvh, ctx, test_tri, test_dist, test_bary, test_ray) -@test_opt target_modules=(Main,) material_kernel(bvh, ctx, test_tri, test_dist, test_bary, test_ray) -@test_opt target_modules=(Main,) reflective_kernel(bvh, ctx, test_tri, test_dist, test_bary, test_ray, RGB(0.5f0, 0.7f0, 1.0f0)) +@test_opt depth_kernel(bvh, ctx, test_tri, test_dist, test_bary, test_ray) +@test_opt shadow_kernel(bvh, ctx, test_tri, test_dist, test_bary, test_ray) +@test_opt material_kernel(bvh, ctx, test_tri, test_dist, test_bary, test_ray) +@test_opt reflective_kernel(bvh, ctx, test_tri, test_dist, test_bary, test_ray, RGB(0.5f0, 0.7f0, 1.0f0)) nothing ``` ## Summary @@ -376,7 +385,7 @@ We built a complete ray tracer with: **Key Raycore Functions:** - * `Raycore.BVHAccel(meshes)` - Build acceleration structure + * `Raycore.BVH(meshes)` - Build acceleration structure * `Raycore.Ray(o=origin, d=direction)` - Create ray * `Raycore.closest_hit(bvh, ray)` - Find nearest intersection * `Raycore.any_hit(bvh, ray)` - Test for any intersection @@ -389,5 +398,3 @@ We built a complete ray tracer with: This shows how a well-designed function can handle multiple use cases cleanly! -Happy ray tracing! - diff --git a/docs/src/test_wavefront.jl b/docs/src/test_wavefront.jl new file mode 100644 index 0000000..11dfb28 --- /dev/null +++ b/docs/src/test_wavefront.jl @@ -0,0 +1,38 @@ +using Revise +using Raycore +using Raycore: to_gpu +using KernelAbstractions +using KernelAbstractions: @kernel, @index, @Const +using GeometryBasics, Colors, LinearAlgebra +import Makie +using Makie: RGBf +import KernelAbstractions as KA +using ImageShow + +# Load helper functions +include("raytracing-core.jl") +include("wavefront-renderer.jl") + +begin + bvh, ctx = example_scene() + img = fill(RGBf(0, 0, 0), 1024, 2048) + renderer = WavefrontRenderer( + img, bvh, ctx; + camera_pos=Point3f(0, -0.9, -2.5), + fov=45.0f0, + sky_color=RGB{Float32}(0.5f0, 0.7f0, 1.0f0), + samples_per_pixel=4 + ) + Array(@time render!(renderer)) +end + +using FileIO +save("wavefront.png", map(col -> mapc(c -> clamp(c, 0f0, 1f0), col), renderer.framebuffer)) + +using AMDGPU +amd_renderer = to_gpu(ROCArray, renderer); +Array(@time render!(amd_renderer)) + +using pocl_jll, OpenCL +amd_renderer = to_gpu(CLArray, renderer); +Array(@time render!(amd_renderer)) diff --git a/docs/src/viewfactors_content.md b/docs/src/viewfactors_content.md index 69d50eb..57a7900 100644 --- a/docs/src/viewfactors_content.md +++ b/docs/src/viewfactors_content.md @@ -21,7 +21,7 @@ s4 = LowSphere(0.4f0, Point3f(0, 1.0, 0); ntriangles) cat = load(Makie.assetpath("cat.obj")) # Build BVH acceleration structure -bvh = BVHAccel([s1, s2, s3, s4, cat]) +bvh = BVH([s1, s2, s3, s4, cat]) world_mesh = GeometryBasics.Mesh(bvh) # Visualize the scene diff --git a/docs/src/wavefront-renderer.jl b/docs/src/wavefront-renderer.jl new file mode 100644 index 0000000..74b7af6 --- /dev/null +++ b/docs/src/wavefront-renderer.jl @@ -0,0 +1,861 @@ +using Raycore, GeometryBasics, LinearAlgebra +using Raycore: gpu_int +using Colors +using KernelAbstractions +using 
KernelAbstractions: @kernel, @index, @Const +import KernelAbstractions as KA +using Statistics +# ============================================================================ +# SoA Access Macros +# ============================================================================ + +""" + @get field1, field2, ... = soa[idx] + +Macro to extract multiple fields from a Structure of Arrays (SoA) at index `idx`. +""" +macro get(expr) + if expr.head != :(=) + error("@get expects assignment syntax: @get field1, field2 = soa[idx]") + end + + lhs = expr.args[1] + rhs = expr.args[2] + + # Parse left side (field names) + if lhs isa Symbol + fields = [lhs] + elseif lhs.head == :tuple + fields = lhs.args + else + error("@get left side must be field names or tuple of field names") + end + + # Parse right side (soa[idx]) + if rhs.head != :ref + error("@get right side must be array indexing: soa[idx]") + end + soa = rhs.args[1] + idx = rhs.args[2] + + # Generate field extraction code + assignments = [:($(esc(field)) = $(esc(soa)).$(field)[$(esc(idx))]) for field in fields] + + return Expr(:block, assignments...) +end + +""" + @set soa[idx] = (field1=val1, field2=val2, ...) + +Macro to set multiple fields in a Structure of Arrays (SoA) at index `idx`. +Expects named tuple syntax on the right side. +""" +macro set(expr) + if expr.head != :(=) + error("@set expects assignment syntax: @set soa[idx] = (field1=val1, ...)") + end + + lhs = expr.args[1] + rhs = expr.args[2] + + # Parse left side (soa[idx]) + if lhs.head != :ref + error("@set left side must be array indexing: soa[idx]") + end + soa = lhs.args[1] + idx = lhs.args[2] + + # Parse right side (named tuple or parameters) + assignments = [] + if rhs.head == :tuple || rhs.head == :parameters + for arg in rhs.args + if arg isa Expr && arg.head == :(=) + field = arg.args[1] + val = arg.args[2] + push!(assignments, :($(esc(soa)).$(field)[$(esc(idx))] = $(esc(val)))) + else + error("@set expects named parameters: @set soa[idx] = (field=value, ...)") + end + end + else + error("@set expects a tuple with named fields: @set soa[idx] = (field=value, ...)") + end + + return Expr(:block, assignments...) 
+end + +# ============================================================================ +# Work Queue Structures +# ============================================================================ + +struct PrimaryRayWork + ray::Raycore.Ray + pixel_x::Int32 + pixel_y::Int32 + sample_idx::Int32 +end + +struct PrimaryHitWork{Tri} + hit_found::Bool + tri::Tri + dist::Float32 + bary::Vec3f + ray::Raycore.Ray + pixel_x::Int32 + pixel_y::Int32 + sample_idx::Int32 +end + +struct ShadowRayWork + ray::Raycore.Ray + hit_idx::Int32 # Index back to the hit that generated this shadow ray + light_idx::Int32 +end + +struct ShadowResult + visible::Bool # true if light is visible (no occlusion) + hit_idx::Int32 + light_idx::Int32 +end + +struct ReflectionRayWork + ray::Raycore.Ray + hit_idx::Int32 # Index back to primary hit +end + +struct ReflectionHitWork{T} + hit_found::Bool + tri_idx::T # Store triangle index instead of full triangle + dist::Float32 + bary::Vec3f + ray::Raycore.Ray + primary_hit_idx::Int32 +end + +struct ShadedResult + color::Vec3f + pixel_x::Int32 + pixel_y::Int32 + sample_idx::Int32 +end + +# ============================================================================ +# Stage 1: Generate Primary Camera Rays +# ============================================================================ + +@kernel function generate_primary_rays!( + @Const(width), @Const(height), + @Const(camera_pos), @Const(focal_length), @Const(aspect), + ray_queue, + ::Val{NSamples} +) where {NSamples} + i = @index(Global, Cartesian) + y = gpu_int(i[1]) + x = gpu_int(i[2]) + + @inbounds if y <= height && x <= width + # Convert to linear pixel index + pixel_idx = (y - gpu_int(1)) * width + x + # Unroll sampling loop with ntuple + ntuple(Val(NSamples)) do s + s_idx = gpu_int(s) + ray_idx = (pixel_idx - gpu_int(1)) * gpu_int(NSamples) + s_idx + jitter = rand(Vec2f) + ndc_x = (2.0f0 * (Float32(x) - 0.5f0 + jitter[1]) / Float32(width) - 1.0f0) * aspect + ndc_y = 1.0f0 - 2.0f0 * (Float32(y) - 0.5f0 + jitter[2]) / Float32(height) + direction = normalize(Vec3f(ndc_x, ndc_y, focal_length)) + ray = Raycore.Ray(o=camera_pos, d=direction) + + # Write to SoA using @set + @set ray_queue[ray_idx] = (ray=ray, pixel_x=x, pixel_y=y, sample_idx=s_idx) + nothing + end + end +end + +# ============================================================================ +# Stage 2: Intersect Primary Rays +# ============================================================================ + +@kernel function intersect_primary_rays!( + @Const(bvh), + @Const(ray_queue), + hit_queue +) + i = @index(Global, Linear) + idx = gpu_int(i) + + @inbounds if idx <= length(ray_queue.ray) + # Read from SoA + @get ray, pixel_x, pixel_y, sample_idx = ray_queue[idx] + hit_found, tri, dist, bary = Raycore.closest_hit(bvh, ray) + # Write to SoA using @set + @set hit_queue[idx] = (hit_found=hit_found, tri=tri, dist=dist, bary=Vec3f(bary), + ray=ray, pixel_x=pixel_x, pixel_y=pixel_y, sample_idx=sample_idx) + end +end + +# ============================================================================ +# Stage 3: Generate Shadow Rays (for all hits × all lights) +# ============================================================================ + +@kernel function generate_shadow_rays!( + @Const(hit_queue), + @Const(ctx), + shadow_ray_queue, + ::Val{NLights} +) where {NLights} + i = @index(Global, Linear) + idx = gpu_int(i) + + @inbounds if idx <= length(hit_queue.hit_found) + # Read from SoA + @get hit_found, tri, dist, bary, ray = hit_queue[idx] + + if hit_found + # Compute hit point 
and normal + hit_point = ray.o + ray.d * dist + v0, v1, v2 = Raycore.normals(tri) + u, v, w = bary[1], bary[2], bary[3] + normal = Vec3f(normalize(v0 * u + v1 * v + v2 * w)) + + # Generate shadow ray for each light (unrolled) + ntuple(Val(NLights)) do light_idx + light_idx_gpu = gpu_int(light_idx) + shadow_ray_idx = (idx - gpu_int(1)) * gpu_int(NLights) + light_idx_gpu + light = ctx.lights[light_idx_gpu] + + # Shadow ray towards light with proper bias to avoid shadow acne + shadow_bias = 0.01f0 + shadow_origin = hit_point + normal * shadow_bias + light_vec = light.position - shadow_origin + shadow_dir = normalize(light_vec) + light_dist = norm(light_vec) + shadow_ray = Raycore.Ray(o=shadow_origin, d=shadow_dir, t_max=light_dist) + + # Write to SoA using @set + @set shadow_ray_queue[shadow_ray_idx] = (ray=shadow_ray, hit_idx=idx, light_idx=light_idx_gpu) + end + else + # Sky hit - no shadow rays needed, but we need to fill the queue + dummy_ray = Raycore.Ray(o=Point3f(0,0,0), d=Vec3f(0,0,1), t_max=0.0f0) + ntuple(Val(NLights)) do light_idx + light_idx_gpu = gpu_int(light_idx) + shadow_ray_idx = (idx - gpu_int(1)) * gpu_int(NLights) + light_idx_gpu + # Write dummy to SoA using @set + @set shadow_ray_queue[shadow_ray_idx] = (ray=dummy_ray, hit_idx=idx, light_idx=light_idx_gpu) + nothing + end + end + end +end + +# ============================================================================ +# Stage 4: Test Shadow Rays (occlusion test) +# ============================================================================ + +@kernel function test_shadow_rays!( + @Const(bvh), + @Const(shadow_ray_queue), + shadow_result_queue +) + i = @index(Global, Linear) + idx = gpu_int(i) + @inbounds if idx <= length(shadow_ray_queue.ray) + # Read from SoA + @get ray, hit_idx, light_idx = shadow_ray_queue[idx] + + # Test for occlusion + # any_hit respects ray.t_max and only returns hits before the light + # So if we get a hit, something is blocking the light + visible = if ray.t_max > 0.0f0 + hit_found, _, _, _ = Raycore.any_hit(bvh, ray) + !hit_found # Visible only if no obstruction + else + false # Dummy ray (sky hits) + end + # Write to SoA using @set + @set shadow_result_queue[idx] = (visible=visible, hit_idx=hit_idx, light_idx=light_idx) + end +end + +# ============================================================================ +# Stage 5: Shade Primary Hits with Shadow Information +# ============================================================================ + +@kernel function shade_primary_hits!( + @Const(hit_queue), + @Const(ctx), + @Const(shadow_results), + @Const(sky_color), + shading_queue, + ::Val{NLights} +) where {NLights} + i = @index(Global, Linear) + idx = gpu_int(i) + + @inbounds if idx <= length(hit_queue.hit_found) + # Read from SoA + @get hit_found, tri, dist, bary, ray, pixel_x, pixel_y, sample_idx = hit_queue[idx] + + if hit_found + # Compute hit point and normal + hit_point = ray.o + ray.d * dist + v0, v1, v2 = Raycore.normals(tri) + u, v, w = bary[1], bary[2], bary[3] + normal = Vec3f(normalize(v0 * u + v1 * v + v2 * w)) + + # Get material + mat = ctx.materials[tri.material_idx] + base_color = Vec3f(mat.base_color.r, mat.base_color.g, mat.base_color.b) + # Start with ambient + total_color = base_color * ctx.ambient + # Add contribution from each light using shadow test results (unrolled) + light_contributions = ntuple(Val(NLights)) do light_idx + light_idx_gpu = gpu_int(light_idx) + shadow_idx = (idx - gpu_int(1)) * gpu_int(NLights) + light_idx_gpu + visible = 
shadow_results.visible[shadow_idx] + + if visible + light = ctx.lights[light_idx_gpu] + light_vec = light.position - hit_point + light_dist = norm(light_vec) + light_dir = light_vec / light_dist + + diffuse = max(0.0f0, dot(normal, light_dir)) + attenuation = light.intensity / (light_dist * light_dist) + + light_color = Vec3f(light.color.r, light.color.g, light.color.b) + base_color .* (light_color * (diffuse * attenuation)) + else + Vec3f(0, 0, 0) + end + end + + # Sum all light contributions + total_color += sum(light_contributions) + # Write to SoA using @set + @set shading_queue[idx] = (color=total_color, pixel_x=pixel_x, pixel_y=pixel_y, sample_idx=sample_idx) + else + # Sky color + sky_vec = Vec3f(sky_color.r, sky_color.g, sky_color.b) + # Write to SoA using @set + @set shading_queue[idx] = (color=sky_vec, pixel_x=pixel_x, pixel_y=pixel_y, sample_idx=sample_idx) + end + end +end + +# ============================================================================ +# Stage 6: Generate Reflection Rays (for metallic materials) +# ============================================================================ + +@kernel function generate_reflection_rays!( + @Const(hit_queue), + @Const(ctx), + reflection_ray_soa, # SoA: NamedTuple with .ray and .hit_idx arrays + active_count # Output: number of active reflection rays +) + i = @index(Global, Linear) + idx = gpu_int(i) + + @inbounds if idx <= length(hit_queue.hit_found) + # Read from SoA + @get hit_found, tri, dist, bary, ray = hit_queue[idx] + + if hit_found + mat = ctx.materials[tri.material_idx] + if mat.metallic > 0.0f0 + # Compute reflection + hit_point = ray.o + ray.d * dist + v0, v1, v2 = Raycore.normals(tri) + u, v, w = bary[1], bary[2], bary[3] + normal = Vec3f(normalize(v0 * u + v1 * v + v2 * w)) + + wo = -ray.d + reflect_dir = Raycore.reflect(wo, normal) + + # Add roughness + if mat.roughness > 0.0f0 + offset = (rand(Vec3f) .* 2.0f0 .- 1.0f0) * mat.roughness + reflect_dir = normalize(reflect_dir + offset) + end + # Use proper bias to avoid self-intersection + reflect_ray = Raycore.Ray(o=hit_point + normal * 0.01f0, d=reflect_dir) + + # Write to SoA using @set + @set reflection_ray_soa[idx] = (ray=reflect_ray, hit_idx=idx) + else + # No reflection - dummy ray + dummy_ray = Raycore.Ray(o=Point3f(0,0,0), d=Vec3f(0,0,1), t_max=0.0f0) + @set reflection_ray_soa[idx] = (ray=dummy_ray, hit_idx=idx) + end + else + # Sky hit - no reflection + dummy_ray = Raycore.Ray(o=Point3f(0,0,0), d=Vec3f(0,0,1), t_max=0.0f0) + @set reflection_ray_soa[idx] = (ray=dummy_ray, hit_idx=idx) + end + end +end + +# ============================================================================ +# Stage 7: Intersect Reflection Rays (SoA version) +# ============================================================================ + +@kernel function intersect_reflection_rays!( + @Const(bvh), + @Const(reflection_ray_soa), # NamedTuple with .ray and .hit_idx arrays + reflection_hit_soa # NamedTuple with separate field arrays +) + i = @index(Global, Linear) + idx = gpu_int(i) + + @inbounds if idx <= length(reflection_ray_soa.ray) + # Read from SoA + @get ray, hit_idx = reflection_ray_soa[idx] + + if ray.t_max > 0.0f0 + hit_found, tri, dist, bary = Raycore.closest_hit(bvh, ray) + # Write to SoA using @set + @set reflection_hit_soa[idx] = (hit_found=hit_found, tri_idx=tri.primitive_idx, + dist=dist, bary=Vec3f(bary), + ray=ray, primary_hit_idx=hit_idx) + else + # Write only hit_found as false for dummy rays + reflection_hit_soa.hit_found[idx] = false + end + end +end + +# 
============================================================================ +# Stage 8: Shade Reflection Hits and Blend into Primary +# ============================================================================ + +@kernel function shade_reflections_and_blend!( + @Const(hit_queue), + @Const(reflection_hit_soa), # SoA version: NamedTuple of arrays + @Const(bvh_triangles), # Need original triangles for material lookup + @Const(ctx), + @Const(sky_color), + shading_queue # In/out: update with reflection contribution +) + i = @index(Global, Linear) + idx = gpu_int(i) + + @inbounds if idx <= length(hit_queue.hit_found) + # Read from SoA + @get hit_found, tri, pixel_x, pixel_y, sample_idx = hit_queue[idx] + + if hit_found + mat = ctx.materials[tri.material_idx] + + if mat.metallic > 0.0f0 + # Access from SoA directly (using different variable names for clarity) + refl_hit_found = reflection_hit_soa.hit_found[idx] + refl_tri_idx = reflection_hit_soa.tri_idx[idx] + refl_dist = reflection_hit_soa.dist[idx] + refl_bary = reflection_hit_soa.bary[idx] + refl_ray = reflection_hit_soa.ray[idx] + + # Compute reflection color (simplified - no recursive shadows for performance) + reflection_color = if refl_hit_found + refl_point = refl_ray.o + refl_ray.d * refl_dist + + # Get triangle from BVH using stored index + refl_tri = bvh_triangles[refl_tri_idx] + v0, v1, v2 = Raycore.normals(refl_tri) + u, v, w = refl_bary[1], refl_bary[2], refl_bary[3] + refl_normal = Vec3f(normalize(v0 * u + v1 * v + v2 * w)) + + refl_mat = ctx.materials[refl_tri.material_idx] + refl_base_color = Vec3f(refl_mat.base_color.r, refl_mat.base_color.g, refl_mat.base_color.b) + + # Simplified lighting (ambient + first light only, no shadows) + refl_color = refl_base_color * ctx.ambient + + if length(ctx.lights) > 0 + light = ctx.lights[gpu_int(1)] + light_vec = light.position - refl_point + light_dist = norm(light_vec) + light_dir = normalize(light_vec) + diffuse = max(0.0f0, dot(refl_normal, light_dir)) + attenuation = light.intensity / (light_dist * light_dist) + light_color = Vec3f(light.color.r, light.color.g, light.color.b) + refl_color += refl_base_color .* (light_color * (diffuse * attenuation)) + end + refl_color + else + Vec3f(sky_color.r, sky_color.g, sky_color.b) + end + + # Blend with primary color + primary_color = shading_queue.color[idx] + blended_color = primary_color * (1.0f0 - mat.metallic) + reflection_color * mat.metallic + + # Update shading result in SoA using @set + @set shading_queue[idx] = (color=blended_color, pixel_x=pixel_x, pixel_y=pixel_y, sample_idx=sample_idx) + end + end + end +end + +# ============================================================================ +# Stage 9: Accumulate Final Image +# ============================================================================ + +@kernel function accumulate_final!( + @Const(shading_queue), + img, + sample_accumulator +) + i = @index(Global, Linear) + idx = gpu_int(i) + + @inbounds if idx <= length(shading_queue.color) + # Read from SoA + color = shading_queue.color[idx] + sample_accumulator[idx] = color + end +end + +@kernel function finalize_image!( + @Const(sample_accumulator), + img, + ::Val{NSamples} +) where {NSamples} + i = @index(Global, Cartesian) + y = gpu_int(i[1]) + x = gpu_int(i[2]) + height, width = size(img) + + @inbounds if y <= height && x <= width + # Convert to linear index + pixel_idx = (y - gpu_int(1)) * width + x + # Unroll sample accumulation loop + colors = ntuple(Val(NSamples)) do s + s_idx = gpu_int(s) + sample_idx = (pixel_idx - 
gpu_int(1)) * gpu_int(NSamples) + s_idx + sample_accumulator[sample_idx] + end + + # Sum all colors + img[y, x] = RGB{Float32}(mean(colors)...) + end +end + +# ============================================================================ +# Main Wavefront Tracer (Full Features) +# ============================================================================ + +function similar_soa(img, T, num_elements) + fields = [f => similar(img, fieldtype(T, f), num_elements) for f in fieldnames(T)] + return (; fields...) +end + + +# ============================================================================ +# WavefrontRenderer - Struct to hold all buffers for wavefront rendering +# ============================================================================ + +""" + WavefrontRenderer + +A renderer struct that contains all buffers needed for wavefront path tracing. +Supports different array types (Array, ROCArray, CLArray, etc.) and can be +converted to GPU using `to_gpu(ArrayType, renderer)`. + +# Fields +- `framebuffer`: Output image buffer +- `bvh`: BVH acceleration structure +- `ctx`: Scene context (materials, lights) +- Camera parameters: `camera_pos`, `fov`, `sky_color`, `samples_per_pixel` +- Work queues for each wavefront stage +""" +struct WavefrontRenderer{ImgArr <: AbstractMatrix, BVH, Ctx} + # Image dimensions + width::Int32 + height::Int32 + + # Image and scene + framebuffer::ImgArr + bvh::BVH + ctx::Ctx + + # Camera parameters + camera_pos::Point3f + fov::Float32 + sky_color::RGB{Float32} + samples_per_pixel::Int32 + + # Work queues (all SoA) + primary_ray_queue::NamedTuple + primary_hit_queue::NamedTuple + shadow_ray_queue::NamedTuple + shadow_result_queue::NamedTuple + reflection_ray_soa::NamedTuple + reflection_hit_soa::NamedTuple + shading_queue::NamedTuple + sample_accumulator::AbstractVector + active_count::AbstractVector +end + +""" + WavefrontRenderer(img, bvh, ctx; camera_pos, fov, sky_color, samples_per_pixel) + +Create a WavefrontRenderer with all necessary buffers allocated for the given image size and scene. +""" +function WavefrontRenderer( + img, bvh, ctx; + camera_pos=Point3f(0, -0.9, -2.5), + fov=45.0f0, + sky_color=RGB{Float32}(0.5f0, 0.7f0, 1.0f0), + samples_per_pixel=4 + ) + height, width = size(img) + + num_pixels = width * height + num_rays = num_pixels * samples_per_pixel + num_lights = Int32(length(ctx.lights)) + num_shadow_rays = num_rays * num_lights + + # Allocate work queues as SoA + primary_ray_queue = similar_soa(img, PrimaryRayWork, num_rays) + primary_hit_queue = similar_soa(img, PrimaryHitWork{eltype(bvh.primitives)}, num_rays) + shadow_ray_queue = similar_soa(img, ShadowRayWork, num_shadow_rays) + shadow_result_queue = similar_soa(img, ShadowResult, num_shadow_rays) + reflection_ray_soa = similar_soa(img, ReflectionRayWork, num_rays) + reflection_hit_soa = similar_soa(img, ReflectionHitWork{Int32}, num_rays) + shading_queue = similar_soa(img, ShadedResult, num_rays) + sample_accumulator = similar(img, Vec3f, num_rays) + active_count = similar(img, Int32, 1) + + return WavefrontRenderer( + Int32(width), Int32(height), + img, bvh, ctx, + camera_pos, fov, sky_color, Int32(samples_per_pixel), + primary_ray_queue, primary_hit_queue, + shadow_ray_queue, shadow_result_queue, + reflection_ray_soa, reflection_hit_soa, + shading_queue, sample_accumulator, active_count + ) +end + +""" + to_gpu(ArrayType, renderer::WavefrontRenderer) + +Convert a WavefrontRenderer to use a different array type (e.g., ROCArray, CLArray). 
+This creates a new renderer with all buffers converted to the target array type. +""" +function Raycore.to_gpu(Arr, renderer::WavefrontRenderer) + # Convert image + img = Arr(renderer.framebuffer) + + # Convert BVH and context + bvh_gpu = Raycore.to_gpu(Arr, renderer.bvh) + ctx_gpu = Raycore.to_gpu(Arr, renderer.ctx) + + # Recreate renderer with new array type + return WavefrontRenderer( + img, bvh_gpu, ctx_gpu; + camera_pos=renderer.camera_pos, + fov=renderer.fov, + sky_color=renderer.sky_color, + samples_per_pixel=Int(renderer.samples_per_pixel) + ) +end + +""" + trace_wavefront_full!(renderer::WavefrontRenderer) + +Execute the full wavefront path tracing pipeline using the provided renderer. +The result is stored in `renderer.framebuffer`. +""" +function render!(renderer::WavefrontRenderer) + width = Int(renderer.width) + height = Int(renderer.height) + samples_per_pixel = Int(renderer.samples_per_pixel) + + aspect = Float32(width / height) + focal_length = 1.0f0 / tan(deg2rad(renderer.fov / 2)) + + backend = KA.get_backend(renderer.framebuffer) + + num_pixels = width * height + num_rays = num_pixels * samples_per_pixel + num_lights = Int(length(renderer.ctx.lights)) + num_shadow_rays = num_rays * num_lights + + # Stage 1: Generate primary rays + gen_kernel! = generate_primary_rays!(backend) + gen_kernel!( + renderer.width, renderer.height, + renderer.camera_pos, focal_length, aspect, + renderer.primary_ray_queue, + Val(samples_per_pixel), + ndrange=(height, width) + ) + + # Stage 2: Intersect primary rays + intersect_kernel! = intersect_primary_rays!(backend) + intersect_kernel!( + renderer.bvh, + renderer.primary_ray_queue, + renderer.primary_hit_queue, + ndrange=num_rays + ) + + # Stage 3: Generate shadow rays + shadow_gen_kernel! = generate_shadow_rays!(backend) + shadow_gen_kernel!( + renderer.primary_hit_queue, + renderer.ctx, + renderer.shadow_ray_queue, + Val(num_lights), + ndrange=num_rays + ) + + # Stage 4: Test shadow rays + shadow_test_kernel! = test_shadow_rays!(backend) + shadow_test_kernel!( + renderer.bvh, + renderer.shadow_ray_queue, + renderer.shadow_result_queue, + ndrange=num_shadow_rays + ) + + # Stage 5: Shade primary hits with shadows + shade_kernel! = shade_primary_hits!(backend) + shade_kernel!( + renderer.primary_hit_queue, + renderer.ctx, + renderer.shadow_result_queue, + renderer.sky_color, + renderer.shading_queue, + Val(num_lights), + ndrange=num_rays + ) + + # Stage 6: Generate reflection rays (SoA) + refl_gen_kernel! = generate_reflection_rays!(backend) + refl_gen_kernel!( + renderer.primary_hit_queue, + renderer.ctx, + renderer.reflection_ray_soa, + renderer.active_count, + ndrange=num_rays + ) + + # Stage 7: Intersect reflection rays (SoA) + refl_intersect_kernel! = intersect_reflection_rays!(backend) + refl_intersect_kernel!( + renderer.bvh, + renderer.reflection_ray_soa, + renderer.reflection_hit_soa, + ndrange=num_rays + ) + + # Stage 8: Shade reflections and blend (using SoA) + refl_shade_kernel! = shade_reflections_and_blend!(backend) + refl_shade_kernel!( + renderer.primary_hit_queue, + renderer.reflection_hit_soa, + renderer.bvh.primitives, + renderer.ctx, + renderer.sky_color, + renderer.shading_queue, + ndrange=num_rays + ) + + # Stage 9: Accumulate final image + accum_kernel! = accumulate_final!(backend) + accum_kernel!( + renderer.shading_queue, + renderer.framebuffer, + renderer.sample_accumulator, + ndrange=num_rays + ) + + final_kernel! 
= finalize_image!(backend) + final_kernel!( + renderer.sample_accumulator, + renderer.framebuffer, + Val(samples_per_pixel), + ndrange=(height, width) + ) + KA.synchronize(backend) + + return renderer.framebuffer +end + +function trace_wavefront_full( + img, bvh, ctx; + camera_pos=Point3f(0, -0.9, -2.5), fov=45.0f0, + sky_color=RGB{Float32}(0.5f0,0.7f0,1.0f0), + samples_per_pixel=4 + ) + height, width = size(img) + aspect = Float32(width / height) + focal_length = 1.0f0 / tan(deg2rad(fov / 2)) + + backend = KA.get_backend(img) + + num_pixels = width * height + num_rays = num_pixels * samples_per_pixel + num_lights = Int(length(ctx.lights)) + num_shadow_rays = num_rays * num_lights + + # Allocate work queues as SoA + primary_ray_queue = similar_soa(img, PrimaryRayWork, num_rays) + primary_hit_queue = similar_soa(img, PrimaryHitWork{eltype(bvh.primitives)}, num_rays) + shadow_ray_queue = similar_soa(img, ShadowRayWork, num_shadow_rays) + shadow_result_queue = similar_soa(img, ShadowResult, num_shadow_rays) + reflection_ray_soa = similar_soa(img, ReflectionRayWork, num_rays) + reflection_hit_soa = similar_soa(img, ReflectionHitWork{Int32}, num_rays) + shading_queue = similar_soa(img, ShadedResult, num_rays) + sample_accumulator = similar(img, Vec3f, num_rays) + + # Stage 1: Generate primary rays + gen_kernel! = generate_primary_rays!(backend) + gen_kernel!(Int32(width), Int32(height), camera_pos, focal_length, aspect, primary_ray_queue, Val(samples_per_pixel), ndrange=(height, width)) + KA.synchronize(backend) + + # Stage 2: Intersect primary rays + intersect_kernel! = intersect_primary_rays!(backend) + intersect_kernel!(bvh, primary_ray_queue, primary_hit_queue, ndrange=num_rays) + KA.synchronize(backend) + + # Stage 3: Generate shadow rays + shadow_gen_kernel! = generate_shadow_rays!(backend) + shadow_gen_kernel!(primary_hit_queue, ctx, shadow_ray_queue, Val(num_lights), ndrange=num_rays) + + # Stage 4: Test shadow rays + shadow_test_kernel! = test_shadow_rays!(backend) + shadow_test_kernel!(bvh, shadow_ray_queue, shadow_result_queue, ndrange=num_shadow_rays) + + # Stage 5: Shade primary hits with shadows + shade_kernel! = shade_primary_hits!(backend) + shade_kernel!(primary_hit_queue, ctx, shadow_result_queue, sky_color, shading_queue, Val(num_lights), ndrange=num_rays) + + # Stage 6: Generate reflection rays (SoA) + refl_gen_kernel! = generate_reflection_rays!(backend) + active_count = similar(img, Int32, 1) + refl_gen_kernel!(primary_hit_queue, ctx, reflection_ray_soa, active_count, ndrange=num_rays) + KA.synchronize(backend) + + # Stage 7: Intersect reflection rays (SoA) + refl_intersect_kernel! = intersect_reflection_rays!(backend) + refl_intersect_kernel!(bvh, reflection_ray_soa, reflection_hit_soa, ndrange=num_rays) + KA.synchronize(backend) + + # Stage 8: Shade reflections and blend (using SoA) + refl_shade_kernel! = shade_reflections_and_blend!(backend) + refl_shade_kernel!(primary_hit_queue, reflection_hit_soa, bvh.primitives, ctx, sky_color, shading_queue, ndrange=num_rays) + + # Stage 9: Accumulate final image + accum_kernel! = accumulate_final!(backend) + accum_kernel!(shading_queue, img, sample_accumulator, ndrange=num_rays) + + final_kernel! 
= finalize_image!(backend) + final_kernel!(sample_accumulator, img, Val(samples_per_pixel), ndrange=(height, width)) + KA.synchronize(backend) + return img +end diff --git a/ext/RaycoreMakieExt.jl b/ext/RaycoreMakieExt.jl index 597e343..bbbcf28 100644 --- a/ext/RaycoreMakieExt.jl +++ b/ext/RaycoreMakieExt.jl @@ -31,7 +31,7 @@ using Raycore, GeometryBasics, GLMakie # Create geometry sphere1 = Tesselation(Sphere(Point3f(0, 0, 1), 1.0f0), 20) sphere2 = Tesselation(Sphere(Point3f(0, 0, 3), 1.0f0), 20) -bvh = Raycore.BVHAccel([sphere1, sphere2]) +bvh = Raycore.BVH([sphere1, sphere2]) # Create rays rays = [ @@ -183,7 +183,7 @@ end """ Helper function to draw BVH geometry """ -function draw_bvh!(plot, bvh::Raycore.BVHAccel, colors, alpha) +function draw_bvh!(plot, bvh::Raycore.BVH, colors, alpha) # Group primitives by their material_idx primitive_groups = Dict{UInt32, Vector{Raycore.Triangle}}() for prim in bvh.primitives @@ -228,4 +228,24 @@ function draw_bvh!(plot, bvh::Raycore.BVHAccel, colors, alpha) end end +Makie.plottype(::Raycore.BVH) = Makie.Mesh + +function Makie.convert_arguments(::Type{Makie.Mesh}, bvh::Raycore.BVH) + # Convert BVH to a Mesh for plotting + vertices = Point3f[] + faces = GeometryBasics.TriangleFace{Int}[] + colors = Float32[] + normals = Vec3f[] + for (i, prim) in enumerate(bvh.primitives) + start_idx = length(vertices) + for (v, n) in zip(prim.vertices, prim.normals) + push!(vertices, v) + push!(colors, prim.material_idx) + push!(normals, Vec3f(n)) + end + push!(faces, GeometryBasics.TriangleFace(start_idx + 1, start_idx + 2, start_idx + 3)) + end + return (GeometryBasics.Mesh(vertices, faces; normal=normals, color=colors), ) +end + end # module diff --git a/gpu-experiments.jl b/gpu-experiments.jl new file mode 100644 index 0000000..e1d6327 --- /dev/null +++ b/gpu-experiments.jl @@ -0,0 +1,103 @@ +using Rusticl_jll, pocl_jll, OpenCL, AMDGPU, KernelAbstractions, Makie, LinearAlgebra, GeometryBasics, Raycore, StaticArrays, FileIO, Colors +import KernelAbstractions as KA +using ImageShow + +# Load the cat model and rotate it to face the camera +cat_mesh = Makie.loadasset("cat.obj") +angle = deg2rad(150f0) +rotation = Makie.Quaternionf(0, sin(angle / 2), 0, cos(angle / 2)) +rotated_coords = [rotation * Point3f(v) for v in coordinates(cat_mesh)] + +# Get bounding box and translate cat to sit on the floor +cat_bbox = Rect3f(rotated_coords) +floor_y = -1.5f0 +cat_offset = Vec3f(0, floor_y - cat_bbox.origin[2], 0) + +cat_mesh = GeometryBasics.normal_mesh( + [v + cat_offset for v in rotated_coords], + faces(cat_mesh) +) +# Create a simple room: floor, back wall, and side wall +floor = normal_mesh(Rect3f(Vec3f(-5, -1.5, -2), Vec3f(10, 0.01, 10))) +back_wall = normal_mesh(Rect3f(Vec3f(-5, -1.5, 8), Vec3f(10, 5, 0.01))) +left_wall = normal_mesh(Rect3f(Vec3f(-5, -1.5, -2), Vec3f(0.01, 5, 10))) + +# Add a couple of spheres for visual interest +sphere1 = Tesselation(Sphere(Point3f(-2, -1.5 + 0.8, 2), 0.8f0), 64) +sphere2 = Tesselation(Sphere(Point3f(2, -1.5 + 0.6, 1), 0.6f0), 64) + +# Build our BVH acceleration structure +scene_geometry = [cat_mesh, floor, back_wall, left_wall, sphere1, sphere2] +bvh = Raycore.BVH(scene_geometry) + +# Compute interpolated normal at hit point +function compute_normal(triangle, bary_coords) + v0, v1, v2 = Raycore.normals(triangle) + u, v, w = bary_coords[1], bary_coords[2], bary_coords[3] + return Vec3f(normalize(v0 * u + v1 * v + v2 * w)) +end + +# Generate camera ray for a pixel with optional jitter +function camera_ray(x, y, width, height, 
camera_pos, focal_length, aspect; jitter=Vec2f(0)) + ndc_x = (2.0f0 * (Float32(x) - 0.5f0 + jitter[1]) / Float32(width) - 1.0f0) * aspect + ndc_y = 1.0f0 - 2.0f0 * (Float32(y) - 0.5f0 + jitter[2]) / Float32(height) + direction = normalize(Vec3f(ndc_x, ndc_y, focal_length)) + return Raycore.Ray(o=camera_pos, d=direction) +end +# Convert between color representations +to_vec3f(c::RGB) = Vec3f(c.r, c.g, c.b) +to_rgb(v::Vec3f) = RGB{Float32}(v[1], v[2], v[3]) + + +@kernel function ka_trace_image!(color_callback, bvh, camera_pos, focal_length, aspect, img, sky_color) + _idx = @index(Global, Cartesian) + x, y = map(x-> x % Int32, Tuple(_idx)) + height, width = size(img) + @inbounds if checkbounds(Bool, img, _idx) + ray = camera_ray(y, x, width, height, camera_pos, focal_length, aspect) + hit_found, triangle, distance, bary_coords = Raycore.closest_hit(bvh, ray) + color = if hit_found + to_vec3f(color_callback(bvh, triangle, distance, bary_coords, ray)) + else + to_vec3f(sky_color) + end + img[x, y] = color + end + nothing +end + +function ka_trace!(color_callback, ArrayType, bvh; + width=2048, height=1024, + camera_pos=Point3f(0, -0.9, -2.5), fov=45.0f0, + sky_color=RGB{Float32}(0.5f0, 0.7f0, 1.0f0), + ) + img = ArrayType(fill(Vec3f(0), height, width)) + backend = KA.get_backend(img) + kernel! = ka_trace_image!(backend) + aspect = Float32(width/height) + focal_length = 1.0f0 / tan(deg2rad(fov / 2)) + kernel!(color_callback, bvh, camera_pos, focal_length, aspect, img, sky_color, ndrange=size(img), workgroupsize=(16, 16)) + KA.synchronize(backend) + + return map(x-> to_rgb(clamp.(x, Vec3f(0), Vec3f(1))), Array(img)) +end + +using ImageShow, Colors + +depth_kernel(bvh, tri, dist, bary, ray) = RGB(1.0f0 - min(dist / 10.0f0, 1.0f0)) + +# # ROCArray somehow stays black! +# pres = [] +# g_scene = Trace.to_gpu(ROCArray, bvh; preserve=pres); +# img = ka_trace!(depth_kernel, ROCArray, g_scene) +# all(img .== RGB(0,0,0)) # nooo :( + +# # OpenCL backend works, but runs on the CPU: +# pres = [] +# g_scene = Trace.to_gpu(CLArray, bvh; preserve=pres); +# img_cl = ka_trace!(depth_kernel, CLArray, g_scene) + +# Save the scene +save("test-ray1.png", img_cl) +using BenchmarkTools +@btime ka_trace!(depth_kernel, Array, bvh) diff --git a/raycore-blogpost.md b/raycore-blogpost.md new file mode 100644 index 0000000..58fc767 --- /dev/null +++ b/raycore-blogpost.md @@ -0,0 +1,134 @@ +# Announcing Raycore.jl: High-Performance Ray Tracing for CPU and GPU + +I'm excited to announce **Raycore.jl**, a high-performance ray-triangle intersection engine with BVH acceleration, designed for both CPU and GPU execution in Julia. Whether you're building a physically-based renderer, simulating light transport, or exploring acoustic propagation, Raycore provides the performance and flexibility you need. + +## Why Write a New Ray Intersection Engine? + +You might wonder: why build yet another ray tracer? The answer lies in Julia's unique strengths and the opportunities they create. 
+ +### Advantages of Julia + +* **High-level language with performance close to C/C++** - Write readable code that runs fast +* **Great GPU support** - Single codebase runs on CUDA, AMD, Metal, oneAPI, and OpenCL via KernelAbstractions.jl +* **Multiple dispatch for different geometries, algorithms, and materials** - Extend the system cleanly without modifying core code +* **Pluggable architecture for new features** - Add custom materials, sampling strategies, or acceleration structures +* **One of the best languages to write out math** - The code looks like the equations you'd write on paper + +### Honest Assessment: The Tradeoffs + +Julia isn't perfect, and I want to be upfront about the challenges: + +* **Long compile times for first use** - The first run of a function triggers JIT compilation +* **GPU code still has some rough edges** - Complex kernels require careful attention to avoid allocations and GPU-unfriendly constructs + +In practice, compile times aren't as bad as they might sound. You keep a Julia session running and only pay the compilation cost once. There's also ongoing work on precompilation that could reduce these times to near-zero in the future. For GPU code, better tooling for detecting and fixing issues is on the horizon, along with improved error messages when problematic LLVM code is generated. + +### The Big Picture + +The flexibility to write a high-performance ray tracer in a high-level language opens up exciting possibilities: + +* **Use automatic differentiation** to optimize scene parameters or light placement +* **Plug in new optimizations seamlessly** without fighting a type system or rewriting core algorithms +* **Democratize working on high-performance ray tracing** - contributions don't require C++ expertise +* **Rapid experimentation** - test new ideas without lengthy compile cycles + +## What is Raycore.jl? + +Raycore is a focused library that does one thing well: fast ray-triangle intersections with BVH acceleration. It provides the building blocks for ray tracing applications without imposing a particular rendering architecture. + +**Core Features:** +- Fast BVH construction and traversal +- CPU and GPU support via KernelAbstractions.jl +- Analysis tools: centroid calculation, illumination analysis, view factors for radiosity +- Makie integration for visualization + +**Performance:** On GPU, we've achieved significant speedups through kernel optimizations including loop unrolling, tiling, and wavefront rendering approaches that minimize thread divergence. + +## Interactive Tutorials + +The documentation includes several hands-on tutorials that build from basics to advanced GPU optimization: + +### 1. BVH Hit Tests & Basics + +Learn the fundamentals of ray-triangle intersection, BVH construction, and visualization. + +![BVH Basics](docs/src/basics.png) + +[Try the tutorial →](https://docs.raycore.jl) + +### 2. Ray Tracing Tutorial + +Build a complete ray tracer from scratch with shadows, materials, reflections, and tone mapping. + +![Ray Tracing](docs/src/raytracing.png) + +[Try the tutorial →](https://docs.raycore.jl) + +### 3. Ray Tracing on the GPU + +Port the ray tracer to GPU and learn optimization techniques: loop unrolling, tiling, and wavefront rendering. Includes comprehensive benchmarks comparing different approaches. + +![GPU Benchmarks](docs/src/.gpu_raytracing_tutorial-bbook/data/gpu-benchmarks.png) + +[Try the tutorial →](https://docs.raycore.jl) + +### 4. 
View Factors & Analysis + +Calculate view factors, illumination, and centroids for radiosity and thermal analysis applications. + +![View Factors](docs/src/viewfactors.png) + +[Try the tutorial →](https://docs.raycore.jl) + +## What Can It Be Used For? + +Ray tracing isn't just for rendering pretty pictures. Raycore enables a wide range of physics and engineering applications: + +* **Physically-based rendering** - Photorealistic image synthesis with accurate light transport +* **Light transport simulations** - Analyze lighting design, daylighting, and energy efficiency +* **Acoustic simulations** - Model sound propagation in architectural spaces +* **Neutron transport simulations** - Nuclear reactor analysis and radiation shielding +* **Thermal radiosity** - Heat transfer analysis in complex geometries +* **Any application that needs ray tracing** - The core is general-purpose + +A high-performance implementation of CPU and GPU ray tracing in Julia can be a huge enabler for research and development in these fields, especially considering how easy it is to jump into the code and make changes dynamically. Need to add a new material model? Write a few methods. Want to try a different BVH construction algorithm? Implement the interface. The barrier to experimentation is low. + +## Getting Started + +Install Raycore.jl from the Julia package manager: + +```julia +using Pkg +Pkg.add("Raycore") +``` + +Then check out the [interactive tutorials](https://docs.raycore.jl) to start building your first ray tracer! + +## Future Work + +While Raycore is production-ready for many applications, there are exciting directions for future development: + +* **Advanced acceleration structures** - Explore alternatives to BVH like kd-trees or octrees for specific use cases +* **Importance sampling** - Better Monte Carlo integration strategies for path tracing +* **Spectral rendering** - Move beyond RGB to full spectral wavelengths +* **Bi-directional path tracing** - Handle difficult lighting scenarios more efficiently +* **GPU memory optimizations** - Reduce memory footprint for larger scenes +* **Improved precompilation** - Further reduce first-run latency +* **Domain-specific extensions** - Purpose-built tools for acoustic, thermal, or neutron transport + +Contributions are welcome! The codebase is designed to be approachable, and the Julia community is friendly and helpful. + +## Acknowledgments + +This project builds on the excellent work of the Julia GPU ecosystem, particularly KernelAbstractions.jl for portable GPU programming, and the Julia visualization stack including Makie.jl for the interactive tutorials. + +Special thanks to everyone who provided feedback during development and helped shape Raycore into what it is today. + +--- + +**Links:** +- [Documentation & Tutorials](https://docs.raycore.jl) +- [GitHub Repository](https://github.com/yourusername/Raycore.jl) +- [Julia Discourse](https://discourse.julialang.org) + +I'm excited to see what you build with Raycore.jl. Happy ray tracing! 
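+
+*P.S.* If you want a quick taste before opening the tutorials, here is a minimal, illustrative sketch of the workflow described above: build a BVH on the CPU, move it to your backend's array type with `Raycore.to_gpu`, and traverse it from a KernelAbstractions kernel. The scene and the tiny depth-shading kernel below are assumptions made for this sketch, not the tutorial code; swap `Array` for `CuArray`, `ROCArray`, `MtlArray`, or `oneArray` to run it on a GPU.
+
+```julia
+using Raycore, GeometryBasics, LinearAlgebra
+using KernelAbstractions
+import KernelAbstractions as KA
+
+# Build a small scene and its BVH on the CPU
+sphere = Tesselation(Sphere(Point3f(0, 0, 2), 0.8f0), 32)
+floor  = normal_mesh(Rect3f(Vec3f(-5, -1.5, -2), Vec3f(10, 0.01, 10)))
+bvh = BVH([sphere, floor])
+
+# Pick a backend array type; `Array` keeps everything on the CPU
+GArray = Array
+bvh_dev = Raycore.to_gpu(GArray, bvh)
+
+# One thread per pixel: shade by hit distance
+@kernel function depth_image!(img, @Const(bvh))
+    I = @index(Global, Cartesian)
+    if checkbounds(Bool, img, I)
+        y, x = Tuple(I)
+        height, width = size(img)
+        dir = normalize(Vec3f((x - width / 2f0) / width,
+                              (y - height / 2f0) / height, 1f0))
+        ray = Ray(o=Point3f(0, 0, -3), d=dir)
+        hit, tri, t, bary = closest_hit(bvh, ray)
+        @inbounds img[I] = hit ? 1f0 - min(t / 10f0, 1f0) : 0f0
+    end
+end
+
+img = GArray(zeros(Float32, 256, 256))
+backend = KA.get_backend(img)
+depth_image!(backend)(img, bvh_dev, ndrange=size(img))
+KA.synchronize(backend)
+```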
diff --git a/src/Raycore.jl b/src/Raycore.jl index 144291f..99a1592 100644 --- a/src/Raycore.jl +++ b/src/Raycore.jl @@ -45,7 +45,7 @@ include("kernels.jl") include("ray_intersection_session.jl") # Core types -export Ray, RayDifferentials, Triangle, TriangleMesh, BVHAccel, Bounds3, Normal3f +export Ray, RayDifferentials, Triangle, TriangleMesh, AccelPrimitive, BVH, Bounds3, Normal3f # Ray intersection functions export closest_hit, any_hit, world_bound diff --git a/src/bvh.jl b/src/bvh.jl index 0b47f0b..a1ee404 100644 --- a/src/bvh.jl +++ b/src/bvh.jl @@ -73,13 +73,141 @@ function primitives_to_bvh(primitives, max_node_primitives=1) return (ordered_primitives, max_node_primitives, flattened) end -struct BVHAccel{ - PVec <:AbstractVector, - NodeVec <: AbstractVector{LinearBVH} - } <: AccelPrimitive - primitives::PVec - max_node_primitives::UInt8 +""" + CompactTriangle + +Pre-transformed triangle data for efficient GPU ray-triangle intersection. +Uses edge vectors for Möller-Trumbore intersection algorithm. + +Fields: +- v0: First vertex position (4th component for alignment) +- edge1: v1 - v0 edge vector +- edge2: v2 - v0 edge vector +- normal: Face normal (4th component for alignment) +- indices: [material_idx, primitive_idx] for shading +""" +struct CompactTriangle + v0::SVector{4, Float32} + edge1::SVector{4, Float32} + edge2::SVector{4, Float32} + normal::SVector{4, Float32} + indices::SVector{2, UInt32} +end + +""" + to_compact_triangle(tri::Triangle) -> CompactTriangle + +Convert a Triangle to CompactTriangle format with pre-computed edge vectors. +""" +@inline function to_compact_triangle(tri::Triangle)::CompactTriangle + vs = vertices(tri) + v0, v1, v2 = vs[1], vs[2], vs[3] + edge1 = v1 - v0 + edge2 = v2 - v0 + + # Compute face normal + face_normal = normalize(cross(Vec3f(edge1), Vec3f(edge2))) + + return CompactTriangle( + SVector{4, Float32}(v0[1], v0[2], v0[3], 0f0), + SVector{4, Float32}(edge1[1], edge1[2], edge1[3], 0f0), + SVector{4, Float32}(edge2[1], edge2[2], edge2[3], 0f0), + SVector{4, Float32}(face_normal[1], face_normal[2], face_normal[3], 0f0), + SVector{2, UInt32}(tri.material_idx, tri.primitive_idx) + ) +end + +""" + intersect_compact_triangle(tri::CompactTriangle, ray_o, ray_d, t_max) + +Watertight Möller-Trumbore ray-triangle intersection for GPU. +Uses pre-computed edge vectors for efficiency. 
+ +Returns: (hit, t, u, v) where u,v are barycentric coordinates +""" +@inline function intersect_compact_triangle( + tri::CompactTriangle, + ray_o::Point3f, + ray_d::Vec3f, + t_max::Float32 +) + EPSILON = 1.0f-6 + + # Möller-Trumbore algorithm - work directly with SVector components to avoid allocations + # h = cross(ray_d, edge2) + h_x = ray_d[2] * tri.edge2[3] - ray_d[3] * tri.edge2[2] + h_y = ray_d[3] * tri.edge2[1] - ray_d[1] * tri.edge2[3] + h_z = ray_d[1] * tri.edge2[2] - ray_d[2] * tri.edge2[1] + + # a = dot(edge1, h) + a = tri.edge1[1] * h_x + tri.edge1[2] * h_y + tri.edge1[3] * h_z + + # Check if ray is parallel to triangle + if abs(a) < EPSILON + return (false, t_max, 0f0, 0f0) + end + + f = 1f0 / a + + # s = ray_o - v0 + s_x = ray_o[1] - tri.v0[1] + s_y = ray_o[2] - tri.v0[2] + s_z = ray_o[3] - tri.v0[3] + + # u = f * dot(s, h) + u = f * (s_x * h_x + s_y * h_y + s_z * h_z) + + if u < 0f0 || u > 1f0 + return (false, t_max, 0f0, 0f0) + end + + # q = cross(s, edge1) + q_x = s_y * tri.edge1[3] - s_z * tri.edge1[2] + q_y = s_z * tri.edge1[1] - s_x * tri.edge1[3] + q_z = s_x * tri.edge1[2] - s_y * tri.edge1[1] + + # v = f * dot(ray_d, q) + v = f * (ray_d[1] * q_x + ray_d[2] * q_y + ray_d[3] * q_z) + + if v < 0f0 || u + v > 1f0 + return (false, t_max, 0f0, 0f0) + end + + # t = f * dot(edge2, q) + t = f * (tri.edge2[1] * q_x + tri.edge2[2] * q_y + tri.edge2[3] * q_z) + + if t > EPSILON && t < t_max + return (true, t, u, v) + end + + return (false, t_max, 0f0, 0f0) +end + +""" + BVH{NodeVec, TriVec, OrigTriVec} + +GPU-optimized BVH acceleration structure. + +Key optimizations: +- Uses LinearBVH node structure (flat array, depth-first layout) +- Pre-transforms triangles with edge vectors for Möller-Trumbore +- Designed for efficient GPU kernel traversal + +Fields: +- nodes: LinearBVH nodes (flat array, depth-first layout) +- triangles: Pre-transformed compact triangles +- primitives: Original triangles (for normals, UVs, materials) +- max_node_primitives: Maximum primitives per leaf node +""" +struct BVH{ + NodeVec <: AbstractVector{LinearBVH}, + TriVec <: AbstractVector{CompactTriangle}, + OrigTriVec <: AbstractVector{Triangle} +} <: AccelPrimitive nodes::NodeVec + triangles::TriVec + primitives::OrigTriVec + max_node_primitives::UInt8 end @@ -90,7 +218,18 @@ function to_triangle_mesh(x::GeometryBasics.AbstractGeometry) return TriangleMesh(m) end -function BVHAccel( +""" + BVH(primitives::AbstractVector, max_node_primitives::Integer=1) + +Construct a BVH acceleration structure from a list of primitives (meshes or geometries). + +Arguments: +- `primitives`: Vector of triangle meshes or GeometryBasics geometries +- `max_node_primitives`: Maximum number of primitives per leaf node (default: 1) + +Returns a GPU-optimized BVH with pre-transformed triangles for efficient ray tracing. 
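+
+# Example
+
+A minimal usage sketch (the geometry helpers `Sphere`/`Tesselation` are assumed to come from GeometryBasics, as in the tutorials):
+
+```julia
+using Raycore, GeometryBasics
+
+# Five small spheres in a row, allowing up to four triangles per leaf node
+spheres = [Tesselation(Sphere(Point3f(x, 0, 3), 0.5f0), 16) for x in -2:2]
+bvh = BVH(spheres, 4)
+
+ray = Ray(o=Point3f(0, 0, 0), d=Vec3f(0, 0, 1))
+hit_found, tri, t, bary = closest_hit(bvh, ray)
+```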
+""" +function BVH( primitives::AbstractVector{P}, max_node_primitives::Integer=1, ) where {P} triangles = Triangle[] @@ -105,7 +244,16 @@ function BVHAccel( ordered_primitives = map(enumerate(ordered_primitives)) do (i, tri) Triangle(tri, primitive_idx=UInt32(i)) end - return BVHAccel(ordered_primitives, UInt8(max_prim), nodes) + + # Convert triangles to compact format with pre-computed edges + compact_tris = [to_compact_triangle(tri) for tri in ordered_primitives] + + return BVH( + nodes, + compact_tris, + ordered_primitives, + UInt8(max_prim) + ) end mutable struct BucketInfo @@ -236,13 +384,13 @@ function _unroll( l_offset + 1 end -@inline function world_bound(bvh::BVHAccel)::Bounds3 - length(bvh.nodes) > Int32(0) ? bvh.nodes[1].bounds : Bounds3() +@inline function world_bound(bvh::BVH)::Bounds3 + length(bvh.nodes) > 0 ? bvh.nodes[1].bounds : Bounds3() end struct MemAllocator end -@inline _allocate(::MemAllocator, T::Type, n::Val{N}) where {N} = zeros(MVector{N,T}) +@inline _allocate(::MemAllocator, T::Type, n::Val{N}) where {N} = MVector{N,T}(undef) Base.@propagate_inbounds function _setindex(arr::MVector{N, T}, idx::Integer, value::T) where {N, T} arr[idx] = value return arr @@ -250,62 +398,61 @@ end """ - traverse_bvh(bvh::BVHAccel{P}, ray::AbstractRay, hit_callback::F) where {P, F<:Function} - -Internal function that traverses the BVH to find ray-primitive intersections. -Uses a callback pattern to handle different intersection behaviors. + closest_hit(bvh::BVH, ray::AbstractRay) -Arguments: -- `bvh`: The BVH acceleration structure -- `ray`: The ray to test for intersections -- `hit_callback`: Function called when primitive is tested. Signature: - hit_callback(primitive, ray) -> (continue_traversal::Bool, ray::AbstractRay, results::Any) +Find the closest intersection between a ray and the GPU BVH. +Uses manual traversal with compact triangle intersection for best performance. Returns: -- The final result from the hit_callback +- `hit_found`: Boolean indicating if an intersection was found +- `hit_primitive`: The primitive that was hit (if any) +- `distance`: Distance along the ray to the hit point (hit_point = ray.o + ray.d * distance) +- `barycentric_coords`: Barycentric coordinates of the hit point """ -@inline function traverse_bvh(hit_callback::F, bvh::BVHAccel{P}, ray::AbstractRay, allocator=MemAllocator()) where {P, F<:Function} - if length(bvh.nodes) == 0 - # We dont handle the empty case yet, since its not that easy to make it type stable - # Its possible, but why would we intersect an empty BVH? 
- error("BVH is empty; cannot traverse.") - end - - # Prepare ray for traversal +@inline function closest_hit(bvh::BVH, ray::AbstractRay, allocator=MemAllocator()) ray = check_direction(ray) inv_dir = 1f0 ./ ray.d dir_is_neg = is_dir_negative(ray.d) - # Initialize traversal stack + # Initialize traversal local to_visit_offset::Int32 = Int32(1) current_node_idx = Int32(1) - nodes_to_visit = _allocate(allocator, Int32, Val(64)) - primitives = bvh.primitives + # Direct MVector construction for type stability (critical for GPU, especially OpenCL/SPIR-V) + nodes_to_visit = MVector{64, Int32}(undef) nodes = bvh.nodes + triangles = bvh.triangles + original_tris = bvh.primitives - # State variables to hold callback results - continue_search = true - prim1 = primitives[1] - _, ray, result = hit_callback(prim1, ray, nothing) + # Track closest hit + hit_found = false + hit_tri_idx = Int32(0) + closest_t = ray.t_max + hit_u = 0f0 + hit_v = 0f0 # Traverse BVH @_inbounds while true current_node = nodes[current_node_idx] + # Test ray against current node's bounding box if intersect_p(current_node.bounds, ray, inv_dir, dir_is_neg) local cnprim::Int32 = current_node.n_primitives % Int32 if !current_node.is_interior && cnprim > Int32(0) - # Leaf node - test all primitives + # Leaf node - test all triangles offset = current_node.offset % Int32 for i in Int32(0):(cnprim - Int32(1)) - primitive = primitives[offset + i] - - # Call the callback for this primitive - continue_search, ray, result = hit_callback(primitive, ray, result) - # Early exit if callback requests it - if !continue_search - return false, ray, result + tri_idx = offset + i + compact_tri = triangles[tri_idx] + + # Use compact intersection + tmp_hit, t, u, v = intersect_compact_triangle(compact_tri, ray.o, ray.d, closest_t) + if tmp_hit && t < closest_t + closest_t = t + hit_found = true + hit_tri_idx = tri_idx + hit_u = u + hit_v = v end end @@ -317,11 +464,20 @@ Returns: current_node_idx = nodes_to_visit[to_visit_offset] else # Interior node - push children to stack - if dir_is_neg[current_node.split_axis] == Int32(2) - nodes_to_visit = _setindex(nodes_to_visit, to_visit_offset, current_node_idx + Int32(1)) + # Explicitly unroll axis cases to avoid LLVM select chains in SPIR-V + local is_neg = if current_node.split_axis == Int32(1) + dir_is_neg[1] == Int32(2) + elseif current_node.split_axis == Int32(2) + dir_is_neg[2] == Int32(2) + else # split_axis == 3 + dir_is_neg[3] == Int32(2) + end + + if is_neg + nodes_to_visit[to_visit_offset] = current_node_idx + Int32(1) current_node_idx = current_node.offset % Int32 else - nodes_to_visit = _setindex(nodes_to_visit, to_visit_offset, current_node.offset % Int32) + nodes_to_visit[to_visit_offset] = current_node.offset % Int32 current_node_idx += Int32(1) end to_visit_offset += Int32(1) @@ -335,74 +491,114 @@ Returns: current_node_idx = nodes_to_visit[to_visit_offset] end end - # Return final state - return continue_search, ray, result -end - -# Initialization -closest_hit_callback(primitive, ray, ::Nothing) = false, ray, (false, primitive, 0.0f0, Point3f(0.0)) -function closest_hit_callback(primitive, ray, prev_result::Tuple{Bool, P, Float32, Point3f}) where {P} - # Test intersection and update if closer - tmp_hit, ray, tmp_bary = intersect_p!(primitive, ray) - if tmp_hit - # Calculate distance from ray origin to hit point - distance = ray.t_max - return true, ray, (true, primitive, distance, tmp_bary) + # Return result + if hit_found + orig_tri = original_tris[hit_tri_idx] + w = 1f0 - hit_u - hit_v + 
# Use SVector for GPU compatibility - Point3f is just an alias for SVector + bary_point = SVector{3, Float32}(w, hit_u, hit_v) + return (true, orig_tri, closest_t, bary_point) + else + # Return dummy result matching standard BVH behavior + dummy_tri = original_tris[1] + bary_point = SVector{3, Float32}(0f0, 0f0, 0f0) + return (false, dummy_tri, 0f0, bary_point) end - # Always continue search to find closest - return true, ray, prev_result end """ - closest_hit(bvh::BVHAccel{P}, ray::AbstractRay) where {P} + any_hit(bvh::BVH, ray::AbstractRay) -Find the closest intersection between a ray and the primitives stored in a BVH. +Test if a ray intersects any primitive in the GPU BVH (for occlusion testing). +Stops at the first intersection found. Returns: -- `hit_found`: Boolean indicating if an intersection was found +- `hit_found`: Boolean indicating if any intersection was found - `hit_primitive`: The primitive that was hit (if any) - `distance`: Distance along the ray to the hit point (hit_point = ray.o + ray.d * distance) - `barycentric_coords`: Barycentric coordinates of the hit point """ -@inline function closest_hit(bvh::BVHAccel{P}, ray::AbstractRay, allocator=MemAllocator()) where {P} - # Traverse BVH with closest-hit callback - _, _, result = @inline traverse_bvh(closest_hit_callback, bvh, ray, allocator) - return result::Tuple{Bool, Triangle, Float32, Point3f} -end +@inline function any_hit(bvh::BVH, ray::AbstractRay, allocator=MemAllocator()) + ray = check_direction(ray) + inv_dir = 1f0 ./ ray.d + dir_is_neg = is_dir_negative(ray.d) + # Initialize traversal + local to_visit_offset::Int32 = Int32(1) + current_node_idx = Int32(1) + # Direct MVector construction for type stability (critical for GPU, especially OpenCL/SPIR-V) + nodes_to_visit = MVector{64, Int32}(undef) + nodes = bvh.nodes + triangles = bvh.triangles + original_tris = bvh.primitives -any_hit_callback(primitive, current_ray, result::Nothing) = (false, current_ray, (false, primitive, 0.0f0, Point3f(0.0))) + # Traverse BVH + @_inbounds while true + current_node = nodes[current_node_idx] -# Define any-hit callback -function any_hit_callback(primitive, current_ray, prev_result::Tuple{Bool, P, Float32, Point3f}) where {P} - # Test for intersection - tmp_hit, dist, tmp_bary = intersect(primitive, current_ray) - if tmp_hit - # Stop traversal on first hit and return hit info - return false, current_ray, (true, primitive, dist, tmp_bary) - end - # Continue search if no hit - return true, current_ray, prev_result -end + # Test ray against current node's bounding box + if intersect_p(current_node.bounds, ray, inv_dir, dir_is_neg) + local cnprim::Int32 = current_node.n_primitives % Int32 + if !current_node.is_interior && cnprim > Int32(0) + # Leaf node - test triangles + offset = current_node.offset % Int32 -""" - any_hit(bvh::BVHAccel, ray::AbstractRay) + for i in Int32(0):(cnprim - Int32(1)) + tri_idx = offset + i + compact_tri = triangles[tri_idx] + + # Test for any hit + tmp_hit, t, u, v = intersect_compact_triangle(compact_tri, ray.o, ray.d, ray.t_max) + if tmp_hit + # Return immediately on first hit + orig_tri = original_tris[tri_idx] + w = 1f0 - u - v + bary_point = SVector{3, Float32}(w, u, v) + return (true, orig_tri, t, bary_point) + end + end -Test if a ray intersects any primitive in the BVH (without finding the closest hit). -This function stops at the first intersection found, making it faster than closest_hit -when you only need to know if there's an intersection. 
+ # Done with leaf, pop next node from stack + if to_visit_offset === Int32(1) + break + end + to_visit_offset -= Int32(1) + current_node_idx = nodes_to_visit[to_visit_offset] + else + # Interior node - push children to stack + # Explicitly unroll axis cases to avoid LLVM select chains in SPIR-V + local is_neg = if current_node.split_axis == Int32(1) + dir_is_neg[1] == Int32(2) + elseif current_node.split_axis == Int32(2) + dir_is_neg[2] == Int32(2) + else # split_axis == 3 + dir_is_neg[3] == Int32(2) + end -Returns: -- `hit_found`: Boolean indicating if any intersection was found -- `hit_primitive`: The primitive that was hit (if any) -- `distance`: Distance along the ray to the hit point (hit_point = ray.o + ray.d * distance) -- `barycentric_coords`: Barycentric coordinates of the hit point -""" -@inline function any_hit(bvh::BVHAccel, ray::AbstractRay, allocator=MemAllocator()) - # Traverse BVH with any-hit callback - continue_search, _, result = traverse_bvh(any_hit_callback, bvh, ray, allocator) - return result::Tuple{Bool, Triangle, Float32, Point3f} + if is_neg + nodes_to_visit[to_visit_offset] = current_node_idx + Int32(1) + current_node_idx = current_node.offset % Int32 + else + nodes_to_visit[to_visit_offset] = current_node.offset % Int32 + current_node_idx += Int32(1) + end + to_visit_offset += Int32(1) + end + else + # Miss - pop next node from stack + if to_visit_offset === Int32(1) + break + end + to_visit_offset -= Int32(1) + current_node_idx = nodes_to_visit[to_visit_offset] + end + end + + # No hit found + dummy_tri = original_tris[1] + bary_point = SVector{3, Float32}(0f0, 0f0, 0f0) + return (false, dummy_tri, 0f0, bary_point) end function calculate_ray_grid_bounds(bounds::GeometryBasics.Rect, ray_direction::Vec3f) @@ -411,7 +607,7 @@ function calculate_ray_grid_bounds(bounds::GeometryBasics.Rect, ray_direction::V # 1. Find a plane perpendicular to the ray direction # We need two basis vectors that are perpendicular to the ray direction # First, find a non-parallel vector to create our first basis vector - if abs(direction[1]) < 0.9 + if abs(direction[1]) < 0.9f0 temp = Vec3f(1.0, 0.0, 0.0) else temp = Vec3f(0.0, 1.0, 0.0) @@ -476,11 +672,11 @@ function generate_ray_grid(grid_info, grid_size::Int) end """ - generate_ray_grid(bvh::BVHAccel, ray_direction::Vec3f, grid_size::Int) + generate_ray_grid(bvh::BVH, ray_direction::Vec3f, grid_size::Int) Generate a grid of ray origins based on the BVH bounding box and a given ray direction. """ -function generate_ray_grid(bvh::BVHAccel, ray_direction::Vec3f, grid_size::Int) +function generate_ray_grid(bvh::BVH, ray_direction::Vec3f, grid_size::Int) bounds = world_bound(bvh) bb = Rect3f(bounds.p_min, bounds.p_max .- bounds.p_min) grid_info = calculate_ray_grid_bounds(bb, ray_direction) @@ -488,10 +684,10 @@ function generate_ray_grid(bvh::BVHAccel, ray_direction::Vec3f, grid_size::Int) end -function GeometryBasics.Mesh(bvh::BVHAccel) +function GeometryBasics.Mesh(bvh::BVH) points = Point3f[] faces = GLTriangleFace[] - prims = bvh.primitives# sort(bvh.primitives; by=x -> x.material_idx) + prims = bvh.primitives # Use original triangles, not compact ones for (ti, tringle) in enumerate(prims) push!(points, tringle.vertices...) 
tt = ((ti - 1) * 3) + 1 @@ -501,38 +697,26 @@ function GeometryBasics.Mesh(bvh::BVHAccel) return GeometryBasics.Mesh(points, faces) end -# Pretty printing for BVHAccel -function Base.show(io::IO, ::MIME"text/plain", bvh::BVHAccel) - n_triangles = length(bvh.primitives) +# Pretty printing for BVH +function Base.show(io::IO, ::MIME"text/plain", bvh::BVH) + n_triangles = length(bvh.triangles) n_nodes = length(bvh.nodes) - bounds = world_bound(bvh) # Count leaf vs interior nodes n_leaves = count(node -> !node.is_interior, bvh.nodes) n_interior = n_nodes - n_leaves - # Calculate average primitives per leaf - total_leaf_prims = sum(node -> node.is_interior ? 0 : Int(node.n_primitives), bvh.nodes) - avg_prims_per_leaf = n_leaves > 0 ? total_leaf_prims / n_leaves : 0.0 - - println(io, "BVHAccel:") - println(io, " Triangles: ", n_triangles) + println(io, "BVH:") + println(io, " Triangles: ", n_triangles, " (pre-transformed)") println(io, " BVH nodes: ", n_nodes, " (", n_interior, " interior, ", n_leaves, " leaves)") - println(io, " Bounds: ", bounds.p_min, " to ", bounds.p_max) - println(io, " Max prims: ", Int(bvh.max_node_primitives), " per leaf") - print(io, " Avg prims: ", round(avg_prims_per_leaf, digits=2), " per leaf") + print(io, " Max prims: ", Int(bvh.max_node_primitives), " per leaf") end -function Base.show(io::IO, bvh::BVHAccel) +function Base.show(io::IO, bvh::BVH) if get(io, :compact, false) - n_triangles = length(bvh.primitives) + n_triangles = length(bvh.triangles) n_nodes = length(bvh.nodes) - n_leaves = count(node -> !node.is_interior, bvh.nodes) - n_interior = n_nodes - n_leaves - print(io, "BVHAccel(") - print(io, "triangles=", n_triangles, ", ") - print(io, "nodes=", n_nodes, " (", n_interior, " interior, ", n_leaves, " leaves)") - print(io, ")") + print(io, "BVH(triangles=", n_triangles, ", nodes=", n_nodes, ")") else show(io, MIME("text/plain"), bvh) end diff --git a/src/kernel-abstractions.jl b/src/kernel-abstractions.jl index 4992b9d..fa5446d 100644 --- a/src/kernel-abstractions.jl +++ b/src/kernel-abstractions.jl @@ -2,22 +2,27 @@ import KernelAbstractions as KA KA.@kernel some_kernel_f() = nothing +global PRESERVE = [] + function some_kernel(arr) backend = KA.get_backend(arr) return some_kernel_f(backend) end -function to_gpu(ArrayType, m::AbstractArray; preserve=[]) +function to_gpu(ArrayType, m::AbstractArray) arr = ArrayType(m) - push!(preserve, arr) + push!(PRESERVE, arr) + finalizer((arr) -> filter!(x-> x === arr, PRESERVE), arr) kernel = some_kernel(arr) return KA.argconvert(kernel, arr) end -# Conversion constructor for e.g. GPU arrays -# TODO, create tree on GPU? Not sure if that will gain much though... -function to_gpu(ArrayType, bvh::Raycore.BVHAccel; preserve=[]) - primitives = to_gpu(ArrayType, bvh.primitives; preserve=preserve) - nodes = to_gpu(ArrayType, bvh.nodes; preserve=preserve) - return Raycore.BVHAccel(primitives, bvh.max_node_primitives, nodes) +# GPU conversion for BVH +function to_gpu(ArrayType, bvh::Raycore.BVH) + nodes = to_gpu(ArrayType, bvh.nodes) + triangles = to_gpu(ArrayType, bvh.triangles) + primitives = to_gpu(ArrayType, bvh.primitives) + return Raycore.BVH(nodes, triangles, primitives, bvh.max_node_primitives) end + +gpu_int(x) = Base.unsafe_trunc(Int32, x) diff --git a/src/ray_intersection_session.jl b/src/ray_intersection_session.jl index 72cf025..28383d3 100644 --- a/src/ray_intersection_session.jl +++ b/src/ray_intersection_session.jl @@ -6,7 +6,7 @@ and the computed intersection results. 
# Fields - `rays::Vector{<:AbstractRay}`: Array of rays to trace -- `bvh::BVHAccel`: BVH acceleration structure to intersect against +- `bvh::BVH`: BVH acceleration structure to intersect against - `hit_function::F`: Function to use for intersection testing (e.g., `closest_hit` or `any_hit`) - `hits::Vector{Tuple{Bool, Triangle, Float32, Point3f}}`: Results of hit_function applied to each ray @@ -16,7 +16,7 @@ using Raycore, GeometryBasics # Create BVH from geometry sphere = Tesselation(Sphere(Point3f(0, 0, 1), 1.0f0), 20) -bvh = Raycore.BVHAccel([sphere]) +bvh = Raycore.BVH([sphere]) # Create rays rays = [ @@ -39,10 +39,10 @@ end struct RayIntersectionSession{Rays, F} hit_function::F rays::Rays - bvh::BVHAccel + bvh::BVH hits::Vector{Tuple{Bool, Triangle, Float32, Point3f}} - function RayIntersectionSession(hit_function::F, rays::Rays, bvh::BVHAccel) where {Rays,F} + function RayIntersectionSession(hit_function::F, rays::Rays, bvh::BVH) where {Rays,F} # Compute all hits hits = [hit_function(bvh, ray) for ray in rays] new{Rays, F}(hit_function, rays, bvh, hits) diff --git a/test/gpu-threading-benchmarks.jl b/test/gpu-threading-benchmarks.jl deleted file mode 100644 index 010bf2e..0000000 --- a/test/gpu-threading-benchmarks.jl +++ /dev/null @@ -1,234 +0,0 @@ -using GeometryBasics, LinearAlgebra, Raycore, BenchmarkTools - -# using CUDA -# ArrayType = CuArray - -LowSphere(radius, contact=Point3f(0)) = Sphere(contact .+ Point3f(0, 0, radius), radius) - -function tmesh(prim, material) - prim = prim isa Sphere ? Tesselation(prim, 64) : prim - return normal_mesh(prim) -end - - -begin - material_red = nothing - s1 = tmesh(LowSphere(0.5f0), material_red) - s2 = tmesh(LowSphere(0.3f0, Point3f(0.5, 0.5, 0)), material_red) - s3 = tmesh(LowSphere(0.3f0, Point3f(-0.5, 0.5, 0)), material_red) - s4 = tmesh(LowSphere(0.4f0, Point3f(0, 1.0, 0)), material_red) - - ground = tmesh(Rect3f(Vec3f(-5, -5, 0), Vec3f(10, 10, 0.01)), material_red) - back = tmesh(Rect3f(Vec3f(-5, -3, 0), Vec3f(10, 0.01, 10)), material_red) - l = tmesh(Rect3f(Vec3f(-2, -5, 0), Vec3f(0.01, 10, 10)), material_red) - r = tmesh(Rect3f(Vec3f(2, -5, 0), Vec3f(0.01, 10, 10)), material_red) - bvh = Raycore.BVHAccel([s1, s2, s3, s4, ground, back, l, r]); -end - -# using AMDGPU -# ArrayType = ROCArray -# using CUDA -# ArrayType = CuArray - -# using Metal -# ArrayType = MtlArray - -preserve = [] -gpu_scene = to_gpu(ArrayType, scene; preserve=preserve); -gpu_img = ArrayType(zeros(RGBf, res, res)); -# launch_trace_image!(img, cam, bvh, lights); -# @btime launch_trace_image!(img, cam, bvh, lights); -# @btime launch_trace_image!(gpu_img, cam, gpu_bvh, lights); -launch_trace_image!(gpu_img, cam, gpu_scene); -launch_trace_image!(img, cam, scene, lights) -# 76.420 ms (234 allocations: 86.05 KiB) -# 75.973 ms (234 allocations: 86.05 KiB) -Array(gpu_img) - -function cu_trace_image!(img, camera, bvh, lights) - x = threadIdx().x - y = threadIdx().y - if checkbounds(Bool, img, (x, y)) - @_inbounds img[x, y] = trace_pixel(camera, bvh, (x,y), lights) - end -end - -k = some_kernel(img) -ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(k, size(img), (16, 16)) -blocks = length(KA.blocks(iterspace)) -threads = length(KA.workitems(iterspace)) - -function cu_launch_trace_image!(img, camera, bvh, lights) - CUDA.@sync @cuda threads = length(img) cu_trace_image!(img, camera, bvh, lights) - return img -end -cu_launch_trace_image!(gpu_img, cam, gpu_bvh, lights); -Array(gpu_img) -# 380.081 ms (913 allocations: 23.55 KiB) -# CUDA (3070 mobile) -# 238.149 ms 
(46 allocations: 6.22 KiB) -# Int64 -> Int32 -# 65.34 m -# workgroupsize=(16,16) -# 31.022 ms (35 allocations: 5.89 KiB) - -function trace_image!(img, camera, scene) - for xy in CartesianIndices(size(img)) - @_inbounds img[xy] = RGBf(trace_pixel(camera, scene, xy).c...) - end - return img -end - -function threads_trace_image!(img, camera, bvh) - Threads.@threads for xy in CartesianIndices(size(img)) - @_inbounds img[xy] = trace_pixel(camera, bvh, xy) - end - return img -end - -@btime trace_image!(img, cam, bvh) -# Single: 707.754 ms (0 allocations: 0 bytes) -# New Triangle layout 1 -# 860.535 ms (0 allocations: 0 bytes) -# GPU intersection compatible -# 403.335 ms (0 allocations: 0 bytes) - -@btime threads_trace_image!(img, cam, bvh) -# Start -# Multi : 73.090 ms (262266 allocations: 156.04 MiB) -# BVH inline -# Multi (static): 66.564 ms (122 allocations: 45.62 KiB) -# New Triangle layout 1 -# 80.222 ms (122 allocations: 32.88 KiB) -# 42.842 ms (122 allocations: 32.88 KiB) (more inline) -# GPU intersection compatible -# 42.681 ms (122 allocations: 32.88 KiB) - - -using Tullio - -@_inbounds function tullio_trace_image!(img, camera, bvh) - @tullio img[x, y] = trace_pixel(camera, bvh, (x, y)) - return img -end - -@btime tullio_trace_image!(img, cam, bvh) -# BVH inline + tullio -# Multi: 150.944 ms (107 allocations: 33.17 KiB) -# New Triangle layout 1 -# 161.447 ms (107 allocations: 33.17 KiB) -# 117.139 ms (107 allocations: 33.17 KiB) (more inline) -# GPU intersection compatible -# 82.461 ms (109 allocations: 22.39 KiB) - -@btime launch_trace_image!(img, cam, bvh) -# 71.405 ms (233 allocations: 86.05 KiB) -# 47.240 ms (233 allocations: 86.09 KiB) -# GPU intersection compatible -# 44.629 ms (233 allocations: 54.50 KiB) - - -########################## -########################## -########################## -# Random benchmarks -v1 = Vec3f(0.0, 0.0, 0.0) -v2 = Vec3f(1.0, 0.0, 0.0) -v3 = Vec3f(0.0, 1.0, 0.0) - -ray_origin = Vec3f(0.5, 0.5, 1.0) -ray_direction = Vec3f(0.0, 0.0, -1.0) - -using Raycore: Normal3f -m = Raycore.TriangleMesh(Raycore.ShapeCore(), UInt32[1, 2, 3], Point3f[v1, v2, v3], [Normal3f(0.0, 0.0, 1.0), Normal3f(0.0, 0.0, 1.0), Normal3f(0.0, 0.0, 1.0)]) - -t = Raycore.Triangle(m, 1) -r = Raycore.Ray(o=Point3f(ray_origin), d=ray_direction) -Raycore.intersect_p(t, r) -Raycore.intersect_triangle(r.o, r.d, t.vertices...) - -# function launch_trace_image_ir!(img, camera, bvh, lights) -# backend = KA.get_backend(img) -# kernel! 
= ka_trace_image!(backend) -# open("test2.ir", "w") do io -# @device_code_llvm io begin -# kernel!(img, camera, bvh, lights, ndrange = size(img), workgroupsize = (16, 16)) -# end -# end -# AMDGPU.synchronize(; stop_hostcalls=false) -# return img -# end - -ray = Raycore.RayDifferentials(Raycore.Ray(o=Point3f(0.5, 0.5, 1.0), d=Vec3f(0.0, 0.0, -1.0))) -open("li.llvm", "w") do io - code_llvm(io, Raycore.li, typeof.((Raycore.UniformSampler(8), 5, ray, scene, 1))) -end - -open("li-wt.jl", "w") do io - code_warntype(io, Raycore.li, typeof.((Raycore.UniformSampler(8), 5, ray, scene, 1))) -end - -camera_sample = Raycore.get_camera_sample(integrator.sampler, Point2f(512)) -ray, ω = Raycore.generate_ray_differential(integrator.camera, camera_sample) - - -ray = Raycore.Ray(o=Point3f(0.0, 0.0, 2.0), d=Vec3f(0.0, 0.0, -1.0)) -function test(results, bvh, ray) - for i in 1:100000 - results[i] = Raycore.any_hit(bvh, ray, PerfNTuple) - end - return results -end - -@profview test(results, bvh, ray) -@btime Raycore.closest_hit(bvh, ray) -results = Vector{Tuple{Bool, Raycore.Triangle, Float32, Point3f}}(undef, 100000); -@btime test(results, bvh, ray); - -@btime Raycore.any_hit(bvh, ray) - -@code_typed Raycore.traverse_bvh(Raycore.any_hit_callback, bvh, ray, Raycore.MemAllocator()) - -sizeof(zeros(Raycore.MVector{64,Int32})) - -### -# Int32 always -# 42.000 μs (1 allocation: 624 bytes) -# Tuple instead of vector for nodes_to_visit -# 43.400 μs (1 allocation: 624 bytes) -# AFTER GPU rework -# closest_hit -# 40.500 μs (1 allocation: 368 bytes) -# intersect_p -# 11.500 μs (0 allocations: 0 bytes) - -### LinearBVHLeaf as one type -# 5.247460 seconds (17.55 k allocations: 19.783 MiB, 46 lock conflicts) - -struct PerfNTuple{N,T} - data::NTuple{N,T} -end - -@generated function Raycore._setindex(r::PerfNTuple{N,T}, idx::IT, value::T) where {N,T, IT <: Integer} - expr = Expr(:tuple) - for i in 1:N - idxt = IT(i) - push!(expr.args, :(idx === $idxt ? value : r.data[$idxt])) - end - return :($(PerfNTuple)($expr)) -end - -Base.@propagate_inbounds Base.getindex(r::PerfNTuple, idx::Integer) = r.data[idx] - -@generated function Raycore._allocate(::Type{PerfNTuple}, ::Type{T}, ::Val{N}) where {T,N} - expr = Expr(:tuple) - for i in 1:N - push!(expr.args, :($(T(0)))) - end - return :($(PerfNTuple){$N, $T}($expr)) -end - -m = Raycore._allocate(PerfNTuple, Int32, Val(64)) -m2 = Raycore._setindex(m, 10, Int32(42)) - -@btime Raycore.any_hit(bvh, ray, PerfNTuple) diff --git a/test/test_intersection.jl b/test/test_intersection.jl index 09a0ef0..6c93c98 100644 --- a/test/test_intersection.jl +++ b/test/test_intersection.jl @@ -71,7 +71,7 @@ end push!(triangle_meshes, mesh) end - bvh = Raycore.BVHAccel(triangle_meshes) + bvh = Raycore.BVH(triangle_meshes) # Test basic BVH functionality with triangle meshes @test !isnothing(Raycore.world_bound(bvh)) @@ -99,7 +99,7 @@ end push!(triangle_meshes, mesh) end - bvh = Raycore.BVHAccel(triangle_meshes) + bvh = Raycore.BVH(triangle_meshes) # Test that BVH can be created and has a valid bound bound = Raycore.world_bound(bvh) @test !isnothing(bound) diff --git a/test/test_type_stability.jl b/test/test_type_stability.jl index 695ee92..2c03157 100644 --- a/test/test_type_stability.jl +++ b/test/test_type_stability.jl @@ -52,7 +52,7 @@ end # BVH function gen_bvh_accel() mesh = Rect3f(Point3f(0), Vec3f(1)) - Raycore.BVHAccel([mesh], 1) + Raycore.BVH([mesh], 1) end # Quaternion