From cce0b52e7bc7ca2b95fe9f95c8528cd87e787e33 Mon Sep 17 00:00:00 2001 From: Scott McMurray Date: Fri, 26 May 2023 01:23:55 -0700 Subject: [PATCH 1/2] Add a codegen test for manually swapping a small `Copy` type To confirm we're not just helping `mem::swap` --- tests/codegen/swap-small-types.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/codegen/swap-small-types.rs b/tests/codegen/swap-small-types.rs index 03e2a2327fc4c..6289d7af3a08c 100644 --- a/tests/codegen/swap-small-types.rs +++ b/tests/codegen/swap-small-types.rs @@ -8,10 +8,30 @@ use std::mem::swap; type RGB48 = [u16; 3]; +// CHECK-LABEL: @swap_rgb48_manually( +#[no_mangle] +pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) { + // CHECK-NOT: alloca + // CHECK: %temp = alloca [3 x i16] + // CHECK-NOT: alloca + // CHECK-NOT: call void @llvm.memcpy + // CHECK: call void @llvm.memcpy.{{.+}}({{.+}} %temp, {{.+}} %x, {{.+}} 6, {{.+}}) + // CHECK: call void @llvm.memcpy.{{.+}}({{.+}} %x, {{.+}} %y, {{.+}} 6, {{.+}}) + // CHECK: call void @llvm.memcpy.{{.+}}({{.+}} %y, {{.+}} %temp, {{.+}} 6, {{.+}}) + // CHECK-NOT: call void @llvm.memcpy + + let temp = *x; + *x = *y; + *y = temp; +} + // CHECK-LABEL: @swap_rgb48 #[no_mangle] pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) { // FIXME MIR inlining messes up LLVM optimizations. + // If these checks start failing, please update this test. + // CHECK: alloca [3 x i16] + // CHECK: call void @llvm.memcpy // WOULD-CHECK-NOT: alloca // WOULD-CHECK: load i48 // WOULD-CHECK: store i48 From e1b020df9f72eab7e8b3e38a5263ddda54ce18e1 Mon Sep 17 00:00:00 2001 From: Scott McMurray Date: Fri, 26 May 2023 03:32:22 -0700 Subject: [PATCH 2/2] Use `load`-`store` instead of `memcpy` for short integer arrays --- compiler/rustc_codegen_llvm/src/type_.rs | 3 ++ compiler/rustc_codegen_llvm/src/type_of.rs | 33 +++++++++++++++++ compiler/rustc_codegen_ssa/src/base.rs | 14 +++++++- .../rustc_codegen_ssa/src/traits/type_.rs | 22 ++++++++++++ tests/codegen/array-codegen.rs | 35 +++++++++++++++++++ tests/codegen/mem-replace-simple-type.rs | 11 ++++++ tests/codegen/swap-simd-types.rs | 9 +++++ tests/codegen/swap-small-types.rs | 25 ++++++------- 8 files changed, 136 insertions(+), 16 deletions(-) create mode 100644 tests/codegen/array-codegen.rs diff --git a/compiler/rustc_codegen_llvm/src/type_.rs b/compiler/rustc_codegen_llvm/src/type_.rs index d3fad5699c800..4ffa2b9c6a39d 100644 --- a/compiler/rustc_codegen_llvm/src/type_.rs +++ b/compiler/rustc_codegen_llvm/src/type_.rs @@ -288,6 +288,9 @@ impl<'ll, 'tcx> LayoutTypeMethods<'tcx> for CodegenCx<'ll, 'tcx> { fn reg_backend_type(&self, ty: &Reg) -> &'ll Type { ty.llvm_type(self) } + fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option { + layout.scalar_copy_llvm_type(self) + } } impl<'ll, 'tcx> TypeMembershipMethods<'tcx> for CodegenCx<'ll, 'tcx> { diff --git a/compiler/rustc_codegen_llvm/src/type_of.rs b/compiler/rustc_codegen_llvm/src/type_of.rs index a493c9c0548e9..3339e4e07edd3 100644 --- a/compiler/rustc_codegen_llvm/src/type_of.rs +++ b/compiler/rustc_codegen_llvm/src/type_of.rs @@ -6,6 +6,7 @@ use rustc_middle::bug; use rustc_middle::ty::layout::{FnAbiOf, LayoutOf, TyAndLayout}; use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths}; use rustc_middle::ty::{self, Ty, TypeVisitableExt}; +use rustc_target::abi::HasDataLayout; use rustc_target::abi::{Abi, Align, FieldsShape}; use rustc_target::abi::{Int, Pointer, F32, F64}; use rustc_target::abi::{PointeeInfo, Scalar, Size, TyAbiInterface, Variants}; @@ -192,6 +193,7 @@ pub trait LayoutLlvmExt<'tcx> { ) -> &'a Type; fn llvm_field_index<'a>(&self, cx: &CodegenCx<'a, 'tcx>, index: usize) -> u64; fn pointee_info_at<'a>(&self, cx: &CodegenCx<'a, 'tcx>, offset: Size) -> Option; + fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type>; } impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> { @@ -414,4 +416,35 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> { cx.pointee_infos.borrow_mut().insert((self.ty, offset), result); result } + + fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type> { + debug_assert!(self.is_sized()); + + // FIXME: this is a fairly arbitrary choice, but 128 bits on WASM + // (matching the 128-bit SIMD types proposal) and 256 bits on x64 + // (like AVX2 registers) seems at least like a tolerable starting point. + let threshold = cx.data_layout().pointer_size * 4; + if self.layout.size() > threshold { + return None; + } + + // Vectors, even for non-power-of-two sizes, have the same layout as + // arrays but don't count as aggregate types + if let FieldsShape::Array { count, .. } = self.layout.fields() + && let element = self.field(cx, 0) + && element.ty.is_integral() + { + // `cx.type_ix(bits)` is tempting here, but while that works great + // for things that *stay* as memory-to-memory copies, it also ends + // up suppressing vectorization as it introduces shifts when it + // extracts all the individual values. + + let ety = element.llvm_type(cx); + return Some(cx.type_vector(ety, *count)); + } + + // FIXME: The above only handled integer arrays; surely more things + // would also be possible. Be careful about provenance, though! + None + } } diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs index 242d209b68425..dc4a28c866ff3 100644 --- a/compiler/rustc_codegen_ssa/src/base.rs +++ b/compiler/rustc_codegen_ssa/src/base.rs @@ -380,7 +380,19 @@ pub fn memcpy_ty<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( return; } - bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags); + if flags == MemFlags::empty() + && let Some(bty) = bx.cx().scalar_copy_backend_type(layout) + { + // I look forward to only supporting opaque pointers + let pty = bx.type_ptr_to(bty); + let src = bx.pointercast(src, pty); + let dst = bx.pointercast(dst, pty); + + let temp = bx.load(bty, src, src_align); + bx.store(temp, dst, dst_align); + } else { + bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags); + } } pub fn codegen_instance<'a, 'tcx: 'a, Bx: BuilderMethods<'a, 'tcx>>( diff --git a/compiler/rustc_codegen_ssa/src/traits/type_.rs b/compiler/rustc_codegen_ssa/src/traits/type_.rs index 36d9864221bcb..e64417e1a4a69 100644 --- a/compiler/rustc_codegen_ssa/src/traits/type_.rs +++ b/compiler/rustc_codegen_ssa/src/traits/type_.rs @@ -126,6 +126,28 @@ pub trait LayoutTypeMethods<'tcx>: Backend<'tcx> { index: usize, immediate: bool, ) -> Self::Type; + + /// A type that can be used in a [`super::BuilderMethods::load`] + + /// [`super::BuilderMethods::store`] pair to implement a *typed* copy, + /// such as a MIR `*_0 = *_1`. + /// + /// It's always legal to return `None` here, as the provided impl does, + /// in which case callers should use [`super::BuilderMethods::memcpy`] + /// instead of the `load`+`store` pair. + /// + /// This can be helpful for things like arrays, where the LLVM backend type + /// `[3 x i16]` optimizes to three separate loads and stores, but it can + /// instead be copied via an `i48` that stays as the single `load`+`store`. + /// (As of 2023-05 LLVM cannot necessarily optimize away a `memcpy` in these + /// cases, due to `poison` handling, but in codegen we have more information + /// about the type invariants, so can emit something better instead.) + /// + /// This *should* return `None` for particularly-large types, where leaving + /// the `memcpy` may well be important to avoid code size explosion. + fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option { + let _ = layout; + None + } } // For backends that support CFI using type membership (i.e., testing whether a given pointer is diff --git a/tests/codegen/array-codegen.rs b/tests/codegen/array-codegen.rs new file mode 100644 index 0000000000000..98488eb92ee02 --- /dev/null +++ b/tests/codegen/array-codegen.rs @@ -0,0 +1,35 @@ +// compile-flags: -O -C no-prepopulate-passes +// min-llvm-version: 15.0 (for opaque pointers) + +#![crate_type = "lib"] + +// CHECK-LABEL: @array_load +#[no_mangle] +pub fn array_load(a: &[u8; 4]) -> [u8; 4] { + // CHECK: %0 = alloca [4 x i8], align 1 + // CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1 + // CHECK: store <4 x i8> %[[TEMP1]], ptr %0, align 1 + // CHECK: %[[TEMP2:.+]] = load i32, ptr %0, align 1 + // CHECK: ret i32 %[[TEMP2]] + *a +} + +// CHECK-LABEL: @array_store +#[no_mangle] +pub fn array_store(a: [u8; 4], p: &mut [u8; 4]) { + // CHECK: %a = alloca [4 x i8] + // CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1 + // CHECK-NEXT: store <4 x i8> %[[TEMP]], ptr %p, align 1 + *p = a; +} + +// CHECK-LABEL: @array_copy +#[no_mangle] +pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) { + // CHECK: %[[LOCAL:.+]] = alloca [4 x i8], align 1 + // CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1 + // CHECK: store <4 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1 + // CHECK: %[[TEMP2:.+]] = load <4 x i8>, ptr %[[LOCAL]], align 1 + // CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1 + *p = *a; +} diff --git a/tests/codegen/mem-replace-simple-type.rs b/tests/codegen/mem-replace-simple-type.rs index 4253ef1366604..6151177de15b2 100644 --- a/tests/codegen/mem-replace-simple-type.rs +++ b/tests/codegen/mem-replace-simple-type.rs @@ -32,3 +32,14 @@ pub fn replace_ref_str<'a>(r: &mut &'a str, v: &'a str) -> &'a str { // CHECK: ret { ptr, i64 } %[[P2]] std::mem::replace(r, v) } + +#[no_mangle] +// CHECK-LABEL: @replace_short_array( +pub fn replace_short_array(r: &mut [u32; 3], v: [u32; 3]) -> [u32; 3] { + // CHECK-NOT: alloca + // CHECK: %[[R:.+]] = load <3 x i32>, ptr %r, align 4 + // CHECK: store <3 x i32> %[[R]], ptr %0 + // CHECK: %[[V:.+]] = load <3 x i32>, ptr %v, align 4 + // CHECK: store <3 x i32> %[[V]], ptr %r + std::mem::replace(r, v) +} diff --git a/tests/codegen/swap-simd-types.rs b/tests/codegen/swap-simd-types.rs index c90b277eb4487..3472a42b0e65e 100644 --- a/tests/codegen/swap-simd-types.rs +++ b/tests/codegen/swap-simd-types.rs @@ -30,3 +30,12 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) { x.swap_with_slice(y); } } + +// CHECK-LABEL: @swap_bytes32 +#[no_mangle] +pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) { +// CHECK-NOT: alloca +// CHECK: load <32 x i8>{{.+}}align 1 +// CHECK: store <32 x i8>{{.+}}align 1 + swap(x, y) +} diff --git a/tests/codegen/swap-small-types.rs b/tests/codegen/swap-small-types.rs index 6289d7af3a08c..419645a3fc6bc 100644 --- a/tests/codegen/swap-small-types.rs +++ b/tests/codegen/swap-small-types.rs @@ -1,4 +1,4 @@ -// compile-flags: -O +// compile-flags: -O -Z merge-functions=disabled // only-x86_64 // ignore-debug: the debug assertions get in the way @@ -12,13 +12,10 @@ type RGB48 = [u16; 3]; #[no_mangle] pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) { // CHECK-NOT: alloca - // CHECK: %temp = alloca [3 x i16] - // CHECK-NOT: alloca - // CHECK-NOT: call void @llvm.memcpy - // CHECK: call void @llvm.memcpy.{{.+}}({{.+}} %temp, {{.+}} %x, {{.+}} 6, {{.+}}) - // CHECK: call void @llvm.memcpy.{{.+}}({{.+}} %x, {{.+}} %y, {{.+}} 6, {{.+}}) - // CHECK: call void @llvm.memcpy.{{.+}}({{.+}} %y, {{.+}} %temp, {{.+}} 6, {{.+}}) - // CHECK-NOT: call void @llvm.memcpy + // CHECK: %[[TEMP0:.+]] = load <3 x i16>, ptr %x, align 2 + // CHECK: %[[TEMP1:.+]] = load <3 x i16>, ptr %y, align 2 + // CHECK: store <3 x i16> %[[TEMP1]], ptr %x, align 2 + // CHECK: store <3 x i16> %[[TEMP0]], ptr %y, align 2 let temp = *x; *x = *y; @@ -28,13 +25,11 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) { // CHECK-LABEL: @swap_rgb48 #[no_mangle] pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) { - // FIXME MIR inlining messes up LLVM optimizations. - // If these checks start failing, please update this test. - // CHECK: alloca [3 x i16] - // CHECK: call void @llvm.memcpy -// WOULD-CHECK-NOT: alloca -// WOULD-CHECK: load i48 -// WOULD-CHECK: store i48 + // CHECK-NOT: alloca + // CHECK: load <3 x i16> + // CHECK: load <3 x i16> + // CHECK: store <3 x i16> + // CHECK: store <3 x i16> swap(x, y) }