diff --git a/src/f32x8_.rs b/src/f32x8_.rs index a24517af..aff994f0 100644 --- a/src/f32x8_.rs +++ b/src/f32x8_.rs @@ -567,6 +567,119 @@ impl f32x8 { self ^ (signs & Self::from(-0.0)) } + #[inline] + #[must_use] + #[allow(non_upper_case_globals)] + pub fn asin_acos(self) -> (Self, Self) { + // Based on the Agner Fog "vector class library": + // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h + const_f32_as_f32x8!(P4asinf, 4.2163199048E-2); + const_f32_as_f32x8!(P3asinf, 2.4181311049E-2); + const_f32_as_f32x8!(P2asinf, 4.5470025998E-2); + const_f32_as_f32x8!(P1asinf, 7.4953002686E-2); + const_f32_as_f32x8!(P0asinf, 1.6666752422E-1); + + let xa = self.abs(); + let big = xa.cmp_ge(f32x8::splat(0.5)); + + let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa); + let x2 = xa * xa; + let x3 = big.blend(x1, x2); + + let xb = x1.sqrt(); + + let x4 = big.blend(xb, xa); + + let z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf); + let z = z.mul_add(x3 * x4, x4); + + let z1 = z + z; + + // acos + let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1); + let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self); + let acos = big.blend(z3, z4); + + // asin + let z3 = f32x8::FRAC_PI_2 - z1; + let asin = big.blend(z3, z); + let asin = asin.flip_signs(self); + + (asin, acos) + } + + #[inline] + #[must_use] + #[allow(non_upper_case_globals)] + pub fn asin(self) -> Self { + // Based on the Agner Fog "vector class library": + // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h + const_f32_as_f32x8!(P4asinf, 4.2163199048E-2); + const_f32_as_f32x8!(P3asinf, 2.4181311049E-2); + const_f32_as_f32x8!(P2asinf, 4.5470025998E-2); + const_f32_as_f32x8!(P1asinf, 7.4953002686E-2); + const_f32_as_f32x8!(P0asinf, 1.6666752422E-1); + + let xa = self.abs(); + let big = xa.cmp_ge(f32x8::splat(0.5)); + + let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa); + let x2 = xa * xa; + let x3 = big.blend(x1, x2); + + let xb = x1.sqrt(); + + let x4 = big.blend(xb, xa); + + let z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf); + let z = z.mul_add(x3 * x4, x4); + + let z1 = z + z; + + // asin + let z3 = f32x8::FRAC_PI_2 - z1; + let asin = big.blend(z3, z); + let asin = asin.flip_signs(self); + + asin + } + + #[inline] + #[must_use] + #[allow(non_upper_case_globals)] + pub fn acos(self) -> Self { + // Based on the Agner Fog "vector class library": + // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h + const_f32_as_f32x8!(P4asinf, 4.2163199048E-2); + const_f32_as_f32x8!(P3asinf, 2.4181311049E-2); + const_f32_as_f32x8!(P2asinf, 4.5470025998E-2); + const_f32_as_f32x8!(P1asinf, 7.4953002686E-2); + const_f32_as_f32x8!(P0asinf, 1.6666752422E-1); + + let xa = self.abs(); + let big = xa.cmp_ge(f32x8::splat(0.5)); + + let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa); + let x2 = xa * xa; + let x3 = big.blend(x1, x2); + + let xb = x1.sqrt(); + + let x4 = big.blend(xb, xa); + + let z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf); + let z = z.mul_add(x3 * x4, x4); + + let z1 = z + z; + + // acos + let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1); + let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self); + let acos = big.blend(z3, z4); + + acos + } + #[inline] #[must_use] #[allow(non_upper_case_globals)] @@ -863,3 +976,11 @@ impl Not for f32x8 { } } } + +#[must_use] +#[inline] +fn polynomial_4(x: f32x8, c0: f32x8, c1: f32x8, c2: f32x8, c3: f32x8, c4: f32x8) -> f32x8 { + let x2 = x * x; + let x4 = x2 * x2; + c3.mul_add(x, c2).mul_add(x2, c1.mul_add(x, c0) + c4 * x4) +} \ No newline at end of file diff --git a/src/f64x4_.rs b/src/f64x4_.rs index 92591339..380d86d6 100644 --- a/src/f64x4_.rs +++ b/src/f64x4_.rs @@ -472,6 +472,256 @@ impl f64x4 { self ^ (signs & Self::from(-0.0)) } + #[inline] + #[must_use] + #[allow(non_upper_case_globals)] + pub fn asin_acos(self) -> (Self, Self) { + // Based on the Agner Fog "vector class library": + // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h + const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3); + const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1); + const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0); + const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1); + const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1); + + const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1); + const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2); + const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2); + const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2); + + const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3); + const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1); + const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0); + const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1); + const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1); + const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0); + + const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1); + const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1); + const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2); + const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2); + const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1); + + let xa = self.abs(); + + let big = xa.cmp_ge(f64x4::splat(0.625)); + + let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa); + + let x2 = x1 * x1; + let x3 = x2 * x1; + let x4 = x2 * x2; + let x5 = x4 * x1; + + let dobig = big.any(); + let dosmall = !big.all(); + + let mut rx = f64x4::default(); + let mut sx = f64x4::default(); + let mut px = f64x4::default(); + let mut qx = f64x4::default(); + + if dobig { + rx = x3.mul_add(R3asin, x2 * R2asin) + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin)); + sx = x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin)); + } + if dosmall { + px = x3.mul_add(P3asin, P0asin) + x4.mul_add(P4asin, x1 * P1asin) + x5.mul_add(P5asin, x2 * P2asin); + qx = x4.mul_add(Q4asin, x5) + x3.mul_add(Q3asin, x1*Q1asin) + x2.mul_add(Q2asin, Q0asin); + }; + + let vx = big.blend(rx, px); + let wx = big.blend(sx, qx); + + let y1 = vx / wx * x1; + + let mut z1 = f64x4::default(); + let mut z2 = f64x4::default(); + if dobig { + let xb = (x1 + x1).sqrt(); + z1 = xb.mul_add(y1, xb); + } + + if dosmall { + z2 = xa.mul_add(y1, xa); + } + + // asin + let z3 = f64x4::FRAC_PI_2 - z1; + let asin = big.blend(z3, z2); + let asin = asin.flip_signs(self); + + // acos + let z3 = self.cmp_lt(f64x4::ZERO).blend(f64x4::PI - z1, z1); + let z4 = f64x4::FRAC_PI_2 - z2.flip_signs(self); + let acos = big.blend(z3, z4); + + (asin, acos) + } + + #[inline] + #[must_use] + #[allow(non_upper_case_globals)] + pub fn acos(self) -> Self { + // Based on the Agner Fog "vector class library": + // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h + const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3); + const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1); + const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0); + const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1); + const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1); + + const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1); + const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2); + const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2); + const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2); + + const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3); + const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1); + const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0); + const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1); + const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1); + const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0); + + const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1); + const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1); + const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2); + const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2); + const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1); + + let xa = self.abs(); + + let big = xa.cmp_ge(f64x4::splat(0.625)); + + let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa); + + let x2 = x1 * x1; + let x3 = x2 * x1; + let x4 = x2 * x2; + let x5 = x4 * x1; + + let dobig = big.any(); + let dosmall = !big.all(); + + let mut rx = f64x4::default(); + let mut sx = f64x4::default(); + let mut px = f64x4::default(); + let mut qx = f64x4::default(); + + if dobig { + rx = x3.mul_add(R3asin, x2 * R2asin) + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin)); + sx = x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin)); + } + if dosmall { + px = x3.mul_add(P3asin, P0asin) + x4.mul_add(P4asin, x1 * P1asin) + x5.mul_add(P5asin, x2 * P2asin); + qx = x4.mul_add(Q4asin, x5) + x3.mul_add(Q3asin, x1*Q1asin) + x2.mul_add(Q2asin, Q0asin); + }; + + let vx = big.blend(rx, px); + let wx = big.blend(sx, qx); + + let y1 = vx / wx * x1; + + let mut z1 = f64x4::default(); + let mut z2 = f64x4::default(); + if dobig { + let xb = (x1 + x1).sqrt(); + z1 = xb.mul_add(y1, xb); + } + + if dosmall { + z2 = xa.mul_add(y1, xa); + } + + // acos + let z3 = self.cmp_lt(f64x4::ZERO).blend(f64x4::PI - z1, z1); + let z4 = f64x4::FRAC_PI_2 - z2.flip_signs(self); + let acos = big.blend(z3, z4); + + acos + } + #[inline] + #[must_use] + #[allow(non_upper_case_globals)] + pub fn asin(self) -> Self { + // Based on the Agner Fog "vector class library": + // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h + const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3); + const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1); + const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0); + const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1); + const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1); + + const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1); + const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2); + const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2); + const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2); + + const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3); + const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1); + const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0); + const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1); + const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1); + const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0); + + const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1); + const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1); + const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2); + const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2); + const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1); + + let xa = self.abs(); + + let big = xa.cmp_ge(f64x4::splat(0.625)); + + let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa); + + let x2 = x1 * x1; + let x3 = x2 * x1; + let x4 = x2 * x2; + let x5 = x4 * x1; + + let dobig = big.any(); + let dosmall = !big.all(); + + let mut rx = f64x4::default(); + let mut sx = f64x4::default(); + let mut px = f64x4::default(); + let mut qx = f64x4::default(); + + if dobig { + rx = x3.mul_add(R3asin, x2 * R2asin) + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin)); + sx = x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin)); + } + if dosmall { + px = x3.mul_add(P3asin, P0asin) + x4.mul_add(P4asin, x1 * P1asin) + x5.mul_add(P5asin, x2 * P2asin); + qx = x4.mul_add(Q4asin, x5) + x3.mul_add(Q3asin, x1*Q1asin) + x2.mul_add(Q2asin, Q0asin); + }; + + let vx = big.blend(rx, px); + let wx = big.blend(sx, qx); + + let y1 = vx / wx * x1; + + let mut z1 = f64x4::default(); + let mut z2 = f64x4::default(); + if dobig { + let xb = (x1 + x1).sqrt(); + z1 = xb.mul_add(y1, xb); + } + + if dosmall { + z2 = xa.mul_add(y1, xa); + } + + // asin + let z3 = f64x4::FRAC_PI_2 - z1; + let asin = big.blend(z3, z2); + let asin = asin.flip_signs(self); + + asin + } + #[inline] #[must_use] #[allow(non_upper_case_globals)] diff --git a/tests/t_f32x4.rs b/tests/t_f32x4.rs index 0d77e414..5ee6d6cd 100644 --- a/tests/t_f32x4.rs +++ b/tests/t_f32x4.rs @@ -295,6 +295,8 @@ fn impl_f32x4_sin_cos() { } } +// FIXME: remove cfg requirement once masks as their own types are implemented +#[cfg(target_feature = "sse")] #[test] fn impl_f32x4_asin_acos() { let inc = 1.0 / 2501.0 / 4.0; @@ -322,6 +324,8 @@ fn impl_f32x4_asin_acos() { } } +// FIXME: remove cfg requirement once masks as their own types are implemented +#[cfg(target_feature = "sse")] #[test] fn impl_f32x4_asin() { let inc = 1.0 / 2501.0 / 4.0; @@ -348,6 +352,8 @@ fn impl_f32x4_asin() { } } +// FIXME: remove cfg requirement once masks as their own types are implemented +#[cfg(target_feature = "sse")] #[test] fn impl_f32x4_acos() { let inc = 1.0 / 2501.0 / 4.0; diff --git a/tests/t_f32x8.rs b/tests/t_f32x8.rs index 4b853011..d36594d5 100644 --- a/tests/t_f32x8.rs +++ b/tests/t_f32x8.rs @@ -311,6 +311,90 @@ fn impl_f32x8_flip_signs() { assert_eq!(expected, actual); } +// FIXME: remove cfg requirement once masks as their own types are implemented +#[cfg(target_feature = "avx")] +#[test] +fn impl_f32x8_asin_acos() { + let inc = 1.0 / 2501.0 / 8.0; + for x in -2500..=2500 { + let base = (x * 8) as f32 * inc; + let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc, base + 4.0 * inc, base + 5.0 * inc, base + 6.0 * inc, base + 7.0 * inc]; + let (actual_asins, actual_acoses) = f32x8::from(origs).asin_acos(); + for i in 0..8 { + let orig = origs[i]; + let check = |name: &str, vals: f32x8, expected: f32| { + let actual_arr: [f32; 8] = cast(vals); + let actual = actual_arr[i]; + assert!( + (actual - expected).abs() < 0.0000006, + "Wanted {name}({orig}) to be {expected} but got {actual}", + name = name, + orig = orig, + expected = expected, + actual = actual + ); + }; + check("asin", actual_asins, orig.asin()); + check("acos", actual_acoses, orig.acos()); + } + } +} + +// FIXME: remove cfg requirement once masks as their own types are implemented +#[cfg(target_feature = "avx")] +#[test] +fn impl_f32x8_asin() { + let inc = 1.0 / 2501.0 / 8.0; + for x in -2500..=2500 { + let base = (x * 4) as f32 * inc; + let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc, base + 4.0 * inc, base + 5.0 * inc, base + 6.0 * inc, base + 7.0 * inc]; + let actual_asins = f32x8::from(origs).asin(); + for i in 0..8 { + let orig = origs[i]; + let check = |name: &str, vals: f32x8, expected: f32| { + let actual_arr: [f32; 8] = cast(vals); + let actual = actual_arr[i]; + assert!( + (actual - expected).abs() < 0.0000006, + "Wanted {name}({orig}) to be {expected} but got {actual}", + name = name, + orig = orig, + expected = expected, + actual = actual + ); + }; + check("asin", actual_asins, orig.asin()); + } + } +} + +// FIXME: remove cfg requirement once masks as their own types are implemented +#[cfg(target_feature = "avx")] +#[test] +fn impl_f32x8_acos() { + let inc = 1.0 / 2501.0 / 8.0; + for x in -2500..=2500 { + let base = (x * 8) as f32 * inc; + let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc, base + 4.0 * inc, base + 5.0 * inc, base + 6.0 * inc, base + 7.0 * inc]; + let actual_acoses = f32x8::from(origs).acos(); + for i in 0..8 { + let orig = origs[i]; + let check = |name: &str, vals: f32x8, expected: f32| { + let actual_arr: [f32; 8] = cast(vals); + let actual = actual_arr[i]; + assert!( + (actual - expected).abs() < 0.0000006, + "Wanted {name}({orig}) to be {expected} but got {actual}", + name = name, + orig = orig, + expected = expected, + actual = actual + ); + }; + check("acos", actual_acoses, orig.acos()); + } + } +} #[test] fn impl_f32x8_sin_cos() { for x in -2500..=2500 { diff --git a/tests/t_f64x4.rs b/tests/t_f64x4.rs index d451fea0..88cd6e6f 100644 --- a/tests/t_f64x4.rs +++ b/tests/t_f64x4.rs @@ -269,6 +269,91 @@ fn impl_f64x4_flip_signs() { assert_eq!(expected, actual); } +// FIXME: remove cfg requirement once masks as their own types are implemented +#[cfg(target_feature = "sse")] +#[test] +fn impl_f64x4_asin_acos() { + let inc = 1.0 / 2501.0 / 4.0; + for x in -2500..=2500 { + let base = (x * 4) as f64 * inc; + let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc]; + let (actual_asins, actual_acoses) = f64x4::from(origs).asin_acos(); + for i in 0..4 { + let orig = origs[i]; + let check = |name: &str, vals: f64x4, expected: f64| { + let actual_arr: [f64; 4] = cast(vals); + let actual = actual_arr[i]; + assert!( + (actual - expected).abs() < 0.0000006, + "Wanted {name}({orig}) to be {expected} but got {actual}", + name = name, + orig = orig, + expected = expected, + actual = actual + ); + }; + check("asin", actual_asins, orig.asin()); + check("acos", actual_acoses, orig.acos()); + } + } +} + +// FIXME: remove cfg requirement once masks as their own types are implemented +#[cfg(target_feature = "sse")] +#[test] +fn impl_f64x4_asin() { + let inc = 1.0 / 2501.0 / 4.0; + for x in -2500..=2500 { + let base = (x * 4) as f64 * inc; + let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc]; + let actual_asins = f64x4::from(origs).asin(); + for i in 0..4 { + let orig = origs[i]; + let check = |name: &str, vals: f64x4, expected: f64| { + let actual_arr: [f64; 4] = cast(vals); + let actual = actual_arr[i]; + assert!( + (actual - expected).abs() < 0.0000006, + "Wanted {name}({orig}) to be {expected} but got {actual}", + name = name, + orig = orig, + expected = expected, + actual = actual + ); + }; + check("asin", actual_asins, orig.asin()); + } + } +} + +// FIXME: remove cfg requirement once masks as their own types are implemented +#[cfg(target_feature = "sse")] +#[test] +fn impl_f64x4_acos() { + let inc = 1.0 / 2501.0 / 4.0; + for x in -2500..=2500 { + let base = (x * 4) as f64 * inc; + let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc]; + let actual_acoses = f64x4::from(origs).acos(); + for i in 0..4 { + let orig = origs[i]; + let check = |name: &str, vals: f64x4, expected: f64| { + let actual_arr: [f64; 4] = cast(vals); + let actual = actual_arr[i]; + assert!( + (actual - expected).abs() < 0.0000006, + "Wanted {name}({orig}) to be {expected} but got {actual}", + name = name, + orig = orig, + expected = expected, + actual = actual + ); + }; + check("acos", actual_acoses, orig.acos()); + } + } +} + #[test] fn impl_f64x4_sin_cos() { for x in -2500..=2500 {