Use -mstackrealign for 32-bit systems and SSE2

Legrandin · Nov 14, 2022 · 80d6640 · 80d6640
1 parent 32f64d5
commit 80d6640
Show file tree

Hide file tree

Showing 6 changed files with 39 additions and 74 deletions.
diff --git a/compiler_opt.py b/compiler_opt.py
@@ -251,7 +251,7 @@ def compiler_is_clang():
     return test_compilation(source, msg="clang")
 
 
-def compiler_is_gcc():
+def compiler_is_gcc(extra_cc_options=[]):
     source = """
     #if defined(__clang__) || !defined(__GNUC__)
     #error Not GCC
@@ -260,20 +260,14 @@ def compiler_is_gcc():
     {
         return 0;
     }"""
-    return test_compilation(source, msg="gcc")
-
-
-def support_gcc_realign():
-    source = """
-    void __attribute__((force_align_arg_pointer)) a(void) {}
-    int main(void) { return 0; }
-    """
-    return test_compilation(source, msg="gcc")
+    return test_compilation(source,
+                            msg="gcc",
+                            extra_cc_options=extra_cc_options)
 
 
 def compiler_supports_sse2():
-    source = """
-    #include <intrin.h>
+    source_template = """
+    %s
     int main(void)
     {
         __m128i r0;
@@ -283,39 +277,31 @@ def compiler_supports_sse2():
         return mask;
     }
     """
-    if test_compilation(source, msg="SSE2(intrin.h)"):
-        return {'extra_cc_options': [], 'extra_macros': ['HAVE_INTRIN_H', 'USE_SSE2']}
 
-    source = """
-    #include <x86intrin.h>
-    int main(void)
-    {
-        __m128i r0;
-        int mask;
-        r0 = _mm_set1_epi32(0);
-        mask = _mm_movemask_epi8(r0);
-        return mask;
-    }
-    """
-    if test_compilation(source, extra_cc_options=['-msse2'], msg="SSE2(x86intrin.h)"):
-        return {'extra_cc_options': ['-msse2'], 'extra_macros': ['HAVE_X86INTRIN_H', 'USE_SSE2']}
+    source_intrin_h = source_template % "#include <intrin.h>"
+    source_x86intrin_h = source_template % "#include <x86intrin.h>"
+    source_xemmintrin_h = source_template % "#include <xmmintrin.h>\n#include <emmintrin.h>"
 
-    source = """
-    #include <xmmintrin.h>
-    #include <emmintrin.h>
-    int main(void)
-    {
-        __m128i r0;
-        int mask;
-        r0 = _mm_set1_epi32(0);
-        mask = _mm_movemask_epi8(r0);
-        return mask;
-    }
-    """
-    if test_compilation(source, extra_cc_options=['-msse2'], msg="SSE2(emmintrin.h)"):
-        return {'extra_cc_options': ['-msse2'], 'extra_macros': ['HAVE_EMMINTRIN_H', 'USE_SSE2']}
+    system_bits = 8 * struct.calcsize("P")
 
-    return False
+    result = None
+    if test_compilation(source_intrin_h, msg="SSE2(intrin.h)"):
+        result = {'extra_cc_options': [], 'extra_macros': ['HAVE_INTRIN_H', 'USE_SSE2']}
+    elif test_compilation(source_x86intrin_h, extra_cc_options=['-msse2'], msg="SSE2(x86intrin.h)"):
+        result = {'extra_cc_options': ['-msse2'], 'extra_macros': ['HAVE_X86INTRIN_H', 'USE_SSE2']}
+    elif test_compilation(source_xemmintrin_h, extra_cc_options=['-msse2'], msg="SSE2(emmintrin.h)"):
+        result = {'extra_cc_options': ['-msse2'], 'extra_macros': ['HAVE_EMMINTRIN_H', 'USE_SSE2']}
+    else:
+        result = False
+
+    # On 32-bit x86 platforms, gcc assumes the stack to be aligned to 16
+    # bytes, but the caller may actually only align it to 4 bytes, which
+    # makes functions crash if they use SSE2 intrinsics.
+    # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
+    if result and system_bits == 32 and compiler_is_gcc(extra_cc_options=['-mstackrealign']):
+        result['extra_cc_options'].append('-mstackrealign')
+
+    return result
 
 
 def remove_extension(extensions, name):
@@ -379,15 +365,6 @@ def set_compiler_options(package_root, extensions):
         for macro in sse2_result['extra_macros']:
             extra_macros.append((macro, None))
 
-    # Compiler specific settings
-    if gcc:
-        # On 32-bit x86 platforms, gcc assumes the stack to be aligned to 16
-        # bytes, but the caller may actually only align it to 4 bytes, which
-        # make functions crash if they use SSE2 intrinsics.
-        # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
-        if system_bits == 32 and support_gcc_realign():
-            extra_macros.append(("GCC_REALIGN", None))
-
     # Module-specific options
 
     # AESNI

diff --git a/src/AESNI.c b/src/AESNI.c
@@ -58,7 +58,7 @@ typedef struct {
 
 enum SubType { OnlySub, SubRotXor };
 
-static FUNC_SSE2 uint32_t sub_rot(uint32_t w, unsigned idx /** round/Nk **/, enum SubType subType)
+static uint32_t sub_rot(uint32_t w, unsigned idx /** round/Nk **/, enum SubType subType)
 {
     __m128i x, y, z;
 
@@ -90,7 +90,7 @@ static FUNC_SSE2 uint32_t sub_rot(uint32_t w, unsigned idx /** round/Nk **/, enu
     return (uint32_t)_mm_cvtsi128_si32(z);
 }
 
-static FUNC_SSE2 int expand_key(__m128i *erk, __m128i *drk, const uint8_t *key, unsigned Nk, unsigned Nr)
+static int expand_key(__m128i *erk, __m128i *drk, const uint8_t *key, unsigned Nk, unsigned Nr)
 {
     uint32_t rk[4*(14+2)];
     unsigned tot_words;
@@ -137,7 +137,7 @@ static FUNC_SSE2 int expand_key(__m128i *erk, __m128i *drk, const uint8_t *key,
     return 0;
 }
 
-static FUNC_SSE2 int AESNI_encrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
+static int AESNI_encrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
 {
     unsigned rounds;
     __m128i r[14+1];
@@ -246,7 +246,7 @@ static FUNC_SSE2 int AESNI_encrypt(const BlockBase *bb, const uint8_t *in, uint8
     return 0;
 }
 
-static FUNC_SSE2 int AESNI_decrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
+static int AESNI_decrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
 {
     unsigned rounds;
     __m128i r[14+1];

diff --git a/src/bignum.c b/src/bignum.c
@@ -154,7 +154,7 @@ STATIC void product(uint64_t *t, uint64_t *scratchpad, const uint64_t *a, const
  * @param words The number of words of a, b, and out
  * @return      0 for success, the appropriate code otherwise.
  */
-STATIC FUNC_SSE2 int mod_select(uint64_t *out, const uint64_t *a, const uint64_t *b, unsigned cond, size_t words)
+STATIC int mod_select(uint64_t *out, const uint64_t *a, const uint64_t *b, unsigned cond, size_t words)
 {
     uint64_t mask;
 #if defined(USE_SSE2)

diff --git a/src/common.h b/src/common.h
@@ -192,16 +192,4 @@ static inline const uint8_t* memchr_not(const uint8_t* s, int c, size_t n)
     return NULL;
 }
 
-/*
- * On 32-bit x86 platforms, gcc assumes the stack to be aligned to 16
- * bytes, but the caller may actually only align it to 4 bytes, which
- * make functions crash if they use SSE2 intrinsics.
- * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
- */
-#if defined(GCC_REALIGN)
-#define FUNC_SSE2 __attribute__((force_align_arg_pointer))
-#else
-#define FUNC_SSE2
-#endif
-
 #endif
diff --git a/src/ghash_clmul.c b/src/ghash_clmul.c
@@ -111,7 +111,7 @@ struct exp_key {
  *
  * See at the bottom for an explanation.
  */
-STATIC FUNC_SSE2 __m128i reduce(__m128i prod_high, __m128i prod_low)
+STATIC __m128i reduce(__m128i prod_high, __m128i prod_low)
 {
     const uint64_t c2 = (uint64_t)0xc2 << 56;
     __m128i t1, t2, t3, t4, t7;
@@ -133,7 +133,7 @@ STATIC FUNC_SSE2 __m128i reduce(__m128i prod_high, __m128i prod_low)
 /**
  * Perform the carry-less multiplication of two polynomials of degree 127.
  */
-STATIC FUNC_SSE2 void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, __m128i b)
+STATIC void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, __m128i b)
 {
     __m128i c, d, e, f, g, h, i;
 
@@ -151,7 +151,7 @@ STATIC FUNC_SSE2 void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, _
 /**
  * Multiply a polynomial of degree 127 by x, modulo p(x) = x^128 + x^127 + x^126 + x^121 + 1
  */
-STATIC FUNC_SSE2 __m128i multx(__m128i a)
+STATIC __m128i multx(__m128i a)
 {
     int msb;
     int64_t r;
@@ -179,7 +179,7 @@ STATIC FUNC_SSE2 __m128i multx(__m128i a)
 }
 
 /** Swap bytes in an XMM register **/
-STATIC FUNC_SSE2 __m128i swap(__m128i a)
+STATIC __m128i swap(__m128i a)
 {
     __m128i mask;
 
@@ -218,7 +218,7 @@ EXPORT_SYM int ghash_destroy_clmul(struct exp_key *expanded)
     return 0;
 }
 
-EXPORT_SYM FUNC_SSE2 int ghash_clmul(
+EXPORT_SYM int ghash_clmul(
         uint8_t y_out[16],
         const uint8_t block_data[],
         size_t len,

diff --git a/src/multiply_32.c b/src/multiply_32.c
@@ -50,7 +50,7 @@
  * Multiply a vector a[] by a scalar b. Add the result into vector t[],
  * starting at the given offset.
  */
-void static inline FUNC_SSE2 addmul32(uint32_t* t, size_t offset, const uint32_t *a, uint32_t b, size_t t_words, size_t a_words)
+void static inline addmul32(uint32_t* t, size_t offset, const uint32_t *a, uint32_t b, size_t t_words, size_t a_words)
 {
     uint32_t carry;
     size_t i;