Skip to content

Commit

Permalink
Merge branch 'gcc_stack_align'
Browse files Browse the repository at this point in the history
  • Loading branch information
Legrandin committed Nov 11, 2019
2 parents d442d9c + 31c23ca commit ef1e4a4
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 27 deletions.
17 changes: 17 additions & 0 deletions compiler_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,14 @@ def compiler_is_gcc():
return test_compilation(source, msg="gcc")


def support_gcc_realign():
    """Return True if the compiler accepts the GCC-specific
    ``force_align_arg_pointer`` function attribute.

    On 32-bit x86, gcc assumes the stack is 16-byte aligned on function
    entry, but callers may only guarantee 4-byte alignment; the attribute
    makes gcc realign the stack so functions using SSE2 intrinsics do not
    crash. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
    """
    source = """
    void __attribute__((force_align_arg_pointer)) a(void) {}
    int main(void) { return 0; }
    """
    # msg labels this probe in the build output; "gcc" alone was a
    # copy-paste from compiler_is_gcc() and was misleading in logs.
    return test_compilation(source, msg="gcc realign")


def compiler_supports_sse2():
source = """
#include <intrin.h>
Expand Down Expand Up @@ -371,6 +379,15 @@ def set_compiler_options(package_root, extensions):
for macro in sse2_result['extra_macros']:
extra_macros.append((macro, None))

# Compiler specific settings
if gcc:
# On 32-bit x86 platforms, gcc assumes the stack to be aligned to 16
# bytes, but the caller may actually only align it to 4 bytes, which
# makes functions crash if they use SSE2 intrinsics.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
if system_bits == 32 and support_gcc_realign():
extra_macros.append(("GCC_REALIGN", None))

# Module-specific options

# AESNI
Expand Down
28 changes: 8 additions & 20 deletions src/AESNI.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,8 @@ FAKE_INIT(raw_aesni)
#define BLOCK_SIZE 16

struct block_state {
__m128i *erk; /** Round keys for encryption (11, 13 or 15 elements) **/
__m128i *drk; /** Round keys for decryption **/
__m128i *tmp_rk;
__m128i *erk; /** Round keys for encryption (11, 13 or 15 elements) **/
__m128i *drk; /** Round keys for decryption **/
unsigned rounds;
};

Expand All @@ -59,7 +58,7 @@ typedef struct {

enum SubType { OnlySub, SubRotXor };

static uint32_t sub_rot(uint32_t w, unsigned idx /** round/Nk **/, enum SubType subType)
static FUNC_SSE2 uint32_t sub_rot(uint32_t w, unsigned idx /** round/Nk **/, enum SubType subType)
{
__m128i x, y, z;

Expand Down Expand Up @@ -91,7 +90,7 @@ static uint32_t sub_rot(uint32_t w, unsigned idx /** round/Nk **/, enum SubType
return (uint32_t)_mm_cvtsi128_si32(z);
}

static int expand_key(__m128i *erk, __m128i *drk, const uint8_t *key, unsigned Nk, unsigned Nr)
static FUNC_SSE2 int expand_key(__m128i *erk, __m128i *drk, const uint8_t *key, unsigned Nk, unsigned Nr)
{
uint32_t rk[4*(14+2)];
unsigned tot_words;
Expand Down Expand Up @@ -138,21 +137,19 @@ static int expand_key(__m128i *erk, __m128i *drk, const uint8_t *key, unsigned N
return 0;
}

static int AESNI_encrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
static FUNC_SSE2 int AESNI_encrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
{
unsigned rounds;
__m128i r[14+1];
const struct block_state *state;
unsigned k;
__m128i *r;

if ((bb == NULL) || (in == NULL) || (out == NULL))
return ERR_NULL;

state = &((AESNI_State*)bb)->algo_state;
rounds = state->rounds;

r = state->tmp_rk;

if (rounds > 14)
return ERR_NR_ROUNDS;

Expand Down Expand Up @@ -249,12 +246,12 @@ static int AESNI_encrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, s
return 0;
}

static int AESNI_decrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
static FUNC_SSE2 int AESNI_decrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
{
unsigned rounds;
__m128i r[14+1];
const struct block_state *state;
unsigned k;
__m128i *r;

if ((bb == NULL) || (in == NULL) || (out == NULL))
return ERR_NULL;
Expand All @@ -265,8 +262,6 @@ static int AESNI_decrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, s
if (rounds > 14)
return ERR_NR_ROUNDS;

r = state->tmp_rk;

for (k=0; k<=rounds; k++) {
r[k] = state->drk[k];
}
Expand Down Expand Up @@ -416,12 +411,6 @@ EXPORT_SYM int AESNI_start_operation(const uint8_t key[], size_t key_len, AESNI_
goto error;
}

state->tmp_rk = align_alloc(Nb*(Nr+1)*sizeof(uint32_t), 16);
if (state->tmp_rk == NULL) {
result = ERR_MEMORY;
goto error;
}

result = expand_key(state->erk, state->drk, key, (unsigned)key_len/4, Nr);
if (result) {
goto error;
Expand All @@ -431,7 +420,6 @@ EXPORT_SYM int AESNI_start_operation(const uint8_t key[], size_t key_len, AESNI_
error:
align_free(state->erk);
align_free(state->drk);
align_free(state->tmp_rk);
free(*pResult);
return result;
}
Expand Down
12 changes: 12 additions & 0 deletions src/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,4 +192,16 @@ static inline const uint8_t* memchr_not(const uint8_t* s, int c, size_t n)
return NULL;
}

/*
 * FUNC_SSE2 marks functions that use SSE2 intrinsics so that gcc
 * realigns the stack on entry to them.
 *
 * On 32-bit x86 platforms, gcc assumes the stack to be aligned to 16
 * bytes, but the caller may actually only align it to 4 bytes, which
 * makes functions crash if they use SSE2 intrinsics.
 * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
 *
 * GCC_REALIGN is defined at build time (see compiler_opt.py) only when
 * the compiler accepts the force_align_arg_pointer attribute; otherwise
 * FUNC_SSE2 expands to nothing.
 */
#if defined(GCC_REALIGN)
#define FUNC_SSE2 __attribute__((force_align_arg_pointer))
#else
#define FUNC_SSE2
#endif

#endif
10 changes: 5 additions & 5 deletions src/ghash_clmul.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ struct exp_key {
*
* See at the bottom for an explanation.
*/
STATIC __m128i reduce(__m128i prod_high, __m128i prod_low)
STATIC FUNC_SSE2 __m128i reduce(__m128i prod_high, __m128i prod_low)
{
const uint64_t c2 = (uint64_t)0xc2 << 56;
__m128i t1, t2, t3, t4, t7;
Expand All @@ -133,7 +133,7 @@ STATIC __m128i reduce(__m128i prod_high, __m128i prod_low)
/**
* Perform the carry-less multiplication of two polynomials of degree 127.
*/
STATIC void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, __m128i b)
STATIC FUNC_SSE2 void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, __m128i b)
{
__m128i c, d, e, f, g, h, i;

Expand All @@ -151,7 +151,7 @@ STATIC void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, __m128i b)
/**
* Multiply a polynomial of degree 127 by x, modulo p(x) = x^128 + x^127 + x^126 + x^121 + 1
*/
STATIC __m128i multx(__m128i a)
STATIC FUNC_SSE2 __m128i multx(__m128i a)
{
int msb;
int64_t r;
Expand Down Expand Up @@ -179,7 +179,7 @@ STATIC __m128i multx(__m128i a)
}

/** Swap bytes in an XMM register **/
STATIC __m128i swap(__m128i a)
STATIC FUNC_SSE2 __m128i swap(__m128i a)
{
__m128i mask;

Expand Down Expand Up @@ -218,7 +218,7 @@ EXPORT_SYM int ghash_destroy_clmul(struct exp_key *expanded)
return 0;
}

EXPORT_SYM int ghash_clmul(
EXPORT_SYM FUNC_SSE2 int ghash_clmul(
uint8_t y_out[16],
const uint8_t block_data[],
size_t len,
Expand Down
2 changes: 1 addition & 1 deletion src/mont.c
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ STATIC void product(uint64_t *t, uint64_t *scratchpad, const uint64_t *a, const
* @param words The number of words of a, b, and out
* @return 0 for success, the appropriate code otherwise.
*/
STATIC int mont_select(uint64_t *out, const uint64_t *a, const uint64_t *b, unsigned cond, size_t words)
STATIC FUNC_SSE2 int mont_select(uint64_t *out, const uint64_t *a, const uint64_t *b, unsigned cond, size_t words)
{
uint64_t mask;
#if defined(USE_SSE2)
Expand Down
2 changes: 1 addition & 1 deletion src/multiply_32.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
* Multiply a vector a[] by a scalar b. Add the result into vector t[],
* starting at the given offset.
*/
void static inline addmul32(uint32_t* t, size_t offset, const uint32_t *a, uint32_t b, size_t t_words, size_t a_words)
void static inline FUNC_SSE2 addmul32(uint32_t* t, size_t offset, const uint32_t *a, uint32_t b, size_t t_words, size_t a_words)
{
uint32_t carry;
size_t i;
Expand Down

0 comments on commit ef1e4a4

Please sign in to comment.