Skip to content

Commit

Permalink
Use -mstackrealign for 32-bit systems and SSE2
Browse files Browse the repository at this point in the history
  • Loading branch information
Legrandin committed Nov 14, 2022
1 parent 32f64d5 commit 80d6640
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 74 deletions.
79 changes: 28 additions & 51 deletions compiler_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def compiler_is_clang():
return test_compilation(source, msg="clang")


def compiler_is_gcc():
def compiler_is_gcc(extra_cc_options=[]):
source = """
#if defined(__clang__) || !defined(__GNUC__)
#error Not GCC
Expand All @@ -260,20 +260,14 @@ def compiler_is_gcc():
{
return 0;
}"""
return test_compilation(source, msg="gcc")


def support_gcc_realign():
source = """
void __attribute__((force_align_arg_pointer)) a(void) {}
int main(void) { return 0; }
"""
return test_compilation(source, msg="gcc")
return test_compilation(source,
msg="gcc",
extra_cc_options=extra_cc_options)


def compiler_supports_sse2():
source = """
#include <intrin.h>
source_template = """
%s
int main(void)
{
__m128i r0;
Expand All @@ -283,39 +277,31 @@ def compiler_supports_sse2():
return mask;
}
"""
if test_compilation(source, msg="SSE2(intrin.h)"):
return {'extra_cc_options': [], 'extra_macros': ['HAVE_INTRIN_H', 'USE_SSE2']}

source = """
#include <x86intrin.h>
int main(void)
{
__m128i r0;
int mask;
r0 = _mm_set1_epi32(0);
mask = _mm_movemask_epi8(r0);
return mask;
}
"""
if test_compilation(source, extra_cc_options=['-msse2'], msg="SSE2(x86intrin.h)"):
return {'extra_cc_options': ['-msse2'], 'extra_macros': ['HAVE_X86INTRIN_H', 'USE_SSE2']}
source_intrin_h = source_template % "#include <intrin.h>"
source_x86intrin_h = source_template % "#include <x86intrin.h>"
source_xemmintrin_h = source_template % "#include <xmmintrin.h>\n#include <emmintrin.h>"

source = """
#include <xmmintrin.h>
#include <emmintrin.h>
int main(void)
{
__m128i r0;
int mask;
r0 = _mm_set1_epi32(0);
mask = _mm_movemask_epi8(r0);
return mask;
}
"""
if test_compilation(source, extra_cc_options=['-msse2'], msg="SSE2(emmintrin.h)"):
return {'extra_cc_options': ['-msse2'], 'extra_macros': ['HAVE_EMMINTRIN_H', 'USE_SSE2']}
system_bits = 8 * struct.calcsize("P")

return False
result = None
if test_compilation(source_intrin_h, msg="SSE2(intrin.h)"):
result = {'extra_cc_options': [], 'extra_macros': ['HAVE_INTRIN_H', 'USE_SSE2']}
elif test_compilation(source_x86intrin_h, extra_cc_options=['-msse2'], msg="SSE2(x86intrin.h)"):
result = {'extra_cc_options': ['-msse2'], 'extra_macros': ['HAVE_X86INTRIN_H', 'USE_SSE2']}
elif test_compilation(source_xemmintrin_h, extra_cc_options=['-msse2'], msg="SSE2(emmintrin.h)"):
result = {'extra_cc_options': ['-msse2'], 'extra_macros': ['HAVE_EMMINTRIN_H', 'USE_SSE2']}
else:
result = False

# On 32-bit x86 platforms, gcc assumes the stack to be aligned to 16
# bytes, but the caller may actually only align it to 4 bytes, which
# makes functions crash if they use SSE2 intrinsics.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
if result and system_bits == 32 and compiler_is_gcc(extra_cc_options=['-mstackrealign']):
result['extra_cc_options'].append('-mstackrealign')

return result


def remove_extension(extensions, name):
Expand Down Expand Up @@ -379,15 +365,6 @@ def set_compiler_options(package_root, extensions):
for macro in sse2_result['extra_macros']:
extra_macros.append((macro, None))

# Compiler specific settings
if gcc:
# On 32-bit x86 platforms, gcc assumes the stack to be aligned to 16
# bytes, but the caller may actually only align it to 4 bytes, which
# makes functions crash if they use SSE2 intrinsics.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
if system_bits == 32 and support_gcc_realign():
extra_macros.append(("GCC_REALIGN", None))

# Module-specific options

# AESNI
Expand Down
8 changes: 4 additions & 4 deletions src/AESNI.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ typedef struct {

enum SubType { OnlySub, SubRotXor };

static FUNC_SSE2 uint32_t sub_rot(uint32_t w, unsigned idx /** round/Nk **/, enum SubType subType)
static uint32_t sub_rot(uint32_t w, unsigned idx /** round/Nk **/, enum SubType subType)
{
__m128i x, y, z;

Expand Down Expand Up @@ -90,7 +90,7 @@ static FUNC_SSE2 uint32_t sub_rot(uint32_t w, unsigned idx /** round/Nk **/, enu
return (uint32_t)_mm_cvtsi128_si32(z);
}

static FUNC_SSE2 int expand_key(__m128i *erk, __m128i *drk, const uint8_t *key, unsigned Nk, unsigned Nr)
static int expand_key(__m128i *erk, __m128i *drk, const uint8_t *key, unsigned Nk, unsigned Nr)
{
uint32_t rk[4*(14+2)];
unsigned tot_words;
Expand Down Expand Up @@ -137,7 +137,7 @@ static FUNC_SSE2 int expand_key(__m128i *erk, __m128i *drk, const uint8_t *key,
return 0;
}

static FUNC_SSE2 int AESNI_encrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
static int AESNI_encrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
{
unsigned rounds;
__m128i r[14+1];
Expand Down Expand Up @@ -246,7 +246,7 @@ static FUNC_SSE2 int AESNI_encrypt(const BlockBase *bb, const uint8_t *in, uint8
return 0;
}

static FUNC_SSE2 int AESNI_decrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
static int AESNI_decrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
{
unsigned rounds;
__m128i r[14+1];
Expand Down
2 changes: 1 addition & 1 deletion src/bignum.c
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ STATIC void product(uint64_t *t, uint64_t *scratchpad, const uint64_t *a, const
* @param words The number of words of a, b, and out
* @return 0 for success, the appropriate code otherwise.
*/
STATIC FUNC_SSE2 int mod_select(uint64_t *out, const uint64_t *a, const uint64_t *b, unsigned cond, size_t words)
STATIC int mod_select(uint64_t *out, const uint64_t *a, const uint64_t *b, unsigned cond, size_t words)
{
uint64_t mask;
#if defined(USE_SSE2)
Expand Down
12 changes: 0 additions & 12 deletions src/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,16 +192,4 @@ static inline const uint8_t* memchr_not(const uint8_t* s, int c, size_t n)
return NULL;
}

/*
* On 32-bit x86 platforms, gcc assumes the stack to be aligned to 16
* bytes, but the caller may actually only align it to 4 bytes, which
* makes functions crash if they use SSE2 intrinsics.
* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
*/
#if defined(GCC_REALIGN)
#define FUNC_SSE2 __attribute__((force_align_arg_pointer))
#else
#define FUNC_SSE2
#endif

#endif
10 changes: 5 additions & 5 deletions src/ghash_clmul.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ struct exp_key {
*
* See at the bottom for an explanation.
*/
STATIC FUNC_SSE2 __m128i reduce(__m128i prod_high, __m128i prod_low)
STATIC __m128i reduce(__m128i prod_high, __m128i prod_low)
{
const uint64_t c2 = (uint64_t)0xc2 << 56;
__m128i t1, t2, t3, t4, t7;
Expand All @@ -133,7 +133,7 @@ STATIC FUNC_SSE2 __m128i reduce(__m128i prod_high, __m128i prod_low)
/**
* Perform the carry-less multiplication of two polynomials of degree 127.
*/
STATIC FUNC_SSE2 void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, __m128i b)
STATIC void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, __m128i b)
{
__m128i c, d, e, f, g, h, i;

Expand All @@ -151,7 +151,7 @@ STATIC FUNC_SSE2 void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, _
/**
* Multiply a polynomial of degree 127 by x, modulo p(x) = x^128 + x^127 + x^126 + x^121 + 1
*/
STATIC FUNC_SSE2 __m128i multx(__m128i a)
STATIC __m128i multx(__m128i a)
{
int msb;
int64_t r;
Expand Down Expand Up @@ -179,7 +179,7 @@ STATIC FUNC_SSE2 __m128i multx(__m128i a)
}

/** Swap bytes in an XMM register **/
STATIC FUNC_SSE2 __m128i swap(__m128i a)
STATIC __m128i swap(__m128i a)
{
__m128i mask;

Expand Down Expand Up @@ -218,7 +218,7 @@ EXPORT_SYM int ghash_destroy_clmul(struct exp_key *expanded)
return 0;
}

EXPORT_SYM FUNC_SSE2 int ghash_clmul(
EXPORT_SYM int ghash_clmul(
uint8_t y_out[16],
const uint8_t block_data[],
size_t len,
Expand Down
2 changes: 1 addition & 1 deletion src/multiply_32.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
* Multiply a vector a[] by a scalar b. Add the result into vector t[],
* starting at the given offset.
*/
void static inline FUNC_SSE2 addmul32(uint32_t* t, size_t offset, const uint32_t *a, uint32_t b, size_t t_words, size_t a_words)
void static inline addmul32(uint32_t* t, size_t offset, const uint32_t *a, uint32_t b, size_t t_words, size_t a_words)
{
uint32_t carry;
size_t i;
Expand Down

0 comments on commit 80d6640

Please sign in to comment.