Skip to content

Commit

Permalink
Merge branch 'gcc_stack_align'
Browse files Browse the repository at this point in the history
  • Loading branch information
Legrandin committed Nov 11, 2019
2 parents d442d9c + 31c23ca commit ef1e4a4
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 27 deletions.
17 changes: 17 additions & 0 deletions compiler_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,14 @@ def compiler_is_gcc():
return test_compilation(source, msg="gcc")


def support_gcc_realign():
    """Return True if the compiler accepts the GCC-specific
    ``force_align_arg_pointer`` function attribute.

    On 32-bit x86, gcc assumes the stack is 16-byte aligned on function
    entry, but callers may only guarantee 4-byte alignment; the attribute
    makes gcc realign the stack so functions using SSE2 intrinsics do not
    crash. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
    """
    source = """
    void __attribute__((force_align_arg_pointer)) a(void) {}
    int main(void) { return 0; }
    """
    # msg labels this probe in the build output; "gcc" alone was a
    # copy-paste from compiler_is_gcc() and was misleading in logs.
    return test_compilation(source, msg="gcc realign")


def compiler_supports_sse2():
source = """
#include <intrin.h>
Expand Down Expand Up @@ -371,6 +379,15 @@ def set_compiler_options(package_root, extensions):
for macro in sse2_result['extra_macros']:
extra_macros.append((macro, None))

# Compiler specific settings
if gcc:
# On 32-bit x86 platforms, gcc assumes the stack to be aligned to 16
# bytes, but the caller may actually only align it to 4 bytes, which
# makes functions crash if they use SSE2 intrinsics.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
if system_bits == 32 and support_gcc_realign():
extra_macros.append(("GCC_REALIGN", None))

# Module-specific options

# AESNI
Expand Down
28 changes: 8 additions & 20 deletions src/AESNI.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,8 @@ FAKE_INIT(raw_aesni)
#define BLOCK_SIZE 16

struct block_state {
__m128i *erk; /** Round keys for encryption (11, 13 or 15 elements) **/
__m128i *drk; /** Round keys for decryption **/
__m128i *tmp_rk;
__m128i *erk; /** Round keys for encryption (11, 13 or 15 elements) **/
__m128i *drk; /** Round keys for decryption **/
unsigned rounds;
};

Expand All @@ -59,7 +58,7 @@ typedef struct {

enum SubType { OnlySub, SubRotXor };

static uint32_t sub_rot(uint32_t w, unsigned idx /** round/Nk **/, enum SubType subType)
static FUNC_SSE2 uint32_t sub_rot(uint32_t w, unsigned idx /** round/Nk **/, enum SubType subType)
{
__m128i x, y, z;

Expand Down Expand Up @@ -91,7 +90,7 @@ static uint32_t sub_rot(uint32_t w, unsigned idx /** round/Nk **/, enum SubType
return (uint32_t)_mm_cvtsi128_si32(z);
}

static int expand_key(__m128i *erk, __m128i *drk, const uint8_t *key, unsigned Nk, unsigned Nr)
static FUNC_SSE2 int expand_key(__m128i *erk, __m128i *drk, const uint8_t *key, unsigned Nk, unsigned Nr)
{
uint32_t rk[4*(14+2)];
unsigned tot_words;
Expand Down Expand Up @@ -138,21 +137,19 @@ static int expand_key(__m128i *erk, __m128i *drk, const uint8_t *key, unsigned N
return 0;
}

static int AESNI_encrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
static FUNC_SSE2 int AESNI_encrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
{
unsigned rounds;
__m128i r[14+1];
const struct block_state *state;
unsigned k;
__m128i *r;

if ((bb == NULL) || (in == NULL) || (out == NULL))
return ERR_NULL;

state = &((AESNI_State*)bb)->algo_state;
rounds = state->rounds;

r = state->tmp_rk;

if (rounds > 14)
return ERR_NR_ROUNDS;

Expand Down Expand Up @@ -249,12 +246,12 @@ static int AESNI_encrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, s
return 0;
}

static int AESNI_decrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
static FUNC_SSE2 int AESNI_decrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, size_t data_len)
{
unsigned rounds;
__m128i r[14+1];
const struct block_state *state;
unsigned k;
__m128i *r;

if ((bb == NULL) || (in == NULL) || (out == NULL))
return ERR_NULL;
Expand All @@ -265,8 +262,6 @@ static int AESNI_decrypt(const BlockBase *bb, const uint8_t *in, uint8_t *out, s
if (rounds > 14)
return ERR_NR_ROUNDS;

r = state->tmp_rk;

for (k=0; k<=rounds; k++) {
r[k] = state->drk[k];
}
Expand Down Expand Up @@ -416,12 +411,6 @@ EXPORT_SYM int AESNI_start_operation(const uint8_t key[], size_t key_len, AESNI_
goto error;
}

state->tmp_rk = align_alloc(Nb*(Nr+1)*sizeof(uint32_t), 16);
if (state->tmp_rk == NULL) {
result = ERR_MEMORY;
goto error;
}

result = expand_key(state->erk, state->drk, key, (unsigned)key_len/4, Nr);
if (result) {
goto error;
Expand All @@ -431,7 +420,6 @@ EXPORT_SYM int AESNI_start_operation(const uint8_t key[], size_t key_len, AESNI_
error:
align_free(state->erk);
align_free(state->drk);
align_free(state->tmp_rk);
free(*pResult);
return result;
}
Expand Down
12 changes: 12 additions & 0 deletions src/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,4 +192,16 @@ static inline const uint8_t* memchr_not(const uint8_t* s, int c, size_t n)
return NULL;
}

/*
 * FUNC_SSE2 marks functions that use SSE2 intrinsics so that gcc
 * realigns the stack on entry to them.
 *
 * On 32-bit x86 platforms, gcc assumes the stack to be aligned to 16
 * bytes, but the caller may actually only align it to 4 bytes, which
 * makes functions crash if they use SSE2 intrinsics.
 * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
 *
 * GCC_REALIGN is defined at build time (see compiler_opt.py) only when
 * the compiler accepts the force_align_arg_pointer attribute; otherwise
 * FUNC_SSE2 expands to nothing.
 */
#if defined(GCC_REALIGN)
#define FUNC_SSE2 __attribute__((force_align_arg_pointer))
#else
#define FUNC_SSE2
#endif

#endif
10 changes: 5 additions & 5 deletions src/ghash_clmul.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ struct exp_key {
*
* See at the bottom for an explanation.
*/
STATIC __m128i reduce(__m128i prod_high, __m128i prod_low)
STATIC FUNC_SSE2 __m128i reduce(__m128i prod_high, __m128i prod_low)
{
const uint64_t c2 = (uint64_t)0xc2 << 56;
__m128i t1, t2, t3, t4, t7;
Expand All @@ -133,7 +133,7 @@ STATIC __m128i reduce(__m128i prod_high, __m128i prod_low)
/**
* Perform the carry-less multiplication of two polynomials of degree 127.
*/
STATIC void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, __m128i b)
STATIC FUNC_SSE2 void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, __m128i b)
{
__m128i c, d, e, f, g, h, i;

Expand All @@ -151,7 +151,7 @@ STATIC void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, __m128i b)
/**
* Multiply a polynomial of degree 127 by x, modulo p(x) = x^128 + x^127 + x^126 + x^121 + 1
*/
STATIC __m128i multx(__m128i a)
STATIC FUNC_SSE2 __m128i multx(__m128i a)
{
int msb;
int64_t r;
Expand Down Expand Up @@ -179,7 +179,7 @@ STATIC __m128i multx(__m128i a)
}

/** Swap bytes in an XMM register **/
STATIC __m128i swap(__m128i a)
STATIC FUNC_SSE2 __m128i swap(__m128i a)
{
__m128i mask;

Expand Down Expand Up @@ -218,7 +218,7 @@ EXPORT_SYM int ghash_destroy_clmul(struct exp_key *expanded)
return 0;
}

EXPORT_SYM int ghash_clmul(
EXPORT_SYM FUNC_SSE2 int ghash_clmul(
uint8_t y_out[16],
const uint8_t block_data[],
size_t len,
Expand Down
2 changes: 1 addition & 1 deletion src/mont.c
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ STATIC void product(uint64_t *t, uint64_t *scratchpad, const uint64_t *a, const
* @param words The number of words of a, b, and out
* @return 0 for success, the appropriate code otherwise.
*/
STATIC int mont_select(uint64_t *out, const uint64_t *a, const uint64_t *b, unsigned cond, size_t words)
STATIC FUNC_SSE2 int mont_select(uint64_t *out, const uint64_t *a, const uint64_t *b, unsigned cond, size_t words)
{
uint64_t mask;
#if defined(USE_SSE2)
Expand Down
2 changes: 1 addition & 1 deletion src/multiply_32.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
* Multiply a vector a[] by a scalar b. Add the result into vector t[],
* starting at the given offset.
*/
void static inline addmul32(uint32_t* t, size_t offset, const uint32_t *a, uint32_t b, size_t t_words, size_t a_words)
void static inline FUNC_SSE2 addmul32(uint32_t* t, size_t offset, const uint32_t *a, uint32_t b, size_t t_words, size_t a_words)
{
uint32_t carry;
size_t i;
Expand Down

0 comments on commit ef1e4a4

Please sign in to comment.