From 097b6937494abf0ba1d21554a939940c5f35a42c Mon Sep 17 00:00:00 2001 From: Jean-Yves Avenard Date: Fri, 9 Aug 2013 20:25:53 +1000 Subject: [PATCH] Make SSE code work with unaligned memory. SSE code is 43% faster than C code on 16-byte-aligned memory. SSE code is 29% faster than C code on unaligned memory. On a Core 2 Duo, the unaligned-compatible SSE code is 4.4% slower than aligned-required SSE code with aligned memory. On Nehalem processors and newer, there's no speed disadvantage in using unaligned move SSE instructions vs aligned move SSE instructions. --- mythtv/libs/libmyth/audio/audioconvert.cpp | 55 +++++++++++----------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/mythtv/libs/libmyth/audio/audioconvert.cpp b/mythtv/libs/libmyth/audio/audioconvert.cpp index 86720f0fba5..48d1e7ae69e 100644 --- a/mythtv/libs/libmyth/audio/audioconvert.cpp +++ b/mythtv/libs/libmyth/audio/audioconvert.cpp @@ -84,9 +84,8 @@ static inline float clipcheck(float f) } /* - All toFloat variants require 16 byte aligned input and output buffers on x86 for SSE optimised operation The SSE code processes 16 bytes at a time and leaves any remainder for the C - - there is no remainder in practice */ + */ static int toFloat8(float* out, const uchar* in, int len) { @@ -94,7 +93,7 @@ static int toFloat8(float* out, const uchar* in, int len) float f = 1.0f / ((1<<7)); #if ARCH_X86 - if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out)) + if (sse_check() && len >= 16) { int loops = len >> 4; i = loops << 4; @@ -108,7 +107,7 @@ static int toFloat8(float* out, const uchar* in, int len) "punpckldq %%xmm0, %%xmm0 \n\t" "punpckldq %%xmm7, %%xmm7 \n\t" "1: \n\t" - "movdqa (%1), %%xmm1 \n\t" + "movdqu (%1), %%xmm1 \n\t" "xorpd %%xmm2, %%xmm2 \n\t" "xorpd %%xmm3, %%xmm3 \n\t" "psubb %%xmm0, %%xmm1 \n\t" @@ -130,14 +129,14 @@ static int toFloat8(float* out, const uchar* in, int len) "mulps %%xmm7, %%xmm4 \n\t" "cvtdq2ps %%xmm6, %%xmm6 \n\t" "mulps %%xmm7, %%xmm5 \n\t" - "movaps 
%%xmm4, (%0) \n\t" + "movups %%xmm4, (%0) \n\t" "cvtdq2ps %%xmm1, %%xmm1 \n\t" "mulps %%xmm7, %%xmm6 \n\t" - "movaps %%xmm5, 16(%0) \n\t" + "movups %%xmm5, 16(%0) \n\t" "mulps %%xmm7, %%xmm1 \n\t" - "movaps %%xmm6, 32(%0) \n\t" + "movups %%xmm6, 32(%0) \n\t" "add $16, %1 \n\t" - "movaps %%xmm1, 48(%0) \n\t" + "movups %%xmm1, 48(%0) \n\t" "add $64, %0 \n\t" "sub $1, %%ecx \n\t" "jnz 1b \n\t" @@ -167,7 +166,7 @@ static int fromFloat8(uchar* out, const float* in, int len) float f = (1<<7); #if ARCH_X86 - if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out)) + if (sse_check() && len >= 16) { int loops = len >> 4; i = loops << 4; @@ -218,7 +217,7 @@ static int toFloat16(float* out, const short* in, int len) float f = 1.0f / ((1<<15)); #if ARCH_X86 - if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out)) + if (sse_check() && len >= 16) { int loops = len >> 4; i = loops << 4; @@ -229,10 +228,10 @@ static int toFloat16(float* out, const short* in, int len) "punpckldq %%xmm7, %%xmm7 \n\t" "1: \n\t" "xorpd %%xmm2, %%xmm2 \n\t" - "movdqa (%1), %%xmm1 \n\t" + "movdqu (%1), %%xmm1 \n\t" "xorpd %%xmm3, %%xmm3 \n\t" "punpcklwd %%xmm1, %%xmm2 \n\t" - "movdqa 16(%1), %%xmm4 \n\t" + "movdqu 16(%1), %%xmm4 \n\t" "punpckhwd %%xmm1, %%xmm3 \n\t" "psrad $16, %%xmm2 \n\t" "punpcklwd %%xmm4, %%xmm5 \n\t" @@ -245,14 +244,14 @@ static int toFloat16(float* out, const short* in, int len) "psrad $16, %%xmm6 \n\t" "mulps %%xmm7, %%xmm3 \n\t" "cvtdq2ps %%xmm5, %%xmm5 \n\t" - "movaps %%xmm2, (%0) \n\t" + "movups %%xmm2, (%0) \n\t" "cvtdq2ps %%xmm6, %%xmm6 \n\t" "mulps %%xmm7, %%xmm5 \n\t" - "movaps %%xmm3, 16(%0) \n\t" + "movups %%xmm3, 16(%0) \n\t" "mulps %%xmm7, %%xmm6 \n\t" - "movaps %%xmm5, 32(%0) \n\t" + "movups %%xmm5, 32(%0) \n\t" "add $32, %1 \n\t" - "movaps %%xmm6, 48(%0) \n\t" + "movups %%xmm6, 48(%0) \n\t" "add $64, %0 \n\t" "sub $1, %%ecx \n\t" "jnz 1b \n\t" @@ -278,7 +277,7 @@ static int fromFloat16(short* out, const float* in, int len) float f = (1<<15); #if ARCH_X86 - 
if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out)) + if (sse_check() && len >= 16) { int loops = len >> 4; i = loops << 4; @@ -329,7 +328,7 @@ static int toFloat32(AudioFormat format, float* out, const int* in, int len) shift = 0; #if ARCH_X86 - if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out)) + if (sse_check() && len >= 16) { int loops = len >> 4; i = loops << 4; @@ -340,27 +339,27 @@ static int toFloat32(AudioFormat format, float* out, const int* in, int len) "movd %4, %%xmm6 \n\t" "punpckldq %%xmm7, %%xmm7 \n\t" "1: \n\t" - "movdqa (%1), %%xmm1 \n\t" - "movdqa 16(%1), %%xmm2 \n\t" + "movdqu (%1), %%xmm1 \n\t" + "movdqu 16(%1), %%xmm2 \n\t" "psrad %%xmm6, %%xmm1 \n\t" - "movdqa 32(%1), %%xmm3 \n\t" + "movdqu 32(%1), %%xmm3 \n\t" "cvtdq2ps %%xmm1, %%xmm1 \n\t" "psrad %%xmm6, %%xmm2 \n\t" - "movdqa 48(%1), %%xmm4 \n\t" + "movdqu 48(%1), %%xmm4 \n\t" "cvtdq2ps %%xmm2, %%xmm2 \n\t" "psrad %%xmm6, %%xmm3 \n\t" "mulps %%xmm7, %%xmm1 \n\t" "psrad %%xmm6, %%xmm4 \n\t" "cvtdq2ps %%xmm3, %%xmm3 \n\t" - "movaps %%xmm1, (%0) \n\t" + "movups %%xmm1, (%0) \n\t" "mulps %%xmm7, %%xmm2 \n\t" "cvtdq2ps %%xmm4, %%xmm4 \n\t" - "movaps %%xmm2, 16(%0) \n\t" + "movups %%xmm2, 16(%0) \n\t" "mulps %%xmm7, %%xmm3 \n\t" "mulps %%xmm7, %%xmm4 \n\t" - "movaps %%xmm3, 32(%0) \n\t" + "movups %%xmm3, 32(%0) \n\t" "add $64, %1 \n\t" - "movaps %%xmm4, 48(%0) \n\t" + "movups %%xmm4, 48(%0) \n\t" "add $64, %0 \n\t" "sub $1, %%ecx \n\t" "jnz 1b \n\t" @@ -385,7 +384,7 @@ static int fromFloat32(AudioFormat format, int* out, const float* in, int len) shift = 0; #if ARCH_X86 - if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out)) + if (sse_check() && len >= 16) { float o = 0.99999995, mo = -1; int loops = len >> 4; @@ -465,7 +464,7 @@ static int fromFloatFLT(float* out, const float* in, int len) int i = 0; #if ARCH_X86 - if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out)) + if (sse_check() && len >= 16) { int loops = len >> 4; float o = 1, mo = -1;