Permalink
Browse files

Make SSE code work with unaligned memory.

SSE code is 43% faster than C code on 16-byte-aligned memory.
SSE code is 29% faster than C code on unaligned memory.
On a Core 2 Duo, the unaligned-compatible SSE code is 4.4% slower than the alignment-requiring SSE code when both run on aligned memory.
On Nehalem processors and newer, there's no speed disadvantage in using unaligned move SSE instructions vs aligned move SSE instructions.
  • Loading branch information...
jyavenard committed Aug 9, 2013
1 parent b5755a1 commit 097b6937494abf0ba1d21554a939940c5f35a42c
Showing with 27 additions and 28 deletions.
  1. +27 −28 mythtv/libs/libmyth/audio/audioconvert.cpp
@@ -84,17 +84,16 @@ static inline float clipcheck(float f)
}
/*
All toFloat variants require 16 byte aligned input and output buffers on x86 for SSE optimised operation
The SSE code processes 16 samples at a time and leaves any remainder for the C code
- there is no remainder in practice */
*/
static int toFloat8(float* out, const uchar* in, int len)
{
int i = 0;
float f = 1.0f / ((1<<7));
#if ARCH_X86
if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
if (sse_check() && len >= 16)
{
int loops = len >> 4;
i = loops << 4;
@@ -108,7 +107,7 @@ static int toFloat8(float* out, const uchar* in, int len)
"punpckldq %%xmm0, %%xmm0 \n\t"
"punpckldq %%xmm7, %%xmm7 \n\t"
"1: \n\t"
"movdqa (%1), %%xmm1 \n\t"
"movdqu (%1), %%xmm1 \n\t"
"xorpd %%xmm2, %%xmm2 \n\t"
"xorpd %%xmm3, %%xmm3 \n\t"
"psubb %%xmm0, %%xmm1 \n\t"
@@ -130,14 +129,14 @@ static int toFloat8(float* out, const uchar* in, int len)
"mulps %%xmm7, %%xmm4 \n\t"
"cvtdq2ps %%xmm6, %%xmm6 \n\t"
"mulps %%xmm7, %%xmm5 \n\t"
"movaps %%xmm4, (%0) \n\t"
"movups %%xmm4, (%0) \n\t"
"cvtdq2ps %%xmm1, %%xmm1 \n\t"
"mulps %%xmm7, %%xmm6 \n\t"
"movaps %%xmm5, 16(%0) \n\t"
"movups %%xmm5, 16(%0) \n\t"
"mulps %%xmm7, %%xmm1 \n\t"
"movaps %%xmm6, 32(%0) \n\t"
"movups %%xmm6, 32(%0) \n\t"
"add $16, %1 \n\t"
"movaps %%xmm1, 48(%0) \n\t"
"movups %%xmm1, 48(%0) \n\t"
"add $64, %0 \n\t"
"sub $1, %%ecx \n\t"
"jnz 1b \n\t"
@@ -167,7 +166,7 @@ static int fromFloat8(uchar* out, const float* in, int len)
float f = (1<<7);
#if ARCH_X86
if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
if (sse_check() && len >= 16)
{
int loops = len >> 4;
i = loops << 4;
@@ -218,7 +217,7 @@ static int toFloat16(float* out, const short* in, int len)
float f = 1.0f / ((1<<15));
#if ARCH_X86
if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
if (sse_check() && len >= 16)
{
int loops = len >> 4;
i = loops << 4;
@@ -229,10 +228,10 @@ static int toFloat16(float* out, const short* in, int len)
"punpckldq %%xmm7, %%xmm7 \n\t"
"1: \n\t"
"xorpd %%xmm2, %%xmm2 \n\t"
"movdqa (%1), %%xmm1 \n\t"
"movdqu (%1), %%xmm1 \n\t"
"xorpd %%xmm3, %%xmm3 \n\t"
"punpcklwd %%xmm1, %%xmm2 \n\t"
"movdqa 16(%1), %%xmm4 \n\t"
"movdqu 16(%1), %%xmm4 \n\t"
"punpckhwd %%xmm1, %%xmm3 \n\t"
"psrad $16, %%xmm2 \n\t"
"punpcklwd %%xmm4, %%xmm5 \n\t"
@@ -245,14 +244,14 @@ static int toFloat16(float* out, const short* in, int len)
"psrad $16, %%xmm6 \n\t"
"mulps %%xmm7, %%xmm3 \n\t"
"cvtdq2ps %%xmm5, %%xmm5 \n\t"
"movaps %%xmm2, (%0) \n\t"
"movups %%xmm2, (%0) \n\t"
"cvtdq2ps %%xmm6, %%xmm6 \n\t"
"mulps %%xmm7, %%xmm5 \n\t"
"movaps %%xmm3, 16(%0) \n\t"
"movups %%xmm3, 16(%0) \n\t"
"mulps %%xmm7, %%xmm6 \n\t"
"movaps %%xmm5, 32(%0) \n\t"
"movups %%xmm5, 32(%0) \n\t"
"add $32, %1 \n\t"
"movaps %%xmm6, 48(%0) \n\t"
"movups %%xmm6, 48(%0) \n\t"
"add $64, %0 \n\t"
"sub $1, %%ecx \n\t"
"jnz 1b \n\t"
@@ -278,7 +277,7 @@ static int fromFloat16(short* out, const float* in, int len)
float f = (1<<15);
#if ARCH_X86
if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
if (sse_check() && len >= 16)
{
int loops = len >> 4;
i = loops << 4;
@@ -329,7 +328,7 @@ static int toFloat32(AudioFormat format, float* out, const int* in, int len)
shift = 0;
#if ARCH_X86
if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
if (sse_check() && len >= 16)
{
int loops = len >> 4;
i = loops << 4;
@@ -340,27 +339,27 @@ static int toFloat32(AudioFormat format, float* out, const int* in, int len)
"movd %4, %%xmm6 \n\t"
"punpckldq %%xmm7, %%xmm7 \n\t"
"1: \n\t"
"movdqa (%1), %%xmm1 \n\t"
"movdqa 16(%1), %%xmm2 \n\t"
"movdqu (%1), %%xmm1 \n\t"
"movdqu 16(%1), %%xmm2 \n\t"
"psrad %%xmm6, %%xmm1 \n\t"
"movdqa 32(%1), %%xmm3 \n\t"
"movdqu 32(%1), %%xmm3 \n\t"
"cvtdq2ps %%xmm1, %%xmm1 \n\t"
"psrad %%xmm6, %%xmm2 \n\t"
"movdqa 48(%1), %%xmm4 \n\t"
"movdqu 48(%1), %%xmm4 \n\t"
"cvtdq2ps %%xmm2, %%xmm2 \n\t"
"psrad %%xmm6, %%xmm3 \n\t"
"mulps %%xmm7, %%xmm1 \n\t"
"psrad %%xmm6, %%xmm4 \n\t"
"cvtdq2ps %%xmm3, %%xmm3 \n\t"
"movaps %%xmm1, (%0) \n\t"
"movups %%xmm1, (%0) \n\t"
"mulps %%xmm7, %%xmm2 \n\t"
"cvtdq2ps %%xmm4, %%xmm4 \n\t"
"movaps %%xmm2, 16(%0) \n\t"
"movups %%xmm2, 16(%0) \n\t"
"mulps %%xmm7, %%xmm3 \n\t"
"mulps %%xmm7, %%xmm4 \n\t"
"movaps %%xmm3, 32(%0) \n\t"
"movups %%xmm3, 32(%0) \n\t"
"add $64, %1 \n\t"
"movaps %%xmm4, 48(%0) \n\t"
"movups %%xmm4, 48(%0) \n\t"
"add $64, %0 \n\t"
"sub $1, %%ecx \n\t"
"jnz 1b \n\t"
@@ -385,7 +384,7 @@ static int fromFloat32(AudioFormat format, int* out, const float* in, int len)
shift = 0;
#if ARCH_X86
if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
if (sse_check() && len >= 16)
{
float o = 0.99999995, mo = -1;
int loops = len >> 4;
@@ -465,7 +464,7 @@ static int fromFloatFLT(float* out, const float* in, int len)
int i = 0;
#if ARCH_X86
if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
if (sse_check() && len >= 16)
{
int loops = len >> 4;
float o = 1, mo = -1;

0 comments on commit 097b693

Please sign in to comment.