From 097b6937494abf0ba1d21554a939940c5f35a42c Mon Sep 17 00:00:00 2001
From: Jean-Yves Avenard <jyavenard@mythtv.org>
Date: Fri, 9 Aug 2013 20:25:53 +1000
Subject: [PATCH] Make SSE code work with unaligned memory.

SSE code is 43% faster than C code on 16-byte-aligned memory.
SSE code is 29% faster than C code on unaligned memory.
On a Core 2 Duo, the unaligned-compatible SSE code is 4.4% slower than the alignment-requiring SSE code when operating on aligned memory.
On Nehalem processors and newer, there is no speed disadvantage in using unaligned-move SSE instructions instead of aligned-move SSE instructions.
---
 mythtv/libs/libmyth/audio/audioconvert.cpp | 55 +++++++++++-----------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/mythtv/libs/libmyth/audio/audioconvert.cpp b/mythtv/libs/libmyth/audio/audioconvert.cpp
index 86720f0fba5..48d1e7ae69e 100644
--- a/mythtv/libs/libmyth/audio/audioconvert.cpp
+++ b/mythtv/libs/libmyth/audio/audioconvert.cpp
@@ -84,9 +84,8 @@ static inline float clipcheck(float f)
 }
 
 /*
- All toFloat variants require 16 byte aligned input and output buffers on x86 for SSE optimised operation
  The SSE code processes 16 bytes at a time and leaves any remainder for the C
- - there is no remainder in practice */
+ code. */
 
 static int toFloat8(float* out, const uchar* in, int len)
 {
@@ -94,7 +93,7 @@ static int toFloat8(float* out, const uchar* in, int len)
     float f = 1.0f / ((1<<7));
 
 #if ARCH_X86
-    if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
+    if (sse_check() && len >= 16)
     {
         int loops = len >> 4;
         i = loops << 4;
@@ -108,7 +107,7 @@ static int toFloat8(float* out, const uchar* in, int len)
                           "punpckldq  %%xmm0, %%xmm0      \n\t"
                           "punpckldq  %%xmm7, %%xmm7      \n\t"
                           "1:                             \n\t"
-                          "movdqa     (%1), %%xmm1        \n\t"
+                          "movdqu     (%1), %%xmm1        \n\t"
                           "xorpd      %%xmm2, %%xmm2      \n\t"
                           "xorpd      %%xmm3, %%xmm3      \n\t"
                           "psubb      %%xmm0, %%xmm1      \n\t"
@@ -130,14 +129,14 @@ static int toFloat8(float* out, const uchar* in, int len)
                           "mulps      %%xmm7, %%xmm4      \n\t"
                           "cvtdq2ps   %%xmm6, %%xmm6      \n\t"
                           "mulps      %%xmm7, %%xmm5      \n\t"
-                          "movaps     %%xmm4, (%0)        \n\t"
+                          "movups     %%xmm4, (%0)        \n\t"
                           "cvtdq2ps   %%xmm1, %%xmm1      \n\t"
                           "mulps      %%xmm7, %%xmm6      \n\t"
-                          "movaps     %%xmm5, 16(%0)      \n\t"
+                          "movups     %%xmm5, 16(%0)      \n\t"
                           "mulps      %%xmm7, %%xmm1      \n\t"
-                          "movaps     %%xmm6, 32(%0)      \n\t"
+                          "movups     %%xmm6, 32(%0)      \n\t"
                           "add        $16,    %1          \n\t"
-                          "movaps     %%xmm1, 48(%0)      \n\t"
+                          "movups     %%xmm1, 48(%0)      \n\t"
                           "add        $64,    %0          \n\t"
                           "sub        $1, %%ecx           \n\t"
                           "jnz        1b                  \n\t"
@@ -167,7 +166,7 @@ static int fromFloat8(uchar* out, const float* in, int len)
     float f = (1<<7);
 
 #if ARCH_X86
-    if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
+    if (sse_check() && len >= 16)
     {
         int loops = len >> 4;
         i = loops << 4;
@@ -218,7 +217,7 @@ static int toFloat16(float* out, const short* in, int len)
     float f = 1.0f / ((1<<15));
 
 #if ARCH_X86
-    if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
+    if (sse_check() && len >= 16)
     {
         int loops = len >> 4;
         i = loops << 4;
@@ -229,10 +228,10 @@ static int toFloat16(float* out, const short* in, int len)
                           "punpckldq  %%xmm7, %%xmm7      \n\t"
                           "1:                             \n\t"
                           "xorpd      %%xmm2, %%xmm2      \n\t"
-                          "movdqa     (%1),   %%xmm1      \n\t"
+                          "movdqu     (%1),   %%xmm1      \n\t"
                           "xorpd      %%xmm3, %%xmm3      \n\t"
                           "punpcklwd  %%xmm1, %%xmm2      \n\t"
-                          "movdqa     16(%1), %%xmm4      \n\t"
+                          "movdqu     16(%1), %%xmm4      \n\t"
                           "punpckhwd  %%xmm1, %%xmm3      \n\t"
                           "psrad      $16,    %%xmm2      \n\t"
                           "punpcklwd  %%xmm4, %%xmm5      \n\t"
@@ -245,14 +244,14 @@ static int toFloat16(float* out, const short* in, int len)
                           "psrad      $16,    %%xmm6      \n\t"
                           "mulps      %%xmm7, %%xmm3      \n\t"
                           "cvtdq2ps   %%xmm5, %%xmm5      \n\t"
-                          "movaps     %%xmm2, (%0)        \n\t"
+                          "movups     %%xmm2, (%0)        \n\t"
                           "cvtdq2ps   %%xmm6, %%xmm6      \n\t"
                           "mulps      %%xmm7, %%xmm5      \n\t"
-                          "movaps     %%xmm3, 16(%0)      \n\t"
+                          "movups     %%xmm3, 16(%0)      \n\t"
                           "mulps      %%xmm7, %%xmm6      \n\t"
-                          "movaps     %%xmm5, 32(%0)      \n\t"
+                          "movups     %%xmm5, 32(%0)      \n\t"
                           "add        $32, %1             \n\t"
-                          "movaps     %%xmm6, 48(%0)      \n\t"
+                          "movups     %%xmm6, 48(%0)      \n\t"
                           "add        $64, %0             \n\t"
                           "sub        $1, %%ecx           \n\t"
                           "jnz        1b                  \n\t"
@@ -278,7 +277,7 @@ static int fromFloat16(short* out, const float* in, int len)
     float f = (1<<15);
 
 #if ARCH_X86
-    if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
+    if (sse_check() && len >= 16)
     {
         int loops = len >> 4;
         i = loops << 4;
@@ -329,7 +328,7 @@ static int toFloat32(AudioFormat format, float* out, const int* in, int len)
         shift = 0;
 
 #if ARCH_X86
-    if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
+    if (sse_check() && len >= 16)
     {
         int loops = len >> 4;
         i = loops << 4;
@@ -340,27 +339,27 @@ static int toFloat32(AudioFormat format, float* out, const int* in, int len)
                           "movd       %4, %%xmm6          \n\t"
                           "punpckldq  %%xmm7, %%xmm7      \n\t"
                           "1:                             \n\t"
-                          "movdqa     (%1),   %%xmm1      \n\t"
-                          "movdqa     16(%1), %%xmm2      \n\t"
+                          "movdqu     (%1),   %%xmm1      \n\t"
+                          "movdqu     16(%1), %%xmm2      \n\t"
                           "psrad      %%xmm6, %%xmm1      \n\t"
-                          "movdqa     32(%1), %%xmm3      \n\t"
+                          "movdqu     32(%1), %%xmm3      \n\t"
                           "cvtdq2ps   %%xmm1, %%xmm1      \n\t"
                           "psrad      %%xmm6, %%xmm2      \n\t"
-                          "movdqa     48(%1), %%xmm4      \n\t"
+                          "movdqu     48(%1), %%xmm4      \n\t"
                           "cvtdq2ps   %%xmm2, %%xmm2      \n\t"
                           "psrad      %%xmm6, %%xmm3      \n\t"
                           "mulps      %%xmm7, %%xmm1      \n\t"
                           "psrad      %%xmm6, %%xmm4      \n\t"
                           "cvtdq2ps   %%xmm3, %%xmm3      \n\t"
-                          "movaps     %%xmm1, (%0)        \n\t"
+                          "movups     %%xmm1, (%0)        \n\t"
                           "mulps      %%xmm7, %%xmm2      \n\t"
                           "cvtdq2ps   %%xmm4, %%xmm4      \n\t"
-                          "movaps     %%xmm2, 16(%0)      \n\t"
+                          "movups     %%xmm2, 16(%0)      \n\t"
                           "mulps      %%xmm7, %%xmm3      \n\t"
                           "mulps      %%xmm7, %%xmm4      \n\t"
-                          "movaps     %%xmm3, 32(%0)      \n\t"
+                          "movups     %%xmm3, 32(%0)      \n\t"
                           "add        $64,    %1          \n\t"
-                          "movaps     %%xmm4, 48(%0)      \n\t"
+                          "movups     %%xmm4, 48(%0)      \n\t"
                           "add        $64,    %0          \n\t"
                           "sub        $1, %%ecx           \n\t"
                           "jnz        1b                  \n\t"
@@ -385,7 +384,7 @@ static int fromFloat32(AudioFormat format, int* out, const float* in, int len)
         shift = 0;
 
 #if ARCH_X86
-    if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
+    if (sse_check() && len >= 16)
     {
         float o = 0.99999995, mo = -1;
         int loops = len >> 4;
@@ -465,7 +464,7 @@ static int fromFloatFLT(float* out, const float* in, int len)
     int i = 0;
 
 #if ARCH_X86
-    if (sse_check() && len >= 16 && ISALIGN(in) && ISALIGN(out))
+    if (sse_check() && len >= 16)
     {
         int loops = len >> 4;
         float o = 1, mo = -1;