[DAG] mergeStore - peek through truncates when finding dead store(trunc(load())) patterns

Extend the existing store(load()) removal code to account for intermediate truncates that some targets won't remove with canCombineTruncStore - we only care about the load/store MemoryVT. Fixes regression from D146121.
ibricchi · Mar 15, 2023 · c1f81e7 · c1f81e7
1 parent 7501e53
commit c1f81e7
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 37 deletions.
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1712,6 +1712,10 @@ SDValue peekThroughOneUseBitcasts(SDValue V);
 /// If \p V is not an extracted subvector, it is returned as-is.
 SDValue peekThroughExtractSubvectors(SDValue V);
 
+/// Return the non-truncated source operand of \p V if it exists.
+/// If \p V is not a truncation, it is returned as-is.
+SDValue peekThroughTruncates(SDValue V);
+
 /// Returns true if \p V is a bitwise not operation. Assumes that an all ones
 /// constant is canonicalized to be operand 1.
 bool isBitwiseNot(SDValue V, bool AllowUndefs = false);

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20391,9 +20391,13 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   }
 
   // If this is a load followed by a store to the same location, then the store
-  // is dead/noop.
+  // is dead/noop. Peek through any truncates if canCombineTruncStore failed.
+  // TODO: Add big-endian truncate support with test coverage.
   // TODO: Can relax for unordered atomics (see D66309)
-  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
+  SDValue TruncVal = DAG.getDataLayout().isLittleEndian()
+                         ? peekThroughTruncates(Value)
+                         : Value;
+  if (auto *Ld = dyn_cast<LoadSDNode>(TruncVal)) {
     if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
         ST->isUnindexed() && ST->isSimple() &&
         Ld->getAddressSpace() == ST->getAddressSpace() &&

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -11040,6 +11040,12 @@ SDValue llvm::peekThroughExtractSubvectors(SDValue V) {
   return V;
 }
 
+SDValue llvm::peekThroughTruncates(SDValue V) { // Look past any chain of ISD::TRUNCATE nodes wrapping V.
+  while (V.getOpcode() == ISD::TRUNCATE) // Loop so that nested truncates are all skipped, not just one.
+    V = V.getOperand(0); // Step to operand 0 of the truncate node.
+  return V; // V comes back unchanged if it was not a truncate to begin with.
+}
+
 bool llvm::isBitwiseNot(SDValue V, bool AllowUndefs) {
   if (V.getOpcode() != ISD::XOR)
     return false;

diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -5,21 +5,19 @@
 define void @i24_or(ptr %a) {
 ; X86-LABEL: i24_or:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    movzbl 2(%ecx), %eax
-; X86-NEXT:    movb %al, 2(%ecx)
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    orl $384, %eax # imm = 0x180
-; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %ecx
+; X86-NEXT:    movzbl 2(%eax), %edx
+; X86-NEXT:    shll $16, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    orl $384, %edx # imm = 0x180
+; X86-NEXT:    movw %dx, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: i24_or:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzbl 2(%rdi), %ecx
-; X64-NEXT:    movb %cl, 2(%rdi)
 ; X64-NEXT:    shll $16, %ecx
 ; X64-NEXT:    orl %eax, %ecx
 ; X64-NEXT:    orl $384, %ecx # imm = 0x180
@@ -35,21 +33,19 @@ define void @i24_and_or(ptr %a) {
 ; X86-LABEL: i24_and_or:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzbl 2(%eax), %ecx
-; X86-NEXT:    movb %cl, 2(%eax)
-; X86-NEXT:    shll $16, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    orl $384, %ecx # imm = 0x180
-; X86-NEXT:    andl $-128, %ecx
-; X86-NEXT:    movw %cx, (%eax)
+; X86-NEXT:    movzwl (%eax), %ecx
+; X86-NEXT:    movzbl 2(%eax), %edx
+; X86-NEXT:    shll $16, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    orl $384, %edx # imm = 0x180
+; X86-NEXT:    andl $-128, %edx
+; X86-NEXT:    movw %dx, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: i24_and_or:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzbl 2(%rdi), %ecx
-; X64-NEXT:    movb %cl, 2(%rdi)
 ; X64-NEXT:    shll $16, %ecx
 ; X64-NEXT:    orl %eax, %ecx
 ; X64-NEXT:    orl $384, %ecx # imm = 0x180
@@ -66,29 +62,27 @@ define void @i24_and_or(ptr %a) {
 define void @i24_insert_bit(ptr %a, i1 zeroext %bit) {
 ; X86-LABEL: i24_insert_bit:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebx, -8
+; X86-NEXT:    .cfi_offset %esi, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzbl 2(%eax), %ebx
-; X86-NEXT:    movb %bl, 2(%eax)
-; X86-NEXT:    shll $16, %ebx
-; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    movzbl 2(%eax), %esi
+; X86-NEXT:    shll $16, %esi
+; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    shll $13, %ecx
-; X86-NEXT:    andl $16769023, %ebx # imm = 0xFFDFFF
-; X86-NEXT:    orl %ecx, %ebx
-; X86-NEXT:    movw %bx, (%eax)
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    andl $16769023, %esi # imm = 0xFFDFFF
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movw %si, (%eax)
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: i24_insert_bit:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzbl 2(%rdi), %ecx
-; X64-NEXT:    movb %cl, 2(%rdi)
 ; X64-NEXT:    shll $16, %ecx
 ; X64-NEXT:    orl %eax, %ecx
 ; X64-NEXT:    shll $13, %esi
@@ -114,8 +108,6 @@ define void @i56_or(ptr %a) {
 ;
 ; X64-LABEL: i56_or:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzwl 4(%rdi), %eax
-; X64-NEXT:    movw %ax, 4(%rdi)
 ; X64-NEXT:    orl $384, (%rdi) # imm = 0x180
 ; X64-NEXT:    retq
   %aa = load i56, ptr %a, align 1
@@ -138,8 +130,6 @@ define void @i56_and_or(ptr %a) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl 4(%rdi), %eax
 ; X64-NEXT:    movzbl 6(%rdi), %ecx
-; X64-NEXT:    movb %cl, 6(%rdi)
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
 ; X64-NEXT:    shll $16, %ecx
 ; X64-NEXT:    orl %eax, %ecx
 ; X64-NEXT:    shlq $32, %rcx
@@ -175,8 +165,6 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl 4(%rdi), %eax
 ; X64-NEXT:    movzbl 6(%rdi), %ecx
-; X64-NEXT:    movb %cl, 6(%rdi)
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
 ; X64-NEXT:    shll $16, %ecx
 ; X64-NEXT:    orl %eax, %ecx
 ; X64-NEXT:    shlq $32, %rcx