Skip to content

Commit

Permalink
memcpy: removed alignment from L(vl_agnostic) and L(L2)
Browse files Browse the repository at this point in the history
  • Loading branch information
NaohiroTamura committed May 4, 2021
1 parent 35b8057 commit b1f16f3
Showing 1 changed file with 3 additions and 47 deletions.
50 changes: 3 additions & 47 deletions sysdeps/aarch64/multiarch/memcpy_a64fx.S
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,8 @@
// Register aliases for the copy routine: each name maps to one x-register.
// NOTE: this is a diff rendering without +/- markers. Per the hunk header
// (-42,10 +42,8), the four aliases mapped to x9..x12 are the pre-commit
// lines and the final dest_notag/src_notag pair (x9/x10) is the post-commit
// replacement.
#define dest_ptr x6
#define src_ptr x7
#define vector_length x8
// Pre-commit: two remainder scratch aliases followed by the *_notag aliases.
#define vl_remainder x9 // vector_length remainder
#define cl_remainder x10 // CACHE_LINE_SIZE remainder
#define dest_notag x11
#define src_notag x12
// Post-commit: vl_remainder/cl_remainder are dropped, so the *_notag aliases
// are renumbered onto x9 and x10.
#define dest_notag x9
#define src_notag x10

// Assemble for Armv8.2-A with the SVE extension enabled.
.arch armv8.2-a+sve

Expand Down Expand Up @@ -195,21 +193,8 @@ L(vl_agnostic): // VL Agnostic
// Fragment of the L(vl_agnostic) path (label is in the hunk header above).
// NOTE: diff rendering without +/- markers. Per the hunk header
// (-195,21 +193,8), the block from "align dest address" through
// "sub rest, rest, vl_remainder" plus the labelled "1: mov tmp1, 64" are the
// pre-commit lines; the unlabelled "mov tmp1, 64" is the post-commit line.
// Set up the working copies: remaining byte count and running pointers.
mov rest, n
mov dest_ptr, dest
mov src_ptr, src
// align dest address at vector_length byte boundary
// tmp1 = vector_length - 1, used as the low-bits mask for dest_ptr
sub tmp1, vector_length, 1
// tmp2 = dest_ptr & tmp1 (offset inside the current vector-sized window);
// also sets the flags tested by b.eq below
ands tmp2, dest_ptr, tmp1
// if vl_remainder == 0
b.eq 1f
// vl_remainder = bytes from dest_ptr up to the next vector_length boundary
sub vl_remainder, vector_length, tmp2
// process remainder until the first vector_length boundary
// p2 = byte lanes whose index is below vl_remainder
whilelo p2.b, xzr, vl_remainder
// one predicated load/store pair copies exactly those leading bytes
ld1b z0.b, p2/z, [src_ptr]
st1b z0.b, p2, [dest_ptr]
// step both pointers past the copied bytes and shrink the remaining count
add dest_ptr, dest_ptr, vl_remainder
add src_ptr, src_ptr, vl_remainder
sub rest, rest, vl_remainder
// if rest >= L2_SIZE && vector_length == 64 then L(L2)
// pre-commit form: carries the local label targeted by "b.eq 1f" above
1: mov tmp1, 64
// post-commit form: same instruction, label no longer needed
mov tmp1, 64
cmp rest, L2_SIZE
// when rest >= L2_SIZE (cs) compare vector_length with 64; otherwise force
// NZCV to 0 so the following b.eq is not taken
ccmp vector_length, tmp1, 0, cs
b.eq L(L2)
Expand Down Expand Up @@ -285,35 +270,6 @@ L(last):
ret

// L(L2): path taken from the "b.eq L(L2)" dispatch (rest >= L2_SIZE and
// vector_length == 64, per the comment at that branch).
// NOTE: diff rendering without +/- markers. Per the hunk header
// (-285,35 +270,6), everything from "align dest address" down to and
// including the L(L2_dc_zva) label is pre-commit (removed) code; only the
// L(L2) label and the trailing "unroll" lines remain after the commit.
L(L2):
// align dest address at CACHE_LINE_SIZE byte boundary
mov tmp1, CACHE_LINE_SIZE
// tmp2 = dest_ptr modulo CACHE_LINE_SIZE; sets the flags for b.eq below
ands tmp2, dest_ptr, CACHE_LINE_SIZE - 1
// if cl_remainder == 0
b.eq L(L2_dc_zva)
// cl_remainder = bytes from dest_ptr up to the next CACHE_LINE_SIZE boundary
sub cl_remainder, tmp1, tmp2
// process remainder until the first CACHE_LINE_SIZE boundary
// p1 = lanes of vector 0 that lie below cl_remainder
whilelo p1.b, xzr, cl_remainder // keep p0.b all true
// p2 = lanes of vector 1 (byte offsets vector_length + i) below cl_remainder
whilelo p2.b, vector_length, cl_remainder
// taken when the last lane of p2 is active, i.e. cl_remainder is at least
// 2 * vector_length and a third vector may be needed
b.last 1f
// remainder fits in two vectors: predicated copy of vectors 0 and 1
ld1b z1.b, p1/z, [src_ptr, #0, mul vl]
ld1b z2.b, p2/z, [src_ptr, #1, mul vl]
st1b z1.b, p1, [dest_ptr, #0, mul vl]
st1b z2.b, p2, [dest_ptr, #1, mul vl]
b 2f
// remainder reaches into vector 2: p3 = lanes of vector 2 below cl_remainder
1: lsl tmp1, vector_length, 1 // vector_length * 2
whilelo p3.b, tmp1, cl_remainder
ld1b z1.b, p1/z, [src_ptr, #0, mul vl]
ld1b z2.b, p2/z, [src_ptr, #1, mul vl]
ld1b z3.b, p3/z, [src_ptr, #2, mul vl]
st1b z1.b, p1, [dest_ptr, #0, mul vl]
st1b z2.b, p2, [dest_ptr, #1, mul vl]
st1b z3.b, p3, [dest_ptr, #2, mul vl]
// NOTE(review): neither z4 nor p4 is loaded or generated in the visible
// lines of this path, so this store looks unintended - confirm against the
// full pre-commit file.
st1b z4.b, p4, [dest_ptr, #3, mul vl]
// both paths rejoin: step pointers past the copied bytes, shrink rest
2: add dest_ptr, dest_ptr, cl_remainder
add src_ptr, src_ptr, cl_remainder
sub rest, rest, cl_remainder

// pre-commit label: target of the "already aligned" branch above
L(L2_dc_zva):
// unroll
// ld1b_unroll8 is a macro defined outside this view; presumably it issues
// eight ld1b loads - TODO confirm against the macro definition
ld1b_unroll8
// advance the source pointer by two cache lines
add src_ptr, src_ptr, CACHE_LINE_SIZE * 2
Expand Down

0 comments on commit b1f16f3

Please sign in to comment.