efficient bit rotate functions: ror, rol #11592

StefanKarpinski · 2015-06-05T15:19:09Z

The obvious way to implement bit rotation is this:

ror(x::Int, k::Int) = (x >>> k) | (x << (sizeof(UInt)<<3 - k))

This has a couple of issues, however. First, rotating by more that a word is broken:

julia> ror(13,72)
0

Second, the native code is awful:

julia> @code_native ror(13,4)
    .section    __TEXT,__text,regular,pure_instructions
Filename: none
Source line: 1
    pushq   %rbp
    movq    %rsp, %rbp
Source line: 1
    movslq  %esi, %rax
    cmpq    %rsi, %rax
    jne L75
    movl    $64, %eax
    subq    %rsi, %rax
    movslq  %eax, %rcx
    cmpq    %rax, %rcx
    jne L75
    movb    %sil, %cl
    movq    %rdi, %rdx
    shrq    %cl, %rdx
    xorl    %r8d, %r8d
    cmpl    $63, %esi
    cmovaq  %r8, %rdx
    movb    %al, %cl
    shlq    %cl, %rdi
    cmpl    $63, %eax
    cmovaq  %r8, %rdi
    orq %rdx, %rdi
    movq    %rdi, %rax
    popq    %rbp
    ret
L75:    movabsq $jl_inexact_exception, %rax
    movq    (%rax), %rdi
    movabsq $jl_throw_with_superfluous_argument, %rax
    movl    $1, %esi
    callq   *%rax

Both can be improved with a slightly trickier implementation:

ror(x::Int, k::Int) = (x >>> (0x3f & k)) | (x << (0x3f & -k))

julia> ror(13,72)
936748722493063168

julia> @code_native ror(13,72)
    .section    __TEXT,__text,regular,pure_instructions
Filename: none
Source line: 1
    pushq   %rbp
    movq    %rsp, %rbp
    movq    %rsi, %rcx
Source line: 1
    movq    %rdi, %rax
    shrq    %cl, %rax
    negl    %ecx
    shlq    %cl, %rdi
    orq %rax, %rdi
    movq    %rdi, %rax
    popq    %rbp
    ret

That's way better machine code – but ror is an x86 instruction – this should just boil down to that. Given that LLVM does not expose rotate instructions, what do we have to do here to get this to emit a single x86 instruction?

The text was updated successfully, but these errors were encountered:

JeffBezanson · 2015-06-05T15:22:28Z

What is the trickier implementation?

StefanKarpinski · 2015-06-05T15:36:03Z

Oops, edited to add that.

JeffBezanson · 2015-06-05T15:38:22Z

I was on the edge of my seat there for a minute :)

yuyichao · 2015-06-05T23:15:46Z

#!/usr/bin/julia -f

macro time_func(f, args...)
    args = eval(current_module(), Expr(:tuple, args...))::Tuple
    argnames = Symbol[gensym() for i in 1:length(args)]
    types = map(typeof, args)
    quote
        function wrapper($(argnames...))
            $(Expr(:meta, :noinline))
            $f($(argnames...))
        end
        function timing_wrapper()
            println($f, $types)
            wrapper($(args...))
            gc()
            @time for i in 1:1000000000
                wrapper($(args...))
            end
            gc()
        end
        timing_wrapper()
    end
end

ror1(x::UInt64, k::Int8) = (x >>> (0x3f & k)) | (x << (0x3f & -k))

function ror2(x::UInt64, k::Int8)
    Base.llvmcall("""
                  %3 = tail call i64 asm \"rorq \$1,\$0\", \"=r,{cx},0,~{dirflag},~{fpsr},~{flags}\"(i8 %1, i64 %0)
                  ret i64 %3
                  """, UInt64, Tuple{UInt64, UInt8}, x, k)
end

for i in 1:10
    println("$i: $(hex(ror1(UInt64(1), Int8(i))))")
end

for i in 1:10
    println("$i: $(hex(ror2(UInt64(1), Int8(i))))")
end

code_native(ror1, Tuple{UInt64, Int8})
code_native(ror2, Tuple{UInt64, Int8})

@time_func(ror1, UInt64(1), Int8(10))
@time_func(ror2, UInt64(1), Int8(10))

Output:

1: 8000000000000000
2: 4000000000000000
3: 2000000000000000
4: 1000000000000000
5: 800000000000000
6: 400000000000000
7: 200000000000000
8: 100000000000000
9: 80000000000000
10: 40000000000000
1: 8000000000000000
2: 4000000000000000
3: 2000000000000000
4: 1000000000000000
5: 800000000000000
6: 400000000000000
7: 200000000000000
8: 100000000000000
9: 80000000000000
10: 40000000000000
        .text
Filename: rol.jl
Source line: 0
        pushq   %rbp
        movq    %rsp, %rbp
Source line: 25
        movb    %sil, %cl
        rorq    %cl, %rdi
        movq    %rdi, %rax
        popq    %rbp
        retq
        .text
Filename: rol.jl
Source line: 0
        pushq   %rbp
        movq    %rsp, %rbp
Source line: 28
        movb    %sil, %cl
        rorq    %cl, %rdi
        movq    %rdi, %rax
        popq    %rbp
        retq
ror1(UInt64,Int8)
   2.076 seconds     
ror2(UInt64,Int8)
   2.071 seconds

llvm IR copied from clang -O3 --emit-llvm output
inline assembly with llvmcall works
LLVM 3.6.1 seems to be smart enough to optimize it.

yuyichao · 2015-06-05T23:20:13Z

P.S. And by "copied from clang -O3 --emit-llvm" output I meant the output of the following inline asm.

uint64_t
rotr64(uint64_t x, uint8_t r)
{
    asm("rorq %1,%0" : "+r" (x) : "c" (r));
    return x;
}

Which is in term adapted from here

ScottPJones · 2015-06-06T05:53:55Z

So, you add this for Intel platforms, and fall back to the old for ARM, Power, etc.? (or get somebody to figure out the correct inline asm for those platforms)? 👍 For my bit twiddling, this would be very nice, esp. since I'd no longer have to deal with inline asm for each platform myself.

yuyichao · 2015-06-06T14:25:15Z

Well, IMHO, although the inline assembly works, it should not be done in julia for most of the case.

In general, it should really be the job of LLVM to emit the best assembly and it already can in recent versions. Inline assembly can probably be used to do something that directly addresses some special hardware features but probably not for general purpose functions and especially not this one since llvm can already do it.

ScottPJones · 2015-06-06T16:50:00Z

Ok, I thought people had said that llvm didn't support it... It does as of what version?

yuyichao · 2015-06-06T16:51:59Z

It doesn't support it as an llvm instruction but the x86_64 backend can recognize the code and emit rol instruction. As mentioned above, this is llvm 3.6

ScottPJones · 2015-06-06T17:51:06Z

@yuyichao, you said:

LLVM 3.6.1 seems to be smart enough to optimize it.

I asked as of what version it was supported? (i.e. the earliest version, not what you used). Is that supported in 3.3?

yuyichao · 2015-06-06T17:55:53Z

I suppose @StefanKarpinski was on 3.3 and as I just checked it doesn't optmize the function to rol. But inline assembly doesn't seems to be supported by the old JIT anyway so it won't work.

StefanKarpinski · 2015-06-06T19:57:39Z

cc: @VicDrastik

yuyichao · 2015-10-24T14:49:30Z

Just an update.

I happened to notice that LLVM (3.7) does not emit rorq anymore with the pure julia version with FastISel, i.e. current master (disabling FastISel makes it emit rorq instruction again).

The difference in performance on my Haswell laptop is ~10%. Not sure if this is good enough...

c.c. @Keno

eschnett · 2016-01-04T18:49:07Z

Isn't there an -O flag for Julia that should make speed/performance tradeoffs such as calling FastISel or not?

Mostly because Julia does not create optimal code when doing cyclic bit shifts, see JuliaLang/julia#11592 and JuliaLang/julia#19923

mbauman · 2018-10-15T17:55:04Z

Now that we've deprecated the old ror and rol meanings, and now that it appears that LLVM is happy generating the x86 intrinsics with 1.0, is it worth exposing these bitwise operations as first-class Julia functions?

julia> ror(x::Int, k::Int) = (x >>> (0x3f & k)) | (x << (0x3f & -k))
ror (generic function with 1 method)

julia> @code_native ror(rand(Int), 3)
	.section	__TEXT,__text,regular,pure_instructions
; Function ror {
; Location: REPL[1]:1
; Function |; {
; Location: REPL[1]:1
	movl	%esi, %ecx
	decl	%eax
	rorl	%cl, %edi
;}
	decl	%eax
	movl	%edi, %eax
	retl
	nopl	(%eax)
;}

StefanKarpinski · 2018-10-15T19:13:41Z

Yes, I think we should certainly expose these. That still seems like a lot of instructions for a simple ror operation. Are the movl and decl instructions part of the function preamble and postamble these days?

KristofferC · 2018-10-15T19:50:35Z

I get

julia> @code_native ror(rand(Int), 3)
	.section	__TEXT,__text,regular,pure_instructions
; Function ror {
; Location: REPL[11]:1
; Function |; {
; Location: REPL[11]:1
	movl	%esi, %ecx
	rorq	%cl, %rdi
;}
	movq	%rdi, %rax
	retq
	nopl	(%rax)
;}

Keno · 2018-10-15T19:52:17Z

Looks like @mbauman is on a 32bit machine?

mbauman · 2018-10-15T19:53:55Z

Nope, that's the official 1.0.0 binary on my Mac (an old westmere/nehalem system). On master I see the same as you, @KristofferC.

KristofferC · 2018-10-15T19:54:58Z

Yep, same as you with 1.0.1 binary for me.

KristofferC · 2018-10-15T19:58:19Z

Also get the decl, rorl, decl calls with nightly, so perhaps a difference between source build and binary.

Keno · 2018-10-15T19:59:34Z

rorl only operates on the low 32bits though, so something is odd about that.

eschnett · 2018-10-15T20:32:31Z

The whole code fragment operates on 32-bit values, so it's self-consistent.

maxbennedich · 2018-12-29T05:01:52Z

Since you are on Mac, I presume that you're seeing #28046

KristofferC · 2019-03-23T11:46:43Z

1.1.0 release shows the correct output now (#28046)

maxbennedich · 2019-03-23T17:52:42Z

True, so we can now generate efficient native code, and also have it disassemble correctly on Macs, but shouldn't this issue also include exposing ror and rol as first class Julia functions (as per @mbauman's post above)? I think non-expert users may have a hard time figuring out how to generate efficient code, even if they find this issue (example from discourse).

KristofferC · 2019-03-23T19:43:39Z

Seems like a good idea.

JeffreySarnoff · 2019-07-03T21:36:58Z

bump

StefanKarpinski · 2019-07-05T18:23:06Z

Someone just needs to make a PR defining these, adding some tests and NEWS.

goggle · 2019-07-19T05:40:32Z

Should these be implemented for all different integer types (Int8, Int16, Int32, Int64, Int128)? Also for the unsigned integer types?

StefanKarpinski · 2019-07-19T22:06:01Z

At least the unsigned ones. I'm not sure what the right definition of ror and rol for signed types is except if you want to just rotate them as if they were unsigned, i.e. cast, rotate, cast back.

JeffreySarnoff · 2019-07-20T18:37:59Z

I had this in my files: (I use rotate with signed ints)

for (T,K) in ((Int128, 0x7f), (Int64, 0x3f), (Int32, 0x1f), (Int16, 0x0f), (Int8, 0x07),
              (UInt128, 0x7f), (UInt64, 0x3f), (UInt32, 0x1f), (UInt16, 0x0f), (UInt8, 0x07))
  @eval begin
    ror(x::$T, k::I) where {I<:Integer} = (x >>> ($K & k)) | (x <<  ($K & -k))
    rol(x::$T, k::I) where {I<:Integer} = (x <<  ($K & k)) | (x >>> ($K & -k))
  end
end

StefanKarpinski · 2019-07-21T01:47:52Z

Nice. The colons before the UInt8 literals are unnecessary and they could be computed from sizeof(T) but that's a nice generic implementation. Having thought a little, I do think that rotating the raw bits of signed integers is the only reasonable thing to do.

JeffreySarnoff · 2019-07-21T02:24:48Z

I removed the colons. I do not see how to use sizeof(T) without adding bloat to the functions.

StefanKarpinski · 2019-07-21T02:36:43Z

for T in Base.BitInteger_types
    mask = UInt8(sizeof(T) << 3 - 1)
    @eval begin
        ror(x::$T, k::Integer) = (x >>> ($mask & k)) | (x <<  ($mask & -k))
        rol(x::$T, k::Integer) = (x <<  ($mask & k)) | (x >>> ($mask & -k))
    end
end

JeffreySarnoff · 2019-07-21T03:20:53Z

elegant

mbauman · 2019-11-26T06:54:41Z

So here's kooky idea. We don't have great words for ror and rol. Nor do we have great words for popcnt/count_ones or tzcnt/trailing_zeros and friends. BUT we do have great words for them if we were to treat the bits as elements in an array.

Perhaps this is too clever by half, but I like the idea of exposing this through a special Bits struct:

struct Bits{T <: Integer} <: AbstractVector{Bool}
    data::T
end
Base.size(::Bits{T}) where {T} = (sizeof(T)*8,)
Base.getindex(b::Bits, i::Int) = b.data & (1 << (i-1)) != 0

This is likely more useful than bitstring, and it can provide the relevant specializations for these hard-to-name bit-twiddling optimizations:

old	alternative
`ror(x, k)`	`Integer(circshift(Bits(x), k))`
`count_ones(x)`	`count(Bits(x))`
`count_zeros(x)`	`count(!, Bits(x))`
`leading_zeros(x)`	`sizeof(x)*8 - findlast(Bits(x))`
`leading_ones(x)`	`sizeof(x)*8 - findlast(!, Bits(x))`
`trailing_zeros(x)`	`findfirst(Bits(x)) - 1`
`trailing_ones(x)`	`findfirst(!, Bits(x)) - 1`

Introduce one new name, get rid of 6 of the 37 remaining Base exports that are multi_word_with_underscores. Alright, so those leading_* guys look awful, but most of the time you actually want just the findlast(Bits(x)) index. Perhaps the axes should be 0:sizeof(T)-1, but then we still need a nice name for the function that does the sum(x.*2.^axes(x, 1)) integer reconstitution. It's not really an integer conversion, and even a constructor feels weird — would it take all vectors of bool values? Note that Bits(::BigInt) could generate the appropriate BitArray (and back, however we do that).

The biggest downside is that when you're doing bit-twiddling, you pretty much always want to know that you're using those strangely-named intrinsics. This makes them look like any other array function (because they are).

rfourquet · 2019-11-26T08:28:39Z

I like the idea of exposing this through a special Bits struct

For the record, this is exactly what the Bits package does. It needs a bit more love (I ported a small C++ library of mine, which I cleaned up partially in the package, but stopped after getting the bits I needed at that time), for example to specialize some array methods like count. My big question was indeed what the axes should be. It's currently 1:n, so in a way it's really like an (immutable) BitVector but with only one word.

The dual of Bits(x) would be something like Support(x), which is like a BitSet with only one word, which contains the indexes of ones in x (it's not published yet in the library).

StefanKarpinski added the performance Must go faster label Jun 5, 2015

StefanKarpinski mentioned this issue Jan 4, 2016

Negative shift counts could be handled better #14516

Closed

yuyichao mentioned this issue Apr 21, 2016

Disable fastisel for -O3 #15984

Merged

yuyichao mentioned this issue Jan 11, 2017

consolidate ror/rol and circshift #19923

Closed

mdavezac pushed a commit to CryptaLabs/ExtractRandom.jl that referenced this issue May 30, 2017

Some c++ vs julia exploration

1e966a8

Mostly because Julia does not create optimal code when doing cyclic bit shifts, see JuliaLang/julia#11592 and JuliaLang/julia#19923

KristofferC closed this as completed Mar 23, 2019

KristofferC reopened this Mar 23, 2019

StefanKarpinski added kind:good first issue Indicates a good issue for first-time contributors to Julia status:help wanted Indicates that a maintainer wants help on an issue or pull request and removed performance Must go faster labels Jul 5, 2019

anaveragehuman mentioned this issue Nov 25, 2019

Add ror and rol functions for rotating bits #33937

Merged

jakobnissen added a commit to jakobnissen/julia that referenced this issue Nov 25, 2019

Added rol/ror functions (issue JuliaLang#11592)

a28d660

jakobnissen mentioned this issue Nov 25, 2019

Added rol/ror functions to Base (issue #11592) #33938

Closed

StefanKarpinski closed this as completed in #33937 Feb 21, 2020

StefanKarpinski pushed a commit that referenced this issue Feb 21, 2020

Add bitrotate function (issue #11592) (#33937)

eac7411

birm pushed a commit to birm/julia that referenced this issue Feb 22, 2020

Add bitrotate function (issue JuliaLang#11592) (JuliaLang#33937)

c0be566

KristofferC pushed a commit that referenced this issue Apr 11, 2020

Add bitrotate function (issue #11592) (#33937)

81623fa

efficient bit rotate functions: ror, rol #11592

efficient bit rotate functions: ror, rol #11592

Comments

StefanKarpinski commented Jun 5, 2015

JeffBezanson commented Jun 5, 2015

StefanKarpinski commented Jun 5, 2015

JeffBezanson commented Jun 5, 2015

yuyichao commented Jun 5, 2015

yuyichao commented Jun 5, 2015

ScottPJones commented Jun 6, 2015

yuyichao commented Jun 6, 2015

ScottPJones commented Jun 6, 2015

yuyichao commented Jun 6, 2015

ScottPJones commented Jun 6, 2015

yuyichao commented Jun 6, 2015

StefanKarpinski commented Jun 6, 2015

yuyichao commented Oct 24, 2015

eschnett commented Jan 4, 2016

mbauman commented Oct 15, 2018

StefanKarpinski commented Oct 15, 2018

KristofferC commented Oct 15, 2018

Keno commented Oct 15, 2018

mbauman commented Oct 15, 2018 • edited

KristofferC commented Oct 15, 2018 • edited

KristofferC commented Oct 15, 2018 • edited

Keno commented Oct 15, 2018

eschnett commented Oct 15, 2018

maxbennedich commented Dec 29, 2018

KristofferC commented Mar 23, 2019

maxbennedich commented Mar 23, 2019

KristofferC commented Mar 23, 2019

JeffreySarnoff commented Jul 3, 2019

StefanKarpinski commented Jul 5, 2019

goggle commented Jul 19, 2019

StefanKarpinski commented Jul 19, 2019

JeffreySarnoff commented Jul 20, 2019 • edited

StefanKarpinski commented Jul 21, 2019

JeffreySarnoff commented Jul 21, 2019

StefanKarpinski commented Jul 21, 2019

JeffreySarnoff commented Jul 21, 2019

mbauman commented Nov 26, 2019

rfourquet commented Nov 26, 2019

mbauman commented Oct 15, 2018 •

edited

KristofferC commented Oct 15, 2018 •

edited

KristofferC commented Oct 15, 2018 •

edited

JeffreySarnoff commented Jul 20, 2019 •

edited