From 34ca21e01ac549b6848522b4f3f6bb3b5b585674 Mon Sep 17 00:00:00 2001 From: milankl Date: Mon, 16 Mar 2020 21:17:32 +0000 Subject: [PATCH] Jeffrey's Float8(::Float32) --- src/Float8s.jl | 3 + src/float32_to_float8.jl | 168 +++++++++++++++++++++++++ src/float32_to_float8_old.jl | 133 ++++++++++++++++++++ src/float8.jl | 236 ----------------------------------- src/float8_to_float32.jl | 102 +++++++++++++++ test/runtests.jl | 52 ++++---- 6 files changed, 432 insertions(+), 262 deletions(-) create mode 100644 src/float32_to_float8.jl create mode 100644 src/float32_to_float8_old.jl create mode 100644 src/float8_to_float32.jl diff --git a/src/Float8s.jl b/src/Float8s.jl index de74d7a..3c11a15 100644 --- a/src/Float8s.jl +++ b/src/Float8s.jl @@ -14,5 +14,8 @@ module Float8s export Float8, Float8_4, NaN8, Inf8, NaN8_4, Inf8_4 include("float8.jl") + include("float8_to_float32.jl") + include("float32_to_float8.jl") + include("float32_to_float8_old.jl") end diff --git a/src/float32_to_float8.jl b/src/float32_to_float8.jl new file mode 100644 index 0000000..a39bce1 --- /dev/null +++ b/src/float32_to_float8.jl @@ -0,0 +1,168 @@ +# written by Jeffrey Sarnoff, Feb 2020. +# the constants +# --------------- + +#= + One table is split into subsections of 16 entries each. + This keeps the scan time minimal, even after accounting + for the conditionals that select the proper subsection. + + These are Float8 values offset by 1/2 way to the next value, + this lets `findfirst` return an appropriately rounded value. + + The `F8offsetN` tuples are used with normal values, + values that are finite and are not subnormal. Values + that round to zero are handled before these are used. +=# + +const F8offset1 = (Float32[ + 0.2578125, 0.2734375, 0.2890625, 0.3046875, 0.3203125, 0.3359375, + 0.3515625, 0.3671875, 0.3828125, 0.3984375, 0.4140625, 0.4296875, + 0.4453125, 0.4609375, 0.4765625, 0.4921875]...,) + +const F8offset2 = (Float32[ + 0.515625, 0.546875, 0.578125, 0.609375, 0.640625, 0.671875, + 0.703125, 0.734375, 0.765625, 0.796875, 0.828125, 0.859375, + 0.890625, 0.921875, 0.953125, 0.984375]...,); + +const F8offset3 = (Float32[ + 1.03125, 1.09375, 1.15625, 1.21875, 1.28125, 1.34375, 1.40625, + 1.46875, 1.53125, 1.59375, 1.65625, 1.71875, 1.78125, 1.84375, + 1.90625, 1.96875]...,); + +const F8offset4 = (Float32[ + 2.0625, 2.1875, 2.3125, 2.4375, 2.5625, 2.6875, 2.8125, + 2.9375, 3.0625, 3.1875, 3.3125, 3.4375, 3.5625, 3.6875, + 3.8125, 3.9375]...,); + +const F8offset5 = (Float32[ + 4.125, 4.375, 4.625, 4.875, 5.125, 5.375, 5.625, 5.875, 6.125, + 6.375, 6.625, 6.875, 7.125, 7.375, 7.625, 7.875]...,); + +const F8offset6 = (Float32[ + 8.25, 8.75, 9.25, 9.75, 10.25, 10.75, 11.25, 11.75, 12.25, 12.75, + 13.25, 13.75, 14.25, 14.75, 15.25, 15.75]...,); +#= + There is one table used with subnormal values. + It is derived from the actual values of each + subnormal quantity, shifted up halfway to the + next subnormal. This lets scanning also round. + An initial (anchor) value is prepended, that value + is half of the smallest subnormal. + + A corresponding table of UInt8 values also is used. +=# + +const F8offset_subnormal = ( + 0.0078125f0, 0.0234375f0, 0.0390625f0, 0.0546875f0, 0.0703125f0, + 0.0859375f0, 0.1015625f0, 0.1171875f0, 0.1328125f0, 0.1484375f0, + 0.1640625f0, 0.1796825f0, 0.1953125f0, 0.2109375f0, 0.2265625f0, + 0.2421875f0) + +const U8subnormal = (collect(UInt8.(0:15))...,) + +# some named constants to clarify the source text + +const roundsto_floatmax8 = 15.25f0 +const roundsto_zero8 = 0.0078125f0 +const roundsto_subnormal = 0.2421875f0 +const floatmaxplus8 = 15.75f0 # floatmax(Float8) + floatmin(Float8) + +const UNaN8 = 0x78 +const UInf8 = 0x70 +const UFloatmax8 = 0x6f +const UFloatmin8 = 0x10 +const UZero8 = 0x00 + +# the functions +# ---------------- + +function Float8(x::Float32) + # s, absx = signbit(x), abs(x) + ui = toUInt8(x) + return reinterpret(Float8, ui) +end + +function toUInt8(x::Float32) + s, absx = signbit(x), abs(x) + isnan(absx) && return s ? UNaN8|0x80 : UNaN8 + if absx >= roundsto_floatmax8 + if absx > floatmaxplus8 + return s ? UInf8|0x80 : UInf8 + else + return s ? UFloatmax8|0x80 : UFloatmax8 + end + end + if absx < roundsto_zero8 + return s ? UZero8|0x80 : UZero8 + elseif absx < roundsto_subnormal + return subnormal8(s, absx) + end + absx = min(15.5f0, max(0.25f0, absx)) + return normal8(s, absx) +end + +@inline function subnormal8(s::Core.Bool, absx::Float32) + idx = findfirst(a->absx <= a, F8offset_subnormal) + return s ? U8subnormal[idx] | 0x80 : U8subnormal[idx] +end + +function normal8(s::Core.Bool, absx::Float32) + if absx <= 1.96875f0 + if absx <= 0.4921875f0 + idx = UInt8(15+firstof16lte(absx, F8offset1)) + return s ? idx|0x80 : idx + elseif absx <= 0.984375f0 + idx = UInt8(15+16+firstof16lte(absx, F8offset2)) + return s ? idx|0x80 : idx + else + idx = UInt8(15+32+firstof16lte(absx, F8offset3)) + return s ? idx|0x80 : idx + end + else + if absx <= 3.9375f0 + idx = UInt8(15+32+16+firstof16lte(absx, F8offset4)) + return s ? idx|0x80 : idx + elseif absx <= 7.875f0 + idx = UInt8(15+64+firstof16lte(absx, F8offset5)) + return s ? idx|0x80 : idx + else + idx = UInt8(15+64+16+firstof16lte(absx, F8offset6)) + return s ? idx|0x80 : idx + end + end +end + +function firstof16lte(needle, haystack) + for idx = 1:16 + if needle <= haystack[idx] + return idx + end + end + error("should not be reached") +end + +# """Old version, slower.""" +# function normal8(s::Bool, absx::Float32) +# if absx <= 0.4921875f0 +# idx = UInt8(15+findfirst(a->absx <= a, F8offset1)) +# return s ? idx|0x80 : idx +# elseif absx <= 0.984375f0 +# idx = UInt8(15+16+findfirst(a->absx <= a, F8offset2)) +# return s ? idx|0x80 : idx +# elseif absx <= 1.96875f0 +# idx = UInt8(15+32+findfirst(a->absx <= a, F8offset3)) +# return s ? idx|0x80 : idx +# elseif absx <= 3.9375f0 +# idx = UInt8(15+32+16+findfirst(a->absx <= a, F8offset4)) +# return s ? idx|0x80 : idx +# elseif absx <= 7.875f0 +# idx = UInt8(15+64+findfirst(a->absx <= a, F8offset5)) +# return s ? idx|0x80 : idx +# elseif absx <= 15.75f0 +# idx = UInt8(15+64+16+findfirst(a->absx <= a, F8offset6)) +# return s ? idx|0x80 : idx +# else +# throw(DomainError(absx,"not normal for Float8")) +# end +# end diff --git a/src/float32_to_float8_old.jl b/src/float32_to_float8_old.jl new file mode 100644 index 0000000..1d4ba4b --- /dev/null +++ b/src/float32_to_float8_old.jl @@ -0,0 +1,133 @@ +# Float32 -> Float8 algorithm in analogy to +# +# Float32 -> Float16 algorithm from: +# "Fast Half Float Conversion" by Jeroen van der Zijp +# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf +# +# With adjustments for round-to-nearest, ties to even. + +function create_base_shifttable(::Type{T}) where {T<:AbstractFloat8} + + basetable = Vector{UInt8}(undef, 512) + shifttable = Vector{UInt8}(undef, 512) + + if T == Float8 + # elements derive from + # [1] 2^-6 = Float8(0x01) the smallest representable number (subnormal) + # [2] 2^-2 = Float8(0x10) the first non-subnormal number + # [3] 2^4 = 16 > floatmax(Float8) is the smallest power of two that is larger than floatmax(Float8) + + e_limits = [-6,-2,4] + + # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1" + # to the first significand bit + # e_shift_subnorm is 17 for Float8 + e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8)-1)+e_limits[2]-1 + elseif T == Float8_4 + + # see above + e_limits = [-9,-6,8] + + # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1" + # to the first significand bit + # e_shift_subnorm is 14 for Float8_4 + e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8_4)-1)+e_limits[2]-1 + end + + for i = 0:255 # all possible exponents for Float32 + e = i - 127 # subtract Float32 bias + if e < e_limits[1] # Very small numbers map to +- zero + basetable[i|0x000+1] = zero(T) + basetable[i|0x100+1] = -zero(T) + shifttable[i|0x000+1] = n_significant_bits(T)+1 + shifttable[i|0x100+1] = n_significant_bits(T)+1 + elseif e < e_limits[2] # Small numbers map to denorms + basetable[i|0x000+1] = zero(T) + basetable[i|0x100+1] = -zero(T) + shifttable[i|0x000+1] = -e+e_shift_subnorm + shifttable[i|0x100+1] = -e+e_shift_subnorm + elseif e < e_limits[3] # Normal numbers just lose precision + basetable[i|0x000+1] = ((e+bias(T)) << n_significant_bits(T)) + basetable[i|0x100+1] = ((e+bias(T)) << n_significant_bits(T)) | sign_mask(T) + shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T) + shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T) + elseif e < 128 # Large numbers map to Infinity + basetable[i|0x000+1] = inf8(T) + basetable[i|0x100+1] = -inf8(T) + shifttable[i|0x000+1] = n_significant_bits(T)+1 + shifttable[i|0x100+1] = n_significant_bits(T)+1 + else # Infinity and NaN's stay Infinity and NaN's + basetable[i|0x000+1] = inf8(T) + basetable[i|0x100+1] = -inf8(T) + shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T) + shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T) + end + end + + return basetable, shifttable +end + +const basetable8, shifttable8 = create_base_shifttable(Float8) +const basetable8_4, shifttable8_4 = create_base_shifttable(Float8_4) + +# function Float8(val::Float32) +# +# f = reinterpret(UInt32, val) +# +# if isnan(val) #TODO retain the significant bits for NaN? +# return nan8(Float8) +# end +# +# # exponent as Int64 +# i = f >> n_significant_bits(Float32) + 1 +# @inbounds sh = shifttable8[i] +# f &= significand_mask(Float32) +# +# # If `val` is subnormal, the tables are set up to force the +# # result to 0, so the significand has an implicit `1` in the +# # cases we care about. +# +# f |= significand_mask(Float32) + 0x1 +# @inbounds h = (basetable8[i] + (f >> sh) & significand_mask(Float8)) % UInt8 +# +# # rounding +# nextbit = (f >> (sh-1)) & 1 +# if nextbit != 0 && (h & exponent_mask(Float8)) != exponent_mask(Float8) +# # Round halfway to even or check lower bits +# if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0 +# h += one(UInt8) +# end +# end +# return reinterpret(Float8, h) +# end + +function Float8_4(val::Float32) + + f = reinterpret(UInt32, val) + + if isnan(val) #TODO retain the significant bits for NaN? + return nan8(Float8_4) + end + + # exponent as Int64 + i = f >> n_significant_bits(Float32) + 1 + @inbounds sh = shifttable8_4[i] + f &= significand_mask(Float32) + + # If `val` is subnormal, the tables are set up to force the + # result to 0, so the significand has an implicit `1` in the + # cases we care about. + + f |= significand_mask(Float32) + 0x1 + @inbounds h = (basetable8_4[i] + (f >> sh) & significand_mask(Float8_4)) % UInt8 + + # rounding + nextbit = (f >> (sh-1)) & 1 + if nextbit != 0 && (h & exponent_mask(Float8_4)) != exponent_mask(Float8_4) + # Round halfway to even or check lower bits + if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0 + h += one(UInt8) + end + end + return reinterpret(Float8_4, h) +end diff --git a/src/float8.jl b/src/float8.jl index 517dc7d..994787e 100644 --- a/src/float8.jl +++ b/src/float8.jl @@ -89,140 +89,6 @@ function sign(x::T) where {T<:AbstractFloat8} end end -# Float32 -> Float8 algorithm in analogy to -# -# Float32 -> Float16 algorithm from: -# "Fast Half Float Conversion" by Jeroen van der Zijp -# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf -# -# With adjustments for round-to-nearest, ties to even. - -function create_base_shifttable(::Type{T}) where {T<:AbstractFloat8} - - basetable = Vector{UInt8}(undef, 512) - shifttable = Vector{UInt8}(undef, 512) - - if T == Float8 - # elements derive from - # [1] 2^-6 = Float8(0x01) the smallest representable number (subnormal) - # [2] 2^-2 = Float8(0x10) the first non-subnormal number - # [3] 2^4 = 16 > floatmax(Float8) is the smallest power of two that is larger than floatmax(Float8) - - e_limits = [-6,-2,4] - - # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1" - # to the first significand bit - # e_shift_subnorm is 17 for Float8 - e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8)-1)+e_limits[2]-1 - elseif T == Float8_4 - - # see above - e_limits = [-9,-6,8] - - # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1" - # to the first significand bit - # e_shift_subnorm is 14 for Float8_4 - e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8_4)-1)+e_limits[2]-1 - end - - for i = 0:255 # all possible exponents for Float32 - e = i - 127 # subtract Float32 bias - if e < e_limits[1] # Very small numbers map to +- zero - basetable[i|0x000+1] = zero(T) - basetable[i|0x100+1] = -zero(T) - shifttable[i|0x000+1] = n_significant_bits(T)+1 - shifttable[i|0x100+1] = n_significant_bits(T)+1 - elseif e < e_limits[2] # Small numbers map to denorms - basetable[i|0x000+1] = zero(T) - basetable[i|0x100+1] = -zero(T) - shifttable[i|0x000+1] = -e+e_shift_subnorm - shifttable[i|0x100+1] = -e+e_shift_subnorm - elseif e < e_limits[3] # Normal numbers just lose precision - basetable[i|0x000+1] = ((e+bias(T)) << n_significant_bits(T)) - basetable[i|0x100+1] = ((e+bias(T)) << n_significant_bits(T)) | sign_mask(T) - shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T) - shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T) - elseif e < 128 # Large numbers map to Infinity - basetable[i|0x000+1] = inf8(T) - basetable[i|0x100+1] = -inf8(T) - shifttable[i|0x000+1] = n_significant_bits(T)+1 - shifttable[i|0x100+1] = n_significant_bits(T)+1 - else # Infinity and NaN's stay Infinity and NaN's - basetable[i|0x000+1] = inf8(T) - basetable[i|0x100+1] = -inf8(T) - shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T) - shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T) - end - end - - return basetable, shifttable -end - -const basetable8, shifttable8 = create_base_shifttable(Float8) -const basetable8_4, shifttable8_4 = create_base_shifttable(Float8_4) - -function Float8(val::Float32) - - f = reinterpret(UInt32, val) - - if isnan(val) #TODO retain the significant bits for NaN? - return nan8(Float8) - end - - # exponent as Int64 - i = f >> n_significant_bits(Float32) + 1 - @inbounds sh = shifttable8[i] - f &= significand_mask(Float32) - - # If `val` is subnormal, the tables are set up to force the - # result to 0, so the significand has an implicit `1` in the - # cases we care about. - - f |= significand_mask(Float32) + 0x1 - @inbounds h = (basetable8[i] + (f >> sh) & significand_mask(Float8)) % UInt8 - - # rounding - nextbit = (f >> (sh-1)) & 1 - if nextbit != 0 && (h & exponent_mask(Float8)) != exponent_mask(Float8) - # Round halfway to even or check lower bits - if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0 - h += one(UInt8) - end - end - return reinterpret(Float8, h) -end - -function Float8_4(val::Float32) - - f = reinterpret(UInt32, val) - - if isnan(val) #TODO retain the significant bits for NaN? - return nan8(Float8_4) - end - - # exponent as Int64 - i = f >> n_significant_bits(Float32) + 1 - @inbounds sh = shifttable8_4[i] - f &= significand_mask(Float32) - - # If `val` is subnormal, the tables are set up to force the - # result to 0, so the significand has an implicit `1` in the - # cases we care about. - - f |= significand_mask(Float32) + 0x1 - @inbounds h = (basetable8_4[i] + (f >> sh) & significand_mask(Float8_4)) % UInt8 - - # rounding - nextbit = (f >> (sh-1)) & 1 - if nextbit != 0 && (h & exponent_mask(Float8_4)) != exponent_mask(Float8_4) - # Round halfway to even or check lower bits - if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0 - h += one(UInt8) - end - end - return reinterpret(Float8_4, h) -end - first_sig_bit_mask(::Type{Float8}) = 0x00000008 first_sig_bit_mask(::Type{Float8_4}) = 0x00000004 @@ -235,108 +101,6 @@ bias_difference(::Type{Float8_4}) = 0x00000078 # = 120, 127 for Float32 min exp_bits_all_one(::Type{Float8}) = 0x00000007 exp_bits_all_one(::Type{Float8_4}) = 0x0000000f -# function Float32(val::T) where {T<:AbstractFloat8} -# -# ival = reinterpret(UInt8, val) -# -# # seperate into sign, exponent, significand -# sign = UInt32((ival & sign_mask(T)) >> 7) -# exp = UInt32((ival & exponent_mask(T)) >> n_significant_bits(T)) -# sig = UInt32(ival & significand_mask(T)) -# -# if exp == zero(UInt32) -# if sig == zero(UInt32) # +-0 case -# return reinterpret(Float32,sign << 31) -# else # subnormals -# n_bit = 1 -# # first significand bit set to one, else zero, to check for size of subnormal -# bit = first_sig_bit_mask(T) -# while (bit & sig) == 0 -# n_bit = n_bit + 1 -# bit = bit >> 1 -# end -# sign = sign << 31 -# -# # bias = 2^(n_exp-1) - 1, i.e. 127 for Float32, 15 for Float16, 3 for Float8, 7 for Float8_4 -# # difference in bias + 1 has to be added, e.g. 127-3 = 124 = 0x0000007c -# -# exp = ((bias_difference(T)+1 - n_bit) << 23) % UInt32 -# sig = ((sig & (~bit)) << n_bit) << sig_bit_shift(T) -# ret = sign | exp | sig -# reinterpret(Float32,ret) -# end -# elseif exp == exp_bits_all_one(T) # Inf/NaN case -# if sig == zero(UInt32) # Infinity -# if sign == zero(UInt32) -# return Inf32 -# else -# return -Inf32 -# end -# else # NaN, preserve sign and significand (first sig bit always 1) -# # NaN32 == reinterpret(Flaot32,0x7fc00000) -# ret = 0x7fc00000 | (sign<<31) | (sig<> 7) +# exp = UInt32((ival & exponent_mask(T)) >> n_significant_bits(T)) +# sig = UInt32(ival & significand_mask(T)) +# +# if exp == zero(UInt32) +# if sig == zero(UInt32) # +-0 case +# return reinterpret(Float32,sign << 31) +# else # subnormals +# n_bit = 1 +# # first significand bit set to one, else zero, to check for size of subnormal +# bit = first_sig_bit_mask(T) +# while (bit & sig) == 0 +# n_bit = n_bit + 1 +# bit = bit >> 1 +# end +# sign = sign << 31 +# +# # bias = 2^(n_exp-1) - 1, i.e. 127 for Float32, 15 for Float16, 3 for Float8, 7 for Float8_4 +# # difference in bias + 1 has to be added, e.g. 127-3 = 124 = 0x0000007c +# +# exp = ((bias_difference(T)+1 - n_bit) << 23) % UInt32 +# sig = ((sig & (~bit)) << n_bit) << sig_bit_shift(T) +# ret = sign | exp | sig +# reinterpret(Float32,ret) +# end +# elseif exp == exp_bits_all_one(T) # Inf/NaN case +# if sig == zero(UInt32) # Infinity +# if sign == zero(UInt32) +# return Inf32 +# else +# return -Inf32 +# end +# else # NaN, preserve sign and significand (first sig bit always 1) +# # NaN32 == reinterpret(Flaot32,0x7fc00000) +# ret = 0x7fc00000 | (sign<<31) | (sig< Float8 subnormals ordered?" begin -# -# N = 100 -# -# fs = Float32(floatmin(Float8))*rand(Float32,N) -# sort!(fs) -# -# f8s = Float8.(fs) -# @testset for i in 1:N-1 -# @test f8s[i+1] >= f8s[i] -# end -# -# f8s = Float8.(-fs) -# @testset for i in 1:N-1 -# @test f8s[i+1] <= f8s[i] -# end -# end +@testset "Float32 -> Float8 subnormals ordered?" begin + N = 100 -@testset "Conversion Float8 <-> Float32" begin + fs = Float32(floatmin(Float8))*rand(Float32,N) + sort!(fs) - @testset for i in 0x00:0xff - if ~isnan(Float8(i)) - @test i == reinterpret(UInt8,Float8(Float32(Float8(i)))) - end + f8s = Float8.(fs) + @testset for i in 1:N-1 + @test f8s[i+1] >= f8s[i] + end + + f8s = Float8.(-fs) + @testset for i in 1:N-1 + @test f8s[i+1] <= f8s[i] end end -@testset "Conversion Float8_4 <-> Float32" begin + +@testset "Conversion Float8 <-> Float32" begin @testset for i in 0x00:0xff - if ~isnan(Float8_4(i)) - @test i == reinterpret(UInt8,Float8_4(Float32(Float8_4(i)))) + if ~isnan(Float8(i)) + @test i == reinterpret(UInt8,Float8(Float32(Float8(i)))) end end end +# Currently not implemented +# @testset "Conversion Float8_4 <-> Float32" begin +# +# @testset for i in 0x00:0xff +# if ~isnan(Float8_4(i)) +# @test i == reinterpret(UInt8,Float8_4(Float32(Float8_4(i)))) +# end +# end +# end + @testset "Negation" begin @testset for i in 0x00:0xff