Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/Float8s.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,8 @@ module Float8s
export Float8, Float8_4, NaN8, Inf8, NaN8_4, Inf8_4

include("float8.jl")
include("float8_to_float32.jl")
include("float32_to_float8.jl")
include("float32_to_float8_old.jl")

end
168 changes: 168 additions & 0 deletions src/float32_to_float8.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# written by Jeffrey Sarnoff, Feb 2020.
# the constants
# ---------------

#=
One table is split into subsections of 16 entries each.
This keeps the scan time minimal, even after accounting
for the conditionals that select the proper subsection.

Each entry is a Float8 value offset halfway to the next value;
this lets a first-match scan (`findfirst`, `firstof16lte`) return
an appropriately rounded value.

The `F8offsetN` tuples are used with normal values,
values that are finite and are not subnormal. Values
that round to zero are handled before these are used.
=#

# Threshold tables for the normal range, 16 Float32 entries each.
# `normal8` picks one table by magnitude and `firstof16lte` returns the
# position of the first entry that is >= the magnitude being converted.
# Each table is exactly twice the one before it.

# 0.2578125 … 0.4921875, spacing 1/64
const F8offset1 = (
    0.2578125f0, 0.2734375f0, 0.2890625f0, 0.3046875f0,
    0.3203125f0, 0.3359375f0, 0.3515625f0, 0.3671875f0,
    0.3828125f0, 0.3984375f0, 0.4140625f0, 0.4296875f0,
    0.4453125f0, 0.4609375f0, 0.4765625f0, 0.4921875f0,
)

# 0.515625 … 0.984375, spacing 1/32
const F8offset2 = (
    0.515625f0, 0.546875f0, 0.578125f0, 0.609375f0,
    0.640625f0, 0.671875f0, 0.703125f0, 0.734375f0,
    0.765625f0, 0.796875f0, 0.828125f0, 0.859375f0,
    0.890625f0, 0.921875f0, 0.953125f0, 0.984375f0,
)

# 1.03125 … 1.96875, spacing 1/16
const F8offset3 = (
    1.03125f0, 1.09375f0, 1.15625f0, 1.21875f0,
    1.28125f0, 1.34375f0, 1.40625f0, 1.46875f0,
    1.53125f0, 1.59375f0, 1.65625f0, 1.71875f0,
    1.78125f0, 1.84375f0, 1.90625f0, 1.96875f0,
)

# 2.0625 … 3.9375, spacing 1/8
const F8offset4 = (
    2.0625f0, 2.1875f0, 2.3125f0, 2.4375f0,
    2.5625f0, 2.6875f0, 2.8125f0, 2.9375f0,
    3.0625f0, 3.1875f0, 3.3125f0, 3.4375f0,
    3.5625f0, 3.6875f0, 3.8125f0, 3.9375f0,
)

# 4.125 … 7.875, spacing 1/4
const F8offset5 = (
    4.125f0, 4.375f0, 4.625f0, 4.875f0,
    5.125f0, 5.375f0, 5.625f0, 5.875f0,
    6.125f0, 6.375f0, 6.625f0, 6.875f0,
    7.125f0, 7.375f0, 7.625f0, 7.875f0,
)

# 8.25 … 15.75, spacing 1/2
const F8offset6 = (
    8.25f0, 8.75f0, 9.25f0, 9.75f0,
    10.25f0, 10.75f0, 11.25f0, 11.75f0,
    12.25f0, 12.75f0, 13.25f0, 13.75f0,
    14.25f0, 14.75f0, 15.25f0, 15.75f0,
)
#=
There is one table used with subnormal values.
It is derived from the actual values of each
subnormal quantity, shifted up halfway to the
next subnormal. This lets scanning also round.
An initial (anchor) value is prepended; that value
is half of the smallest subnormal.

A corresponding table of UInt8 values also is used.
=#

# Threshold table for the subnormal range: entry k is (2k - 1)/128, k = 1:16.
# The first entry is the anchor (it equals `roundsto_zero8`); every later entry
# is one subnormal step (1/64) above the previous one. `subnormal8` scans this
# tuple for the first entry that is >= the magnitude being converted.
#
# Entry 12 was previously written as 0.1796825f0, which is not on the (2k - 1)/128
# grid; the correct value is 0.1796875f0 (= 23/128).
const F8offset_subnormal = (
    0.0078125f0, 0.0234375f0, 0.0390625f0, 0.0546875f0,
    0.0703125f0, 0.0859375f0, 0.1015625f0, 0.1171875f0,
    0.1328125f0, 0.1484375f0, 0.1640625f0, 0.1796875f0,
    0.1953125f0, 0.2109375f0, 0.2265625f0, 0.2421875f0,
)

# Bit patterns 0x00 … 0x0f, looked up by `subnormal8` with the position found
# in `F8offset_subnormal`.
const U8subnormal = Tuple(0x00:0x0f)

# some named constants to clarify the source text

# Unsigned bit patterns returned by `toUInt8`; the sign is OR-ed in as 0x80.
# Written in binary so the upper and lower nibbles can be read off directly.
const UZero8     = 0b0000_0000  # 0x00
const UFloatmin8 = 0b0001_0000  # 0x10
const UFloatmax8 = 0b0110_1111  # 0x6f
const UInf8      = 0b0111_0000  # 0x70
const UNaN8      = 0b0111_1000  # 0x78

# Magnitude cut-offs used by `toUInt8` to choose the conversion path.
const roundsto_zero8     = 0.0078125f0  # below this the result is (signed) zero
const roundsto_subnormal = 0.2421875f0  # below this `subnormal8` is used
const roundsto_floatmax8 = 15.25f0      # from here up the result is floatmax or Inf
const floatmaxplus8      = 15.75f0      # floatmax(Float8) + floatmin(Float8); above this, Inf

# the functions
# ----------------

"""
    Float8(x::Float32)

Convert `x` to `Float8` by computing its 8-bit pattern with `toUInt8` and
reinterpreting that byte as a `Float8`.
"""
Float8(x::Float32) = reinterpret(Float8, toUInt8(x))

"""
    toUInt8(x::Float32)

Return the `UInt8` bit pattern that `Float8(x)` reinterprets.

The sign of `x` becomes bit `0x80`. The magnitude is classified in order:
NaN, overflow to infinity, saturation at floatmax, underflow to zero,
subnormal (`subnormal8`), and finally normal (`normal8`).
"""
function toUInt8(x::Float32)
    negative = signbit(x)
    magnitude = abs(x)
    signmask = negative ? 0x80 : 0x00

    isnan(magnitude) && return UNaN8 | signmask

    # Large magnitudes: strictly above `floatmaxplus8` is infinity, the rest of
    # the range from `roundsto_floatmax8` upward saturates at floatmax.
    magnitude > floatmaxplus8 && return UInf8 | signmask
    magnitude >= roundsto_floatmax8 && return UFloatmax8 | signmask

    # Small magnitudes: signed zero, then the subnormal table.
    magnitude < roundsto_zero8 && return UZero8 | signmask
    magnitude < roundsto_subnormal && return subnormal8(negative, magnitude)

    # Clamp into [0.25, 15.5] so that values just under 0.25 land on the first
    # normal table entry.
    return normal8(negative, min(15.5f0, max(0.25f0, magnitude)))
end

# Bit pattern for a magnitude in the subnormal range.
# `s` is the sign (true = negative) and `absx` the magnitude; the caller
# (`toUInt8`) only passes magnitudes below `roundsto_subnormal`, so a
# threshold is always found.
@inline function subnormal8(s::Core.Bool, absx::Float32)
    slot = findfirst(threshold -> threshold >= absx, F8offset_subnormal)
    code = U8subnormal[slot]
    return s ? code | 0x80 : code
end

# Bit pattern for a magnitude in the normal range.
# `s` is the sign (true = negative) and `absx` the magnitude, already clamped
# by `toUInt8`. The magnitude first selects one of the six threshold tables
# (two-level comparison tree), then the position inside that table is added to
# the table's base code.
function normal8(s::Core.Bool, absx::Float32)
    base, table = if absx <= 1.96875f0
        if absx <= 0.4921875f0
            (15, F8offset1)
        elseif absx <= 0.984375f0
            (15 + 16, F8offset2)
        else
            (15 + 32, F8offset3)
        end
    else
        if absx <= 3.9375f0
            (15 + 32 + 16, F8offset4)
        elseif absx <= 7.875f0
            (15 + 64, F8offset5)
        else
            (15 + 64 + 16, F8offset6)
        end
    end

    code = UInt8(base + firstof16lte(absx, table))
    return s ? code | 0x80 : code
end

# Return the position (1..16) of the first of the first 16 entries of
# `haystack` that is >= `needle`. Raises an error if none of them qualifies.
function firstof16lte(needle, haystack)
    pos = 1
    while pos <= 16
        needle <= haystack[pos] && return pos
        pos += 1
    end
    error("should not be reached")
end

# """Old version, slower."""
# function normal8(s::Bool, absx::Float32)
# if absx <= 0.4921875f0
# idx = UInt8(15+findfirst(a->absx <= a, F8offset1))
# return s ? idx|0x80 : idx
# elseif absx <= 0.984375f0
# idx = UInt8(15+16+findfirst(a->absx <= a, F8offset2))
# return s ? idx|0x80 : idx
# elseif absx <= 1.96875f0
# idx = UInt8(15+32+findfirst(a->absx <= a, F8offset3))
# return s ? idx|0x80 : idx
# elseif absx <= 3.9375f0
# idx = UInt8(15+32+16+findfirst(a->absx <= a, F8offset4))
# return s ? idx|0x80 : idx
# elseif absx <= 7.875f0
# idx = UInt8(15+64+findfirst(a->absx <= a, F8offset5))
# return s ? idx|0x80 : idx
# elseif absx <= 15.75f0
# idx = UInt8(15+64+16+findfirst(a->absx <= a, F8offset6))
# return s ? idx|0x80 : idx
# else
# throw(DomainError(absx,"not normal for Float8"))
# end
# end
133 changes: 133 additions & 0 deletions src/float32_to_float8_old.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Float32 -> Float8 algorithm in analogy to
#
# Float32 -> Float16 algorithm from:
# "Fast Half Float Conversion" by Jeroen van der Zijp
# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
#
# With adjustments for round-to-nearest, ties to even.

# Build the two 512-entry UInt8 lookup tables used by the table-driven
# Float32 -> 8-bit float conversion for the format `T`.
#
# Both tables are filled for i = 0:255. Slot `i + 1` receives the entry
# for the non-negative case and slot `i + 257` (`i|0x100+1`) the entry for the
# negative case: zero/infinity negated, normal base values OR-ed with the sign mask.
# `basetable` holds the base bit pattern for that exponent and `shifttable`
# the right-shift later applied to the Float32 significand.
#
# Returns the tuple `(basetable, shifttable)`.
#
# NOTE(review): only `Float8` and `Float8_4` assign `e_limits` and
# `e_shift_subnorm`; for any other `T` the loop below would hit an
# undefined variable.
function create_base_shifttable(::Type{T}) where {T<:AbstractFloat8}

basetable = Vector{UInt8}(undef, 512)
shifttable = Vector{UInt8}(undef, 512)

if T == Float8
# elements derive from
# [1] 2^-6 = Float8(0x01) the smallest representable number (subnormal)
# [2] 2^-2 = Float8(0x10) the first non-subnormal number
# [3] 2^4 = 16 > floatmax(Float8) is the smallest power of two that is larger than floatmax(Float8)

e_limits = [-6,-2,4]

# shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1"
# to the first significand bit
# e_shift_subnorm is 17 for Float8
e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8)-1)+e_limits[2]-1
elseif T == Float8_4

# see above
e_limits = [-9,-6,8]

# shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1"
# to the first significand bit
# e_shift_subnorm is 14 for Float8_4
e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8_4)-1)+e_limits[2]-1
end

# The five branches below partition the unbiased exponent `e` at the three
# `e_limits` and at 128.
for i = 0:255 # all possible exponents for Float32
e = i - 127 # subtract Float32 bias
if e < e_limits[1] # Very small numbers map to +- zero
basetable[i|0x000+1] = zero(T)
basetable[i|0x100+1] = -zero(T)
# NOTE(review): the shift here is derived from the significand width of `T`,
# not of Float32. Presumably the intent is a shift large enough to discard
# every significand bit (as in the Float16 algorithm this is modelled on) —
# confirm against the definition of `n_significant_bits`.
shifttable[i|0x000+1] = n_significant_bits(T)+1
shifttable[i|0x100+1] = n_significant_bits(T)+1
elseif e < e_limits[2] # Small numbers map to denorms
basetable[i|0x000+1] = zero(T)
basetable[i|0x100+1] = -zero(T)
# the shift grows as the exponent gets smaller
shifttable[i|0x000+1] = -e+e_shift_subnorm
shifttable[i|0x100+1] = -e+e_shift_subnorm
elseif e < e_limits[3] # Normal numbers just lose precision
# re-biased exponent placed above the significand bits of `T`
basetable[i|0x000+1] = ((e+bias(T)) << n_significant_bits(T))
basetable[i|0x100+1] = ((e+bias(T)) << n_significant_bits(T)) | sign_mask(T)
shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T)
shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T)
elseif e < 128 # Large numbers map to Infinity
basetable[i|0x000+1] = inf8(T)
basetable[i|0x100+1] = -inf8(T)
# NOTE(review): same shift as the zero branch above — the same question applies.
shifttable[i|0x000+1] = n_significant_bits(T)+1
shifttable[i|0x100+1] = n_significant_bits(T)+1
else # Infinity and NaN's stay Infinity and NaN's
basetable[i|0x000+1] = inf8(T)
basetable[i|0x100+1] = -inf8(T)
shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T)
shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T)
end
end

return basetable, shifttable
end

# Lookup tables, built once when this file is loaded, one pair per 8-bit format.
# `basetable8_4`/`shifttable8_4` are read by `Float8_4(::Float32)` below;
# `basetable8`/`shifttable8` are only referenced by the commented-out
# `Float8(::Float32)` method in this file.
const basetable8, shifttable8 = create_base_shifttable(Float8)
const basetable8_4, shifttable8_4 = create_base_shifttable(Float8_4)

# function Float8(val::Float32)
#
# f = reinterpret(UInt32, val)
#
# if isnan(val) #TODO retain the significant bits for NaN?
# return nan8(Float8)
# end
#
# # exponent as Int64
# i = f >> n_significant_bits(Float32) + 1
# @inbounds sh = shifttable8[i]
# f &= significand_mask(Float32)
#
# # If `val` is subnormal, the tables are set up to force the
# # result to 0, so the significand has an implicit `1` in the
# # cases we care about.
#
# f |= significand_mask(Float32) + 0x1
# @inbounds h = (basetable8[i] + (f >> sh) & significand_mask(Float8)) % UInt8
#
# # rounding
# nextbit = (f >> (sh-1)) & 1
# if nextbit != 0 && (h & exponent_mask(Float8)) != exponent_mask(Float8)
# # Round halfway to even or check lower bits
# if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0
# h += one(UInt8)
# end
# end
# return reinterpret(Float8, h)
# end

"""
    Float8_4(val::Float32)

Convert `val` to `Float8_4` using the `basetable8_4` / `shifttable8_4` lookup
tables, then round to nearest with ties going to the even bit pattern.
NaN inputs return `nan8(Float8_4)`.
"""
function Float8_4(val::Float32)
    #TODO retain the significant bits for NaN?
    isnan(val) && return nan8(Float8_4)

    bits = reinterpret(UInt32, val)

    # Table slot: the bits above the Float32 significand, plus one for
    # 1-based indexing.
    slot = (bits >> n_significant_bits(Float32)) + 1
    @inbounds shift = shifttable8_4[slot]

    # Keep only the significand and set the bit just above it. If `val` is
    # subnormal the tables force the result to 0, so that bit is the implicit
    # `1` in every case that matters.
    mant = (bits & significand_mask(Float32)) | (significand_mask(Float32) + 0x1)

    @inbounds code =
        (basetable8_4[slot] + ((mant >> shift) & significand_mask(Float8_4))) % UInt8

    # Rounding: look at the first bit shifted out; never round a pattern whose
    # exponent field is already all ones.
    halfbit = (mant >> (shift - 1)) & 1
    if halfbit != 0 && (code & exponent_mask(Float8_4)) != exponent_mask(Float8_4)
        # Round up when the result is odd (tie goes to even) or when any lower
        # discarded bit is set (strictly above the halfway point).
        if (code & 1) == 1 || (mant & ((1 << (shift - 1)) - 1)) != 0
            code += one(UInt8)
        end
    end

    return reinterpret(Float8_4, code)
end
Loading