JuliaMath · milankl · Mar 16, 2020 · Mar 16, 2020
diff --git a/src/Float8s.jl b/src/Float8s.jl
@@ -14,5 +14,8 @@ module Float8s
     export Float8, Float8_4, NaN8, Inf8, NaN8_4, Inf8_4
 
     include("float8.jl")
+    include("float8_to_float32.jl")
+    include("float32_to_float8.jl")
+    include("float32_to_float8_old.jl")
 
 end
diff --git a/src/float32_to_float8.jl b/src/float32_to_float8.jl
@@ -0,0 +1,168 @@
+# written by Jeffrey Sarnoff, Feb 2020.
+# the constants
+# ---------------
+
+#=
+   One table is split into subsections of 16 entries each.
+   This keeps the scan time minimal, even after accounting
+   for the conditionals that select the proper subsection.
+
+   These are Float8 values offset by 1/2 way to the next value,
+   this lets `findfirst` return an appropriately rounded value.
+
+   The `F8offsetN` tuples are used with normal values,
+   values that are finite and are not subnormal. Values
+   that round to zero are handled before these are used.
+=#
+
+const F8offset1 = (Float32[
+   0.2578125, 0.2734375, 0.2890625, 0.3046875, 0.3203125, 0.3359375,
+   0.3515625, 0.3671875, 0.3828125, 0.3984375, 0.4140625, 0.4296875,
+   0.4453125, 0.4609375, 0.4765625, 0.4921875]...,)
+
+const F8offset2 = (Float32[
+    0.515625, 0.546875, 0.578125, 0.609375, 0.640625, 0.671875,
+    0.703125, 0.734375, 0.765625, 0.796875, 0.828125, 0.859375,
+    0.890625, 0.921875, 0.953125, 0.984375]...,);
+
+const F8offset3 = (Float32[
+    1.03125, 1.09375, 1.15625, 1.21875, 1.28125, 1.34375, 1.40625,
+    1.46875, 1.53125, 1.59375, 1.65625, 1.71875, 1.78125, 1.84375,
+    1.90625, 1.96875]...,);
+
+const F8offset4 = (Float32[
+    2.0625, 2.1875, 2.3125, 2.4375, 2.5625, 2.6875, 2.8125,
+    2.9375, 3.0625, 3.1875, 3.3125, 3.4375, 3.5625, 3.6875,
+    3.8125, 3.9375]...,);
+
+const F8offset5 = (Float32[
+    4.125, 4.375, 4.625, 4.875, 5.125, 5.375, 5.625, 5.875, 6.125,
+    6.375,  6.625, 6.875, 7.125, 7.375, 7.625, 7.875]...,);
+
+const F8offset6 = (Float32[
+    8.25, 8.75, 9.25, 9.75, 10.25, 10.75, 11.25, 11.75, 12.25, 12.75,
+    13.25, 13.75, 14.25, 14.75, 15.25, 15.75]...,);
+#=
+     There is one table used with subnormal values.
+      It is derived from the actual values of each
+      subnormal quantity, shifted up halfway to the
+      next subnormal.  This lets scanning also round.
+      An initial (anchor) value is prepended, that value
+      is half of the smallest subnormal.
+
+     A corresponding table of UInt8 values also is used.
+=#
+
+const F8offset_subnormal = (
+    0.0078125f0, 0.0234375f0, 0.0390625f0, 0.0546875f0, 0.0703125f0,
+    0.0859375f0, 0.1015625f0, 0.1171875f0, 0.1328125f0, 0.1484375f0,
+    0.1640625f0, 0.1796825f0, 0.1953125f0, 0.2109375f0, 0.2265625f0,
+    0.2421875f0)
+
+const U8subnormal = (collect(UInt8.(0:15))...,)
+
+# some named constants to clarify the source text
+
+const roundsto_floatmax8 = 15.25f0
+const roundsto_zero8 = 0.0078125f0
+const roundsto_subnormal = 0.2421875f0
+const floatmaxplus8 = 15.75f0 # floatmax(Float8) + floatmin(Float8)
+
+const UNaN8 = 0x78
+const UInf8 = 0x70
+const UFloatmax8 = 0x6f
+const UFloatmin8 = 0x10
+const UZero8 = 0x00
+
+#  the functions
+# ----------------
+
+function Float8(x::Float32)
+    # s, absx = signbit(x), abs(x)
+    ui = toUInt8(x)
+    return reinterpret(Float8, ui)
+end
+
+function toUInt8(x::Float32)
+    s, absx = signbit(x), abs(x)
+    isnan(absx) && return s ? UNaN8|0x80 : UNaN8
+    if absx >= roundsto_floatmax8
+        if absx > floatmaxplus8
+            return s ? UInf8|0x80 : UInf8
+        else
+            return s ? UFloatmax8|0x80 : UFloatmax8
+        end
+    end
+    if absx < roundsto_zero8
+        return s ? UZero8|0x80 : UZero8
+    elseif absx < roundsto_subnormal
+        return subnormal8(s, absx)
+    end
+    absx = min(15.5f0, max(0.25f0, absx))
+    return normal8(s, absx)
+end
+
+@inline function subnormal8(s::Core.Bool, absx::Float32)
+    idx = findfirst(a->absx <= a, F8offset_subnormal)
+    return s ? U8subnormal[idx] | 0x80 : U8subnormal[idx]
+end
+
+function normal8(s::Core.Bool, absx::Float32)
+   if absx <= 1.96875f0
+     if absx <= 0.4921875f0
+          idx = UInt8(15+firstof16lte(absx, F8offset1))
+          return s ? idx|0x80 : idx
+     elseif absx <= 0.984375f0
+          idx = UInt8(15+16+firstof16lte(absx, F8offset2))
+          return s ? idx|0x80 : idx
+     else
+         idx = UInt8(15+32+firstof16lte(absx, F8offset3))
+         return s ? idx|0x80 : idx
+     end
+   else
+     if absx <= 3.9375f0
+         idx = UInt8(15+32+16+firstof16lte(absx, F8offset4))
+         return s ? idx|0x80 : idx
+     elseif absx <= 7.875f0
+         idx = UInt8(15+64+firstof16lte(absx, F8offset5))
+         return s ? idx|0x80 : idx
+     else
+         idx = UInt8(15+64+16+firstof16lte(absx, F8offset6))
+         return s ? idx|0x80 : idx
+     end
+   end
+end
+
+function firstof16lte(needle, haystack)
+    for idx = 1:16
+        if needle <= haystack[idx]
+            return idx
+        end
+    end
+    error("should not be reached")
+end
+
+# """Old version, slower."""
+# function normal8(s::Bool, absx::Float32)
+#    if absx <= 0.4921875f0
+#         idx = UInt8(15+findfirst(a->absx <= a, F8offset1))
+#         return s ? idx|0x80 : idx
+#    elseif absx <= 0.984375f0
+#         idx = UInt8(15+16+findfirst(a->absx <= a, F8offset2))
+#         return s ? idx|0x80 : idx
+#    elseif absx <= 1.96875f0
+#        idx = UInt8(15+32+findfirst(a->absx <= a, F8offset3))
+#        return s ? idx|0x80 : idx
+#    elseif absx <= 3.9375f0
+#        idx = UInt8(15+32+16+findfirst(a->absx <= a, F8offset4))
+#        return s ? idx|0x80 : idx
+#    elseif absx <= 7.875f0
+#        idx = UInt8(15+64+findfirst(a->absx <= a, F8offset5))
+#        return s ? idx|0x80 : idx
+#    elseif absx <= 15.75f0
+#        idx = UInt8(15+64+16+findfirst(a->absx <= a, F8offset6))
+#        return s ? idx|0x80 : idx
+#    else
+#        throw(DomainError(absx,"not normal for Float8"))
+#    end
+# end
diff --git a/src/float32_to_float8_old.jl b/src/float32_to_float8_old.jl
@@ -0,0 +1,133 @@
+# Float32 -> Float8 algorithm in analogy to
+#
+# Float32 -> Float16 algorithm from:
+#   "Fast Half Float Conversion" by Jeroen van der Zijp
+#   ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
+#
+# With adjustments for round-to-nearest, ties to even.
+
+function create_base_shifttable(::Type{T}) where {T<:AbstractFloat8}
+
+    basetable = Vector{UInt8}(undef, 512)
+    shifttable = Vector{UInt8}(undef, 512)
+
+    if T == Float8
+        # elements derive from
+        # [1]   2^-6 = Float8(0x01) the smallest representable number (subnormal)
+        # [2]   2^-2 = Float8(0x10) the first non-subnormal number
+        # [3]   2^4 = 16 > floatmax(Float8) is the smallest power of two that is larger than floatmax(Float8)
+
+        e_limits = [-6,-2,4]
+
+        # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1"
+        # to the first significand bit
+        # e_shift_subnorm is 17 for Float8
+        e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8)-1)+e_limits[2]-1
+    elseif T == Float8_4
+
+        # see above
+        e_limits = [-9,-6,8]
+
+        # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1"
+        # to the first significand bit
+        # e_shift_subnorm is 14 for Float8_4
+        e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8_4)-1)+e_limits[2]-1
+    end
+
+    for i = 0:255                               # all possible exponents for Float32
+        e = i - 127                             # subtract Float32 bias
+        if e < e_limits[1]                      # Very small numbers map to +- zero
+            basetable[i|0x000+1] = zero(T)
+            basetable[i|0x100+1] = -zero(T)
+            shifttable[i|0x000+1] = n_significant_bits(T)+1
+            shifttable[i|0x100+1] = n_significant_bits(T)+1
+        elseif e < e_limits[2]                  # Small numbers map to denorms
+            basetable[i|0x000+1] = zero(T)
+            basetable[i|0x100+1] = -zero(T)
+            shifttable[i|0x000+1] = -e+e_shift_subnorm
+            shifttable[i|0x100+1] = -e+e_shift_subnorm
+        elseif e < e_limits[3]                  # Normal numbers just lose precision
+            basetable[i|0x000+1] = ((e+bias(T)) << n_significant_bits(T))
+            basetable[i|0x100+1] = ((e+bias(T)) << n_significant_bits(T)) | sign_mask(T)
+            shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T)
+            shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T)
+        elseif e < 128                          # Large numbers map to Infinity
+            basetable[i|0x000+1] = inf8(T)
+            basetable[i|0x100+1] = -inf8(T)
+            shifttable[i|0x000+1] = n_significant_bits(T)+1
+            shifttable[i|0x100+1] = n_significant_bits(T)+1
+        else                                    # Infinity and NaN's stay Infinity and NaN's
+            basetable[i|0x000+1] = inf8(T)
+            basetable[i|0x100+1] = -inf8(T)
+            shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T)
+            shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T)
+        end
+    end
+
+    return basetable, shifttable
+end
+
+const basetable8, shifttable8 = create_base_shifttable(Float8)
+const basetable8_4, shifttable8_4 = create_base_shifttable(Float8_4)
+
+# function Float8(val::Float32)
+#
+#     f = reinterpret(UInt32, val)
+#
+#     if isnan(val)       #TODO retain the significant bits for NaN?
+#         return nan8(Float8)
+#     end
+#
+#     # exponent as Int64
+#     i = f >> n_significant_bits(Float32) + 1
+#     @inbounds sh = shifttable8[i]
+#     f &= significand_mask(Float32)
+#
+#     # If `val` is subnormal, the tables are set up to force the
+#     # result to 0, so the significand has an implicit `1` in the
+#     # cases we care about.
+#
+#     f |= significand_mask(Float32) + 0x1
+#     @inbounds h = (basetable8[i] + (f >> sh) & significand_mask(Float8)) % UInt8
+#
+#     # rounding
+#     nextbit = (f >> (sh-1)) & 1
+#     if nextbit != 0 && (h & exponent_mask(Float8)) != exponent_mask(Float8)
+#         # Round halfway to even or check lower bits
+#         if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0
+#             h += one(UInt8)
+#         end
+#     end
+#     return reinterpret(Float8, h)
+# end
+
+function Float8_4(val::Float32)
+
+    f = reinterpret(UInt32, val)
+
+    if isnan(val)       #TODO retain the significant bits for NaN?
+        return nan8(Float8_4)
+    end
+
+    # exponent as Int64
+    i = f >> n_significant_bits(Float32) + 1
+    @inbounds sh = shifttable8_4[i]
+    f &= significand_mask(Float32)
+
+    # If `val` is subnormal, the tables are set up to force the
+    # result to 0, so the significand has an implicit `1` in the
+    # cases we care about.
+
+    f |= significand_mask(Float32) + 0x1
+    @inbounds h = (basetable8_4[i] + (f >> sh) & significand_mask(Float8_4)) % UInt8
+
+    # rounding
+    nextbit = (f >> (sh-1)) & 1
+    if nextbit != 0 && (h & exponent_mask(Float8_4)) != exponent_mask(Float8_4)
+        # Round halfway to even or check lower bits
+        if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0
+            h += one(UInt8)
+        end
+    end
+    return reinterpret(Float8_4, h)
+end