From 34ca21e01ac549b6848522b4f3f6bb3b5b585674 Mon Sep 17 00:00:00 2001
From: milankl <milankloewer@gmx.de>
Date: Mon, 16 Mar 2020 21:17:32 +0000
Subject: [PATCH] Jeffrey's Float8(::Float32)

---
 src/Float8s.jl               |   3 +
 src/float32_to_float8.jl     | 168 +++++++++++++++++++++++++
 src/float32_to_float8_old.jl | 133 ++++++++++++++++++++
 src/float8.jl                | 236 -----------------------------------
 src/float8_to_float32.jl     | 102 +++++++++++++++
 test/runtests.jl             |  52 ++++----
 6 files changed, 432 insertions(+), 262 deletions(-)
 create mode 100644 src/float32_to_float8.jl
 create mode 100644 src/float32_to_float8_old.jl
 create mode 100644 src/float8_to_float32.jl

diff --git a/src/Float8s.jl b/src/Float8s.jl
index de74d7a..3c11a15 100644
--- a/src/Float8s.jl
+++ b/src/Float8s.jl
@@ -14,5 +14,8 @@ module Float8s
     export Float8, Float8_4, NaN8, Inf8, NaN8_4, Inf8_4
 
     include("float8.jl")
+    include("float8_to_float32.jl")
+    include("float32_to_float8.jl")
+    include("float32_to_float8_old.jl")
 
 end
diff --git a/src/float32_to_float8.jl b/src/float32_to_float8.jl
new file mode 100644
index 0000000..a39bce1
--- /dev/null
+++ b/src/float32_to_float8.jl
@@ -0,0 +1,168 @@
+# written by Jeffrey Sarnoff, Feb 2020.
+# the constants
+# ---------------
+
+#=
+   One table is split into subsections of 16 entries each.
+   This keeps the scan time minimal, even after accounting
+   for the conditionals that select the proper subsection.
+
+   These are Float8 values offset by 1/2 way to the next value,
+   this lets `findfirst` return an appropriately rounded value.
+
+   The `F8offsetN` tuples are used with normal values,
+   values that are finite and are not subnormal. Values
+   that round to zero are handled before these are used.
+=#
+
+const F8offset1 = (Float32[
+   0.2578125, 0.2734375, 0.2890625, 0.3046875, 0.3203125, 0.3359375,
+   0.3515625, 0.3671875, 0.3828125, 0.3984375, 0.4140625, 0.4296875,
+   0.4453125, 0.4609375, 0.4765625, 0.4921875]...,)
+
+const F8offset2 = (Float32[
+    0.515625, 0.546875, 0.578125, 0.609375, 0.640625, 0.671875,
+    0.703125, 0.734375, 0.765625, 0.796875, 0.828125, 0.859375,
+    0.890625, 0.921875, 0.953125, 0.984375]...,);
+
+const F8offset3 = (Float32[
+    1.03125, 1.09375, 1.15625, 1.21875, 1.28125, 1.34375, 1.40625,
+    1.46875, 1.53125, 1.59375, 1.65625, 1.71875, 1.78125, 1.84375,
+    1.90625, 1.96875]...,);
+
+const F8offset4 = (Float32[
+    2.0625, 2.1875, 2.3125, 2.4375, 2.5625, 2.6875, 2.8125,
+    2.9375, 3.0625, 3.1875, 3.3125, 3.4375, 3.5625, 3.6875,
+    3.8125, 3.9375]...,);
+
+const F8offset5 = (Float32[
+    4.125, 4.375, 4.625, 4.875, 5.125, 5.375, 5.625, 5.875, 6.125,
+    6.375,  6.625, 6.875, 7.125, 7.375, 7.625, 7.875]...,);
+
+const F8offset6 = (Float32[
+    8.25, 8.75, 9.25, 9.75, 10.25, 10.75, 11.25, 11.75, 12.25, 12.75,
+    13.25, 13.75, 14.25, 14.75, 15.25, 15.75]...,);
+#=
+     There is one table used with subnormal values.
+      It is derived from the actual values of each
+      subnormal quantity, shifted up halfway to the
+      next subnormal.  This lets scanning also round.
+      An initial (anchor) value is prepended, that value
+      is half of the smallest subnormal.
+
+     A corresponding table of UInt8 values also is used.
+=#
+
+const F8offset_subnormal = (
+    0.0078125f0, 0.0234375f0, 0.0390625f0, 0.0546875f0, 0.0703125f0,
+    0.0859375f0, 0.1015625f0, 0.1171875f0, 0.1328125f0, 0.1484375f0,
+    0.1640625f0, 0.1796825f0, 0.1953125f0, 0.2109375f0, 0.2265625f0,
+    0.2421875f0)
+
+const U8subnormal = (collect(UInt8.(0:15))...,)
+
+# some named constants to clarify the source text
+
+const roundsto_floatmax8 = 15.25f0
+const roundsto_zero8 = 0.0078125f0
+const roundsto_subnormal = 0.2421875f0
+const floatmaxplus8 = 15.75f0 # floatmax(Float8) + floatmin(Float8)
+
+const UNaN8 = 0x78
+const UInf8 = 0x70
+const UFloatmax8 = 0x6f
+const UFloatmin8 = 0x10
+const UZero8 = 0x00
+
+#  the functions
+# ----------------
+
+function Float8(x::Float32)
+    # s, absx = signbit(x), abs(x)
+    ui = toUInt8(x)
+    return reinterpret(Float8, ui)
+end
+
+function toUInt8(x::Float32)
+    s, absx = signbit(x), abs(x)
+    isnan(absx) && return s ? UNaN8|0x80 : UNaN8
+    if absx >= roundsto_floatmax8
+        if absx > floatmaxplus8
+            return s ? UInf8|0x80 : UInf8
+        else
+            return s ? UFloatmax8|0x80 : UFloatmax8
+        end
+    end
+    if absx < roundsto_zero8
+        return s ? UZero8|0x80 : UZero8
+    elseif absx < roundsto_subnormal
+        return subnormal8(s, absx)
+    end
+    absx = min(15.5f0, max(0.25f0, absx))
+    return normal8(s, absx)
+end
+
+@inline function subnormal8(s::Core.Bool, absx::Float32)
+    idx = findfirst(a->absx <= a, F8offset_subnormal)
+    return s ? U8subnormal[idx] | 0x80 : U8subnormal[idx]
+end
+
+function normal8(s::Core.Bool, absx::Float32)
+   if absx <= 1.96875f0
+     if absx <= 0.4921875f0
+          idx = UInt8(15+firstof16lte(absx, F8offset1))
+          return s ? idx|0x80 : idx
+     elseif absx <= 0.984375f0
+          idx = UInt8(15+16+firstof16lte(absx, F8offset2))
+          return s ? idx|0x80 : idx
+     else
+         idx = UInt8(15+32+firstof16lte(absx, F8offset3))
+         return s ? idx|0x80 : idx
+     end
+   else
+     if absx <= 3.9375f0
+         idx = UInt8(15+32+16+firstof16lte(absx, F8offset4))
+         return s ? idx|0x80 : idx
+     elseif absx <= 7.875f0
+         idx = UInt8(15+64+firstof16lte(absx, F8offset5))
+         return s ? idx|0x80 : idx
+     else
+         idx = UInt8(15+64+16+firstof16lte(absx, F8offset6))
+         return s ? idx|0x80 : idx
+     end
+   end
+end
+
+function firstof16lte(needle, haystack)
+    for idx = 1:16
+        if needle <= haystack[idx]
+            return idx
+        end
+    end
+    error("should not be reached")
+end
+
+# """Old version, slower."""
+# function normal8(s::Bool, absx::Float32)
+#    if absx <= 0.4921875f0
+#         idx = UInt8(15+findfirst(a->absx <= a, F8offset1))
+#         return s ? idx|0x80 : idx
+#    elseif absx <= 0.984375f0
+#         idx = UInt8(15+16+findfirst(a->absx <= a, F8offset2))
+#         return s ? idx|0x80 : idx
+#    elseif absx <= 1.96875f0
+#        idx = UInt8(15+32+findfirst(a->absx <= a, F8offset3))
+#        return s ? idx|0x80 : idx
+#    elseif absx <= 3.9375f0
+#        idx = UInt8(15+32+16+findfirst(a->absx <= a, F8offset4))
+#        return s ? idx|0x80 : idx
+#    elseif absx <= 7.875f0
+#        idx = UInt8(15+64+findfirst(a->absx <= a, F8offset5))
+#        return s ? idx|0x80 : idx
+#    elseif absx <= 15.75f0
+#        idx = UInt8(15+64+16+findfirst(a->absx <= a, F8offset6))
+#        return s ? idx|0x80 : idx
+#    else
+#        throw(DomainError(absx,"not normal for Float8"))
+#    end
+# end
diff --git a/src/float32_to_float8_old.jl b/src/float32_to_float8_old.jl
new file mode 100644
index 0000000..1d4ba4b
--- /dev/null
+++ b/src/float32_to_float8_old.jl
@@ -0,0 +1,133 @@
+# Float32 -> Float8 algorithm in analogy to
+#
+# Float32 -> Float16 algorithm from:
+#   "Fast Half Float Conversion" by Jeroen van der Zijp
+#   ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
+#
+# With adjustments for round-to-nearest, ties to even.
+
+function create_base_shifttable(::Type{T}) where {T<:AbstractFloat8}
+
+    basetable = Vector{UInt8}(undef, 512)
+    shifttable = Vector{UInt8}(undef, 512)
+
+    if T == Float8
+        # elements derive from
+        # [1]   2^-6 = Float8(0x01) the smallest representable number (subnormal)
+        # [2]   2^-2 = Float8(0x10) the first non-subnormal number
+        # [3]   2^4 = 16 > floatmax(Float8) is the smallest power of two that is larger than floatmax(Float8)
+
+        e_limits = [-6,-2,4]
+
+        # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1"
+        # to the first significand bit
+        # e_shift_subnorm is 17 for Float8
+        e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8)-1)+e_limits[2]-1
+    elseif T == Float8_4
+
+        # see above
+        e_limits = [-9,-6,8]
+
+        # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1"
+        # to the first significand bit
+        # e_shift_subnorm is 14 for Float8_4
+        e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8_4)-1)+e_limits[2]-1
+    end
+
+    for i = 0:255                               # all possible exponents for Float32
+        e = i - 127                             # subtract Float32 bias
+        if e < e_limits[1]                      # Very small numbers map to +- zero
+            basetable[i|0x000+1] = zero(T)
+            basetable[i|0x100+1] = -zero(T)
+            shifttable[i|0x000+1] = n_significant_bits(T)+1
+            shifttable[i|0x100+1] = n_significant_bits(T)+1
+        elseif e < e_limits[2]                  # Small numbers map to denorms
+            basetable[i|0x000+1] = zero(T)
+            basetable[i|0x100+1] = -zero(T)
+            shifttable[i|0x000+1] = -e+e_shift_subnorm
+            shifttable[i|0x100+1] = -e+e_shift_subnorm
+        elseif e < e_limits[3]                  # Normal numbers just lose precision
+            basetable[i|0x000+1] = ((e+bias(T)) << n_significant_bits(T))
+            basetable[i|0x100+1] = ((e+bias(T)) << n_significant_bits(T)) | sign_mask(T)
+            shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T)
+            shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T)
+        elseif e < 128                          # Large numbers map to Infinity
+            basetable[i|0x000+1] = inf8(T)
+            basetable[i|0x100+1] = -inf8(T)
+            shifttable[i|0x000+1] = n_significant_bits(T)+1
+            shifttable[i|0x100+1] = n_significant_bits(T)+1
+        else                                    # Infinity and NaN's stay Infinity and NaN's
+            basetable[i|0x000+1] = inf8(T)
+            basetable[i|0x100+1] = -inf8(T)
+            shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T)
+            shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T)
+        end
+    end
+
+    return basetable, shifttable
+end
+
+const basetable8, shifttable8 = create_base_shifttable(Float8)
+const basetable8_4, shifttable8_4 = create_base_shifttable(Float8_4)
+
+# function Float8(val::Float32)
+#
+#     f = reinterpret(UInt32, val)
+#
+#     if isnan(val)       #TODO retain the significant bits for NaN?
+#         return nan8(Float8)
+#     end
+#
+#     # exponent as Int64
+#     i = f >> n_significant_bits(Float32) + 1
+#     @inbounds sh = shifttable8[i]
+#     f &= significand_mask(Float32)
+#
+#     # If `val` is subnormal, the tables are set up to force the
+#     # result to 0, so the significand has an implicit `1` in the
+#     # cases we care about.
+#
+#     f |= significand_mask(Float32) + 0x1
+#     @inbounds h = (basetable8[i] + (f >> sh) & significand_mask(Float8)) % UInt8
+#
+#     # rounding
+#     nextbit = (f >> (sh-1)) & 1
+#     if nextbit != 0 && (h & exponent_mask(Float8)) != exponent_mask(Float8)
+#         # Round halfway to even or check lower bits
+#         if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0
+#             h += one(UInt8)
+#         end
+#     end
+#     return reinterpret(Float8, h)
+# end
+
+function Float8_4(val::Float32)
+
+    f = reinterpret(UInt32, val)
+
+    if isnan(val)       #TODO retain the significant bits for NaN?
+        return nan8(Float8_4)
+    end
+
+    # exponent as Int64
+    i = f >> n_significant_bits(Float32) + 1
+    @inbounds sh = shifttable8_4[i]
+    f &= significand_mask(Float32)
+
+    # If `val` is subnormal, the tables are set up to force the
+    # result to 0, so the significand has an implicit `1` in the
+    # cases we care about.
+
+    f |= significand_mask(Float32) + 0x1
+    @inbounds h = (basetable8_4[i] + (f >> sh) & significand_mask(Float8_4)) % UInt8
+
+    # rounding
+    nextbit = (f >> (sh-1)) & 1
+    if nextbit != 0 && (h & exponent_mask(Float8_4)) != exponent_mask(Float8_4)
+        # Round halfway to even or check lower bits
+        if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0
+            h += one(UInt8)
+        end
+    end
+    return reinterpret(Float8_4, h)
+end
diff --git a/src/float8.jl b/src/float8.jl
index 517dc7d..994787e 100644
--- a/src/float8.jl
+++ b/src/float8.jl
@@ -89,140 +89,6 @@ function sign(x::T) where {T<:AbstractFloat8}
     end
 end
 
-# Float32 -> Float8 algorithm in analogy to
-#
-# Float32 -> Float16 algorithm from:
-#   "Fast Half Float Conversion" by Jeroen van der Zijp
-#   ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
-#
-# With adjustments for round-to-nearest, ties to even.
-
-function create_base_shifttable(::Type{T}) where {T<:AbstractFloat8}
-
-    basetable = Vector{UInt8}(undef, 512)
-    shifttable = Vector{UInt8}(undef, 512)
-
-    if T == Float8
-        # elements derive from
-        # [1]   2^-6 = Float8(0x01) the smallest representable number (subnormal)
-        # [2]   2^-2 = Float8(0x10) the first non-subnormal number
-        # [3]   2^4 = 16 > floatmax(Float8) is the smallest power of two that is larger than floatmax(Float8)
-
-        e_limits = [-6,-2,4]
-
-        # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1"
-        # to the first significand bit
-        # e_shift_subnorm is 17 for Float8
-        e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8)-1)+e_limits[2]-1
-    elseif T == Float8_4
-
-        # see above
-        e_limits = [-9,-6,8]
-
-        # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1"
-        # to the first significand bit
-        # e_shift_subnorm is 14 for Float8_4
-        e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8_4)-1)+e_limits[2]-1
-    end
-
-    for i = 0:255                               # all possible exponents for Float32
-        e = i - 127                             # subtract Float32 bias
-        if e < e_limits[1]                      # Very small numbers map to +- zero
-            basetable[i|0x000+1] = zero(T)
-            basetable[i|0x100+1] = -zero(T)
-            shifttable[i|0x000+1] = n_significant_bits(T)+1
-            shifttable[i|0x100+1] = n_significant_bits(T)+1
-        elseif e < e_limits[2]                  # Small numbers map to denorms
-            basetable[i|0x000+1] = zero(T)
-            basetable[i|0x100+1] = -zero(T)
-            shifttable[i|0x000+1] = -e+e_shift_subnorm
-            shifttable[i|0x100+1] = -e+e_shift_subnorm
-        elseif e < e_limits[3]                  # Normal numbers just lose precision
-            basetable[i|0x000+1] = ((e+bias(T)) << n_significant_bits(T))
-            basetable[i|0x100+1] = ((e+bias(T)) << n_significant_bits(T)) | sign_mask(T)
-            shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T)
-            shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T)
-        elseif e < 128                          # Large numbers map to Infinity
-            basetable[i|0x000+1] = inf8(T)
-            basetable[i|0x100+1] = -inf8(T)
-            shifttable[i|0x000+1] = n_significant_bits(T)+1
-            shifttable[i|0x100+1] = n_significant_bits(T)+1
-        else                                    # Infinity and NaN's stay Infinity and NaN's
-            basetable[i|0x000+1] = inf8(T)
-            basetable[i|0x100+1] = -inf8(T)
-            shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T)
-            shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T)
-        end
-    end
-
-    return basetable, shifttable
-end
-
-const basetable8, shifttable8 = create_base_shifttable(Float8)
-const basetable8_4, shifttable8_4 = create_base_shifttable(Float8_4)
-
-function Float8(val::Float32)
-
-    f = reinterpret(UInt32, val)
-
-    if isnan(val)       #TODO retain the significant bits for NaN?
-        return nan8(Float8)
-    end
-
-    # exponent as Int64
-    i = f >> n_significant_bits(Float32) + 1
-    @inbounds sh = shifttable8[i]
-    f &= significand_mask(Float32)
-
-    # If `val` is subnormal, the tables are set up to force the
-    # result to 0, so the significand has an implicit `1` in the
-    # cases we care about.
-
-    f |= significand_mask(Float32) + 0x1
-    @inbounds h = (basetable8[i] + (f >> sh) & significand_mask(Float8)) % UInt8
-
-    # rounding
-    nextbit = (f >> (sh-1)) & 1
-    if nextbit != 0 && (h & exponent_mask(Float8)) != exponent_mask(Float8)
-        # Round halfway to even or check lower bits
-        if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0
-            h += one(UInt8)
-        end
-    end
-    return reinterpret(Float8, h)
-end
-
-function Float8_4(val::Float32)
-
-    f = reinterpret(UInt32, val)
-
-    if isnan(val)       #TODO retain the significant bits for NaN?
-        return nan8(Float8_4)
-    end
-
-    # exponent as Int64
-    i = f >> n_significant_bits(Float32) + 1
-    @inbounds sh = shifttable8_4[i]
-    f &= significand_mask(Float32)
-
-    # If `val` is subnormal, the tables are set up to force the
-    # result to 0, so the significand has an implicit `1` in the
-    # cases we care about.
-
-    f |= significand_mask(Float32) + 0x1
-    @inbounds h = (basetable8_4[i] + (f >> sh) & significand_mask(Float8_4)) % UInt8
-
-    # rounding
-    nextbit = (f >> (sh-1)) & 1
-    if nextbit != 0 && (h & exponent_mask(Float8_4)) != exponent_mask(Float8_4)
-        # Round halfway to even or check lower bits
-        if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0
-            h += one(UInt8)
-        end
-    end
-    return reinterpret(Float8_4, h)
-end
-
 first_sig_bit_mask(::Type{Float8}) = 0x00000008
 first_sig_bit_mask(::Type{Float8_4}) = 0x00000004
 
@@ -235,108 +101,6 @@ bias_difference(::Type{Float8_4}) = 0x00000078      # = 120, 127 for Float32 min
 exp_bits_all_one(::Type{Float8}) = 0x00000007
 exp_bits_all_one(::Type{Float8_4}) = 0x0000000f
 
-# function Float32(val::T) where {T<:AbstractFloat8}
-#
-#     ival = reinterpret(UInt8, val)
-#
-#     # seperate into sign, exponent, significand
-#     sign = UInt32((ival & sign_mask(T)) >> 7)
-#     exp  = UInt32((ival & exponent_mask(T)) >> n_significant_bits(T))
-#     sig = UInt32(ival & significand_mask(T))
-#
-#     if exp == zero(UInt32)
-#         if sig == zero(UInt32)          # +-0 case
-#             return reinterpret(Float32,sign << 31)
-#         else                            # subnormals
-#             n_bit = 1
-#             # first significand bit set to one, else zero, to check for size of subnormal
-#             bit = first_sig_bit_mask(T)
-#             while (bit & sig) == 0
-#                 n_bit = n_bit + 1
-#                 bit = bit >> 1
-#             end
-#             sign = sign << 31
-#
-#             # bias = 2^(n_exp-1) - 1, i.e. 127 for Float32, 15 for Float16, 3 for Float8, 7 for Float8_4
-#             # difference in bias + 1 has to be added, e.g. 127-3 = 124 = 0x0000007c
-#
-#             exp = ((bias_difference(T)+1 - n_bit) << 23) % UInt32
-#             sig = ((sig & (~bit)) << n_bit) << sig_bit_shift(T)
-#             ret = sign | exp | sig
-#             reinterpret(Float32,ret)
-#         end
-#     elseif exp == exp_bits_all_one(T)       # Inf/NaN case
-#         if sig == zero(UInt32)              # Infinity
-#             if sign == zero(UInt32)
-#                 return Inf32
-#             else
-#                 return -Inf32
-#             end
-#         else                # NaN, preserve sign and significand (first sig bit always 1)
-#             # NaN32 == reinterpret(Flaot32,0x7fc00000)
-#             ret = 0x7fc00000 | (sign<<31) | (sig<<sig_bit_shift(T))
-#             reinterpret(Float32,ret)
-#         end
-#     else
-#         sign = sign << 31
-#
-#         # bias = 2^(n_exp-1) - 1, i.e. 127 for Float32, 15 for Float16, 3 for Float8, 7 for Float8_4
-#         # difference in bias has to be added, e.g. 127-3 = 124 = 0x0000007c
-#
-#         exp  = (exp + bias_difference(T)) << 23
-#         sig  = sig << sig_bit_shift(T)
-#         ret = sign | exp | sig
-#         return reinterpret(Float32, ret)
-#     end
-# end
-
-# TABLE LOOKUPS
-const Float8toFloat32 = Float32[0.0, 0.015625, 0.03125, 0.046875, 0.0625, 0.078125, 0.09375,
-    0.109375, 0.125, 0.140625, 0.15625, 0.171875, 0.1875, 0.203125, 0.21875, 0.234375, 0.25,
-    0.265625, 0.28125, 0.296875, 0.3125, 0.328125, 0.34375, 0.359375, 0.375, 0.390625, 0.40625,
-    0.421875, 0.4375, 0.453125, 0.46875, 0.484375, 0.5, 0.53125, 0.5625, 0.59375, 0.625, 0.65625,
-    0.6875, 0.71875, 0.75, 0.78125, 0.8125, 0.84375, 0.875, 0.90625, 0.9375, 0.96875, 1.0, 1.0625,
-    1.125, 1.1875, 1.25, 1.3125, 1.375, 1.4375, 1.5, 1.5625, 1.625, 1.6875, 1.75, 1.8125, 1.875, 1.9375,
-    2.0, 2.125, 2.25, 2.375, 2.5, 2.625, 2.75, 2.875, 3.0, 3.125, 3.25, 3.375, 3.5, 3.625, 3.75, 3.875,
-    4.0, 4.25, 4.5, 4.75, 5.0, 5.25, 5.5, 5.75, 6.0, 6.25, 6.5, 6.75, 7.0, 7.25, 7.5, 7.75, 8.0, 8.5,
-    9.0, 9.5, 10.0, 10.5, 11.0, 11.5, 12.0, 12.5, 13.0, 13.5, 14.0, 14.5, 15.0, 15.5, Inf, NaN, NaN,
-    NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, -0.0, -0.015625, -0.03125, -0.046875,
-    -0.0625, -0.078125, -0.09375, -0.109375, -0.125, -0.140625, -0.15625, -0.171875, -0.1875, -0.203125,
-    -0.21875, -0.234375, -0.25, -0.265625, -0.28125, -0.296875, -0.3125, -0.328125, -0.34375, -0.359375,
-    -0.375, -0.390625, -0.40625, -0.421875, -0.4375, -0.453125, -0.46875, -0.484375, -0.5, -0.53125,
-    -0.5625, -0.59375, -0.625, -0.65625, -0.6875, -0.71875, -0.75, -0.78125, -0.8125, -0.84375, -0.875,
-    -0.90625, -0.9375, -0.96875, -1.0, -1.0625, -1.125, -1.1875, -1.25, -1.3125, -1.375, -1.4375, -1.5,
-    -1.5625, -1.625, -1.6875, -1.75, -1.8125, -1.875, -1.9375, -2.0, -2.125, -2.25, -2.375, -2.5, -2.625,
-    -2.75, -2.875, -3.0, -3.125, -3.25, -3.375, -3.5, -3.625, -3.75, -3.875, -4.0, -4.25, -4.5, -4.75, -5.0,
-    -5.25, -5.5, -5.75, -6.0, -6.25, -6.5, -6.75, -7.0, -7.25, -7.5, -7.75, -8.0, -8.5, -9.0, -9.5, -10.0,
-    -10.5, -11.0, -11.5, -12.0, -12.5, -13.0, -13.5, -14.0, -14.5, -15.0, -15.5, -Inf, -NaN, -NaN, -NaN,
-    -NaN, -NaN, -NaN, -NaN, -NaN, -NaN, -NaN, -NaN, -NaN, -NaN, -NaN, -NaN]
-
-const Float8_4toFloat32 = Float32[0.0, 0.001953125, 0.00390625, 0.005859375, 0.0078125, 0.009765625,
-    0.01171875, 0.013671875, 0.015625, 0.017578125, 0.01953125, 0.021484375, 0.0234375, 0.025390625,
-    0.02734375, 0.029296875, 0.03125, 0.03515625, 0.0390625, 0.04296875, 0.046875, 0.05078125, 0.0546875,
-    0.05859375, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125,
-    0.140625, 0.15625, 0.171875, 0.1875, 0.203125, 0.21875, 0.234375, 0.25, 0.28125, 0.3125, 0.34375,
-    0.375, 0.40625, 0.4375, 0.46875, 0.5, 0.5625, 0.625, 0.6875, 0.75, 0.8125, 0.875, 0.9375, 1.0,
-    1.125, 1.25, 1.375, 1.5, 1.625, 1.75, 1.875, 2.0, 2.25, 2.5, 2.75, 3.0, 3.25, 3.5, 3.75, 4.0,
-    4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 18.0, 20.0,
-    22.0, 24.0, 26.0, 28.0, 30.0, 32.0, 36.0, 40.0, 44.0, 48.0, 52.0, 56.0, 60.0, 64.0, 72.0, 80.0, 88.0,
-    96.0, 104.0, 112.0, 120.0, 128.0, 144.0, 160.0, 176.0, 192.0, 208.0, 224.0, 240.0, Inf, NaN, NaN, NaN,
-    NaN, NaN, NaN, NaN, -0.0, -0.001953125, -0.00390625, -0.005859375, -0.0078125, -0.009765625, -0.01171875,
-    -0.013671875, -0.015625, -0.017578125, -0.01953125, -0.021484375, -0.0234375, -0.025390625, -0.02734375,
-    -0.029296875, -0.03125, -0.03515625, -0.0390625, -0.04296875, -0.046875, -0.05078125, -0.0546875, -0.05859375,
-    -0.0625, -0.0703125, -0.078125, -0.0859375, -0.09375, -0.1015625, -0.109375, -0.1171875, -0.125, -0.140625,
-    -0.15625, -0.171875, -0.1875, -0.203125, -0.21875, -0.234375, -0.25, -0.28125, -0.3125, -0.34375, -0.375,
-    -0.40625, -0.4375, -0.46875, -0.5, -0.5625, -0.625, -0.6875, -0.75, -0.8125, -0.875, -0.9375, -1.0, -1.125,
-    -1.25, -1.375, -1.5, -1.625, -1.75, -1.875, -2.0, -2.25, -2.5, -2.75, -3.0, -3.25, -3.5, -3.75, -4.0, -4.5,
-    -5.0, -5.5, -6.0, -6.5, -7.0, -7.5, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0, -16.0, -18.0, -20.0,
-    -22.0, -24.0, -26.0, -28.0, -30.0, -32.0, -36.0, -40.0, -44.0, -48.0, -52.0, -56.0, -60.0, -64.0, -72.0, -80.0,
-    -88.0, -96.0, -104.0, -112.0, -120.0, -128.0, -144.0, -160.0, -176.0, -192.0, -208.0, -224.0, -240.0, -Inf, -NaN,
-    -NaN, -NaN, -NaN, -NaN, -NaN, -NaN]
-
-Base.Float32(x::Float8) = @inbounds Float8toFloat32[reinterpret(UInt8,x) + 0x01]
-Base.Float32(x::Float8_4) = @inbounds Float8_4toFloat32[reinterpret(UInt8,x) + 0x01]
-
 round(x::T, r::RoundingMode{:ToZero}) where {T<:AbstractFloat8} = T(round(Float32(x), r))
 round(x::T, r::RoundingMode{:Down}) where {T<:AbstractFloat8} = T(round(Float32(x), r))
 round(x::T, r::RoundingMode{:Up}) where {T<:AbstractFloat8} = T(round(Float32(x), r))
diff --git a/src/float8_to_float32.jl b/src/float8_to_float32.jl
new file mode 100644
index 0000000..dc0b996
--- /dev/null
+++ b/src/float8_to_float32.jl
@@ -0,0 +1,102 @@
+# TABLE LOOKUPS
+const Float8toFloat32 = Float32[0.0, 0.015625, 0.03125, 0.046875, 0.0625, 0.078125, 0.09375,
+    0.109375, 0.125, 0.140625, 0.15625, 0.171875, 0.1875, 0.203125, 0.21875, 0.234375, 0.25,
+    0.265625, 0.28125, 0.296875, 0.3125, 0.328125, 0.34375, 0.359375, 0.375, 0.390625, 0.40625,
+    0.421875, 0.4375, 0.453125, 0.46875, 0.484375, 0.5, 0.53125, 0.5625, 0.59375, 0.625, 0.65625,
+    0.6875, 0.71875, 0.75, 0.78125, 0.8125, 0.84375, 0.875, 0.90625, 0.9375, 0.96875, 1.0, 1.0625,
+    1.125, 1.1875, 1.25, 1.3125, 1.375, 1.4375, 1.5, 1.5625, 1.625, 1.6875, 1.75, 1.8125, 1.875, 1.9375,
+    2.0, 2.125, 2.25, 2.375, 2.5, 2.625, 2.75, 2.875, 3.0, 3.125, 3.25, 3.375, 3.5, 3.625, 3.75, 3.875,
+    4.0, 4.25, 4.5, 4.75, 5.0, 5.25, 5.5, 5.75, 6.0, 6.25, 6.5, 6.75, 7.0, 7.25, 7.5, 7.75, 8.0, 8.5,
+    9.0, 9.5, 10.0, 10.5, 11.0, 11.5, 12.0, 12.5, 13.0, 13.5, 14.0, 14.5, 15.0, 15.5, Inf, NaN, NaN,
+    NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, -0.0, -0.015625, -0.03125, -0.046875,
+    -0.0625, -0.078125, -0.09375, -0.109375, -0.125, -0.140625, -0.15625, -0.171875, -0.1875, -0.203125,
+    -0.21875, -0.234375, -0.25, -0.265625, -0.28125, -0.296875, -0.3125, -0.328125, -0.34375, -0.359375,
+    -0.375, -0.390625, -0.40625, -0.421875, -0.4375, -0.453125, -0.46875, -0.484375, -0.5, -0.53125,
+    -0.5625, -0.59375, -0.625, -0.65625, -0.6875, -0.71875, -0.75, -0.78125, -0.8125, -0.84375, -0.875,
+    -0.90625, -0.9375, -0.96875, -1.0, -1.0625, -1.125, -1.1875, -1.25, -1.3125, -1.375, -1.4375, -1.5,
+    -1.5625, -1.625, -1.6875, -1.75, -1.8125, -1.875, -1.9375, -2.0, -2.125, -2.25, -2.375, -2.5, -2.625,
+    -2.75, -2.875, -3.0, -3.125, -3.25, -3.375, -3.5, -3.625, -3.75, -3.875, -4.0, -4.25, -4.5, -4.75, -5.0,
+    -5.25, -5.5, -5.75, -6.0, -6.25, -6.5, -6.75, -7.0, -7.25, -7.5, -7.75, -8.0, -8.5, -9.0, -9.5, -10.0,
+    -10.5, -11.0, -11.5, -12.0, -12.5, -13.0, -13.5, -14.0, -14.5, -15.0, -15.5, -Inf, -NaN, -NaN, -NaN,
+    -NaN, -NaN, -NaN, -NaN, -NaN, -NaN, -NaN, -NaN, -NaN, -NaN, -NaN, -NaN]
+
+const Float8_4toFloat32 = Float32[0.0, 0.001953125, 0.00390625, 0.005859375, 0.0078125, 0.009765625,
+    0.01171875, 0.013671875, 0.015625, 0.017578125, 0.01953125, 0.021484375, 0.0234375, 0.025390625,
+    0.02734375, 0.029296875, 0.03125, 0.03515625, 0.0390625, 0.04296875, 0.046875, 0.05078125, 0.0546875,
+    0.05859375, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125,
+    0.140625, 0.15625, 0.171875, 0.1875, 0.203125, 0.21875, 0.234375, 0.25, 0.28125, 0.3125, 0.34375,
+    0.375, 0.40625, 0.4375, 0.46875, 0.5, 0.5625, 0.625, 0.6875, 0.75, 0.8125, 0.875, 0.9375, 1.0,
+    1.125, 1.25, 1.375, 1.5, 1.625, 1.75, 1.875, 2.0, 2.25, 2.5, 2.75, 3.0, 3.25, 3.5, 3.75, 4.0,
+    4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 18.0, 20.0,
+    22.0, 24.0, 26.0, 28.0, 30.0, 32.0, 36.0, 40.0, 44.0, 48.0, 52.0, 56.0, 60.0, 64.0, 72.0, 80.0, 88.0,
+    96.0, 104.0, 112.0, 120.0, 128.0, 144.0, 160.0, 176.0, 192.0, 208.0, 224.0, 240.0, Inf, NaN, NaN, NaN,
+    NaN, NaN, NaN, NaN, -0.0, -0.001953125, -0.00390625, -0.005859375, -0.0078125, -0.009765625, -0.01171875,
+    -0.013671875, -0.015625, -0.017578125, -0.01953125, -0.021484375, -0.0234375, -0.025390625, -0.02734375,
+    -0.029296875, -0.03125, -0.03515625, -0.0390625, -0.04296875, -0.046875, -0.05078125, -0.0546875, -0.05859375,
+    -0.0625, -0.0703125, -0.078125, -0.0859375, -0.09375, -0.1015625, -0.109375, -0.1171875, -0.125, -0.140625,
+    -0.15625, -0.171875, -0.1875, -0.203125, -0.21875, -0.234375, -0.25, -0.28125, -0.3125, -0.34375, -0.375,
+    -0.40625, -0.4375, -0.46875, -0.5, -0.5625, -0.625, -0.6875, -0.75, -0.8125, -0.875, -0.9375, -1.0, -1.125,
+    -1.25, -1.375, -1.5, -1.625, -1.75, -1.875, -2.0, -2.25, -2.5, -2.75, -3.0, -3.25, -3.5, -3.75, -4.0, -4.5,
+    -5.0, -5.5, -6.0, -6.5, -7.0, -7.5, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0, -16.0, -18.0, -20.0,
+    -22.0, -24.0, -26.0, -28.0, -30.0, -32.0, -36.0, -40.0, -44.0, -48.0, -52.0, -56.0, -60.0, -64.0, -72.0, -80.0,
+    -88.0, -96.0, -104.0, -112.0, -120.0, -128.0, -144.0, -160.0, -176.0, -192.0, -208.0, -224.0, -240.0, -Inf, -NaN,
+    -NaN, -NaN, -NaN, -NaN, -NaN, -NaN]
+
+Base.Float32(x::Float8) = @inbounds Float8toFloat32[reinterpret(UInt8,x) + 0x01]
+Base.Float32(x::Float8_4) = @inbounds Float8_4toFloat32[reinterpret(UInt8,x) + 0x01]
+
+# """Old, function-based version."""
+# function Float32(val::T) where {T<:AbstractFloat8}
+#
+#     ival = reinterpret(UInt8, val)
+#
+#     # seperate into sign, exponent, significand
+#     sign = UInt32((ival & sign_mask(T)) >> 7)
+#     exp  = UInt32((ival & exponent_mask(T)) >> n_significant_bits(T))
+#     sig = UInt32(ival & significand_mask(T))
+#
+#     if exp == zero(UInt32)
+#         if sig == zero(UInt32)          # +-0 case
+#             return reinterpret(Float32,sign << 31)
+#         else                            # subnormals
+#             n_bit = 1
+#             # first significand bit set to one, else zero, to check for size of subnormal
+#             bit = first_sig_bit_mask(T)
+#             while (bit & sig) == 0
+#                 n_bit = n_bit + 1
+#                 bit = bit >> 1
+#             end
+#             sign = sign << 31
+#
+#             # bias = 2^(n_exp-1) - 1, i.e. 127 for Float32, 15 for Float16, 3 for Float8, 7 for Float8_4
+#             # difference in bias + 1 has to be added, e.g. 127-3 = 124 = 0x0000007c
+#
+#             exp = ((bias_difference(T)+1 - n_bit) << 23) % UInt32
+#             sig = ((sig & (~bit)) << n_bit) << sig_bit_shift(T)
+#             ret = sign | exp | sig
+#             reinterpret(Float32,ret)
+#         end
+#     elseif exp == exp_bits_all_one(T)       # Inf/NaN case
+#         if sig == zero(UInt32)              # Infinity
+#             if sign == zero(UInt32)
+#                 return Inf32
+#             else
+#                 return -Inf32
+#             end
+#         else                # NaN, preserve sign and significand (first sig bit always 1)
+#             # NaN32 == reinterpret(Flaot32,0x7fc00000)
+#             ret = 0x7fc00000 | (sign<<31) | (sig<<sig_bit_shift(T))
+#             reinterpret(Float32,ret)
+#         end
+#     else
+#         sign = sign << 31
+#
+#         # bias = 2^(n_exp-1) - 1, i.e. 127 for Float32, 15 for Float16, 3 for Float8, 7 for Float8_4
+#         # difference in bias has to be added, e.g. 127-3 = 124 = 0x0000007c
+#
+#         exp  = (exp + bias_difference(T)) << 23
+#         sig  = sig << sig_bit_shift(T)
+#         ret = sign | exp | sig
+#         return reinterpret(Float32, ret)
+#     end
+# end
diff --git a/test/runtests.jl b/test/runtests.jl
index a4fccdc..6b3d46d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -47,44 +47,44 @@ end
     end
 end
 
-# CURRENTLY FAILING
-# @testset "Float32 -> Float8 subnormals ordered?" begin
-#
-#     N = 100
-#
-#     fs = Float32(floatmin(Float8))*rand(Float32,N)
-#     sort!(fs)
-#
-#     f8s = Float8.(fs)
-#     @testset for i in 1:N-1
-#         @test f8s[i+1] >= f8s[i]
-#     end
-#
-#     f8s = Float8.(-fs)
-#     @testset for i in 1:N-1
-#         @test f8s[i+1] <= f8s[i]
-#     end
-# end
+@testset "Float32 -> Float8 subnormals ordered?" begin
 
+    N = 100
 
-@testset "Conversion Float8 <-> Float32" begin
+    fs = Float32(floatmin(Float8))*rand(Float32,N)
+    sort!(fs)
 
-    @testset for i in 0x00:0xff
-        if ~isnan(Float8(i))
-            @test i == reinterpret(UInt8,Float8(Float32(Float8(i))))
-        end
+    f8s = Float8.(fs)
+    @testset for i in 1:N-1
+        @test f8s[i+1] >= f8s[i]
+    end
+
+    f8s = Float8.(-fs)
+    @testset for i in 1:N-1
+        @test f8s[i+1] <= f8s[i]
     end
 end
 
-@testset "Conversion Float8_4 <-> Float32" begin
+
+@testset "Conversion Float8 <-> Float32" begin
 
     @testset for i in 0x00:0xff
-        if ~isnan(Float8_4(i))
-            @test i == reinterpret(UInt8,Float8_4(Float32(Float8_4(i))))
+        if ~isnan(Float8(i))
+            @test i == reinterpret(UInt8,Float8(Float32(Float8(i))))
         end
     end
 end
 
+# Currently not implemented
+# @testset "Conversion Float8_4 <-> Float32" begin
+#
+#     @testset for i in 0x00:0xff
+#         if ~isnan(Float8_4(i))
+#             @test i == reinterpret(UInt8,Float8_4(Float32(Float8_4(i))))
+#         end
+#     end
+# end
+
 @testset "Negation" begin
 
     @testset for i in 0x00:0xff