diff --git a/src/Float8s.jl b/src/Float8s.jl index 4f151cf..df1a968 100644 --- a/src/Float8s.jl +++ b/src/Float8s.jl @@ -7,7 +7,7 @@ module Float8s UInt8,Int8,Int16,Int32,Int64, (+), (-), (*), (/), (\), (^), sin,cos,tan,asin,acos,atan,sinh,cosh,tanh,asinh,acosh, - atanh,exp,exp2,exp10,log,log2,log10,sqrt,lgamma,log1p + atanh,exp,exp2,exp10,log,log2,log10,sqrt,log1p, atan,hypot export Float8, Float8_4, NaN8, Inf8, NaN8_4, Inf8_4 diff --git a/src/float8.jl b/src/float8.jl index ff55e41..2d045a0 100644 --- a/src/float8.jl +++ b/src/float8.jl @@ -91,34 +91,51 @@ function create_base_shifttable(::Type{T}) where {T<:AbstractFloat8} shifttable = Vector{UInt8}(undef, 512) if T == Float8 + # elements derive from + # [1] 2^-6 = Float8(0x01) the smallest representable number (subnormal) + # [2] 2^-2 = Float8(0x10) the first non-subnormal number + # [3] 2^4 = 16 > floatmax(Float8) is the smallest power of two that is larger than floatmax(Float8) + e_limits = [-6,-2,4] + + # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1" + # to the first significand bit + # e_shift_subnorm is 17 for Float8 + e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8)-1)+e_limits[2]-1 elseif T == Float8_4 - e_limits = [] + + # see above + e_limits = [-9,-6,8] + + # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1" + # to the first significand bit + # e_shift_subnorm is 14 for Float8_4 + e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8_4)-1)+e_limits[2]-1 end - for i = 0:255 # all possible exponents for Float32 - e = i - 127 # subtract Float32 bias - if e < -6 # Very small numbers map to +- zero + for i = 0:255 # all possible exponents for Float32 + e = i - 127 # subtract Float32 bias + if e < e_limits[1] # Very small numbers map to +- zero basetable[i|0x000+1] = zero(T) basetable[i|0x100+1] = -zero(T) shifttable[i|0x000+1] = n_significant_bits(T)+1 shifttable[i|0x100+1] = n_significant_bits(T)+1 - 
elseif e < -2 # Small numbers map to denorms + elseif e < e_limits[2] # Small numbers map to subnormals basetable[i|0x000+1] = zero(T) basetable[i|0x100+1] = -zero(T) - shifttable[i|0x000+1] = -e+17 - shifttable[i|0x100+1] = -e+17 - elseif e < 4 # Normal numbers just lose precision + shifttable[i|0x000+1] = -e+e_shift_subnorm + shifttable[i|0x100+1] = -e+e_shift_subnorm + elseif e < e_limits[3] # Normal numbers just lose precision basetable[i|0x000+1] = ((e+bias(T)) << n_significant_bits(T)) basetable[i|0x100+1] = ((e+bias(T)) << n_significant_bits(T)) | sign_mask(T) shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T) shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T) - elseif e < 128 # Large numbers map to Infinity + elseif e < 128 # Large numbers map to Infinity basetable[i|0x000+1] = inf8(T) basetable[i|0x100+1] = -inf8(T) shifttable[i|0x000+1] = n_significant_bits(T)+1 shifttable[i|0x100+1] = n_significant_bits(T)+1 - else # Infinity and NaN's stay Infinity and NaN's + else # Infinity and NaNs stay Infinity and NaNs basetable[i|0x000+1] = inf8(T) basetable[i|0x100+1] = -inf8(T) shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T) diff --git a/test/runtests.jl b/test/runtests.jl index bf3f849..ba3776d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,7 +10,7 @@ using Test end end -@testset "Conversion Float8 <-> Float32" begin +@testset "Conversion Float8_4 <-> Float32" begin for i in 0x00:0xff if ~isnan(Float8_4(i))