diff --git a/src/fileio.jl b/src/fileio.jl index 75c0d09..f90b7a8 100644 --- a/src/fileio.jl +++ b/src/fileio.jl @@ -9,7 +9,7 @@ function loadfile(T, file::File) end function loadfile(T, file::TextFile) - replace(read(file.filename, String), "\r"=>"") # ignore CRLF/LF difference + _ignore_CR(read(file.filename, String)) end function loadfile(::Type{<:Number}, file::File{format"TXT"}) @@ -24,7 +24,7 @@ function savefile(file::TextFile, content) write(file.filename, string(content)) end -function query_extended(filename) +function query_extended(filename::AbstractString) file, ext = splitext(filename) # TODO: make this less hacky if uppercase(ext) == ".SHA256" @@ -38,20 +38,28 @@ function query_extended(filename) res end +# Some target formats are not supported by FileIO and thus require an encoding/compression process +# before saving. For other formats, we should trust IO backends and make as few changes as possible. +# Otherwise, the reference becomes unfaithful. The encoding process helps make the actual data match +# the reference data, which is loaded from the reference file via IO backends. +# +# TODO: split `maybe_encode` into `maybe_preprocess` and `maybe_encode` """ - _convert(T::Type{<:DataFormat}, x; kw...) -> out + maybe_encode(T::Type{<:DataFormat}, x; kw...) -> out -Convert `x` to a validate content for file data format `T`. +If needed, encode `x` into valid content that matches format `T`. + +If there is no known method to encode `x`, `x` is returned unchanged without warning. """ -_convert(::Type{<:DataFormat}, x; kw...) = x +maybe_encode(::Type{<:DataFormat}, x; kw...) = x # plain TXT -_convert(::Type{DataFormat{:TXT}}, x; kw...) = replace(string(x), "\r"=>"") # ignore CRLF/LF difference -_convert(::Type{DataFormat{:TXT}}, x::Number; kw...) = x -function _convert(::Type{DataFormat{:TXT}}, x::AbstractArray{<:AbstractString}; kw...) - return join(x, '\n') -end -function _convert( +maybe_encode(::Type{DataFormat{:TXT}}, x; kw...) 
= _ignore_CR(string(x)) +maybe_encode(::Type{DataFormat{:TXT}}, x::AbstractArray{<:AbstractString}; kw...) = _join(x) +maybe_encode(::Type{DataFormat{:TXT}}, x::AbstractString; kw...) = _ignore_CR(x) +maybe_encode(::Type{DataFormat{:TXT}}, x::Number; kw...) = x # TODO: Change this to string(x) ? + +function maybe_encode( ::Type{DataFormat{:TXT}}, img::AbstractArray{<:Colorant}; size = (20,40), kw...) @@ -65,11 +73,25 @@ function _convert( end # SHA256 -_convert(::Type{DataFormat{:SHA256}}, x; kw...) = bytes2hex(sha256(string(x))) -function _convert(::Type{DataFormat{:SHA256}}, img::AbstractArray{<:Colorant}; kw...) +maybe_encode(::Type{DataFormat{:SHA256}}, x; kw...) = _sha256(string(x)) +maybe_encode(::Type{DataFormat{:SHA256}}, x::AbstractString; kw...) = _sha256(_ignore_CR(x)) +maybe_encode(::Type{DataFormat{:SHA256}}, x::AbstractArray{<:AbstractString}; kw...) = _sha256(_join(x)) +function maybe_encode(::Type{DataFormat{:SHA256}}, img::AbstractArray{<:Colorant}; kw...) # encode image into SHA256 - size_str = bytes2hex(sha256(reinterpret(UInt8,[map(Int64,size(img))...]))) - img_str = bytes2hex(sha256(reinterpret(UInt8,vec(rawview(channelview(img)))))) + size_str = _sha256(reinterpret(UInt8,[map(Int64,size(img))...])) + img_str = _sha256(reinterpret(UInt8,vec(rawview(channelview(img))))) return size_str * img_str end + +# Helpers +_join(x::AbstractArray{<:AbstractString}) = _ignore_CR(join(x, "\n")) +_sha256(x) = bytes2hex(sha256(x)) +""" + _ignore_CR(x::AbstractString) + +Ignore the CRLF(`\\r\\n`) and LF(`\\n`) difference by replacing every `\\r\\n` in the given string with `\\n`. A `\\r` that is not followed by `\\n` is kept. + +CRLF format is widely used by Windows while LF format is mainly used by Linux. 
+""" +_ignore_CR(x::AbstractString) = replace(x, "\r\n"=>"\n") # issue #39 diff --git a/src/test_reference.jl b/src/test_reference.jl index 211f510..f70873c 100644 --- a/src/test_reference.jl +++ b/src/test_reference.jl @@ -107,7 +107,7 @@ function test_reference( rendermode = default_rendermode(F, raw_actual) end - actual = _convert(F, raw_actual; kw...) + actual = maybe_encode(F, raw_actual; kw...) # preprocessing when reference file doesn't exists if !isfile(path) @info("Reference file for \"$filename\" does not exist. It will be created") diff --git a/test/fileio.jl b/test/fileio.jl index e69de29..be68526 100644 --- a/test/fileio.jl +++ b/test/fileio.jl @@ -0,0 +1,165 @@ +refdir = joinpath(refroot, "fileio") + +@testset "query" begin + check_types = [ + # text types + ("textfile_with_no_extension", format"TXT"), + ("textfile.txt", format"TXT"), + ("textfile.unknown", format"TXT"), + ("textfile.sha256", format"SHA256"), + + # image types + ("imagefile.jpg", format"JPEG"), + ("imagefile.jpeg", format"JPEG"), + ("imagefile.png", format"PNG"), + ("imagefile.tif", format"TIFF"), + ("imagefile.tiff", format"TIFF"), + + # dataframe types + ("dataframe_file.csv", format"CSV") + ] + for (file, fmt) in check_types + @test ReferenceTests.query_extended(file) == File{fmt}(file) + @test ReferenceTests.query_extended(abspath(file)) == File{fmt}(abspath(file)) + end +end + +@testset "maybe_encode" begin + @testset "string" begin + str1 = "Hello world" + str1_sha256 = "64ec88ca00b268e5ba1a35678a1b5316d212f4f366b2477232534a8aeca37f3c" + str2 = "Hello\n world" + str2_sha256 = "60b65ab310480818c4289227f2ec68f1714743db8571b4cb190e100c0085be3d" # bytes2hex(SHA.sha256(str2)) + str2_crlf = "Hello\r\n world" + str3 = "Hello\nworld" + str3_sha256 = "46e0ea795802f17d0b340983ca7d7068c94d7d9172ee4daea37a1ab1168649ec" # bytes2hex(SHA.sha256(str3)) + str3_arr1 = ["Hello", "world"] + str3_arr2 = ["Hello" "world"] + str4 = "Hello\n world1\nHello\n world2" + str4_sha256 = 
"c7dc8b82c3a6fed4afa0c8790a0586b73df0e4f35524efe6810e5d78b6b6a611" # bytes2hex(SHA.sha256(str4)) + str4_arr = ["Hello\r\n world1", "Hello\n world2"] + + # string as plain text + fmt = format"TXT" + # encoding should preserve whitespace + @test str1 == ReferenceTests.maybe_encode(fmt, str1) + @test str2 == ReferenceTests.maybe_encode(fmt, str2) + # but ignore CRLF/LF differences + @test str2 == ReferenceTests.maybe_encode(fmt, str2_crlf) + # string arrays are treated as multi-line strings, even for UNKNOWN format + @test str3 == ReferenceTests.maybe_encode(fmt, str3) + @test str3 == ReferenceTests.maybe_encode(fmt, str3_arr1) + @test str3 == ReferenceTests.maybe_encode(fmt, str3_arr2) + # string arrays should ignore CRLF/LF differences, too + @test str4 == ReferenceTests.maybe_encode(fmt, str4_arr) + + # string as SHA256 should also ignore CRLF/LF differences + fmt = format"SHA256" + @test str1_sha256 == ReferenceTests.maybe_encode(fmt, str1) + @test str2_sha256 == ReferenceTests.maybe_encode(fmt, str2) + # but ignore CRLF/LF differences + @test str2_sha256 == ReferenceTests.maybe_encode(fmt, str2_crlf) + # string arrays are treated as multi-line strings, even for UNKNOWN format + @test str3_sha256 == ReferenceTests.maybe_encode(fmt, str3) + @test str3_sha256 == ReferenceTests.maybe_encode(fmt, str3_arr1) + @test str3_sha256 == ReferenceTests.maybe_encode(fmt, str3_arr2) + # string arrays should ignore CRLF/LF differences, too + @test str4_sha256 == ReferenceTests.maybe_encode(fmt, str4_arr) + + # formats without a string encoder (e.g. PNG) return the input unchanged + fmt = format"PNG" + for str in (str1, str2, str2_crlf, str3, str3_arr1, str3_arr2) + @test str === ReferenceTests.maybe_encode(fmt, str) + end + end + + @testset "numbers" begin + for num in (0x01, 1, 1.0f0, 1.0) + for fmt in (format"TXT", format"UNKNOWN") + @test num === ReferenceTests.maybe_encode(fmt, num) + end + fmt = format"SHA256" + @test ReferenceTests.maybe_encode(fmt, num) == ReferenceTests.maybe_encode(fmt, string(num)) + end + + + for (fmt, a, 
ref) in [ + # if target is TXT, the array is encoded as its string representation + (format"TXT", [1, 2], "[1, 2]"), + (format"TXT", [1,2], "[1, 2]"), + (format"TXT", [1;2], "[1, 2]"), + (format"TXT", [1 2], "[1 2]"), + (format"TXT", [1 2; 3 4], "[1 2; 3 4]"), + # if target is UNKNOWN, the array is returned unchanged + (format"UNKNOWN", [1, 2], [1, 2]), + (format"UNKNOWN", [1,2], [1, 2]), + (format"UNKNOWN", [1;2], [1, 2]), + (format"UNKNOWN", [1 2], [1 2]), + (format"UNKNOWN", [1 2; 3 4], [1 2; 3 4]), + ] + @test ref == ReferenceTests.maybe_encode(fmt, a) + end + + for a in [[1, 2], [1 2], [1 2; 3 4]] + fmt = format"SHA256" + @test ReferenceTests.maybe_encode(fmt, a) == ReferenceTests.maybe_encode(fmt, string(a)) + end + + end + + @testset "image" begin + gray_1d = Gray{N0f8}.(0.0:0.1:0.9) + rgb_1d = RGB.(gray_1d) + gray_2d = Gray{N0f8}.(reshape(0.0:0.1:0.9, 2, 5)) + rgb_2d = RGB.(gray_2d) + gray_3d = Gray{N0f8}.(reshape(0.0:0.02:0.95, 2, 4, 6)) + rgb_3d = RGB.(gray_3d) + + # for common image formats (and UNKNOWN) the very same image object is returned + for img in (gray_1d, gray_2d, gray_3d, rgb_1d, rgb_2d, rgb_3d) + for fmt in (format"JPEG", format"PNG", format"TIFF", format"UNKNOWN") + @test img === ReferenceTests.maybe_encode(fmt, img) + end + end + + # image as text file + fmt = format"TXT" + # TODO: support n-D image encoding + # @test_reference joinpath(refdir, "gray_1d_as_txt.txt") ReferenceTests.maybe_encode(fmt, gray_1d) + # @test_reference joinpath(refdir, "rgb_1d_as_txt.txt") ReferenceTests.maybe_encode(fmt, rgb_1d) + @test_reference joinpath(refdir, "gray_2d_as_txt.txt") ReferenceTests.maybe_encode(fmt, gray_2d) + @test_reference joinpath(refdir, "rgb_2d_as_txt.txt") ReferenceTests.maybe_encode(fmt, rgb_2d) + # @test_reference joinpath(refdir, "gray_3d_as_txt.txt") ReferenceTests.maybe_encode(fmt, gray_3d) + # @test_reference joinpath(refdir, "rgb_3d_as_txt.txt") ReferenceTests.maybe_encode(fmt, rgb_3d) + + # image as SHA256 + fmt = format"SHA256" + for (file, img) in [ + ("gray_1d", gray_1d), + ("gray_2d", gray_2d), + ("gray_3d", gray_3d), 
("rgb_1d", rgb_1d), + ("rgb_2d", rgb_2d), + ("rgb_3d", rgb_3d) + ] + reffile = joinpath(refdir, "$(file)_as_sha256.txt") + @test_reference reffile ReferenceTests.maybe_encode(fmt, img) + end + end + + # dataframe: TXT compares against string(df); CSV and UNKNOWN return the same object + @testset "dataframe" begin + df = DataFrame(v1=[1,2,3], v2=["a","b","c"]) + + @test string(df) == ReferenceTests.maybe_encode(format"TXT", df) + for fmt in (format"CSV", format"UNKNOWN") + @test df === ReferenceTests.maybe_encode(fmt, df) + end + + fmt = format"SHA256" + @test_reference joinpath(refdir, "dataframe_as_sha256.txt") ReferenceTests.maybe_encode(fmt, df) + + end +end + +# TODO: savefile & loadfile diff --git a/test/references/fileio/dataframe_as_sha256.txt b/test/references/fileio/dataframe_as_sha256.txt new file mode 100644 index 0000000..37fe296 --- /dev/null +++ b/test/references/fileio/dataframe_as_sha256.txt @@ -0,0 +1 @@ +2cf7c4edcafc27a5eb1b74fb0af704edc0d9bbef91a1b55d3b7350fa4b54cd18 \ No newline at end of file diff --git a/test/references/fileio/gray_1d_as_sha256.txt b/test/references/fileio/gray_1d_as_sha256.txt new file mode 100644 index 0000000..dd28753 --- /dev/null +++ b/test/references/fileio/gray_1d_as_sha256.txt @@ -0,0 +1 @@ +a111f275cc2e7588000001d300a31e76336d15b9d314cd1a1d8f3d3556975eed10ef43c7fcace84c4d0d54b8e92c0c9be2d14a6bf3dd7647254a3cc0c4a04297 \ No newline at end of file diff --git a/test/references/fileio/gray_2d_as_sha256.txt b/test/references/fileio/gray_2d_as_sha256.txt new file mode 100644 index 0000000..465a648 --- /dev/null +++ b/test/references/fileio/gray_2d_as_sha256.txt @@ -0,0 +1 @@ +26cfbb315c316a0b15516434f90284e5011dcb58503fe39eb036bf669bd8233d10ef43c7fcace84c4d0d54b8e92c0c9be2d14a6bf3dd7647254a3cc0c4a04297 \ No newline at end of file diff --git a/test/references/fileio/gray_2d_as_txt.txt b/test/references/fileio/gray_2d_as_txt.txt new file mode 100644 index 0000000..908bd18 --- /dev/null +++ b/test/references/fileio/gray_2d_as_txt.txt @@ -0,0 +1 @@ +▀▀▀▀▀ \ No newline at end of file diff --git 
a/test/references/fileio/gray_3d_as_sha256.txt b/test/references/fileio/gray_3d_as_sha256.txt new file mode 100644 index 0000000..21b6828 --- /dev/null +++ b/test/references/fileio/gray_3d_as_sha256.txt @@ -0,0 +1 @@ +72307e420b5460c03a1c167060ed336407c26ea74aabf8fab76dd8e9dbe8cbe4baf0f53196e8d5270c0b0b2da82bbbb4676edbb0ebf84ec0dcbd8c0bf4d9af68 \ No newline at end of file diff --git a/test/references/fileio/rgb_1d_as_sha256.txt b/test/references/fileio/rgb_1d_as_sha256.txt new file mode 100644 index 0000000..9708f12 --- /dev/null +++ b/test/references/fileio/rgb_1d_as_sha256.txt @@ -0,0 +1 @@ +a111f275cc2e7588000001d300a31e76336d15b9d314cd1a1d8f3d3556975eedebd6b0ad29dd5402ce5745bb5b48d4c59b7f8da0cdf8d2f287befd9094f6ac89 \ No newline at end of file diff --git a/test/references/fileio/rgb_2d_as_sha256.txt b/test/references/fileio/rgb_2d_as_sha256.txt new file mode 100644 index 0000000..0b6d3ff --- /dev/null +++ b/test/references/fileio/rgb_2d_as_sha256.txt @@ -0,0 +1 @@ +26cfbb315c316a0b15516434f90284e5011dcb58503fe39eb036bf669bd8233debd6b0ad29dd5402ce5745bb5b48d4c59b7f8da0cdf8d2f287befd9094f6ac89 \ No newline at end of file diff --git a/test/references/fileio/rgb_2d_as_txt.txt b/test/references/fileio/rgb_2d_as_txt.txt new file mode 100644 index 0000000..908bd18 --- /dev/null +++ b/test/references/fileio/rgb_2d_as_txt.txt @@ -0,0 +1 @@ +▀▀▀▀▀ \ No newline at end of file diff --git a/test/references/fileio/rgb_3d_as_sha256.txt b/test/references/fileio/rgb_3d_as_sha256.txt new file mode 100644 index 0000000..8f59425 --- /dev/null +++ b/test/references/fileio/rgb_3d_as_sha256.txt @@ -0,0 +1 @@ +72307e420b5460c03a1c167060ed336407c26ea74aabf8fab76dd8e9dbe8cbe45465bcbf50acdbe5600207e3266eedef6548bc4d244e55d7a1af0f1af09e019f \ No newline at end of file