Skip to content

Commit

Permalink
rename _convert to maybe_encode (#64)
Browse files Browse the repository at this point in the history
* rename _convert to maybe_encode
* add tests for maybe_encode
* ignore CRLF/LF differences

Co-authored-by: Lyndon White <oxinabox@ucc.asn.au>
  • Loading branch information
johnnychen94 and oxinabox committed Jul 7, 2020
1 parent ab01585 commit afc31d4
Show file tree
Hide file tree
Showing 12 changed files with 212 additions and 16 deletions.
52 changes: 37 additions & 15 deletions src/fileio.jl
Expand Up @@ -9,7 +9,7 @@ function loadfile(T, file::File)
end

# Load the content of a text file as a `String`, with CRLF line endings normalized
# by `_ignore_CR`. `T` is not used by this method.
function loadfile(T, file::TextFile)
    # Read the file exactly once and return the normalized text.
    return _ignore_CR(read(file.filename, String))
end

function loadfile(::Type{<:Number}, file::File{format"TXT"})
Expand All @@ -24,7 +24,7 @@ function savefile(file::TextFile, content)
write(file.filename, string(content))
end

function query_extended(filename)
function query_extended(filename::AbstractString)
file, ext = splitext(filename)
# TODO: make this less hacky
if uppercase(ext) == ".SHA256"
Expand All @@ -38,20 +38,28 @@ function query_extended(filename)
res
end

# Some target formats are not supported by FileIO and thus require an encoding/compression process
# before saving. For other formats, we should trust IO backends and make as few changes as possible.
# Otherwise, the reference becomes unfaithful. The encoding process helps make the actual data match
# the reference data, which is loaded from the reference file via IO backends.
#
# TODO: split `maybe_encode` to `maybe_preprocess` and `maybe_encode`
"""
_convert(T::Type{<:DataFormat}, x; kw...) -> out
maybe_encode(T::Type{<:DataFormat}, x; kw...) -> out
Convert `x` to a validate content for file data format `T`.
If needed, encode `x` into valid content that matches format `T`.
If there is no known method to encode `x`, `x` is returned unchanged without warning.
"""
_convert(::Type{<:DataFormat}, x; kw...) = x
maybe_encode(::Type{<:DataFormat}, x; kw...) = x

# plain TXT
_convert(::Type{DataFormat{:TXT}}, x; kw...) = replace(string(x), "\r"=>"") # ignore CRLF/LF difference
_convert(::Type{DataFormat{:TXT}}, x::Number; kw...) = x
function _convert(::Type{DataFormat{:TXT}}, x::AbstractArray{<:AbstractString}; kw...)
return join(x, '\n')
end
function _convert(
# Generic TXT fallback: stringify `x`, then normalize its line endings.
function maybe_encode(::Type{DataFormat{:TXT}}, x; kw...)
    return _ignore_CR(string(x))
end

# Arrays of strings are joined with newline separators into a single string.
function maybe_encode(::Type{DataFormat{:TXT}}, x::AbstractArray{<:AbstractString}; kw...)
    return _join(x)
end

# Strings only need their line endings normalized.
function maybe_encode(::Type{DataFormat{:TXT}}, x::AbstractString; kw...)
    return _ignore_CR(x)
end

# Numbers are handed back unchanged.
# TODO: Change this to string(x) ?
function maybe_encode(::Type{DataFormat{:TXT}}, x::Number; kw...)
    return x
end

function maybe_encode(
::Type{DataFormat{:TXT}}, img::AbstractArray{<:Colorant};
size = (20,40), kw...)

Expand All @@ -65,11 +73,25 @@ function _convert(
end

# SHA256
_convert(::Type{DataFormat{:SHA256}}, x; kw...) = bytes2hex(sha256(string(x)))
function _convert(::Type{DataFormat{:SHA256}}, img::AbstractArray{<:Colorant}; kw...)
# Generic SHA256 fallback: hash the string representation of `x`.
maybe_encode(::Type{DataFormat{:SHA256}}, x; kw...) = _sha256(string(x))
# Strings are hashed after CRLF normalization. `kw...` is accepted and ignored, like in
# every other `maybe_encode` method, so that keyword arguments forwarded by the caller
# do not raise a `MethodError` for string inputs.
maybe_encode(::Type{DataFormat{:SHA256}}, x::AbstractString; kw...) = _sha256(_ignore_CR(x))
# Arrays of strings are joined into one multi-line string before hashing.
maybe_encode(::Type{DataFormat{:SHA256}}, x::AbstractArray{<:AbstractString}; kw...) = _sha256(_join(x))
# Encode an image as the concatenation of two SHA256 hex digests: the first over the
# image size (each dimension converted to `Int64`), the second over the raw channel data.
function maybe_encode(::Type{DataFormat{:SHA256}}, img::AbstractArray{<:Colorant}; kw...)
    # The size is hashed separately, so two images with the same raw data but
    # different shapes produce different results.
    size_str = _sha256(reinterpret(UInt8, [map(Int64, size(img))...]))
    img_str = _sha256(reinterpret(UInt8, vec(rawview(channelview(img)))))

    return size_str * img_str
end

# Helpers
# Join an array of strings with "\n" separators and normalize the line endings of the result.
function _join(x::AbstractArray{<:AbstractString})
    joined = join(x, "\n")
    return _ignore_CR(joined)
end

# Return the SHA256 digest of `x` as a lowercase hexadecimal string.
function _sha256(x)
    digest = sha256(x)
    return bytes2hex(digest)
end

"""
    _ignore_CR(x::AbstractString)

Ignore the CRLF(`\\r\\n`) and LF(`\\n`) difference by replacing every CRLF sequence in `x`
with a single LF. A `\\r` that is not followed by `\\n` is left in place.

CRLF format is widely used by Windows while LF format is mainly used by Linux.
"""
function _ignore_CR(x::AbstractString)
    return replace(x, "\r\n" => "\n") # issue #39
end
2 changes: 1 addition & 1 deletion src/test_reference.jl
Expand Up @@ -107,7 +107,7 @@ function test_reference(
rendermode = default_rendermode(F, raw_actual)
end

actual = _convert(F, raw_actual; kw...)
actual = maybe_encode(F, raw_actual; kw...)
# preprocessing when reference file doesn't exist
if !isfile(path)
@info("Reference file for \"$filename\" does not exist. It will be created")
Expand Down
165 changes: 165 additions & 0 deletions test/fileio.jl
@@ -0,0 +1,165 @@
refdir = joinpath(refroot, "fileio")

@testset "query" begin
    # Each entry pairs a file name with the data format `query_extended` is expected to report.
    expected_formats = [
        # text types
        ("textfile_with_no_extension", format"TXT"),
        ("textfile.txt", format"TXT"),
        ("textfile.unknown", format"TXT"),
        ("textfile.sha256", format"SHA256"),

        # image types
        ("imagefile.jpg", format"JPEG"),
        ("imagefile.jpeg", format"JPEG"),
        ("imagefile.png", format"PNG"),
        ("imagefile.tif", format"TIFF"),
        ("imagefile.tiff", format"TIFF"),

        # dataframe types
        ("dataframe_file.csv", format"CSV")
    ]
    for (name, expected) in expected_formats
        # The result is checked for both the relative and the absolute form of the path.
        for path in (name, abspath(name))
            @test ReferenceTests.query_extended(path) == File{expected}(path)
        end
    end
end

@testset "maybe_encode" begin
@testset "string" begin
    # Fixtures: plain strings, their CRLF/array variants, and the SHA256 digest of
    # the LF-only text each of them is expected to be equivalent to.
    str1 = "Hello world"
    str1_sha256 = "64ec88ca00b268e5ba1a35678a1b5316d212f4f366b2477232534a8aeca37f3c"
    str2 = "Hello\n world"
    str2_sha256 = "60b65ab310480818c4289227f2ec68f1714743db8571b4cb190e100c0085be3d" # bytes2hex(SHA.sha256(str2))
    str2_crlf = "Hello\r\n world"
    str3 = "Hello\nworld"
    str3_sha256 = "46e0ea795802f17d0b340983ca7d7068c94d7d9172ee4daea37a1ab1168649ec" # bytes2hex(SHA.sha256(str3))
    str3_arr1 = ["Hello", "world"]
    str3_arr2 = ["Hello" "world"]
    str4 = "Hello\n world1\nHello\n world2"
    str4_sha256 = "c7dc8b82c3a6fed4afa0c8790a0586b73df0e4f35524efe6810e5d78b6b6a611" # bytes2hex(SHA.sha256(str4))
    str4_arr = ["Hello\r\n world1", "Hello\n world2"]

    # (input, expected plain text, expected SHA256 digest)
    cases = (
        # whitespace must be respected
        (str1, str1, str1_sha256),
        (str2, str2, str2_sha256),
        # CRLF/LF differences are ignored
        (str2_crlf, str2, str2_sha256),
        # string arrays are treated as multi-line strings
        (str3, str3, str3_sha256),
        (str3_arr1, str3, str3_sha256),
        (str3_arr2, str3, str3_sha256),
        # string arrays ignore CRLF/LF differences, too
        (str4_arr, str4, str4_sha256),
    )

    # string as plain text
    for (input, text, _) in cases
        @test text == ReferenceTests.maybe_encode(format"TXT", input)
    end

    # string as SHA256: the digest is that of the normalized text
    for (input, _, digest) in cases
        @test digest == ReferenceTests.maybe_encode(format"SHA256", input)
    end

    # formats without a dedicated string encoder hand the very same object back
    for input in (str1, str2, str2_crlf, str3, str3_arr1, str3_arr2)
        @test input === ReferenceTests.maybe_encode(format"PNG", input)
    end
end

@testset "numbers" begin
    sha = format"SHA256"

    # scalars: TXT and UNKNOWN return the number itself, SHA256 hashes its string form
    for num in (0x01, 1, 1.0f0, 1.0)
        for passthrough in (format"TXT", format"UNKNOWN")
            @test num === ReferenceTests.maybe_encode(passthrough, num)
        end
        @test ReferenceTests.maybe_encode(sha, num) == ReferenceTests.maybe_encode(sha, string(num))
    end

    # if target is TXT, numeric arrays are converted to their string form
    txt_cases = [
        ([1, 2], "[1, 2]"),
        ([1,2], "[1, 2]"),
        ([1;2], "[1, 2]"),
        ([1 2], "[1 2]"),
        ([1 2; 3 4], "[1 2; 3 4]"),
    ]
    for (input, expected) in txt_cases
        @test expected == ReferenceTests.maybe_encode(format"TXT", input)
    end

    # if target is Unknown, no change is made
    unknown_cases = [
        ([1, 2], [1, 2]),
        ([1,2], [1, 2]),
        ([1;2], [1, 2]),
        ([1 2], [1 2]),
        ([1 2; 3 4], [1 2; 3 4]),
    ]
    for (input, expected) in unknown_cases
        @test expected == ReferenceTests.maybe_encode(format"UNKNOWN", input)
    end

    # SHA256 of a numeric array equals SHA256 of its string form
    for arr in [[1, 2], [1 2], [1 2; 3 4]]
        @test ReferenceTests.maybe_encode(sha, arr) == ReferenceTests.maybe_encode(sha, string(arr))
    end
end

@testset "image" begin
    gray_1d = Gray{N0f8}.(0.0:0.1:0.9)
    gray_2d = Gray{N0f8}.(reshape(0.0:0.1:0.9, 2, 5))
    gray_3d = Gray{N0f8}.(reshape(0.0:0.02:0.95, 2, 4, 6))
    rgb_1d = RGB.(gray_1d)
    rgb_2d = RGB.(gray_2d)
    rgb_3d = RGB.(gray_3d)

    # name => image; the name is the prefix of the matching reference file
    images = (
        "gray_1d" => gray_1d,
        "gray_2d" => gray_2d,
        "gray_3d" => gray_3d,
        "rgb_1d" => rgb_1d,
        "rgb_2d" => rgb_2d,
        "rgb_3d" => rgb_3d,
    )

    # common image formats (and unknown ones) must return the very same image object
    for (_, img) in images
        for passthrough in (format"JPEG", format"PNG", format"TIFF", format"UNKNOWN")
            @test img === ReferenceTests.maybe_encode(passthrough, img)
        end
    end

    # image as text file
    # TODO: support n-D image encoding; only the 2-D images are checked for now
    # (gray_1d, rgb_1d, gray_3d and rgb_3d are still pending)
    txt = format"TXT"
    @test_reference joinpath(refdir, "gray_2d_as_txt.txt") ReferenceTests.maybe_encode(txt, gray_2d)
    @test_reference joinpath(refdir, "rgb_2d_as_txt.txt") ReferenceTests.maybe_encode(txt, rgb_2d)

    # image as SHA256
    sha = format"SHA256"
    for (file, img) in images
        reffile = joinpath(refdir, "$(file)_as_sha256.txt")
        @test_reference reffile ReferenceTests.maybe_encode(sha, img)
    end
end

# dataframe
@testset "dataframe" begin
    table = DataFrame(v1=[1,2,3], v2=["a","b","c"])

    # TXT encodes the table as its string form
    @test string(table) == ReferenceTests.maybe_encode(format"TXT", table)

    # CSV and unknown formats return the very same object
    for passthrough in (format"CSV", format"UNKNOWN")
        @test table === ReferenceTests.maybe_encode(passthrough, table)
    end

    # SHA256 output is compared against the stored reference digest
    @test_reference joinpath(refdir, "dataframe_as_sha256.txt") ReferenceTests.maybe_encode(format"SHA256", table)
end
end

# TODO: savefile & loadfile
1 change: 1 addition & 0 deletions test/references/fileio/dataframe_as_sha256.txt
@@ -0,0 +1 @@
2cf7c4edcafc27a5eb1b74fb0af704edc0d9bbef91a1b55d3b7350fa4b54cd18
1 change: 1 addition & 0 deletions test/references/fileio/gray_1d_as_sha256.txt
@@ -0,0 +1 @@
a111f275cc2e7588000001d300a31e76336d15b9d314cd1a1d8f3d3556975eed10ef43c7fcace84c4d0d54b8e92c0c9be2d14a6bf3dd7647254a3cc0c4a04297
1 change: 1 addition & 0 deletions test/references/fileio/gray_2d_as_sha256.txt
@@ -0,0 +1 @@
26cfbb315c316a0b15516434f90284e5011dcb58503fe39eb036bf669bd8233d10ef43c7fcace84c4d0d54b8e92c0c9be2d14a6bf3dd7647254a3cc0c4a04297
1 change: 1 addition & 0 deletions test/references/fileio/gray_2d_as_txt.txt
@@ -0,0 +1 @@
▀▀▀▀▀
1 change: 1 addition & 0 deletions test/references/fileio/gray_3d_as_sha256.txt
@@ -0,0 +1 @@
72307e420b5460c03a1c167060ed336407c26ea74aabf8fab76dd8e9dbe8cbe4baf0f53196e8d5270c0b0b2da82bbbb4676edbb0ebf84ec0dcbd8c0bf4d9af68
1 change: 1 addition & 0 deletions test/references/fileio/rgb_1d_as_sha256.txt
@@ -0,0 +1 @@
a111f275cc2e7588000001d300a31e76336d15b9d314cd1a1d8f3d3556975eedebd6b0ad29dd5402ce5745bb5b48d4c59b7f8da0cdf8d2f287befd9094f6ac89
1 change: 1 addition & 0 deletions test/references/fileio/rgb_2d_as_sha256.txt
@@ -0,0 +1 @@
26cfbb315c316a0b15516434f90284e5011dcb58503fe39eb036bf669bd8233debd6b0ad29dd5402ce5745bb5b48d4c59b7f8da0cdf8d2f287befd9094f6ac89
1 change: 1 addition & 0 deletions test/references/fileio/rgb_2d_as_txt.txt
@@ -0,0 +1 @@
▀▀▀▀▀
1 change: 1 addition & 0 deletions test/references/fileio/rgb_3d_as_sha256.txt
@@ -0,0 +1 @@
72307e420b5460c03a1c167060ed336407c26ea74aabf8fab76dd8e9dbe8cbe45465bcbf50acdbe5600207e3266eedef6548bc4d244e55d7a1af0f1af09e019f

0 comments on commit afc31d4

Please sign in to comment.