JuliaML · adrhill · Jun 23, 2022 · Jun 23, 2022 · Jun 23, 2022 · Jun 23, 2022
diff --git a/Project.toml b/Project.toml
@@ -12,16 +12,19 @@ FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
 GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
 Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
 HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
+ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
 ImageShow = "4e3cecfd-b093-5904-9786-8bbb286a6a31"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
+JpegTurbo = "b835a17e-a41a-41e7-81f0-2f016b05efe0"
 LazyModules = "8cdb02fc-e678-4876-92c5-9defec4f444e"
 MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 NPZ = "15e1cf62-19b3-5cfa-8e77-841668bca605"
 Pickle = "fbb45041-c46e-462f-888f-7c521cafbc2c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+StackViews = "cae243ae-269e-4f55-b966-ac2d0dc13c15"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
@@ -33,15 +36,18 @@ FixedPointNumbers = "0.8"
 GZip = "0.5"
 Glob = "1.3"
 HDF5 = "0.16.2"
+ImageCore = "0.9"
 ImageShow = "0.3"
 JLD2 = "0.4.21"
 JSON3 = "1"
+JpegTurbo = "0.1"
 LazyModules = "0.3"
 MAT = "0.10"
 MLUtils = "0.2.0"
 NPZ = "0.4.1"
 Pickle = "0.3"
 Requires = "1"
+StackViews = "0.1"
 Tables = "1.6"
 julia = "1.6"
 

diff --git a/docs/src/datasets/vision.md b/docs/src/datasets/vision.md
@@ -22,6 +22,7 @@ convert2image
 ```@docs
 CIFAR10
 CIFAR100
+ImageNet
 EMNIST
 FashionMNIST
 MNIST

diff --git a/src/MLDatasets.jl b/src/MLDatasets.jl
@@ -29,6 +29,11 @@ import CSV
 @lazy import HDF5="f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
 # @lazy import JLD2
 
+# Required for ImageNet
+@lazy import JpegTurbo="b835a17e-a41a-41e7-81f0-2f016b05efe0"   # Open jpg-files
+@lazy import ImageCore="a09fc81d-aa75-5fe9-8630-4744c3626534"   # Preprocessing
+@lazy import StackViews="cae243ae-269e-4f55-b966-ac2d0dc13c15"  # Batching of images
+
 export getobs, numobs # From MLUtils.jl
 
 include("abstract_datasets.jl")
@@ -86,6 +91,9 @@ include("datasets/vision/cifar100.jl")
 export CIFAR100
 include("datasets/vision/svhn2.jl")
 export SVHN2
+include("datasets/vision/imagenet_reader/ImageNetReader.jl")
+include("datasets/vision/imagenet.jl")
+export ImageNet
 
 ## Text
 
@@ -147,6 +155,7 @@ function __init__()
     __init__fashionmnist()
     __init__mnist()
     __init__svhn2()
+    __init__imagenet()
 end
 
 end #module
diff --git a/src/datasets/vision/imagenet.jl b/src/datasets/vision/imagenet.jl
@@ -0,0 +1,204 @@
+const IMAGENET_WEBSITE = "https://image-net.org/"
+
+function __init__imagenet()
+    DEPNAME = "ImageNet" # name under which the manual data dependency is registered
+    return register(
+        ManualDataDep(
+            DEPNAME,
+            # TODO: currently markdown formatting is not applied
+            """
+            The ImageNet 2012 Classification Dataset (ILSVRC 2012-2017) can be downloaded at
+            $IMAGENET_WEBSITE after signing up and accepting the terms of access.
+            It is therefore required that you download this dataset manually.
+
+            ## Existing installation
+            The dataset structure is assumed to look as follows:
+            ```
+            ImageNet
+            ├── train
+            ├── val
+            │   ├── n01440764
+            │   │   ├── ILSVRC2012_val_00000293.JPEG
+            │   │   ├── ILSVRC2012_val_00002138.JPEG
+            │   │   └── ...
+            │   ├── n01443537
+            │   └── ...
+            ├── test
+            └── devkit
+                ├── data
+                │   ├── meta.mat
+                │   └── ...
+                └── ...
+            ```
+            If your existing copy of the ImageNet dataset uses another file structure,
+            we recommend to create symbolic links, e.g. using `ln` on Unix-like operating
+            systems:
+            ```bash
+            cd ~/.julia/datadeps
+            mkdir -p ImageNet
+            ln -s /absolute/path/to/imagenet/val ImageNet/val
+            mkdir -p ImageNet/devkit
+            ln -s /absolute/path/to/imagenet/devkit/data ImageNet/devkit/data
+            ```
+
+            ## New installation
+            Download the following files from the ImageNet website ($IMAGENET_WEBSITE):
+            * `ILSVRC2012_devkit_t12.tar.gz`
+            * `ILSVRC2012_img_train.tar`, only required for `:train` split
+            * `ILSVRC2012_img_val.tar`, only required for `:val` split
+
+            After downloading the data, move and extract the training and validation images to
+            labeled subfolders running the following shell script:
+            ```bash
+            # Extract the training data:
+            mkdir -p ImageNet/train && tar -xvf ILSVRC2012_img_train.tar -C ImageNet/train
+            # Unpack all 1000 compressed tar-files, one for each category:
+            cd ImageNet/train
+            find . -name "*.tar" | while read NAME ; do mkdir -p "\${NAME%.tar}"; tar -xvf "\${NAME}" -C "\${NAME%.tar}"; rm -f "\${NAME}"; done
+
+            # Extract the validation data:
+            cd ../..
+            mkdir -p ImageNet/val && tar -xvf ILSVRC2012_img_val.tar -C ImageNet/val
+
+            # Run script from soumith to create all class directories and moves images into corresponding directories:
+            cd ImageNet/val
+            wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash
+
+            # Extract metadata from the devkit:
+            cd ../..
+            mkdir -p ImageNet/devkit && tar -xvf ILSVRC2012_devkit_t12.tar.gz -C ImageNet/devkit --strip-components=1
+            ```
+            """,
+            # shell script based on PyTorch example "ImageNet training in PyTorch":
+            # https://github.com/pytorch/examples/blob/d5478765d38210addf474dd73faf0d103052027a/imagenet/extract_ILSVRC.sh
+        ),
+    )
+end
+
+"""
+    ImageNet(; Tx=Float32, split=:train, dir=nothing)
+    ImageNet([Tx, split])
+
+The ImageNet 2012 Classification Dataset (ILSVRC 2012-2017).
+This is the most highly-used subset of ImageNet. It spans 1000 object classes and contains
+1,281,167 training images, 50,000 validation images and 100,000 test images.
+Each image is in 224x224x3 format using RGB color space.
+
+- Authors: Olga Russakovsky, Jia Deng, Hao Su, Jonathan Krause, Sanjeev Satheesh,
+    Sean Ma, Zhiheng Huang, Andrej Karpathy, Aditya Khosla, Michael Bernstein,
+    Alexander C. Berg, Li Fei-Fei
+- Website: $IMAGENET_WEBSITE
+- Reference: Russakovsky et al., ImageNet Large Scale Visual Recognition Challenge
+    (https://arxiv.org/abs/1409.0575)
+
+# Arguments
+
+$ARGUMENTS_SUPERVISED_ARRAY
+- `split`: selects the data partition. Can take the values `:train`, `:val` or `:test`.
+
+# Fields
+
+$FIELDS_SUPERVISED_ARRAY
+- `split`.
+
+# Methods
+
+$METHODS_SUPERVISED_ARRAY
+- [`convert2image`](@ref) converts features to `RGB` images.
+
+# Examples
+
+```julia-repl
+julia> using MLDatasets: ImageNet
+
+julia> dataset = ImageNet(:val)
+dataset ImageNet:
+  metadata    =>    Dict{String, Any} with 4 entries
+  split       =>    :val
+  files       =>    50000-element Vector{String}
+  targets     =>    50000-element Vector{Int64}
+  Tx          =>    Float32
+
+julia> dataset[1:5].targets
+5-element Vector{Int64}:
+ 1
+ 1
+ 1
+ 1
+ 1
+
+julia> X, y = dataset[1:5];
+
+julia> size(X)
+(224, 224, 3, 5)
+
+julia> dataset.metadata
+Dict{String, Any} with 4 entries:
+  "class_WNIDs"       => ["n02119789", "n02100735", "n02110185", "n02096294", "n02102040", "n02066245", "n02509815", "n02124075", "n02417914", "n02123394"  …  "n02815834", "n09229709", "n07697313", "n03888605", "n03355925", "n03…
+  "class_description" => ["small grey fox of southwestern United States; may be a subspecies of Vulpes velox", "an English breed having a plumed tail and a soft silky coat that is chiefly white", "breed of sled dog developed in …
+  "class_names"       => Vector{SubString{String}}[["kit fox", "Vulpes macrotis"], ["English setter"], ["Siberian husky"], ["Australian terrier"], ["English springer", "English springer spaniel"], ["grey whale", "gray whale", "d…
+  "wnid_to_label"     => Dict("n07693725"=>768, "n03775546"=>829, "n01689811"=>469, "n02100877"=>192, "n02441942"=>48, "n04371774"=>569, "n07717410"=>741, "n03347037"=>919, "n04355338"=>526, "n02097474"=>158…)
+```
+"""
+struct ImageNet <: SupervisedDataset
+    metadata::Dict{String,Any} # class WNIDs, names, descriptions and WNID-to-label map
+    split::Symbol # one of :train, :val or :test
+    files::Vector{String} # paths of the image files; images are decoded when indexing
+    targets::Vector{Int} # class label of each entry in `files`
+    Tx::Type # element type requested for the decoded features
+end
+
+ImageNet(; split=:train, Tx=Float32, kws...) = ImageNet(Tx, split; kws...) # forward all keywords
+ImageNet(split::Symbol; kws...) = ImageNet(; split, kws...) # positional split only
+ImageNet(Tx::Type; kws...) = ImageNet(; Tx, kws...) # positional feature type only
+
+function ImageNet(
+    Tx::Type,
+    split::Symbol;
+    dir=nothing,
+    train_dir="train",
+    val_dir="val",
+    test_dir="test",
+    devkit_dir="devkit",
+)
+    # Image folder and expected number of images for each supported split
+    split_dirs = (train=train_dir, val=val_dir, test=test_dir)
+    split_sizes = (train=1_281_167, val=50_000, test=100_000)
+    # Validate user input with an exception, since `@assert` may be disabled
+    haskey(split_dirs, split) || throw(
+        ArgumentError("split must be :train, :val or :test, got :$split")
+    )
+
+    DEPNAME = "ImageNet"
+    METADATA_FILENAME = joinpath(devkit_dir, "data", "meta.mat")
+
+    # Load metadata
+    file_path = datafile(DEPNAME, METADATA_FILENAME, dir)
+    metadata = ImageNetReader.read_metadata(file_path)
+
+    # Honor `dir` for the images too (assumed to be the dataset root, as for metadata)
+    root_dir = isnothing(dir) ? (@datadep_str DEPNAME) : dir
+    files = ImageNetReader.readdata(joinpath(root_dir, split_dirs[split]))
+    nfiles, nexpected = length(files), split_sizes[split]
+    nfiles == nexpected || error(
+        "Expected $nexpected images for split :$split, but found $nfiles"
+    )
+    # The folder containing each image is named after its WordNet ID
+    wnids = ImageNetReader.load_wnids(files)
+    targets = [metadata["wnid_to_label"][wnid] for wnid in wnids]
+    return ImageNet(metadata, split, files, targets, Tx)
+end
+
+function convert2image(::Type{<:ImageNet}, x::AbstractArray{<:Integer})
+    return convert2image(ImageNet, reinterpret(N0f8, convert(Array{UInt8}, x))) # as N0f8
+end
+convert2image(::Type{<:ImageNet}, x) = ImageNetReader.inverse_preprocess(x) # undo preprocess
+
+Base.length(d::ImageNet) = length(d.files) # number of image files in the split
+function Base.getindex(d::ImageNet, ::Colon)
+    # TODO: throw a warning here? Decoding the whole split probably will not fit in memory
+    return (features=ImageNetReader.readimage(d.Tx, d.files), targets=d.targets)
+end
+function Base.getindex(d::ImageNet, i) # `i` indexes both the `files` and `targets` vectors
+    return (features=ImageNetReader.readimage(d.Tx, d.files[i]), targets=d.targets[i])
+end
diff --git a/src/datasets/vision/imagenet_reader/ImageNetReader.jl b/src/datasets/vision/imagenet_reader/ImageNetReader.jl
@@ -0,0 +1,48 @@
+module ImageNetReader
+import ..FileDataset
+import ..read_mat
+import ..@lazy
+
+@lazy import JpegTurbo = "b835a17e-a41a-41e7-81f0-2f016b05efe0"
+@lazy import ImageCore="a09fc81d-aa75-5fe9-8630-4744c3626534"
+@lazy import StackViews="cae243ae-269e-4f55-b966-ac2d0dc13c15"
+
+const NCLASSES = 1000 # number of classes; labels run from 1 to NCLASSES
+const IMGSIZE = (224, 224) # preferred decoding size and default center-crop size
+
+include("preprocess.jl")
+
+function read_metadata(file::AbstractString)
+    synsets = read_mat(file)["synsets"]
+    leaves = iszero.(synsets["num_children"]) # mask of synsets without children
+    @assert synsets["ILSVRC2012_ID"][leaves] == 1:NCLASSES
+    wnids = Vector{String}(synsets["WNID"][leaves]) # WordNet IDs
+    return Dict{String,Any}(
+        "class_WNIDs" => wnids,
+        "class_names" => split.(synsets["words"][leaves], ", "),
+        "class_description" => Vector{String}(synsets["gloss"][leaves]),
+        "wnid_to_label" => Dict(wnids .=> 1:NCLASSES),
+    )
+end
+
+# The full ImageNet dataset doesn't fit into memory, so only the `*.JPEG` paths are kept
+readdata(dir::AbstractString) = FileDataset(identity, dir, "*.JPEG").paths
+
+# Get the WordNet ID of each file, i.e. the name of the folder containing it (any OS)
+function load_wnids(files::AbstractVector{<:AbstractString})
+    return [basename(dirname(f)) for f in files]
+end
+
+# Decode the JPEG at `file` and preprocess it into a normalized 224x224x3 Array{Tx,3}
+function readimage(Tx::Type{<:Real}, file::AbstractString)
+    ColorT = ImageCore.RGB{Tx} # pixel type requested from the decoder
+    return preprocess(Tx, JpegTurbo.jpeg_decode(ColorT, file; preferred_size=IMGSIZE))
+end
+
+# Load a batched array of images
+cat_batchdim(xs...) = cat(xs...; dims=4) # concatenates along dimension 4
+function readimage(Tx::Type, files::AbstractVector{<:AbstractString})
+    return StackViews.StackView(map(file -> readimage(Tx, file), files))
+end
+
+end # module
diff --git a/src/datasets/vision/imagenet_reader/preprocess.jl b/src/datasets/vision/imagenet_reader/preprocess.jl
@@ -0,0 +1,32 @@
+# Image preprocessing for ImageNet models.
+# Code adapted from Metalhead 0.5.3's utils.jl
+
+# Take rectangle of pixels of shape `outsize` at the center of image `im`
+adjust(i::Integer) = ifelse(iszero(i % 2), 1, 0) # 1 for even sizes, 0 for odd sizes
+function center_crop_view(im::AbstractMatrix, outsize=IMGSIZE)
+    h2, w2 = div.(outsize, 2) # half height, half width of view
+    h_adjust, w_adjust = adjust.(outsize) # `.+ 1` below: zero-based offsets -> indices
+    return @view im[
+        ((div(end, 2) - h2):(div(end, 2) + h2 - h_adjust)) .+ 1,
+        ((div(end, 2) - w2):(div(end, 2) + w2 - w_adjust)) .+ 1,
+    ]
+end
+
+# Per-channel coefficients taken from PyTorch's ImageNet normalization code
+const PYTORCH_MEAN = [0.485f0, 0.456f0, 0.406f0] # subtracted in `preprocess`
+const PYTORCH_STD = [0.229f0, 0.224f0, 0.225f0] # divided by in `preprocess`
+
+function preprocess(Tx::Type, im::AbstractMatrix{<:ImageCore.AbstractRGB})
+    channels = ImageCore.channelview(center_crop_view(im))
+    normalized = (channels .- PYTORCH_MEAN) ./ PYTORCH_STD
+    # Reorder from channel-first CHW (Image.jl's ordering) to WHC, converting to `Tx`:
+    return Tx.(PermutedDimsArray(normalized, (3, 2, 1)))
+end
+
+function inverse_preprocess(x::AbstractArray{T,N}) where {T,N}
+    N in (3, 4) || throw(ArgumentError("expected 3 or 4 dimensions, got $N"))
+    # Restore channel-first order (a batch dimension stays last) and undo normalization
+    chw = PermutedDimsArray(x, (3, 2, 1, 4:N...)) .* PYTORCH_STD .+ PYTORCH_MEAN
+    # View the leading channel dimension as RGB pixels
+    return ImageCore.colorview(ImageCore.RGB, chw)
+end