From ab5061cdaa13b9641cfa2e0bcb237b25ffdb0b0a Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Sat, 3 Jul 2021 16:45:57 +0200 Subject: [PATCH] support for jagged branch of basic std types --- Project.toml | 1 - src/UnROOT.jl | 2 +- src/bootstrap.jl | 3 + src/root.jl | 108 +++++++++----------- test/runtests.jl | 20 ++-- test/samples/tree_with_large_array_lz4.root | Bin 809061 -> 809061 bytes 6 files changed, 67 insertions(+), 67 deletions(-) diff --git a/Project.toml b/Project.toml index 737a54fc..32024d34 100644 --- a/Project.toml +++ b/Project.toml @@ -10,7 +10,6 @@ CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" MD5 = "6ac74813-4b46-53a4-afec-0b5dc9d7885c" Mixers = "2a8e4939-dab8-5edc-8f64-72a8776f13de" -OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" diff --git a/src/UnROOT.jl b/src/UnROOT.jl index dc6fb98a..1b2104d5 100644 --- a/src/UnROOT.jl +++ b/src/UnROOT.jl @@ -7,7 +7,7 @@ import Base: keys, get, getindex, show, length, iterate, position using CodecZlib, CodecLz4, CodecXz using Mixers using Parameters -using StaticArrays, OffsetArrays +using StaticArrays include("constants.jl") include("io.jl") diff --git a/src/bootstrap.jl b/src/bootstrap.jl index ca5a9610..f3ef25d3 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -790,3 +790,6 @@ function TTree(io, tkey::TKey, refs) endcheck(io, preamble) TTree(;fields...) end + +# FIXME what to do with auto.py's massive type translation? 
+# https://github.com/scikit-hep/uproot3/blob/54f5151fb7c686c3a161fbe44b9f299e482f346b/uproot3/interp/auto.py#L360-L365 diff --git a/src/root.jl b/src/root.jl index 9bbf56a5..01a6fcec 100644 --- a/src/root.jl +++ b/src/root.jl @@ -130,6 +130,7 @@ function Base.getindex(t::T, s::AbstractString) where {T<:Union{TTree, TBranchEl end missing end + """ function array(f::ROOTFile, path) @@ -145,17 +146,44 @@ function array(f::ROOTFile, path; raw=false) end end - if raw - return readbasketsraw(f.fobj, branch) - end - - if length(branch.fLeaves.elements) > 1 - error("Branches with multiple leaves are not supported yet.") + if !raw && length(branch.fLeaves.elements) > 1 + error( + "Branches with multiple leaves are not supported yet. Try reading with `array(...; raw=true)`.", + ) end leaf = first(branch.fLeaves.elements) - - readbaskets(f.fobj, branch, primitivetype(leaf)) + rawdata, rawoffsets = readbasketsraw(f.fobj, branch) + if raw + return rawdata, rawoffsets + else + if leaf isa TLeafElement # non-primitive jagged leaf + classname = branch.fClassName # the C++ class name, such as "vector" + m = match(r"vector<(.*)>", classname) + isnothing(m) && error("Cannot understand fClassName: $classname.") + elname = m[1] + elname = endswith(elname, "_t") ? 
lowercase(chop(elname; tail=2)) : elname # Double_t -> double + T = try + getfield(Base, Symbol(:C, elname)) + catch + error("Cannot convert element of $elname to a native Julia type") + end + + jagg_offset = 10 # magic offsets, seems to be common for a lot of types, see auto.py in uproot3 + + # for each "event", the index range is `offsets[i] + jagg_offset + 1` to `offsets[i+1]` + # this is why we need to append `rawoffsets` in the `readbasketsraw()` call + # when you use this range to index `rawdata`, you will get the raw bytes belonging to each event + # Say your real data is Int32 and you see 8 bytes after indexing, then this event has [num1, num2] as real data + @views [ + ntoh.(reinterpret( + T, rawdata[ (rawoffsets[i]+jagg_offset+1):rawoffsets[i+1] ] + )) for i in 1:(length(rawoffsets) - 1) + ] + else # the branch is not jagged + return ntoh.(reinterpret(primitivetype(leaf), rawdata)) + end + end end @@ -173,8 +201,14 @@ function DataFrame(f::ROOTFile, path) DataFrame(cols, names, copycols=false) #avoid double allocation end +""" + splitup(data::Vector{UInt8}, offsets, T::Type; skipbytes=0, primitive=false) + +Given the `offsets` and `data` returned by `array(...; raw = true)`, reconstruct the actual +array (can be jagged, or with custom structs). 
+""" function splitup(data::Vector{UInt8}, offsets, T::Type; skipbytes=0, primitive=false) - elsize = packedsizeof(T) + elsize = sizeof(T) out = sizehint!(Vector{Vector{T}}(), length(offsets)) lengths = diff(offsets) push!(lengths, length(data) - offsets[end] + offsets[1]) # yay ;) @@ -194,29 +228,6 @@ function splitup(data::Vector{UInt8}, offsets, T::Type; skipbytes=0, primitive=f end -function readbaskets(io, branch, ::Type{T}) where {T} - seeks = branch.fBasketSeek - entries = branch.fBasketEntry - - out = sizehint!(Vector{T}(), branch.fEntries) - - - for (idx, basket_seek) in enumerate(seeks) - @debug "Reading basket" idx basket_seek - if basket_seek == 0 - break - end - seek(io, basket_seek) - basketkey = unpack(io, TBasketKey) - s = datastream(io, basketkey) - - for _ in entries[idx]:(entries[idx + 1] - 1) - push!(out, readtype(s, T)) - end - end - out -end - function readbasketsraw(io, branch) seeks = branch.fBasketSeek @@ -226,19 +237,17 @@ function readbasketsraw(io, branch) # Just to check if we have a jagged structure # streamer = streamerfor() - # FIXME This UInt8 is wrong, the final data depends on branch info max_len = sum(bytes) data = sizehint!(Vector{UInt8}(), max_len) offsets = sizehint!(Vector{Int32}(), total_entries+1) # this is always Int32 idx = 1 - _res = sizehint!(Vector{Int32}(), max_len) for (basket_seek, n_bytes) in zip(seeks, bytes) @debug "Reading raw basket data" basket_seek n_bytes basket_seek == 0 && break seek(io, basket_seek) - idx += readbasketbytes!(data, offsets, io, idx, _res) + idx += readbasketbytes!(data, offsets, io, idx) end - _res, offsets + data, offsets end @@ -253,48 +262,29 @@ end # │ │ # │← fObjlen →│ # -function readbasketbytes!(data, offsets, io, idx, _res::Vector{T}) where T +function readbasketbytes!(data, offsets, io, idx) basketkey = unpack(io, TBasketKey) - # @show basketkey s = datastream(io, basketkey) # position(s) == 0, but offsets start at -basketkey.fKeylen start = position(s) - # @show start contentsize 
= basketkey.fLast - basketkey.fKeylen offsetbytesize = basketkey.fObjlen - contentsize - 8 offset_len = offsetbytesize ÷ 4 # these are always Int32 if offsetbytesize > 0 - @debug "Offset data present" offsetlength + @debug "Offset data present" offsetbytesize skip(s, contentsize) skip(s, 4) # a flag that indicates the type of data that follows readoffsets!(offsets, s, offset_len, length(data), length(data)) skip(s, 4) # "Pointer-to/location-of last used byte in basket" seek(s, start) end - push!(offsets, basketkey.fLast) - offsets .-= basketkey.fKeylen @debug "Reading $(contentsize) bytes" readbytes!(s, data, idx, contentsize) + push!(offsets, basketkey.fLast) + offsets .-= basketkey.fKeylen - # FIXME wtf is going on here please make this non-allocating - # https://github.com/scikit-hep/uproot3/blob/54f5151fb7c686c3a161fbe44b9f299e482f346b/uproot3/interp/jagged.py#L78-L87 - # - # FIXME the +10 is for a bunch of jagged stuff, not sure what's the speial case - bytestarts = offsets[begin:offset_len] .+ 10 - bytestops = offsets[begin+1:offset_len+1] - - # fuck 0/1 index - mask = OffsetArray(zeros(Int8, contentsize), -1) - mask[@view bytestarts[bytestarts .< contentsize]] .= 1 - mask[@view bytestops[bytestops .< contentsize]] .-= 1 - mask = OffsetArrays.no_offset_view(cumsum(mask)) - - #FIXME figureout what to interpret to outside - append!(_res, ntoh.(reinterpret(T, data[mask .== 1]))) - - # ======= end of magic ======= contentsize end diff --git a/test/runtests.jl b/test/runtests.jl index 85464a9a..b2fd6993 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -188,20 +188,28 @@ end @test [0.0, 1.0588236, 2.1176472, 3.1764705, 4.2352943] ≈ df.float_array[1:5] atol=1e-7 end -@testset "simple jagged" begin +@testset "Jagged branches" begin rootfile = ROOTFile(joinpath(SAMPLES_DIR, "tree_with_jagged_array.root")) - data, offsets = array(rootfile, "t1/int32_array"; raw=true) + data = array(rootfile, "t1/int32_array") - @test data isa Vector{Int32} - @test offsets isa 
Vector{Int32} - @test data[1:3] == [0,0,1] + @test data isa Vector{Vector{Int32}} + @test data[1] == Int32[] + @test data[1:2] == [Int32[], Int32[0]] + @test data[end] == Int32[90, 91, 92, 93, 94, 95, 96, 97, 98] end @testset "readbasketsraw()" begin array_md5 = [0xb4, 0xe9, 0x32, 0xe8, 0xfb, 0xff, 0xcf, 0xa0, 0xda, 0x75, 0xe0, 0x25, 0x34, 0x9b, 0xcd, 0xdf] rootfile = ROOTFile(joinpath(SAMPLES_DIR, "km3net_online.root")) data, offsets = array(rootfile, "KM3NET_EVENT/KM3NET_EVENT/snapshotHits"; raw=true) - @test_broken array_md5 == md5(data) #FIXME, the file seems to be broken + @test array_md5 == md5(data) #FIXME, the file seems to be broken + + rootfile = ROOTFile(joinpath(SAMPLES_DIR, "tree_with_jagged_array.root")) + data, offsets = array(rootfile, "t1/int32_array"; raw=true) + + @test data isa Vector{UInt8} + @test offsets isa Vector{Int32} + @test data[1:3] == UInt8[0x40, 0x00, 0x00] end diff --git a/test/samples/tree_with_large_array_lz4.root b/test/samples/tree_with_large_array_lz4.root index e658f77a8ba6dcd0da9ae4e0611d5e8d005850ab..edae63d08e3b3e070f22c544e6ef33652cfaad77 100644 GIT binary patch delta 404 zcmaF5!SLw@!wLEV$x-`#ZcP(>y{mNfdY(i3-cNK?<7Hr2!@$D8z>t!8*dcRbQ7Su- z0yFQ4X`!nJDnkM_f0*AeFw|@SsopHd__Kx?#E`0kGQ@sE7~6#^pmN*!zQR~sMNCjx zw)Zd=Qw}p!?B5F*>sKlZRP5U$80%9!D^%>=9T@9XBpX!h*)y|GE zRP5R*80(TdCsgd*5g6-~BNtTc*ghERkTo|{Y~K#sKlZRP5U$80%9!D^%>=9T@9XBpX!h*)y|GE zRP5R*80(TdCsgd*5g6-~BNtTc*ghERkTo|{Y~K#