Skip to content

Commit

Permalink
support for jagged branch of basic std types
Browse files Browse the repository at this point in the history
  • Loading branch information
Moelf committed Jul 3, 2021
1 parent d94093e commit ab5061c
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 67 deletions.
1 change: 0 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
MD5 = "6ac74813-4b46-53a4-afec-0b5dc9d7885c"
Mixers = "2a8e4939-dab8-5edc-8f64-72a8776f13de"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

Expand Down
2 changes: 1 addition & 1 deletion src/UnROOT.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import Base: keys, get, getindex, show, length, iterate, position
using CodecZlib, CodecLz4, CodecXz
using Mixers
using Parameters
using StaticArrays, OffsetArrays
using StaticArrays

include("constants.jl")
include("io.jl")
Expand Down
3 changes: 3 additions & 0 deletions src/bootstrap.jl
Original file line number Diff line number Diff line change
Expand Up @@ -790,3 +790,6 @@ function TTree(io, tkey::TKey, refs)
endcheck(io, preamble)
TTree(;fields...)
end

# FIXME what to do with auto.py's massive type translation?
# https://github.com/scikit-hep/uproot3/blob/54f5151fb7c686c3a161fbe44b9f299e482f346b/uproot3/interp/auto.py#L360-L365
108 changes: 49 additions & 59 deletions src/root.jl
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ function Base.getindex(t::T, s::AbstractString) where {T<:Union{TTree, TBranchEl
end
missing
end

"""
function array(f::ROOTFile, path)
Expand All @@ -145,17 +146,44 @@ function array(f::ROOTFile, path; raw=false)
end
end

if raw
return readbasketsraw(f.fobj, branch)
end

if length(branch.fLeaves.elements) > 1
error("Branches with multiple leaves are not supported yet.")
if !raw && length(branch.fLeaves.elements) > 1
error(
"Branches with multiple leaves are not supported yet. Try reading with `array(...; raw=true)`.",
)
end

leaf = first(branch.fLeaves.elements)

readbaskets(f.fobj, branch, primitivetype(leaf))
rawdata, rawoffsets = readbasketsraw(f.fobj, branch)
if raw
return rawdata, rawoffsets
else
if leaf isa TLeafElement # non-primitive jagged leaf
classname = branch.fClassName # the C++ class name, such as "vector<int>"
m = match(r"vector<(.*)>", classname)
isnothing(m) && error("Cannot understand fClassName: $classname.")
elname = m[1]
elname = endswith(elname, "_t") ? lowercase(chop(elname; tail=2)) : elname # Double_t -> double
T = try
getfield(Base, Symbol(:C, elname))
catch
error("Cannot convert element of $elname to a native Julia type")
end

jagg_offset = 10 # magic offsets, seems to be common for a lot of types, see auto.py in uproot3

# for each "event", the index range is `offsets[i] + jagg_offset + 1` to `offsets[i+1]`
# this is why we need to append `rawoffsets` in the `readbasketsraw()` call
# when you use this range to index `rawdata`, you will get the raw bytes belonging to each event
# Say your real data is Int32 and you see 8 bytes after indexing, then this event has [num1, num2] as real data
@views [
ntoh.(reinterpret(
T, rawdata[ (rawoffsets[i]+jagg_offset+1):rawoffsets[i+1] ]
)) for i in 1:(length(rawoffsets) - 1)
]
else # the branch is not jagged
return ntoh.(reinterpret(primitivetype(leaf), rawdata))
end
end
end


Expand All @@ -173,8 +201,14 @@ function DataFrame(f::ROOTFile, path)
DataFrame(cols, names, copycols=false) #avoid double allocation
end

"""
splitup(data::Vector{UInt8}, offsets, T::Type; skipbytes=0, primitive=false)
Given the `offsets` and `data` returned by `array(...; raw = true)`, reconstruct the actual
array (can be jagged, or with a custom struct).
"""
function splitup(data::Vector{UInt8}, offsets, T::Type; skipbytes=0, primitive=false)
elsize = packedsizeof(T)
elsize = sizeof(T)
out = sizehint!(Vector{Vector{T}}(), length(offsets))
lengths = diff(offsets)
push!(lengths, length(data) - offsets[end] + offsets[1]) # yay ;)
Expand All @@ -194,29 +228,6 @@ function splitup(data::Vector{UInt8}, offsets, T::Type; skipbytes=0, primitive=f
end


"""
    readbaskets(io, branch, ::Type{T}) where {T}

Read the baskets of `branch` from `io` and return their entries as one flat `Vector{T}`.

Baskets are visited in the order given by `branch.fBasketSeek`; a seek position of `0`
stops the iteration. For basket `i`, one value per entry in
`branch.fBasketEntry[i]:(branch.fBasketEntry[i + 1] - 1)` is read with `readtype`.
"""
function readbaskets(io, branch, ::Type{T}) where {T}
    basket_positions = branch.fBasketSeek
    first_entries = branch.fBasketEntry

    # Capacity hint only; the vector still grows one element at a time below.
    result = sizehint!(Vector{T}(), branch.fEntries)

    for (i, pos) in enumerate(basket_positions)
        @debug "Reading basket" idx=i basket_seek=pos
        # A zero seek position ends the loop; nothing after it is read.
        pos == 0 && break

        seek(io, pos)
        key = unpack(io, TBasketKey)
        # NOTE(review): presumably `datastream` yields the (decompressed) basket
        # payload -- confirm against its definition.
        stream = datastream(io, key)

        entry_range = first_entries[i]:(first_entries[i + 1] - 1)
        for _ in entry_range
            push!(result, readtype(stream, T))
        end
    end
    return result
end


function readbasketsraw(io, branch)
seeks = branch.fBasketSeek
Expand All @@ -226,19 +237,17 @@ function readbasketsraw(io, branch)
# Just to check if we have a jagged structure
# streamer = streamerfor()

# FIXME This UInt8 is wrong, the final data depends on branch info
max_len = sum(bytes)
data = sizehint!(Vector{UInt8}(), max_len)
offsets = sizehint!(Vector{Int32}(), total_entries+1) # this is always Int32
idx = 1
_res = sizehint!(Vector{Int32}(), max_len)
for (basket_seek, n_bytes) in zip(seeks, bytes)
@debug "Reading raw basket data" basket_seek n_bytes
basket_seek == 0 && break
seek(io, basket_seek)
idx += readbasketbytes!(data, offsets, io, idx, _res)
idx += readbasketbytes!(data, offsets, io, idx)
end
_res, offsets
data, offsets
end


Expand All @@ -253,48 +262,29 @@ end
# │ │
# │← fObjlen →│
#
function readbasketbytes!(data, offsets, io, idx, _res::Vector{T}) where T
function readbasketbytes!(data, offsets, io, idx)
basketkey = unpack(io, TBasketKey)

# @show basketkey
s = datastream(io, basketkey) # position(s) == 0, but offsets start at -basketkey.fKeylen
start = position(s)
# @show start
contentsize = basketkey.fLast - basketkey.fKeylen
offsetbytesize = basketkey.fObjlen - contentsize - 8
offset_len = offsetbytesize ÷ 4 # these are always Int32

if offsetbytesize > 0
@debug "Offset data present" offsetlength
@debug "Offset data present" offsetbytesize
skip(s, contentsize)
skip(s, 4) # a flag that indicates the type of data that follows
readoffsets!(offsets, s, offset_len, length(data), length(data))
skip(s, 4) # "Pointer-to/location-of last used byte in basket"
seek(s, start)
end
push!(offsets, basketkey.fLast)
offsets .-= basketkey.fKeylen

@debug "Reading $(contentsize) bytes"
readbytes!(s, data, idx, contentsize)
push!(offsets, basketkey.fLast)
offsets .-= basketkey.fKeylen

# FIXME wtf is going on here please make this non-allocating
# https://github.com/scikit-hep/uproot3/blob/54f5151fb7c686c3a161fbe44b9f299e482f346b/uproot3/interp/jagged.py#L78-L87
#
# FIXME the +10 is for a bunch of jagged stuff, not sure what's the special case
bytestarts = offsets[begin:offset_len] .+ 10
bytestops = offsets[begin+1:offset_len+1]

# fuck 0/1 index
mask = OffsetArray(zeros(Int8, contentsize), -1)
mask[@view bytestarts[bytestarts .< contentsize]] .= 1
mask[@view bytestops[bytestops .< contentsize]] .-= 1
mask = OffsetArrays.no_offset_view(cumsum(mask))

#FIXME figure out what to interpret to outside
append!(_res, ntoh.(reinterpret(T, data[mask .== 1])))

# ======= end of magic =======
contentsize
end

Expand Down
20 changes: 14 additions & 6 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -188,20 +188,28 @@ end
@test [0.0, 1.0588236, 2.1176472, 3.1764705, 4.2352943] df.float_array[1:5] atol=1e-7
end

@testset "simple jagged" begin
@testset "Jagged branches" begin
rootfile = ROOTFile(joinpath(SAMPLES_DIR, "tree_with_jagged_array.root"))
data, offsets = array(rootfile, "t1/int32_array"; raw=true)
data = array(rootfile, "t1/int32_array")

@test data isa Vector{Int32}
@test offsets isa Vector{Int32}
@test data[1:3] == [0,0,1]
@test data isa Vector{Vector{Int32}}
@test data[1] == Int32[]
@test data[1:2] == [Int32[], Int32[0]]
@test data[end] == Int32[90, 91, 92, 93, 94, 95, 96, 97, 98]
end

@testset "readbasketsraw()" begin
array_md5 = [0xb4, 0xe9, 0x32, 0xe8, 0xfb, 0xff, 0xcf, 0xa0, 0xda, 0x75, 0xe0, 0x25, 0x34, 0x9b, 0xcd, 0xdf]
rootfile = ROOTFile(joinpath(SAMPLES_DIR, "km3net_online.root"))
data, offsets = array(rootfile, "KM3NET_EVENT/KM3NET_EVENT/snapshotHits"; raw=true)
@test_broken array_md5 == md5(data) #FIXME, the file seems to be broken
@test array_md5 == md5(data) #FIXME, the file seems to be broken

rootfile = ROOTFile(joinpath(SAMPLES_DIR, "tree_with_jagged_array.root"))
data, offsets = array(rootfile, "t1/int32_array"; raw=true)

@test data isa Vector{UInt8}
@test offsets isa Vector{Int32}
@test data[1:3] == UInt8[0x40, 0x00, 0x00]
end


Expand Down
Binary file modified test/samples/tree_with_large_array_lz4.root
Binary file not shown.

0 comments on commit ab5061c

Please sign in to comment.