Commit

handle jagged array simplest case
Moelf committed Jul 3, 2021
1 parent 8fa520c commit 5b89eac
Showing 7 changed files with 69 additions and 26 deletions.
1 change: 1 addition & 0 deletions Project.toml
@@ -10,6 +10,7 @@ CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
MD5 = "6ac74813-4b46-53a4-afec-0b5dc9d7885c"
Mixers = "2a8e4939-dab8-5edc-8f64-72a8776f13de"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

2 changes: 1 addition & 1 deletion src/UnROOT.jl
@@ -7,7 +7,7 @@ import Base: keys, get, getindex, show, length, iterate, position
using CodecZlib, CodecLz4, CodecXz
using Mixers
using Parameters
using StaticArrays
using StaticArrays, OffsetArrays

include("constants.jl")
include("io.jl")
75 changes: 54 additions & 21 deletions src/root.jl
@@ -226,54 +226,85 @@ function readbasketsraw(io, branch)
# Just to check if we have a jagged structure
# streamer = streamerfor()

data = sizehint!(Vector{UInt8}(), sum(bytes))
offsets = sizehint!(Vector{Int32}(), total_entries)
# FIXME This UInt8 is wrong; the final data type depends on the branch info
max_len = sum(bytes)
data = sizehint!(Vector{UInt8}(), max_len)
offsets = sizehint!(Vector{Int32}(), total_entries+1) # this is always Int32
idx = 1
_res = sizehint!(Vector{Int32}(), max_len)
for (basket_seek, n_bytes) in zip(seeks, bytes)
@debug "Reading raw basket data" basket_seek n_bytes
basket_seek == 0 && break
seek(io, basket_seek)
idx += readbasketbytes!(data, offsets, io, idx)
idx += readbasketbytes!(data, offsets, io, idx, _res)
end
data, offsets
_res, offsets
end
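The raw payload (already reinterpreted into _res) and the per-entry byte offsets are now returned together. As a purely hypothetical illustration, not part of this commit, a helper like the one below could slice a flat byte buffer back into per-entry chunks, assuming the offsets carry one extra trailing element and are already relative to the start of the buffer (i.e. fKeylen has been subtracted, as done further down):

# Hypothetical helper, not in the commit: entry i owns bytes (offsets[i], offsets[i+1]].
function jagged_entries(data::Vector{UInt8}, offsets::Vector{Int32})
    [data[offsets[i]+1:offsets[i+1]] for i in 1:length(offsets)-1]
end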


function readoffsets!(out, s, contentsize, global_offset, local_offset)
for _ in 1:contentsize
offset = readtype(s, Int32) + global_offset
push!(out, offset)
end
end

function readbasketbytes!(data, offsets, io, idx)
# Thanks Jim and Philippe
# https://groups.google.com/forum/#!topic/polyglot-root-io/yeC0mAizQcA
# The offsets start at fLast - fKeylen + 4. A single basket of data looks like this:
# 4 bytes 4 bytes
# ┌─────────┬────────────────────────────────┬───┬────────────┬───┐
# │ TKey │ content │ X │ offsets │ x │
# └─────────┴────────────────────────────────┴───┴────────────┴───┘
# │← fLast - fKeylen →│ │
# │ │
# │← fObjlen →│
#
function readbasketbytes!(data, offsets, io, idx, _res::Vector{T}) where T
basketkey = unpack(io, TBasketKey)

# @show basketkey
s = datastream(io, basketkey) # position(s) == 0, but offsets start at -basketkey.fKeylen
start = position(s)
# @show start
contentsize = basketkey.fLast - basketkey.fKeylen
offsetlength = basketkey.fObjlen - contentsize
offsetbytesize = basketkey.fObjlen - contentsize - 8
offset_len = offsetbytesize ÷ 4 # these are always Int32

if offsetlength > 0
if offsetbytesize > 0
@debug "Offset data present" offsetlength
skip(s, contentsize)
skip(s, 4)
readoffsets!(offsets, s, (offsetlength - 8) / 4, length(data), length(data))
# https://groups.google.com/forum/#!topic/polyglot-root-io/yeC0mAizQcA
skip(s, 4) # a flag that indicates the type of data that follows
readoffsets!(offsets, s, offset_len, length(data), length(data))
skip(s, 4) # "Pointer-to/location-of last used byte in basket"
seek(s, start)
end
push!(offsets, basketkey.fLast)
offsets .-= basketkey.fKeylen

@debug "Reading $(contentsize) bytes"
readbytes!(s, data, idx, contentsize)
# for _ in 1:contentsize
# push!(data, readtype(s, UInt8))
# end

# FIXME this is hard to follow; please make it non-allocating
# https://github.com/scikit-hep/uproot3/blob/54f5151fb7c686c3a161fbe44b9f299e482f346b/uproot3/interp/jagged.py#L78-L87
#
# FIXME the +10 is for a bunch of jagged stuff; not sure what the special case is
bytestarts = offsets[begin:offset_len] .+ 10
bytestops = offsets[begin+1:offset_len+1]

# careful with 0/1-based indexing
mask = OffsetArray(zeros(Int8, contentsize), -1)
mask[@view bytestarts[bytestarts .< contentsize]] .= 1
mask[@view bytestops[bytestops .< contentsize]] .-= 1
mask = OffsetArrays.no_offset_view(cumsum(mask))

# FIXME figure out what interpretation to expose to the outside
append!(_res, ntoh.(reinterpret(T, data[mask .== 1])))

# ======= end of magic =======
contentsize
end

function readoffsets!(out, s, contentsize, global_offset, local_offset)
for _ in 1:contentsize
offset = readtype(s, Int32) + global_offset
push!(out, offset)
end
end

"""
function readbytes!(io, b, offset, nr)
@@ -282,5 +313,7 @@ Efficient read of bytes into an existing array at a given offset
function readbytes!(io, b, offset, nr)
resize!(b, offset + nr - 1)
nb = UInt(nr)
GC.@preserve b unsafe_read(io, pointer(b, offset), nb)
# GC.@preserve b unsafe_read(io, pointer(b, offset), nb)
unsafe_read(io, pointer(b, offset), nb)
nothing
end
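For context, the index juggling in readbasketbytes! follows the cumsum-mask trick from uproot (linked above): write +1 at every payload start, -1 at every payload stop, and the running sum is 1 exactly on payload bytes and 0 on the per-entry headers. A minimal self-contained sketch of that idea, with made-up start/stop values rather than anything taken from a ROOT file:

using OffsetArrays

contentsize = 20
bytestarts = [2, 12]   # hypothetical 0-based payload starts (per-entry headers already skipped)
bytestops  = [10, 20]  # hypothetical 0-based payload stops (exclusive)

# 0-based mask so the 0-based ROOT offsets can index it directly
mask = OffsetArray(zeros(Int8, contentsize), -1)
mask[bytestarts[bytestarts .< contentsize]] .= 1
mask[bytestops[bytestops .< contentsize]] .-= 1
mask = OffsetArrays.no_offset_view(cumsum(mask))

data = collect(UInt8, 1:contentsize)
payload = data[mask .== 1]   # picks out bytes 3:10 and 13:20 (1-based)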
2 changes: 1 addition & 1 deletion src/streamers.jl
@@ -304,7 +304,7 @@ function unpack(io, tkey::TKey, refs::Dict{Int32, Any}, T::Type{TObjArray})
elements = Vector{Any}(undef, size)
for i in 1:size
ele = readobjany!(io, tkey, refs)
!ismissing(ele) && @show ele.fName
# !ismissing(ele) && @show ele.fName
elements[i] = ele
end
endcheck(io, preamble)
4 changes: 2 additions & 2 deletions src/types.jl
@@ -103,7 +103,7 @@ function datastream(io, tkey::T) where T<:Union{TKey, TBasketKey}
seekstart(io, tkey)
fufilled = 0
uncomp_data = Vector{UInt8}(undef, tkey.fObjlen)
while fufilled < length(uncomp_data) # careful with 0/1-based index when thinking about offsets
while fufilled < tkey.fObjlen # careful with 0/1-based index when thinking about offsets
compression_header = unpack(io, CompressionHeader)
cname, _, compbytes, uncompbytes = unpack(compression_header)
io_buf = IOBuffer(read(io, compbytes))
@@ -122,7 +122,7 @@

fufilled += uncompbytes
end
@assert fufilled == length(uncomp_data) # fail means something bad happens we over shoot
@assert fufilled == length(uncomp_data)
return IOBuffer(uncomp_data)
end
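The datastream loop above inflates consecutive compressed blocks until fObjlen uncompressed bytes have been produced. A hedged sketch of that pattern, assuming plain zlib blocks whose compressed sizes are already known (the real code parses ROOT's CompressionHeader and also handles LZ4 and XZ):

using CodecZlib

# Sketch only, not the package API: inflate blocks and concatenate them
# until the expected uncompressed length is reached.
function inflate_blocks(io::IO, compressed_sizes::Vector{Int}, objlen::Int)
    out = Vector{UInt8}(undef, objlen)
    filled = 0
    for nbytes in compressed_sizes
        chunk = transcode(ZlibDecompressor, read(io, nbytes))
        copyto!(out, filled + 1, chunk, 1, length(chunk))
        filled += length(chunk)
    end
    @assert filled == objlen
    return IOBuffer(out)
end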

11 changes: 10 additions & 1 deletion test/runtests.jl
@@ -188,11 +188,20 @@ end
@test [0.0, 1.0588236, 2.1176472, 3.1764705, 4.2352943] ≈ df.float_array[1:5] atol=1e-7
end

@testset "simple jagged" begin
rootfile = ROOTFile(joinpath(SAMPLES_DIR, "tree_with_jagged_array.root"))
data, offsets = array(rootfile, "ti/int32_array"; raw=true)

@test data isa Vector{Int32}
@test offsets isa Vector{UInt8}
@test data[1:3] == [0,0,1]
end

@testset "readbasketsraw()" begin
array_md5 = [0xb4, 0xe9, 0x32, 0xe8, 0xfb, 0xff, 0xcf, 0xa0, 0xda, 0x75, 0xe0, 0x25, 0x34, 0x9b, 0xcd, 0xdf]
rootfile = ROOTFile(joinpath(SAMPLES_DIR, "km3net_online.root"))
data, offsets = array(rootfile, "KM3NET_EVENT/KM3NET_EVENT/snapshotHits"; raw=true)
@test array_md5 == md5(data)
@test_broken array_md5 == md5(data) # FIXME: the file seems to be broken
end


Binary file modified test/samples/tree_with_large_array_lz4.root
