From ab5061cdaa13b9641cfa2e0bcb237b25ffdb0b0a Mon Sep 17 00:00:00 2001
From: Jerry Ling <proton@jling.dev>
Date: Sat, 3 Jul 2021 16:45:57 +0200
Subject: [PATCH] support for jagged branch of basic std types

---
 Project.toml                                |   1 -
 src/UnROOT.jl                               |   2 +-
 src/bootstrap.jl                            |   3 +
 src/root.jl                                 | 108 +++++++++-----------
 test/runtests.jl                            |  20 ++--
 test/samples/tree_with_large_array_lz4.root | Bin 809061 -> 809061 bytes
 6 files changed, 67 insertions(+), 67 deletions(-)
diff --git a/Project.toml b/Project.toml
index 737a54fc..32024d34 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,7 +10,6 @@ CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 MD5 = "6ac74813-4b46-53a4-afec-0b5dc9d7885c"
 Mixers = "2a8e4939-dab8-5edc-8f64-72a8776f13de"
-OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
diff --git a/src/UnROOT.jl b/src/UnROOT.jl
index dc6fb98a..1b2104d5 100644
--- a/src/UnROOT.jl
+++ b/src/UnROOT.jl
@@ -7,7 +7,7 @@ import Base: keys, get, getindex, show, length, iterate, position
 using CodecZlib, CodecLz4, CodecXz
 using Mixers
 using Parameters
-using StaticArrays, OffsetArrays
+using StaticArrays
 
 include("constants.jl")
 include("io.jl")
diff --git a/src/bootstrap.jl b/src/bootstrap.jl
index ca5a9610..f3ef25d3 100644
--- a/src/bootstrap.jl
+++ b/src/bootstrap.jl
@@ -790,3 +790,6 @@ function TTree(io, tkey::TKey, refs)
     endcheck(io, preamble)
     TTree(;fields...)
 end
+
+# FIXME what to do with auto.py's massive type translation?
+# https://github.com/scikit-hep/uproot3/blob/54f5151fb7c686c3a161fbe44b9f299e482f346b/uproot3/interp/auto.py#L360-L365
diff --git a/src/root.jl b/src/root.jl
index 9bbf56a5..01a6fcec 100644
--- a/src/root.jl
+++ b/src/root.jl
@@ -130,6 +130,7 @@ function Base.getindex(t::T, s::AbstractString) where {T<:Union{TTree, TBranchEl
     end
     missing
 end
+
 """
     function array(f::ROOTFile, path)
 
@@ -145,17 +146,44 @@ function array(f::ROOTFile, path; raw=false)
         end
     end
 
-    if raw
-        return readbasketsraw(f.fobj, branch)
-    end
-
-    if length(branch.fLeaves.elements) > 1
-        error("Branches with multiple leaves are not supported yet.")
+    if !raw && length(branch.fLeaves.elements) > 1
+        error(
+            "Branches with multiple leaves are not supported yet. Try reading with `array(...; raw=true)`.",
+        )
     end
 
     leaf = first(branch.fLeaves.elements)
-
-    readbaskets(f.fobj, branch, primitivetype(leaf))
+    rawdata, rawoffsets = readbasketsraw(f.fobj, branch)
+    if raw
+        return rawdata, rawoffsets
+    else
+        if leaf isa TLeafElement # non-primitive jagged leaf
+            classname = branch.fClassName # the C++ class name, such as "vector<int>"
+            m = match(r"vector<(.*)>", classname)
+            isnothing(m) && error("Cannot understand fClassName: $classname.")
+            elname = m[1]
+            elname = endswith(elname, "_t") ? lowercase(chop(elname; tail=2)) : elname  # Double_t -> double
+            T = try
+                getfield(Base, Symbol(:C, elname))
+            catch
+                error("Cannot convert element of $elname to a native Julia type")
+            end
+
+            jagg_offset = 10 # magic offsets, seems to be common for a lot of types, see auto.py in uproot3
+
+            # for each "event", the index range is `offsets[i] + jagg_offset + 1` to `offsets[i+1]`
+            # this is why we need to append `rawoffsets` in the `readbasketsraw()` call
+            # when you use this range to index `rawdata`, you will get the raw bytes belonging to each event
+            # Say your real data is Int32 and you see 8 bytes after indexing, then this event has [num1, num2] as real data
+            @views [
+                ntoh.(reinterpret(
+                        T, rawdata[ (rawoffsets[i]+jagg_offset+1):rawoffsets[i+1] ]
+                    )) for i in 1:(length(rawoffsets) - 1)
+            ]
+        else # the branch is not jagged
+            return ntoh.(reinterpret(primitivetype(leaf), rawdata))
+        end
+    end
 end
 
 
@@ -173,8 +201,14 @@ function DataFrame(f::ROOTFile, path)
     DataFrame(cols, names, copycols=false) #avoid double allocation
 end
 
+"""
+    splitup(data::Vector{UInt8}, offsets, T::Type; skipbytes=0, primitive=false)
+
+Given the `offsets` and `data` returned by `array(...; raw = true)`, reconstruct the actual
+array (can be jagged, or with a custom struct).
+"""
 function splitup(data::Vector{UInt8}, offsets, T::Type; skipbytes=0, primitive=false)
-    elsize = packedsizeof(T)
+    elsize = sizeof(T)
     out = sizehint!(Vector{Vector{T}}(), length(offsets))
     lengths = diff(offsets)
     push!(lengths, length(data) - offsets[end] + offsets[1])  # yay ;)
@@ -194,29 +228,6 @@ function splitup(data::Vector{UInt8}, offsets, T::Type; skipbytes=0, primitive=f
 end
 
 
-function readbaskets(io, branch, ::Type{T}) where {T}
-    seeks = branch.fBasketSeek
-    entries = branch.fBasketEntry
-
-    out = sizehint!(Vector{T}(), branch.fEntries)
-
-
-    for (idx, basket_seek) in enumerate(seeks)
-        @debug "Reading basket" idx basket_seek
-        if basket_seek == 0
-            break
-        end
-        seek(io, basket_seek)
-        basketkey = unpack(io, TBasketKey)
-        s = datastream(io, basketkey)
-
-        for _ in entries[idx]:(entries[idx + 1] - 1)
-            push!(out, readtype(s, T))
-        end
-    end
-    out
-end
-
 
 function readbasketsraw(io, branch)
     seeks = branch.fBasketSeek
@@ -226,19 +237,17 @@ function readbasketsraw(io, branch)
     # Just to check if we have a jagged structure
     # streamer = streamerfor()
 
-    # FIXME This UInt8 is wrong, the final data depends on branch info
     max_len = sum(bytes)
     data = sizehint!(Vector{UInt8}(), max_len)
     offsets = sizehint!(Vector{Int32}(), total_entries+1) # this is always Int32
     idx = 1
-    _res = sizehint!(Vector{Int32}(), max_len)
     for (basket_seek, n_bytes) in zip(seeks, bytes)
         @debug "Reading raw basket data" basket_seek n_bytes
         basket_seek == 0 && break
         seek(io, basket_seek)
-        idx += readbasketbytes!(data, offsets, io, idx, _res)
+        idx += readbasketbytes!(data, offsets, io, idx)
     end
-    _res, offsets
+    data, offsets
 end
 
 
@@ -253,48 +262,29 @@ end
 #           │                                                     │
 #           │←                       fObjlen                     →│
 #
-function readbasketbytes!(data, offsets, io, idx, _res::Vector{T}) where T
+function readbasketbytes!(data, offsets, io, idx)
     basketkey = unpack(io, TBasketKey)
 
-    # @show basketkey
     s = datastream(io, basketkey)  # position(s) == 0, but offsets start at -basketkey.fKeylen
     start = position(s)
-    # @show start
     contentsize = basketkey.fLast - basketkey.fKeylen
     offsetbytesize = basketkey.fObjlen - contentsize - 8
     offset_len = offsetbytesize ÷ 4 # these are always Int32
 
     if offsetbytesize > 0
-        @debug "Offset data present" offsetlength
+        @debug "Offset data present" offsetbytesize
         skip(s, contentsize)
         skip(s, 4) # a flag that indicates the type of data that follows
         readoffsets!(offsets, s, offset_len, length(data), length(data))
         skip(s, 4)  # "Pointer-to/location-of last used byte in basket"
         seek(s, start)
     end
-    push!(offsets, basketkey.fLast)
-    offsets .-= basketkey.fKeylen 
 
     @debug "Reading $(contentsize) bytes"
     readbytes!(s, data, idx, contentsize)
+    push!(offsets, basketkey.fLast)
+    offsets .-= basketkey.fKeylen 
 
-    # FIXME wtf is going on here please make this non-allocating
-    # https://github.com/scikit-hep/uproot3/blob/54f5151fb7c686c3a161fbe44b9f299e482f346b/uproot3/interp/jagged.py#L78-L87
-    #
-    # FIXME the +10 is for a bunch of jagged stuff, not sure what's the speial case
-    bytestarts = offsets[begin:offset_len] .+ 10
-    bytestops = offsets[begin+1:offset_len+1]
-
-    # fuck 0/1 index
-    mask = OffsetArray(zeros(Int8, contentsize), -1)
-    mask[@view bytestarts[bytestarts .< contentsize]] .=  1
-    mask[@view bytestops[bytestops .< contentsize]]   .-= 1
-    mask = OffsetArrays.no_offset_view(cumsum(mask))
-
-    #FIXME figureout what to interpret to outside
-    append!(_res, ntoh.(reinterpret(T, data[mask .== 1])))
-
-    # ======= end of magic =======
     contentsize
 end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 85464a9a..b2fd6993 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -188,20 +188,28 @@ end
     @test [0.0, 1.0588236, 2.1176472, 3.1764705, 4.2352943] ≈ df.float_array[1:5] atol=1e-7
 end
 
-@testset "simple jagged" begin
+@testset "Jagged branches" begin
     rootfile = ROOTFile(joinpath(SAMPLES_DIR, "tree_with_jagged_array.root"))
-    data, offsets = array(rootfile, "t1/int32_array"; raw=true)
+    data = array(rootfile, "t1/int32_array")
 
-    @test data isa Vector{Int32}
-    @test offsets isa Vector{Int32}
-    @test data[1:3] == [0,0,1]
+    @test data isa Vector{Vector{Int32}}
+    @test data[1] == Int32[]
+    @test data[1:2] == [Int32[], Int32[0]]
+    @test data[end] == Int32[90, 91, 92, 93, 94, 95, 96, 97, 98]
 end
 
 @testset "readbasketsraw()" begin
     array_md5 = [0xb4, 0xe9, 0x32, 0xe8, 0xfb, 0xff, 0xcf, 0xa0, 0xda, 0x75, 0xe0, 0x25, 0x34, 0x9b, 0xcd, 0xdf]
     rootfile = ROOTFile(joinpath(SAMPLES_DIR, "km3net_online.root"))
     data, offsets = array(rootfile, "KM3NET_EVENT/KM3NET_EVENT/snapshotHits"; raw=true)
-    @test_broken array_md5 == md5(data) #FIXME, the file seems to be broken
+    @test array_md5 == md5(data) #FIXME, the file seems to be broken
+
+    rootfile = ROOTFile(joinpath(SAMPLES_DIR, "tree_with_jagged_array.root"))
+    data, offsets = array(rootfile, "t1/int32_array"; raw=true)
+
+    @test data isa Vector{UInt8}
+    @test offsets isa Vector{Int32}
+    @test data[1:3] == UInt8[0x40, 0x00, 0x00]
 end
 
 
diff --git a/test/samples/tree_with_large_array_lz4.root b/test/samples/tree_with_large_array_lz4.root
index e658f77a8ba6dcd0da9ae4e0611d5e8d005850ab..edae63d08e3b3e070f22c544e6ef33652cfaad77 100644
GIT binary patch
delta 404
zcmaF5!SLw@!wLEV$x-`#ZcP(>y{mNfdY(i3-cNK?<7Hr2!@$D8z>t!8*dcRbQ7Su-
z0yFQ4X`!nJDnkM_f0*AeFw|@SsopHd__Kx?#E`0kGQ@sE7~6#^pmN*!zQR~sMNCjx
zw)Zd=Qw}p!?B5F*>sKlZRP5U$80%9!D^%>=9T@9XBpX!h*)<sJQ7}7H?A`?!>y|GE
zRP5R*80(TdCsgd*5g6-~BNtTc*ghERkTo|{Y~K#<PA8CWw|o7Dv98(jd|(2JO+V?t
d;|XQ}g|w$TIP!3VmHIjITo40`D@O3R001**vz7n=

delta 404
zcmaF5!SLw@!wLEVx5fB!ZZ8*nJvFnDVa0)c?<YE{@iH*1VPIikU`WY4=6P;nQ7Su-
z0yFQ4X`!nJDnkM_f0*AeFw|@SsopHd__Kx?#E`0kGQ@sE7~6#^pmN*!zQR~sMNCjx
zw)Zd=Qw}p!?B5F*>sKlZRP5U$80%9!D^%>=9T@9XBpX!h*)<sJQ7}7H?A`?!>y|GE
zRP5R*80(TdCsgd*5g6-~BNtTc*ghERkTo|{Y~K#<PA8CWw|o7Dv98(jd|(2JO+V?t
d;|XQ}g|w$TIP!3VmHIjITo40`D@O3R004AY!6E<v