forked from dps/go-zim
/
cluster.go
148 lines (134 loc) · 5.08 KB
/
cluster.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
package zim
import (
"compress/bzip2"
"encoding/binary"
"errors"
"io"
"io/ioutil"
)
// Sizes, in bytes, used when decoding clusters.
const (
// defaultOffsetSize is the width of a blob offset that BlobAt decodes as a
// little-endian uint32.
defaultOffsetSize = 4
// extendedOffsetSize is the width of a blob offset that BlobAt decodes as a
// little-endian uint64.
extendedOffsetSize = 8
// maxClusterLen is the upper bound ClusterAt applies both to the cluster
// length computed from the cluster pointers and to the number of bytes it
// reads into memory.
maxClusterLen = 1024 * 1024 * 32 // 32MB
)
// clusterOffsetSize returns the width in bytes of the blob offsets of a
// cluster: 8 when bit 0x10 of the cluster information byte is set, 4 otherwise.
func clusterOffsetSize(clusterInformation uint8) uint8 {
	const extendedFlag = 0x10
	if clusterInformation&extendedFlag != 0 {
		return 8
	}
	return 4
}
// clusterCompression extracts the compression code, stored in the low four
// bits of the cluster information byte.
func clusterCompression(clusterInformation uint8) uint8 {
	const compressionMask = 0x0F
	return clusterInformation & compressionMask
}
// clusterReader positions the file at the cluster with the given position,
// reads its information byte and returns a reader for the bytes that follow.
// For compression codes 0 and 1 the file itself is returned; codes 3, 4 and 5
// wrap the file in a bzip2, xz or zstd reader respectively. Every other code
// yields an error. The information byte is returned even when setting up the
// decompressor fails; only an invalid cluster position returns it as zero.
func (z *File) clusterReader(clusterPosition uint32) (reader io.Reader, clusterInformation uint8, err error) {
	if clusterPosition >= z.ClusterCount() {
		return nil, 0, errors.New("zim: invalid cluster position")
	}

	var pointer = z.clusterPointerAtPos(clusterPosition)
	seek(z.f, int64(pointer))
	clusterInformation = readUint8(z.f)

	switch clusterCompression(clusterInformation) {
	case 0, 1:
		// Uncompressed: the caller reads straight from the file.
		return z.f, clusterInformation, nil
	case 3:
		return bzip2.NewReader(z.f), clusterInformation, nil
	case 4:
		if err = z.xzReader.Reset(z.f); err != nil {
			return nil, clusterInformation, err
		}
		z.xzReader.Multistream(false)
		return z.xzReader, clusterInformation, nil
	case 5:
		if err = z.zstdReader.Reset(z.f); err != nil {
			return nil, clusterInformation, err
		}
		return z.zstdReader, clusterInformation, nil
	}

	// Everything else, including 2 (zlib, not used anymore), is rejected.
	return nil, clusterInformation, errors.New("zim: unsupported cluster compression")
}
// lastClusterPosition returns the header's cluster count minus one.
// Note that the unsigned subtraction wraps around when the count is zero.
func (z *File) lastClusterPosition() uint32 {
	var count = z.header.clusterCount
	return count - 1
}
// Cluster stores the uncompressed cluster data (blob positions followed by a sequence of blobs).
// Each blob belongs to a Directory Entry.
// BlobAt decodes the blob offsets at the start of data, using the offset
// size derived from the information byte.
type Cluster struct {
data []byte // always uncompressed; ClusterAt reads at most 32MB into it
position uint32 // cluster position this value was created for
information uint8 // cluster information byte; low four bits hold the compression code, bit 0x10 selects 8-byte offsets
}
// WasCompressed reports whether the cluster data was stored compressed,
// i.e. whether its compression code is anything other than 0 or 1.
// Callers can use this as a hint about the kind of content in the cluster.
func (c *Cluster) WasCompressed() bool {
	switch clusterCompression(c.information) {
	case 0, 1:
		return false
	default:
		return true
	}
}
// nextClusterPointer returns the pointer used as the end boundary of c:
// the pointer of the cluster that follows it, or the header's checksum
// position minus one when c is at or beyond the last cluster position.
func (z *File) nextClusterPointer(c *Cluster) uint64 {
	if c.position < z.lastClusterPosition() {
		return z.clusterPointerAtPos(c.position + 1)
	}
	return z.header.checksumPos - 1
}
// clusterLen returns the length of the cluster in bytes, not counting the
// leading information byte. As a side effect the file is left positioned
// one byte past the cluster pointer, i.e. right behind the information byte.
func (z *File) clusterLen(c *Cluster) int64 {
	end := z.nextClusterPointer(c)
	start := z.clusterPointerAtPos(c.position)
	// Looking up the pointers (very likely) moved the file position, so seek
	// back; the +1 skips the information byte.
	seek(z.f, int64(start)+1)
	return int64(end - start - 1)
}
// ClusterAt returns the Cluster of the ZIM file at the given cluster position.
// The complete cluster data is stored uncompressed in memory.
// An error is returned when clusterPosition is not below the cluster count,
// when the cluster length computed from the cluster pointers is not in the
// range 1..32MB, or when the cluster cannot be read or decompressed.
// At most 32MB are read from the cluster reader.
// Note: Only use this function, when it's needed to read every single blob of a
// ZIM file into memory (for example when iterating over all contents this improves performance).
func (z *File) ClusterAt(clusterPosition uint32) (Cluster, error) {
	var c = Cluster{position: clusterPosition}
	// Reject out-of-range positions before any pointer lookup is done for
	// them; otherwise the length check below would operate on values read
	// from outside the cluster pointer list.
	if clusterPosition >= z.ClusterCount() {
		return c, errors.New("zim: invalid cluster position")
	}
	var clusterLen = z.clusterLen(&c)
	if clusterLen <= 0 || clusterLen > maxClusterLen {
		return c, errors.New("zim: invalid cluster size")
	}
	var clusterReader, clusterInformation, clusterReaderErr = z.clusterReader(clusterPosition)
	// The information byte is kept even on failure, as returned by clusterReader.
	c.information = clusterInformation
	if clusterReaderErr != nil {
		return c, clusterReaderErr
	}
	// Cap the amount of data held in memory regardless of what the reader yields.
	var clusterData, clusterDataErr = ioutil.ReadAll(io.LimitReader(clusterReader, int64(maxClusterLen)))
	if clusterDataErr != nil {
		return c, clusterDataErr
	}
	c.data = clusterData
	return c, nil
}
// BlobAt returns the blob data at blob position of a given Cluster.
// This is only useful when iteration over all blobs in a Cluster is done.
// When only a single blob of a Cluster should be retrieved, it's better
// to use z.BlobReaderAt(clusterPosition, blobPosition) instead.
// The blob position starts at 0 and ends if an error is returned.
//
// The cluster data starts with a list of little-endian offsets (4 or 8 bytes
// each, depending on the cluster information byte). The first offset is
// treated as the end of that list, so a blob position whose offsets would lie
// beyond it is rejected with an error instead of decoding blob contents as
// offsets. The returned slice shares memory with the cluster data; its
// capacity is limited to its length so appending to it cannot overwrite the
// following blob.
func (c *Cluster) BlobAt(blobPosition uint32) ([]byte, error) {
	var offsetSize = uint64(clusterOffsetSize(c.information))
	var dataLen = uint64(len(c.data))
	var thisBlobIndex = uint64(blobPosition) * offsetSize
	var nextBlobIndex = thisBlobIndex + offsetSize
	// Both offsets must lie inside the data before they can be read.
	if nextBlobIndex+offsetSize > dataLen {
		return nil, errors.New("zim: invalid blob position")
	}
	// readOffset decodes the offset stored at the given byte index.
	var readOffset = func(index uint64) uint64 {
		if offsetSize == defaultOffsetSize {
			return uint64(binary.LittleEndian.Uint32(c.data[index : index+defaultOffsetSize]))
		}
		return binary.LittleEndian.Uint64(c.data[index : index+extendedOffsetSize])
	}
	// The first offset points at the first blob, which starts right behind
	// the offset list; offsets for this blob must not reach past it.
	if nextBlobIndex+offsetSize > readOffset(0) {
		return nil, errors.New("zim: invalid blob position")
	}
	var thisBlobPointer = readOffset(thisBlobIndex)
	var nextBlobPointer = readOffset(nextBlobIndex)
	if nextBlobPointer >= thisBlobPointer && nextBlobPointer <= dataLen {
		return c.data[thisBlobPointer:nextBlobPointer:nextBlobPointer], nil
	}
	return nil, errors.New("zim: invalid blob index")
}