Find biggest key
HDT3213 committed Feb 20, 2022
1 parent 7fbf15c commit 98fa0a7
Showing 10 changed files with 321 additions and 12 deletions.
27 changes: 25 additions & 2 deletions README.md
@@ -11,6 +11,7 @@ It provides utilities to:
- Generate memory report for rdb file
- Convert RDB files to JSON
- Convert RDB files to Redis Serialization Protocol (or AOF file)
- Find the biggest keys in RDB files
- Customize data usage

Thanks to sripathikrishnan for his [redis-rdb-tools](https://github.com/sripathikrishnan/redis-rdb-tools)
@@ -22,7 +23,7 @@ If you have installed `go` on your computer, simply use:
go get github.com/hdt3213/rdb
```

Or, you can download executable binary file from releases(https://github.com/HDT3213/rdb/releases) and put its path to PATH environment.
Or, you can download an executable binary from [releases](https://github.com/HDT3213/rdb/releases) and add its directory to your PATH environment variable.

Use the `rdb` command in a terminal to see its manual.

@@ -65,7 +66,7 @@ Example:
rdb -c memory -o mem.csv cases/memory.rdb
```

The examples for json result:
Example csv output:
```csv
database,key,type,size,size_readable,element_count
0,hash,hash,64,64B,2
@@ -77,6 +78,28 @@ database,key,type,size,size_readable,element_count
0,set,set,39,39B,2
```

# Find Biggest Keys

`rdb` can find the N biggest keys in an RDB file:
```
rdb -c bigkey -n <result_number> <source_path>
```

Example:
```
rdb -c bigkey -n 5 cases/memory.rdb
```

Example csv output:
```csv
database,key,type,size,size_readable,element_count
0,large,string,2056,2K,0
0,list,list,66,66B,4
0,hash,hash,64,64B,2
0,zset,zset,57,57B,2
0,set,set,39,39B,2
```

# Convert to AOF

Usage:
6 changes: 6 additions & 0 deletions cases/largest.csv
@@ -0,0 +1,6 @@
database,key,type,size,size_readable,element_count
0,large,string,2056,2K,0
0,list,list,66,66B,4
0,hash,hash,64,64B,2
0,zset,zset,57,57B,2
0,set,set,39,39B,2
12 changes: 8 additions & 4 deletions cmd.go
@@ -4,13 +4,15 @@ import (
"flag"
"fmt"
"github.com/hdt3213/rdb/helper"
"os"
)

const help = `
This is a tool to parse Redis' RDB files
Options:
-c command, including: json/memory/aof/bigkey
-o output file path
-n number of results (for the bigkey command)
Examples:
1. convert rdb to json
@@ -19,13 +21,17 @@ Examples:
rdb -c memory -o memory.csv dump.rdb
3. convert to aof file
rdb -c aof -o dump.aof dump.rdb
4. find the largest keys
rdb -c bigkey -n 10 dump.rdb
`

func main() {
var cmd string
var output string
var n int
flag.StringVar(&cmd, "c", "", "command for rdb: json/memory/aof/bigkey")
flag.StringVar(&output, "o", "", "output file path")
flag.IntVar(&n, "n", 0, "number of results for the bigkey command")
flag.Parse()
src := flag.Arg(0)

@@ -37,10 +43,6 @@ func main() {
println("src file is required")
return
}
if output == "" {
println("output file path is required")
return
}

var err error
switch cmd {
@@ -50,6 +52,8 @@
err = helper.MemoryProfile(src, output)
case "aof":
err = helper.ToAOF(src, output)
case "bigkey":
err = helper.FindBiggestKeys(src, n, os.Stdout)
default:
println("unknown command")
return
8 changes: 7 additions & 1 deletion core/decoder.go
@@ -300,7 +300,13 @@ func (dec *Decoder) parse(cb func(object model.RedisObject) bool) error {
// Parse parses rdb and callback
// cb returns true to continue, returns false to stop the iteration
func (dec *Decoder) Parse(cb func(object model.RedisObject) bool) (err error) {
err := dec.checkHeader()
// err is a named return value so the deferred recover can replace it
defer func() {
if err2 := recover(); err2 != nil {
err = fmt.Errorf("panic: %v", err2)
}
}()
err = dec.checkHeader()
if err != nil {
return err
}
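As a usage sketch (the types and constructors are the ones appearing elsewhere in this commit; the input filename is hypothetical), a caller drives the decoder through Parse, and with the recover above a panic inside the parser surfaces as an ordinary error:

```go
package main

import (
	"fmt"
	"os"

	"github.com/hdt3213/rdb/core"
	"github.com/hdt3213/rdb/model"
)

func main() {
	rdbFile, err := os.Open("dump.rdb") // hypothetical input file
	if err != nil {
		panic(err)
	}
	defer rdbFile.Close()
	dec := core.NewDecoder(rdbFile)
	err = dec.Parse(func(o model.RedisObject) bool {
		fmt.Printf("%s (%d bytes)\n", o.GetKey(), o.GetSize())
		return true // returning false stops the iteration early
	})
	if err != nil { // parser panics now arrive here as errors
		panic(err)
	}
}
```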
5 changes: 0 additions & 5 deletions core/list.go
@@ -57,11 +57,6 @@ func (dec *Decoder) readZipList() ([][]byte, error) {
}

func (dec *Decoder) readZipListEntry(buf []byte, cursor *int) (result []byte, err error) {
defer func() {
if err2 := recover(); err2 != nil {
err = fmt.Errorf("panic: %v", err)
}
}()
prevLen := buf[*cursor]
*cursor++
if prevLen == zipBigPrevLen {
134 changes: 134 additions & 0 deletions helper/bigkey.go
@@ -0,0 +1,134 @@
package helper

import (
"container/heap"
"encoding/csv"
"errors"
"fmt"
"github.com/hdt3213/rdb/bytefmt"
"github.com/hdt3213/rdb/core"
"github.com/hdt3213/rdb/model"
"os"
"strconv"
)

type redisHeap struct {
list []model.RedisObject
capacity int
minSize int // size of min object
minIndex int // index of min object
}

func (h redisHeap) Len() int {
return len(h.list)
}

// Less defines a max-heap: heap.Pop returns the largest object first
func (h *redisHeap) Less(i, j int) bool {
return h.list[i].GetSize() > h.list[j].GetSize()
}

func (h *redisHeap) Swap(i, j int) {
h.list[i], h.list[j] = h.list[j], h.list[i]
}

func (h *redisHeap) Push(x interface{}) {
h.list = append(h.list, x.(model.RedisObject))
}

func (h *redisHeap) Pop() interface{} {
item := h.list[len(h.list)-1]
h.list = h.list[0 : len(h.list)-1]
return item
}

// time complexity: O(n*log(m)), where n is the number of redis objects and m is the heap capacity; m is far less than n
func (h *redisHeap) Append(x model.RedisObject) {
// heap is full, skip
if x.GetSize() <= h.minSize && h.Len() >= h.capacity {
return
}
// if heap is full, pop min object
if h.Len() >= h.capacity {
// assert h.minIndex >= 0
heap.Remove(h, h.minIndex)
}
heap.Push(h, x)
// update h.minSize
h.minSize = 1<<31 - 1
for i := h.Len() - 1; i >= 0; i-- { // heap.Remove/heap.Push may reorder the slice, so rescan for the new minimum
o := h.list[i]
if o.GetSize() < h.minSize {
h.minSize = o.GetSize()
h.minIndex = i
}
}
}

func (h *redisHeap) Dump() []model.RedisObject {
result := make([]model.RedisObject, 0, h.Len())
for h.Len() > 0 {
o := heap.Pop(h).(model.RedisObject)
result = append(result, o)
}
return result
}

func newRedisHeap(cap int) *redisHeap {
list := make([]model.RedisObject, 0, cap)
h := &redisHeap{
list: list,
capacity: cap,
minIndex: -1,
}
heap.Init(h)
return h
}
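// Usage sketch (hypothetical objects slice): the heap keeps at most
// `capacity` objects, so streaming every object of an RDB file through
// Append retains only the top N:
//
//	h := newRedisHeap(3)
//	for _, o := range objects { // objects is a []model.RedisObject
//		h.Append(o) // keeps the 3 largest seen so far
//	}
//	top := h.Dump() // ordered from largest to smallest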

// FindBiggestKeys reads an rdb file and finds the largest N keys.
// The invoker owns output; FindBiggestKeys won't close it
func FindBiggestKeys(rdbFilename string, topN int, output *os.File) error {
if rdbFilename == "" {
return errors.New("src file path is required")
}
if topN <= 0 {
return errors.New("n must greater than 0")
}
rdbFile, err := os.Open(rdbFilename)
if err != nil {
return fmt.Errorf("open rdb %s failed, %v", rdbFilename, err)
}
defer func() {
_ = rdbFile.Close()
}()
p := core.NewDecoder(rdbFile)
topList := newRedisHeap(topN)
err = p.Parse(func(object model.RedisObject) bool {
topList.Append(object)
return true
})
if err != nil {
return err
}
_, err = output.WriteString("database,key,type,size,size_readable,element_count\n")
if err != nil {
return fmt.Errorf("write header failed: %v", err)
}
csvWriter := csv.NewWriter(output)
defer csvWriter.Flush()
for topList.Len() > 0 {
object := heap.Pop(topList).(model.RedisObject)
err = csvWriter.Write([]string{
strconv.Itoa(object.GetDBIndex()),
object.GetKey(),
object.GetType(),
strconv.Itoa(object.GetSize()),
bytefmt.FormatSize(uint64(object.GetSize())),
strconv.Itoa(object.GetElemCount()),
})
if err != nil {
return fmt.Errorf("csv write failed: %v", err)
}
}
return nil
}
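The same report is available from Go; a minimal sketch calling the helper added above (any open *os.File works as the output; os.Stdout matches what cmd.go passes):

```go
package main

import (
	"os"

	"github.com/hdt3213/rdb/helper"
)

func main() {
	// write the 5 biggest keys of cases/memory.rdb as csv to stdout
	if err := helper.FindBiggestKeys("cases/memory.rdb", 5, os.Stdout); err != nil {
		panic(err)
	}
}
```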
49 changes: 49 additions & 0 deletions helper/bigkey_test.go
@@ -0,0 +1,49 @@
package helper

import (
"github.com/hdt3213/rdb/model"
"math/rand"
"sort"
"strconv"
"testing"
)

func TestRedisHeap_Append(t *testing.T) {
sizeMap := make(map[int]struct{}) // The behavior when encountering objects of the same size is undefined
topN := 100
n := topN * 10
objects := make([]model.RedisObject, 0)
for i := 0; i < n; i++ {
var size int
for {
size = rand.Intn(n * 10)
if _, ok := sizeMap[size]; !ok {
sizeMap[size] = struct{}{}
break
}
}
o := &model.StringObject{
BaseObject: &model.BaseObject{
Key: strconv.Itoa(i),
Size: size,
},
}
objects = append(objects, o)
}
topList := newRedisHeap(topN)
for _, o := range objects {
topList.Append(o)
}
actual := topList.Dump()
sort.Slice(objects, func(i, j int) bool {
return objects[i].GetSize() > objects[j].GetSize()
})
expect := objects[0:topN]
for i := 0; i < topN; i++ {
o1 := actual[i]
o2 := expect[i]
if o1.GetSize() != o2.GetSize() {
t.Errorf("wrong answer at index: %d", i)
}
}
}
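The heap test can be run on its own with standard go tooling:

```
go test ./helper -run TestRedisHeap_Append
```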
13 changes: 13 additions & 0 deletions helper/helper.go
@@ -2,6 +2,7 @@ package helper

import (
"encoding/json"
"errors"
"fmt"
"github.com/hdt3213/rdb/core"
"github.com/hdt3213/rdb/model"
@@ -10,6 +11,12 @@ import (

// ToJsons reads an rdb file and converts it to a json file in which each line contains a json object
func ToJsons(rdbFilename string, jsonFilename string) error {
if rdbFilename == "" {
return errors.New("src file path is required")
}
if jsonFilename == "" {
return errors.New("output file path is required")
}
rdbFile, err := os.Open(rdbFilename)
if err != nil {
return fmt.Errorf("open rdb %s failed, %v", rdbFilename, err)
@@ -60,6 +67,12 @@ func ToJsons(rdbFilename string, jsonFilename string) error {

// ToAOF reads an rdb file and converts it to an aof file (Redis Serialization Protocol)
func ToAOF(rdbFilename string, aofFilename string) error {
if rdbFilename == "" {
return errors.New("src file path is required")
}
if aofFilename == "" {
return errors.New("output file path is required")
}
rdbFile, err := os.Open(rdbFilename)
if err != nil {
return fmt.Errorf("open rdb %s failed, %v", rdbFilename, err)
7 changes: 7 additions & 0 deletions helper/memory.go
@@ -2,6 +2,7 @@ package helper

import (
"encoding/csv"
"errors"
"fmt"
"github.com/hdt3213/rdb/bytefmt"
"github.com/hdt3213/rdb/core"
@@ -12,6 +13,12 @@

// MemoryProfile reads an rdb file, analyzes memory usage, and writes the result to a csv file
func MemoryProfile(rdbFilename string, csvFilename string) error {
if rdbFilename == "" {
return errors.New("src file path is required")
}
if csvFilename == "" {
return errors.New("output file path is required")
}
rdbFile, err := os.Open(rdbFilename)
if err != nil {
return fmt.Errorf("open rdb %s failed, %v", rdbFilename, err)
