/
keyword.go
126 lines (104 loc) · 2.38 KB
/
keyword.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
package blog
import (
"strings"
"github.com/Laisky/go-chaining"
utils "github.com/Laisky/go-utils/v4"
"github.com/yanyiwu/gojieba"
)
// var (
// isUseHMM = false
// )
// Analyser extracts keywords from text. It wraps a gojieba segmenter and
// exposes the extraction pipeline through Cut2Words.
type Analyser struct {
	// j is the gojieba instance used by cut2Words to split text into words.
	j *gojieba.Jieba
}
// NewAnalyser returns an Analyser backed by a freshly created gojieba
// instance (gojieba.NewJieba called with no arguments).
//
// NOTE(review): nothing in this file releases the gojieba instance; if
// Analysers are created repeatedly, confirm whether the underlying handle
// needs to be freed explicitly.
func NewAnalyser() *Analyser {
	analyser := new(Analyser)
	analyser.j = gojieba.NewJieba()
	return analyser
}
// Cut2Words extracts keywords from cnt by running it through a fixed pipeline:
//
//  1. FilterFmt strips formatting matches and spaces from the text.
//  2. cut2Words splits the text into words with gojieba.
//  3. FilterDiscardWords normalizes the words and drops discarded ones.
//  4. Convert2WrodsMap counts the occurrences of each word.
//  5. FilterMinimalWordsCount removes words seen fewer than minialCount times.
//  6. FilterMostFreqWords keeps at most topN of the remaining words.
//
// The pipeline is seeded with cnt and a nil error, and the final chain value
// is returned as a string slice via GetSliceString.
func (a *Analyser) Cut2Words(cnt string, minialCount, topN int) (words []string) {
	pipeline := chaining.Flow(
		FilterFmt,
		a.cut2Words,
		FilterDiscardWords,
		Convert2WrodsMap,
		FilterMinimalWordsCount(minialCount),
		FilterMostFreqWords(topN),
	)

	words = pipeline(cnt, nil).GetSliceString()
	return words
}
// cut2Words is the segmentation stage of the pipeline: it reads the chain's
// string value and returns the result of gojieba's CutAll on it. It never
// returns an error.
func (a *Analyser) cut2Words(c *chaining.Chain) (interface{}, error) {
	text := c.GetString()
	segments := a.j.CutAll(text)
	return segments, nil
}
// FilterDiscardWords is a chain stage that normalizes and filters a word list.
//
// Each word from the chain's string slice is lower-cased and trimmed of
// surrounding whitespace. A word is dropped when it is empty after
// normalization, when it matches discardWordsRegex, or when it equals an
// entry of discardWords. The surviving (normalized) words are returned in
// their original order; the result is always a non-nil slice and the error
// is always nil.
func FilterDiscardWords(c *chaining.Chain) (interface{}, error) {
	kept := []string{}

nextWord:
	for _, raw := range c.GetSliceString() {
		word := strings.TrimSpace(strings.ToLower(raw))
		if word == "" || discardWordsRegex.MatchString(word) {
			continue
		}

		// Skip the word entirely as soon as it matches any discard entry.
		for _, banned := range discardWords {
			if banned == word {
				continue nextWord
			}
		}

		kept = append(kept, word)
	}

	return kept, nil
}
// FilterFmt is a chain stage that cleans the raw text before segmentation.
// It deletes every match of discardFmtRegex from the chain's string value and
// then removes all space characters (" ") from what is left. The error is
// always nil.
func FilterFmt(c *chaining.Chain) (interface{}, error) {
	stripped := discardFmtRegex.ReplaceAllString(c.GetString(), "")
	compact := strings.ReplaceAll(stripped, " ", "")
	return compact, nil
}
// Convert2WrodsMap is a chain stage that turns a word list into a frequency
// table. It reads the chain's string slice and returns a map[string]int whose
// keys are the distinct words and whose values are how many times each word
// appeared. The error is always nil.
func Convert2WrodsMap(c *chaining.Chain) (interface{}, error) {
	counts := make(map[string]int)
	for _, word := range c.GetSliceString() {
		// A missing key reads as zero, so the first occurrence yields 1.
		counts[word]++
	}

	return counts, nil
}
// FilterMinimalWordsCount builds a chain stage that drops rare words.
//
// The returned stage expects the chain value to be a map[string]int (word to
// count) and panics on the type assertion otherwise. It deletes, in place,
// every entry whose count is below minialCount and returns that same map with
// a nil error.
func FilterMinimalWordsCount(minialCount int) func(c *chaining.Chain) (interface{}, error) {
	prune := func(c *chaining.Chain) (interface{}, error) {
		counts := c.GetVal().(map[string]int)
		for word, n := range counts {
			if n >= minialCount {
				continue
			}
			// Deleting the current key while ranging over a map is safe in Go.
			delete(counts, word)
		}

		return counts, nil
	}

	return prune
}
// sortItem pairs a word with its occurrence count so that the pair can be
// placed in a utils.PairList and sorted by FilterMostFreqWords.
type sortItem struct {
	k string // the word
	v int    // how many times the word occurred
}

// GetValue returns the occurrence count stored in the item.
func (item *sortItem) GetValue() int { return item.v }

// GetData returns the word stored in the item; its dynamic type is string.
func (item *sortItem) GetData() interface{} { return item.k }
// FilterMostFreqWords builds a chain stage that reduces a word-count map to a
// list of at most topN words.
//
// The returned stage expects the chain value to be a map[string]int (word to
// count) and panics on the type assertion otherwise. Every entry is wrapped
// in a sortItem, the items are ordered with utils.SortBiggest (presumably
// largest count first; confirm against go-utils), and the words of the first
// topN items are returned as a non-nil []string with a nil error. A topN of
// zero or less yields an empty slice.
//
// All scratch storage is allocated inside the stage, so the same stage can be
// invoked any number of times without results from an earlier invocation
// leaking into a later one.
func FilterMostFreqWords(topN int) func(c *chaining.Chain) (interface{}, error) {
	return func(c *chaining.Chain) (interface{}, error) {
		wordsMap := c.GetVal().(map[string]int)

		// Allocated per invocation: sharing these across calls would re-sort
		// stale pairs and let the result grow past topN.
		pairLs := make(utils.PairList, 0, len(wordsMap))
		for k, v := range wordsMap {
			pairLs = append(pairLs, &sortItem{k, v})
		}
		utils.SortBiggest(pairLs)

		keyLs := []string{}
		for i, p := range pairLs {
			if i >= topN {
				break
			}
			keyLs = append(keyLs, p.GetData().(string))
		}

		return keyLs, nil
	}
}