-
Notifications
You must be signed in to change notification settings - Fork 1
/
hmm.go
119 lines (101 loc) · 2.13 KB
/
hmm.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
package npartword
import (
"regexp"
)
// Hmm bundles the compiled regular expressions and the hidden-state alphabet
// used by the HMM-based segmenter in this package.
type Hmm struct {
	// regHan locates runs of Han characters (NewHmm compiles it from `\p{Han}+`).
	regHan *regexp.Regexp

	// regSkip locates decimal numbers and ASCII alphanumeric runs, which are
	// kept as whole tokens rather than being fed to the HMM.
	regSkip *regexp.Regexp

	// hstates is the list of hidden states handed to Viterbi.
	hstates []byte
}
// NewHmm calls LoadEmitProb (defined elsewhere in the package; presumably it
// loads the emission-probability table — confirm) and then returns an Hmm
// with both regular expressions compiled and the hidden states set to
// B, M, E and S.
func NewHmm() *Hmm {
	LoadEmitProb()

	h := new(Hmm)
	// Runs of Han characters are the input to the HMM.
	h.regHan = regexp.MustCompile(`\p{Han}+`)
	// Decimal numbers and ASCII alphanumeric runs bypass the HMM.
	h.regSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
	h.hstates = []byte{'B', 'M', 'E', 'S'}
	return h
}
// HmmPart splits text into an ordered list of segments.
//
// The text is consumed from the left, one piece per iteration:
//   - a run of Han characters at the front is segmented by GetViterbiResult
//     and every resulting word is appended;
//   - a regSkip match at the front (decimal number or ASCII alphanumeric run)
//     is appended as a single segment;
//   - otherwise everything up to the nearest later match of either pattern is
//     appended verbatim as one segment, or the whole remainder if neither
//     pattern matches again.
//
// An empty text yields an empty (non-nil) slice.
func (hmm *Hmm) HmmPart(text string) []string {
	result := make([]string, 0, 10)

	for len(text) > 0 {
		// Han run at the very start: hand it to the HMM.
		hanLoc := hmm.regHan.FindStringIndex(text)
		if hanLoc != nil && hanLoc[0] == 0 {
			hans := text[:hanLoc[1]]
			text = text[hanLoc[1]:]
			result = append(result, hmm.GetViterbiResult(hans)...)
			continue
		}

		// Number / alphanumeric run at the very start: keep it whole.
		nonHanLoc := hmm.regSkip.FindStringIndex(text)
		if nonHanLoc != nil && nonHanLoc[0] == 0 {
			result = append(result, text[:nonHanLoc[1]])
			text = text[nonHanLoc[1]:]
			continue
		}

		// Neither pattern matches at offset 0, so the text starts with
		// "other" characters. Emit them up to the closest upcoming match.
		loc := locSwitch(text, hanLoc, nonHanLoc)
		if loc == nil {
			// No further matches: the remainder is one final segment.
			result = append(result, text)
			break
		}
		result = append(result, text[:loc[0]])
		text = text[loc[0]:]
	}

	return result
}
// GetViterbiResult segments text into words using the tag sequence returned
// by Viterbi (defined elsewhere in the package) for the runes of text and the
// configured hidden states.
//
// Tags are interpreted as follows: 'B' marks the start of a word, 'E' closes
// the word opened by the most recent 'B', and 'S' is a single-rune word. Any
// other tag leaves the current state untouched. Runes left over after the
// last 'E' or 'S' are returned together as one trailing segment.
//
// NOTE(review): this assumes Viterbi yields one tag per rune and a
// well-formed sequence (every 'E' preceded by its own 'B') — confirm against
// the Viterbi implementation.
func (hmm *Hmm) GetViterbiResult(text string) []string {
	chars := []rune(text)
	_, tags := Viterbi(chars, hmm.hstates)

	words := make([]string, 0, 10)
	wordStart := 0 // index of the most recent 'B'
	consumed := 0  // index just past the last emitted word
	for idx := range chars {
		tag := tags[idx]
		if tag == 'B' {
			wordStart = idx
		} else if tag == 'E' {
			words = append(words, string(chars[wordStart:idx+1]))
			consumed = idx + 1
		} else if tag == 'S' {
			words = append(words, string(chars[idx]))
			consumed = idx + 1
		}
	}

	// Flush runes that were never closed by an 'E' or 'S' tag.
	if consumed < len(chars) {
		words = append(words, string(chars[consumed:]))
	}
	return words
}
// locSwitch chooses which of two regexp match locations comes first.
//
// hanLoc and nonHanLoc are [start, end] index pairs (or nil for "no match").
// The result is:
//   - nil when both are nil;
//   - the only non-nil location when exactly one is nil;
//   - hanLoc when it starts strictly before nonHanLoc, otherwise nonHanLoc
//     (so a tie goes to nonHanLoc).
//
// The text argument does not influence the result.
func locSwitch(text string, hanLoc, nonHanLoc []int) (loc []int) {
	switch {
	case hanLoc == nil:
		// Either nothing matched (nonHanLoc is nil as well) or only the
		// non-Han pattern matched; both cases return nonHanLoc as-is.
		return nonHanLoc
	case nonHanLoc == nil || hanLoc[0] < nonHanLoc[0]:
		return hanLoc
	default:
		return nonHanLoc
	}
}