/
AnsjTokenizer.java
139 lines (112 loc) · 3.47 KB
/
AnsjTokenizer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
package org.ansj.lucene.util;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.recognition.impl.SynonymsRecgnition;
import org.ansj.splitWord.Analysis;
import org.ansj.util.AnsjReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
/**
 * A Lucene {@link Tokenizer} that emits tokens produced by an Ansj
 * {@code Analysis} segmenter, with optional stop-word filtering and synonym
 * expansion.
 *
 * <p>Tokens are drained from an internal queue holding both {@code Term}
 * objects (regular tokens) and plain {@code String}s (extra synonyms queued in
 * front of the remaining terms). Synonyms are emitted with a position increment
 * of 0, i.e. at the same token position as the term they expand.
 *
 * <p>Not thread-safe; a Tokenizer instance is used by one consumer at a time,
 * per the Lucene TokenStream contract.
 */
public final class AnsjTokenizer extends Tokenizer {
	// Text of the current token.
	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	// Start/end character offsets of the current token.
	private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
	// Position increment: >1 when stop words were skipped, 0 for synonyms.
	private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
	// Part-of-speech (nature) string of the current token.
	private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

	protected Analysis ta = null;
	// Pending output: Term instances plus String synonym expansions.
	private LinkedList<Object> result;
	// Stop-word recognizers; may be null or empty.
	private final List<StopRecognition> stops;
	// Synonym recognizers (dictionaries); may be null or empty.
	private final List<SynonymsRecgnition> synonyms;
	// End offset of the last emitted term, reported by end().
	private int finalOffset;

	/**
	 * @param ta       the Ansj segmenter to draw terms from
	 * @param stops    stop-word recognizers, or {@code null} for none
	 * @param synonyms synonym recognizers, or {@code null} for none
	 */
	public AnsjTokenizer(Analysis ta, List<StopRecognition> stops, List<SynonymsRecgnition> synonyms) {
		this.ta = ta;
		this.stops = stops;
		this.synonyms = synonyms;
	}

	/**
	 * Advances to the next token, skipping stop words and expanding synonyms.
	 *
	 * @return {@code true} if a token was produced, {@code false} at end of stream
	 */
	@Override
	public final boolean incrementToken() throws IOException {
		int position = 0;
		if (result == null) {
			parse();
		}
		Object obj = result.pollFirst();
		if (obj == null) {
			result = null;
			return false;
		}
		if (obj instanceof Term) {
			clearAttributes();
			Term term = (Term) obj;
			// Skip stop words, counting each skipped term so the position
			// increment reflects the gap they leave behind.
			while (filterTerm(term)) {
				term = (Term) result.pollFirst();
				if (term == null) {
					result = null;
					return false;
				}
				position++;
			}
			// Renamed local (was "synonyms"): it previously shadowed the
			// recognizer-list field of the same name.
			List<String> termSynonyms = term.getSynonyms();
			String rName;
			if (termSynonyms != null) {
				// Emit the first synonym now; queue the rest in front of the
				// remaining terms so they come out next with increment 0.
				for (int i = 1; i < termSynonyms.size(); i++) {
					result.addFirst(termSynonyms.get(i));
				}
				rName = termSynonyms.get(0);
			} else {
				rName = term.getName();
			}
			position++;
			offsetAtt.setOffset(correctOffset(term.getOffe()), finalOffset = correctOffset(term.getOffe() + term.getName().length()));
			typeAtt.setType(term.getNatureStr());
			positionAttr.setPositionIncrement(position);
			termAtt.setEmpty().append(rName);
		} else {
			// A queued synonym String. NOTE(review): attributes are deliberately
			// NOT cleared here, so the synonym appears to inherit the offsets and
			// type of the term it expands; the increment of 0 places it at the
			// same position — confirm this matches the intended synonym handling.
			positionAttr.setPositionIncrement(position);
			termAtt.setEmpty().append(obj.toString());
		}
		return true;
	}

	/**
	 * @return {@code true} if any configured stop recognizer filters the term
	 */
	private boolean filterTerm(Term term) {
		if (stops != null && !stops.isEmpty()) {
			for (StopRecognition filterRecognition : stops) {
				if (filterRecognition.filter(term)) {
					return true;
				}
			}
		}
		return false;
	}

	@Override
	public void end() throws IOException {
		super.end();
		// Report the end offset of the last token emitted from this stream.
		offsetAtt.setOffset(finalOffset, finalOffset);
	}

	/**
	 * Must be overridden, otherwise batch indexing of files fails
	 * (the segmenter would keep reading the previous reader's content).
	 */
	@Override
	public void reset() throws IOException {
		super.reset();
		// BUGFIX: reset the stored final offset. Without this, reusing the
		// Tokenizer on an empty input made end() report the PREVIOUS
		// document's final offset, corrupting highlight/offset data.
		finalOffset = 0;
		ta.resetContent(new AnsjReader(this.input));
		parse();
	}

	/** Runs the segmenter, applies synonym recognition, and (re)fills the queue. */
	private void parse() throws IOException {
		Result parse = ta.parse();
		if (synonyms != null) {
			for (SynonymsRecgnition sr : synonyms) {
				parse.recognition(sr);
			}
		}
		result = new LinkedList<Object>(parse.getTerms());
	}
}