Skip to content

Commit

Permalink
add example extracting
Browse files Browse the repository at this point in the history
  • Loading branch information
ansjsun committed Jan 31, 2020
1 parent 4deadeb commit 2b5b4a5
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 40 deletions.
24 changes: 24 additions & 0 deletions src/main/java/org/ansj/app/extracting/Extracting.java
Expand Up @@ -9,6 +9,7 @@
import org.ansj.library.DicLibrary;
import org.ansj.recognition.arrimpl.UserDefineRecognition;
import org.ansj.splitWord.analysis.DicAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.ansj.util.Graph;
import org.ansj.util.TermUtil;
import org.nlpcn.commons.lang.tire.domain.Forest;
Expand Down Expand Up @@ -99,7 +100,30 @@ public ExtractingResult parse(String content, Forest... forests) {
}

Result terms = DicAnalysis.parse(content, myForests);
return parse(terms, false);
}

/**
 * Segments the given text with {@code ToAnalysis} and runs rule extraction on the terms.
 *
 * <p>The forest of the rule index is always the first dictionary handed to the segmenter:
 * <ul>
 *   <li>{@code forests == null}: only the rule forest is used;</li>
 *   <li>no forests passed: the rule forest followed by {@code DicLibrary.get()};</li>
 *   <li>otherwise: the rule forest followed by the caller's forests, in order.</li>
 * </ul>
 *
 * @param content text to analyse
 * @param forests dictionaries to load for segmenting the text
 * @return the extraction result set
 */
public ExtractingResult parseWithToAnalysis(String content, Forest... forests) {
    final Forest ruleForest = ruleIndex.getForest();
    final Forest[] dictionaries;

    if (forests == null) {
        dictionaries = new Forest[]{ruleForest};
    } else if (forests.length == 0) {
        dictionaries = new Forest[]{ruleForest, DicLibrary.get()};
    } else {
        // Slot 0 is reserved for the rule forest; the caller's forests follow it.
        dictionaries = new Forest[forests.length + 1];
        dictionaries[0] = ruleForest;
        System.arraycopy(forests, 0, dictionaries, 1, forests.length);
    }

    Result segmented = ToAnalysis.parse(content, dictionaries);
    return parse(segmented, false);
}

Expand Down
Expand Up @@ -87,7 +87,7 @@ public void recognition(Graph graph) {

//0B 1C 2D 3E 4K 5L 6M 7X 8Y 9Z 10A

nodes = new PersonNode[terms.length + 1][11];
nodes = new PersonNode[terms.length + 1][11];//应该是+2 吧? TODO

beginOff = terms[0].getOffe();

Expand Down Expand Up @@ -317,7 +317,7 @@ private Viterbi<PersonNode> getPersonNodeViterbi(Term[] terms) {
nodes[0][4] = new PersonNode(4, "B", -Math.log(begin.getK()));
nodes[0][10] = new PersonNode(10, "B", -Math.log(begin.getA()));

PersonNatureAttr end = DATDictionary.person("END");
PersonNatureAttr end = DATDictionary.person("END"); //TODO: 这里是term.length+1 吧
nodes[terms.length][5] = new PersonNode(5, "E", -Math.log(end.getL()));
nodes[terms.length][6] = null;
nodes[terms.length][10] = new PersonNode(10, "E", -Math.log(end.getA()));
Expand Down Expand Up @@ -349,7 +349,7 @@ private void termSplit(Term[] terms, List<Term> replaceTerm) {

int len = first.getName().length();

if (len == 1 || len == 3) {//这里写死了只支持2-3个字的拆分
if (len == 1 || len > 3) {//这里写死了只支持2-3个字的拆分
continue;
}

Expand Down
77 changes: 40 additions & 37 deletions src/main/java/org/ansj/splitWord/impl/GetWordsImpl.java
Expand Up @@ -56,43 +56,43 @@ public String allWords() {
charHashCode = chars[i];
end++;
switch (getStatement()) {
case 0:
if (baseValue == chars[i]) {
str = String.valueOf(chars[i]);
offe = i;
start = ++i;
end = 0;
baseValue = 0;
tempBaseValue = baseValue;
return str;
} else {
int startCharStatus = DATDictionary.getItem(chars[start]).getStatus();
if (startCharStatus == 1) { //如果start的词的status为1,则将start设为i;否则start加1
start=i;
i--;
case 0:
if (baseValue == chars[i]) {
str = String.valueOf(chars[i]);
offe = i;
start = ++i;
end = 0;
baseValue = 0;
tempBaseValue = baseValue;
return str;
} else {
i = start;
start++;
end = 0;
baseValue = 0;
int startCharStatus = DATDictionary.getItem(chars[start]).getStatus();
if (startCharStatus == 1) { //如果start的词的status为1,则将start设为i;否则start加1
start = i;
i--;
end = 0;
baseValue = 0;
} else {
i = start;
start++;
end = 0;
baseValue = 0;
}
break;
}
break;
}
case 2:
i++;
offe = start;
tempBaseValue = baseValue;
return DATDictionary.getItem(tempBaseValue).getName();
case 3:
offe = start;
start++;
i = start;
end = 0;
tempBaseValue = baseValue;
baseValue = 0;
return DATDictionary.getItem(tempBaseValue).getName();
case 2:
i++;
offe = start;
tempBaseValue = baseValue;
return DATDictionary.getItem(tempBaseValue).getName();
case 3:
offe = start;
start++;
i = start;
end = 0;
tempBaseValue = baseValue;
baseValue = 0;
return DATDictionary.getItem(tempBaseValue).getName();
}

}
Expand All @@ -104,21 +104,24 @@ public String allWords() {

/**
* 根据用户传入的c得到单词的状态. 0.代表这个字不在词典中 1.继续 2.是个词但是还可以继续 3.停止已经是个词了
*
*
* @param c
* @return
*/
private int getStatement() {
checkValue = baseValue;
baseValue = DATDictionary.getItem(checkValue).getBase() + charHashCode;
if (baseValue < DATDictionary.arrayLength && (DATDictionary.getItem(baseValue).getCheck() == checkValue || DATDictionary.getItem(baseValue).getCheck() == -1)) {
return DATDictionary.getItem(baseValue).getStatus();
if (baseValue < DATDictionary.arrayLength) {
AnsjItem temp = DATDictionary.getItem(baseValue);
if (temp.getCheck() == checkValue || temp.getCheck() == -1) {
return DATDictionary.getItem(baseValue).getStatus();
}
}
return 0;
}

/**
 * Returns the dictionary item stored at {@code tempBaseValue}.
 *
 * @return the item that {@code DATDictionary} holds at index {@code tempBaseValue}
 */
public AnsjItem getItem() {
    return DATDictionary.getItem(tempBaseValue);
}

@Override
Expand Down
35 changes: 35 additions & 0 deletions src/test/java/org/ansj/app/extracting/ExtractingTest.java
Expand Up @@ -2,7 +2,9 @@

import org.ansj.app.extracting.domain.ExtractingResult;
import org.ansj.app.extracting.exception.RuleFormatException;
import org.ansj.domain.Term;
import org.ansj.library.DicLibrary;
import org.ansj.splitWord.analysis.DicAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.junit.Test;

Expand Down Expand Up @@ -76,4 +78,37 @@ public void test1() throws RuleFormatException {
System.out.println(extracting.parse("清华大学啊啊啊负责人孙健先生").getAllResult());
System.out.println(extracting.parse("本期计提坏账准备金额2138030.52元;本期收回或转回坏账准备金额0.00元").getAllResult());
}

@Test
public void test2() throws RuleFormatException {
    // Extraction rules; any number of rules may be registered.
    List<String> rules = new ArrayList<>();
    rules.add("(是很)(:a)(:u)\t特定:0;副词:1;形容词:2");
    rules.add("(:n)(:d)(:a)\t副词:0;形容词:1;名词:2");
    rules.add("(:d)(:a)(:u)(:n)\td:0;副词:1;形容词:2;名词:3");

    Extracting extracting = new Extracting(rules);

    String[] sentences = {
        "我这里有很优美的环境",
        "我这里环境很优美",
        "我这里的环境是很优美的"
    };

    // Print the plain ToAnalysis segmentation of every sentence first.
    for (String sentence : sentences) {
        System.out.println(ToAnalysis.parse(sentence));
    }

    System.out.println("------------------------------------------------------------");

    // Then print every rule match found for the same sentences.
    for (String sentence : sentences) {
        printRule(extracting.parseWithToAnalysis(sentence).findAll());
    }
}

/**
 * Prints each matched term sequence on its own line of standard output.
 *
 * @param list matched term sequences, one inner list per match
 */
private void printRule(List<List<Term>> list) {
    for (int index = 0; index < list.size(); index++) {
        List<Term> match = list.get(index);
        System.out.println(match);
    }
}
}

0 comments on commit 2b5b4a5

Please sign in to comment.