From 2b5b4a511ddc5eb78f57eda9254d687f55d5e0a4 Mon Sep 17 00:00:00 2001 From: ansj Date: Fri, 31 Jan 2020 21:52:19 +0800 Subject: [PATCH] add example extracting --- .../org/ansj/app/extracting/Extracting.java | 24 ++++++ .../arrimpl/PersonRecognition.java | 6 +- .../org/ansj/splitWord/impl/GetWordsImpl.java | 77 ++++++++++--------- .../ansj/app/extracting/ExtractingTest.java | 35 +++++++++ 4 files changed, 102 insertions(+), 40 deletions(-) diff --git a/src/main/java/org/ansj/app/extracting/Extracting.java b/src/main/java/org/ansj/app/extracting/Extracting.java index 2806ae1c..091d3124 100644 --- a/src/main/java/org/ansj/app/extracting/Extracting.java +++ b/src/main/java/org/ansj/app/extracting/Extracting.java @@ -9,6 +9,7 @@ import org.ansj.library.DicLibrary; import org.ansj.recognition.arrimpl.UserDefineRecognition; import org.ansj.splitWord.analysis.DicAnalysis; +import org.ansj.splitWord.analysis.ToAnalysis; import org.ansj.util.Graph; import org.ansj.util.TermUtil; import org.nlpcn.commons.lang.tire.domain.Forest; @@ -99,7 +100,30 @@ public ExtractingResult parse(String content, Forest... forests) { } Result terms = DicAnalysis.parse(content, myForests); + return parse(terms, false); + } + + /** + * 传入文本分词并抽取 + * @param content 需要分析的文本 + * @param forests 对文本分词加载的词典 + * @return 抽取结果集 + */ + public ExtractingResult parseWithToAnalysis(String content, Forest... forests) { + Forest[] myForests = null; + if (forests == null) { + myForests = new Forest[]{ruleIndex.getForest()}; + } else if (forests.length == 0) { + myForests = new Forest[]{ruleIndex.getForest(), DicLibrary.get()}; + } else { + myForests = new Forest[forests.length + 1]; + myForests[0] = ruleIndex.getForest(); + for (int i = 0; i < forests.length; i++) { + myForests[i + 1] = forests[i]; + } + } + Result terms = ToAnalysis.parse(content, myForests); return parse(terms, false); } diff --git a/src/main/java/org/ansj/recognition/arrimpl/PersonRecognition.java b/src/main/java/org/ansj/recognition/arrimpl/PersonRecognition.java index 358d1451..d9e720bf 100644 --- a/src/main/java/org/ansj/recognition/arrimpl/PersonRecognition.java +++ b/src/main/java/org/ansj/recognition/arrimpl/PersonRecognition.java @@ -87,7 +87,7 @@ public void recognition(Graph graph) { //0B 1C 2D 3E 4K 5L 6M 7X 8Y 9Z 10A - nodes = new PersonNode[terms.length + 1][11]; + nodes = new PersonNode[terms.length + 1][11];//应该是+2 吧? TODO beginOff = terms[0].getOffe(); @@ -317,7 +317,7 @@ private Viterbi getPersonNodeViterbi(Term[] terms) { nodes[0][4] = new PersonNode(4, "B", -Math.log(begin.getK())); nodes[0][10] = new PersonNode(10, "B", -Math.log(begin.getA())); - PersonNatureAttr end = DATDictionary.person("END"); + PersonNatureAttr end = DATDictionary.person("END"); //TODO: 这里是term.length+1 吧 nodes[terms.length][5] = new PersonNode(5, "E", -Math.log(end.getL())); nodes[terms.length][6] = null; nodes[terms.length][10] = new PersonNode(10, "E", -Math.log(end.getA())); @@ -349,7 +349,7 @@ private void termSplit(Term[] terms, List replaceTerm) { int len = first.getName().length(); - if (len == 1 || len == 3) {//这里写死了只支持2-3个字的拆分 + if (len == 1 || len > 3) {//这里写死了只支持2-3个字的拆分 continue; } diff --git a/src/main/java/org/ansj/splitWord/impl/GetWordsImpl.java b/src/main/java/org/ansj/splitWord/impl/GetWordsImpl.java index a0e3aeb1..efbff829 100644 --- a/src/main/java/org/ansj/splitWord/impl/GetWordsImpl.java +++ b/src/main/java/org/ansj/splitWord/impl/GetWordsImpl.java @@ -56,43 +56,43 @@ public String allWords() { charHashCode = chars[i]; end++; switch (getStatement()) { - case 0: - if (baseValue == chars[i]) { - str = String.valueOf(chars[i]); - offe = i; - start = ++i; - end = 0; - baseValue = 0; - tempBaseValue = baseValue; - return str; - } else { - int startCharStatus = DATDictionary.getItem(chars[start]).getStatus(); - if (startCharStatus == 1) { //如果start的词的status为1,则将start设为i;否则start加1 - start=i; - i--; + case 0: + if (baseValue == chars[i]) { + str = String.valueOf(chars[i]); + offe = i; + start = ++i; end = 0; baseValue = 0; + tempBaseValue = baseValue; + return str; } else { - i = start; - start++; - end = 0; - baseValue = 0; + int startCharStatus = DATDictionary.getItem(chars[start]).getStatus(); + if (startCharStatus == 1) { //如果start的词的status为1,则将start设为i;否则start加1 + start = i; + i--; + end = 0; + baseValue = 0; + } else { + i = start; + start++; + end = 0; + baseValue = 0; + } + break; } - break; - } - case 2: - i++; - offe = start; - tempBaseValue = baseValue; - return DATDictionary.getItem(tempBaseValue).getName(); - case 3: - offe = start; - start++; - i = start; - end = 0; - tempBaseValue = baseValue; - baseValue = 0; - return DATDictionary.getItem(tempBaseValue).getName(); + case 2: + i++; + offe = start; + tempBaseValue = baseValue; + return DATDictionary.getItem(tempBaseValue).getName(); + case 3: + offe = start; + start++; + i = start; + end = 0; + tempBaseValue = baseValue; + baseValue = 0; + return DATDictionary.getItem(tempBaseValue).getName(); } } @@ -104,21 +104,24 @@ public String allWords() { /** * 根据用户传入的c得到单词的状态. 0.代表这个字不在词典中 1.继续 2.是个词但是还可以继续 3.停止已经是个词了 - * + * * @param c * @return */ private int getStatement() { checkValue = baseValue; baseValue = DATDictionary.getItem(checkValue).getBase() + charHashCode; - if (baseValue < DATDictionary.arrayLength && (DATDictionary.getItem(baseValue).getCheck() == checkValue || DATDictionary.getItem(baseValue).getCheck() == -1)) { - return DATDictionary.getItem(baseValue).getStatus(); + if (baseValue < DATDictionary.arrayLength) { + AnsjItem temp = DATDictionary.getItem(baseValue); + if (temp.getCheck() == checkValue || temp.getCheck() == -1) { + return DATDictionary.getItem(baseValue).getStatus(); + } } return 0; } public AnsjItem getItem() { - return DATDictionary.getItem(tempBaseValue); + return DATDictionary.getItem(tempBaseValue); } @Override diff --git a/src/test/java/org/ansj/app/extracting/ExtractingTest.java b/src/test/java/org/ansj/app/extracting/ExtractingTest.java index 40cc9a71..ea3f52c6 100644 --- a/src/test/java/org/ansj/app/extracting/ExtractingTest.java +++ b/src/test/java/org/ansj/app/extracting/ExtractingTest.java @@ -2,7 +2,9 @@ import org.ansj.app.extracting.domain.ExtractingResult; import org.ansj.app.extracting.exception.RuleFormatException; +import org.ansj.domain.Term; import org.ansj.library.DicLibrary; +import org.ansj.splitWord.analysis.DicAnalysis; import org.ansj.splitWord.analysis.ToAnalysis; import org.junit.Test; @@ -76,4 +78,37 @@ public void test1() throws RuleFormatException { System.out.println(extracting.parse("清华大学啊啊啊负责人孙健先生").getAllResult()); System.out.println(extracting.parse("本期计提坏账准备金额2138030.52元;本期收回或转回坏账准备金额0.00元").getAllResult()); } + + @Test + public void test2() throws RuleFormatException { + List lines = new ArrayList<>() ; + + //填写规则 可以写多条 + + lines.add("(是很)(:a)(:u)\t特定:0;副词:1;形容词:2"); + lines.add("(:n)(:d)(:a)\t副词:0;形容词:1;名词:2"); + lines.add("(:d)(:a)(:u)(:n)\td:0;副词:1;形容词:2;名词:3"); + + + Extracting extracting = new Extracting(lines) ; + + + System.out.println(ToAnalysis.parse("我这里有很优美的环境")); + System.out.println(ToAnalysis.parse("我这里环境很优美")); + System.out.println(ToAnalysis.parse("我这里的环境是很优美的")); + + System.out.println("------------------------------------------------------------"); + + printRule(extracting.parseWithToAnalysis("我这里有很优美的环境").findAll()); + printRule(extracting.parseWithToAnalysis("我这里环境很优美").findAll()); + printRule(extracting.parseWithToAnalysis("我这里的环境是很优美的").findAll()); + + + } + + private void printRule(List> list) { + for (List terms : list) { + System.out.println(terms); + } + } }