/
TextRankExtractor.java
93 lines (81 loc) · 2.67 KB
/
TextRankExtractor.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
package com.ikanow.infinit.e.harvest.enrichment.legacy;
import java.util.ArrayList;
import java.util.Collection;
import com.ikanow.infinit.e.data_model.Globals;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDailyLimitExceededException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDocumentLevelException;
import com.ikanow.infinit.e.data_model.interfaces.harvest.EntityExtractorEnum;
import com.ikanow.infinit.e.data_model.interfaces.harvest.IEntityExtractor;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.sharethis.textrank.MetricVector;
import com.sharethis.textrank.TextRank;
/**
 * Entity extractor that turns the TextRank results for a document's full text
 * into "Keyword" entities attached to that document.
 *
 * <p>A separate {@link TextRank} instance is held per thread via
 * {@link #processor}. Extraction is best-effort: any failure leaves the document
 * with whatever entities had been added up to that point and is reported only
 * through the class logger.
 */
public class TextRankExtractor implements IEntityExtractor {

    /** Fully-qualified so that no additional file-level import is required. */
    private static final java.util.logging.Logger LOGGER =
            java.util.logging.Logger.getLogger(TextRankExtractor.class.getName());

    /** Above this many results, results scoring below {@link #BASIC_FILTER_MIN_METRIC} are dropped. */
    private static final int BASIC_FILTER_RESULT_COUNT = 50;
    private static final double BASIC_FILTER_MIN_METRIC = 0.1;

    /** Above this many results, results scoring below {@link #AGGRESSIVE_FILTER_MIN_METRIC} are dropped. */
    private static final int AGGRESSIVE_FILTER_RESULT_COUNT = 100;
    private static final double AGGRESSIVE_FILTER_MIN_METRIC = 0.2;

    /**
     * Per-thread TextRank instance, created lazily from the configuration location
     * reported by {@link Globals#getConfigLocation()} with the language code "en".
     *
     * <p>If construction fails the thread's value is {@code null}; callers must
     * check for that before use.
     */
    public ThreadLocal<TextRank> processor = new ThreadLocal<TextRank>() {
        @Override protected TextRank initialValue() {
            try {
                return new TextRank(Globals.getConfigLocation(), "en");
            } catch (Exception e) {
                LOGGER.log(java.util.logging.Level.WARNING,
                        "Unable to initialise TextRank; keyword extraction is disabled for this thread", e);
                return null;
            }
        }
    };

    /**
     * @return the fixed identifier of this extractor, {@code "textrank"}
     */
    @Override
    public String getName() {
        return "textrank";
    }

    /**
     * Runs TextRank over the document's full text and appends one entity per
     * retained result to the document's entity list (creating the list if the
     * document has none).
     *
     * <p>Does nothing when the document is {@code null}, has no full text, when no
     * TextRank instance is available for the current thread, or when TextRank
     * returns no result collection. Exceptions are not propagated.
     *
     * @param partialDoc the document to enrich; may be {@code null}
     */
    @Override
    public void extractEntities(DocumentPojo partialDoc)
    {
        if (null == partialDoc) {
            return;
        }
        try {
            String fullText = partialDoc.getFullText();
            if (null == fullText) {
                return;
            }
            // initialValue() yields null when TextRank could not be constructed
            TextRank textRank = processor.get();
            if (null == textRank) {
                LOGGER.fine("TextRank is unavailable on this thread; skipping keyword extraction");
                return;
            }
            // NOTE(review): the meaning of the second prepCall argument is defined by the
            // TextRank library and is not visible here - confirm before changing it.
            textRank.prepCall(fullText, false);
            Collection<MetricVector> results = textRank.call();
            if (null == results) {
                return;
            }
            int nSize = results.size();
            if (null == partialDoc.getEntities()) {
                partialDoc.setEntities(new ArrayList<EntityPojo>(nSize));
            }
            for (MetricVector res : results) {
                if (isFilteredOut(nSize, res.metric)) {
                    continue;
                }
                partialDoc.getEntities().add(toKeywordEntity(res));
            }
        }
        catch (Exception e) {
            // Deliberately best-effort: a failure must not abort processing of the document.
            // Logged at FINE so it stays silent by default but can be enabled for diagnosis.
            LOGGER.log(java.util.logging.Level.FINE, "TextRank keyword extraction failed", e);
        }
    }

    /**
     * Decides whether a result should be skipped: non-finite scores always are, and
     * low scores are when the overall result set is large.
     */
    private static boolean isFilteredOut(int resultCount, double metric) {
        if (Double.isInfinite(metric) || Double.isNaN(metric)) {
            return true;
        }
        if ((resultCount > BASIC_FILTER_RESULT_COUNT) && (metric < BASIC_FILTER_MIN_METRIC)) {
            return true; // some very basic filtering
        }
        if ((resultCount > AGGRESSIVE_FILTER_RESULT_COUNT) && (metric < AGGRESSIVE_FILTER_MIN_METRIC)) {
            return true; // more aggressive filtering
        }
        return false;
    }

    /**
     * Builds a "Keyword" entity in the "What" dimension from a single TextRank
     * result, using the result text for both names and the metric as relevance.
     */
    private static EntityPojo toKeywordEntity(MetricVector res) {
        EntityPojo entity = new EntityPojo();
        entity.setDimension(EntityPojo.Dimension.What);
        entity.setType("Keyword");
        entity.setDisambiguatedName(res.value.text);
        entity.setActual_name(res.value.text);
        entity.setFrequency(1L);
        entity.setRelevance(res.metric);
        return entity;
    }

    /**
     * Not implemented: this extractor cannot extract from a URL, so the call is a
     * no-op and the document is left unchanged.
     *
     * @param partialDoc ignored
     */
    @Override
    public void extractEntitiesAndText(DocumentPojo partialDoc)
            throws ExtractorDailyLimitExceededException,
            ExtractorDocumentLevelException {
        //cannot extract from url, not implemented
    }

    /**
     * @param capability ignored
     * @return always {@code null}
     */
    @Override
    public String getCapability(EntityExtractorEnum capability) {
        return null;
    }
}