From dc6a572f408a4126b3e9a2e7fdc62f06f06ef719 Mon Sep 17 00:00:00 2001 From: Adam Sotona Date: Thu, 23 Aug 2018 13:32:33 +0200 Subject: [PATCH] Adding an option for Halyard Stats to calculate/update statistics of a particular named graph context --- .../gin/halyard/common/HalyardTableUtils.java | 2 +- docs/tools.md | 5 +- .../msd/gin/halyard/tools/HalyardStats.java | 58 +++++++++++---- .../gin/halyard/tools/HalyardStatsTest.java | 71 ++++++++++++++++++- .../msd/gin/halyard/tools/testMoreData.trig | 16 +++++ .../halyard/tools/testStatsMoreTarget.trig | 66 +++++++++++++++++ .../halyard/tools/testStatsTargetPartial.trig | 30 ++++++++ 7 files changed, 233 insertions(+), 15 deletions(-) create mode 100644 tools/src/test/resources/com/msd/gin/halyard/tools/testMoreData.trig create mode 100644 tools/src/test/resources/com/msd/gin/halyard/tools/testStatsMoreTarget.trig create mode 100644 tools/src/test/resources/com/msd/gin/halyard/tools/testStatsTargetPartial.trig diff --git a/common/src/main/java/com/msd/gin/halyard/common/HalyardTableUtils.java b/common/src/main/java/com/msd/gin/halyard/common/HalyardTableUtils.java index c882cba3a..c25306ef1 100644 --- a/common/src/main/java/com/msd/gin/halyard/common/HalyardTableUtils.java +++ b/common/src/main/java/com/msd/gin/halyard/common/HalyardTableUtils.java @@ -102,7 +102,7 @@ public final class HalyardTableUtils { private static final int PREFIXES = 3; private static final byte[] START_KEY = new byte[20]; - static final byte[] STOP_KEY = new byte[20]; + public static final byte[] STOP_KEY = new byte[20]; static { Arrays.fill(START_KEY, (byte)0); Arrays.fill(STOP_KEY, (byte)0xff); /* 0xff is 255 in decimal */ diff --git a/docs/tools.md b/docs/tools.md index ee3c4e3f8..4a480f3e8 100644 --- a/docs/tools.md +++ b/docs/tools.md @@ -251,7 +251,8 @@ hdfs:///my_tmp_workdir -t mydataset ### Halyard Stats ``` $ ./halyard stats -h -usage: halyard stats [-h] [-v] -s [-t ] [-r ] [-g ] +usage: halyard stats [-h] [-v] -s [-t ] [-r ] [-c ] + [-g ] Halyard Stats is a MapReduce application that calculates dataset statistics and stores them in the named graph within the dataset or exports them into a file. The generated statistics are described by the VoID vocabulary, its extensions, and the SPARQL 1.1 Service Description. @@ -263,6 +264,8 @@ by the VoID vocabulary, its extensions, and the SPARQL 1.1 Service Description. hdfs:///[{0}].[.] -r,--threshold Optional minimal size of a named graph to calculate statistics for (default is 1000) + -c,--graph-context Optional restrict stats calculation to the given named graph + context only -g,--target-graph Optional target graph context of the exported statistics (default is 'http://merck.github.io/Halyard/ns#statsContext') Example: halyard stats -s my_dataset [-g 'http://whatever/mystats'] [-t diff --git a/tools/src/main/java/com/msd/gin/halyard/tools/HalyardStats.java b/tools/src/main/java/com/msd/gin/halyard/tools/HalyardStats.java index e6851465e..bea682998 100644 --- a/tools/src/main/java/com/msd/gin/halyard/tools/HalyardStats.java +++ b/tools/src/main/java/com/msd/gin/halyard/tools/HalyardStats.java @@ -31,8 +31,10 @@ import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.text.MessageFormat; +import java.util.ArrayList; import java.util.Arrays; import java.util.Base64; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.WeakHashMap; @@ -47,6 +49,8 @@ import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.filter.MultiRowRangeFilter; +import org.apache.hadoop.hbase.filter.MultiRowRangeFilter.RowRange; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil; import org.apache.hadoop.hbase.mapreduce.TableMapper; @@ -83,6 +87,7 @@ public final class HalyardStats extends AbstractHalyardTool { private static final String SOURCE = "halyard.stats.source"; private static final String TARGET = "halyard.stats.target"; private static final String THRESHOLD = "halyard.stats.threshold"; + private static final String TARGET_GRAPH = "halyard.stats.target.graph"; private static final String GRAPH_CONTEXT = "halyard.stats.graph.context"; private static final Charset UTF8 = Charset.forName("UTF-8"); @@ -95,7 +100,7 @@ static final class StatsMapper extends TableMapper 0) { + if (value > 0 && (graphContext == null || graphContext.equals(graph))) { ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (DataOutputStream dos = new DataOutputStream(baos)) { dos.writeUTF(graph.stringValue()); @@ -322,7 +337,7 @@ static final class StatsReducer extends Reducer(); } @@ -438,14 +455,20 @@ public HalyardStats() { addOption("s", "source-dataset", "dataset_table", "Source HBase table with Halyard RDF store", true, true); addOption("t", "target-file", "target_url", "Optional target file to export the statistics (instead of update) hdfs:///[{0}].[.]", false, true); addOption("r", "threshold", "size", "Optional minimal size of a named graph to calculate statistics for (default is 1000)", false, true); + addOption("c", "graph-context", "graph_context", "Optional restrict stats calculation to the given named graph context only", false, true); addOption("g", "target-graph", "target_graph", "Optional target graph context of the exported statistics (default is '" + HALYARD.STATS_GRAPH_CONTEXT.stringValue() + "')", false, true); } + private static RowRange rowRange(byte prefix, byte[] hash) { + return new RowRange(HalyardTableUtils.concat(prefix, false, hash), true, HalyardTableUtils.concat(prefix, true, hash, HalyardTableUtils.STOP_KEY, HalyardTableUtils.STOP_KEY, HalyardTableUtils.STOP_KEY), true); + } + @Override public int run(CommandLine cmd) throws Exception { String source = cmd.getOptionValue('s'); String target = cmd.getOptionValue('t'); - String graph = cmd.getOptionValue('g'); + String targetGraph = cmd.getOptionValue('g'); + String graphContext = cmd.getOptionValue('c'); String thresh = cmd.getOptionValue('r'); TableMapReduceUtil.addDependencyJars(getConf(), HalyardExport.class, @@ -463,7 +486,8 @@ public int run(CommandLine cmd) throws Exception { Job job = Job.getInstance(getConf(), "HalyardStats " + source + (target == null ? " update" : " -> " + target)); job.getConfiguration().set(SOURCE, source); if (target != null) job.getConfiguration().set(TARGET, target); - if (graph != null) job.getConfiguration().set(GRAPH_CONTEXT, graph); + if (targetGraph != null) job.getConfiguration().set(TARGET_GRAPH, targetGraph); + if (graphContext != null) job.getConfiguration().set(GRAPH_CONTEXT, graphContext); if (thresh != null) job.getConfiguration().setLong(THRESHOLD, Long.parseLong(thresh)); job.setJarByClass(HalyardStats.class); TableMapReduceUtil.initCredentials(job); @@ -473,7 +497,17 @@ public int run(CommandLine cmd) throws Exception { scan.setMaxVersions(1); scan.setBatch(10); scan.setAllowPartialResults(true); - + if (graphContext != null) { //restricting stats to scan given graph context only + List ranges = new ArrayList<>(); + byte[] gcHash = HalyardTableUtils.hashKey(NTriplesUtil.toNTriplesString(SimpleValueFactory.getInstance().createIRI(graphContext)).getBytes(UTF8)); + ranges.add(rowRange(HalyardTableUtils.CSPO_PREFIX, gcHash)); + ranges.add(rowRange(HalyardTableUtils.CPOS_PREFIX, gcHash)); + ranges.add(rowRange(HalyardTableUtils.COSP_PREFIX, gcHash)); + if (target == null) { //add stats context to the scanned row ranges (when in update mode) to delete the related stats during MapReduce + ranges.add(rowRange(HalyardTableUtils.CSPO_PREFIX, HalyardTableUtils.hashKey(NTriplesUtil.toNTriplesString(targetGraph == null ? HALYARD.STATS_GRAPH_CONTEXT : SimpleValueFactory.getInstance().createIRI(targetGraph)).getBytes(UTF8)))); + } + scan.setFilter(new MultiRowRangeFilter(ranges)); + } TableMapReduceUtil.initTableMapperJob( source, scan, diff --git a/tools/src/test/java/com/msd/gin/halyard/tools/HalyardStatsTest.java b/tools/src/test/java/com/msd/gin/halyard/tools/HalyardStatsTest.java index 1bcaba3d3..82ddbb7d4 100644 --- a/tools/src/test/java/com/msd/gin/halyard/tools/HalyardStatsTest.java +++ b/tools/src/test/java/com/msd/gin/halyard/tools/HalyardStatsTest.java @@ -146,6 +146,8 @@ static void assertEqualModels(Set ref, Set m) { public void testStatsUpdate() throws Exception { final HBaseSail sail = new HBaseSail(HBaseServerTestInstance.getInstanceConfig(), "statsTable2", true, -1, true, 0, null, null); sail.initialize(); + + //load test data try (InputStream ref = HalyardStatsTest.class.getResourceAsStream("testData.trig")) { RDFParser p = Rio.createParser(RDFFormat.TRIG); p.setPreserveBNodeIDs(true); @@ -158,20 +160,87 @@ public void handleStatement(Statement st) throws RDFHandlerException { } sail.commit(); + //update stats assertEquals(0, ToolRunner.run(HBaseServerTestInstance.getInstanceConfig(), new HalyardStats(), new String[]{"-s", "statsTable2", "-r", "100"})); + //verify with golden file Set statsM = new HashSet<>(); try (CloseableIteration it = sail.getStatements(null, null, null, true, HALYARD.STATS_GRAPH_CONTEXT)) { while (it.hasNext()) { statsM.add(it.next()); } } - sail.close(); try (InputStream refStream = HalyardStatsTest.class.getResourceAsStream("testStatsTarget.trig")) { Model refM = Rio.parse(refStream, "", RDFFormat.TRIG, new ParserConfig().set(BasicParserSettings.PRESERVE_BNODE_IDS, true), SimpleValueFactory.getInstance(), new ParseErrorLogger()); assertEqualModels(refM, statsM); } + + //load additional data + try (InputStream ref = HalyardStatsTest.class.getResourceAsStream("testMoreData.trig")) { + RDFParser p = Rio.createParser(RDFFormat.TRIG); + p.setPreserveBNodeIDs(true); + p.setRDFHandler(new AbstractRDFHandler() { + @Override + public void handleStatement(Statement st) throws RDFHandlerException { + sail.addStatement(st.getSubject(), st.getPredicate(), st.getObject(), st.getContext()); + } + }).parse(ref, ""); + } + sail.commit(); + + //update stats only for graph1 + assertEquals(0, ToolRunner.run(HBaseServerTestInstance.getInstanceConfig(), new HalyardStats(), + new String[]{"-s", "statsTable2", "-r", "100", "-c", "http://whatever/graph1"})); + + //verify with golden file + statsM = new HashSet<>(); + try (CloseableIteration it = sail.getStatements(null, null, null, true, HALYARD.STATS_GRAPH_CONTEXT)) { + while (it.hasNext()) { + statsM.add(it.next()); + } + } + try (InputStream refStream = HalyardStatsTest.class.getResourceAsStream("testStatsMoreTarget.trig")) { + Model refM = Rio.parse(refStream, "", RDFFormat.TRIG, new ParserConfig().set(BasicParserSettings.PRESERVE_BNODE_IDS, true), SimpleValueFactory.getInstance(), new ParseErrorLogger()); + assertEqualModels(refM, statsM); + } + + sail.close(); + } + + @Test + public void testStatsTargetPartial() throws Exception { + final HBaseSail sail = new HBaseSail(HBaseServerTestInstance.getInstanceConfig(), "statsTable3", true, -1, true, 0, null, null); + sail.initialize(); + try (InputStream ref = HalyardStatsTest.class.getResourceAsStream("testData.trig")) { + RDFParser p = Rio.createParser(RDFFormat.TRIG); + p.setPreserveBNodeIDs(true); + p.setRDFHandler(new AbstractRDFHandler() { + @Override + public void handleStatement(Statement st) throws RDFHandlerException { + sail.addStatement(st.getSubject(), st.getPredicate(), st.getObject(), st.getContext()); + } + }).parse(ref, ""); + } + sail.commit(); + sail.close(); + + File root = File.createTempFile("test_stats", ""); + root.delete(); + root.mkdirs(); + + assertEquals(0, ToolRunner.run(HBaseServerTestInstance.getInstanceConfig(), new HalyardStats(), + new String[]{"-s", "statsTable3", "-t", root.toURI().toURL().toString() + "stats{0}.trig", "-r", "100", "-g", "http://whatever/myStats", "-c", "http://whatever/graph0"})); + + File stats = new File(root, "stats0.trig"); + assertTrue(stats.isFile()); + try (InputStream statsStream = new FileInputStream(stats)) { + try (InputStream refStream = HalyardStatsTest.class.getResourceAsStream("testStatsTargetPartial.trig")) { + Model statsM = Rio.parse(statsStream, "", RDFFormat.TRIG, new ParserConfig().set(BasicParserSettings.PRESERVE_BNODE_IDS, true), SimpleValueFactory.getInstance(), new ParseErrorLogger()); + Model refM = Rio.parse(refStream, "", RDFFormat.TRIG, new ParserConfig().set(BasicParserSettings.PRESERVE_BNODE_IDS, true), SimpleValueFactory.getInstance(), new ParseErrorLogger(), SimpleValueFactory.getInstance().createIRI("http://whatever/myStats")); + assertEqualModels(refM, statsM); + } + } } public void testRunNoArgs() throws Exception { diff --git a/tools/src/test/resources/com/msd/gin/halyard/tools/testMoreData.trig b/tools/src/test/resources/com/msd/gin/halyard/tools/testMoreData.trig new file mode 100644 index 000000000..6b6d2f364 --- /dev/null +++ b/tools/src/test/resources/com/msd/gin/halyard/tools/testMoreData.trig @@ -0,0 +1,16 @@ +@prefix : . +{ + :subjA :predB "whatever value ABC". +} + +:graph0 { + :subjA :predB "whatever value ABC". +} + +:graph1 { + :subjA :predB "whatever value ABC". +} + +:graph2 { + :subjA :predB "whatever value ABC". +} diff --git a/tools/src/test/resources/com/msd/gin/halyard/tools/testStatsMoreTarget.trig b/tools/src/test/resources/com/msd/gin/halyard/tools/testStatsMoreTarget.trig new file mode 100644 index 000000000..27025aa35 --- /dev/null +++ b/tools/src/test/resources/com/msd/gin/halyard/tools/testStatsMoreTarget.trig @@ -0,0 +1,66 @@ +@prefix : . +@prefix halyard: . +@prefix void: . +@prefix void-ext: . +@prefix xsd: . +@prefix sd: . +@prefix rdf: . + +halyard:statsContext { + halyard:statsRoot a void:Dataset , sd:Dataset , sd:Graph ; + sd:defaultGraph halyard:statsRoot ; + sd:namedGraph :graph0 , :graph1 , :graph2 ; + void:classes "1000"^^xsd:long ; + void:triples "2001"^^xsd:long ; + void:propertyPartition halyard:statsRoot_property_v0EPmHVxqhkyM3Yh_Wfu7gMOZGU ; + void:properties "14"^^xsd:long ; + void-ext:distinctLiterals "857"^^xsd:long ; + void:distinctObjects "1957"^^xsd:long ; + void:distinctSubjects "191"^^xsd:long ; + void-ext:distinctBlankNodeObjects "100"^^xsd:long ; + void-ext:distinctBlankNodeSubjects "91"^^xsd:long ; + void-ext:distinctIRIReferenceObjects "1000"^^xsd:long ; + void-ext:distinctIRIReferenceSubjects "100"^^xsd:long . + + :graph0 sd:name :graph0 ; + sd:graph :graph0 ; + a sd:NamedGraph , sd:Graph , void:Dataset ; + void:classes "450"^^xsd:long ; + void:triples "900"^^xsd:long ; + void:propertyPartition :graph0_property_v0EPmHVxqhkyM3Yh_Wfu7gMOZGU ; + void:properties "14"^^xsd:long ; + void-ext:distinctLiterals "386"^^xsd:long ; + void:distinctObjects "886"^^xsd:long ; + void:distinctSubjects "91"^^xsd:long ; + void-ext:distinctBlankNodeObjects "50"^^xsd:long ; + void-ext:distinctBlankNodeSubjects "41"^^xsd:long ; + void-ext:distinctIRIReferenceObjects "450"^^xsd:long ; + void-ext:distinctIRIReferenceSubjects "50"^^xsd:long . + + :graph0_property_v0EPmHVxqhkyM3Yh_Wfu7gMOZGU a void:Dataset ; + void:property rdf:type ; + void:triples "450"^^xsd:long . + + :graph1 sd:name :graph1 ; + sd:graph :graph1 ; + a sd:NamedGraph , sd:Graph , void:Dataset ; + void:classes "450"^^xsd:long ; + void:triples "901"^^xsd:long ; + void:propertyPartition :graph1_property_v0EPmHVxqhkyM3Yh_Wfu7gMOZGU ; + void:properties "15"^^xsd:long ; + void-ext:distinctLiterals "387"^^xsd:long ; + void:distinctObjects "887"^^xsd:long ; + void:distinctSubjects "91"^^xsd:long ; + void-ext:distinctBlankNodeObjects "50"^^xsd:long ; + void-ext:distinctBlankNodeSubjects "40"^^xsd:long ; + void-ext:distinctIRIReferenceObjects "450"^^xsd:long ; + void-ext:distinctIRIReferenceSubjects "51"^^xsd:long . + + :graph1_property_v0EPmHVxqhkyM3Yh_Wfu7gMOZGU a void:Dataset ; + void:property rdf:type ; + void:triples "450"^^xsd:long . + + halyard:statsRoot_property_v0EPmHVxqhkyM3Yh_Wfu7gMOZGU a void:Dataset ; + void:property rdf:type ; + void:triples "1000"^^xsd:long . +} diff --git a/tools/src/test/resources/com/msd/gin/halyard/tools/testStatsTargetPartial.trig b/tools/src/test/resources/com/msd/gin/halyard/tools/testStatsTargetPartial.trig new file mode 100644 index 000000000..3a6b05775 --- /dev/null +++ b/tools/src/test/resources/com/msd/gin/halyard/tools/testStatsTargetPartial.trig @@ -0,0 +1,30 @@ +@prefix : . +@prefix halyard: . +@prefix void: . +@prefix void-ext: . +@prefix xsd: . +@prefix sd: . +@prefix rdf: . + +halyard:statsContext { + halyard:statsRoot sd:namedGraph :graph0 . + + :graph0 sd:name :graph0 ; + sd:graph :graph0 ; + a sd:NamedGraph , sd:Graph , void:Dataset ; + void:classes "450"^^xsd:long ; + void:triples "900"^^xsd:long ; + void:propertyPartition :graph0_property_v0EPmHVxqhkyM3Yh_Wfu7gMOZGU ; + void:properties "14"^^xsd:long ; + void-ext:distinctLiterals "386"^^xsd:long ; + void:distinctObjects "886"^^xsd:long ; + void:distinctSubjects "91"^^xsd:long ; + void-ext:distinctBlankNodeObjects "50"^^xsd:long ; + void-ext:distinctBlankNodeSubjects "41"^^xsd:long ; + void-ext:distinctIRIReferenceObjects "450"^^xsd:long ; + void-ext:distinctIRIReferenceSubjects "50"^^xsd:long . + + :graph0_property_v0EPmHVxqhkyM3Yh_Wfu7gMOZGU a void:Dataset ; + void:property rdf:type ; + void:triples "450"^^xsd:long . +}