From 5f5a4436e267eba593dd6ecd0a8dd5617759d3cc Mon Sep 17 00:00:00 2001 From: Claus Stadler Date: Thu, 29 Jun 2017 14:19:22 +0200 Subject: [PATCH] added new feature to replicate sequences script --- README.md | 11 ++++++-- .../command/osm/CommandOsmReplicateSequences.java | 19 ++++++++----- .../osm/replication/dao/OsmRepoDao.java | 2 ++ .../osm/replication/dao/OsmRepoDaoImpl.java | 31 ++++++++++++++-------- linkedgeodata-docker/.env | 5 +++- linkedgeodata-docker/docker-compose.yml | 2 -- .../lgd-nominatim-sync/local.php.dist | 8 +++--- linkedgeodata-docker/lgd-osm-sync/start.sh | 2 +- 8 files changed, 53 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index b9e138b..3498c7b 100644 --- a/README.md +++ b/README.md @@ -126,13 +126,13 @@ Again, note that Sparqlify is still in development and the supported features ar * `lgd-osm-replicate-sequences`: Convert a timestamp to a sequence ID. This is similar to [mazdermind's replicate sequences tool](https://github.com/MaZderMind/replicate-sequences), however, our version does not require a local index. Instead, our tools combines binary search with linear interpolation: First, the the two most recent state.txt files from the given repository url are fetched, then the time differnce is computed, and based on linear interpolation a sequence id close to the given timetstamp is computed. This process is repeated recursively. 
```bash -lgd-osm-replicate-sequences -u "http://planet.openstreetmap.org/replication/hour/" -d "2017-05-28T15:00:00Z" +lgd-osm-replicate-sequences -u "http://planet.openstreetmap.org/replication/hour/" -t "2017-05-28T15:00:00Z" # The above command from the debian package is a wrapper for: java -cp linkedgeodata-debian/target/linkedgeodata-debian-*-jar-with-dependencies.jar \ "org.aksw.linkedgeodata.cli.command.osm.CommandOsmReplicateSequences" \ - -u "http://planet.openstreetmap.org/replication/hour/" -d "2017-05-28T15:00:00Z" + -u "http://planet.openstreetmap.org/replication/hour/" -t "2017-05-28T15:00:00Z" ``` The output is a (presently subset) of the appropriate state.txt file whose timestamp is strictly less than that given as the argument. ``` @@ -146,6 +146,13 @@ timestamp=`osmconvert --out-timestamp "data.osm.pbf"` lgd-osm-replicate-sequences -u "url-to-repo" -t "$timestamp" ``` +```bash +# Use the -d option to obtain the (d)uration between the most recently published files +lgd-osm-replicate-sequences -u "http://planet.openstreetmap.org/replication/day/" -d +# This simply yields the output (possibly off by a few seconds) +# 86400 +``` + ### Postgresql Database Tuning It is recommended to tune the database according to [these recommendations](http://wiki.postgresql.org/wiki/Tuning_Your_PostgreSQL_Server).
Here is a brief summary: Edit `/etc/postgresql/9.1/main/postgresql.conf` and set the following properties: diff --git a/linkedgeodata-cli/src/main/java/org/aksw/linkedgeodata/cli/command/osm/CommandOsmReplicateSequences.java b/linkedgeodata-cli/src/main/java/org/aksw/linkedgeodata/cli/command/osm/CommandOsmReplicateSequences.java index aa5bde6..179c199 100644 --- a/linkedgeodata-cli/src/main/java/org/aksw/linkedgeodata/cli/command/osm/CommandOsmReplicateSequences.java +++ b/linkedgeodata-cli/src/main/java/org/aksw/linkedgeodata/cli/command/osm/CommandOsmReplicateSequences.java @@ -16,9 +16,13 @@ @Parameter(names = {"-u", "-url"}, description = "OSM Repository base URL") public String osmReplicationRepoBaseUrl = null; - @Parameter(names = {"-d", "-date"}, description = "Timestamp") + @Parameter(names = {"-t", "-timestamp"}, description = "Timestamp") public String timestamp = null; + @Parameter(names = {"-d", "-duration"}, description = "Duration") + public Boolean returnDuration = false; + //public String cmd + public static void main(String[] args) throws Exception { CommandOsmReplicateSequences options = new CommandOsmReplicateSequences(); @@ -27,14 +31,17 @@ public static void main(String[] args) throws Exception { OsmRepoDao repoDao = OsmRepoDaoImpl.create(options.osmReplicationRepoBaseUrl); - Instant instant = Instant.parse(options.timestamp); - + if(options.returnDuration) { + System.out.println(repoDao.getUpdateInterval().getSeconds()); + } else { // By default, print the state file for the timestamp + Instant instant = Instant.parse(options.timestamp); - State state = repoDao.findState(instant); - java.util.Properties properties = StateImpl.toProperties(new Properties(), state); - properties.store(System.out, null); + State state = repoDao.findState(instant); + java.util.Properties properties = StateImpl.toProperties(new Properties(), state); + properties.store(System.out, null); + } // if(commandLineArgs.isHelp()) // { diff --git 
a/linkedgeodata-core/src/main/java/org/aksw/linkedgeodata/osm/replication/dao/OsmRepoDao.java b/linkedgeodata-core/src/main/java/org/aksw/linkedgeodata/osm/replication/dao/OsmRepoDao.java index 66bbcdd..893a8e7 100644 --- a/linkedgeodata-core/src/main/java/org/aksw/linkedgeodata/osm/replication/dao/OsmRepoDao.java +++ b/linkedgeodata-core/src/main/java/org/aksw/linkedgeodata/osm/replication/dao/OsmRepoDao.java @@ -1,5 +1,6 @@ package org.aksw.linkedgeodata.osm.replication.dao; +import java.time.Duration; import java.time.Instant; /** @@ -20,4 +21,5 @@ // State getState(long seqId) throws Exception; State findState(Instant searchTimestamp) throws Exception; + Duration getUpdateInterval(); } diff --git a/linkedgeodata-core/src/main/java/org/aksw/linkedgeodata/osm/replication/dao/OsmRepoDaoImpl.java b/linkedgeodata-core/src/main/java/org/aksw/linkedgeodata/osm/replication/dao/OsmRepoDaoImpl.java index 17fdf9c..b9b3a7d 100644 --- a/linkedgeodata-core/src/main/java/org/aksw/linkedgeodata/osm/replication/dao/OsmRepoDaoImpl.java +++ b/linkedgeodata-core/src/main/java/org/aksw/linkedgeodata/osm/replication/dao/OsmRepoDaoImpl.java @@ -1,6 +1,7 @@ package org.aksw.linkedgeodata.osm.replication.dao; import java.net.URI; +import java.time.Duration; import java.time.Instant; import java.time.temporal.ChronoUnit; import java.util.ArrayList; @@ -13,13 +14,18 @@ implements OsmRepoDao { protected OsmRepoCoreDao coreDao; - protected long avgUpdateIntervalInSec; + protected Duration updateInterval; - public OsmRepoDaoImpl(OsmRepoCoreDao repoAccessor, long avgUpdateIntervalInSec) { + public OsmRepoDaoImpl(OsmRepoCoreDao repoAccessor, Duration updateInterval) { super(); this.coreDao = repoAccessor; - this.avgUpdateIntervalInSec = avgUpdateIntervalInSec; + this.updateInterval = updateInterval; + } + + @Override + public Duration getUpdateInterval() { + return updateInterval; } @Override @@ -45,12 +51,12 @@ public State findState(Instant searchTimestamp) throws Exception { State 
currentState = this.getMostRecentState(); Range range = Range.closedOpen(0l, currentState.getSeqId()); - State result = findStatePreceedingTimestamp(currentState, searchTimestamp, avgUpdateIntervalInSec, range); + State result = findStatePreceedingTimestamp(currentState, searchTimestamp, updateInterval, range); return result; } - public State findStatePreceedingTimestamp(State currentState, Instant searchTimestamp, long avgUpdateIntervalInS, Range seqIdRange) throws Exception { + public State findStatePreceedingTimestamp(State currentState, Instant searchTimestamp, Duration updateInterval, Range seqIdRange) throws Exception { State result = null; Range subRange; @@ -63,7 +69,7 @@ public State findStatePreceedingTimestamp(State currentState, Instant searchTime //Instant lowerTimestamp = timestamp.toInstant(); // Interpolate the sequence id of the lower state - long delta = (long)(ChronoUnit.SECONDS.between(searchTimestamp, currentTimestamp) / (double)avgUpdateIntervalInS); + long delta = (long)(ChronoUnit.SECONDS.between(searchTimestamp, currentTimestamp) / (double)updateInterval.getSeconds()); // If the searchTimestamp is after the checkTimestamp, we need to go further back by one updateInterval if(delta == 0) { if(searchTimestamp.compareTo(currentTimestamp) < 0) { @@ -87,13 +93,13 @@ public State findStatePreceedingTimestamp(State currentState, Instant searchTime State checkState = this.getState(lowerSeqId); //Instant checkTimestamp = checkState.getTimestamp().toInstant(); - result = findStatePreceedingTimestamp(checkState, searchTimestamp, avgUpdateIntervalInS, subRange); + result = findStatePreceedingTimestamp(checkState, searchTimestamp, updateInterval, subRange); } } return result; } - public static double determineUpdateIntervalInSec(OsmRepoCoreDao repoCoreDao) throws Exception { + public static Duration determineUpdateIntervalInSec(OsmRepoCoreDao repoCoreDao) throws Exception { int n = 2; List instants = new ArrayList<>(n); State latest = 
repoCoreDao.getMostRecentState(); @@ -103,11 +109,12 @@ public static double determineUpdateIntervalInSec(OsmRepoCoreDao repoCoreDao) th instants.add(state.getTimestamp().toInstant()); } - double result = IntStream.range(0, n - 1) + double tmp = IntStream.range(0, n - 1) .mapToLong(i -> ChronoUnit.SECONDS.between(instants.get(i + 1), instants.get(i))) .average() .getAsDouble(); + Duration result = Duration.ofSeconds((long)tmp); return result; } @@ -120,9 +127,11 @@ public static OsmRepoDao create(String repoBaseUriStr) throws Exception { } public static OsmRepoDao create(OsmRepoCoreDao coreDao) throws Exception { - long updateIntervalInSec = (long)determineUpdateIntervalInSec(coreDao); + Duration updateInterval = determineUpdateIntervalInSec(coreDao); - OsmRepoDao result = new OsmRepoDaoImpl(coreDao, updateIntervalInSec); + OsmRepoDao result = new OsmRepoDaoImpl(coreDao, updateInterval); return result; } } + + diff --git a/linkedgeodata-docker/.env b/linkedgeodata-docker/.env index 861a495..dee0429 100644 --- a/linkedgeodata-docker/.env +++ b/linkedgeodata-docker/.env @@ -21,5 +21,8 @@ DB_MAINTENANCE_WORK_MEM=256MB OSM_DATA_BASE_URL=http://downloads.linkedgeodata.org/debugging/monaco-170618.osm.pbf OSM_DATA_SYNC_URL=http://download.geofabrik.de/europe/monaco-updates/ -OSM_DATA_SYNC_SLEEP=600 + + +#OSM_DATA_SYNC_RECHECK_INTERVAL=900 +#OSM_DATA_SYNC_UPDATE_INTERVAL=3600 diff --git a/linkedgeodata-docker/docker-compose.yml b/linkedgeodata-docker/docker-compose.yml index 2435793..917f1e6 100644 --- a/linkedgeodata-docker/docker-compose.yml +++ b/linkedgeodata-docker/docker-compose.yml @@ -36,7 +36,6 @@ services: environment: - OSM_DATA_BASE_URL=${OSM_DATA_BASE_URL} - OSM_DATA_SYNC_URL=${OSM_DATA_SYNC_URL} - - OSM_DATA_SYNC_SLEEP=${OSM_DATA_SYNC_SLEEP} depends_on: lgd-db: condition: service_healthy @@ -69,7 +68,6 @@ services: environment: - OSM_DATA_BASE_URL=${OSM_DATA_BASE_URL} - OSM_DATA_SYNC_URL=${OSM_DATA_SYNC_URL} - - OSM_DATA_SYNC_SLEEP=${OSM_DATA_SYNC_SLEEP} 
depends_on: lgd-db: condition: service_healthy diff --git a/linkedgeodata-docker/lgd-nominatim-sync/local.php.dist b/linkedgeodata-docker/lgd-nominatim-sync/local.php.dist index 63831bd..3ff1ed1 100644 --- a/linkedgeodata-docker/lgd-nominatim-sync/local.php.dist +++ b/linkedgeodata-docker/lgd-nominatim-sync/local.php.dist @@ -7,8 +7,8 @@ // Website settings @define('CONST_Database_DSN', 'pgsql://lgd:lgdpwd@lgd-db:5432/lgd'); // ://:@:/ @define('CONST_Website_BaseURL', '/'); - @define('CONST_Replication_Url', '${OSM_DATA_SYNC_URL'); - @define('CONST_Replication_MaxInterval', '86400'); // Process each update separately, osmosis cannot merge multiple updates - @define('CONST_Replication_Update_Interval', '86400'); // How often upstream publishes diffs - @define('CONST_Replication_Recheck_Interval', '900'); // How long to sleep if no update found yet + @define('CONST_Replication_Url', '${OSM_DATA_SYNC_URL}'); + @define('CONST_Replication_MaxInterval', '${OSM_DATA_SYNC_UPDATE_INTERVAL}'); // Process each update separately, osmosis cannot merge multiple updates + @define('CONST_Replication_Update_Interval', '${OSM_DATA_SYNC_UPDATE_INTERVAL}'); // How often upstream publishes diffs + @define('CONST_Replication_Recheck_Interval', '${OSM_DATA_SYNC_RECHECK_INTERVAL}'); // How long to sleep if no update found yet ?> diff --git a/linkedgeodata-docker/lgd-osm-sync/start.sh b/linkedgeodata-docker/lgd-osm-sync/start.sh index c2cc237..e915523 100644 --- a/linkedgeodata-docker/lgd-osm-sync/start.sh +++ b/linkedgeodata-docker/lgd-osm-sync/start.sh @@ -27,7 +27,7 @@ if [ -z "$statusVal" ]; then timestamp=`osmconvert --out-timestamp "$syncDir/data.osm.pbf"` #curl "https://osm.mazdermind.de/replicate-sequences/?$timestamp" > sync/state.txt - lgd-osm-replicate-sequences -u "$OSM_DATA_SYNC_URL" -d "$timestamp" > "$syncDir/state.txt" + lgd-osm-replicate-sequences -u "$OSM_DATA_SYNC_URL" -t "$timestamp" > "$syncDir/state.txt" # TODO Fix lgd-createdb to include port