Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a toggle for saving the WARC data generated by the search engine's crawler. Normally this is discarded, but for debugging or archival purposes, retaining it may be of interest. The warc files are concatenated into larger archives, up to about 1 GB each. An index is also created containing filenames, domain names, offsets and sizes to help navigate these larger archives. The warc data is saved in a directory warc/ under the crawl data storage.
- Loading branch information
Showing
10 changed files
with
174 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1 change: 1 addition & 0 deletions
1
code/common/db/src/main/resources/db/migration/V24_01_0_001__node_config__keep_warc.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
-- Adds the boolean column KEEP_WARCS to NODE_CONFIGURATION, defaulting to FALSE.
ALTER TABLE WMSA_prod.NODE_CONFIGURATION ADD COLUMN KEEP_WARCS BOOLEAN DEFAULT FALSE; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
122 changes: 122 additions & 0 deletions
122
...rocesses/crawling-process/src/main/java/nu/marginalia/crawl/warc/WarcArchiverFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
package nu.marginalia.crawl.warc; | ||
|
||
import com.google.inject.Inject; | ||
import nu.marginalia.ProcessConfiguration; | ||
import nu.marginalia.nodecfg.NodeConfigurationService; | ||
import org.apache.commons.io.IOUtils; | ||
|
||
import java.io.IOException; | ||
import java.io.OutputStream; | ||
import java.io.PrintWriter; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.time.LocalDateTime; | ||
import java.time.format.DateTimeFormatter; | ||
|
||
/** Factory for creating WarcArchiverIf instances. Depending on the node's configuration, | ||
* either a shredder instance that just discards the Warc file, or a persisting instance | ||
* that creates a series of concatenated warc.gz-files with an index | ||
*/ | ||
public class WarcArchiverFactory { | ||
private final boolean keepWarcs; | ||
|
||
@Inject | ||
public WarcArchiverFactory(ProcessConfiguration processConfiguration, | ||
NodeConfigurationService nodeConfigurationService) | ||
throws Exception | ||
{ | ||
keepWarcs = nodeConfigurationService.get(processConfiguration.node()).keepWarcs(); | ||
} | ||
|
||
public WarcArchiverIf get(Path outputDir) throws IOException { | ||
if (!keepWarcs) { | ||
return new WarcArchiverShredder(); | ||
} else { | ||
return new WarcArchiver(outputDir); | ||
} | ||
} | ||
|
||
} | ||
|
||
/** Dummy archiver that just deletes the warc file. */ | ||
class WarcArchiverShredder implements WarcArchiverIf { | ||
@Override | ||
public void consumeWarc(Path warcFile, String domain) throws IOException { | ||
Files.deleteIfExists(warcFile); | ||
} | ||
|
||
@Override | ||
public void close() {} | ||
} | ||
|
||
/** Archives warc files to disk. Concatenates all warc files into a single | ||
* warc file, and creates an index file with the offsets and lengths of | ||
* each domain segment. | ||
* */ | ||
class WarcArchiver implements WarcArchiverIf { | ||
// Specs say the recommended maximum size of a warc file is ~1GB, after which a new file should be created | ||
private static final long MAX_COMBINED_WARC_FILE_SIZE = 1_000_000_000; | ||
|
||
|
||
private PrintWriter indexWriter; | ||
private OutputStream warcWriter; | ||
private final Path warcDir; | ||
|
||
String warcFileName = null; | ||
String ts = LocalDateTime.now() | ||
.format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) | ||
.replace(':', '-'); | ||
|
||
long pos = 0; | ||
int fileCounter = 0; | ||
|
||
public WarcArchiver(Path outputDir) throws IOException { | ||
warcDir = outputDir.resolve("warc"); | ||
|
||
if (!Files.exists(warcDir)) { | ||
Files.createDirectories(warcDir); | ||
} | ||
|
||
switchFile(); | ||
} | ||
|
||
private void switchFile() throws IOException { | ||
if (warcWriter != null) warcWriter.close(); | ||
|
||
warcFileName = "marginalia-crawl-" + ts + "--" + String.format("%04d", fileCounter++) + ".warc.gz"; | ||
|
||
warcWriter = Files.newOutputStream(warcDir.resolve(warcFileName)); | ||
|
||
if (indexWriter == null) { | ||
Path indexFile = warcDir.resolve("marginalia-crawl-" + ts + ".idx"); | ||
indexWriter = new PrintWriter(Files.newBufferedWriter(indexFile)); | ||
} | ||
} | ||
|
||
@Override | ||
public void consumeWarc(Path warcFile, String domain) throws IOException { | ||
try { | ||
synchronized (this) { | ||
// Specs say the recommended maximum size of a warc file is ~1GB | ||
if (pos > MAX_COMBINED_WARC_FILE_SIZE) { | ||
switchFile(); | ||
} | ||
|
||
indexWriter.printf("%s %s %d %d\n", warcFileName, domain, pos, Files.size(warcFile)); | ||
indexWriter.flush(); | ||
try (var is = Files.newInputStream(warcFile)) { | ||
pos += IOUtils.copy(is, warcWriter); | ||
} | ||
} | ||
} | ||
finally { | ||
Files.deleteIfExists(warcFile); | ||
} | ||
} | ||
|
||
@Override | ||
public void close() throws IOException { | ||
if (warcWriter != null) warcWriter.close(); | ||
if (indexWriter != null) indexWriter.close(); | ||
} | ||
} |
12 changes: 12 additions & 0 deletions
12
code/processes/crawling-process/src/main/java/nu/marginalia/crawl/warc/WarcArchiverIf.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
package nu.marginalia.crawl.warc; | ||
|
||
import java.io.IOException; | ||
import java.nio.file.Path; | ||
|
||
/** Receiver of warc files, which are handed over one at a time together with their domain name. */
public interface WarcArchiverIf extends AutoCloseable {

    /**
     * Processes the warc file.  The file is deleted once it has been processed.
     * Depending on the implementation, the processing itself may be a no-op.
     *
     * @param warcFile the warc file to process
     * @param domain   the domain name associated with the warc file
     * @throws IOException if processing fails
     */
    void consumeWarc(Path warcFile, String domain) throws IOException;

    /**
     * Closes the archiver.  Narrows the exception declared by {@link AutoCloseable#close()}
     * to {@link IOException}.
     */
    @Override
    void close() throws IOException;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters