(crawler) Fix a bug where reference copies of crawl data were written without etag and last-modified

This commit also adds a band-aid to ParquetSerializableCrawlDataStream that fetches these headers from the 304 entity. It can be removed in a few months.
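
In outline, the band-aid carries the ETag and Last-Modified values forward from a preceding 304 record when the 200 reference copy that follows it lacks them. A minimal sketch of that fallback, with Record as a hypothetical stand-in for CrawledDocumentParquetRecord (only the fields the fix touches; names follow the diff below):

    // Hypothetical stand-in for CrawledDocumentParquetRecord.
    record Record(String url, int httpStatus, String etagHeader, String lastModifiedHeader) {}

    static String[] headerFallback(Record previousRecord, Record nextRecord) {
        String etag = nextRecord.etagHeader();
        String lastModified = nextRecord.lastModifiedHeader();

        // Old crawls wrote the 200 reference copy without these headers,
        // but the 304 record immediately before it still carries them.
        if (previousRecord != null
                && previousRecord.url().equals(nextRecord.url())
                && previousRecord.httpStatus() == 304
                && nextRecord.httpStatus() == 200) {
            etag = previousRecord.etagHeader();
            lastModified = previousRecord.lastModifiedHeader();
        }

        return new String[] { etag, lastModified };
    }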
vlofgren committed Jan 18, 2024
1 parent 9644198 commit 22c8fb3
Showing 4 changed files with 41 additions and 19 deletions.
@@ -103,6 +103,8 @@ private void createDomainRecord(CrawledDocumentParquetRecord parquetRecord) thro
         ));
     }
 
+    private CrawledDocumentParquetRecord previousRecord = null;
+
     private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
         String bodyString = "";
         CrawlerDocumentStatus status = CrawlerDocumentStatus.OK;
@@ -130,6 +132,24 @@ else if (nextRecord.body != null) {
             status = CrawlerDocumentStatus.ERROR;
         }
 
+        String etag = nextRecord.etagHeader;
+        String lastModified = nextRecord.lastModifiedHeader;
+
+        // If we have a previous record, and it was a 304, and this one is a 200, we'll use the ETag and Last-Modified
+        // from the previous record, as it's not guaranteed the reference copy will have the same headers due to a bug
+        // in the crawler.  The bug is fixed, but we still need to support old crawls.
+        //
+        // This was added in 2024-01-18, so we can remove it in a few months.
+
+        if (previousRecord != null
+                && previousRecord.url.equals(nextRecord.url)
+                && previousRecord.httpStatus == 304
+                && nextRecord.httpStatus == 200)
+        {
+            etag = previousRecord.etagHeader;
+            lastModified = previousRecord.lastModifiedHeader;
+        }
+
         nextQ.add(new CrawledDocument("",
                 nextRecord.url,
                 nextRecord.contentType,
@@ -144,11 +164,14 @@ else if (nextRecord.body != null) {
                 null,
                 "",
                 nextRecord.cookies,
-                nextRecord.lastModifiedHeader,
-                nextRecord.etagHeader));
+                lastModified,
+                etag));
+
+        previousRecord = nextRecord;
     }
 
     public void close() throws IOException {
+        previousRecord = null;
     }
 
     @Override
@@ -2,7 +2,6 @@
 
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
 import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.model.EdgeDomain;
@@ -255,15 +254,6 @@ private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, St
         }
     }
 
-    /**
-     * Flag the given URL as skipped by the crawler, so that it will not be retried.
-     * Which URLs were skipped is still important when resynchronizing on the WARC file,
-     * so that the crawler can avoid re-fetching them.
-     */
-    public void flagAsSkipped(EdgeUrl url, String contentType, int statusCode, String documentBody) {
-        saveOldResponse(url, contentType, statusCode, documentBody, ContentTags.empty());
-    }
-
     /**
      * Write a reference copy of the given document data. This is used when the crawler provides
      * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
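
For call sites, the migration away from flagAsSkipped is mechanical: pass the stored headers through writeReferenceCopy, or ContentTags.empty() when none are available. A sketch of the migrated call, assuming the argument order used by the call site and tests below:

    // Before: the skipped URL was recorded without its ETag/Last-Modified.
    //   warcRecorder.flagAsSkipped(url, doc.contentType, doc.httpStatus, doc.documentBody);

    // After: the reference copy keeps the headers, so a later crawl can
    // issue conditional (If-None-Match / If-Modified-Since) requests.
    warcRecorder.writeReferenceCopy(url,
            doc.contentType,
            doc.httpStatus,
            doc.documentBody,
            new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe));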
@@ -5,6 +5,7 @@
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
+import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.model.EdgeUrl;
@@ -84,7 +85,12 @@ public int recrawl(CrawlDataReference oldCrawlData,
             }
 
             // Add a WARC record so we don't repeat this
-            warcRecorder.flagAsSkipped(url, doc.contentType, doc.httpStatus, doc.documentBody);
+            warcRecorder.writeReferenceCopy(url,
+                    doc.contentType,
+                    doc.httpStatus,
+                    doc.documentBody,
+                    new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
+            );
         }
         else {
             // GET the document with the stored document as a reference
@@ -73,10 +73,11 @@ void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, I
     public void flagAsSkipped() throws IOException, URISyntaxException {
 
         try (var recorder = new WarcRecorder(fileNameWarc)) {
-            recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
+            recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                     "text/html",
                     200,
-                    "<?doctype html><html><body>test</body></html>");
+                    "<?doctype html><html><body>test</body></html>",
+                    ContentTags.empty());
         }
 
         try (var reader = new WarcReader(fileNameWarc)) {
@@ -95,21 +96,23 @@ public void flagAsSkipped() throws IOException, URISyntaxException {
     public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
 
         try (var recorder = new WarcRecorder(fileNameWarc)) {
-            recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
+            recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                     "text/html",
                     200,
-                    null);
+                    null,
+                    ContentTags.empty());
        }
 
     }
 
     @Test
     public void testSaveImport() throws URISyntaxException, IOException {
         try (var recorder = new WarcRecorder(fileNameWarc)) {
-            recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
+            recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                     "text/html",
                     200,
-                    "<?doctype html><html><body>test</body></html>");
+                    "<?doctype html><html><body>test</body></html>",
+                    ContentTags.empty());
         }
 
         try (var reader = new WarcReader(fileNameWarc)) {
