[webcrawler] Support emitting non HTML documents (like PDFs...) (#739)
eolivelli committed Nov 30, 2023
1 parent e908495 commit 850b3ca
Showing 5 changed files with 102 additions and 17 deletions.
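
For orientation before the diff: the change adds an allowNonHtmlContent flag (default false) to WebCrawlerConfiguration and extends the Document record to carry raw bytes plus a content type. Below is a minimal sketch of enabling it, using only the classes and builder options that appear in the changes and the new test; the example class name, domain, and seed URL are placeholders, and the import package is assumed from the Document record's package statement.

import ai.langstream.agents.webcrawler.crawler.*;   // package assumed from the Document record below
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

public class NonHtmlCrawlExample {                   // hypothetical example class, not part of this commit
    public static void main(String[] args) throws Exception {
        WebCrawlerConfiguration configuration =
                WebCrawlerConfiguration.builder()
                        .allowedDomains(Set.of("https://example.com"))  // placeholder domain
                        .allowNonHtmlContent(true)                      // new flag introduced here, defaults to false
                        .handleRobotsFile(false)                        // as in the new test
                        .build();
        WebCrawlerStatus status = new WebCrawlerStatus();
        List<Document> documents = new ArrayList<>();
        WebCrawler crawler = new WebCrawler(configuration, status, documents::add);

        crawler.crawl("https://example.com/index.html");                // placeholder seed URL
        crawler.runCycle();                                             // each cycle fetches one pending URL

        // With the flag enabled, PDFs and other non-HTML responses are emitted as
        // Document(url, byte[] content, contentType) instead of being skipped.
        documents.forEach(d -> System.out.println(d.url() + " -> " + d.contentType()));
    }
}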
@@ -41,7 +41,6 @@
import io.minio.errors.ErrorResponseException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Instant;
@@ -269,7 +268,7 @@ public List<Record> read() throws Exception {
processed(0, 1);
return List.of(
new WebCrawlerSourceRecord(
document.content().getBytes(StandardCharsets.UTF_8), document.url()));
document.content(), document.url(), document.contentType()));
}

private void checkReindexIsNeeded() {
@@ -336,10 +335,12 @@ public void commit(List<Record> records) {
private static class WebCrawlerSourceRecord implements Record {
private final byte[] read;
private final String url;
private final String contentType;

public WebCrawlerSourceRecord(byte[] read, String url) {
public WebCrawlerSourceRecord(byte[] read, String url, String contentType) {
this.read = read;
this.url = url;
this.contentType = contentType;
}

/**
@@ -370,7 +371,9 @@ public Long timestamp() {

@Override
public Collection<Header> headers() {
return List.of(new SimpleRecord.SimpleHeader("url", url));
return List.of(
new SimpleRecord.SimpleHeader("url", url),
new SimpleRecord.SimpleHeader("content_type", contentType));
}

@Override
@@ -15,4 +15,4 @@
*/
package ai.langstream.agents.webcrawler.crawler;

public record Document(String url, String content) {}
public record Document(String url, byte[] content, String contentType) {}
@@ -37,6 +37,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
@@ -160,14 +161,17 @@ public boolean runCycle() throws Exception {
connect.timeout(configuration.getHttpTimeout());

boolean redirectedToForbiddenDomain = false;
Document document;
Document document = null;
String contentType = null;
byte[] binaryContent = null;
try {
document = connect.get();
Connection.Response response = connect.response();
contentType = response.contentType();
int statusCode = response.statusCode();
if (statusCode >= 300 && statusCode < 400) {
String location = response.header("Location");
if (!location.equals(current)) {
if (!Objects.equals(location, current)) {
if (isUrlForbidden(location)) {
redirectedToForbiddenDomain = true;
log.warn(
@@ -200,17 +204,44 @@ public boolean runCycle() throws Exception {
// we did something
return true;
} catch (UnsupportedMimeTypeException notHtml) {
log.info(
"Url {} lead to a {} content-type document. Skipping",
current,
notHtml.getMimeType());
discardUrl(current, reference);
if (configuration.isAllowNonHtmlContent()) {
log.info(
"Url {} lead to a {} content-type document. allow-not-html-content is true, so we are processing it",
current,
notHtml.getMimeType());
handleThrottling(current);

// download again the file, this is a little inefficient but currently
// this is not the most common case, we can improve it later

// downloadUrl takes care of retrying
HttpResponse<byte[]> httpResponse = downloadUrl(current);
contentType =
httpResponse
.headers()
.firstValue("content-type")
.orElse("application/octet-stream");
binaryContent = httpResponse.body();
visitor.visit(
new ai.langstream.agents.webcrawler.crawler.Document(
current, binaryContent, contentType));

handleThrottling(current);

return true;
} else {
log.info(
"Url {} lead to a {} content-type document. Skipping",
current,
notHtml.getMimeType());
discardUrl(current, reference);

// prevent from being banned for flooding
handleThrottling(current);
// prevent from being banned for flooding
handleThrottling(current);

// we did something
return true;
// we did something
return true;
}
} catch (IOException e) {
log.info("Error while crawling url: {}, IO Error: {}", current, e + "");

@@ -240,7 +271,10 @@ public boolean runCycle() throws Exception {
});
}
visitor.visit(
new ai.langstream.agents.webcrawler.crawler.Document(current, document.html()));
new ai.langstream.agents.webcrawler.crawler.Document(
current,
document.html().getBytes(StandardCharsets.UTF_8),
contentType));
}

// prevent from being banned for flooding
@@ -39,6 +39,7 @@ public class WebCrawlerConfiguration {
@Builder.Default private boolean handleCookies = true;
@Builder.Default private boolean handleRobotsFile = true;
@Builder.Default private boolean scanHtmlDocuments = true;
@Builder.Default private boolean allowNonHtmlContent = false;

@Builder.Default private Set<String> allowedTags = Set.of("a");

@@ -322,4 +322,51 @@ void testNetworkErrorsEventuallyFail(WireMockRuntimeInfo vmRuntimeInfo) throws E
assertEquals(0, status.getPendingUrls().size());
assertEquals(2, status.getUrls().size());
}

@Test
void testBinaryContent(WireMockRuntimeInfo vmRuntimeInfo) throws Exception {

byte[] mockPdf = new byte[] {1, 2, 3, 4, 5};
stubFor(
get("/index.html")
.willReturn(
okForContentType(
"text/html",
"""
<a href="document.pdf">link</a>
""")));
stubFor(
get("/document.pdf")
.willReturn(
aResponse()
.withHeader("content-type", "application/pdf")
.withBody(mockPdf)));

WebCrawlerConfiguration configuration =
WebCrawlerConfiguration.builder()
.allowedDomains(Set.of(vmRuntimeInfo.getHttpBaseUrl()))
.allowNonHtmlContent(true)
.handleRobotsFile(false)
.maxErrorCount(5)
.build();
WebCrawlerStatus status = new WebCrawlerStatus();
List<Document> documents = new ArrayList<>();
WebCrawler crawler = new WebCrawler(configuration, status, documents::add);
crawler.crawl(vmRuntimeInfo.getHttpBaseUrl() + "/index.html");
crawler.runCycle();

assertEquals(1, documents.size());
assertEquals(vmRuntimeInfo.getHttpBaseUrl() + "/index.html", documents.get(0).url());
assertEquals(1, status.getPendingUrls().size());
assertEquals(2, status.getUrls().size());

crawler.runCycle();

assertEquals(vmRuntimeInfo.getHttpBaseUrl() + "/document.pdf", documents.get(1).url());
assertArrayEquals(mockPdf, documents.get(1).content());
assertEquals("application/pdf", documents.get(1).contentType());

assertEquals(0, status.getPendingUrls().size());
assertEquals(2, status.getUrls().size());
}
}
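
A short usage note on the records emitted above: with allowNonHtmlContent enabled, the visitor passed to WebCrawler (a functional interface accepting a Document, inferred here from the documents::add usage in the test) can branch on the new contentType() accessor; downstream, WebCrawlerSourceRecord exposes the same value via the added content_type header. A minimal sketch, reusing the configuration and status objects from the earlier example; the routing logic is illustrative only.

// Requires java.nio.charset.StandardCharsets; configuration and status as in the sketch above.
WebCrawler crawler = new WebCrawler(configuration, status, doc -> {
    if ("application/pdf".equals(doc.contentType())) {
        // Non-HTML documents now arrive as raw bytes; hand them to a binary pipeline.
        System.out.println("PDF " + doc.url() + " (" + doc.content().length + " bytes)");
    } else {
        // HTML pages are emitted as the UTF-8 bytes of document.html() (see the change above).
        String html = new String(doc.content(), StandardCharsets.UTF_8);
        System.out.println("HTML " + doc.url() + " (" + html.length() + " chars)");
    }
});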
