Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
(converter) Make it possible to sideload documents from a directory tree
- Loading branch information
Showing
18 changed files
with
386 additions
and
138 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,5 +3,6 @@ | |
/**
 * The kinds of work the converter can be asked to perform.
 *
 * <p>Constant order is kept stable so that ordinals do not shift.
 */
public enum ConvertAction {
    /** Presumably the regular conversion of crawl data (inferred from the name). */
    ConvertCrawlData,

    /** Presumably sideloading of encyclopedia data (inferred from the name). */
    SideloadEncyclopedia,

    /** Sideload documents from a directory tree. */
    SideloadDirtree,

    /** Presumably sideloading of Stack Exchange data (inferred from the name). */
    SideloadStackexchange
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
67 changes: 67 additions & 0 deletions
67
...verting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
package nu.marginalia.converting.sideload; | ||
|
||
import com.google.inject.Inject; | ||
import com.google.inject.Singleton; | ||
import nu.marginalia.converting.model.ProcessedDocument; | ||
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; | ||
import nu.marginalia.crawling.model.CrawledDocument; | ||
import nu.marginalia.model.EdgeUrl; | ||
import nu.marginalia.model.crawl.UrlIndexingState; | ||
import nu.marginalia.model.idx.WordFlags; | ||
import nu.marginalia.model.idx.WordMetadata; | ||
|
||
import java.net.URISyntaxException; | ||
import java.time.LocalDateTime; | ||
import java.util.List; | ||
|
||
@Singleton | ||
public class SideloaderProcessing { | ||
private final HtmlDocumentProcessorPlugin htmlProcessorPlugin; | ||
|
||
@Inject | ||
public SideloaderProcessing(HtmlDocumentProcessorPlugin htmlProcessorPlugin) { | ||
this.htmlProcessorPlugin = htmlProcessorPlugin; | ||
} | ||
|
||
public ProcessedDocument processDocument(String url, String body, List<String> extraKeywords, int size) throws URISyntaxException { | ||
var crawledDoc = new CrawledDocument( | ||
"encyclopedia.marginalia.nu", | ||
url, | ||
"text/html", | ||
LocalDateTime.now().toString(), | ||
200, | ||
"OK", | ||
"NP", | ||
"", | ||
body, | ||
Integer.toHexString(url.hashCode()), | ||
url, | ||
"", | ||
"SIDELOAD" | ||
); | ||
|
||
var ret = new ProcessedDocument(); | ||
try { | ||
var details = htmlProcessorPlugin.createDetails(crawledDoc); | ||
|
||
ret.words = details.words(); | ||
|
||
for (String keyword : extraKeywords) | ||
ret.words.add(keyword, WordFlags.Subjects.asBit()); | ||
|
||
ret.details = details.details(); | ||
ret.details.metadata = ret.details.metadata | ||
.withSize(size, Math.max(0, 255 - url.length())); | ||
ret.url = new EdgeUrl(url); | ||
ret.state = UrlIndexingState.OK; | ||
ret.stateReason = "SIDELOAD"; | ||
} | ||
catch (Exception e) { | ||
ret.url = new EdgeUrl(url); | ||
ret.state = UrlIndexingState.DISQUALIFIED; | ||
ret.stateReason = "SIDELOAD"; | ||
} | ||
|
||
return ret; | ||
} | ||
} |
20 changes: 20 additions & 0 deletions
20
...-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloadSpec.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
package nu.marginalia.converting.sideload.dirtree; | ||
|
||
import lombok.AllArgsConstructor; | ||
import lombok.Getter; | ||
import lombok.NoArgsConstructor; | ||
import lombok.Setter; | ||
|
||
import java.util.List; | ||
|
||
@AllArgsConstructor | ||
@NoArgsConstructor | ||
@Setter | ||
@Getter | ||
class DirtreeSideloadSpec { | ||
public String name; | ||
public String domainName; | ||
public String dir; | ||
public String baseUrl; | ||
public List<String> keywords; | ||
} |
14 changes: 14 additions & 0 deletions
14
...cess/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloadSpecList.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
package nu.marginalia.converting.sideload.dirtree; | ||
|
||
import lombok.AllArgsConstructor; | ||
import lombok.Getter; | ||
import lombok.NoArgsConstructor; | ||
import lombok.Setter; | ||
|
||
import java.util.List; | ||
|
||
@AllArgsConstructor @NoArgsConstructor | ||
@Setter @Getter | ||
class DirtreeSideloadSpecList { | ||
public List<DirtreeSideloadSpec> sources; | ||
} |
83 changes: 83 additions & 0 deletions
83
...ng-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
package nu.marginalia.converting.sideload.dirtree; | ||
|
||
import lombok.SneakyThrows; | ||
import nu.marginalia.converting.model.ProcessedDocument; | ||
import nu.marginalia.converting.model.ProcessedDomain; | ||
import nu.marginalia.converting.sideload.SideloadSource; | ||
import nu.marginalia.converting.sideload.SideloaderProcessing; | ||
import nu.marginalia.model.EdgeDomain; | ||
import nu.marginalia.model.crawl.DomainIndexingState; | ||
|
||
import java.io.IOException; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.Iterator; | ||
import java.util.List; | ||
import java.util.stream.Stream; | ||
|
||
public class DirtreeSideloader implements SideloadSource, AutoCloseable { | ||
private final Path dirBase; | ||
private final String domainName; | ||
private final String urlBase; | ||
|
||
private final List<String> extraKeywords; | ||
private final SideloaderProcessing sideloaderProcessing; | ||
|
||
private final Stream<Path> filesStream; | ||
|
||
public DirtreeSideloader(Path dirBase, | ||
String domainName, | ||
String urlBase, | ||
List<String> extraKeywords, | ||
SideloaderProcessing sideloaderProcessing) | ||
throws IOException | ||
{ | ||
this.dirBase = dirBase; | ||
this.domainName = domainName; | ||
this.urlBase = urlBase + (urlBase.endsWith("/") ? "" : "/"); | ||
this.filesStream = Files.walk(dirBase); | ||
this.extraKeywords = extraKeywords; | ||
this.sideloaderProcessing = sideloaderProcessing; | ||
} | ||
|
||
@Override | ||
public ProcessedDomain getDomain() { | ||
var ret = new ProcessedDomain(); | ||
|
||
ret.domain = new EdgeDomain(domainName); | ||
ret.ip = "0.0.0.0"; | ||
ret.state = DomainIndexingState.ACTIVE; | ||
|
||
return ret; | ||
} | ||
|
||
@Override | ||
public Iterator<ProcessedDocument> getDocumentsStream() { | ||
return filesStream | ||
.filter(Files::isRegularFile) | ||
.filter(this::isHtmlFile) | ||
.map(this::process) | ||
.iterator(); | ||
} | ||
|
||
private boolean isHtmlFile(Path path) { | ||
final String name = path.toFile().getName().toLowerCase(); | ||
|
||
return name.endsWith(".html") || name.endsWith(".htm"); | ||
} | ||
|
||
@SneakyThrows | ||
private ProcessedDocument process(Path path) { | ||
String body = Files.readString(path); | ||
String url = urlBase + dirBase.relativize(path); | ||
|
||
return sideloaderProcessing | ||
.processDocument(url, body, extraKeywords, 10_000); | ||
} | ||
|
||
@Override | ||
public void close() throws Exception { | ||
filesStream.close(); | ||
} | ||
|
||
} |
Oops, something went wrong.