-
-
Notifications
You must be signed in to change notification settings - Fork 22
/
InstructionWriterFactory.java
143 lines (112 loc) · 4.55 KB
/
InstructionWriterFactory.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
package nu.marginalia.converting;
import com.github.luben.zstd.ZstdOutputStream;
import com.google.gson.Gson;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.converting.instruction.Instruction;
import nu.marginalia.converting.instruction.Interpreter;
import nu.marginalia.keyword.model.DocumentKeywords;
import nu.marginalia.converting.instruction.instructions.DomainLink;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * Creates {@link InstructionWriter}s that serialize {@link Instruction}s into
 * zstd-compressed files located beneath a fixed output directory.
 *
 * <p>Each writer's file is placed at
 * {@code <outputDir>/<id[0,2)>/<id[2,4)>/<id>.pzstd}; the two intermediate
 * directories are created on demand.
 */
public class InstructionWriterFactory {
    private static final Logger logger = LoggerFactory.getLogger(InstructionWriterFactory.class);

    /** Every non-no-op instruction passed to a writer is also applied to this log. */
    private final ConversionLog log;

    /** Root directory beneath which all output files are created. */
    private final Path outputDir;

    /** Stored from the constructor; not referenced elsewhere in this class. */
    private final Gson gson;

    /**
     * @param log       receives every non-no-op instruction accepted by the writers
     * @param outputDir existing directory that output files are created under
     * @param gson      stored as-is
     * @throws IllegalArgumentException if {@code outputDir} is not an existing directory
     */
    public InstructionWriterFactory(ConversionLog log, Path outputDir, Gson gson) {
        this.log = log;
        this.outputDir = outputDir;
        this.gson = gson;

        if (!Files.isDirectory(outputDir)) {
            throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
        }
    }

    /**
     * Opens a writer for the file that corresponds to {@code id}, replacing any
     * file already present at that location.
     *
     * @param id identifier used for the file name and its two parent directories;
     *           must be at least four characters long
     * @return a writer that the caller is responsible for closing
     * @throws IOException               if the directories or the file cannot be created
     * @throws IndexOutOfBoundsException if {@code id} has fewer than four characters
     */
    public InstructionWriter createInstructionsForDomainWriter(String id) throws IOException {
        Path outputFile = getOutputFile(id);
        return new InstructionWriter(outputFile);
    }

    /**
     * Serializes instructions to a single compressed file while keeping a
     * running count and summary of what was written.
     */
    public class InstructionWriter implements AutoCloseable {
        private final ObjectOutputStream outputStream;
        private final String where;
        private final SummarizingInterpreter summary = new SummarizingInterpreter();

        /** Number of instructions successfully handed to the object stream. */
        private int size = 0;

        /**
         * @param filename file to write; deleted first if it already exists
         * @throws IOException if the old file cannot be removed or the stream cannot be opened
         */
        InstructionWriter(Path filename) throws IOException {
            where = filename.getFileName().toString();
            Files.deleteIfExists(filename);
            outputStream = openCompressedObjectStream(filename);
        }

        /**
         * Records and serializes one instruction. No-op instructions are ignored.
         *
         * <p>The instruction is applied to this writer's summary and to the
         * factory's conversion log before the write is attempted. A failed write
         * is logged as a warning rather than propagated, and is not counted in
         * {@link #getSize()}.
         *
         * @param instruction the instruction to record and write
         */
        public void accept(Instruction instruction) {
            if (instruction.isNoOp()) {
                return;
            }

            instruction.apply(summary);
            instruction.apply(log);

            try {
                outputStream.writeObject(instruction);
                // Count only after the write succeeded, so the reported size
                // reflects what actually went into the file.
                size++;

                // Reset the stream to avoid keeping references to the objects
                // (as this will cause the memory usage to grow indefinitely when
                // writing huge amounts of data)
                outputStream.reset();
            }
            catch (IOException ex) {
                logger.warn("IO exception writing instruction", ex);
            }
        }

        /**
         * Logs a one-line summary and closes the underlying stream.
         *
         * @throws IOException if closing the stream fails
         */
        @Override
        public void close() throws IOException {
            try {
                logger.info("Wrote {} - {} - {}", where, size, summary);
            }
            finally {
                // Close even if producing the log line throws.
                outputStream.close();
            }
        }

        /** @return the name of the output file, without its directory */
        public String getFileName() {
            return where;
        }

        /** @return the number of instructions successfully written so far */
        public int getSize() {
            return size;
        }
    }

    /**
     * Opens {@code file} for writing and layers zstd compression and Java object
     * serialization on top of it. If any layer fails to initialize, whatever has
     * been opened so far is closed before the failure is rethrown.
     */
    private static ObjectOutputStream openCompressedObjectStream(Path file) throws IOException {
        OutputStream stream = new FileOutputStream(file.toFile());
        try {
            stream = new ZstdOutputStream(stream);
            return new ObjectOutputStream(stream);
        }
        catch (IOException | RuntimeException ex) {
            try {
                stream.close();
            }
            catch (IOException closeEx) {
                ex.addSuppressed(closeEx);
            }
            throw ex;
        }
    }

    /**
     * Resolves the output path for {@code id}, creating the two-level directory
     * prefix (first two characters, then the next two) if it is missing.
     *
     * @throws IOException               if the directories cannot be created
     * @throws IndexOutOfBoundsException if {@code id} has fewer than four characters
     */
    private Path getOutputFile(String id) throws IOException {
        String first = id.substring(0, 2);
        String second = id.substring(2, 4);

        Path destDir = outputDir.resolve(first).resolve(second);
        if (!Files.exists(destDir)) {
            Files.createDirectories(destDir);
        }

        return destDir.resolve(id + ".pzstd");
    }

    /**
     * Accumulates counters from the instructions passed through a writer so
     * that a short summary can be logged when the writer is closed.
     */
    private static class SummarizingInterpreter implements Interpreter {
        private String domainName;
        private int ok = 0;
        private int error = 0;
        private int keywords = 0;
        private int documents = 0;

        /** Formats the summary as {@code "<domain> - <ok> <error>"}. */
        @Override
        public String toString() {
            // This shouldn't happen (TM)
            assert keywords == documents : "keywords != documents";

            return String.format("%s - %d %d", domainName, ok, error);
        }

        @Override
        public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {
            this.domainName = domain.toString();
        }

        @Override
        public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {
            documents++;
        }

        @Override
        public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {
            keywords++;
        }

        @Override
        public void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) {
            ok += goodUrls;
            // Visited URLs that were not good are counted as errors.
            error += visitedUrls - goodUrls;
        }
    }
}