Skip to content

Commit

Permalink
(*) Overhaul settings and properties
Browse files Browse the repository at this point in the history
Use a system.properties file to configure the system. This is loaded statically by MainClass or ProcessMainClass. Update the property names to be more consistent, and update the documentation to reflect the changes.
  • Loading branch information
vlofgren committed Jan 13, 2024
1 parent 176b9c9 commit 7c6e18f
Show file tree
Hide file tree
Showing 25 changed files with 164 additions and 79 deletions.
Expand Up @@ -64,7 +64,7 @@ public SearchResultSet query(Context ctx, List<Integer> nodes, SearchSpecificati
.postGet(ctx, node, "/search/", specs, SearchResultSet.class).onErrorReturn(t -> new SearchResultSet())
.observeOn(Schedulers.io());
} catch (RouteNotConfiguredException ex) {
return Observable.error(ex);
return Observable.empty();
}
})
.reduce(SearchResultSet::combine)
Expand Down
@@ -1,3 +1,3 @@
package nu.marginalia;

public record UserAgent(String uaString) {}
public record UserAgent(String uaString, String uaIdentifier) {}
21 changes: 8 additions & 13 deletions code/common/config/src/main/java/nu/marginalia/WmsaHome.java
Expand Up @@ -12,19 +12,19 @@
import java.util.stream.Stream;

public class WmsaHome {
public static UserAgent getUserAgent() throws IOException {
var uaPath = getHomePath().resolve("conf/user-agent");
public static UserAgent getUserAgent() {

if (!Files.exists(uaPath)) {
throw new FileNotFoundException("Could not find " + uaPath);
}

return new UserAgent(Files.readString(uaPath).trim());
return new UserAgent(
System.getProperty("crawler.userAgentString", "Mozilla/5.0 (compatible; Marginalia-like bot; +https://git.marginalia.nu/))"),
System.getProperty("crawler.userAgentIdentifier", "search.marginalia.nu")
);
}


public static Path getUploadDir() {
return Path.of("/uploads");
return Path.of(
System.getProperty("executor.uploadDir", "/uploads")
);
}

public static Path getHomePath() {
Expand Down Expand Up @@ -93,11 +93,6 @@ public static LanguageModels getLanguageModels() {
public static Path getAtagsPath() {
return getHomePath().resolve("data/atags.parquet");
}
private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");

public static boolean isDebug() {
return debugMode;
}


}
Expand Up @@ -16,7 +16,7 @@ public class DomainBlacklistImpl implements DomainBlacklist {
private volatile TIntHashSet spamDomainSet = new TIntHashSet();
private final HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final boolean blacklistDisabled = Boolean.getBoolean("no-domain-blacklist");
private final boolean blacklistDisabled = Boolean.getBoolean("blacklist.disable");
@Inject
public DomainBlacklistImpl(HikariDataSource dataSource) {
this.dataSource = dataSource;
Expand Down
@@ -0,0 +1,33 @@
package nu.marginalia.service;

import nu.marginalia.WmsaHome;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

/**
 * Loads {@code .properties} files from the WMSA home directory into the
 * JVM-wide system properties.
 */
public class ConfigLoader {
    private static final Logger logger = LoggerFactory.getLogger(ConfigLoader.class);

    /**
     * Resolves the location of a named configuration file.
     *
     * @param configName base name of the configuration, without the {@code .properties} suffix
     * @return {@code conf/properties/<configName>.properties} resolved against the WMSA home path
     */
    static Path getConfigPath(String configName) {
        return WmsaHome.getHomePath().resolve("conf/properties/" + configName + ".properties");
    }

    /**
     * Merges the properties found in the given file into {@link System#getProperties()}.
     * A missing file is not an error: it is logged and the call returns without
     * touching the system properties.
     *
     * @param configPath path of the properties file to load
     * @throws RuntimeException wrapping the {@link IOException} if the file exists but cannot be read
     */
    static void loadConfig(Path configPath) {
        if (!Files.exists(configPath)) {
            logger.info("No config file found at {}", configPath);
            return;
        }

        logger.info("Loading config from {}", configPath);

        try {
            // Read the file exactly once so the logged text and the loaded
            // properties always come from the same bytes.
            byte[] rawConfig = Files.readAllBytes(configPath);

            // Properties.load(InputStream) is specified to decode ISO-8859-1, so the
            // log line uses the same charset.  Decoding as UTF-8 here would throw on a
            // valid Latin-1 file and abort before any property had been loaded.
            logger.info("Config:\n{}", new String(rawConfig, StandardCharsets.ISO_8859_1));

            System.getProperties().load(new ByteArrayInputStream(rawConfig));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
Expand Up @@ -15,7 +15,14 @@
* They must also invoke init() in their main method.
*/
public abstract class MainClass {
private final Logger logger = LoggerFactory.getLogger(getClass());
private static final Logger logger = LoggerFactory.getLogger(MainClass.class);

static {
// Load global config ASAP
ConfigLoader.loadConfig(
ConfigLoader.getConfigPath("system")
);
}

public MainClass() {
RxJavaPlugins.setErrorHandler(this::handleError);
Expand All @@ -42,11 +49,14 @@ else if (ex instanceof NetworkException) {


protected static void init(ServiceId id, String... args) {

System.setProperty("log4j2.isThreadContextMapInheritable", "true");
System.setProperty("isThreadContextMapInheritable", "true");
System.setProperty("service-name", id.name);

ConfigLoader.loadConfig(
ConfigLoader.getConfigPath(id.name)
);

initJdbc();
initPrometheus();
}
Expand Down
@@ -0,0 +1,20 @@
package nu.marginalia.service;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Common base class for process entry points.  Initializing this class loads
 * the "system" configuration, and constructing an instance instantiates the
 * MariaDB JDBC driver.
 */
public abstract class ProcessMainClass {
    private static final Logger logger = LoggerFactory.getLogger(ProcessMainClass.class);

    // Runs once at class initialization, i.e. before any subclass instance
    // is constructed, so the global settings are in place as early as possible.
    static {
        final var systemConfigPath = ConfigLoader.getConfigPath("system");
        ConfigLoader.loadConfig(systemConfigPath);
    }

    /**
     * Instantiates the MariaDB JDBC driver and discards the instance.
     */
    public ProcessMainClass() {
        // NOTE(review): presumably this forces the driver class to be loaded
        // so JDBC can find it later -- confirm.
        new org.mariadb.jdbc.Driver();
    }

}
Expand Up @@ -35,7 +35,7 @@ public DatabaseModule(boolean migrate) {
dbProperties = loadDbProperties();

if (migrate) {
if (Boolean.getBoolean("disableFlyway")) {
if (Boolean.getBoolean("flyway.disable")) {
logger.info("Flyway disabled");
}
else {
Expand Down
Expand Up @@ -22,7 +22,7 @@ public class IpBlockList {
private final GeoIpBlocklist geoIpBlocklist;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final List<SubnetUtils.SubnetInfo> badSubnets = new ArrayList<>();
private final boolean blocklistDisabled = Boolean.getBoolean("no-ip-blocklist");
private final boolean blocklistDisabled = Boolean.getBoolean("ip-blocklist.disabled");

@Inject
public IpBlockList(GeoIpBlocklist geoIpBlocklist) {
Expand Down
Expand Up @@ -6,24 +6,14 @@
import nu.marginalia.array.algo.IntArraySort;
import nu.marginalia.array.algo.IntArrayTransformations;
import nu.marginalia.array.delegate.ShiftedIntArray;
import nu.marginalia.array.delegate.ShiftedLongArray;
import nu.marginalia.array.page.SegmentIntArray;
import nu.marginalia.array.page.SegmentLongArray;
import nu.marginalia.array.scheme.ArrayPartitioningScheme;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;

public interface IntArray extends IntArrayBase, IntArrayTransformations, IntArraySearch, IntArraySort {
int WORD_SIZE = 4;

ArrayPartitioningScheme DEFAULT_PARTITIONING_SCHEME
= ArrayPartitioningScheme.forPartitionSize(Integer.getInteger("wmsa.page-size",1<<30) / WORD_SIZE);

int MAX_CONTINUOUS_SIZE = Integer.MAX_VALUE/WORD_SIZE - 16;

static IntArray allocate(long size) {
return SegmentIntArray.onHeap(Arena.ofShared(), size);
}
Expand Down
Expand Up @@ -11,6 +11,7 @@
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.converting.writer.ConverterWriter;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.MqMessage;
Expand Down Expand Up @@ -38,7 +39,7 @@

import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX;

public class ConverterMain {
public class ConverterMain extends ProcessMainClass {
private static final Logger logger = LoggerFactory.getLogger(ConverterMain.class);
private final DomainProcessor processor;
private final Gson gson;
Expand Down
Expand Up @@ -32,6 +32,7 @@
import java.util.regex.Pattern;

public class DomainProcessor {
private static final int SIDELOAD_THRESHOLD = Integer.getInteger("converter.sideloadThreshold", 10_000);
private final DocumentProcessor documentProcessor;
private final SiteWords siteWords;
private final AnchorTagsSource anchorTagsSource;
Expand Down Expand Up @@ -59,7 +60,7 @@ public DomainProcessor(DocumentProcessor documentProcessor,
public ConverterBatchWritableIf createWritable(SerializableCrawlDataStream domain) {
final int sizeHint = domain.sizeHint();

if (sizeHint > 10_000) {
if (sizeHint > SIDELOAD_THRESHOLD) {
// If the file is too big, we run a processing mode that doesn't
// require loading the entire dataset into RAM
return sideloadProcessing(domain, sizeHint);
Expand Down
Expand Up @@ -23,6 +23,7 @@
import nu.marginalia.crawling.io.CrawlerOutputFile;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.crawlspec.CrawlSpecFileNames;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
import nu.marginalia.mq.MessageQueueFactory;
Expand Down Expand Up @@ -51,7 +52,7 @@

import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;

public class CrawlerMain {
public class CrawlerMain extends ProcessMainClass {
private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class);

private final UserAgent userAgent;
Expand Down Expand Up @@ -96,10 +97,10 @@ public CrawlerMain(UserAgent userAgent,
this.node = processConfiguration.node();

pool = new SimpleBlockingThreadPool("CrawlerPool",
Integer.getInteger("crawler.pool-size", 256),
Integer.getInteger("crawler.poolSize", 256),
1);

fetcher = new HttpFetcherImpl(userAgent.uaString(),
fetcher = new HttpFetcherImpl(userAgent,
new Dispatcher(),
new ConnectionPool(5, 10, TimeUnit.SECONDS)
);
Expand Down
Expand Up @@ -13,12 +13,12 @@
public class ContentTypeProber {

private static final Logger logger = LoggerFactory.getLogger(ContentTypeProber.class);
private final String userAgent;
private final String userAgentString;
private final OkHttpClient client;
private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();

public ContentTypeProber(String userAgent, OkHttpClient httpClient) {
this.userAgent = userAgent;
public ContentTypeProber(String userAgentString, OkHttpClient httpClient) {
this.userAgentString = userAgentString;
this.client = httpClient;
}

Expand All @@ -35,7 +35,7 @@ public ContentTypeProbeResult probeContentType(EdgeUrl url) {
logger.debug("Probing suspected binary {}", url);

var headBuilder = new Request.Builder().head()
.addHeader("User-agent", userAgent)
.addHeader("User-agent", userAgentString)
.addHeader("Accept-Encoding", "gzip")
.url(url.toString());

Expand Down
Expand Up @@ -5,6 +5,7 @@
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import lombok.SneakyThrows;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.retreival.Cookies;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult;
Expand Down Expand Up @@ -35,7 +36,8 @@
public class HttpFetcherImpl implements HttpFetcher {

private final Logger logger = LoggerFactory.getLogger(getClass());
private final String userAgent;
private final String userAgentString;
private final String userAgentIdentifier;
private final Cookies cookies = new Cookies();

private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
Expand Down Expand Up @@ -85,18 +87,20 @@ public void clearCookies() {
}

@Inject
public HttpFetcherImpl(@Named("user-agent") String userAgent,
public HttpFetcherImpl(UserAgent userAgent,
Dispatcher dispatcher,
ConnectionPool connectionPool)
{
this.client = createClient(dispatcher, connectionPool);
this.userAgent = userAgent;
this.contentTypeProber = new ContentTypeProber(userAgent, client);
this.userAgentString = userAgent.uaString();
this.userAgentIdentifier = userAgent.uaIdentifier();
this.contentTypeProber = new ContentTypeProber(userAgentString, client);
}

public HttpFetcherImpl(@Named("user-agent") String userAgent) {
public HttpFetcherImpl(String userAgent) {
this.client = createClient(null, new ConnectionPool());
this.userAgent = userAgent;
this.userAgentString = userAgent;
this.userAgentIdentifier = userAgent;
this.contentTypeProber = new ContentTypeProber(userAgent, client);
}

Expand All @@ -110,7 +114,7 @@ public HttpFetcherImpl(@Named("user-agent") String userAgent) {
@Override
@SneakyThrows
public FetchResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgent)
var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
.url(url.toString())
.build();

Expand Down Expand Up @@ -170,7 +174,7 @@ else if (probeResult instanceof ContentTypeProbeResult.Exception exception) {

getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("User-agent", userAgent);
.addHeader("User-agent", userAgentString);

contentTags.paint(getBuilder);

Expand Down Expand Up @@ -212,15 +216,15 @@ private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, WarcRecorde

getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("User-agent", userAgent);
.addHeader("User-agent", userAgentString);

HttpFetchResult result = recorder.fetch(client, getBuilder.build());

return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
robotsParser.parseContent(url.toString(),
body,
contentType.toString(),
userAgent)
userAgentIdentifier)
);

}
Expand Down
Expand Up @@ -6,6 +6,7 @@
import nu.marginalia.IndexLocations;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
Expand Down Expand Up @@ -38,7 +39,7 @@

import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX;

public class IndexConstructorMain {
public class IndexConstructorMain extends ProcessMainClass {
private final FileStorageService fileStorageService;
private final ProcessHeartbeatImpl heartbeat;
private final MessageQueueFactory messageQueueFactory;
Expand Down
Expand Up @@ -8,6 +8,7 @@
import lombok.SneakyThrows;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.loading.documents.DocumentLoaderService;
Expand Down Expand Up @@ -37,7 +38,7 @@

import static nu.marginalia.mqapi.ProcessInboxNames.LOADER_INBOX;

public class LoaderMain {
public class LoaderMain extends ProcessMainClass {
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);

private final ProcessHeartbeatImpl heartbeat;
Expand Down

0 comments on commit 7c6e18f

Please sign in to comment.