Skip to content

Commit

Permalink
Add experiment runner tool and got rid of experiments module in proce…
Browse files Browse the repository at this point in the history
…sses.
  • Loading branch information
vlofgren committed Mar 28, 2023
1 parent 03bd892 commit 8f51345
Show file tree
Hide file tree
Showing 14 changed files with 326 additions and 261 deletions.
3 changes: 0 additions & 3 deletions code/processes/experimental/readme.md

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

@@ -1,6 +1,7 @@
// Gradle plugins applied to this module.
plugins {
    // Java compilation support
    id 'java'
    // Lombok annotation processing integration
    id 'io.freefair.lombok' version '5.3.3.3'
    // Builds the module as a runnable application (see the application block)
    id 'application'

    // Declarative test suite support
    id 'jvm-test-suite'
}
Expand All @@ -11,37 +12,49 @@ java {
}
}

dependencies {
implementation project(':code:common:process')
// Settings for the 'application' plugin: the class whose main() is launched
// and the name the application is built under.
application {
mainClass = 'nu.marginalia.tools.ExperimentRunnerMain'
applicationName = 'experiment-runner'
}

// Disable the distZip task for this module.
tasks.distZip.enabled = false

// Module dependencies. Each project/library is declared exactly once;
// repeated declarations of the same coordinate add nothing and only obscure the list.
dependencies {
    // Third-party modules built as part of this repository
    implementation project(':third-party:rdrpostagger')
    implementation project(':third-party:porterstemmer')
    implementation project(':third-party:monkey-patch-opennlp')

    // Common modules, APIs and libraries
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:process')
    implementation project(':code:common:service')
    implementation project(':code:libraries:big-string')
    implementation project(':code:api:index-api')
    implementation project(':code:common:service-discovery')
    implementation project(':code:common:service-client')
    implementation project(':code:libraries:language-processing')

    implementation project(':code:libraries:term-frequency-dict')
    implementation project(':code:processes:converting-process')
    implementation project(':code:process-models:crawling-model')

    // Converter features exercised by the experiments
    implementation project(':code:features-convert:adblock')
    implementation project(':code:features-convert:topic-detection')

    // External libraries from the version catalog
    implementation libs.lombok
    annotationProcessor libs.lombok
    implementation libs.bundles.slf4j
    implementation libs.notnull

    implementation libs.guice
    implementation libs.jsoup
    implementation libs.bundles.mariadb
    implementation libs.trove
    implementation libs.fastutil

    implementation libs.bundles.nlp
    implementation libs.commons.lang3

    // Test-only dependencies
    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}


// Run this module's tests on the JUnit Platform.
test {
useJUnitPlatform()
}
Expand Down
7 changes: 7 additions & 0 deletions code/tools/experiment-runner/readme.md
@@ -0,0 +1,7 @@
# Experiment Runner

This tool launches experiments that process and interact with crawl data.

It's launched with `run/experiment.sh`. New experiments need to be added to
`ExperimentRunnerMain` in order for the script to be able to run them.
@@ -0,0 +1,17 @@
package nu.marginalia.tools;

import nu.marginalia.crawling.model.CrawledDomain;

/**
 * A unit of experimental processing that is fed crawled domains one at a time.
 */
public interface Experiment {

    /**
     * Processes a single crawled domain.
     *
     * @param domain the crawled domain to examine
     * @return {@code true} to keep receiving domains, {@code false} to stop
     *         the run early
     */
    boolean process(CrawledDomain domain);

    /**
     * Invoked once after processing has ended, whether every domain was
     * consumed or {@link #process(CrawledDomain)} asked to stop early.
     */
    void onFinish();
}
@@ -0,0 +1,49 @@
package nu.marginalia.tools;

import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.tools.experiments.*;
import plan.CrawlPlanLoader;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Map;

/**
 * Command-line entry point that runs one named {@link Experiment} over the
 * crawled domains described by a crawl plan.
 *
 * <p>Expected arguments: {@code plan.yaml experiment-name}. An experiment must
 * be registered in {@code EXPERIMENTS} before it can be selected by name.
 */
public class ExperimentRunnerMain {

    /** Registry of runnable experiments, keyed by the name given on the command line. */
    private static final Map<String, Class<? extends Experiment>> EXPERIMENTS = Map.of(
            "test", TestExperiment.class,
            "adblock", AdblockExperiment.class,
            "topic", TopicExperiment.class,
            "statistics", SentenceStatisticsExperiment.class
    );

    /**
     * Loads the crawl plan, instantiates the requested experiment through Guice
     * and feeds it each crawled domain until the experiment asks to stop or the
     * plan is exhausted, then calls {@link Experiment#onFinish()}.
     *
     * <p>On a wrong argument count or an unknown experiment name, a message is
     * printed to standard error and the method returns without running anything.
     *
     * @param args {@code args[0]} is the path to the crawl plan,
     *             {@code args[1]} is the experiment name
     * @throws IOException if the crawl plan cannot be loaded
     */
    public static void main(String... args) throws IOException {
        if (args.length != 2) {
            System.err.println("Expected arguments: plan.yaml experiment-name");
            return;
        }

        Class<? extends Experiment> experimentClass = EXPERIMENTS.get(args[1]);
        if (experimentClass == null) {
            // Map.of() has no defined iteration order; sort so the message is stable.
            String[] names = EXPERIMENTS.keySet().stream().sorted().toArray(String[]::new);
            System.err.println("Valid experiment names: [" + String.join(", ", names) + "]");
            return;
        }

        Injector injector = Guice.createInjector(
                new DatabaseModule()
        );

        Experiment experiment = injector.getInstance(experimentClass);

        var plan = new CrawlPlanLoader().load(Path.of(args[0]));

        for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
            if (!experiment.process(domain)) {
                break;
            }
        }
        experiment.onFinish();
    }
}

0 comments on commit 8f51345

Please sign in to comment.