From 6bbf40d7d292f1f3c1a968b9fe49de4423fab761 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 20 Sep 2023 14:17:33 +0200 Subject: [PATCH] (stackexchange-integration) Tools for reading stackexchange xml files --- .../stackexchange-xml/build.gradle | 39 ++++ .../model/StackExchangeComment.java | 5 + .../model/StackExchangePost.java | 16 ++ .../sqlite/StackExchangePostsDb.java | 171 ++++++++++++++++++ .../StackExchange7zXmlEventReaderSource.java | 58 ++++++ .../xml/StackExchangeXmlCommentReader.java | 58 ++++++ .../xml/StackExchangeXmlIterator.java | 58 ++++++ .../xml/StackExchangeXmlPostReader.java | 102 +++++++++++ .../xml/XmlEventReaderSource.java | 8 + .../src/main/resources/db/stackexchange.sql | 11 ++ .../StackExchangeXmlCommentReaderTest.java | 28 +++ .../xml/StackExchangeXmlPostReaderTest.java | 29 +++ .../xml/StringXmlTestEventReader.java | 28 +++ settings.gradle | 1 + 14 files changed, 612 insertions(+) create mode 100644 code/features-convert/stackexchange-xml/build.gradle create mode 100644 code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/model/StackExchangeComment.java create mode 100644 code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/model/StackExchangePost.java create mode 100644 code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java create mode 100644 code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchange7zXmlEventReaderSource.java create mode 100644 code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReader.java create mode 100644 code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlIterator.java create mode 100644 
code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReader.java create mode 100644 code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/XmlEventReaderSource.java create mode 100644 code/features-convert/stackexchange-xml/src/main/resources/db/stackexchange.sql create mode 100644 code/features-convert/stackexchange-xml/src/test/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReaderTest.java create mode 100644 code/features-convert/stackexchange-xml/src/test/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReaderTest.java create mode 100644 code/features-convert/stackexchange-xml/src/test/java/nu/marginalia/integration/stackexchange/xml/StringXmlTestEventReader.java diff --git a/code/features-convert/stackexchange-xml/build.gradle b/code/features-convert/stackexchange-xml/build.gradle new file mode 100644 index 000000000..1f924521e --- /dev/null +++ b/code/features-convert/stackexchange-xml/build.gradle @@ -0,0 +1,39 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "8.2.2" + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(20)) + } +} + +dependencies { + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation 'org.tukaani:xz:1.8' + implementation project(':code:libraries:blocking-thread-pool') + implementation libs.notnull + + implementation libs.jsoup + implementation libs.sqlite + + implementation libs.guice + implementation libs.guava + implementation libs.zstd + implementation libs.trove + implementation libs.commons.compress + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + maxHeapSize = "8G" + useJUnitPlatform() +} diff --git 
a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/model/StackExchangeComment.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/model/StackExchangeComment.java
new file mode 100644
index 000000000..9d55fec5b
--- /dev/null
+++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/model/StackExchangeComment.java
@@ -0,0 +1,12 @@
+package nu.marginalia.integration.stackexchange.model;
+
+/**
+ * A single comment row from a StackExchange {@code Comments.xml} dump.
+ *
+ * @param id     value of the row's {@code Id} attribute
+ * @param postId value of the row's {@code PostId} attribute
+ * @param text   value of the row's {@code Text} attribute
+ */
+public record StackExchangeComment(int id, int postId, String text) {
+
+}
diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/model/StackExchangePost.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/model/StackExchangePost.java
new file mode 100644
index 000000000..4c04e0070
--- /dev/null
+++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/model/StackExchangePost.java
@@ -0,0 +1,27 @@
+package nu.marginalia.integration.stackexchange.model;
+
+import javax.annotation.Nullable;
+import java.util.List;
+
+/**
+ * A single post row from a StackExchange {@code Posts.xml} dump.
+ *
+ * @param title      post title, or {@code null} when the row has no {@code Title} attribute
+ * @param tags       tag names of the post, empty when the row has no {@code Tags} attribute
+ * @param year       first four characters of {@code CreationDate}, parsed as an integer
+ * @param id         value of the row's {@code Id} attribute
+ * @param parentId   value of {@code ParentId}, or {@code null} when absent
+ * @param postTypeId value of the row's {@code PostTypeId} attribute
+ * @param body       value of the row's {@code Body} attribute
+ */
+public record StackExchangePost(@Nullable
+                                String title,
+                                List<String> tags,
+                                int year,
+                                int id,
+                                @Nullable Integer parentId,
+                                int postTypeId,
+                                String body)
+{
+
+}
diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java
new file mode 100644
index 000000000..8fdbabaf2
--- /dev/null
+++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java
@@ -0,0 +1,213 @@
+package nu.marginalia.integration.stackexchange.sqlite;
+
+import com.github.luben.zstd.Zstd;
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
+import gnu.trove.map.TIntIntMap;
+import gnu.trove.map.hash.TIntIntHashMap;
+import lombok.SneakyThrows;
+import nu.marginalia.integration.stackexchange.xml.StackExchangeXmlPostReader;
+import org.apache.commons.compress.compressors.zstandard.ZstdUtils;
+
+import javax.xml.stream.XMLStreamException;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.DriverManager;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.Future;
+import java.util.function.Predicate;
+
+/**
+ * Builds and reads a SQLite database of StackExchange posts, with bodies stored zstd-compressed.
+ */
+public class StackExchangePostsDb {
+
+    /**
+     * Creates a fresh database at {@code sqliteFile} from the posts in a StackExchange 7z dump.
+     * Any existing file at that path is deleted first.
+     * <p>
+     * {@link IOException}, {@link SQLException} and {@link XMLStreamException} are printed to
+     * stderr rather than propagated.
+     *
+     * @param sqliteFile          database file to (re)create
+     * @param stackExchange7zFile archive containing {@code Posts.xml}
+     * @throws IllegalStateException if the schema resource {@code db/stackexchange.sql} is missing
+     */
+    @SneakyThrows
+    public static void create(Path sqliteFile,
+                              Path stackExchange7zFile) {
+        Files.deleteIfExists(sqliteFile);
+        String connStr = "jdbc:sqlite:" + sqliteFile;
+
+        try (var connection = DriverManager.getConnection(connStr);
+             var stream = ClassLoader.getSystemResourceAsStream("db/stackexchange.sql");
+             var updateStmt = connection.createStatement()
+        ) {
+            // getSystemResourceAsStream returns null instead of throwing when the resource is absent
+            if (stream == null)
+                throw new IllegalStateException("Missing classpath resource db/stackexchange.sql");
+
+            var sql = new String(stream.readAllBytes(), StandardCharsets.UTF_8);
+
+            for (var part : sql.split(";")) {
+                if (part.isBlank()) {
+                    continue;
+                }
+                updateStmt.executeUpdate(part);
+            }
+            updateStmt.execute("PRAGMA synchronous = OFF");
+
+            var postReader = new StackExchangeXmlPostReader(
+                    stackExchange7zFile
+            );
+
+            // try-with-resources so the prepared statement is released even if an insert fails
+            try (var insertPost = connection.prepareStatement("""
+                    INSERT INTO post(id, threadId, postYear, title, body, origSize, tags)
+                    VALUES (?, ?, ?, ?, ?, ?, ?)
+                    """)
+            ) {
+                var iter = postReader.iterator();
+
+                int cnt = 0;
+                while (iter.hasNext()) {
+                    var post = iter.next();
+                    insertPost.setInt(1, post.id());
+
+                    // a post without a parent is filed under its own id as thread id
+                    if (post.parentId() == null) insertPost.setInt(2, post.id());
+                    else insertPost.setInt(2, post.parentId());
+
+                    insertPost.setInt(3, post.year());
+
+                    if (post.title() == null)
+                        insertPost.setString(4, "");
+                    else
+                        insertPost.setString(4, post.title());
+
+                    // fixed charset: forEachPost decodes with UTF-8 regardless of platform default
+                    byte[] bodyBytes = post.body().getBytes(StandardCharsets.UTF_8);
+                    insertPost.setBytes(5, Zstd.compress(bodyBytes));
+                    // uncompressed size; forEachPost passes it to Zstd.decompress
+                    insertPost.setInt(6, bodyBytes.length);
+
+                    insertPost.setString(7, String.join(",", post.tags()));
+                    insertPost.addBatch();
+                    if (++cnt > 100) {
+                        insertPost.executeBatch();
+                        cnt = 0;
+                    }
+                }
+                if (cnt > 0) {
+                    insertPost.executeBatch();
+                }
+            }
+        }
+        catch (IOException | SQLException | XMLStreamException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Iterates over all threads in the database, handing each one to {@code consumer} as a
+     * {@link CombinedPostModel} until the consumer returns {@code false}.
+     * <p>
+     * {@link SQLException} is printed to stderr rather than propagated.
+     *
+     * @param sqliteFile database previously written by {@link #create(Path, Path)}
+     * @param consumer   receives each thread; return {@code false} to stop early
+     */
+    @SneakyThrows
+    public static void forEachPost(
+            Path sqliteFile,
+            Predicate<CombinedPostModel> consumer) {
+
+        String connStr = "jdbc:sqlite:" + sqliteFile;
+
+        try (var connection = DriverManager.getConnection(connStr);
+             var selectThreadIds = connection.prepareStatement("SELECT DISTINCT(threadId) FROM post");
+             var queryPostContents = connection.prepareStatement("""
+                     SELECT postYear, title, body, origSize, tags
+                     FROM post
+                     WHERE threadId = ?
+                     """)
+        ) {
+            TIntList threadIds = new TIntArrayList(10_000);
+            try (ResultSet rs = selectThreadIds.executeQuery()) {
+                while (rs.next()) {
+                    threadIds.add(rs.getInt(1));
+                }
+            }
+
+            System.out.println("Got " + threadIds.size() + " IDs");
+
+            var idIterator = threadIds.iterator();
+            var commonPool = ForkJoinPool.commonPool();
+            int ordinal = 0;
+
+            while (idIterator.hasNext()) {
+                queryPostContents.setInt(1, idIterator.next());
+
+                String title = "";
+                // Earliest post year in the thread.  Seeded with MAX_VALUE rather than a
+                // hard-coded calendar year so newer dumps are not clamped.
+                int year = Integer.MAX_VALUE;
+
+                List<Future<String>> partWork = new ArrayList<>();
+                try (ResultSet rs = queryPostContents.executeQuery()) {
+                    while (rs.next()) {
+                        String maybeTitle = rs.getString("title");
+
+                        if (maybeTitle != null && !maybeTitle.isBlank())
+                            title = maybeTitle;
+                        int origSize = rs.getInt("origSize");
+
+                        year = Math.min(year, rs.getInt("postYear"));
+
+                        byte[] bytes = rs.getBytes("body");
+                        // decompression is submitted to the common pool and collected below
+                        partWork.add(commonPool.submit(
+                                () -> new String(Zstd.decompress(bytes, origSize), StandardCharsets.UTF_8)
+                        ));
+                    }
+                }
+
+                List<String> parts = new ArrayList<>(partWork.size());
+                for (var workItem : partWork) {
+                    parts.add(workItem.get());
+                }
+
+                if (!consumer.test(new CombinedPostModel(ordinal++, title, year, parts)))
+                    break;
+            }
+
+        }
+        catch (SQLException ex) {
+            ex.printStackTrace();
+        }
+
+    }
+
+    /**
+     * All posts of one thread merged into a single model.
+     *
+     * @param ordinal zero-based position of the thread in iteration order
+     * @param title   last non-blank title seen among the thread's rows, or {@code ""}
+     * @param year    smallest {@code postYear} among the thread's rows
+     * @param bodies  decompressed bodies of the thread's posts
+     */
+    public record CombinedPostModel(int ordinal,
+                                    String title,
+                                    int year,
+                                    List<String> bodies)
+    {
+
+    }
+
+}
diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchange7zXmlEventReaderSource.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchange7zXmlEventReaderSource.java
new file mode 100644
index 000000000..36bac843f
--- /dev/null
+++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchange7zXmlEventReaderSource.java
@@ -0,0 +1,88 @@
+package nu.marginalia.integration.stackexchange.xml;
+
+import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry;
+import org.apache.commons.compress.archivers.sevenz.SevenZFile;
+
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.file.Path;
+
+/**
+ * An {@link XmlEventReaderSource} that reads one named XML entry out of a 7z archive.
+ */
+public class StackExchange7zXmlEventReaderSource implements XmlEventReaderSource {
+
+    static {
+        // We need to set this for SAX reasons.  Something to do with reading
+        // XML files with more than 50,000,000 entities being forbidden to enhance
+        // security somehow.  Since we're using STAX, these aren't
+        // software-configurable.
+        System.setProperty("jdk.xml.totalEntitySizeLimit", "0");
+    }
+
+    private final XMLEventReader reader;
+    private final SevenZFile postsFile;
+
+    /**
+     * Opens the archive and positions an event reader on the requested entry.
+     *
+     * @param pathTo7zFile the 7z archive to open
+     * @param xmlFileName  exact entry name to read, e.g. {@code "Posts.xml"}
+     * @throws FileNotFoundException if the archive has no entry with that name
+     * @throws IOException           if the archive cannot be read
+     * @throws XMLStreamException    if the XML reader cannot be created
+     */
+    public StackExchange7zXmlEventReaderSource(Path pathTo7zFile, String xmlFileName)
+            throws IOException, XMLStreamException
+    {
+        postsFile = new SevenZFile(pathTo7zFile.toFile());
+
+        // Everything after opening the archive is guarded so that a failure
+        // does not leak the open file handle.
+        try {
+            SevenZArchiveEntry postsEntry = null;
+
+            for (SevenZArchiveEntry entry : postsFile.getEntries()) {
+                if (xmlFileName.equals(entry.getName())) {
+                    postsEntry = entry;
+                    break;
+                }
+            }
+
+            if (postsEntry == null) {
+                throw new FileNotFoundException("No " + xmlFileName + " in provided archive");
+            }
+
+            XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
+            reader = xmlInputFactory.createXMLEventReader(postsFile.getInputStream(postsEntry));
+        }
+        catch (IOException | XMLStreamException | RuntimeException ex) {
+            try {
+                postsFile.close();
+            }
+            catch (IOException closeEx) {
+                ex.addSuppressed(closeEx);
+            }
+            throw ex;
+        }
+    }
+
+    @Override
+    public XMLEventReader reader() {
+        return reader;
+    }
+
+    /** Closes the XML reader and then the archive; the archive is closed even if the reader fails. */
+    @Override
+    public void close() throws Exception {
+        try {
+            reader.close();
+        }
+        finally {
+            postsFile.close();
+        }
+    }
+}
diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReader.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReader.java
new file mode 100644
index 000000000..6225ecec4
--- /dev/null
+++ 
b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReader.java
@@ -0,0 +1,73 @@
+package nu.marginalia.integration.stackexchange.xml;
+
+import nu.marginalia.integration.stackexchange.model.StackExchangeComment;
+
+import javax.xml.namespace.QName;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.events.XMLEvent;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.Iterator;
+
+/**
+ * Reads {@link StackExchangeComment}s from the {@code Comments.xml} entry of a StackExchange 7z dump.
+ */
+public class StackExchangeXmlCommentReader {
+    private final Path pathTo7zFile;
+
+    public StackExchangeXmlCommentReader(Path pathTo7zFile) {
+        this.pathTo7zFile = pathTo7zFile;
+    }
+
+    /**
+     * Opens the archive and returns an iterator over its comments.
+     *
+     * @throws IOException        if the archive cannot be opened or lacks {@code Comments.xml}
+     * @throws XMLStreamException if the XML reader cannot be created
+     */
+    public Iterator<StackExchangeComment> iterator() throws IOException, XMLStreamException {
+        return iterator(new StackExchange7zXmlEventReaderSource(pathTo7zFile, "Comments.xml"));
+    }
+
+    // exposed for testability
+    static Iterator<StackExchangeComment> iterator(XmlEventReaderSource source) {
+        return new StackExchangeXmlIterator<>(source, StackExchangeXmlCommentReader::parseEvent);
+    }
+
+    private static final QName idName = new QName("Id");
+    private static final QName postIdName = new QName("PostId");
+    private static final QName textName = new QName("Text");
+
+    /**
+     * Converts a {@code <row>} start element into a comment.
+     *
+     * @return the parsed comment, or {@code null} if the event is not a {@code row} start
+     *         element or lacks any of the {@code PostId}, {@code Id} or {@code Text} attributes
+     */
+    private static StackExchangeComment parseEvent(XMLEvent event) {
+        if (!event.isStartElement())
+            return null;
+
+        var startEvent = event.asStartElement();
+        if (!"row".equals(startEvent.getName().getLocalPart()))
+            return null;
+
+        var postIdAttribute = startEvent.getAttributeByName(postIdName);
+        if (postIdAttribute == null)
+            return null;
+        int postId = Integer.parseInt(postIdAttribute.getValue());
+
+        var commentIdAttribute = startEvent.getAttributeByName(idName);
+        if (commentIdAttribute == null)
+            return null;
+        int commentId = Integer.parseInt(commentIdAttribute.getValue());
+
+        var textAttribute = startEvent.getAttributeByName(textName);
+        if (textAttribute == null)
+            return null;
+        String text = textAttribute.getValue();
+
+        return new StackExchangeComment(commentId, postId, text);
+    }
+
+}
diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlIterator.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlIterator.java
new file mode 100644
index 000000000..5e22719eb
--- /dev/null
+++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlIterator.java
@@ -0,0 +1,77 @@
+package nu.marginalia.integration.stackexchange.xml;
+
+import lombok.SneakyThrows;
+
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.events.XMLEvent;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.function.Function;
+
+/**
+ * Lazily turns the start elements of an XML event stream into values of type {@code T}.
+ * <p>
+ * Start elements for which the parser returns {@code null} are skipped.  The underlying
+ * source is closed once the stream has been read to the end.
+ *
+ * @param <T> element type produced by the parser
+ */
+class StackExchangeXmlIterator<T> implements Iterator<T> {
+    private T next = null;
+    private boolean closed = false;
+
+    private final XmlEventReaderSource readerSource;
+    private final XMLEventReader xmlReader;
+    private final Function<XMLEvent, T> parser;
+
+    protected StackExchangeXmlIterator(XmlEventReaderSource readerSource,
+                                       Function<XMLEvent, T> parser
+    ) {
+        this.readerSource = readerSource;
+        this.xmlReader = readerSource.reader();
+        this.parser = parser;
+    }
+
+    @SneakyThrows
+    @Override
+    public boolean hasNext() {
+        if (next != null)
+            return true;
+
+        // Once exhausted, stay exhausted: do not touch the closed reader
+        // or close the source a second time.
+        if (closed)
+            return false;
+
+        while (xmlReader.hasNext()) {
+            XMLEvent event = xmlReader.nextEvent();
+
+            if (!event.isStartElement())
+                continue;
+
+            next = parser.apply(event);
+
+            if (next != null)
+                return true;
+        }
+
+        closed = true;
+        readerSource.close();
+
+        return false;
+    }
+
+    /**
+     * @throws NoSuchElementException if the stream has no further elements
+     */
+    @Override
+    public T next() {
+        if (hasNext()) {
+            var ret = next;
+            next = null;
+            return ret;
+        }
+
+        throw new NoSuchElementException("No more posts");
+    }
+}
diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReader.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReader.java
new file mode 100644
index 000000000..203d68905
--- /dev/null
+++ 
b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReader.java
@@ -0,0 +1,126 @@
+package nu.marginalia.integration.stackexchange.xml;
+
+import nu.marginalia.integration.stackexchange.model.StackExchangePost;
+
+import javax.xml.namespace.QName;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.events.XMLEvent;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Reads {@link StackExchangePost}s from the {@code Posts.xml} entry of a StackExchange 7z dump.
+ */
+public class StackExchangeXmlPostReader {
+    private final Path pathTo7zFile;
+
+    public StackExchangeXmlPostReader(Path pathTo7zFile) {
+        this.pathTo7zFile = pathTo7zFile;
+    }
+
+    /**
+     * Opens the archive and returns an iterator over its posts.
+     *
+     * @throws IOException        if the archive cannot be opened or lacks {@code Posts.xml}
+     * @throws XMLStreamException if the XML reader cannot be created
+     */
+    public Iterator<StackExchangePost> iterator() throws IOException, XMLStreamException {
+        return iterator(new StackExchange7zXmlEventReaderSource(pathTo7zFile, "Posts.xml"));
+    }
+
+    // package-private overload taking an arbitrary event source
+    static Iterator<StackExchangePost> iterator(XmlEventReaderSource source) {
+        return new StackExchangeXmlIterator<>(source, StackExchangeXmlPostReader::parseEvent);
+    }
+
+    private static final QName titleName = new QName("Title");
+    private static final QName idName = new QName("Id");
+    private static final QName bodyName = new QName("Body");
+    private static final QName tagsName = new QName("Tags");
+    private static final QName creationDateName = new QName("CreationDate");
+    private static final QName scoreName = new QName("Score");
+    private static final QName parentIdName = new QName("ParentId");
+    private static final QName postTypeIdName = new QName("PostTypeId");
+
+    /**
+     * Converts a {@code <row>} start element into a post.
+     *
+     * @return the parsed post, or {@code null} if the event is not a {@code row} start element,
+     *         its {@code Score} is below 1, or a required attribute ({@code Score}, {@code Id},
+     *         {@code PostTypeId}, {@code Body}, {@code CreationDate}) is missing
+     */
+    private static StackExchangePost parseEvent(XMLEvent event) {
+        // asStartElement() is only valid on start elements
+        if (!event.isStartElement())
+            return null;
+
+        var startEvent = event.asStartElement();
+        if (!"row".equals(startEvent.getName().getLocalPart()))
+            return null;
+
+        var scoreAttribute = startEvent.getAttributeByName(scoreName);
+        if (scoreAttribute == null)
+            return null;
+        int score = Integer.parseInt(scoreAttribute.getValue());
+        if (score < 1)
+            return null;
+
+        var titleAttribute = startEvent.getAttributeByName(titleName);
+        String title = null;
+        if (titleAttribute != null)
+            title = titleAttribute.getValue();
+
+        var idAttribute = startEvent.getAttributeByName(idName);
+        if (idAttribute == null)
+            return null;
+        int id = Integer.parseInt(idAttribute.getValue());
+
+        var parentIdAttribute = startEvent.getAttributeByName(parentIdName);
+        Integer parentId = null;
+        if (parentIdAttribute != null)
+            parentId = Integer.parseInt(parentIdAttribute.getValue());
+
+        var postTypeIdAttribute = startEvent.getAttributeByName(postTypeIdName);
+        if (postTypeIdAttribute == null)
+            return null;
+        int postTypeId = Integer.parseInt(postTypeIdAttribute.getValue());
+
+        var bodyAttribute = startEvent.getAttributeByName(bodyName);
+        if (bodyAttribute == null)
+            return null;
+        String body = bodyAttribute.getValue();
+
+        var tagsAttribute = startEvent.getAttributeByName(tagsName);
+        List<String> tagsParsed;
+        if (tagsAttribute == null) {
+            tagsParsed = List.of();
+        }
+        else {
+            tagsParsed = parseTags(tagsAttribute.getValue());
+        }
+
+        var creationDateAttribute = startEvent.getAttributeByName(creationDateName);
+        if (creationDateAttribute == null)
+            return null;
+        String creationDate = creationDateAttribute.getValue();
+        // the year is the first four characters; skip rows too short to carry one
+        if (creationDate.length() < 4)
+            return null;
+        int year = Integer.parseInt(creationDate.substring(0, 4));
+
+        return new StackExchangePost(title, tagsParsed, year, id, parentId, postTypeId, body);
+    }
+
+    // splits on runs of '<' and '>' -- presumably tags look like "<a><b>"; confirm against a dump
+    private static final Pattern splitPattern = Pattern.compile("[<>]+");
+    private static List<String> parseTags(String tags) {
+        return Arrays.stream(splitPattern.split(tags))
+                .filter(s -> !s.isBlank())
+                .collect(Collectors.toList());
+    }
+}
diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/XmlEventReaderSource.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/XmlEventReaderSource.java
new file mode 100644
index 000000000..419e0d160
--- /dev/null
+++ 
b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/xml/XmlEventReaderSource.java
@@ -0,0 +1,14 @@
+package nu.marginalia.integration.stackexchange.xml;
+
+import javax.xml.stream.XMLEventReader;
+
+/**
+ * Supplies an {@link XMLEventReader} together with the means to release whatever backs it.
+ * Extends {@link AutoCloseable} so implementations can be used in try-with-resources.
+ */
+public interface XmlEventReaderSource extends AutoCloseable {
+    XMLEventReader reader();
+
+    @Override
+    void close() throws Exception;
+}
diff --git a/code/features-convert/stackexchange-xml/src/main/resources/db/stackexchange.sql b/code/features-convert/stackexchange-xml/src/main/resources/db/stackexchange.sql
new file mode 100644
index 000000000..401fe7a18
--- /dev/null
+++ b/code/features-convert/stackexchange-xml/src/main/resources/db/stackexchange.sql
@@ -0,0 +1,12 @@
+-- id is declared INTEGER PRIMARY KEY so it aliases the rowid
+CREATE TABLE post (
+    id INTEGER PRIMARY KEY,
+    threadId INT NOT NULL,
+    postYear INT NOT NULL,
+    title TEXT,
+    body BLOB NOT NULL,
+    origSize INTEGER NOT NULL,
+    tags TEXT
+);
+
+CREATE INDEX post_threadId ON post(threadId);
\ No newline at end of file
diff --git a/code/features-convert/stackexchange-xml/src/test/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReaderTest.java b/code/features-convert/stackexchange-xml/src/test/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReaderTest.java
new file mode 100644
index 000000000..e315325c9
--- /dev/null
+++ b/code/features-convert/stackexchange-xml/src/test/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReaderTest.java
@@ -0,0 +1,29 @@
+package nu.marginalia.integration.stackexchange.xml;
+
+import org.junit.jupiter.api.Test;
+
+import javax.xml.stream.XMLStreamException;
+
+class StackExchangeXmlCommentReaderTest {
+    @Test
+    public void testSunnyDay() throws XMLStreamException {
+        // NOTE(review): the XML fixture in this text block is blank in this copy of the
+        // patch -- presumably stripped in transit; restore it from the original commit.
+        String xml = """
+
+
+
+
+                """;
+
+        var iter = StackExchangeXmlCommentReader.iterator(
+                new StringXmlTestEventReader(xml)
+        );
+
+        while (iter.hasNext()) {
+            var comment = iter.next();
+            System.out.println(comment);
+        }
+    }
+
+}
\ No newline at end of file
diff --git 
a/code/features-convert/stackexchange-xml/src/test/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReaderTest.java b/code/features-convert/stackexchange-xml/src/test/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReaderTest.java
new file mode 100644
index 000000000..208bbca20
--- /dev/null
+++ b/code/features-convert/stackexchange-xml/src/test/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReaderTest.java
@@ -0,0 +1,29 @@
+package nu.marginalia.integration.stackexchange.xml;
+
+import org.junit.jupiter.api.Test;
+
+import javax.xml.stream.XMLStreamException;
+
+class StackExchangeXmlPostReaderTest { // smoke test: parses an inline XML string and prints each post
+    @Test
+    public void testSunnyDay() throws XMLStreamException {
+        String xml = """
+
+
+
+
+
+                """; // NOTE(review): fixture is blank in this copy of the patch -- presumably stripped; confirm against the commit
+
+        var iter = StackExchangeXmlPostReader.iterator(
+                new StringXmlTestEventReader(xml)
+        );
+
+        while (iter.hasNext()) { // no assertions: each post is only printed
+            var post = iter.next();
+            System.out.println(post);
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/code/features-convert/stackexchange-xml/src/test/java/nu/marginalia/integration/stackexchange/xml/StringXmlTestEventReader.java b/code/features-convert/stackexchange-xml/src/test/java/nu/marginalia/integration/stackexchange/xml/StringXmlTestEventReader.java
new file mode 100644
index 000000000..54a158294
--- /dev/null
+++ b/code/features-convert/stackexchange-xml/src/test/java/nu/marginalia/integration/stackexchange/xml/StringXmlTestEventReader.java
@@ -0,0 +1,28 @@
+package nu.marginalia.integration.stackexchange.xml;
+
+import nu.marginalia.integration.stackexchange.xml.XmlEventReaderSource;
+
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import java.io.StringReader;
+
+class StringXmlTestEventReader implements XmlEventReaderSource { // test double: serves XML events from an in-memory string
+    private final XMLEventReader reader;
+
+    public StringXmlTestEventReader(String xml) throws XMLStreamException {
+        reader = XMLInputFactory.newInstance().createXMLEventReader(
+                new StringReader(xml)
+        );
+    }
+
+    @Override
+    public XMLEventReader reader() {
+        return reader;
+    }
+
+    @Override
+    public void close() throws Exception { // only the XML reader is closed here
+        reader.close();
+    }
+}
diff --git a/settings.gradle b/settings.gradle
index 80887d3d9..b7e49ae8a 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -29,6 +29,7 @@ include 'code:features-search:query-parser'
 include 'code:features-search:result-ranking'
 
 include 'code:features-convert:adblock'
+include 'code:features-convert:stackexchange-xml'
 include 'code:features-convert:pubdate'
 include 'code:features-convert:summary-extraction'
 include 'code:features-convert:keyword-extraction'