Skip to content

Commit

Permalink
Add basic implementation (#1)
Browse files Browse the repository at this point in the history
Add basic implementation
  • Loading branch information
Nianna committed Jul 13, 2023
1 parent b4eb78e commit c4ad709
Show file tree
Hide file tree
Showing 19 changed files with 5,570 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Default ignored files
/shelf/
/.idea/
/target/
32 changes: 32 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.github.nianna</groupId>
    <artifactId>hyphenator</artifactId>
    <version>1.0.0</version>

    <properties>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
        <!-- Pin the encoding so sources and resources are not read with the platform default. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <junit.version>5.9.2</junit.version>
        <surefire.version>3.1.2</surefire.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-api</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-engine</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!--
                JUnit 5 tests are only discovered by Surefire 2.22.0 or newer. Without an explicit
                version the one inherited from the running Maven installation is used, which on
                older Maven releases silently executes no tests at all.
            -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>${surefire.version}</version>
            </plugin>
        </plugins>
    </build>
</project>
22 changes: 22 additions & 0 deletions src/main/java/com/github/nianna/api/HyphenatedText.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package com.github.nianna.api;

import java.util.List;
import java.util.stream.Collectors;

/**
 * Immutable result of hyphenating a text: the hyphenated tokens in the order they were supplied.
 *
 * @param hyphenatedTokens tokens making up the text; copied on construction, so later changes to
 *                         the supplied list are not reflected in this record
 */
public record HyphenatedText(List<HyphenatedToken> hyphenatedTokens) {

    public static final String DEFAULT_TOKEN_SEPARATOR = " ";

    public static final String DEFAULT_SYLLABLE_SEPARATOR = "-";

    /**
     * Takes an unmodifiable snapshot of the supplied tokens.
     *
     * @throws NullPointerException if the list or any of its elements is {@code null}
     */
    public HyphenatedText {
        hyphenatedTokens = List.copyOf(hyphenatedTokens);
    }

    /**
     * Renders the text using {@link #DEFAULT_TOKEN_SEPARATOR} between tokens and
     * {@link #DEFAULT_SYLLABLE_SEPARATOR} between syllables.
     *
     * @return the rendered text
     */
    public String read() {
        return read(DEFAULT_TOKEN_SEPARATOR, DEFAULT_SYLLABLE_SEPARATOR);
    }

    /**
     * Renders the text with custom separators.
     *
     * @param tokenSeparator    string placed between consecutive tokens
     * @param syllableSeparator string inserted at every hyphenation point inside a token
     * @return the rendered text
     */
    public String read(String tokenSeparator, String syllableSeparator) {
        return hyphenatedTokens.stream()
                .map(token -> token.read(syllableSeparator))
                .collect(Collectors.joining(tokenSeparator));
    }

}
21 changes: 21 additions & 0 deletions src/main/java/com/github/nianna/api/HyphenatedToken.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package com.github.nianna.api;

import java.util.List;

/**
 * A single token together with the positions at which it may be hyphenated.
 *
 * @param token         the original, unmodified token
 * @param hyphenIndexes offsets into {@code token} at which a separator is inserted by
 *                      {@link #read(String)}; they are consumed in list order, so they are
 *                      expected to be ascending. The list is copied on construction.
 */
public record HyphenatedToken(String token, List<Integer> hyphenIndexes) {

    /**
     * Takes an unmodifiable snapshot of the indexes so the record cannot be changed from outside.
     *
     * @throws NullPointerException if the index list or any of its elements is {@code null}
     */
    public HyphenatedToken {
        hyphenIndexes = List.copyOf(hyphenIndexes);
    }

    /**
     * Renders the token with the given separator inserted at every hyphenation index.
     *
     * @param syllableSeparator string inserted at each hyphenation point
     * @return the token with separators inserted
     */
    public String read(String syllableSeparator) {
        StringBuilder builder = new StringBuilder();
        int lastIndex = 0;
        for (int index : hyphenIndexes) {
            // Copy the syllable that ends at this hyphenation point, then the separator.
            builder.append(token, lastIndex, index);
            builder.append(syllableSeparator);
            lastIndex = index;
        }
        // Whatever follows the last hyphenation point is the final syllable.
        if (lastIndex < token.length()) {
            builder.append(token, lastIndex, token.length());
        }
        return builder.toString();
    }

}
52 changes: 52 additions & 0 deletions src/main/java/com/github/nianna/api/Hyphenator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package com.github.nianna.api;

import com.github.nianna.internal.HyphenIndexFinder;

import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import static com.github.nianna.internal.Utils.checkArgument;
import static com.github.nianna.internal.Utils.isNotEmpty;
import static java.util.Objects.nonNull;

/**
 * Entry point of the library: splits a text into tokens on a literal separator and computes the
 * hyphenation points of every token from the supplied patterns.
 */
public class Hyphenator {

    public static final String DEFAULT_TOKEN_SEPARATOR = " ";

    private final HyphenIndexFinder hyphenIndexFinder;

    /** Separator compiled once as a literal (quoted) pattern and reused for every text. */
    private final Pattern tokenSeparatorPattern;

    /**
     * Creates a hyphenator with default properties and {@link #DEFAULT_TOKEN_SEPARATOR}.
     *
     * @param patterns hyphenation patterns
     */
    public Hyphenator(List<String> patterns) {
        this(patterns, new HyphenatorProperties());
    }

    /**
     * Creates a hyphenator with custom properties and {@link #DEFAULT_TOKEN_SEPARATOR}.
     *
     * @param patterns             hyphenation patterns
     * @param hyphenatorProperties hyphenation settings, must not be {@code null}
     * @throws IllegalArgumentException if {@code hyphenatorProperties} is {@code null}
     */
    public Hyphenator(List<String> patterns, HyphenatorProperties hyphenatorProperties) {
        this(patterns, hyphenatorProperties, DEFAULT_TOKEN_SEPARATOR);
    }

    /**
     * Creates a fully customised hyphenator.
     *
     * @param patterns             hyphenation patterns
     * @param hyphenatorProperties hyphenation settings, must not be {@code null}
     * @param tokenSeparator       literal string on which texts are split, must not be empty
     * @throws IllegalArgumentException if {@code hyphenatorProperties} is {@code null} or
     *                                  {@code tokenSeparator} is {@code null} or empty
     */
    public Hyphenator(List<String> patterns, HyphenatorProperties hyphenatorProperties, String tokenSeparator) {
        // Validate every argument before the pattern index is built.
        checkArgument(nonNull(hyphenatorProperties), "Properties can not be null");
        checkArgument(isNotEmpty(tokenSeparator), "Token separator can not be empty");
        this.hyphenIndexFinder = new HyphenIndexFinder(patterns, hyphenatorProperties);
        this.tokenSeparatorPattern = Pattern.compile(Pattern.quote(tokenSeparator));
    }

    /**
     * Splits the text on the token separator and hyphenates every resulting token.
     *
     * @param text text to hyphenate
     * @return the hyphenated tokens, in the order they occur in {@code text}
     */
    public HyphenatedText hyphenateText(String text) {
        List<HyphenatedToken> hyphenatedTokens = tokenize(text)
                .map(this::hyphenateToken)
                .toList();
        return new HyphenatedText(hyphenatedTokens);
    }

    /**
     * Hyphenates a single token.
     *
     * @param token token to hyphenate
     * @return the token paired with its hyphenation indexes
     */
    public HyphenatedToken hyphenateToken(String token) {
        List<Integer> hyphenationIndexes = hyphenIndexFinder.findIndexes(token);
        return new HyphenatedToken(token, hyphenationIndexes);
    }

    private Stream<String> tokenize(String text) {
        // Same result as String.split(regex), without recompiling the regex on each call.
        return Stream.of(tokenSeparatorPattern.split(text));
    }

}
34 changes: 34 additions & 0 deletions src/main/java/com/github/nianna/api/HyphenatorProperties.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package com.github.nianna.api;

import static com.github.nianna.internal.Utils.checkArgument;

/**
 * Settings limiting how close to the edges of a token a hyphenation point may be placed.
 */
public class HyphenatorProperties {

    /** Prefix length used by the no-argument constructor. */
    public static final int DEFAULT_MIN_PREFIX_LENGTH = 2;

    /** Suffix length used by the no-argument constructor. */
    public static final int DEFAULT_MIN_SUFFIX_LENGTH = 2;

    private final int minPrefixLength;

    private final int minSuffixLength;

    /**
     * Creates properties with explicit limits.
     *
     * @param minPrefixLength minimum number of characters before the first hyphenation point,
     *                        must be positive
     * @param minSuffixLength minimum number of characters after the last hyphenation point,
     *                        must be positive
     * @throws IllegalArgumentException if either length is smaller than 1
     */
    public HyphenatorProperties(int minPrefixLength, int minSuffixLength) {
        checkArgument(minPrefixLength > 0, "Prefix must be at least 1 character long");
        checkArgument(minSuffixLength > 0, "Suffix must be at least 1 character long");
        this.minPrefixLength = minPrefixLength;
        this.minSuffixLength = minSuffixLength;
    }

    /**
     * Creates properties using {@link #DEFAULT_MIN_PREFIX_LENGTH} and
     * {@link #DEFAULT_MIN_SUFFIX_LENGTH}.
     */
    public HyphenatorProperties() {
        this(DEFAULT_MIN_PREFIX_LENGTH, DEFAULT_MIN_SUFFIX_LENGTH);
    }

    /**
     * @return minimum number of characters before the first hyphenation point
     */
    public int getMinPrefixLength() {
        return minPrefixLength;
    }

    /**
     * @return minimum number of characters after the last hyphenation point
     */
    public int getMinSuffixLength() {
        return minSuffixLength;
    }

}
105 changes: 105 additions & 0 deletions src/main/java/com/github/nianna/internal/HyphenIndexFinder.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package com.github.nianna.internal;

import com.github.nianna.api.HyphenatorProperties;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static com.github.nianna.internal.Utils.isAlphabetic;
import static com.github.nianna.internal.Utils.isOdd;
import static java.util.Objects.isNull;

/**
 * Computes the hyphenation points of a single token by matching it against a collection of
 * prioritised patterns: at every position the highest priority of all matching patterns wins,
 * and positions whose winning priority is odd become hyphenation points.
 */
public class HyphenIndexFinder {

    private final PatternCollection patternCollection;

    private final HyphenatorProperties hyphenatorProperties;

    /**
     * @param patterns             hyphenation patterns to match tokens against
     * @param hyphenatorProperties minimum prefix/suffix lengths applied to the result
     */
    public HyphenIndexFinder(List<String> patterns, HyphenatorProperties hyphenatorProperties) {
        this.patternCollection = new PatternCollection(patterns);
        this.hyphenatorProperties = hyphenatorProperties;
    }

    /**
     * Finds the hyphenation points of a token.
     *
     * <p>Leading and trailing non-letter characters are skipped before matching. If nothing is
     * left, or the remainder contains a non-alphabetic character, the token is not hyphenated.
     *
     * @param token token to analyse
     * @return hyphenation indexes relative to the start of {@code token}, in ascending order;
     *         empty if the token cannot be hyphenated
     */
    public List<Integer> findIndexes(String token) {
        int firstLetterIndex = getFirstLetterIndex(token);
        int lastLetterIndex = getLastLetterIndex(token, firstLetterIndex);
        String actualToken = token.substring(firstLetterIndex, lastLetterIndex + 1);
        if (actualToken.isBlank() || !isAlphabetic(actualToken)) {
            return List.of();
        }
        return doFindIndexes(actualToken)
                // Translate from the trimmed token back to positions in the original token.
                .map(index -> index + firstLetterIndex)
                .toList();
    }

    private Stream<Integer> doFindIndexes(String token) {
        // Matching is done on the lower-cased token.
        String normalizedToken = token.toLowerCase(Locale.ROOT);
        int maxPatternLength = patternCollection.getMaxPatternLength();
        Map<Integer, List<String>> matchedPatternsAtIndexes = matchedPatternsAtIndexes(normalizedToken, maxPatternLength);
        Map<Integer, Integer> maxPrioritiesAtIndexes = mergePriorities(matchedPatternsAtIndexes);
        return getIndexesWithOddPriorities(token, maxPrioritiesAtIndexes);
    }

    /** Returns the index of the first letter, or {@code word.length()} if there is none. */
    private int getFirstLetterIndex(String word) {
        int firstLetterIndex = 0;
        while (firstLetterIndex < word.length() && !Character.isLetter(word.charAt(firstLetterIndex))) {
            firstLetterIndex++;
        }
        return firstLetterIndex;
    }

    /** Returns the index of the last letter, never going below {@code firstLetterIndex - 1}. */
    private int getLastLetterIndex(String word, int firstLetterIndex) {
        int lastLetterIndex = word.length() - 1;
        while (lastLetterIndex >= firstLetterIndex && !Character.isLetter(word.charAt(lastLetterIndex))) {
            lastLetterIndex--;
        }
        return lastLetterIndex;
    }

    /**
     * Collects, for every start position in the token, the identifiers of all patterns that
     * match a substring beginning there. A leading or trailing {@code '.'} in an identifier
     * anchors the pattern to the start or end of the token respectively.
     */
    private Map<Integer, List<String>> matchedPatternsAtIndexes(String token, int maxPatternLength) {
        Map<Integer, List<String>> result = new HashMap<>();
        int lastIndex = token.length() - 1;
        for (int i = 0; i < token.length(); i++) {
            boolean atTokenStart = i == 0;
            // Longest candidates first, bounded by the longest known pattern.
            for (int j = Math.min(i + maxPatternLength, lastIndex); j >= i; j--) {
                boolean atTokenEnd = j == lastIndex;
                String identifier = token.substring(i, j + 1);
                if (patternCollection.hasPattern(identifier)) {
                    result.compute(i, (key, value) -> append(value, identifier));
                }
                if (atTokenStart && patternCollection.hasPattern("." + identifier)) {
                    result.compute(i, (key, value) -> append(value, "." + identifier));
                }
                if (atTokenEnd && patternCollection.hasPattern(identifier + ".")) {
                    result.compute(i, (key, value) -> append(value, identifier + "."));
                }
                // Patterns anchored at both ends only apply to the whole token.
                if (atTokenStart && atTokenEnd && patternCollection.hasPattern("." + identifier + ".")) {
                    result.compute(i, (key, value) -> append(value, "." + identifier + "."));
                }
            }
        }
        return result;
    }

    /**
     * Converts the matched patterns into absolute positions within the token and keeps the
     * highest priority found for every position.
     */
    private Map<Integer, Integer> mergePriorities(Map<Integer, List<String>> matchedPatternsAtIndexes) {
        return matchedPatternsAtIndexes.entrySet().stream()
                .flatMap(entry ->
                        entry.getValue().stream()
                                .flatMap(patternCollection::priorities)
                                .map(priority -> Map.entry(entry.getKey() + priority.index(), priority.value()))
                ).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, Math::max));
    }

    /**
     * Selects the positions with an odd winning priority that respect the configured minimum
     * prefix and suffix lengths.
     */
    private Stream<Integer> getIndexesWithOddPriorities(String token, Map<Integer, Integer> maxPrioritiesAtIndexes) {
        return maxPrioritiesAtIndexes.entrySet().stream()
                .filter(entry -> isOdd(entry.getValue()))
                .map(Map.Entry::getKey)
                .filter(index -> index <= token.length() - hyphenatorProperties.getMinSuffixLength())
                .filter(index -> index >= hyphenatorProperties.getMinPrefixLength())
                // The source map is a HashMap, whose iteration order is unspecified; consumers
                // walk the indexes left to right and need them ascending.
                .sorted();
    }

    private List<String> append(List<String> collector, String newValue) {
        collector = isNull(collector) ? new ArrayList<>() : collector;
        collector.add(newValue);
        return collector;
    }
}
56 changes: 56 additions & 0 deletions src/main/java/com/github/nianna/internal/PatternCollection.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package com.github.nianna.internal;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Lookup table of hyphenation patterns. Every pattern is stored under its identifier (the
 * pattern with all digits removed) together with the priorities encoded by those digits.
 */
class PatternCollection {

    private final Map<String, List<Priority>> parsedPatterns;

    private final int maxPatternLength;

    /**
     * Parses the given patterns. Patterns that share an identifier are combined instead of
     * being rejected, so repeated or blank lines in a pattern file do not abort construction.
     *
     * @param patterns raw patterns: letters with interleaved digits, optionally starting and/or
     *                 ending with {@code '.'}
     */
    PatternCollection(List<String> patterns) {
        parsedPatterns = patterns.stream()
                .map(PatternCollection::parsePattern)
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, PatternCollection::mergePriorities));
        this.maxPatternLength = parsedPatterns.keySet().stream()
                .map(String::length)
                .max(Comparator.naturalOrder())
                .orElse(0);
    }

    /**
     * @return length of the longest identifier, or 0 if there are no patterns
     */
    int getMaxPatternLength() {
        return maxPatternLength;
    }

    /**
     * @param identifier pattern identifier
     * @return priorities stored for the identifier, or {@code null} if it is unknown
     */
    List<Priority> getPriorities(String identifier) {
        return parsedPatterns.get(identifier);
    }

    /**
     * @param identifier pattern identifier
     * @return priorities stored for the identifier; an empty stream if it is unknown
     */
    Stream<Priority> priorities(String identifier) {
        return Stream.ofNullable(parsedPatterns.get(identifier))
                .flatMap(List::stream);
    }

    /**
     * @param identifier pattern identifier
     * @return whether a pattern with this identifier exists
     */
    boolean hasPattern(String identifier) {
        return parsedPatterns.containsKey(identifier);
    }

    /**
     * Splits a raw pattern into its identifier and priorities in a single pass, so that exactly
     * the characters recognised as digits are the ones left out of the identifier.
     */
    private static Map.Entry<String, List<Priority>> parsePattern(String pattern) {
        // A leading '.' is part of the identifier but not of the token, so it does not count
        // towards priority positions.
        int leadOffset = pattern.startsWith(".") ? 1 : 0;
        StringBuilder identifier = new StringBuilder(pattern.length());
        List<Priority> patternPriorities = new ArrayList<>();
        for (int i = 0; i < pattern.length(); i++) {
            char current = pattern.charAt(i);
            if (Character.isDigit(current)) {
                // identifier.length() is the number of non-digit characters seen so far.
                int index = identifier.length() - leadOffset;
                patternPriorities.add(new Priority(index, Character.getNumericValue(current)));
            } else {
                identifier.append(current);
            }
        }
        return Map.entry(identifier.toString(), patternPriorities);
    }

    /** Combines the priorities of two patterns that share the same identifier. */
    private static List<Priority> mergePriorities(List<Priority> first, List<Priority> second) {
        List<Priority> merged = new ArrayList<>(first.size() + second.size());
        merged.addAll(first);
        merged.addAll(second);
        return merged;
    }

}
4 changes: 4 additions & 0 deletions src/main/java/com/github/nianna/internal/Priority.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
package com.github.nianna.internal;

/**
 * A single priority digit taken from a hyphenation pattern.
 *
 * @param index position of the digit within the pattern once digits are removed, not counting
 *              a leading {@code '.'}
 * @param value numeric value of the digit
 */
record Priority(int index, int value) {
}
28 changes: 28 additions & 0 deletions src/main/java/com/github/nianna/internal/Utils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package com.github.nianna.internal;

import static java.util.Objects.nonNull;

/**
 * Small static helpers shared across the library. Not instantiable.
 */
public class Utils {

    private Utils() {
    }

    /**
     * @param input string to inspect
     * @return {@code true} if no code point of {@code input} is non-alphabetic
     */
    static boolean isAlphabetic(String input) {
        return input.codePoints()
                .noneMatch(codePoint -> !Character.isAlphabetic(codePoint));
    }

    /**
     * @param value number to test
     * @return {@code true} if {@code value} is not divisible by two
     */
    static boolean isOdd(Integer value) {
        int remainder = value % 2;
        return remainder != 0;
    }

    /**
     * Guards a precondition.
     *
     * @param expression   condition that must hold
     * @param errorMessage message of the exception thrown when it does not
     * @throws IllegalArgumentException if {@code expression} is {@code false}
     */
    public static void checkArgument(boolean expression, String errorMessage) {
        if (expression) {
            return;
        }
        throw new IllegalArgumentException(errorMessage);
    }

    /**
     * @param string string to inspect, may be {@code null}
     * @return {@code true} if {@code string} is neither {@code null} nor empty
     */
    public static boolean isNotEmpty(String string) {
        if (!nonNull(string)) {
            return false;
        }
        return !string.isEmpty();
    }

}
3 changes: 3 additions & 0 deletions src/main/java/module-info.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/**
 * Module descriptor of the hyphenator library.
 * Only the {@code com.github.nianna.api} package is exported by this descriptor.
 */
module nianna.hyphenator {
exports com.github.nianna.api;
}
21 changes: 21 additions & 0 deletions src/test/java/com/github/nianna/TestUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package com.github.nianna;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

/**
 * Helpers shared by the tests.
 */
public class TestUtil {

    private static final String PL_PATTERNS_RESOURCE = "/hyph_pl_PL.dic";

    /**
     * Loads the lines of the {@code /hyph_pl_PL.dic} classpath resource.
     *
     * @return all lines of the resource, in file order
     * @throws IllegalStateException if the resource is missing from the classpath or cannot
     *                               be read
     */
    public static List<String> loadPlPatterns() {
        var resource = TestUtil.class.getResource(PL_PATTERNS_RESOURCE);
        if (resource == null) {
            // Fail with a message naming the resource instead of an anonymous NullPointerException.
            throw new IllegalStateException("Test resource not found on classpath: " + PL_PATTERNS_RESOURCE);
        }
        try {
            return Files.readAllLines(Path.of(resource.toURI()));
        } catch (IOException | URISyntaxException e) {
            // The cause is preserved in the rethrown exception, so nothing is printed here.
            throw new IllegalStateException("Could not read test resource " + PL_PATTERNS_RESOURCE, e);
        }
    }

}

0 comments on commit c4ad709

Please sign in to comment.