-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add basic implementation
- Loading branch information
Showing
19 changed files
with
5,570 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Default ignored files | ||
/shelf/ | ||
/.idea/ | ||
/target/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
|
||
<groupId>com.github.nianna</groupId> | ||
<artifactId>hyphenator</artifactId> | ||
<version>1.0.0</version> | ||
|
||
<properties> | ||
<maven.compiler.source>17</maven.compiler.source> | ||
<maven.compiler.target>17</maven.compiler.target> | ||
<junit.version>5.9.2</junit.version> | ||
</properties> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>org.junit.jupiter</groupId> | ||
<artifactId>junit-jupiter-api</artifactId> | ||
<version>${junit.version}</version> | ||
<scope>test</scope> | ||
</dependency> | ||
|
||
<dependency> | ||
<groupId>org.junit.jupiter</groupId> | ||
<artifactId>junit-jupiter-engine</artifactId> | ||
<version>${junit.version}</version> | ||
<scope>test</scope> | ||
</dependency> | ||
</dependencies> | ||
</project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
package com.github.nianna.api; | ||
|
||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
|
||
public record HyphenatedText(List<HyphenatedToken> hyphenatedTokens) { | ||
|
||
public static final String DEFAULT_TOKEN_SEPARATOR = " "; | ||
|
||
public static final String DEFAULT_SYLLABLE_SEPARATOR = "-"; | ||
|
||
public String read() { | ||
return read(DEFAULT_TOKEN_SEPARATOR, DEFAULT_SYLLABLE_SEPARATOR); | ||
} | ||
|
||
public String read(String tokenSeparator, String syllableSeparator) { | ||
return hyphenatedTokens.stream() | ||
.map(token -> token.read(syllableSeparator)) | ||
.collect(Collectors.joining(tokenSeparator)); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package com.github.nianna.api; | ||
|
||
import java.util.List; | ||
|
||
/**
 * A single token together with the positions at which it may be hyphenated.
 *
 * <p>The index list is copied on construction, so mutating the list that was passed in cannot
 * change the output of {@link #read(String)} afterwards.
 *
 * @param token         the original, unmodified token
 * @param hyphenIndexes positions inside {@code token} before which a separator is inserted;
 *                      {@link #read(String)} expects them in ascending order and within
 *                      {@code [0, token.length()]}
 */
public record HyphenatedToken(String token, List<Integer> hyphenIndexes) {

    /**
     * Takes an unmodifiable snapshot of the supplied indexes.
     *
     * @throws NullPointerException if the list or any of its elements is {@code null}
     */
    public HyphenatedToken {
        hyphenIndexes = List.copyOf(hyphenIndexes);
    }

    /**
     * Renders the token with the given separator inserted at every hyphenation index.
     *
     * @param syllableSeparator string inserted at each hyphenation point
     * @return the token split into parts joined by {@code syllableSeparator}
     * @throws IndexOutOfBoundsException if the indexes are not ascending or fall outside the token
     */
    public String read(String syllableSeparator) {
        StringBuilder builder = new StringBuilder();
        int lastIndex = 0;
        for (int index : hyphenIndexes) {
            builder.append(token, lastIndex, index).append(syllableSeparator);
            lastIndex = index;
        }
        // Remainder after the last hyphenation point (a no-op when the last index is the token end).
        builder.append(token, lastIndex, token.length());
        return builder.toString();
    }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package com.github.nianna.api; | ||
|
||
import com.github.nianna.internal.HyphenIndexFinder; | ||
|
||
import java.util.List; | ||
import java.util.regex.Pattern; | ||
import java.util.stream.Stream; | ||
|
||
import static com.github.nianna.internal.Utils.checkArgument; | ||
import static com.github.nianna.internal.Utils.isNotEmpty; | ||
import static java.util.Objects.nonNull; | ||
|
||
public class Hyphenator { | ||
|
||
public static final String DEFAULT_TOKEN_SEPARATOR = " "; | ||
|
||
private final HyphenIndexFinder hyphenIndexFinder; | ||
|
||
private final String tokenSeparatorPattern; | ||
|
||
public Hyphenator(List<String> patterns) { | ||
this(patterns, new HyphenatorProperties()); | ||
} | ||
|
||
public Hyphenator(List<String> patterns, HyphenatorProperties hyphenatorProperties) { | ||
this(patterns, hyphenatorProperties, DEFAULT_TOKEN_SEPARATOR); | ||
} | ||
|
||
public Hyphenator(List<String> patterns, HyphenatorProperties hyphenatorProperties, String tokenSeparator) { | ||
checkArgument(nonNull(hyphenatorProperties), "Properties can not be null"); | ||
hyphenIndexFinder = new HyphenIndexFinder(patterns, hyphenatorProperties); | ||
checkArgument(isNotEmpty(tokenSeparator), "Token separator can not be empty"); | ||
this.tokenSeparatorPattern = Pattern.quote(tokenSeparator); | ||
} | ||
|
||
public HyphenatedText hyphenateText(String text) { | ||
List<HyphenatedToken> hyphenatedTokens = tokenize(text) | ||
.map(this::hyphenateToken) | ||
.toList(); | ||
return new HyphenatedText(hyphenatedTokens); | ||
} | ||
|
||
public HyphenatedToken hyphenateToken(String token) { | ||
List<Integer> hyphenationIndexes = hyphenIndexFinder.findIndexes(token); | ||
return new HyphenatedToken(token, hyphenationIndexes); | ||
} | ||
|
||
private Stream<String> tokenize(String text) { | ||
return Stream.of(text.split(tokenSeparatorPattern)); | ||
} | ||
|
||
} |
34 changes: 34 additions & 0 deletions
34
src/main/java/com/github/nianna/api/HyphenatorProperties.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
package com.github.nianna.api; | ||
|
||
import static com.github.nianna.internal.Utils.checkArgument; | ||
|
||
public class HyphenatorProperties { | ||
|
||
public static int DEFAULT_MIN_PREFIX_LENGTH = 2; | ||
|
||
public static int DEFAULT_MIN_SUFFIX_LENGTH = 2; | ||
|
||
private final int minPrefixLength; | ||
|
||
private final int minSuffixLength; | ||
|
||
public HyphenatorProperties(int minPrefixLength, int minSuffixLength) { | ||
checkArgument(minPrefixLength > 0, "Prefix must be at least 1 character long"); | ||
checkArgument(minSuffixLength > 0, "Suffix must be at least 1 character long"); | ||
this.minPrefixLength = minPrefixLength; | ||
this.minSuffixLength = minSuffixLength; | ||
} | ||
|
||
public HyphenatorProperties() { | ||
this(DEFAULT_MIN_PREFIX_LENGTH, DEFAULT_MIN_SUFFIX_LENGTH); | ||
} | ||
|
||
public int getMinPrefixLength() { | ||
return minPrefixLength; | ||
} | ||
|
||
public int getMinSuffixLength() { | ||
return minSuffixLength; | ||
} | ||
|
||
} |
105 changes: 105 additions & 0 deletions
105
src/main/java/com/github/nianna/internal/HyphenIndexFinder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
package com.github.nianna.internal; | ||
|
||
import com.github.nianna.api.HyphenatorProperties; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Locale; | ||
import java.util.Map; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.Stream; | ||
|
||
import static com.github.nianna.internal.Utils.isAlphabetic; | ||
import static com.github.nianna.internal.Utils.isOdd; | ||
import static java.util.Objects.isNull; | ||
|
||
public class HyphenIndexFinder { | ||
|
||
private final PatternCollection patternCollection; | ||
|
||
private final HyphenatorProperties hyphenatorProperties; | ||
|
||
public HyphenIndexFinder(List<String> patterns, HyphenatorProperties hyphenatorProperties) { | ||
this.patternCollection = new PatternCollection(patterns); | ||
this.hyphenatorProperties = hyphenatorProperties; | ||
} | ||
|
||
public List<Integer> findIndexes(String token) { | ||
int firstLetterIndex = getFirstLetterIndex(token); | ||
int lastLetterIndex = getLastLetterIndex(token, firstLetterIndex); | ||
String actualToken = token.substring(firstLetterIndex, lastLetterIndex + 1); | ||
if (actualToken.isBlank() | !isAlphabetic(actualToken)) { | ||
return List.of(); | ||
} | ||
return doFindIndexes(actualToken) | ||
.map(index -> index + firstLetterIndex) | ||
.toList(); | ||
} | ||
|
||
private Stream<Integer> doFindIndexes(String token) { | ||
String normalizedToken = token.toLowerCase(Locale.ROOT); | ||
int maxPatternLength = patternCollection.getMaxPatternLength(); | ||
Map<Integer, List<String>> matchedPatternsAtIndexes = matchedPatternsAtIndexes(normalizedToken, maxPatternLength); | ||
Map<Integer, Integer> maxPrioritiesAtIndexes = mergePriorities(matchedPatternsAtIndexes); | ||
return getIndexesWithOddPriorities(token, maxPrioritiesAtIndexes); | ||
} | ||
|
||
private int getFirstLetterIndex(String word) { | ||
int firstLetterIndex = 0; | ||
while (firstLetterIndex < word.length() && !Character.isLetter(word.charAt(firstLetterIndex))) { | ||
firstLetterIndex++; | ||
} | ||
return firstLetterIndex; | ||
} | ||
|
||
private int getLastLetterIndex(String word, int firstLetterIndex) { | ||
int lastLetterIndex = word.length() - 1; | ||
while (lastLetterIndex >= firstLetterIndex && !Character.isLetter(word.charAt(lastLetterIndex))) { | ||
lastLetterIndex--; | ||
} | ||
return lastLetterIndex; | ||
} | ||
|
||
private Map<Integer, List<String>> matchedPatternsAtIndexes(String token, int maxPatternLength) { | ||
Map<Integer, List<String>> result = new HashMap<>(); | ||
for (int i = 0; i < token.length(); i++) { | ||
for (int j = Math.min(i + maxPatternLength, token.length() - 1); j >= i; j--) { | ||
String identifier = token.substring(i, j + 1); | ||
if (patternCollection.hasPattern(identifier)) { | ||
result.compute(i, (key, value) -> append(value, identifier)); | ||
} | ||
if (i == 0 && patternCollection.hasPattern("." + identifier)) { | ||
result.compute(i, (key, value) -> append(value, "." + identifier)); | ||
} | ||
if (j == token.length() - 1 && patternCollection.hasPattern(identifier + ".")) { | ||
result.compute(i, (key, value) -> append(value, identifier + ".")); | ||
} | ||
} | ||
} | ||
return result; | ||
} | ||
|
||
private Map<Integer, Integer> mergePriorities(Map<Integer, List<String>> matchedPatternsAtIndexes) { | ||
return matchedPatternsAtIndexes.entrySet().stream() | ||
.flatMap(entry -> | ||
entry.getValue().stream() | ||
.flatMap(patternCollection::priorities) | ||
.map(priority -> Map.entry(entry.getKey() + priority.index(), priority.value())) | ||
).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, Math::max)); | ||
} | ||
|
||
private Stream<Integer> getIndexesWithOddPriorities(String token, Map<Integer, Integer> maxPrioritiesAtIndexes) { | ||
return maxPrioritiesAtIndexes.entrySet().stream() | ||
.filter(entry -> isOdd(entry.getValue())) | ||
.map(Map.Entry::getKey) | ||
.filter(index -> index <= token.length() - hyphenatorProperties.getMinSuffixLength()) | ||
.filter(index -> index >= hyphenatorProperties.getMinPrefixLength()); | ||
} | ||
|
||
private List<String> append(List<String> collector, String newValue) { | ||
collector = isNull(collector) ? new ArrayList<>() : collector; | ||
collector.add(newValue); | ||
return collector; | ||
} | ||
} |
56 changes: 56 additions & 0 deletions
56
src/main/java/com/github/nianna/internal/PatternCollection.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
package com.github.nianna.internal; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Comparator; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.Stream; | ||
|
||
class PatternCollection { | ||
|
||
private final Map<String, List<Priority>> parsedPatterns; | ||
|
||
private final int maxPatternLength; | ||
|
||
PatternCollection(List<String> patterns) { | ||
parsedPatterns = patterns.stream() | ||
.map(this::parsePattern) | ||
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); | ||
this.maxPatternLength = parsedPatterns.keySet().stream() | ||
.map(String::length) | ||
.max(Comparator.naturalOrder()) | ||
.orElse(0); | ||
} | ||
|
||
int getMaxPatternLength() { | ||
return maxPatternLength; | ||
} | ||
|
||
List<Priority> getPriorities(String identifier) { | ||
return parsedPatterns.get(identifier); | ||
} | ||
|
||
Stream<Priority> priorities(String identifier) { | ||
return Stream.ofNullable(parsedPatterns.get(identifier)) | ||
.flatMap(List::stream); | ||
} | ||
|
||
boolean hasPattern(String identifier) { | ||
return parsedPatterns.containsKey(identifier); | ||
} | ||
|
||
private Map.Entry<String, List<Priority>> parsePattern(String pattern) { | ||
boolean isLeadPattern = pattern.startsWith("."); | ||
List<Priority> patternPriorities = new ArrayList<>(); | ||
for (int i = 0; i < pattern.length(); i++) { | ||
if (Character.isDigit(pattern.charAt(i))) { | ||
int index = i - patternPriorities.size() - (isLeadPattern ? 1 : 0); | ||
patternPriorities.add(new Priority(index, Character.getNumericValue(pattern.charAt(i)))); | ||
} | ||
} | ||
String identifier = pattern.replaceAll("[0-9]", ""); | ||
return Map.entry(identifier, patternPriorities); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
package com.github.nianna.internal; | ||
|
||
/**
 * A priority digit taken from a hyphenation pattern.
 *
 * @param index offset of the priority, counted in non-digit characters of the pattern
 *              (PatternCollection subtracts one for patterns starting with '.')
 * @param value numeric value of the digit
 */
record Priority(int index, int value) { | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
package com.github.nianna.internal; | ||
|
||
import static java.util.Objects.nonNull; | ||
|
||
/**
 * Static helper methods shared by the hyphenator classes. Not instantiable.
 */
public class Utils {

    private Utils() {
    }

    /**
     * @param input text to inspect
     * @return {@code true} when every code point of {@code input} is alphabetic
     *         (vacuously {@code true} for an empty string)
     */
    static boolean isAlphabetic(String input) {
        return input.codePoints().noneMatch(codePoint -> !Character.isAlphabetic(codePoint));
    }

    /**
     * @param value number to test
     * @return {@code true} when {@code value} is not divisible by two
     */
    static boolean isOdd(Integer value) {
        // The lowest bit is set for every odd number, negative ones included.
        return (value & 1) == 1;
    }

    /**
     * Fails with an {@link IllegalArgumentException} when a precondition does not hold.
     *
     * @param expression   the precondition
     * @param errorMessage message of the exception thrown when {@code expression} is {@code false}
     */
    public static void checkArgument(boolean expression, String errorMessage) {
        if (expression) {
            return;
        }
        throw new IllegalArgumentException(errorMessage);
    }

    /**
     * @param string text to inspect, may be {@code null}
     * @return {@code true} when {@code string} is neither {@code null} nor zero-length
     */
    public static boolean isNotEmpty(String string) {
        if (string == null) {
            return false;
        }
        return string.length() > 0;
    }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
// Module descriptor: only the public API package is exported;
// com.github.nianna.internal has no exports directive and stays inaccessible to other modules.
module nianna.hyphenator { | ||
exports com.github.nianna.api; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package com.github.nianna; | ||
|
||
import java.io.IOException; | ||
import java.net.URISyntaxException; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.List; | ||
|
||
/**
 * Helpers shared by the tests.
 */
public class TestUtil {

    private static final String PL_PATTERNS_RESOURCE = "/hyph_pl_PL.dic";

    /**
     * Reads the Polish pattern file from the test classpath, one list element per line.
     *
     * @return lines of {@code /hyph_pl_PL.dic}
     * @throws IllegalStateException if the resource is not present on the classpath
     * @throws RuntimeException      if the resource cannot be located as a file or read
     */
    public static List<String> loadPlPatterns() {
        var resource = TestUtil.class.getResource(PL_PATTERNS_RESOURCE);
        if (resource == null) {
            // Fail with a descriptive message instead of a bare NullPointerException.
            throw new IllegalStateException("Test resource not found on classpath: " + PL_PATTERNS_RESOURCE);
        }
        try {
            return Files.readAllLines(Path.of(resource.toURI()));
        } catch (IOException | URISyntaxException e) {
            throw new RuntimeException("Could not read test resource " + PL_PATTERNS_RESOURCE, e);
        }
    }

}
Oops, something went wrong.