Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Java port for Ve.parse(). MIT licensed.
- Loading branch information
Showing
9 changed files
with
928 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,3 +3,5 @@ | |
*.gem | ||
.rvmrc | ||
|
||
java/target/classes | ||
java/target/test-classes |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
|
||
<groupId>uk.co.birchlabs.ve</groupId> | ||
<artifactId>ve</artifactId> | ||
<version>1.0-SNAPSHOT</version> | ||
|
||
<name>main source</name> | ||
<url>http://maven.apache.org</url> | ||
|
||
<repositories> | ||
<repository> | ||
<id>Atilika Open Source repository</id> | ||
<url>http://www.atilika.org/nexus/content/repositories/atilika</url> | ||
</repository> | ||
</repositories> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>org.atilika.kuromoji</groupId> | ||
<artifactId>kuromoji</artifactId> | ||
<version>0.7.7</version> | ||
<type>jar</type> | ||
<scope>compile</scope> | ||
</dependency> | ||
<dependency> | ||
<groupId>junit</groupId> | ||
<artifactId>junit</artifactId> | ||
<version>4.12</version> | ||
<scope>test</scope> | ||
</dependency> | ||
</dependencies> | ||
|
||
<properties> | ||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | ||
<slf4jVersion>1.7.21</slf4jVersion> | ||
<asyncHttpClientVersion>2.0.6</asyncHttpClientVersion> | ||
</properties> | ||
|
||
<build> | ||
<plugins> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-compiler-plugin</artifactId> | ||
<version>3.5.1</version> | ||
<configuration> | ||
<source>1.8</source> | ||
<target>1.8</target> | ||
</configuration> | ||
</plugin> | ||
</plugins> | ||
</build> | ||
</project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
package ve; | ||
|
||
/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse | ||
* Released under MIT license (see LICENSE.txt at root of repository). | ||
**/ | ||
// Grammar tag carried by a Word (see the 'grammar' field of Word).
// Declaration order is kept as-is so that ordinal() values do not shift.
public enum Grammar { Auxiliary, Nominal, Unassigned }
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
package ve; | ||
|
||
/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse | ||
* Released under MIT license (see LICENSE.txt at root of repository). | ||
* | ||
* Based on ve/lib/part_of_speech.rb. | ||
**/ | ||
// Part-of-speech tags assignable to a Word.
// The constants are laid out in rows purely for readability; their declaration
// order (and therefore ordinal()) is unchanged.
public enum Pos {
    // Nominals
    Noun, ProperNoun, Pronoun,

    // Modifiers
    Adjective, Adverb, Determiner,

    // Adpositions
    Preposition, Postposition,

    Verb,

    // Affixes
    Suffix, Prefix,

    Conjunction, Interjection, Number,

    // Catch-all / not-yet-classified tags
    Unknown, Symbol, Other, TBD
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
package ve; | ||
|
||
import org.atilika.kuromoji.Token; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse | ||
* Released under MIT license (see LICENSE.txt at root of repository). | ||
* | ||
* A Word is composed of one or more Tokens, as stored in an internal List. | ||
* It also has various fields like 'reading' and 'transcription', which may | ||
* build up as extra Tokens are added to the list. | ||
* Words are identified and built up by this project's Parse.words() method. | ||
**/ | ||
public class Word { | ||
// These five seem underdeveloped and underpopulated: | ||
private String reading; | ||
private String transcription; | ||
private Grammar grammar; | ||
// private String reading_script; | ||
// private String transcription_script; | ||
private String lemma; // "聞く" | ||
private Pos part_of_speech; // eg. Pos.Noun | ||
private List<Token> tokens = new ArrayList<>(); // those which were eaten up by this one word: {聞か, せ, られ} | ||
private String word; // "聞かせられ" | ||
|
||
/** | ||
* Incoming variables are named in the style of Sen; fields are named in the style of Ve. | ||
* @param read - call token.getReading(). | ||
* @param pronunciation - call token.getPronunciation(). | ||
* @param grammar - this is an underdeveloped enum-like variable originating from Ve. | ||
* @param basic - call token.getBasicString(). | ||
* @param part_of_speech - this is another underdeveloped enum-like variable originating from Ve. | ||
* @param nodeStr - call token.getNodeStr(). | ||
* @param token - pass in a Token composing part of the Word. Currently expects the Token to come from Sen, but could | ||
* be simply adapted to come from Kuromoji. | ||
*/ | ||
public Word(String read, | ||
String pronunciation, | ||
Grammar grammar, | ||
// String reading_script, | ||
// String transcription_script, | ||
String basic, | ||
Pos part_of_speech, | ||
String nodeStr, | ||
Token token) { | ||
this.reading = read; | ||
this.transcription = pronunciation; | ||
this.grammar = grammar; | ||
// this.reading_script = reading_script; | ||
// this.transcription_script = transcription_script; | ||
this.lemma = basic; | ||
this.part_of_speech = part_of_speech; | ||
this.word = nodeStr; | ||
tokens.add(token); | ||
} | ||
|
||
public void setPart_of_speech(Pos part_of_speech) { | ||
this.part_of_speech = part_of_speech; | ||
} | ||
|
||
public String getLemma() { | ||
return lemma; | ||
} | ||
|
||
public Pos getPart_of_speech() { | ||
return part_of_speech; | ||
} | ||
|
||
public List<Token> getTokens() { | ||
return tokens; | ||
} | ||
|
||
public String getWord() { | ||
return word; | ||
} | ||
|
||
public void appendToWord(String suffix) { | ||
if(word == null) word = "_".concat(suffix); // likely won't experience a null word, actually. | ||
else word = word.concat(suffix); | ||
} | ||
|
||
public void appendToReading(String suffix) { | ||
if(reading == null) reading = "_".concat(suffix); | ||
else reading = reading.concat(suffix); | ||
} | ||
|
||
public void appendToTranscription(String suffix) { | ||
if(transcription == null) transcription = "_".concat(suffix); | ||
else transcription = transcription.concat(suffix); | ||
} | ||
|
||
// Not sure when this would change. | ||
public void appendToLemma(String suffix) { | ||
if(lemma == null) lemma = "_".concat(suffix); | ||
else lemma = lemma.concat(suffix); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return word; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
package ve; | ||
|
||
import org.atilika.kuromoji.Token; | ||
import org.atilika.kuromoji.Tokenizer; | ||
import org.junit.Test; | ||
|
||
import java.util.List; | ||
|
||
/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse | ||
* Released under MIT license (see LICENSE.txt at root of repository). | ||
* | ||
* This test exists purely to show the console output; it makes no assertions. | ||
**/ | ||
public class VeTest { | ||
|
||
/** Based on https://hondou.homedns.org/pukiwiki/index.php?cmd=read&page=Java%20SEN%20%A4%C7%B7%C1%C2%D6%C1%C7%B2%F2%C0%CF | ||
*/ | ||
@Test | ||
public void coreUsage() { | ||
String kanji = "金がなければくよくよします女に振られりゃなきまする\n" | ||
+ "腹が減ったらおまんま食べて命尽きればあの世行き\n" | ||
+ "有難や有難や\n"; | ||
List<Token> tokensList = Tokenizer.builder().build().tokenize(kanji); | ||
Token[] tokensArray = tokensList.toArray(new Token[tokensList.size()]); | ||
|
||
Parse parser = new Parse(tokensArray); | ||
List<Word> words = parser.words(); | ||
System.out.println(words); | ||
|
||
/* Prints out: | ||
[金, が, なけれ, ば, くよくよ, します, 女に, 振られりゃなき, まする, | ||
, 腹, が, 減ったら, お, まんま, 食べ, て, 命, 尽きれ, ば, あの世行き, | ||
, 有難, や, 有難, や, | ||
] | ||
*/ | ||
|
||
/* Note: I have found that, depending on the MeCab dictionary/model, POS-tagging of tokens may vary. | ||
ie: when tokenizing using net.java.sen: | ||
なけれ is labelled as a DOUSHI-JITATSU-*-*. | ||
However, when tokenizing using org.atilika.kuromoji: | ||
なけれ is labelled as a KEIYOUSHI-JITATSU-*-*. | ||
*/ | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4"> | ||
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8" inherit-compiler-output="false"> | ||
<output url="file://$MODULE_DIR$/target/classes" /> | ||
<output-test url="file://$MODULE_DIR$/target/test-classes" /> | ||
<content url="file://$MODULE_DIR$"> | ||
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" /> | ||
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" /> | ||
<excludeFolder url="file://$MODULE_DIR$/target" /> | ||
</content> | ||
<orderEntry type="inheritedJdk" /> | ||
<orderEntry type="sourceFolder" forTests="false" /> | ||
<orderEntry type="library" name="Maven: org.atilika.kuromoji:kuromoji:0.7.7" level="project" /> | ||
<orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.12" level="project" /> | ||
<orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.3" level="project" /> | ||
</component> | ||
</module> |