Java port for Ve.parse(). MIT licensed.

Kimtaro · Jun 13, 2017 · 6ad88bd · 6ad88bd
1 parent 6618268
commit 6ad88bd
Show file tree

Hide file tree

Showing 9 changed files with 928 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,5 @@
 *.gem
 .rvmrc
 
+java/target/classes
+java/target/test-classes
diff --git a/Parse.java b/Parse.java
diff --git a/java/pom.xml b/java/pom.xml
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>uk.co.birchlabs.ve</groupId>
+    <artifactId>ve</artifactId> <!-- Maven build for the Java sources under java/src (package ve) -->
+    <version>1.0-SNAPSHOT</version>
+
+    <name>main source</name>
+    <url>http://maven.apache.org</url>
+
+    <repositories>
+        <repository>
+            <id>Atilika Open Source repository</id>
+            <url>http://www.atilika.org/nexus/content/repositories/atilika</url> <!-- NOTE(review): plain-HTTP repository, presumably the source of the kuromoji dependency below; confirm it is still reachable -->
+        </repository>
+    </repositories>
+
+    <dependencies>
+        <dependency> <!-- tokenizer used by Word and VeTest (org.atilika.kuromoji.Token / Tokenizer) -->
+            <groupId>org.atilika.kuromoji</groupId>
+            <artifactId>kuromoji</artifactId>
+            <version>0.7.7</version>
+            <type>jar</type>
+            <scope>compile</scope>
+        </dependency>
+        <dependency> <!-- test-only: VeTest uses org.junit.Test -->
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.12</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <slf4jVersion>1.7.21</slf4jVersion> <!-- NOTE(review): not referenced anywhere in this POM -->
+        <asyncHttpClientVersion>2.0.6</asyncHttpClientVersion> <!-- NOTE(review): not referenced anywhere in this POM -->
+    </properties>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.5.1</version>
+                <configuration>
+                    <source>1.8</source> <!-- compile as Java 8; Word uses the diamond operator, which needs at least Java 7 -->
+                    <target>1.8</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/java/src/main/java/ve/Grammar.java b/java/src/main/java/ve/Grammar.java
@@ -0,0 +1,10 @@
+package ve;
+
+/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
+ * Released under MIT license (see LICENSE.txt at root of repository).
+ *
+ * Grammatical category that is passed to the Word constructor and stored in
+ * its 'grammar' field.
+ * NOTE(review): presumably mirrors the grammar values of the Ruby Ve project,
+ * with Unassigned meaning "no category set" - confirm against Parse.
+ **/
+public enum Grammar {
+    Auxiliary,
+    Nominal,
+    Unassigned
+}
diff --git a/java/src/main/java/ve/Parse.java b/java/src/main/java/ve/Parse.java
diff --git a/java/src/main/java/ve/Pos.java b/java/src/main/java/ve/Pos.java
@@ -0,0 +1,27 @@
+package ve;
+
+/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
+ * Released under MIT license (see LICENSE.txt at root of repository).
+ *
+ * Based on ve/lib/part_of_speech.rb.
+ *
+ * Part-of-speech tag carried by a Word: it is stored in Word's
+ * 'part_of_speech' field and read/written through getPart_of_speech() and
+ * setPart_of_speech().
+ * NOTE(review): TBD looks like a placeholder for a tag that is decided later
+ * during parsing - confirm against Parse.
+ **/
+public enum Pos {
+        Noun,
+        ProperNoun,
+        Pronoun,
+        Adjective,
+        Adverb,
+        Determiner,
+        Preposition,
+        Postposition,
+        Verb,
+        Suffix,
+        Prefix,
+        Conjunction,
+        Interjection,
+        Number,
+        Unknown,
+        Symbol,
+        Other,
+        TBD
+}
diff --git a/java/src/main/java/ve/Word.java b/java/src/main/java/ve/Word.java
@@ -0,0 +1,104 @@
+package ve;
+
+import org.atilika.kuromoji.Token;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
+ * Released under MIT license (see LICENSE.txt at root of repository).
+ *
+ * A Word is composed of one or more Tokens, as stored in an internal List.
+ * It also has various fields like 'reading' and 'transcription', which may
+ * build up as extra Tokens are added to the list.
+ * Words are identified and built up by this project's Parse.words() method.
+ *
+ * Instances are mutable: the text fields grow through the appendTo*() methods
+ * and the token list returned by getTokens() is the live internal list.
+ **/
+public class Word {
+    /** Prefix used when a text field is still null at the time something is appended to it. */
+    private static final String NULL_PLACEHOLDER = "_";
+
+//    These five seem underdeveloped and underpopulated:
+    private String reading;
+    private String transcription;
+    private Grammar grammar;
+//    private String reading_script;
+//    private String transcription_script;
+    private String lemma; // "聞く"
+    private Pos part_of_speech; // eg. Pos.Noun
+    private List<Token> tokens = new ArrayList<>(); // those which were eaten up by this one word: {聞か, せ, られ}
+    private String word; // "聞かせられ"
+
+    /**
+     * Creates a Word from the first Token it is made of.
+     * Incoming variables are named in the style of Sen; fields are named in the style of Ve.
+     * The accessor names quoted below are Sen's. NOTE(review): the Token type imported by this class is
+     * Kuromoji's (org.atilika.kuromoji.Token), where the closest equivalents appear to be getReading(),
+     * getBaseForm() and getSurfaceForm() - confirm against the caller in Parse.
+     *
+     * @param read - reading of the first token (Sen: token.getReading()); stored as 'reading'.
+     * @param pronunciation - pronunciation of the first token (Sen: token.getPronunciation()); stored as
+     *              'transcription'.
+     * @param grammar - this is an underdeveloped enum-like variable originating from Ve.
+     * @param basic - dictionary form of the first token (Sen: token.getBasicString()); stored as 'lemma'.
+     * @param part_of_speech - this is another underdeveloped enum-like variable originating from Ve.
+     * @param nodeStr - surface text of the first token (Sen: token.getNodeStr()); stored as 'word'.
+     * @param token - the Token composing the first part of the Word; it is added to the internal list as-is
+     *              (no null check is performed).
+     */
+    public Word(String read,
+                String pronunciation,
+                Grammar grammar,
+//                String reading_script,
+//                String transcription_script,
+                String basic,
+                Pos part_of_speech,
+                String nodeStr,
+                Token token) {
+        this.reading = read;
+        this.transcription = pronunciation;
+        this.grammar = grammar;
+//        this.reading_script = reading_script;
+//        this.transcription_script = transcription_script;
+        this.lemma = basic;
+        this.part_of_speech = part_of_speech;
+        this.word = nodeStr;
+        tokens.add(token);
+    }
+
+    /** Replaces the part of speech assigned to this Word. */
+    public void setPart_of_speech(Pos part_of_speech) {
+        this.part_of_speech = part_of_speech;
+    }
+
+    /** @return the reading accumulated so far; may be null if none was supplied. */
+    public String getReading() {
+        return reading;
+    }
+
+    /** @return the transcription (pronunciation) accumulated so far; may be null if none was supplied. */
+    public String getTranscription() {
+        return transcription;
+    }
+
+    /** @return the grammar value given at construction; may be null if none was supplied. */
+    public Grammar getGrammar() {
+        return grammar;
+    }
+
+    /** @return the lemma (dictionary form), e.g. "聞く"; may be null if none was supplied. */
+    public String getLemma() {
+        return lemma;
+    }
+
+    /** @return the part of speech currently assigned to this Word. */
+    public Pos getPart_of_speech() {
+        return part_of_speech;
+    }
+
+    /**
+     * @return the internal token list itself (not a copy), so elements added or removed by the caller
+     *         change this Word.
+     */
+    public List<Token> getTokens() {
+        return tokens;
+    }
+
+    /** @return the surface text accumulated so far, e.g. "聞かせられ". */
+    public String getWord() {
+        return word;
+    }
+
+    /**
+     * Appends text to the surface form; a null surface form becomes "_" followed by the suffix.
+     * @param suffix - text to append.
+     * @throws NullPointerException if suffix is null.
+     */
+    public void appendToWord(String suffix) {
+        word = append(word, suffix); // likely won't experience a null word, actually.
+    }
+
+    /**
+     * Appends text to the reading; a null reading becomes "_" followed by the suffix.
+     * @param suffix - text to append.
+     * @throws NullPointerException if suffix is null.
+     */
+    public void appendToReading(String suffix) {
+        reading = append(reading, suffix);
+    }
+
+    /**
+     * Appends text to the transcription; a null transcription becomes "_" followed by the suffix.
+     * @param suffix - text to append.
+     * @throws NullPointerException if suffix is null.
+     */
+    public void appendToTranscription(String suffix) {
+        transcription = append(transcription, suffix);
+    }
+
+    /**
+     * Appends text to the lemma; a null lemma becomes "_" followed by the suffix.
+     * Not sure when this would change.
+     * @param suffix - text to append.
+     * @throws NullPointerException if suffix is null.
+     */
+    public void appendToLemma(String suffix) {
+        lemma = append(lemma, suffix);
+    }
+
+    /** Concatenates suffix onto current, substituting the "_" placeholder when current is null. */
+    private static String append(String current, String suffix) {
+        if (current == null) {
+            return NULL_PLACEHOLDER.concat(suffix);
+        }
+        return current.concat(suffix);
+    }
+
+    /** @return the surface text of this Word, exactly as returned by getWord(). */
+    @Override
+    public String toString() {
+        return word;
+    }
+}
diff --git a/java/src/test/java/ve/VeTest.java b/java/src/test/java/ve/VeTest.java
@@ -0,0 +1,44 @@
+package ve;
+
+import org.atilika.kuromoji.Token;
+import org.atilika.kuromoji.Tokenizer;
+import org.junit.Test;
+
+import java.util.List;
+
+/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
+  * Released under MIT license (see LICENSE.txt at root of repository).
+  *
+  * Smoke test only: it makes no assertions and exists purely to show the console output.
+  **/
+public class VeTest {
+
+    /**
+     * Tokenizes a short sample text with Kuromoji, groups the tokens into Words via Parse.words(),
+     * and prints the resulting list.
+     * Based on https://hondou.homedns.org/pukiwiki/index.php?cmd=read&page=Java%20SEN%20%A4%C7%B7%C1%C2%D6%C1%C7%B2%F2%C0%CF
+     */
+    @Test
+    public void coreUsage() {
+        final String sampleText = "金がなければくよくよします女に振られりゃなきまする\n"
+                + "腹が減ったらおまんま食べて命尽きればあの世行き\n"
+                + "有難や有難や\n";
+
+        // Tokenize with a default-configured Kuromoji tokenizer.
+        final Tokenizer tokenizer = Tokenizer.builder().build();
+        final List<Token> tokenized = tokenizer.tokenize(sampleText);
+
+        // Parse takes an array, so copy the list into one before handing it over.
+        final Token[] tokenArray = tokenized.toArray(new Token[0]);
+        final List<Word> grouped = new Parse(tokenArray).words();
+
+        System.out.println(grouped);
+
+        /*  Prints out:
+            [金, が, なけれ, ば, くよくよ, します, 女に, 振られりゃなき, まする,
+            , 腹, が, 減ったら, お, まんま, 食べ, て, 命, 尽きれ, ば, あの世行き,
+            , 有難, や, 有難, や,
+            ]
+        */
+
+        /* Note: I have found that, depending on the MeCab dictionary/model, POS-tagging of tokens may vary.
+           ie: when tokenizing using net.java.sen:
+               なけれ is labelled as a DOUSHI-JITATSU-*-*.
+           However, when tokenizing using org.atilika.kuromoji:
+               なけれ is labelled as a KEIYOUSHI-JITATSU-*-*.
+        */
+    }
+}
diff --git a/java/ve.iml b/java/ve.iml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4"> <!-- IntelliJ IDEA module descriptor for the Maven project; the library entries below match the dependencies declared in pom.xml -->
+  <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8" inherit-compiler-output="false">
+    <output url="file://$MODULE_DIR$/target/classes" />
+    <output-test url="file://$MODULE_DIR$/target/test-classes" />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
+      <excludeFolder url="file://$MODULE_DIR$/target" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" name="Maven: org.atilika.kuromoji:kuromoji:0.7.7" level="project" />
+    <orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.12" level="project" />
+    <orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.3" level="project" />
+  </component>
+</module>