Skip to content

Commit

Permalink
Java port for Ve.parse(). MIT licensed.
Browse files Browse the repository at this point in the history
  • Loading branch information
shirakaba committed Jun 13, 2017
1 parent 6618268 commit 6ad88bd
Show file tree
Hide file tree
Showing 9 changed files with 928 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Expand Up @@ -3,3 +3,5 @@
*.gem
.rvmrc

java/target/classes
java/target/test-classes
332 changes: 332 additions & 0 deletions Parse.java

Large diffs are not rendered by default.

56 changes: 56 additions & 0 deletions java/pom.xml
@@ -0,0 +1,56 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the Java port of Ve: a single artifact compiled for Java 8,
     with Kuromoji as the compile-time dependency and JUnit for tests. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<!-- Coordinates of this artifact. -->
<groupId>uk.co.birchlabs.ve</groupId>
<artifactId>ve</artifactId>
<version>1.0-SNAPSHOT</version>

<name>main source</name>
<url>http://maven.apache.org</url>

<!-- Additional repository; presumably where the org.atilika.kuromoji artifact below is resolved from.
     NOTE(review): this is a plain-HTTP URL. Maven 3.8.1 and later block external HTTP repositories
     by default; confirm whether an HTTPS endpoint is available. -->
<repositories>
<repository>
<id>Atilika Open Source repository</id>
<url>http://www.atilika.org/nexus/content/repositories/atilika</url>
</repository>
</repositories>

<dependencies>
<!-- Tokenizer library; its Token type is imported by Word.java and VeTest.java. -->
<dependency>
<groupId>org.atilika.kuromoji</groupId>
<artifactId>kuromoji</artifactId>
<version>0.7.7</version>
<type>jar</type>
<scope>compile</scope>
</dependency>
<!-- Test-only dependency (used by VeTest). -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!-- NOTE(review): the two version properties below are not referenced anywhere in this POM;
     confirm whether they are still needed. -->
<slf4jVersion>1.7.21</slf4jVersion>
<asyncHttpClientVersion>2.0.6</asyncHttpClientVersion>
</properties>

<build>
<plugins>
<!-- Compile sources and emit bytecode at the Java 8 level. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
10 changes: 10 additions & 0 deletions java/src/main/java/ve/Grammar.java
@@ -0,0 +1,10 @@
package ve;

/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
* Released under MIT license (see LICENSE.txt at root of repository).
**/
public enum Grammar {
    // Grammar tag carried by a Word (passed in through Word's constructor).
    // Declaration order is kept as-is so that ordinals remain stable.
    Auxiliary,
    Nominal,
    // presumably the value used when no specific grammar applies -- confirm against Parse
    Unassigned;
}
336 changes: 336 additions & 0 deletions java/src/main/java/ve/Parse.java

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions java/src/main/java/ve/Pos.java
@@ -0,0 +1,27 @@
package ve;

/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
* Released under MIT license (see LICENSE.txt at root of repository).
*
* Based on ve/lib/part_of_speech.rb.
**/
public enum Pos {
    // Part-of-speech tag assigned to a Word (see Word's part_of_speech field).
    // Declaration order is kept as-is so that ordinals remain stable.

    // Nominals
    Noun,
    ProperNoun,
    Pronoun,

    // Modifiers
    Adjective,
    Adverb,
    Determiner,

    // Adpositions
    Preposition,
    Postposition,

    Verb,

    // Affixes
    Suffix,
    Prefix,

    Conjunction,
    Interjection,
    Number,

    // Catch-alls
    Unknown,
    Symbol,
    Other,
    // presumably a placeholder for a tag that has not been decided yet -- confirm against Parse
    TBD;
}
104 changes: 104 additions & 0 deletions java/src/main/java/ve/Word.java
@@ -0,0 +1,104 @@
package ve;

import org.atilika.kuromoji.Token;

import java.util.ArrayList;
import java.util.List;

/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
* Released under MIT license (see LICENSE.txt at root of repository).
*
* A Word is composed of one or more Tokens, as stored in an internal List.
* It also has various fields like 'reading' and 'transcription', which may
* build up as extra Tokens are added to the list.
* Words are identified and built up by this project's Parse.words() method.
**/
public class Word {
    /** Placeholder that stands in for a field that is still null when text is appended to it. */
    private static final String NULL_PLACEHOLDER = "_";

    // These three seem underdeveloped and underpopulated:
    private String reading;
    private String transcription;
    private Grammar grammar;
    // private String reading_script;
    // private String transcription_script;
    private String lemma; // "聞く"
    private Pos part_of_speech; // eg. Pos.Noun
    private final List<Token> tokens = new ArrayList<>(); // those which were eaten up by this one word: {聞か, せ, られ}
    private String word; // "聞かせられ"

    /**
     * Creates a Word from the first Token that composes it.
     *
     * Incoming variables are named in the style of Sen; fields are named in the style of Ve.
     * NOTE(review): the Token type used here is org.atilika.kuromoji.Token, while the accessor names
     * mentioned below follow Sen's Token API; confirm the matching Kuromoji accessors at the call site.
     *
     * @param read - the token's reading (Sen: token.getReading()); stored as this Word's reading.
     * @param pronunciation - the token's pronunciation (Sen: token.getPronunciation()); stored as the transcription.
     * @param grammar - this is an underdeveloped enum-like variable originating from Ve.
     * @param basic - the token's basic form (Sen: token.getBasicString()); stored as the lemma.
     * @param part_of_speech - this is another underdeveloped enum-like variable originating from Ve.
     * @param nodeStr - the token's surface text (Sen: token.getNodeStr()); stored as the word.
     * @param token - the first Token composing this Word; it is added to the internal token list.
     */
    public Word(String read,
                String pronunciation,
                Grammar grammar,
                // String reading_script,
                // String transcription_script,
                String basic,
                Pos part_of_speech,
                String nodeStr,
                Token token) {
        this.reading = read;
        this.transcription = pronunciation;
        this.grammar = grammar;
        // this.reading_script = reading_script;
        // this.transcription_script = transcription_script;
        this.lemma = basic;
        this.part_of_speech = part_of_speech;
        this.word = nodeStr;
        tokens.add(token);
    }

    /** Replaces this Word's part of speech. */
    public void setPart_of_speech(Pos part_of_speech) {
        this.part_of_speech = part_of_speech;
    }

    /** @return the reading accumulated so far (may be null if none was supplied). */
    public String getReading() {
        return reading;
    }

    /** @return the transcription accumulated so far (may be null if none was supplied). */
    public String getTranscription() {
        return transcription;
    }

    /** @return the grammar tag given at construction. */
    public Grammar getGrammar() {
        return grammar;
    }

    /** @return the lemma, e.g. "聞く". */
    public String getLemma() {
        return lemma;
    }

    /** @return the current part of speech. */
    public Pos getPart_of_speech() {
        return part_of_speech;
    }

    /**
     * @return the live internal list of Tokens composing this Word (not a copy), so callers
     *         can add further Tokens to it.
     */
    public List<Token> getTokens() {
        return tokens;
    }

    /** @return the surface text of this Word, e.g. "聞かせられ". */
    public String getWord() {
        return word;
    }

    /** Appends {@code suffix} to the surface text; a null word is first replaced by "_". */
    public void appendToWord(String suffix) {
        // likely won't experience a null word, actually.
        word = appendOrPlaceholder(word, suffix);
    }

    /** Appends {@code suffix} to the reading; a null reading is first replaced by "_". */
    public void appendToReading(String suffix) {
        reading = appendOrPlaceholder(reading, suffix);
    }

    /** Appends {@code suffix} to the transcription; a null transcription is first replaced by "_". */
    public void appendToTranscription(String suffix) {
        transcription = appendOrPlaceholder(transcription, suffix);
    }

    // Not sure when this would change.
    /** Appends {@code suffix} to the lemma; a null lemma is first replaced by "_". */
    public void appendToLemma(String suffix) {
        lemma = appendOrPlaceholder(lemma, suffix);
    }

    /** Returns the surface text of this Word (same value as {@link #getWord()}). */
    @Override
    public String toString() {
        return word;
    }

    /** Concatenates suffix onto current, substituting the "_" placeholder when current is null. */
    private static String appendOrPlaceholder(String current, String suffix) {
        String base = (current == null) ? NULL_PLACEHOLDER : current;
        // String.concat is kept (rather than '+') so that a null suffix still fails fast.
        return base.concat(suffix);
    }
}
44 changes: 44 additions & 0 deletions java/src/test/java/ve/VeTest.java
@@ -0,0 +1,44 @@
package ve;

import org.atilika.kuromoji.Token;
import org.atilika.kuromoji.Tokenizer;
import org.junit.Test;

import java.util.List;

/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
* Released under MIT license (see LICENSE.txt at root of repository).
*
* This test only prints the parsed words to the console; it makes no assertions, so it always passes.
**/
public class VeTest {

    /**
     * Tokenizes a short Japanese text with Kuromoji, hands the tokens to Parse, and prints the
     * resulting list of Words to standard output.
     *
     * Based on https://hondou.homedns.org/pukiwiki/index.php?cmd=read&page=Java%20SEN%20%A4%C7%B7%C1%C2%D6%C1%C7%B2%F2%C0%CF
     */
    @Test
    public void coreUsage() {
        final String kanji = "金がなければくよくよします女に振られりゃなきまする\n"
                + "腹が減ったらおまんま食べて命尽きればあの世行き\n"
                + "有難や有難や\n";

        // Build a default tokenizer and split the sample text into tokens.
        Tokenizer tokenizer = Tokenizer.builder().build();
        List<Token> tokenized = tokenizer.tokenize(kanji);

        // Parse takes the tokens as an array.
        Token[] asArray = tokenized.toArray(new Token[0]);
        Parse parse = new Parse(asArray);

        List<Word> parsedWords = parse.words();
        System.out.println(parsedWords);

        /* Prints out:
        [金, が, なけれ, ば, くよくよ, します, 女に, 振られりゃなき, まする,
        , 腹, が, 減ったら, お, まんま, 食べ, て, 命, 尽きれ, ば, あの世行き,
        , 有難, や, 有難, や,
        ]
        */

        /* Note: I have found that, depending on the MeCab dictionary/model, POS-tagging of tokens may vary.
        ie: when tokenizing using net.java.sen:
        なけれ is labelled as a DOUSHI-JITATSU-*-*.
        However, when tokenizing using org.atilika.kuromoji:
        なけれ is labelled as a KEIYOUSHI-JITATSU-*-*.
        */
    }
}
17 changes: 17 additions & 0 deletions java/ve.iml
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- IntelliJ IDEA module descriptor for the java/ module. The isMavenModule attribute suggests it is
     generated by the IDE's Maven integration from java/pom.xml; presumably not meant to be edited by
     hand (confirm). -->
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<!-- Library entries mirror the dependencies declared in the POM (hamcrest-core is not declared there;
     presumably pulled in transitively by JUnit). -->
<orderEntry type="library" name="Maven: org.atilika.kuromoji:kuromoji:0.7.7" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.12" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.3" level="project" />
</component>
</module>

0 comments on commit 6ad88bd

Please sign in to comment.