From 97ef9d171c9cca6820b3448b45a929bdc1cf933d Mon Sep 17 00:00:00 2001 From: Joseph Wilk Date: Sun, 17 Dec 2017 16:00:59 +0000 Subject: [PATCH] Expose the token cleaner regex. While this is still fixed as only alphas the regex is now exposed and its possible to tweak this to fit other valid token structures. For example allowing numbers in a token. --- build.boot | 2 + src/kixi/mallet/TokenSequenceCleaner.java | 76 +++++++++++++++++++++++ src/kixi/mallet/script.clj | 5 +- 3 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 src/kixi/mallet/TokenSequenceCleaner.java diff --git a/build.boot b/build.boot index 9235f03..a959e0e 100644 --- a/build.boot +++ b/build.boot @@ -25,6 +25,7 @@ (deftask release-locally [] (comp (aot :namespace #{'kixi.mallet.pipes}) + (javac) (pom) (jar) (install))) @@ -32,6 +33,7 @@ (deftask release [] (comp (aot :namespace #{'kixi.mallet.pipes}) + (javac) (pom) (jar) (push))) diff --git a/src/kixi/mallet/TokenSequenceCleaner.java b/src/kixi/mallet/TokenSequenceCleaner.java new file mode 100644 index 0000000..e57e065 --- /dev/null +++ b/src/kixi/mallet/TokenSequenceCleaner.java @@ -0,0 +1,76 @@ +package kixi.mallet; + +import java.io.ObjectOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.util.regex.Pattern; + +import cc.mallet.types.FeatureSequenceWithBigrams; +import cc.mallet.types.Instance; +import cc.mallet.types.Token; +import cc.mallet.types.TokenSequence; +import cc.mallet.util.CharSequenceLexer; +import cc.mallet.pipe.Pipe; + +/* Copyright (C) 2005 Univ. of Massachusetts Amherst, Computer Science Dept. +This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). +http://www.cs.umass.edu/~mccallum/mallet +This software is provided under the terms of the Common Public License, +version 1.0, as published by http://www.opensource.org. For further +information, see the file `LICENSE' included with this distribution. 
*/
+
+/**
+ * Pipe that filters a {@link TokenSequence}, keeping only the tokens whose
+ * entire text matches a caller-supplied regular expression.
+ *
+ * <p>When {@code markDeletions} is enabled, the text of a rejected token is
+ * stored on the most recently accepted token under the
+ * {@link FeatureSequenceWithBigrams#deletionMark} property. Tokens rejected
+ * before any token has been accepted are dropped without leaving a mark.
+ */
+public class TokenSequenceCleaner extends cc.mallet.pipe.Pipe {
+    /** Whether rejected tokens are recorded on the preceding accepted token. */
+    boolean markDeletions = false;
+    /** Pattern that a token's whole text must match for the token to be kept. */
+    Pattern acceptPattern;
+
+    /**
+     * Creates a cleaner with an explicit acceptance pattern.
+     *
+     * @param markDeletions if true, record each rejected token's text on the
+     *                      previously accepted token
+     * @param regex         pattern a token's full text must match to be kept
+     * @throws NullPointerException if {@code regex} is null
+     */
+    public TokenSequenceCleaner (boolean markDeletions, Pattern regex)
+    {
+        // Fail here rather than with an anonymous NPE on the first pipe() call.
+        if (regex == null) {
+            throw new NullPointerException ("regex must not be null");
+        }
+        this.markDeletions = markDeletions;
+        this.acceptPattern = regex;
+    }
+
+    /**
+     * Creates a cleaner that does not mark deletions.
+     *
+     * @param regex pattern a token's full text must match to be kept
+     * @throws NullPointerException if {@code regex} is null
+     */
+    public TokenSequenceCleaner (Pattern regex)
+    {
+        this (false, regex);
+    }
+
+    /**
+     * Creates a cleaner that accepts tokens matching
+     * {@link CharSequenceLexer#LEX_ALPHA} and does not mark deletions.
+     */
+    public TokenSequenceCleaner ()
+    {
+        this (false, CharSequenceLexer.LEX_ALPHA);
+    }
+
+    /**
+     * Replaces the carrier's data with a new {@link TokenSequence} holding only
+     * the accepted tokens, in their original order.
+     *
+     * @param carrier instance whose data is cast to a {@link TokenSequence}
+     * @return the same {@code carrier}, with its data replaced
+     */
+    @Override
+    public Instance pipe (Instance carrier)
+    {
+        TokenSequence ts = (TokenSequence) carrier.getData();
+        TokenSequence ret = new TokenSequence ();
+        Token prevToken = null;
+        for (int i = 0; i < ts.size(); i++) {
+            Token t = ts.get(i);
+            String s = t.getText();
+            if (this.acceptPattern.matcher(s).matches()) {
+                ret.add (t);
+                prevToken = t;
+            } else if (markDeletions && prevToken != null) {
+                prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, s);
+            }
+        }
+        carrier.setData(ret);
+        return carrier;
+    }
+
+    // Serialization
+
+    private static final long serialVersionUID = 1;
+    // Stream version 0 stored only markDeletions; version 1 appends acceptPattern.
+    private static final int CURRENT_SERIAL_VERSION = 1;
+
+    /** Writes the stream version, the deletion flag and the acceptance pattern. */
+    private void writeObject (ObjectOutputStream out) throws IOException {
+        out.writeInt (CURRENT_SERIAL_VERSION);
+        out.writeBoolean(markDeletions);
+        // Without this the pattern is lost and a reloaded pipe fails in pipe().
+        out.writeObject(acceptPattern);
+    }
+
+    /** Restores the state written by {@code writeObject}, accepting version-0 streams too. */
+    private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
+        int version = in.readInt ();
+        markDeletions = in.readBoolean();
+        if (version >= 1) {
+            acceptPattern = (Pattern) in.readObject();
+        } else {
+            // Version-0 streams carry no pattern; use the no-arg constructor's default.
+            acceptPattern = CharSequenceLexer.LEX_ALPHA;
+        }
+    }
+
+}
diff --git a/src/kixi/mallet/script.clj b/src/kixi/mallet/script.clj index 2bcc307..998e72d 100644 --- a/src/kixi/mallet/script.clj +++ b/src/kixi/mallet/script.clj @@ -21,6 +21,8 @@ TokenSequenceNGrams TokenSequence2FeatureSequence FeatureSequence2AugmentableFeatureVector] [cc.mallet.pipe.iterator FileIterator] + + [kixi.mallet TokenSequenceCleaner] [java.io File FileOutputStream ObjectOutputStream FileFilter] [java.nio.charset Charset])) @@ -45,6 +47,7 @@ See kixi.mallet.boot for options" [opts] (let
[default-token-regex (re-pattern "\\p{L}[\\p{L}\\p{P}]+\\p{L}") + token-cleaner-regex (re-pattern "\\p{Alpha}+") directory (:input opts) instantiate (fn [sym] (clojure.lang.Reflector/invokeConstructor (resolve sym) (into-array []))) @@ -53,7 +56,7 @@ (when-let [string-pipes (:string-pipes opts)] (map instantiate string-pipes)) [(CharSequence2TokenSequence. default-token-regex) - (TokenSequenceRemoveNonAlpha. true) + (TokenSequenceCleaner. true token-cleaner-regex) (when (:remove-stopwords opts) (if-let [file (:extra-stopwords opts)] (TokenSequenceRemoveStopwords. file "UTF-8" true false true)