Expose the token cleaner regex.

While the default still accepts only alphabetic tokens, the regex is now exposed, so it's possible to tweak it to fit other valid token structures, for example allowing numbers in a token.
MastodonC · Dec 17, 2017 · 97ef9d1 · 97ef9d1
1 parent 8c67223
commit 97ef9d1
Show file tree

Hide file tree

Showing 3 changed files with 82 additions and 1 deletion.
diff --git a/build.boot b/build.boot
@@ -25,13 +25,15 @@
 
 ;; Local release pipeline: composes aot (for the kixi.mallet.pipes namespace),
 ;; javac, pom, jar and install, in that order.
 ;; NOTE(review): install presumably targets the local Maven repository -- confirm.
 (deftask release-locally []
   (comp (aot :namespace #{'kixi.mallet.pipes})
+        (javac)
         (pom)
         (jar)
         (install)))
 
 ;; Release pipeline: composes aot (for the kixi.mallet.pipes namespace),
 ;; javac, pom, jar and push, in that order.
 ;; NOTE(review): push presumably deploys the jar to a remote repository -- confirm.
 (deftask release
   []
   (comp (aot :namespace #{'kixi.mallet.pipes})
+        (javac)
         (pom)
         (jar)
         (push)))
diff --git a/src/kixi/mallet/TokenSequenceCleaner.java b/src/kixi/mallet/TokenSequenceCleaner.java
@@ -0,0 +1,76 @@
+package kixi.mallet;
+
+import java.io.ObjectOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.util.regex.Pattern;
+
+import cc.mallet.types.FeatureSequenceWithBigrams;
+import cc.mallet.types.Instance;
+import cc.mallet.types.Token;
+import cc.mallet.types.TokenSequence;
+import cc.mallet.util.CharSequenceLexer;
+import cc.mallet.pipe.Pipe;
+
+/* Copyright (C) 2005 Univ. of Massachusetts Amherst, Computer Science Dept.
+This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
+http://www.cs.umass.edu/~mccallum/mallet
+This software is provided under the terms of the Common Public License,
+version 1.0, as published by http://www.opensource.org.  For further
+information, see the file `LICENSE' included with this distribution. */
+
+/**
+ * Pipe that keeps only the tokens whose text fully matches a regular
+ * expression and drops all others.
+ *
+ * <p>The instance data is cast to a {@link TokenSequence} and replaced by a
+ * new sequence holding the accepted tokens in their original order. When
+ * {@code markDeletions} is set, the text of each rejected token is stored
+ * under {@link FeatureSequenceWithBigrams#deletionMark} on the most recently
+ * accepted token, if there is one.
+ */
+public class TokenSequenceCleaner extends cc.mallet.pipe.Pipe {
+  boolean markDeletions = false;
+  Pattern acceptPattern;
+
+  /**
+   * @param markDeletions whether rejected tokens are recorded on the preceding
+   *                      accepted token
+   * @param regex         pattern a token's whole text must match to be kept
+   * @throws NullPointerException if {@code regex} is null
+   */
+  public TokenSequenceCleaner (boolean markDeletions, Pattern regex)
+  {
+    // Fail at construction rather than on the first call to pipe().
+    if (regex == null) {
+      throw new NullPointerException ("regex must not be null");
+    }
+    this.markDeletions = markDeletions;
+    this.acceptPattern = regex;
+  }
+
+  /**
+   * Creates a cleaner that does not mark deletions.
+   *
+   * @param regex pattern a token's whole text must match to be kept
+   */
+  public TokenSequenceCleaner (Pattern regex)
+  {
+    this (false, regex);
+  }
+
+  /**
+   * Creates a cleaner that does not mark deletions and accepts tokens
+   * matching {@link CharSequenceLexer#LEX_ALPHA}.
+   */
+  public TokenSequenceCleaner ()
+  {
+    this (false, CharSequenceLexer.LEX_ALPHA);
+  }
+
+  /**
+   * Replaces the carrier's token sequence with the subset of tokens accepted
+   * by the pattern.
+   *
+   * @param carrier instance whose data is a {@link TokenSequence}
+   * @return the same instance, with its data set to the filtered sequence
+   */
+  @Override
+  public Instance pipe (Instance carrier)
+  {
+    TokenSequence ts = (TokenSequence) carrier.getData();
+    TokenSequence ret = new TokenSequence ();
+    Token prevToken = null;
+    for (int i = 0; i < ts.size(); i++) {
+      Token t = ts.get(i);
+      String s = t.getText();
+      if (acceptPattern.matcher(s).matches()) {
+        ret.add (t);
+        prevToken = t;
+      } else if (markDeletions && prevToken != null) {
+        // A rejected token before any accepted one has nothing to attach to
+        // and is dropped without a mark.
+        prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, s);
+      }
+    }
+    carrier.setData(ret);
+    return carrier;
+  }
+
+  // Serialization
+
+  private static final long serialVersionUID = 1;
+  // Stream version 0 wrote only markDeletions; version 1 appends acceptPattern.
+  private static final int CURRENT_SERIAL_VERSION = 1;
+
+  /** Writes the stream version, the deletion flag and the accept pattern. */
+  private void writeObject (ObjectOutputStream out) throws IOException {
+    out.writeInt (CURRENT_SERIAL_VERSION);
+    out.writeBoolean(markDeletions);
+    // Without this the pattern is lost and a deserialized pipe fails in pipe().
+    out.writeObject (acceptPattern);
+  }
+
+  /**
+   * Restores the state written by {@code writeObject}. Version-0 streams carry
+   * no pattern, so they fall back to the no-arg constructor's default.
+   */
+  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
+    int version = in.readInt ();
+    markDeletions = in.readBoolean();
+    if (version >= 1) {
+      acceptPattern = (Pattern) in.readObject ();
+    } else {
+      acceptPattern = CharSequenceLexer.LEX_ALPHA;
+    }
+  }
+
+}
diff --git a/src/kixi/mallet/script.clj b/src/kixi/mallet/script.clj
@@ -21,6 +21,8 @@
             TokenSequenceNGrams TokenSequence2FeatureSequence
             FeatureSequence2AugmentableFeatureVector]
            [cc.mallet.pipe.iterator FileIterator]
+
+           [kixi.mallet TokenSequenceCleaner]
            [java.io File FileOutputStream ObjectOutputStream FileFilter]
            [java.nio.charset Charset]))
 
@@ -45,6 +47,7 @@
   See kixi.mallet.boot for options"
   [opts]
   (let [default-token-regex (re-pattern "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
+        token-cleaner-regex (re-pattern "\\p{Alpha}+")
         directory (:input opts)
         instantiate (fn [sym]
                       (clojure.lang.Reflector/invokeConstructor (resolve sym) (into-array [])))
@@ -53,7 +56,7 @@
                       (when-let [string-pipes (:string-pipes opts)]
                         (map instantiate string-pipes))
                       [(CharSequence2TokenSequence. default-token-regex)
-                       (TokenSequenceRemoveNonAlpha. true)
+                       (TokenSequenceCleaner. true token-cleaner-regex)
                        (when (:remove-stopwords opts)
                          (if-let [file (:extra-stopwords opts)]
                            (TokenSequenceRemoveStopwords. file "UTF-8" true false true)