Skip to content

Commit

Permalink
Expose the token cleaner regex.
Browse files Browse the repository at this point in the history
While the default is still fixed to alphabetic characters only, the regex is now exposed, so it's possible to tweak it to fit other valid token structures, for example allowing numbers in a token.
  • Loading branch information
josephwilk committed Dec 17, 2017
1 parent 8c67223 commit 97ef9d1
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 1 deletion.
2 changes: 2 additions & 0 deletions build.boot
Expand Up @@ -25,13 +25,15 @@

;; Boot task taking no options: composes, in order, the aot step (restricted to
;; the kixi.mallet.pipes namespace), javac, pom, jar and install.
;; NOTE(review): presumably `install` targets the local Maven repository — confirm.
(deftask release-locally []
(comp (aot :namespace #{'kixi.mallet.pipes})
(javac)
(pom)
(jar)
(install)))

;; Boot task taking no options: composes, in order, the aot step (restricted to
;; the kixi.mallet.pipes namespace), javac, pom, jar and push.
;; NOTE(review): presumably `push` deploys the jar to a remote repository — confirm.
(deftask release
[]
(comp (aot :namespace #{'kixi.mallet.pipes})
(javac)
(pom)
(jar)
(push)))
76 changes: 76 additions & 0 deletions src/kixi/mallet/TokenSequenceCleaner.java
@@ -0,0 +1,76 @@
package kixi.mallet;

import java.io.ObjectOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.regex.Pattern;

import cc.mallet.types.FeatureSequenceWithBigrams;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.pipe.Pipe;

/* Copyright (C) 2005 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */

/* Remove tokens that do not match passed in Regex */
public class TokenSequenceCleaner extends cc.mallet.pipe.Pipe {
boolean markDeletions = false;
Pattern acceptPattern;

public TokenSequenceCleaner (boolean markDeletions, Pattern regex)
{
this.markDeletions = markDeletions;
this.acceptPattern = regex;
}

public TokenSequenceCleaner (Pattern regex)
{
this (false, regex);
}

public TokenSequenceCleaner ()
{
this (false, CharSequenceLexer.LEX_ALPHA);
}

public Instance pipe (Instance carrier)
{
TokenSequence ts = (TokenSequence) carrier.getData();
TokenSequence ret = new TokenSequence ();
Token prevToken = null;
for (int i = 0; i < ts.size(); i++) {
Token t = ts.get(i);
String s = t.getText();
if (this.acceptPattern.matcher(s).matches()) {
ret.add (t);
prevToken = t;
} else if (markDeletions && prevToken != null)
prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText());
}
carrier.setData(ret);
return carrier;
}

// Serialization

private static final long serialVersionUID = 1;
private static final int CURRENT_SERIAL_VERSION = 0;

private void writeObject (ObjectOutputStream out) throws IOException {
out.writeInt (CURRENT_SERIAL_VERSION);
out.writeBoolean(markDeletions);
}

private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
int version = in.readInt ();
markDeletions = in.readBoolean();
}

}
5 changes: 4 additions & 1 deletion src/kixi/mallet/script.clj
Expand Up @@ -21,6 +21,8 @@
TokenSequenceNGrams TokenSequence2FeatureSequence
FeatureSequence2AugmentableFeatureVector]
[cc.mallet.pipe.iterator FileIterator]

[kixi.mallet TokenSequenceCleaner]
[java.io File FileOutputStream ObjectOutputStream FileFilter]
[java.nio.charset Charset]))

Expand All @@ -45,6 +47,7 @@
See kixi.mallet.boot for options"
[opts]
(let [default-token-regex (re-pattern "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
token-cleaner-regex (re-pattern "\\p{Alpha}+")
directory (:input opts)
instantiate (fn [sym]
(clojure.lang.Reflector/invokeConstructor (resolve sym) (into-array [])))
Expand All @@ -53,7 +56,7 @@
(when-let [string-pipes (:string-pipes opts)]
(map instantiate string-pipes))
[(CharSequence2TokenSequence. default-token-regex)
(TokenSequenceRemoveNonAlpha. true)
(TokenSequenceCleaner. true token-cleaner-regex)
(when (:remove-stopwords opts)
(if-let [file (:extra-stopwords opts)]
(TokenSequenceRemoveStopwords. file "UTF-8" true false true)
Expand Down

0 comments on commit 97ef9d1

Please sign in to comment.