diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/pom.xml b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/pom.xml
deleted file mode 100644
index 6d1e4310d..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/pom.xml
+++ /dev/null
@@ -1,101 +0,0 @@
[deleted Maven POM; the XML markup was stripped during extraction. Surviving text: parent org.apache.mahout:mahout:0.7 via ../pom.xml; module mahout-buildtools "Mahout Build Tools", packaging jar; a setup-eclipse-workspace profile (eclipse.workspace.dir = ${basedir}/../workspace) binding maven-eclipse-plugin 2.8 (execution setup.eclipse.workspace, phase process-test-sources, goal configure-workspace, code style file:Eclipse-Lucene-Codestyle.xml) and maven-antrun-plugin (dependencies ant-nodeps 1.7.1 and ant-trax 1.7.1; execution setup.workspace, phase validate, goal run).]
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/findbugs-exclude.xml b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/findbugs-exclude.xml
deleted file mode 100644
index 9d17e0991..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/findbugs-exclude.xml
+++ /dev/null
@@ -1,54 +0,0 @@
[deleted FindBugs exclusion rules; the XML markup was stripped during extraction and no text content survived.]
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-checkstyle-suppressions.xml b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-checkstyle-suppressions.xml
deleted file mode 100644
index 41f8fe372..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-checkstyle-suppressions.xml
+++ /dev/null
@@ -1,34 +0,0 @@
[deleted Checkstyle suppressions; the XML markup was stripped during extraction and no text content survived.]
\ No newline at end of file
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-checkstyle.xml b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-checkstyle.xml
deleted file mode 100644
index 9b807972c..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-checkstyle.xml
+++ /dev/null
@@ -1,282 +0,0 @@
[deleted Checkstyle rule set; the XML markup was stripped during extraction and no text content survived.]
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-eclipse-checkstyle b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-eclipse-checkstyle
deleted file mode 100644
index 9c76a6444..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-eclipse-checkstyle
+++ /dev/null
@@ -1,27 +0,0 @@
[deleted Eclipse Checkstyle configuration; the XML markup was stripped during extraction and no text content survived.]
\ No newline at end of file
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-eclipse-pmd b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-eclipse-pmd
deleted file mode 100644
index f1bd7be03..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-eclipse-pmd
+++ /dev/null
@@ -1,23 +0,0 @@
[deleted Eclipse PMD configuration; the XML markup was stripped during extraction; only the text "true" survived.]
\ No newline at end of file
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-pmd-ruleset.xml b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-pmd-ruleset.xml
deleted file mode 100644
index d5a412d2f..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/src/main/resources/mahout-pmd-ruleset.xml
+++ /dev/null
@@ -1,189 +0,0 @@
[deleted PMD rule set titled "PMD Plugin preferences rule set"; the XML markup was stripped during extraction and the rule definitions did not survive.]
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/target/maven-archiver/pom.properties b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/target/maven-archiver/pom.properties
deleted file mode 100644
index 45966a97a..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/buildtools/target/maven-archiver/pom.properties
+++ /dev/null
@@ -1,5 +0,0 @@
-#Generated by Maven
-#Tue Jun 12 14:04:37 IST 2012
-version=0.7
-groupId=org.apache.mahout
-artifactId=mahout-buildtools
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/pom.xml b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/pom.xml
deleted file mode 100644
index 7baf17453..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/pom.xml
+++ /dev/null
@@ -1,244 +0,0 @@
[deleted Maven POM; the XML markup was stripped during extraction. Surviving text: parent org.apache.mahout:mahout:0.7 via ../pom.xml; module mahout-core "Mahout Core - Scalable machine learning libraries", packaging jar; maven-compiler-plugin (UTF-8, source/target 1.6), maven-antrun-plugin (run goal at compile), maven-jar-plugin (test-jar), maven-assembly-plugin ("job" execution at package using src/main/assembly/job.xml), javadoc, source and remote-resources plugins (apache-jar-resource-bundle:1.4, supplemental-models.xml); dependencies on mahout-math (plus its test-jar for tests), jackson-core-asl, jackson-mapper-asl, slf4j-api, slf4j-jcl (test), commons-lang, xstream, lucene-core, lucene-analyzers, org.apache.mahout.commons:commons-cli, commons-math, junit (test) and easymock (test); profiles hadoop-0.20 (hadoop-core, active when hadoop.version is unset) and hadoop-0.23 (hadoop-common, hadoop-mapreduce-client-common, hadoop-mapreduce-client-core).]
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/assembly/job.xml b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/assembly/job.xml
deleted file mode 100644
index ca50fed81..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/assembly/job.xml
+++ /dev/null
@@ -1,37 +0,0 @@
[deleted assembly descriptor; the XML markup was stripped during extraction. Surviving text: assembly id "job", format jar, includeBaseDirectory false; an unpacked runtime dependency set excluding org.apache.hadoop:hadoop-core; fileSets copying ${basedir}/target/classes into /, one involving *.jar and one involving driver.classes.default.props.]
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/Version.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/Version.java
deleted file mode 100644
index 5f3c879dd..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/Version.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout; - -import com.google.common.base.Charsets; -import com.google.common.io.Resources; - -import java.io.IOException; - -public final class Version { - - private Version() { - } - - public static String version() { - return Version.class.getPackage().getImplementationVersion(); - } - - public static String versionFromResource() throws IOException { - return Resources.toString(Resources.getResource("version"), Charsets.UTF_8); - } - - public static void main(String[] args) throws IOException { - System.out.println(version() + ' ' + versionFromResource()); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/FixedSizePriorityQueue.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/FixedSizePriorityQueue.java deleted file mode 100644 index ff88270ec..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/FixedSizePriorityQueue.java +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-package org.apache.mahout.cf.taste.common;
-
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Lists;
-
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-import java.util.PriorityQueue;
-import java.util.Queue;
-
-/**
- * base class for queues holding the top or min k elements of all elements they have been offered
- */
-abstract class FixedSizePriorityQueue<T> {
-
-  private final int k;
-  private final Comparator<? super T> queueingComparator;
-  private final Comparator<? super T> sortingComparator;
-  private final Queue<T> queue;
-
-  FixedSizePriorityQueue(int k, Comparator<? super T> comparator) {
-    Preconditions.checkArgument(k > 0);
-    this.k = k;
-    Preconditions.checkNotNull(comparator);
-    this.queueingComparator = queueingComparator(comparator);
-    this.sortingComparator = sortingComparator(comparator);
-    this.queue = new PriorityQueue<T>(k + 1, queueingComparator);
-  }
-
-  abstract Comparator<? super T> queueingComparator(Comparator<? super T> stdComparator);
-  abstract Comparator<? super T> sortingComparator(Comparator<? super T> stdComparator);
-
-  public void offer(T item) {
-    if (queue.size() < k) {
-      queue.add(item);
-    } else if (queueingComparator.compare(item, queue.peek()) > 0) {
-      queue.add(item);
-      queue.poll();
-    }
-  }
-
-  public boolean isEmpty() {
-    return queue.isEmpty();
-  }
-
-  public int size() {
-    return queue.size();
-  }
-
-  public List<T> retrieve() {
-    List<T> topItems = Lists.newArrayList(queue);
-    Collections.sort(topItems, sortingComparator);
-    return topItems;
-  }
-
-  protected T peek() {
-    return queue.peek();
-  }
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/MinK.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/MinK.java
deleted file mode 100644
index f39d02dc5..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/MinK.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.common;
-
-import java.util.Collections;
-import java.util.Comparator;
-
-/**
- * this class will preserve the k minimum elements of all elements it has been offered
- */
-public class MinK<T> extends FixedSizePriorityQueue<T> {
-
-  public MinK(int k, Comparator<? super T> comparator) {
-    super(k, comparator);
-  }
-
-  @Override
-  protected Comparator<? super T> queueingComparator(Comparator<? super T> stdComparator) {
-    return Collections.reverseOrder(stdComparator);
-  }
-
-  @Override
-  protected Comparator<? super T> sortingComparator(Comparator<? super T> stdComparator) {
-    return stdComparator;
-  }
-
-  public T greatestSmall() {
-    return peek();
-  }
-}
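A minimal usage sketch of the MinK class removed above (TopK, further down in this diff, mirrors it for maxima). It assumes the deleted Mahout 0.7 classes are on the classpath; the MinKDemo class name and sample values are illustrative only:

```java
import java.util.Comparator;
import org.apache.mahout.cf.taste.common.MinK;

public class MinKDemo {
  public static void main(String[] args) {
    // Keep only the 3 smallest of all values offered.
    MinK<Integer> minK = new MinK<Integer>(3, new Comparator<Integer>() {
      @Override
      public int compare(Integer a, Integer b) {
        return a.compareTo(b);
      }
    });
    for (int n : new int[] {5, 1, 9, 3, 7, 2}) {
      minK.offer(n);
    }
    // retrieve() sorts the retained elements with the original comparator.
    System.out.println(minK.retrieve());      // [1, 2, 3]
    // greatestSmall() peeks at the largest element still retained.
    System.out.println(minK.greatestSmall()); // 3
  }
}
```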
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java
deleted file mode 100644
index f10ab5e04..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.common;
-
-public final class NoSuchItemException extends TasteException {
-
-  public NoSuchItemException() { }
-
-  public NoSuchItemException(long itemID) {
-    this(String.valueOf(itemID));
-  }
-
-  public NoSuchItemException(String message) {
-    super(message);
-  }
-
-}
\ No newline at end of file
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java
deleted file mode 100644
index 8118bc817..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.common;
-
-public final class NoSuchUserException extends TasteException {
-
-  public NoSuchUserException() { }
-
-  public NoSuchUserException(long userID) {
-    this(String.valueOf(userID));
-  }
-
-  public NoSuchUserException(String message) {
-    super(message);
-  }
-
-}
\ No newline at end of file
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java
deleted file mode 100644
index adc627cfd..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.common;
-
-import java.util.Collection;
-
-/**
- * <p>
- * Implementations of this interface have state that can be periodically refreshed. For example, an
- * implementation instance might contain some pre-computed information that should be periodically refreshed.
- * The {@link #refresh(Collection)} method triggers such a refresh.
- * </p>
- *
- * <p>
- * All Taste components implement this. In particular,
- * {@link org.apache.mahout.cf.taste.recommender.Recommender}s do. Callers may want to call
- * {@link #refresh(Collection)} periodically to re-compute information throughout the system and bring it up
- * to date, though this operation may be expensive.
- * </p>
- */
-public interface Refreshable {
-
-  /**
-   * <p>
-   * Triggers "refresh" -- whatever that means -- of the implementation. The general contract is that any
-   * {@link Refreshable} should always leave itself in a consistent, operational state, and that the refresh
-   * atomically updates internal state from old to new.
-   * </p>
-   *
-   * @param alreadyRefreshed
-   *          {@link Refreshable}s that are known to have already been
-   *          refreshed as a result of an initial call to a {@link #refresh(Collection)} method on some
-   *          object. This ensures that objects in a refresh dependency graph aren't refreshed twice
-   *          needlessly.
-   */
-  void refresh(Collection<Refreshable> alreadyRefreshed);
-
-}
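A sketch of how a component might honor the Refreshable contract just shown. CachedStatistics is a hypothetical class, not part of Mahout; the recomputeMean stub stands in for whatever expensive computation a real component would cache:

```java
import java.util.Collection;
import org.apache.mahout.cf.taste.common.Refreshable;

// Hypothetical component that caches an expensive computation and
// recomputes it when refresh() is invoked.
public class CachedStatistics implements Refreshable {

  private volatile double cachedMean;

  public double getMean() {
    return cachedMean;
  }

  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    // Skip work if this object was already refreshed in the current
    // dependency-graph pass (a simplification of Mahout's RefreshHelper).
    if (alreadyRefreshed != null && alreadyRefreshed.contains(this)) {
      return;
    }
    cachedMean = recomputeMean();
  }

  private double recomputeMean() {
    return 0.0; // placeholder for the real computation
  }
}
```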

diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java
deleted file mode 100644
index 1792eff28..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.common;
-
-/**
- * <p>
- * An exception thrown when an error occurs inside the Taste engine.
- * </p>
- */
-public class TasteException extends Exception {
-
-  public TasteException() { }
-
-  public TasteException(String message) {
-    super(message);
-  }
-
-  public TasteException(Throwable cause) {
-    super(cause);
-  }
-
-  public TasteException(String message, Throwable cause) {
-    super(message, cause);
-  }
-
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/TopK.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/TopK.java
deleted file mode 100644
index 44d70d641..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/TopK.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.common;
-
-import java.util.Collections;
-import java.util.Comparator;
-
-/**
- * this class will preserve the k maximum elements of all elements it has been offered
- */
-public class TopK<T> extends FixedSizePriorityQueue<T> {
-
-  public TopK(int k, Comparator<? super T> comparator) {
-    super(k, comparator);
-  }
-
-  @Override
-  protected Comparator<? super T> queueingComparator(Comparator<? super T> stdComparator) {
-    return stdComparator;
-  }
-
-  @Override
-  protected Comparator<? super T> sortingComparator(Comparator<? super T> stdComparator) {
-    return Collections.reverseOrder(stdComparator);
-  }
-
-  public T smallestGreat() {
-    return peek();
-  }
-}
\ No newline at end of file
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java
deleted file mode 100644
index 4e396176a..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.common;
-
-/**
- * <p>
- * A simple enum which gives symbolic names to the ideas of "weighted" and "unweighted", to make various API
- * calls which take a weighting parameter more readable.
- * </p>
- */
-public enum Weighting {
-
-  WEIGHTED,
-  UNWEIGHTED
-
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java
deleted file mode 100644
index 62b38f76a..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.eval;
-
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-
-/**

- * Implementations of this inner interface are simple helper classes which create a {@link DataModel} to be - * used while evaluating a {@link org.apache.mahout.cf.taste.recommender.Recommender}. - * - * @see RecommenderBuilder - * @see RecommenderEvaluator - */ -public interface DataModelBuilder { - - /** - *

- * Builds a {@link DataModel} implementation to be used in an evaluation, given training data. - *

-   *
-   * @param trainingData
-   *          data to be used in the {@link DataModel}
-   * @return {@link DataModel} based upon the given data
-   */
-  DataModel buildDataModel(FastByIDMap<PreferenceArray> trainingData);
-
-}
\ No newline at end of file
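The interface just removed is typically implemented as a one-liner around GenericDataModel, whose public constructor in Mahout 0.7 accepts exactly this FastByIDMap of training preferences. A minimal sketch; the InMemoryDataModelBuilder name is illustrative:

```java
import org.apache.mahout.cf.taste.eval.DataModelBuilder;
import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.model.PreferenceArray;

// Builds an in-memory DataModel from the training preferences that
// the evaluator hands over for each evaluation run.
public class InMemoryDataModelBuilder implements DataModelBuilder {
  @Override
  public DataModel buildDataModel(FastByIDMap<PreferenceArray> trainingData) {
    return new GenericDataModel(trainingData);
  }
}
```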

diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java
deleted file mode 100644
index 9c442fff8..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.eval;
-
-/**
- * <p>
- * Implementations encapsulate information retrieval-related statistics about a
- * {@link org.apache.mahout.cf.taste.recommender.Recommender}'s recommendations.
- * </p>

- * - *

- * See Information retrieval. - *

- */ -public interface IRStatistics { - - /** - *

- * See Precision. - *

- */ - double getPrecision(); - - /** - *

- * See Recall. - *

- */ - double getRecall(); - - /** - *

- * See Fall-Out. - *

- */ - double getFallOut(); - - /** - *

- * See F-measure. - *

- */ - double getF1Measure(); - - /** - *

- * See F-measure. - *

- */ - double getFNMeasure(double n); - - /** - *

- * See - * Normalized Discounted Cumulative Gain. - *

- */ - double getNormalizedDiscountedCumulativeGain(); - - /** - * @return the fraction of all users for whom recommendations could be produced - */ - double getReach(); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java deleted file mode 100644 index 1805092d6..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.eval; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.recommender.Recommender; - -/** - *

- * Implementations of this inner interface are simple helper classes which create a {@link Recommender} to be - * evaluated based on the given {@link DataModel}. - *

- */ -public interface RecommenderBuilder { - - /** - *

- * Builds a {@link Recommender} implementation to be evaluated, using the given {@link DataModel}. - *

- * - * @param dataModel - * {@link DataModel} to build the {@link Recommender} on - * @return {@link Recommender} based upon the given {@link DataModel} - * @throws TasteException - * if an error occurs while accessing the {@link DataModel} - */ - Recommender buildRecommender(DataModel dataModel) throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java deleted file mode 100644 index bda37656c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java +++ /dev/null @@ -1,105 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.eval; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.model.DataModel; - -/** - *

- * Implementations of this interface evaluate the quality of a - * {@link org.apache.mahout.cf.taste.recommender.Recommender}'s recommendations. - *

- */ -public interface RecommenderEvaluator { - - /** - *

- * Evaluates the quality of a {@link org.apache.mahout.cf.taste.recommender.Recommender}'s recommendations. - * The range of values that may be returned depends on the implementation, but lower values must - * mean better recommendations, with 0 being the lowest / best possible evaluation, meaning a perfect match. - * This method does not accept a {@link org.apache.mahout.cf.taste.recommender.Recommender} directly, but - * rather a {@link RecommenderBuilder} which can build the - * {@link org.apache.mahout.cf.taste.recommender.Recommender} to test on top of a given {@link DataModel}. - *

- * - *

-   * Implementations will take a certain percentage of the preferences supplied by the given {@link DataModel}
-   * as "training data". This is typically most of the data, like 90%. This data is used to produce
-   * recommendations, and the rest of the data is compared against estimated preference values to see how much
-   * the {@link org.apache.mahout.cf.taste.recommender.Recommender}'s predicted preferences match the user's
-   * real preferences. Specifically, for each user, this percentage of the user's ratings is used to produce
-   * recommendations, and for each user, the remaining preferences are compared against the user's real
-   * preferences.
-   *

- * - *

- * For large datasets, it may be desirable to only evaluate based on a small percentage of the data. - * {@code evaluationPercentage} controls how many of the {@link DataModel}'s users are used in - * evaluation. - *

- * - *

- * To be clear, {@code trainingPercentage} and {@code evaluationPercentage} are not related. They - * do not need to add up to 1.0, for example. - *

-   * @param recommenderBuilder
-   *          object that can build a {@link org.apache.mahout.cf.taste.recommender.Recommender} to test
-   * @param dataModelBuilder
-   *          {@link DataModelBuilder} to use, or if null, a default {@link DataModel}
-   *          implementation will be used
-   * @param dataModel
-   *          dataset to test on
-   * @param trainingPercentage
-   *          percentage of each user's preferences to use to produce recommendations; the rest are compared
-   *          to estimated preference values to evaluate
-   *          {@link org.apache.mahout.cf.taste.recommender.Recommender} performance
-   * @param evaluationPercentage
-   *          percentage of users to use in evaluation
-   * @return a "score" representing how well the {@link org.apache.mahout.cf.taste.recommender.Recommender}'s
-   *         estimated preferences match real values; lower scores mean a better match and 0 is a
-   *         perfect match
-   * @throws TasteException
-   *           if an error occurs while accessing the {@link DataModel}
-   */
-  double evaluate(RecommenderBuilder recommenderBuilder,
-                  DataModelBuilder dataModelBuilder,
-                  DataModel dataModel,
-                  double trainingPercentage,
-                  double evaluationPercentage) throws TasteException;
-
-  /**
-   * @deprecated see {@link DataModel#getMaxPreference()}
-   */
-  @Deprecated
-  float getMaxPreference();
-
-  @Deprecated
-  void setMaxPreference(float maxPreference);
-
-  /**
-   * @deprecated see {@link DataModel#getMinPreference()}
-   */
-  @Deprecated
-  float getMinPreference();
-
-  @Deprecated
-  void setMinPreference(float minPreference);
-
-}
\ No newline at end of file
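A usage sketch tying the evaluator contract above to concrete Mahout 0.7 classes. The "ratings.csv" file name is hypothetical, and SlopeOneRecommender is just one convenient recommender that exists in 0.7; any RecommenderBuilder works. Passing null for the DataModelBuilder uses the default in-memory model, as the javadoc above allows:

```java
import java.io.File;
import java.io.IOException;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
import org.apache.mahout.cf.taste.eval.RecommenderEvaluator;
import org.apache.mahout.cf.taste.impl.eval.AverageAbsoluteDifferenceRecommenderEvaluator;
import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import org.apache.mahout.cf.taste.impl.recommender.slopeone.SlopeOneRecommender;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.recommender.Recommender;

public class EvaluatorDemo {
  public static void main(String[] args) throws IOException, TasteException {
    DataModel model = new FileDataModel(new File("ratings.csv"));
    RecommenderBuilder builder = new RecommenderBuilder() {
      @Override
      public Recommender buildRecommender(DataModel dataModel) throws TasteException {
        return new SlopeOneRecommender(dataModel);
      }
    };
    RecommenderEvaluator evaluator = new AverageAbsoluteDifferenceRecommenderEvaluator();
    // Train on 90% of each user's preferences, evaluate on 10% of the users.
    double score = evaluator.evaluate(builder, null, model, 0.9, 0.1);
    System.out.println("Average absolute difference: " + score);
  }
}
```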

diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java
deleted file mode 100644
index a7345aabf..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.eval;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.IDRescorer;
-
-/**
- * <p>
- * Implementations collect information retrieval-related statistics on a
- * {@link org.apache.mahout.cf.taste.recommender.Recommender}'s performance, including precision, recall and
- * f-measure.
- * </p>
- *
- * <p>
- * See Information retrieval.
- */
-public interface RecommenderIRStatsEvaluator {
-
-  /**
-   * @param recommenderBuilder
-   *          object that can build a {@link org.apache.mahout.cf.taste.recommender.Recommender} to test
-   * @param dataModelBuilder
-   *          {@link DataModelBuilder} to use, or if null, a default {@link DataModel} implementation will be
-   *          used
-   * @param dataModel
-   *          dataset to test on
-   * @param rescorer
-   *          if any, to use when computing recommendations
-   * @param at
-   *          as in, "precision at 5". The number of recommendations to consider when evaluating precision,
-   *          etc.
-   * @param relevanceThreshold
-   *          items whose preference value is at least this value are considered "relevant" for the purposes
-   *          of computations
-   * @return {@link IRStatistics} with resulting precision, recall, etc.
-   * @throws TasteException
-   *           if an error occurs while accessing the {@link DataModel}
-   */
-  IRStatistics evaluate(RecommenderBuilder recommenderBuilder,
-                        DataModelBuilder dataModelBuilder,
-                        DataModel dataModel,
-                        IDRescorer rescorer,
-                        int at,
-                        double relevanceThreshold,
-                        double evaluationPercentage) throws TasteException;
-
-}
\ No newline at end of file
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java
deleted file mode 100644
index b27d1adb1..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.eval;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-
-/**
- * Implementations of this interface determine the items that are considered relevant,
- * and split data into training and test subsets, for purposes of precision/recall
- * tests as implemented by implementations of {@link RecommenderIRStatsEvaluator}.
- */
-public interface RelevantItemsDataSplitter {
-
-  /**
-   * During testing, relevant items are removed from a particular user's preferences,
-   * and a model is built using this user's other preferences and all other users.
-   *
-   * @param at Maximum number of items to be removed
-   * @param relevanceThreshold Minimum strength of preference for an item to be considered
-   *                           relevant
-   * @return IDs of relevant items
-   */
-  FastIDSet getRelevantItemsIDs(long userID,
-                                int at,
-                                double relevanceThreshold,
-                                DataModel dataModel) throws TasteException;
-
-  /**
-   * Adds a single user and all their preferences to the training model.
-   *
-   * @param userID ID of user whose preferences we are trying to predict
-   * @param relevantItemIDs IDs of items considered relevant to that user
-   * @param trainingUsers the database of training preferences to which we will
-   *                      append the ones for otherUserID.
-   * @param otherUserID for whom we are adding preferences to the training model
-   */
-  void processOtherUser(long userID,
-                        FastIDSet relevantItemIDs,
-                        FastByIDMap<PreferenceArray> trainingUsers,
-                        long otherUserID,
-                        DataModel dataModel) throws TasteException;
-
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityCountWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityCountWritable.java
deleted file mode 100644
index 4d58851e4..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityCountWritable.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.hadoop;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import org.apache.mahout.math.VarLongWritable;
-import org.apache.mahout.math.Varint;
-
-/** A {@link org.apache.hadoop.io.Writable} encapsulating an item ID and a count.
*/ -public final class EntityCountWritable extends VarLongWritable { - - private int count; - - public EntityCountWritable() { - // do nothing - } - - public EntityCountWritable(long itemID, int count) { - super(itemID); - this.count = count; - } - - public EntityCountWritable(EntityCountWritable other) { - this(other.get(), other.getCount()); - } - - public long getID() { - return get(); - } - - public int getCount() { - return count; - } - - public void set(long id, int count) { - set(id); - this.count = count; - } - - @Override - public void write(DataOutput out) throws IOException { - super.write(out); - Varint.writeUnsignedVarInt(count, out); - } - - @Override - public void readFields(DataInput in) throws IOException { - super.readFields(in); - count = Varint.readUnsignedVarInt(in); - } - - @Override - public int hashCode() { - return super.hashCode() ^ count; - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof EntityCountWritable)) { - return false; - } - EntityCountWritable other = (EntityCountWritable) o; - return get() == other.get() && count == other.getCount(); - } - - @Override - public String toString() { - return get() + "\t" + count; - } - - @Override - public EntityCountWritable clone() { - return new EntityCountWritable(get(), count); - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java deleted file mode 100644 index 0106474c5..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import com.google.common.primitives.Longs; -import org.apache.hadoop.io.WritableComparable; -import org.apache.mahout.math.Varint; - -/** A {@link WritableComparable} encapsulating two items. 
*/
-public final class EntityEntityWritable
-    implements WritableComparable<EntityEntityWritable>, Cloneable {
-
-  private long aID;
-  private long bID;
-
-  public EntityEntityWritable() {
-    // do nothing
-  }
-
-  public EntityEntityWritable(long aID, long bID) {
-    this.aID = aID;
-    this.bID = bID;
-  }
-
-  public long getAID() {
-    return aID;
-  }
-
-  public long getBID() {
-    return bID;
-  }
-
-  public void set(long aID, long bID) {
-    this.aID = aID;
-    this.bID = bID;
-  }
-
-  @Override
-  public void write(DataOutput out) throws IOException {
-    Varint.writeSignedVarLong(aID, out);
-    Varint.writeSignedVarLong(bID, out);
-  }
-
-  @Override
-  public void readFields(DataInput in) throws IOException {
-    aID = Varint.readSignedVarLong(in);
-    bID = Varint.readSignedVarLong(in);
-  }
-
-  @Override
-  public int compareTo(EntityEntityWritable that) {
-    int aCompare = compare(aID, that.getAID());
-    return aCompare == 0 ? compare(bID, that.getBID()) : aCompare;
-  }
-
-  private static int compare(long a, long b) {
-    return a < b ? -1 : a > b ? 1 : 0;
-  }
-
-  @Override
-  public int hashCode() {
-    return Longs.hashCode(aID) + 31 * Longs.hashCode(bID);
-  }
-
-  @Override
-  public boolean equals(Object o) {
-    if (o instanceof EntityEntityWritable) {
-      EntityEntityWritable that = (EntityEntityWritable) o;
-      return aID == that.getAID() && bID == that.getBID();
-    }
-    return false;
-  }
-
-  @Override
-  public String toString() {
-    return aID + "\t" + bID;
-  }
-
-  @Override
-  public EntityEntityWritable clone() {
-    return new EntityEntityWritable(aID, bID);
-  }
-
-}
\ No newline at end of file
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java
deleted file mode 100644
index 5bcc80578..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.hadoop;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.VarLongWritable;
-
-/** A {@link org.apache.hadoop.io.Writable} encapsulating an item ID and a preference value.
*/ -public final class EntityPrefWritable extends VarLongWritable { - - private float prefValue; - - public EntityPrefWritable() { - // do nothing - } - - public EntityPrefWritable(long itemID, float prefValue) { - super(itemID); - this.prefValue = prefValue; - } - - public EntityPrefWritable(EntityPrefWritable other) { - this(other.get(), other.getPrefValue()); - } - - public long getID() { - return get(); - } - - public float getPrefValue() { - return prefValue; - } - - public void set(long id, float prefValue) { - set(id); - this.prefValue = prefValue; - } - - @Override - public void write(DataOutput out) throws IOException { - super.write(out); - out.writeFloat(prefValue); - } - - @Override - public void readFields(DataInput in) throws IOException { - super.readFields(in); - prefValue = in.readFloat(); - } - - @Override - public int hashCode() { - return super.hashCode() ^ RandomUtils.hashFloat(prefValue); - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof EntityPrefWritable)) { - return false; - } - EntityPrefWritable other = (EntityPrefWritable) o; - return get() == other.get() && prefValue == other.getPrefValue(); - } - - @Override - public String toString() { - return get() + "\t" + prefValue; - } - - @Override - public EntityPrefWritable clone() { - return new EntityPrefWritable(get(), prefValue); - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritableArrayWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritableArrayWritable.java deleted file mode 100644 index 3e9161b6b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritableArrayWritable.java +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop; - -import java.util.Arrays; - -import org.apache.hadoop.io.ArrayWritable; - -/** - * An {@link ArrayWritable} holding {@link EntityPrefWritable}s - */ -public class EntityPrefWritableArrayWritable extends ArrayWritable { - - public EntityPrefWritableArrayWritable() { - super(EntityPrefWritable.class); - } - - public EntityPrefWritableArrayWritable(EntityPrefWritable[] prefs) { - super(EntityPrefWritable.class, prefs); - } - - public EntityPrefWritable[] getPrefs() { - return (EntityPrefWritable[]) toArray(); - } - - @Override - public String toString() { - return Arrays.toString(toStrings()); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java deleted file mode 100644 index a833d5931..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.List; - -import com.google.common.collect.Lists; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.cf.taste.impl.recommender.GenericRecommendedItem; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.math.Varint; - -/** - * A {@link Writable} which encapsulates a list of {@link RecommendedItem}s. This is the mapper (and reducer) - * output, and represents items recommended to a user. The first item is the one whose estimated preference is - * highest. 
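- * Editor's note: on the wire this is an int count followed by one (signed varlong itemID, float value) pair per
- * recommendation, and {@code toString()} renders e.g. {@code [17:3.5,29:3.1]} (hypothetical IDs and values).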
- */ -public final class RecommendedItemsWritable implements Writable { - - private List recommended; - - public RecommendedItemsWritable() { - // do nothing - } - - public RecommendedItemsWritable(List recommended) { - this.recommended = recommended; - } - - public List getRecommendedItems() { - return recommended; - } - - public void set(List recommended) { - this.recommended = recommended; - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(recommended.size()); - for (RecommendedItem item : recommended) { - Varint.writeSignedVarLong(item.getItemID(), out); - out.writeFloat(item.getValue()); - } - - } - - @Override - public void readFields(DataInput in) throws IOException { - int size = in.readInt(); - recommended = Lists.newArrayListWithCapacity(size); - for (int i = 0; i < size; i++) { - long itemID = Varint.readSignedVarLong(in); - float value = in.readFloat(); - RecommendedItem recommendedItem = new GenericRecommendedItem(itemID, value); - recommended.add(recommendedItem); - } - } - - @Override - public String toString() { - StringBuilder result = new StringBuilder(200); - result.append('['); - boolean first = true; - for (RecommendedItem item : recommended) { - if (first) { - first = false; - } else { - result.append(','); - } - result.append(String.valueOf(item.getItemID())); - result.append(':'); - result.append(String.valueOf(item.getValue())); - } - result.append(']'); - return result.toString(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java deleted file mode 100644 index e0ae35411..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop; - -import com.google.common.primitives.Longs; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import org.apache.mahout.math.VarIntWritable; -import org.apache.mahout.math.VarLongWritable; -import org.apache.mahout.math.map.OpenIntLongHashMap; - -import java.util.regex.Pattern; - -/** - * Some helper methods for the hadoop-related stuff in org.apache.mahout.cf.taste - */ -public final class TasteHadoopUtils { - - /** Standard delimiter of textual preference data */ - private static final Pattern PREFERENCE_TOKEN_DELIMITER = Pattern.compile("[\t,]"); - - private TasteHadoopUtils() {} - - /** - * Splits a preference data line into string tokens - */ - public static String[] splitPrefTokens(CharSequence line) { - return PREFERENCE_TOKEN_DELIMITER.split(line); - } - - /** - * Maps a long to an int - */ - public static int idToIndex(long id) { - return 0x7FFFFFFF & Longs.hashCode(id); - } - - /** - * Reads a binary mapping file - */ - public static OpenIntLongHashMap readItemIDIndexMap(String itemIDIndexPathStr, Configuration conf) { - OpenIntLongHashMap indexItemIDMap = new OpenIntLongHashMap(); - Path itemIDIndexPath = new Path(itemIDIndexPathStr); - for (Pair record - : new SequenceFileDirIterable(itemIDIndexPath, - PathType.LIST, - PathFilters.partFilter(), - null, - true, - conf)) { - indexItemIDMap.put(record.getFirst().get(), record.getSecond().get()); - } - return indexItemIDMap; - } - - - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java deleted file mode 100644 index fdb552ecd..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob; -import org.apache.mahout.math.VarLongWritable; - -import java.io.IOException; -import java.util.regex.Pattern; - -public abstract class ToEntityPrefsMapper extends - Mapper { - - public static final String TRANSPOSE_USER_ITEM = ToEntityPrefsMapper.class + "transposeUserItem"; - public static final String RATING_SHIFT = ToEntityPrefsMapper.class + "shiftRatings"; - - private static final Pattern DELIMITER = Pattern.compile("[\t,]"); - - private boolean booleanData; - private boolean transpose; - private final boolean itemKey; - private float ratingShift; - - ToEntityPrefsMapper(boolean itemKey) { - this.itemKey = itemKey; - } - - @Override - protected void setup(Context context) { - Configuration jobConf = context.getConfiguration(); - booleanData = jobConf.getBoolean(RecommenderJob.BOOLEAN_DATA, false); - transpose = jobConf.getBoolean(TRANSPOSE_USER_ITEM, false); - ratingShift = Float.parseFloat(jobConf.get(RATING_SHIFT, "0.0")); - } - - @Override - public void map(LongWritable key, - Text value, - Context context) throws IOException, InterruptedException { - String[] tokens = DELIMITER.split(value.toString()); - long userID = Long.parseLong(tokens[0]); - long itemID = Long.parseLong(tokens[1]); - if (itemKey ^ transpose) { - // If using items as keys, and not transposing items and users, then users are items! - // Or if not using items as keys (users are, as usual), but transposing items and users, - // then users are items! Confused? - long temp = userID; - userID = itemID; - itemID = temp; - } - if (booleanData) { - context.write(new VarLongWritable(userID), new VarLongWritable(itemID)); - } else { - float prefValue = tokens.length > 2 ? Float.parseFloat(tokens[2]) + ratingShift : 1.0f; - context.write(new VarLongWritable(userID), new EntityPrefWritable(itemID, prefValue)); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java deleted file mode 100644 index f147cf3ec..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop; - -/** - *
<h1>Input</h1>
- *
- * <p>
- * Intended for use with {@link org.apache.hadoop.mapreduce.lib.input.TextInputFormat};
- * accepts line number / line pairs as
- * {@link org.apache.hadoop.io.LongWritable}/{@link org.apache.hadoop.io.Text} pairs.
- * </p>
- *
- * <p>
- * Each line is assumed to be of the form {@code userID,itemID,preference}, or {@code userID,itemID}.
- * </p>
- *
- * <h1>Output</h1>
- *
- * <p>
- * Outputs the user ID as a {@link org.apache.mahout.math.VarLongWritable} mapped to the item ID and preference as a
- * {@link EntityPrefWritable}.
- * </p>
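- *
- * <p>Editor's note (illustration, hypothetical values): with the default zero rating shift, the input line
- * {@code 42,17,3.5} is emitted as {@code (VarLongWritable(42), EntityPrefWritable(17, 3.5f))}, and the
- * two-token form {@code 42,17} defaults the preference to 1.0 (see {@code ToEntityPrefsMapper#map} above).</p>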
- */ -public final class ToItemPrefsMapper extends ToEntityPrefsMapper { - - public ToItemPrefsMapper() { - super(false); - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/ToUserPrefsMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/ToUserPrefsMapper.java deleted file mode 100644 index 78567ec41..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/ToUserPrefsMapper.java +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop; - -/** - * The 'reverse' of {@link ToItemPrefsMapper}; outputs item IDs mapped to user-pref data. - */ -public final class ToUserPrefsMapper extends ToEntityPrefsMapper { - - public ToUserPrefsMapper() { - super(true); - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALSUtils.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALSUtils.java deleted file mode 100644 index fc3336e44..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALSUtils.java +++ /dev/null @@ -1,60 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.als; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.map.OpenIntObjectHashMap; - -import java.io.IOException; -import java.util.Iterator; - -final class ALSUtils { - - private ALSUtils() {} - - static Vector readFirstRow(Path dir, Configuration conf) throws IOException { - Iterator iterator = new SequenceFileDirValueIterator(dir, - PathType.LIST, - PathFilters.partFilter(), - null, - true, - conf); - return iterator.hasNext() ? iterator.next().get() : null; - } - - static OpenIntObjectHashMap readMatrixByRows(Path dir, Configuration conf) { - OpenIntObjectHashMap matrix = new OpenIntObjectHashMap(); - - for (Pair pair : - new SequenceFileDirIterable(dir, PathType.LIST, PathFilters.partFilter(), conf)) { - int rowIndex = pair.getFirst().get(); - Vector row = pair.getSecond().get().clone(); - matrix.put(rowIndex, row); - } - return matrix; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java deleted file mode 100644 index f84a29f96..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java +++ /dev/null @@ -1,151 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.als; - -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.RandomUtils; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.Random; - -/** - *
<p>Split a recommendation dataset into a training and a test set</p>
- *
- * <p>Command line arguments specific to this class are:</p>
- *
- * <ol>
- * <li>--input (path): Directory containing one or more text files with the dataset</li>
- * <li>--output (path): path where output should go</li>
- * <li>--trainingPercentage (double): percentage of the data to use as training set (optional, default 0.9)</li>
- * <li>--probePercentage (double): percentage of the data to use as probe set (optional, default 0.1)</li>
- * </ol>
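- *
- * <p>Editor's note: a minimal driver sketch, assuming hypothetical input/output paths:</p>
- * <pre>
- *   // splits 80/20 into /splits/trainingSet and /splits/probeSet
- *   ToolRunner.run(new DatasetSplitter(), new String[] {
- *       "--input", "/ratings", "--output", "/splits",
- *       "--trainingPercentage", "0.8", "--probePercentage", "0.2"});
- * </pre>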
- */ -public class DatasetSplitter extends AbstractJob { - - private static final String TRAINING_PERCENTAGE = DatasetSplitter.class.getName() + ".trainingPercentage"; - private static final String PROBE_PERCENTAGE = DatasetSplitter.class.getName() + ".probePercentage"; - private static final String PART_TO_USE = DatasetSplitter.class.getName() + ".partToUse"; - - private static final Text INTO_TRAINING_SET = new Text("T"); - private static final Text INTO_PROBE_SET = new Text("P"); - - private static final double DEFAULT_TRAINING_PERCENTAGE = 0.9; - private static final double DEFAULT_PROBE_PERCENTAGE = 0.1; - - public static void main(String[] args) throws Exception { - ToolRunner.run(new DatasetSplitter(), args); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption("trainingPercentage", "t", "percentage of the data to use as training set (default: " - + DEFAULT_TRAINING_PERCENTAGE + ')', String.valueOf(DEFAULT_TRAINING_PERCENTAGE)); - addOption("probePercentage", "p", "percentage of the data to use as probe set (default: " - + DEFAULT_PROBE_PERCENTAGE + ')', String.valueOf(DEFAULT_PROBE_PERCENTAGE)); - - Map> parsedArgs = parseArguments(args); - double trainingPercentage = Double.parseDouble(getOption("trainingPercentage")); - double probePercentage = Double.parseDouble(getOption("probePercentage")); - String tempDir = getOption("tempDir"); - - Path markedPrefs = new Path(tempDir, "markedPreferences"); - Path trainingSetPath = new Path(getOutputPath(), "trainingSet"); - Path probeSetPath = new Path(getOutputPath(), "probeSet"); - - Job markPreferences = prepareJob(getInputPath(), markedPrefs, TextInputFormat.class, MarkPreferencesMapper.class, - Text.class, Text.class, SequenceFileOutputFormat.class); - markPreferences.getConfiguration().set(TRAINING_PERCENTAGE, String.valueOf(trainingPercentage)); - markPreferences.getConfiguration().set(PROBE_PERCENTAGE, String.valueOf(probePercentage)); - boolean succeeded = markPreferences.waitForCompletion(true); - if (!succeeded) - return -1; - - Job createTrainingSet = prepareJob(markedPrefs, trainingSetPath, SequenceFileInputFormat.class, - WritePrefsMapper.class, NullWritable.class, Text.class, TextOutputFormat.class); - createTrainingSet.getConfiguration().set(PART_TO_USE, INTO_TRAINING_SET.toString()); - succeeded = createTrainingSet.waitForCompletion(true); - if (!succeeded) - return -1; - - Job createProbeSet = prepareJob(markedPrefs, probeSetPath, SequenceFileInputFormat.class, - WritePrefsMapper.class, NullWritable.class, Text.class, TextOutputFormat.class); - createProbeSet.getConfiguration().set(PART_TO_USE, INTO_PROBE_SET.toString()); - succeeded = createProbeSet.waitForCompletion(true); - if (!succeeded) - return -1; - - return 0; - } - - static class MarkPreferencesMapper extends Mapper { - - private Random random; - private double trainingBound; - private double probeBound; - - @Override - protected void setup(Context ctx) throws IOException, InterruptedException { - random = RandomUtils.getRandom(); - trainingBound = Double.parseDouble(ctx.getConfiguration().get(TRAINING_PERCENTAGE)); - probeBound = trainingBound + Double.parseDouble(ctx.getConfiguration().get(PROBE_PERCENTAGE)); - } - - @Override - protected void map(LongWritable key, Text text, Context ctx) throws IOException, InterruptedException { - double randomValue = random.nextDouble(); - if (randomValue <= trainingBound) { - ctx.write(INTO_TRAINING_SET, text); - } else if (randomValue <= probeBound) { - 
ctx.write(INTO_PROBE_SET, text); - } - } - } - - static class WritePrefsMapper extends Mapper { - - private String partToUse; - - @Override - protected void setup(Context ctx) throws IOException, InterruptedException { - partToUse = ctx.getConfiguration().get(PART_TO_USE); - } - - @Override - protected void map(Text key, Text text, Context ctx) throws IOException, InterruptedException { - if (partToUse.equals(key.toString())) { - ctx.write(NullWritable.get(), text); - } - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java deleted file mode 100644 index 0c6e3f764..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java +++ /dev/null @@ -1,151 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.als; - -import com.google.common.io.Closeables; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.DoubleWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.map.OpenIntObjectHashMap; - -import java.io.BufferedWriter; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.util.List; -import java.util.Map; - -/** - *
<p>Measures the root-mean-squared error of a rating matrix factorization against a test set.</p>
- *
- * <p>Command line arguments specific to this class are:</p>
- *
- * <ol>
- * <li>--output (path): path where output should go</li>
- * <li>--pairs (path): path containing the test ratings, each line must be userID,itemID,rating</li>
- * <li>--userFeatures (path): path to the user feature matrix</li>
- * <li>--itemFeatures (path): path to the item feature matrix</li>
- * </ol>
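- *
- * <p>Editor's note: per test pair the job records {@code err = rating - dot(U_row(user), M_row(item))} and
- * reports {@code RMSE = sqrt(mean(err * err))}; pairs whose user or item is absent from the factor matrices
- * are skipped (see {@code PredictRatingsMapper} and {@code computeRmse} below).</p>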
- */ -public class FactorizationEvaluator extends AbstractJob { - - private static final String USER_FEATURES_PATH = RecommenderJob.class.getName() + ".userFeatures"; - private static final String ITEM_FEATURES_PATH = RecommenderJob.class.getName() + ".itemFeatures"; - - public static void main(String[] args) throws Exception { - ToolRunner.run(new FactorizationEvaluator(), args); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOption("userFeatures", null, "path to the user feature matrix", true); - addOption("itemFeatures", null, "path to the item feature matrix", true); - addOutputOption(); - - Map> parsedArgs = parseArguments(args); - if (parsedArgs == null) { - return -1; - } - - Path errors = getTempPath("errors"); - - Job predictRatings = prepareJob(getInputPath(), errors, TextInputFormat.class, PredictRatingsMapper.class, - DoubleWritable.class, NullWritable.class, SequenceFileOutputFormat.class); - - predictRatings.getConfiguration().set(USER_FEATURES_PATH, getOption("userFeatures")); - predictRatings.getConfiguration().set(ITEM_FEATURES_PATH, getOption("itemFeatures")); - boolean succeeded = predictRatings.waitForCompletion(true); - if (!succeeded) - return -1; - - BufferedWriter writer = null; - try { - FileSystem fs = FileSystem.get(getOutputPath().toUri(), getConf()); - FSDataOutputStream outputStream = fs.create(getOutputPath("rmse.txt")); - double rmse = computeRmse(errors); - writer = new BufferedWriter(new OutputStreamWriter(outputStream)); - writer.write(String.valueOf(rmse)); - } finally { - Closeables.closeQuietly(writer); - } - - return 0; - } - - protected double computeRmse(Path errors) { - RunningAverage average = new FullRunningAverage(); - for (Pair entry : - new SequenceFileDirIterable(errors, PathType.LIST, PathFilters.logsCRCFilter(), - getConf())) { - DoubleWritable error = entry.getFirst(); - average.addDatum(error.get() * error.get()); - } - - return Math.sqrt(average.getAverage()); - } - - public static class PredictRatingsMapper extends Mapper { - - private OpenIntObjectHashMap U; - private OpenIntObjectHashMap M; - - @Override - protected void setup(Context ctx) throws IOException, InterruptedException { - Path pathToU = new Path(ctx.getConfiguration().get(USER_FEATURES_PATH)); - Path pathToM = new Path(ctx.getConfiguration().get(ITEM_FEATURES_PATH)); - - U = ALSUtils.readMatrixByRows(pathToU, ctx.getConfiguration()); - M = ALSUtils.readMatrixByRows(pathToM, ctx.getConfiguration()); - } - - @Override - protected void map(LongWritable key, Text value, Context ctx) throws IOException, InterruptedException { - - String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString()); - int userID = Integer.parseInt(tokens[0]); - int itemID = Integer.parseInt(tokens[1]); - double rating = Double.parseDouble(tokens[2]); - - if (U.containsKey(userID) && M.containsKey(itemID)) { - double estimate = U.get(userID).dot(M.get(itemID)); - double err = rating - estimate; - ctx.write(new DoubleWritable(err), NullWritable.get()); - } - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java deleted file mode 100644 index 7dc3b7988..000000000 --- 
a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java +++ /dev/null @@ -1,329 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.als; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; -import com.google.common.io.Closeables; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.mapreduce.MergeVectorsCombiner; -import org.apache.mahout.common.mapreduce.MergeVectorsReducer; -import org.apache.mahout.common.mapreduce.TransposeMapper; -import org.apache.mahout.common.mapreduce.VectorSumReducer; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.SequentialAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.als.AlternatingLeastSquaresSolver; -import org.apache.mahout.math.als.ImplicitFeedbackAlternatingLeastSquaresSolver; -import org.apache.mahout.math.map.OpenIntObjectHashMap; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Random; - -/** - *
<p>MapReduce implementation of the two factorization algorithms described in</p>
- *
- * <p>"Large-scale Parallel Collaborative Filtering for the Netflix Prize", available at
- * http://www.hpl.hp.com/personal/Robert_Schreiber/papers/2008%20AAIM%20Netflix/netflix_aaim08(submitted).pdf, and</p>
- *
- * <p>"Collaborative Filtering for Implicit Feedback Datasets", available at
- * http://research.yahoo.com/pub/2433</p>
- *
- * <p>Command line arguments specific to this class are:</p>
- *
- * <ol>
- * <li>--input (path): Directory containing one or more text files with the dataset</li>
- * <li>--output (path): path where output should go</li>
- * <li>--lambda (double): regularization parameter to avoid overfitting</li>
- * <li>--userFeatures (path): path to the user feature matrix</li>
- * <li>--itemFeatures (path): path to the item feature matrix</li>
- * </ol>
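- *
- * <p>Editor's note: the job computes the factorization A = U M' by alternating least squares; each iteration
- * broadcasts M and recomputes U row-wise from the user ratings, then broadcasts U and recomputes M row-wise
- * from the item ratings (see {@code run()} below).</p>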
- */ -public class ParallelALSFactorizationJob extends AbstractJob { - - private static final Logger log = LoggerFactory.getLogger(ParallelALSFactorizationJob.class); - - static final String NUM_FEATURES = ParallelALSFactorizationJob.class.getName() + ".numFeatures"; - static final String LAMBDA = ParallelALSFactorizationJob.class.getName() + ".lambda"; - static final String ALPHA = ParallelALSFactorizationJob.class.getName() + ".alpha"; - static final String FEATURE_MATRIX = ParallelALSFactorizationJob.class.getName() + ".featureMatrix"; - - private boolean implicitFeedback; - private int numIterations; - private int numFeatures; - private double lambda; - private double alpha; - - public static void main(String[] args) throws Exception { - ToolRunner.run(new ParallelALSFactorizationJob(), args); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption("lambda", null, "regularization parameter", true); - addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false)); - addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40)); - addOption("numFeatures", null, "dimension of the feature space", true); - addOption("numIterations", null, "number of iterations", true); - - Map> parsedArgs = parseArguments(args); - if (parsedArgs == null) { - return -1; - } - - numFeatures = Integer.parseInt(getOption("numFeatures")); - numIterations = Integer.parseInt(getOption("numIterations")); - lambda = Double.parseDouble(getOption("lambda")); - alpha = Double.parseDouble(getOption("alpha")); - implicitFeedback = Boolean.parseBoolean(getOption("implicitFeedback")); - - /* - * compute the factorization A = U M' - * - * where A (users x items) is the matrix of known ratings - * U (users x features) is the representation of users in the feature space - * M (items x features) is the representation of items in the feature space - */ - - /* create A' */ - Job itemRatings = prepareJob(getInputPath(), pathToItemRatings(), - TextInputFormat.class, ItemRatingVectorsMapper.class, IntWritable.class, - VectorWritable.class, VectorSumReducer.class, IntWritable.class, - VectorWritable.class, SequenceFileOutputFormat.class); - itemRatings.setCombinerClass(VectorSumReducer.class); - boolean succeeded = itemRatings.waitForCompletion(true); - if (!succeeded) - return -1; - - /* create A */ - Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), - TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, - VectorWritable.class); - userRatings.setCombinerClass(MergeVectorsCombiner.class); - succeeded = userRatings.waitForCompletion(true); - if (!succeeded) - return -1; - - //TODO this could be fiddled into one of the upper jobs - Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"), - AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, - IntWritable.class, VectorWritable.class); - averageItemRatings.setCombinerClass(MergeVectorsCombiner.class); - succeeded = averageItemRatings.waitForCompletion(true); - if (!succeeded) - return -1; - - Vector averageRatings = ALSUtils.readFirstRow(getTempPath("averageRatings"), getConf()); - - /* create an initial M */ - initializeM(averageRatings); - - for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) { - /* broadcast M, read A row-wise, recompute U row-wise */ - 
log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations); - runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1)); - /* broadcast U, read A' row-wise, recompute M row-wise */ - log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations); - runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration)); - } - - return 0; - } - - private void initializeM(Vector averageRatings) throws IOException { - Random random = RandomUtils.getRandom(); - - FileSystem fs = FileSystem.get(pathToM(-1).toUri(), getConf()); - SequenceFile.Writer writer = null; - try { - writer = new SequenceFile.Writer(fs, getConf(), new Path(pathToM(-1), "part-m-00000"), IntWritable.class, - VectorWritable.class); - - Iterator averages = averageRatings.iterateNonZero(); - while (averages.hasNext()) { - Vector.Element e = averages.next(); - Vector row = new DenseVector(numFeatures); - row.setQuick(0, e.get()); - for (int m = 1; m < numFeatures; m++) { - row.setQuick(m, random.nextDouble()); - } - writer.append(new IntWritable(e.index()), new VectorWritable(row)); - } - } finally { - Closeables.closeQuietly(writer); - } - } - - static class ItemRatingVectorsMapper extends Mapper { - @Override - protected void map(LongWritable offset, Text line, Context ctx) throws IOException, InterruptedException { - String[] tokens = TasteHadoopUtils.splitPrefTokens(line.toString()); - int userID = Integer.parseInt(tokens[0]); - int itemID = Integer.parseInt(tokens[1]); - float rating = Float.parseFloat(tokens[2]); - - Vector ratings = new RandomAccessSparseVector(Integer.MAX_VALUE, 1); - ratings.set(userID, rating); - - ctx.write(new IntWritable(itemID), new VectorWritable(ratings, true)); - } - } - - private void runSolver(Path ratings, Path output, Path pathToUorI) - throws ClassNotFoundException, IOException, InterruptedException { - - Class solverMapper = implicitFeedback ? 
- SolveImplicitFeedbackMapper.class : SolveExplicitFeedbackMapper.class; - - Job solverForUorI = prepareJob(ratings, output, SequenceFileInputFormat.class, solverMapper, IntWritable.class, - VectorWritable.class, SequenceFileOutputFormat.class); - Configuration solverConf = solverForUorI.getConfiguration(); - solverConf.set(LAMBDA, String.valueOf(lambda)); - solverConf.set(ALPHA, String.valueOf(alpha)); - solverConf.setInt(NUM_FEATURES, numFeatures); - solverConf.set(FEATURE_MATRIX, pathToUorI.toString()); - boolean succeeded = solverForUorI.waitForCompletion(true); - if (!succeeded) - throw new IllegalStateException("Job failed!"); - } - - static class SolveExplicitFeedbackMapper extends Mapper { - - private double lambda; - private int numFeatures; - - private OpenIntObjectHashMap UorM; - - private AlternatingLeastSquaresSolver solver; - - @Override - protected void setup(Mapper.Context ctx) throws IOException, InterruptedException { - lambda = Double.parseDouble(ctx.getConfiguration().get(LAMBDA)); - numFeatures = ctx.getConfiguration().getInt(NUM_FEATURES, -1); - solver = new AlternatingLeastSquaresSolver(); - - Path UOrIPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX)); - - UorM = ALSUtils.readMatrixByRows(UOrIPath, ctx.getConfiguration()); - Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!"); - } - - @Override - protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx) - throws IOException, InterruptedException { - Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get()); - List featureVectors = Lists.newArrayList(); - Iterator interactions = ratings.iterateNonZero(); - while (interactions.hasNext()) { - int index = interactions.next().index(); - featureVectors.add(UorM.get(index)); - } - - Vector uiOrmj = solver.solve(featureVectors, ratings, lambda, numFeatures); - - ctx.write(userOrItemID, new VectorWritable(uiOrmj)); - } - } - - static class SolveImplicitFeedbackMapper extends Mapper { - - private ImplicitFeedbackAlternatingLeastSquaresSolver solver; - - @Override - protected void setup(Mapper.Context ctx) throws IOException, InterruptedException { - double lambda = Double.parseDouble(ctx.getConfiguration().get(LAMBDA)); - double alpha = Double.parseDouble(ctx.getConfiguration().get(ALPHA)); - int numFeatures = ctx.getConfiguration().getInt(NUM_FEATURES, -1); - - Path YPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX)); - OpenIntObjectHashMap Y = ALSUtils.readMatrixByRows(YPath, ctx.getConfiguration()); - - solver = new ImplicitFeedbackAlternatingLeastSquaresSolver(numFeatures, lambda, alpha, Y); - - Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!"); - } - - @Override - protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx) - throws IOException, InterruptedException { - Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get()); - - Vector uiOrmj = solver.solve(ratings); - - ctx.write(userOrItemID, new VectorWritable(uiOrmj)); - } - } - - static class AverageRatingMapper extends Mapper { - @Override - protected void map(IntWritable r, VectorWritable v, Context ctx) throws IOException, InterruptedException { - RunningAverage avg = new FullRunningAverage(); - Iterator elements = v.get().iterateNonZero(); - while (elements.hasNext()) { - avg.addDatum(elements.next().get()); - } - Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1); - vector.setQuick(r.get(), avg.getAverage()); - 
ctx.write(new IntWritable(0), new VectorWritable(vector)); - } - } - - private Path pathToM(int iteration) { - return iteration == numIterations - 1 ? getOutputPath("M") : getTempPath("M-" + iteration); - } - - private Path pathToU(int iteration) { - return iteration == numIterations - 1 ? getOutputPath("U") : getTempPath("U-" + iteration); - } - - private Path pathToItemRatings() { - return getTempPath("itemRatings"); - } - - private Path pathToUserRatings() { - return getOutputPath("userRatings"); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java deleted file mode 100644 index 9ca0b1663..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java +++ /dev/null @@ -1,169 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.als; - -import com.google.common.collect.Lists; -import com.google.common.primitives.Floats; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.cf.taste.common.TopK; -import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable; -import org.apache.mahout.cf.taste.impl.recommender.GenericRecommendedItem; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.function.IntObjectProcedure; -import org.apache.mahout.math.map.OpenIntObjectHashMap; -import org.apache.mahout.math.set.OpenIntHashSet; - -import java.io.IOException; -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -/** - *
<p>Computes the top-N recommendations per user from a decomposition of the rating matrix</p>
- *
- * <p>Command line arguments specific to this class are:</p>
- *
- * <ol>
- * <li>--input (path): Directory containing the vectorized user ratings</li>
- * <li>--output (path): path where output should go</li>
- * <li>--numRecommendations (int): maximum number of recommendations per user</li>
- * <li>--maxRating (double): maximum rating of an item</li>
- * <li>--numFeatures (int): number of features to use for decomposition</li>
- * </ol>
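- *
- * <p>Editor's note: for every item the user has not rated, {@code PredictionMapper} scores
- * {@code dot(U_row(user), M_row(item))}, keeps the top N items by score, and caps each emitted value at
- * {@code maxRating}.</p>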
- */ -public class RecommenderJob extends AbstractJob { - - private static final String NUM_RECOMMENDATIONS = RecommenderJob.class.getName() + ".numRecommendations"; - private static final String USER_FEATURES_PATH = RecommenderJob.class.getName() + ".userFeatures"; - private static final String ITEM_FEATURES_PATH = RecommenderJob.class.getName() + ".itemFeatures"; - private static final String MAX_RATING = RecommenderJob.class.getName() + ".maxRating"; - - static final int DEFAULT_NUM_RECOMMENDATIONS = 10; - - public static void main(String[] args) throws Exception { - ToolRunner.run(new RecommenderJob(), args); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOption("userFeatures", null, "path to the user feature matrix", true); - addOption("itemFeatures", null, "path to the item feature matrix", true); - addOption("numRecommendations", null, "number of recommendations per user", - String.valueOf(DEFAULT_NUM_RECOMMENDATIONS)); - addOption("maxRating", null, "maximum rating available", true); - addOutputOption(); - - Map> parsedArgs = parseArguments(args); - if (parsedArgs == null) { - return -1; - } - - Job prediction = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class, PredictionMapper.class, - IntWritable.class, RecommendedItemsWritable.class, TextOutputFormat.class); - prediction.getConfiguration().setInt(NUM_RECOMMENDATIONS, - Integer.parseInt(getOption("numRecommendations"))); - prediction.getConfiguration().set(USER_FEATURES_PATH, getOption("userFeatures")); - prediction.getConfiguration().set(ITEM_FEATURES_PATH, getOption("itemFeatures")); - prediction.getConfiguration().set(MAX_RATING, getOption("maxRating")); - boolean succeeded = prediction.waitForCompletion(true); - if (!succeeded) - return -1; - - - return 0; - } - - private static final Comparator BY_PREFERENCE_VALUE = - new Comparator() { - @Override - public int compare(RecommendedItem one, RecommendedItem two) { - return Floats.compare(one.getValue(), two.getValue()); - } - }; - - static class PredictionMapper - extends Mapper { - - private OpenIntObjectHashMap U; - private OpenIntObjectHashMap M; - - private int recommendationsPerUser; - private float maxRating; - - @Override - protected void setup(Context ctx) throws IOException, InterruptedException { - recommendationsPerUser = ctx.getConfiguration().getInt(NUM_RECOMMENDATIONS, - DEFAULT_NUM_RECOMMENDATIONS); - - Path pathToU = new Path(ctx.getConfiguration().get(USER_FEATURES_PATH)); - Path pathToM = new Path(ctx.getConfiguration().get(ITEM_FEATURES_PATH)); - - U = ALSUtils.readMatrixByRows(pathToU, ctx.getConfiguration()); - M = ALSUtils.readMatrixByRows(pathToM, ctx.getConfiguration()); - - maxRating = Float.parseFloat(ctx.getConfiguration().get(MAX_RATING)); - } - - @Override - protected void map(IntWritable userIDWritable, VectorWritable ratingsWritable, Context ctx) - throws IOException, InterruptedException { - - Vector ratings = ratingsWritable.get(); - final int userID = userIDWritable.get(); - final OpenIntHashSet alreadyRatedItems = new OpenIntHashSet(ratings.getNumNondefaultElements()); - final TopK topKItems = new TopK(recommendationsPerUser, BY_PREFERENCE_VALUE); - - Iterator ratingsIterator = ratings.iterateNonZero(); - while (ratingsIterator.hasNext()) { - alreadyRatedItems.add(ratingsIterator.next().index()); - } - - M.forEachPair(new IntObjectProcedure() { - @Override - public boolean apply(int itemID, Vector itemFeatures) { - if (!alreadyRatedItems.contains(itemID)) { - double 
predictedRating = U.get(userID).dot(itemFeatures); - topKItems.offer(new GenericRecommendedItem(itemID, (float) predictedRating)); - } - return true; - } - }); - - List recommendedItems = Lists.newArrayListWithExpectedSize(recommendationsPerUser); - for (RecommendedItem topItem : topKItems.retrieve()) { - recommendedItems.add(new GenericRecommendedItem(topItem.getItemID(), Math.min(topItem.getValue(), maxRating))); - } - - if (!topKItems.isEmpty()) { - ctx.write(userIDWritable, new RecommendedItemsWritable(recommendedItems)); - } - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/AggregateAndRecommendReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/AggregateAndRecommendReducer.java deleted file mode 100644 index 0d8212c99..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/AggregateAndRecommendReducer.java +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import com.google.common.primitives.Floats; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.cf.taste.common.TopK; -import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable; -import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.recommender.GenericRecommendedItem; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.iterator.FileLineIterable; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.VarLongWritable; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.function.DoubleFunction; -import org.apache.mahout.math.map.OpenIntLongHashMap; - -import java.io.IOException; -import java.util.Comparator; -import java.util.Iterator; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - *
<p>Computes prediction values for each user</p>
- *
- * <pre>
- * u = a user
- * i = an item not yet rated by u
- * N = all items similar to i (where similarity is usually computed by pairwise comparison of the item-vectors
- *     of the user-item matrix)
- *
- * Prediction(u,i) = sum(all n from N: similarity(i,n) * rating(u,n)) / sum(all n from N: abs(similarity(i,n)))
- * </pre>
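- *
- * <p>Editor's example (hypothetical numbers): if u rated two items similar to i with ratings 4.0 and 2.0 and
- * similarities 0.8 and 0.4, then Prediction(u,i) = (0.8*4.0 + 0.4*2.0) / (0.8 + 0.4) = 4.0 / 1.2 = 3.33.</p>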
- */ -public final class AggregateAndRecommendReducer extends - Reducer { - - private static final Logger log = LoggerFactory.getLogger(AggregateAndRecommendReducer.class); - - static final String ITEMID_INDEX_PATH = "itemIDIndexPath"; - static final String NUM_RECOMMENDATIONS = "numRecommendations"; - static final int DEFAULT_NUM_RECOMMENDATIONS = 10; - static final String ITEMS_FILE = "itemsFile"; - - private boolean booleanData; - private int recommendationsPerUser; - private FastIDSet itemsToRecommendFor; - private OpenIntLongHashMap indexItemIDMap; - - private static final float BOOLEAN_PREF_VALUE = 1.0f; - private static final Comparator BY_PREFERENCE_VALUE = - new Comparator() { - @Override - public int compare(RecommendedItem one, RecommendedItem two) { - return Floats.compare(one.getValue(), two.getValue()); - } - }; - - @Override - protected void setup(Context context) throws IOException { - Configuration conf = context.getConfiguration(); - recommendationsPerUser = conf.getInt(NUM_RECOMMENDATIONS, DEFAULT_NUM_RECOMMENDATIONS); - booleanData = conf.getBoolean(RecommenderJob.BOOLEAN_DATA, false); - indexItemIDMap = TasteHadoopUtils.readItemIDIndexMap(conf.get(ITEMID_INDEX_PATH), conf); - - String itemFilePathString = conf.get(ITEMS_FILE); - if (itemFilePathString != null) { - itemsToRecommendFor = new FastIDSet(); - for (String line : new FileLineIterable(HadoopUtil.openStream(new Path(itemFilePathString), conf))) { - try { - itemsToRecommendFor.add(Long.parseLong(line)); - } catch (NumberFormatException nfe) { - log.warn("itemsFile line ignored: {}", line); - } - } - } - } - - private static final DoubleFunction ABSOLUTE_VALUES = new DoubleFunction() { - @Override - public double apply(double value) { - return value < 0 ? value * -1 : value; - } - }; - - @Override - protected void reduce(VarLongWritable userID, - Iterable values, - Context context) throws IOException, InterruptedException { - if (booleanData) { - reduceBooleanData(userID, values, context); - } else { - reduceNonBooleanData(userID, values, context); - } - } - - private void reduceBooleanData(VarLongWritable userID, - Iterable values, - Context context) throws IOException, InterruptedException { - /* having boolean data, each estimated preference can only be 1, - * however we can't use this to rank the recommended items, - * so we use the sum of similarities for that. */ - Vector predictionVector = null; - for (PrefAndSimilarityColumnWritable prefAndSimilarityColumn : values) { - predictionVector = predictionVector == null - ? 
prefAndSimilarityColumn.getSimilarityColumn() - : predictionVector.plus(prefAndSimilarityColumn.getSimilarityColumn()); - } - writeRecommendedItems(userID, predictionVector, context); - } - - private void reduceNonBooleanData(VarLongWritable userID, - Iterable values, - Context context) throws IOException, InterruptedException { - /* each entry here is the sum in the numerator of the prediction formula */ - Vector numerators = null; - /* each entry here is the sum in the denominator of the prediction formula */ - Vector denominators = null; - /* each entry here is the number of similar items used in the prediction formula */ - Vector numberOfSimilarItemsUsed = new RandomAccessSparseVector(Integer.MAX_VALUE, 100); - - for (PrefAndSimilarityColumnWritable prefAndSimilarityColumn : values) { - Vector simColumn = prefAndSimilarityColumn.getSimilarityColumn(); - float prefValue = prefAndSimilarityColumn.getPrefValue(); - /* count the number of items used for each prediction */ - Iterator usedItemsIterator = simColumn.iterateNonZero(); - while (usedItemsIterator.hasNext()) { - int itemIDIndex = usedItemsIterator.next().index(); - numberOfSimilarItemsUsed.setQuick(itemIDIndex, numberOfSimilarItemsUsed.getQuick(itemIDIndex) + 1); - } - - numerators = numerators == null - ? prefValue == BOOLEAN_PREF_VALUE ? simColumn.clone() : simColumn.times(prefValue) - : numerators.plus(prefValue == BOOLEAN_PREF_VALUE ? simColumn : simColumn.times(prefValue)); - - simColumn.assign(ABSOLUTE_VALUES); - denominators = denominators == null ? simColumn : denominators.plus(simColumn); - } - - if (numerators == null) { - return; - } - - Vector recommendationVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100); - Iterator iterator = numerators.iterateNonZero(); - while (iterator.hasNext()) { - Vector.Element element = iterator.next(); - int itemIDIndex = element.index(); - /* preference estimations must be based on at least 2 datapoints */ - if (numberOfSimilarItemsUsed.getQuick(itemIDIndex) > 1) { - /* compute normalized prediction */ - double prediction = element.get() / denominators.getQuick(itemIDIndex); - recommendationVector.setQuick(itemIDIndex, prediction); - } - } - writeRecommendedItems(userID, recommendationVector, context); - } - - /** - * find the top entries in recommendationVector, map them to the real itemIDs and write back the result - */ - private void writeRecommendedItems(VarLongWritable userID, Vector recommendationVector, Context context) - throws IOException, InterruptedException { - - TopK topKItems = new TopK(recommendationsPerUser, BY_PREFERENCE_VALUE); - - Iterator recommendationVectorIterator = recommendationVector.iterateNonZero(); - while (recommendationVectorIterator.hasNext()) { - Vector.Element element = recommendationVectorIterator.next(); - int index = element.index(); - long itemID; - if (indexItemIDMap != null && !indexItemIDMap.isEmpty()) { - itemID = indexItemIDMap.get(index); - } else { //we don't have any mappings, so just use the original - itemID = index; - } - if (itemsToRecommendFor == null || itemsToRecommendFor.contains(itemID)) { - float value = (float) element.get(); - if (!Float.isNaN(value)) { - topKItems.offer(new GenericRecommendedItem(itemID, value)); - } - } - } - - if (!topKItems.isEmpty()) { - context.write(userID, new RecommendedItemsWritable(topKItems.retrieve())); - } - } - -} diff --git 
a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterAsVectorAndPrefsReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterAsVectorAndPrefsReducer.java deleted file mode 100644 index a31bf3015..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterAsVectorAndPrefsReducer.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import com.google.common.collect.Lists; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.VarIntWritable; -import org.apache.mahout.math.VarLongWritable; -import org.apache.mahout.math.Vector; - -import java.io.IOException; -import java.util.List; - -/** - * we use a neat little trick to explicitly filter items for some users: we inject a NaN summand into the preference - * estimation for those items, which makes {@link org.apache.mahout.cf.taste.hadoop.item.AggregateAndRecommendReducer} - * automatically exclude them - */ -public class ItemFilterAsVectorAndPrefsReducer - extends Reducer { - @Override - protected void reduce(VarLongWritable itemID, Iterable values, Context ctx) - throws IOException, InterruptedException { - - int itemIDIndex = TasteHadoopUtils.idToIndex(itemID.get()); - Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1); - /* artificial NaN summand to exclude this item from the recommendations for all users specified in userIDs */ - vector.set(itemIDIndex, Double.NaN); - - List userIDs = Lists.newArrayList(); - List prefValues = Lists.newArrayList(); - for (VarLongWritable userID : values) { - userIDs.add(userID.get()); - prefValues.add(1.0f); - } - - ctx.write(new VarIntWritable(itemIDIndex), new VectorAndPrefsWritable(vector, userIDs, prefValues)); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterMapper.java deleted file mode 100644 index 6350f0c4c..000000000 --- 
a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterMapper.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.math.VarLongWritable; - -import java.io.IOException; -import java.util.regex.Pattern; - -/** - * map out all user/item pairs to filter, keyed by the itemID - */ -public class ItemFilterMapper extends Mapper { - - private static final Pattern SEPARATOR = Pattern.compile("[\t,]"); - - @Override - protected void map(LongWritable key, Text line, Context ctx) throws IOException, InterruptedException { - String[] tokens = SEPARATOR.split(line.toString()); - long userID = Long.parseLong(tokens[0]); - long itemID = Long.parseLong(tokens[1]); - ctx.write(new VarLongWritable(itemID), new VarLongWritable(userID)); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexMapper.java deleted file mode 100644 index 5435f80dd..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexMapper.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils; -import org.apache.mahout.cf.taste.hadoop.ToEntityPrefsMapper; -import org.apache.mahout.math.VarIntWritable; -import org.apache.mahout.math.VarLongWritable; - -public final class ItemIDIndexMapper extends - Mapper { - - private boolean transpose; - - @Override - protected void setup(Context context) { - Configuration jobConf = context.getConfiguration(); - transpose = jobConf.getBoolean(ToEntityPrefsMapper.TRANSPOSE_USER_ITEM, false); - } - - @Override - protected void map(LongWritable key, - Text value, - Context context) throws IOException, InterruptedException { - String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString()); - long itemID = Long.parseLong(tokens[transpose ? 0 : 1]); - int index = TasteHadoopUtils.idToIndex(itemID); - context.write(new VarIntWritable(index), new VarLongWritable(itemID)); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexReducer.java deleted file mode 100644 index 793c2ab0a..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexReducer.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import java.io.IOException; - -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.math.VarIntWritable; -import org.apache.mahout.math.VarLongWritable; - -public final class ItemIDIndexReducer extends - Reducer { - - @Override - protected void reduce(VarIntWritable index, - Iterable possibleItemIDs, - Context context) throws IOException, InterruptedException { - long minimumItemID = Long.MAX_VALUE; - for (VarLongWritable varLongWritable : possibleItemIDs) { - long itemID = varLongWritable.get(); - if (itemID < minimumItemID) { - minimumItemID = itemID; - } - } - if (minimumItemID != Long.MAX_VALUE) { - context.write(index, new VarLongWritable(minimumItemID)); - } - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PartialMultiplyMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PartialMultiplyMapper.java deleted file mode 100644 index dbc29ca10..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PartialMultiplyMapper.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import java.io.IOException; -import java.util.List; - -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.math.VarIntWritable; -import org.apache.mahout.math.VarLongWritable; -import org.apache.mahout.math.Vector; - -/** - * maps similar items and their preference values per user - */ -public final class PartialMultiplyMapper extends - Mapper { - - @Override - protected void map(VarIntWritable key, - VectorAndPrefsWritable vectorAndPrefsWritable, - Context context) throws IOException, InterruptedException { - - Vector similarityMatrixColumn = vectorAndPrefsWritable.getVector(); - List userIDs = vectorAndPrefsWritable.getUserIDs(); - List prefValues = vectorAndPrefsWritable.getValues(); - - VarLongWritable userIDWritable = new VarLongWritable(); - PrefAndSimilarityColumnWritable prefAndSimilarityColumn = new PrefAndSimilarityColumnWritable(); - - for (int i = 0; i < userIDs.size(); i++) { - long userID = userIDs.get(i); - float prefValue = prefValues.get(i); - if (!Float.isNaN(prefValue)) { - prefAndSimilarityColumn.set(prefValue, similarityMatrixColumn); - userIDWritable.set(userID); - context.write(userIDWritable, prefAndSimilarityColumn); - } - } - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PrefAndSimilarityColumnWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PrefAndSimilarityColumnWritable.java deleted file mode 100644 index 704c74a82..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PrefAndSimilarityColumnWritable.java +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -public final class PrefAndSimilarityColumnWritable implements Writable { - - private float prefValue; - private Vector similarityColumn; - - public PrefAndSimilarityColumnWritable() { - } - - public PrefAndSimilarityColumnWritable(float prefValue, Vector similarityColumn) { - set(prefValue, similarityColumn); - } - - public void set(float prefValue, Vector similarityColumn) { - this.prefValue = prefValue; - this.similarityColumn = similarityColumn; - } - - public float getPrefValue() { - return prefValue; - } - - public Vector getSimilarityColumn() { - return similarityColumn; - } - - @Override - public void readFields(DataInput in) throws IOException { - prefValue = in.readFloat(); - VectorWritable vw = new VectorWritable(); - vw.readFields(in); - similarityColumn = vw.get(); - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeFloat(prefValue); - VectorWritable vw = new VectorWritable(similarityColumn); - vw.setWritesLaxPrecision(true); - vw.write(out); - } - - @Override - public boolean equals(Object obj) { - if (obj instanceof PrefAndSimilarityColumnWritable) { - PrefAndSimilarityColumnWritable other = (PrefAndSimilarityColumnWritable) obj; - return prefValue == other.prefValue && similarityColumn.equals(other.similarityColumn); - } - return false; - } - - @Override - public int hashCode() { - return RandomUtils.hashFloat(prefValue) + 31 * similarityColumn.hashCode(); - } - - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java deleted file mode 100644 index e67a4cb64..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.JobContext; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable; -import org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.math.VarIntWritable; -import org.apache.mahout.math.VarLongWritable; -import org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob; -import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures; - -import java.util.List; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - *

- * Runs a completely distributed recommender job as a series of mapreduces.
- *
- * Preferences in the input file should look like {@code userID, itemID[, preferencevalue]}
- *
- * Preference value is optional to accommodate applications that have no notion of a preference value (that is,
- * the user simply expresses a preference for an item, but no degree of preference).
- *
- * The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are
- * parsed as {@code long}s.
- *
- * Command line arguments specific to this class are:
- *
- *   1. --input (path): Directory containing one or more text files with the preference data
- *   2. --output (path): output path where recommender output should go
- *   3. --similarityClassname (classname): Name of vector similarity class to instantiate, or a predefined similarity
- *      from {@link org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasure}
- *   4. --usersFile (path): only compute recommendations for user IDs contained in this file (optional)
- *   5. --itemsFile (path): only include item IDs from this file in the recommendations (optional)
- *   6. --filterFile (path): file containing comma-separated userID,itemID pairs; used to exclude the item from the
- *      recommendations for that user (optional)
- *   7. --numRecommendations (integer): Number of recommendations to compute per user (default: 10)
- *   8. --booleanData (boolean): Treat input data as having no pref values (default: false)
- *   9. --maxPrefsPerUser (integer): Maximum number of preferences considered per user in the final recommendation
- *      phase (default: 10)
- *  10. --maxSimilaritiesPerItem (integer): Maximum number of similarities considered per item (default: 100)
- *  11. --minPrefsPerUser (integer): ignore users with fewer preferences than this in the similarity computation
- *      (default: 1)
- *  12. --maxPrefsPerUserInItemSimilarity (integer): max number of preferences to consider per user in the item
- *      similarity computation phase; users with more preferences will be sampled down (default: 1000)
- *  13. --threshold (double): discard item pairs with a similarity value below this
- *
- * General command line options are documented in {@link AbstractJob}.
- *
- * Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other
- * arguments.
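The options above can also be supplied programmatically via {@code ToolRunner}, mirroring what this class's own {@code main} does; a hedged sketch, in which the driver class name, paths, and option values are placeholders rather than anything taken from this code:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;

public class RecommenderJobDriver {
  public static void main(String[] args) throws Exception {
    // Same pattern as RecommenderJob.main; input/output paths are placeholders.
    ToolRunner.run(new Configuration(), new RecommenderJob(), new String[] {
        "--input", "/prefs/ratings.csv",              // userID,itemID[,prefValue] lines
        "--output", "/recs",                          // recommendations per user
        "--similarityClassname", "SIMILARITY_COSINE", // assumed predefined measure name
        "--numRecommendations", "10",
        "--booleanData", "false",
    });
  }
}
```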

- */ -public final class RecommenderJob extends AbstractJob { - - public static final String BOOLEAN_DATA = "booleanData"; - - private static final int DEFAULT_MAX_SIMILARITIES_PER_ITEM = 100; - private static final int DEFAULT_MAX_PREFS_PER_USER = 1000; - private static final int DEFAULT_MIN_PREFS_PER_USER = 1; - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption("numRecommendations", "n", "Number of recommendations per user", - String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS)); - addOption("usersFile", null, "File of users to recommend for", null); - addOption("itemsFile", null, "File of items to recommend for", null); - addOption("filterFile", "f", "File containing comma-separated userID,itemID pairs. Used to exclude the item from " - + "the recommendations for that user (optional)", null); - addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString()); - addOption("maxPrefsPerUser", "mxp", - "Maximum number of preferences considered per user in final recommendation phase", - String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED)); - addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this in the similarity computation " - + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER)); - addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ", - String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM)); - addOption("maxPrefsPerUserInItemSimilarity", "mppuiis", "max number of preferences to consider per user in the " - + "item similarity computation phase, users with more preferences will be sampled down (default: " + - DEFAULT_MAX_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MAX_PREFS_PER_USER)); - addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, " - + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')', true); - addOption("threshold", "tr", "discard item pairs with a similarity value below this", false); - - Map> parsedArgs = parseArguments(args); - if (parsedArgs == null) { - return -1; - } - - Path outputPath = getOutputPath(); - int numRecommendations = Integer.parseInt(getOption("numRecommendations")); - String usersFile = getOption("usersFile"); - String itemsFile = getOption("itemsFile"); - String filterFile = getOption("filterFile"); - boolean booleanData = Boolean.valueOf(getOption("booleanData")); - int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser")); - int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser")); - int maxPrefsPerUserInItemSimilarity = Integer.parseInt(getOption("maxPrefsPerUserInItemSimilarity")); - int maxSimilaritiesPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem")); - String similarityClassname = getOption("similarityClassname"); - double threshold = hasOption("threshold") ? 
- Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD; - - - Path prepPath = getTempPath("preparePreferenceMatrix"); - Path similarityMatrixPath = getTempPath("similarityMatrix"); - Path prePartialMultiplyPath1 = getTempPath("prePartialMultiply1"); - Path prePartialMultiplyPath2 = getTempPath("prePartialMultiply2"); - Path explicitFilterPath = getTempPath("explicitFilterPath"); - Path partialMultiplyPath = getTempPath("partialMultiply"); - - AtomicInteger currentPhase = new AtomicInteger(); - - int numberOfUsers = -1; - - if (shouldRunNextPhase(parsedArgs, currentPhase)) { - ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{ - "--input", getInputPath().toString(), - "--output", prepPath.toString(), - "--maxPrefsPerUser", String.valueOf(maxPrefsPerUserInItemSimilarity), - "--minPrefsPerUser", String.valueOf(minPrefsPerUser), - "--booleanData", String.valueOf(booleanData), - "--tempDir", getTempPath().toString()}); - - numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf()); - } - - - if (shouldRunNextPhase(parsedArgs, currentPhase)) { - - /* special behavior if phase 1 is skipped */ - if (numberOfUsers == -1) { - numberOfUsers = (int) HadoopUtil.countRecords(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS), - PathType.LIST, null, getConf()); - } - - /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like - * new DistributedRowMatrix(...).rowSimilarity(...) */ - //calculate the co-occurrence matrix - ToolRunner.run(getConf(), new RowSimilarityJob(), new String[]{ - "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), - "--output", similarityMatrixPath.toString(), - "--numberOfColumns", String.valueOf(numberOfUsers), - "--similarityClassname", similarityClassname, - "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem), - "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE), - "--threshold", String.valueOf(threshold), - "--tempDir", getTempPath().toString()}); - } - - //start the multiplication of the co-occurrence matrix by the user vectors - if (shouldRunNextPhase(parsedArgs, currentPhase)) { - Job prePartialMultiply1 = prepareJob( - similarityMatrixPath, prePartialMultiplyPath1, SequenceFileInputFormat.class, - SimilarityMatrixRowWrapperMapper.class, VarIntWritable.class, VectorOrPrefWritable.class, - Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class, - SequenceFileOutputFormat.class); - boolean succeeded = prePartialMultiply1.waitForCompletion(true); - if (!succeeded) - return -1; - //continue the multiplication - Job prePartialMultiply2 = prepareJob(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS), - prePartialMultiplyPath2, SequenceFileInputFormat.class, UserVectorSplitterMapper.class, VarIntWritable.class, - VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class, - SequenceFileOutputFormat.class); - if (usersFile != null) { - prePartialMultiply2.getConfiguration().set(UserVectorSplitterMapper.USERS_FILE, usersFile); - } - prePartialMultiply2.getConfiguration().setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED, - maxPrefsPerUser); - succeeded = prePartialMultiply2.waitForCompletion(true); - if (!succeeded) - return -1; - //finish the job - Job partialMultiply = prepareJob( - new Path(prePartialMultiplyPath1 + "," + prePartialMultiplyPath2), partialMultiplyPath, - SequenceFileInputFormat.class, Mapper.class, 
VarIntWritable.class, VectorOrPrefWritable.class, - ToVectorAndPrefReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class, - SequenceFileOutputFormat.class); - setS3SafeCombinedInputPath(partialMultiply, getTempPath(), prePartialMultiplyPath1, prePartialMultiplyPath2); - succeeded = partialMultiply.waitForCompletion(true); - if (!succeeded) - return -1; - } - - if (shouldRunNextPhase(parsedArgs, currentPhase)) { - //filter out any users we don't care about - /* convert the user/item pairs to filter if a filterfile has been specified */ - if (filterFile != null) { - Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class, - ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class, - ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class, - SequenceFileOutputFormat.class); - boolean succeeded = itemFiltering.waitForCompletion(true); - if (!succeeded) - return -1; - } - - String aggregateAndRecommendInput = partialMultiplyPath.toString(); - if (filterFile != null) { - aggregateAndRecommendInput += "," + explicitFilterPath; - } - //extract out the recommendations - Job aggregateAndRecommend = prepareJob( - new Path(aggregateAndRecommendInput), outputPath, SequenceFileInputFormat.class, - PartialMultiplyMapper.class, VarLongWritable.class, PrefAndSimilarityColumnWritable.class, - AggregateAndRecommendReducer.class, VarLongWritable.class, RecommendedItemsWritable.class, - TextOutputFormat.class); - Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration(); - if (itemsFile != null) { - aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile); - } - - if (filterFile != null) { - setS3SafeCombinedInputPath(aggregateAndRecommend, getTempPath(), partialMultiplyPath, explicitFilterPath); - } - setIOSort(aggregateAndRecommend); - aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH, - new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString()); - aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations); - aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData); - boolean succeeded = aggregateAndRecommend.waitForCompletion(true); - if (!succeeded) - return -1; - } - - return 0; - } - - private static void setIOSort(JobContext job) { - Configuration conf = job.getConfiguration(); - conf.setInt("io.sort.factor", 100); - String javaOpts = conf.get("mapred.map.child.java.opts"); // new arg name - if (javaOpts == null) { - javaOpts = conf.get("mapred.child.java.opts"); // old arg name - } - int assumedHeapSize = 512; - if (javaOpts != null) { - Matcher m = Pattern.compile("-Xmx([0-9]+)([mMgG])").matcher(javaOpts); - if (m.find()) { - assumedHeapSize = Integer.parseInt(m.group(1)); - String megabyteOrGigabyte = m.group(2); - if ("g".equalsIgnoreCase(megabyteOrGigabyte)) { - assumedHeapSize *= 1024; - } - } - } - // Cap this at 1024MB now; see https://issues.apache.org/jira/browse/MAPREDUCE-2308 - conf.setInt("io.sort.mb", Math.min(assumedHeapSize / 2, 1024)); - // For some reason the Merger doesn't report status for a long time; increase - // timeout when running these jobs - conf.setInt("mapred.task.timeout", 60 * 60 * 1000); - } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new RecommenderJob(), args); - } -} diff --git 
a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/SimilarityMatrixRowWrapperMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/SimilarityMatrixRowWrapperMapper.java deleted file mode 100644 index 4cdf21e64..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/SimilarityMatrixRowWrapperMapper.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import java.io.IOException; - -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.math.VarIntWritable; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -/** - * maps a row of the similarity matrix to a {@link VectorOrPrefWritable} - * - * actually a column from that matrix has to be used but as the similarity matrix is symmetric, - * we can use a row instead of having to transpose it - */ -public final class SimilarityMatrixRowWrapperMapper extends - Mapper { - - @Override - protected void map(IntWritable key, - VectorWritable value, - Context context) throws IOException, InterruptedException { - Vector similarityMatrixRow = value.get(); - /* remove self similarity */ - similarityMatrixRow.set(key.get(), Double.NaN); - context.write(new VarIntWritable(key.get()), new VectorOrPrefWritable(similarityMatrixRow)); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducer.java deleted file mode 100644 index d01da3f6e..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducer.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import java.io.IOException; - -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable; -import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.VarLongWritable; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -/** - *

- * Input
- *
- * Takes user IDs as {@link VarLongWritable} mapped to all associated item IDs and preference values, as
- * {@link EntityPrefWritable}s.
- *
- * Output
- *
- * The same user ID mapped to a {@link RandomAccessSparseVector} representation of the same item IDs and
- * preference values. Item IDs are used as vector indexes; they are hashed into ints to work as indexes with
- * {@link TasteHadoopUtils#idToIndex(long)}. The mapping is remembered for later with a combination of
- * {@link ItemIDIndexMapper} and {@link ItemIDIndexReducer}.
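The ID-to-index hashing mentioned above can be pictured with a small sketch; the exact folding-and-masking logic here is an assumption modeled on the usual long-hash idiom, not quoted from TasteHadoopUtils:

```java
// Assumed shape of the hashing trick described above.
final class IdToIndexSketch {
  // Fold a 64-bit item ID into a non-negative int usable as a sparse-vector index.
  static int idToIndex(long itemID) {
    int hash = (int) (itemID ^ (itemID >>> 32)); // standard hash of a long
    return 0x7FFFFFFF & hash;                    // clear the sign bit: index >= 0
  }
}
```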

- */ -public final class ToUserVectorsReducer extends - Reducer { - - public static final String MIN_PREFERENCES_PER_USER = ToUserVectorsReducer.class.getName() - + ".minPreferencesPerUser"; - - private int minPreferences; - - public enum Counters { USERS } - - @Override - protected void setup(Context ctx) throws IOException, InterruptedException { - super.setup(ctx); - minPreferences = ctx.getConfiguration().getInt(MIN_PREFERENCES_PER_USER, 1); - } - - @Override - protected void reduce(VarLongWritable userID, - Iterable itemPrefs, - Context context) throws IOException, InterruptedException { - Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100); - for (VarLongWritable itemPref : itemPrefs) { - int index = TasteHadoopUtils.idToIndex(itemPref.get()); - float value = itemPref instanceof EntityPrefWritable ? ((EntityPrefWritable) itemPref).getPrefValue() : 1.0f; - userVector.set(index, value); - } - - if (userVector.getNumNondefaultElements() >= minPreferences) { - VectorWritable vw = new VectorWritable(userVector); - vw.setWritesLaxPrecision(true); - context.getCounter(Counters.USERS).increment(1); - context.write(userID, vw); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToVectorAndPrefReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToVectorAndPrefReducer.java deleted file mode 100644 index 2f91f4c44..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToVectorAndPrefReducer.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import java.io.IOException; -import java.util.List; - -import com.google.common.collect.Lists; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.math.VarIntWritable; -import org.apache.mahout.math.Vector; - -public final class ToVectorAndPrefReducer extends - Reducer { - - @Override - protected void reduce(VarIntWritable key, - Iterable values, - Context context) throws IOException, InterruptedException { - - List userIDs = Lists.newArrayList(); - List prefValues = Lists.newArrayList(); - Vector similarityMatrixColumn = null; - for (VectorOrPrefWritable value : values) { - if (value.getVector() == null) { - // Then this is a user-pref value - userIDs.add(value.getUserID()); - prefValues.add(value.getValue()); - } else { - // Then this is the column vector - if (similarityMatrixColumn != null) { - throw new IllegalStateException("Found two similarity-matrix columns for item index " + key.get()); - } - similarityMatrixColumn = value.getVector(); - } - } - - if (similarityMatrixColumn == null) { - return; - } - - VectorAndPrefsWritable vectorAndPrefs = new VectorAndPrefsWritable(similarityMatrixColumn, userIDs, prefValues); - context.write(key, vectorAndPrefs); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/UserVectorSplitterMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/UserVectorSplitterMapper.java deleted file mode 100644 index faf8846bb..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/UserVectorSplitterMapper.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import com.google.common.io.Closeables; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.cf.taste.common.TopK; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.common.iterator.FileLineIterable; -import org.apache.mahout.math.VarIntWritable; -import org.apache.mahout.math.VarLongWritable; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -import java.io.IOException; -import java.util.Comparator; -import java.util.Iterator; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class UserVectorSplitterMapper extends - Mapper { - - private static final Logger log = LoggerFactory.getLogger(UserVectorSplitterMapper.class); - - static final String USERS_FILE = "usersFile"; - static final String MAX_PREFS_PER_USER_CONSIDERED = "maxPrefsPerUserConsidered"; - static final int DEFAULT_MAX_PREFS_PER_USER_CONSIDERED = 10; - - private int maxPrefsPerUserConsidered; - private FastIDSet usersToRecommendFor; - - @Override - protected void setup(Context context) throws IOException { - Configuration jobConf = context.getConfiguration(); - maxPrefsPerUserConsidered = jobConf.getInt(MAX_PREFS_PER_USER_CONSIDERED, DEFAULT_MAX_PREFS_PER_USER_CONSIDERED); - String usersFilePathString = jobConf.get(USERS_FILE); - if (usersFilePathString != null) { - FSDataInputStream in = null; - try { - Path unqualifiedUsersFilePath = new Path(usersFilePathString); - FileSystem fs = FileSystem.get(unqualifiedUsersFilePath.toUri(), jobConf); - usersToRecommendFor = new FastIDSet(); - Path usersFilePath = unqualifiedUsersFilePath.makeQualified(fs); - in = fs.open(usersFilePath); - for (String line : new FileLineIterable(in)) { - try { - usersToRecommendFor.add(Long.parseLong(line)); - } catch (NumberFormatException nfe) { - log.warn("usersFile line ignored: {}", line); - } - } - } finally { - Closeables.closeQuietly(in); - } - } - } - - @Override - protected void map(VarLongWritable key, - VectorWritable value, - Context context) throws IOException, InterruptedException { - long userID = key.get(); - if (usersToRecommendFor != null && !usersToRecommendFor.contains(userID)) { - return; - } - Vector userVector = maybePruneUserVector(value.get()); - Iterator it = userVector.iterateNonZero(); - VarIntWritable itemIndexWritable = new VarIntWritable(); - VectorOrPrefWritable vectorOrPref = new VectorOrPrefWritable(); - while (it.hasNext()) { - Vector.Element e = it.next(); - itemIndexWritable.set(e.index()); - vectorOrPref.set(userID, (float) e.get()); - context.write(itemIndexWritable, vectorOrPref); - } - } - - private Vector maybePruneUserVector(Vector userVector) { - if (userVector.getNumNondefaultElements() <= maxPrefsPerUserConsidered) { - return userVector; - } - - float smallestLargeValue = findSmallestLargeValue(userVector); - - // "Blank out" small-sized prefs to reduce the amount of partial products - // generated later. They're not zeroed, but NaN-ed, so they come through - // and can be used to exclude these items from prefs. 
- Iterator it = userVector.iterateNonZero(); - while (it.hasNext()) { - Vector.Element e = it.next(); - float absValue = Math.abs((float) e.get()); - if (absValue < smallestLargeValue) { - e.set(Float.NaN); - } - } - - return userVector; - } - - private float findSmallestLargeValue(Vector userVector) { - - TopK topPrefValues = new TopK(maxPrefsPerUserConsidered, new Comparator() { - @Override - public int compare(Float one, Float two) { - return one.compareTo(two); - } - }); - - Iterator it = userVector.iterateNonZero(); - while (it.hasNext()) { - float absValue = Math.abs((float) it.next().get()); - topPrefValues.offer(absValue); - } - return topPrefValues.smallestGreat(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorAndPrefsWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorAndPrefsWritable.java deleted file mode 100644 index 29bb4169a..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorAndPrefsWritable.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.List; - -import com.google.common.collect.Lists; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.math.Varint; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -public final class VectorAndPrefsWritable implements Writable { - - private Vector vector; - private List userIDs; - private List values; - - public VectorAndPrefsWritable() { - } - - public VectorAndPrefsWritable(Vector vector, List userIDs, List values) { - this.vector = vector; - this.userIDs = userIDs; - this.values = values; - } - - public Vector getVector() { - return vector; - } - - public List getUserIDs() { - return userIDs; - } - - public List getValues() { - return values; - } - - @Override - public void write(DataOutput out) throws IOException { - VectorWritable vw = new VectorWritable(vector); - vw.setWritesLaxPrecision(true); - vw.write(out); - Varint.writeUnsignedVarInt(userIDs.size(), out); - for (int i = 0; i < userIDs.size(); i++) { - Varint.writeSignedVarLong(userIDs.get(i), out); - out.writeFloat(values.get(i)); - } - } - - @Override - public void readFields(DataInput in) throws IOException { - VectorWritable writable = new VectorWritable(); - writable.readFields(in); - vector = writable.get(); - int size = Varint.readUnsignedVarInt(in); - userIDs = Lists.newArrayListWithCapacity(size); - values = Lists.newArrayListWithCapacity(size); - for (int i = 0; i < size; i++) { - userIDs.add(Varint.readSignedVarLong(in)); - values.add(in.readFloat()); - } - } - - @Override - public String toString() { - return vector + "\t" + userIDs + '\t' + values; - } -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorOrPrefWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorOrPrefWritable.java deleted file mode 100644 index 9b95f67b3..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorOrPrefWritable.java +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.item; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.math.Varint; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -public final class VectorOrPrefWritable implements Writable { - - private Vector vector; - private long userID; - private float value; - - public VectorOrPrefWritable() { - } - - public VectorOrPrefWritable(Vector vector) { - this.vector = vector; - } - - public VectorOrPrefWritable(long userID, float value) { - this.userID = userID; - this.value = value; - } - - public Vector getVector() { - return vector; - } - - public long getUserID() { - return userID; - } - - public float getValue() { - return value; - } - - public void set(Vector vector) { - this.vector = vector; - this.userID = Long.MIN_VALUE; - this.value = Float.NaN; - } - - public void set(long userID, float value) { - this.vector = null; - this.userID = userID; - this.value = value; - } - - @Override - public void write(DataOutput out) throws IOException { - if (vector == null) { - out.writeBoolean(false); - Varint.writeSignedVarLong(userID, out); - out.writeFloat(value); - } else { - out.writeBoolean(true); - VectorWritable vw = new VectorWritable(vector); - vw.setWritesLaxPrecision(true); - vw.write(out); - } - } - - @Override - public void readFields(DataInput in) throws IOException { - boolean hasVector = in.readBoolean(); - if (hasVector) { - VectorWritable writable = new VectorWritable(); - writable.readFields(in); - set(writable.get()); - } else { - long theUserID = Varint.readSignedVarLong(in); - float theValue = in.readFloat(); - set(theUserID, theValue); - } - } - - @Override - public String toString() { - return vector == null ? userID + ":" + value : vector.toString(); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java deleted file mode 100644 index 2c6da306c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.preparation; - -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable; -import org.apache.mahout.cf.taste.hadoop.ToEntityPrefsMapper; -import org.apache.mahout.cf.taste.hadoop.ToItemPrefsMapper; -import org.apache.mahout.cf.taste.hadoop.item.ItemIDIndexMapper; -import org.apache.mahout.cf.taste.hadoop.item.ItemIDIndexReducer; -import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob; -import org.apache.mahout.cf.taste.hadoop.item.ToUserVectorsReducer; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.math.VarIntWritable; -import org.apache.mahout.math.VarLongWritable; -import org.apache.mahout.math.VectorWritable; - -import java.util.List; -import java.util.Map; - -public class PreparePreferenceMatrixJob extends AbstractJob { - - public static final String NUM_USERS = "numUsers.bin"; - public static final String ITEMID_INDEX = "itemIDIndex"; - public static final String USER_VECTORS = "userVectors"; - public static final String RATING_MATRIX = "ratingMatrix"; - - private static final int DEFAULT_MIN_PREFS_PER_USER = 1; - - public static void main(String[] args) throws Exception { - ToolRunner.run(new PreparePreferenceMatrixJob(), args); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption("maxPrefsPerUser", "mppu", "max number of preferences to consider per user, " - + "users with more preferences will be sampled down"); - addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this " - + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER)); - addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString()); - addOption("ratingShift", "rs", "shift ratings by this value", "0.0"); - - Map> parsedArgs = parseArguments(args); - if (parsedArgs == null) { - return -1; - } - - int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser")); - boolean booleanData = Boolean.valueOf(getOption("booleanData")); - float ratingShift = Float.parseFloat(getOption("ratingShift")); - //convert items to an internal index - Job itemIDIndex = prepareJob(getInputPath(), getOutputPath(ITEMID_INDEX), TextInputFormat.class, - ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, - VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class); - itemIDIndex.setCombinerClass(ItemIDIndexReducer.class); - boolean succeeded = itemIDIndex.waitForCompletion(true); - if (!succeeded) { - return -1; - } - //convert user preferences into a vector per user - Job toUserVectors = prepareJob(getInputPath(), getOutputPath(USER_VECTORS), TextInputFormat.class, - ToItemPrefsMapper.class, VarLongWritable.class, booleanData ? 
VarLongWritable.class : EntityPrefWritable.class, - ToUserVectorsReducer.class, VarLongWritable.class, VectorWritable.class, SequenceFileOutputFormat.class); - toUserVectors.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData); - toUserVectors.getConfiguration().setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser); - toUserVectors.getConfiguration().set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift)); - succeeded = toUserVectors.waitForCompletion(true); - if (!succeeded) { - return -1; - } - //we need the number of users later - int numberOfUsers = (int) toUserVectors.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS).getValue(); - HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf()); - //build the rating matrix - Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX), - ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class, - IntWritable.class, VectorWritable.class); - toItemVectors.setCombinerClass(ToItemVectorsReducer.class); - - /* configure sampling regarding the uservectors */ - if (hasOption("maxPrefsPerUser")) { - int samplingSize = Integer.parseInt(getOption("maxPrefsPerUser")); - toItemVectors.getConfiguration().setInt(ToItemVectorsMapper.SAMPLE_SIZE, samplingSize); - } - - succeeded = toItemVectors.waitForCompletion(true); - if (!succeeded) { - return -1; - } - - return 0; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsMapper.java deleted file mode 100644 index ba85b2544..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsMapper.java +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.preparation; - -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.VarLongWritable; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.hadoop.similarity.cooccurrence.Vectors; - -import java.io.IOException; -import java.util.Iterator; - -public class ToItemVectorsMapper - extends Mapper { - - public static final String SAMPLE_SIZE = ToItemVectorsMapper.class + ".sampleSize"; - - enum Elements { - USER_RATINGS_USED, USER_RATINGS_NEGLECTED - } - - private int sampleSize; - - @Override - protected void setup(Context ctx) throws IOException, InterruptedException { - sampleSize = ctx.getConfiguration().getInt(SAMPLE_SIZE, Integer.MAX_VALUE); - } - - @Override - protected void map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx) - throws IOException, InterruptedException { - Vector userRatings = vectorWritable.get(); - - int numElementsBeforeSampling = userRatings.getNumNondefaultElements(); - userRatings = Vectors.maybeSample(userRatings, sampleSize); - int numElementsAfterSampling = userRatings.getNumNondefaultElements(); - - int column = TasteHadoopUtils.idToIndex(rowIndex.get()); - VectorWritable itemVector = new VectorWritable(new RandomAccessSparseVector(Integer.MAX_VALUE, 1)); - itemVector.setWritesLaxPrecision(true); - - Iterator iterator = userRatings.iterateNonZero(); - while (iterator.hasNext()) { - Vector.Element elem = iterator.next(); - itemVector.get().setQuick(column, elem.get()); - ctx.write(new IntWritable(elem.index()), itemVector); - } - - ctx.getCounter(Elements.USER_RATINGS_USED).increment(numElementsAfterSampling); - ctx.getCounter(Elements.USER_RATINGS_NEGLECTED).increment(numElementsBeforeSampling - numElementsAfterSampling); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsReducer.java deleted file mode 100644 index 207a799f4..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsReducer.java +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.preparation; - -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.math.VectorWritable; - -import java.io.IOException; - -public class ToItemVectorsReducer extends Reducer { - - @Override - protected void reduce(IntWritable row, Iterable vectors, Context ctx) - throws IOException, InterruptedException { - VectorWritable vectorWritable = VectorWritable.merge(vectors.iterator()); - vectorWritable.setWritesLaxPrecision(true); - ctx.write(row, vectorWritable); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java deleted file mode 100644 index 02d1ba671..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.pseudo; - -import java.io.IOException; -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable; -import org.apache.mahout.math.VarLongWritable; - -/** - *

- * <p>
- * This job runs a "pseudo-distributed" recommendation process on Hadoop. It merely runs many
- * {@link org.apache.mahout.cf.taste.recommender.Recommender} instances on Hadoop,
- * where each instance is a normal non-distributed implementation.
- * </p>
- *
- * <p>This class configures and runs a {@link RecommenderReducer} using Hadoop.</p>
- *
- * <p>Command line arguments specific to this class are:</p>
- *
- * <ol>
- * <li>-Dmapred.input.dir=(path): Location of a data model file containing preference data, suitable for use
- * with {@link org.apache.mahout.cf.taste.impl.model.file.FileDataModel}</li>
- * <li>-Dmapred.output.dir=(path): output path where recommender output should go</li>
- * <li>--recommenderClassName (string): Fully-qualified class name of
- * {@link org.apache.mahout.cf.taste.recommender.Recommender} to use to make recommendations.
- * Note that it must have a constructor which takes a {@link org.apache.mahout.cf.taste.model.DataModel}
- * argument.</li>
- * <li>--numRecommendations (integer): Number of recommendations to compute per user</li>
- * <li>--usersFile (path): file containing user IDs to recommend for (optional)</li>
- * </ol>
- *
- * <p>General command line options are documented in {@link AbstractJob}.</p>
- *
- * <p>
- * Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other
- * arguments.
- * </p>
- *
- * <p>
- * For example, to get started trying this out, set up Hadoop in a pseudo-distributed manner:
- * http://hadoop.apache.org/common/docs/current/quickstart.html You can stop at the point where it instructs
- * you to copy files into HDFS.
- * </p>
- *
- * <p>
- * Assume your preference data file is {@code input.csv}. You will also need to create a file containing
- * all user IDs to write recommendations for, as something like {@code users.txt}. Place this input on
- * HDFS like so:
- * </p>
- *
- * <p>{@code hadoop fs -put input.csv input/input.csv; hadoop fs -put users.txt input/users.txt}</p>
- *
- * <p>
- * Build Mahout code with {@code mvn package} in the core/ directory. Locate
- * {@code target/mahout-core-X.Y-SNAPSHOT.job}. This is a JAR file; copy it out to a convenient location
- * and name it {@code recommender.jar}.
- * </p>
- *
- * <p>
- * Now add your own custom recommender code and dependencies. Your IDE produced compiled .class files
- * somewhere and they need to be packaged up as well:
- * </p>
- *
- * <p>{@code jar uf recommender.jar -C (your classes directory) .}</p>

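- *
- * <p>
- * For illustration only: the custom recommender needs nothing more than the mandatory single-argument
- * {@link org.apache.mahout.cf.taste.model.DataModel} constructor. A minimal sketch (assuming Mahout's
- * non-distributed {@code GenericItemBasedRecommender} and {@code PearsonCorrelationSimilarity}; the class
- * and package names are the hypothetical ones reused in the launch example below) might be:
- * </p>
- *
- * <pre>
- * package your.project;
- *
- * public final class Recommender extends GenericItemBasedRecommender {
- *   // Instantiated reflectively by RecommenderReducer, hence the required DataModel constructor
- *   public Recommender(DataModel dataModel) throws TasteException {
- *     super(dataModel, new PearsonCorrelationSimilarity(dataModel));
- *   }
- * }
- * </pre>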
- *
- * <p>And launch:</p>

- * - * {@code hadoop jar recommender.jar \ - * org.apache.mahout.cf.taste.hadoop.pseudo.RecommenderJob \ - * -Dmapred.input.dir=input/users.csv \ - * -Dmapred.output.dir=output \ - * --recommenderClassName your.project.Recommender \ - * --numRecommendations 10 - * } - */ -public final class RecommenderJob extends AbstractJob { - - @Override - public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException { - - addInputOption(); - addOutputOption(); - addOption("recommenderClassName", "r", "Name of recommender class to instantiate"); - addOption("numRecommendations", "n", "Number of recommendations per user", "10"); - addOption("usersFile", "u", "File of users to recommend for", null); - - Map<String,List<String>> parsedArgs = parseArguments(args); - if (parsedArgs == null) { - return -1; - } - - Path inputFile = getInputPath(); - Path outputPath = getOutputPath(); - Path usersFile = hasOption("usersFile") ? new Path(getOption("usersFile")) : inputFile; - - String recommendClassName = getOption("recommenderClassName"); - int recommendationsPerUser = Integer.parseInt(getOption("numRecommendations")); - - Job job = prepareJob(usersFile, - outputPath, - TextInputFormat.class, - UserIDsMapper.class, - VarLongWritable.class, - NullWritable.class, - RecommenderReducer.class, - VarLongWritable.class, - RecommendedItemsWritable.class, - TextOutputFormat.class); - FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); - Configuration jobConf = job.getConfiguration(); - jobConf.set(RecommenderReducer.RECOMMENDER_CLASS_NAME, recommendClassName); - jobConf.setInt(RecommenderReducer.RECOMMENDATIONS_PER_USER, recommendationsPerUser); - jobConf.set(RecommenderReducer.DATA_MODEL_FILE, inputFile.toString()); - - boolean succeeded = job.waitForCompletion(true); - return succeeded ? 0 : -1; - } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new RecommenderJob(), args); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderReducer.java deleted file mode 100644 index d2b3ac5f9..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderReducer.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.pseudo; - -import java.io.File; -import java.io.IOException; -import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; -import java.util.Iterator; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable; -import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.cf.taste.recommender.Recommender; -import org.apache.mahout.math.VarLongWritable; - -/** - *

- * <p>
- * The {@link Reducer} which takes as input the user IDs parsed out by the map phase, and for each unique
- * user ID, computes recommendations with the configured {@link Recommender}. The results are output as
- * {@link RecommendedItemsWritable}.
- * </p>

- * - * @see RecommenderJob - */ -public final class RecommenderReducer extends - Reducer { - - static final String RECOMMENDER_CLASS_NAME = "recommenderClassName"; - static final String RECOMMENDATIONS_PER_USER = "recommendationsPerUser"; - static final String DATA_MODEL_FILE = "dataModelFile"; - - private Recommender recommender; - private int recommendationsPerUser; - - @Override - protected void setup(Context context) throws IOException { - Configuration jobConf = context.getConfiguration(); - String dataModelFile = jobConf.get(DATA_MODEL_FILE); - String recommenderClassName = jobConf.get(RECOMMENDER_CLASS_NAME); - Path dataModelPath = new Path(dataModelFile); - FileSystem fs = FileSystem.get(dataModelPath.toUri(), jobConf); - File tempDataFile = File.createTempFile("mahout-taste-hadoop", "txt"); - tempDataFile.deleteOnExit(); - fs.copyToLocalFile(dataModelPath, new Path(tempDataFile.getAbsolutePath())); - FileDataModel fileDataModel = new FileDataModel(tempDataFile); - - try { - Class recommenderClass = Class.forName(recommenderClassName).asSubclass( - Recommender.class); - Constructor constructor = recommenderClass.getConstructor(DataModel.class); - recommender = constructor.newInstance(fileDataModel); - } catch (NoSuchMethodException nsme) { - throw new IllegalStateException(nsme); - } catch (ClassNotFoundException cnfe) { - throw new IllegalStateException(cnfe); - } catch (InstantiationException ie) { - throw new IllegalStateException(ie); - } catch (IllegalAccessException iae) { - throw new IllegalStateException(iae); - } catch (InvocationTargetException ite) { - throw new IllegalStateException(ite.getCause()); - } - recommendationsPerUser = jobConf.getInt(RECOMMENDATIONS_PER_USER, 10); - } - - @Override - protected void reduce(VarLongWritable key, - Iterable values, - Context context) throws IOException, InterruptedException { - long userID = key.get(); - List recommendedItems; - try { - recommendedItems = recommender.recommend(userID, recommendationsPerUser); - } catch (TasteException te) { - throw new IllegalStateException(te); - } - Iterator it = recommendedItems.iterator(); - while (it.hasNext()) { - if (Float.isNaN(it.next().getValue())) { - it.remove(); - } - } - RecommendedItemsWritable writable = new RecommendedItemsWritable(recommendedItems); - context.write(key, writable); - context.getCounter(ReducerMetrics.USERS_PROCESSED).increment(1L); - context.getCounter(ReducerMetrics.RECOMMENDATIONS_MADE).increment(recommendedItems.size()); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/ReducerMetrics.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/ReducerMetrics.java deleted file mode 100644 index 22d903e2c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/ReducerMetrics.java +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.pseudo; - -/** Custom metrics collected by {@link RecommenderReducer}. */ -public enum ReducerMetrics { - - /** Number of unique users for which recommendations were produced */ - USERS_PROCESSED, - /** Number of items recommended to those users */ - RECOMMENDATIONS_MADE - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/UserIDsMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/UserIDsMapper.java deleted file mode 100644 index 41188f331..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/UserIDsMapper.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.pseudo; - -import java.io.IOException; -import java.util.regex.Pattern; - -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.math.VarLongWritable; - -/** - * Extracts and emits all user IDs from the users file, or input file. 
- */ -public final class UserIDsMapper extends - Mapper { - - private static final Pattern DELIMITER = Pattern.compile("[\t,]"); - - @Override - protected void map(LongWritable key, - Text value, - Context context) throws IOException, InterruptedException { - String[] tokens = DELIMITER.split(value.toString()); - long userID = Long.parseLong(tokens[0]); - context.write(new VarLongWritable(userID), NullWritable.get()); - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java deleted file mode 100644 index 76fad5fcb..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java +++ /dev/null @@ -1,224 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.similarity.item; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; - -import com.google.common.base.Preconditions; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; - -import org.apache.hadoop.io.DoubleWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.cf.taste.common.TopK; -import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable; -import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils; -import org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob; -import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures; -import org.apache.mahout.math.map.OpenIntLongHashMap; - -/** - *

- * <p>Distributed precomputation of the item-item similarities for item-based collaborative filtering.</p>
- *
- * <p>Preferences in the input file should look like {@code userID,itemID[,preferencevalue]}; a short
- * sample appears below.</p>
- *
- * <p>
- * Preference value is optional to accommodate applications that have no notion of a preference value (that
- * is, the user simply expresses a preference for an item, but no degree of preference).
- * </p>
- *
- * <p>
- * The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are
- * parsed as {@code long}s.
- * </p>

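- *
- * <p>
- * For illustration, a hypothetical three-line input file (all IDs and values made up) in which user 1
- * rates items 100 and 101, and user 2 rates item 100, would contain:
- * </p>
- *
- * <pre>
- * 1,100,4.5
- * 1,101,3.0
- * 2,100,2.0
- * </pre>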
- * <p>Command line arguments specific to this class are:</p>
- *
- * <ol>
- * <li>-Dmapred.input.dir=(path): Directory containing one or more text files with the preference data</li>
- * <li>-Dmapred.output.dir=(path): output path where similarity data should be written</li>
- * <li>--similarityClassname (classname): Name of distributed similarity measure class to instantiate, or a
- * predefined similarity from
- * {@link org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasure}</li>
- * <li>--maxSimilaritiesPerItem (integer): Maximum number of similarities considered per item (100)</li>
- * <li>--maxCooccurrencesPerItem (integer): Maximum number of cooccurrences considered per item (100)</li>
- * <li>--booleanData (boolean): Treat input data as having no pref values (false)</li>
- * </ol>
- *
- * <p>General command line options are documented in {@link AbstractJob}.</p>
- *
- * <p>
- * Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other
- * arguments.
- * </p>

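- *
- * <p>
- * An illustrative invocation (the jar name and paths are hypothetical; {@code SIMILARITY_COSINE} is one of
- * the predefined measure names) might be:
- * </p>
- *
- * <pre>
- * hadoop jar mahout-core-0.7-job.jar org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob \
- *   -Dmapred.input.dir=input/prefs.csv \
- *   -Dmapred.output.dir=output \
- *   --similarityClassname SIMILARITY_COSINE \
- *   --maxSimilaritiesPerItem 50
- * </pre>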
- */ -public final class ItemSimilarityJob extends AbstractJob { - - static final String ITEM_ID_INDEX_PATH_STR = ItemSimilarityJob.class.getName() + ".itemIDIndexPathStr"; - static final String MAX_SIMILARITIES_PER_ITEM = ItemSimilarityJob.class.getName() + ".maxSimilarItemsPerItem"; - - private static final int DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM = 100; - private static final int DEFAULT_MAX_PREFS_PER_USER = 1000; - private static final int DEFAULT_MIN_PREFS_PER_USER = 1; - - public static void main(String[] args) throws Exception { - ToolRunner.run(new ItemSimilarityJob(), args); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, " - + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')'); - addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per item to this number " - + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')', - String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM)); - addOption("maxPrefsPerUser", "mppu", "max number of preferences to consider per user, " - + "users with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS_PER_USER + ')', - String.valueOf(DEFAULT_MAX_PREFS_PER_USER)); - addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this " - + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER)); - addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE)); - addOption("threshold", "tr", "discard item pairs with a similarity value below this", false); - - Map> parsedArgs = parseArguments(args); - if (parsedArgs == null) { - return -1; - } - - String similarityClassName = getOption("similarityClassname"); - int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem")); - int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser")); - int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser")); - boolean booleanData = Boolean.valueOf(getOption("booleanData")); - - double threshold = hasOption("threshold") ? 
- Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD; - - Path similarityMatrixPath = getTempPath("similarityMatrix"); - Path prepPath = getTempPath("prepareRatingMatrix"); - - AtomicInteger currentPhase = new AtomicInteger(); - - if (shouldRunNextPhase(parsedArgs, currentPhase)) { - ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{ - "--input", getInputPath().toString(), - "--output", prepPath.toString(), - "--maxPrefsPerUser", String.valueOf(maxPrefsPerUser), - "--minPrefsPerUser", String.valueOf(minPrefsPerUser), - "--booleanData", String.valueOf(booleanData), - "--tempDir", getTempPath().toString() }); - } - - if (shouldRunNextPhase(parsedArgs, currentPhase)) { - int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), - getConf()); - - ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] { - "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), - "--output", similarityMatrixPath.toString(), - "--numberOfColumns", String.valueOf(numberOfUsers), - "--similarityClassname", similarityClassName, - "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem), - "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE), - "--threshold", String.valueOf(threshold), - "--tempDir", getTempPath().toString() }); - } - - if (shouldRunNextPhase(parsedArgs, currentPhase)) { - Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class, - MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class, - MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class); - Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration(); - mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, - new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString()); - mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem); - boolean succeeded = mostSimilarItems.waitForCompletion(true); - if (!succeeded) { - return -1; - } - } - - return 0; - } - - public static class MostSimilarItemPairsMapper - extends Mapper { - - private OpenIntLongHashMap indexItemIDMap; - private int maxSimilarItemsPerItem; - - @Override - protected void setup(Context ctx) { - Configuration conf = ctx.getConfiguration(); - maxSimilarItemsPerItem = conf.getInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, -1); - indexItemIDMap = TasteHadoopUtils.readItemIDIndexMap(conf.get(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR), conf); - - Preconditions.checkArgument(maxSimilarItemsPerItem > 0, "maxSimilarItemsPerItem was not correctly set!"); - } - - @Override - protected void map(IntWritable itemIDIndexWritable, VectorWritable similarityVector, Context ctx) - throws IOException, InterruptedException { - - int itemIDIndex = itemIDIndexWritable.get(); - - TopK topKMostSimilarItems = - new TopK(maxSimilarItemsPerItem, SimilarItem.COMPARE_BY_SIMILARITY); - - Iterator similarityVectorIterator = similarityVector.get().iterateNonZero(); - - while (similarityVectorIterator.hasNext()) { - Vector.Element element = similarityVectorIterator.next(); - topKMostSimilarItems.offer(new SimilarItem(indexItemIDMap.get(element.index()), element.get())); - } - - long itemID = indexItemIDMap.get(itemIDIndex); - for (SimilarItem similarItem : topKMostSimilarItems.retrieve()) { - long otherItemID = similarItem.getItemID(); - if (itemID < otherItemID) { - ctx.write(new EntityEntityWritable(itemID, otherItemID), 
new DoubleWritable(similarItem.getSimilarity())); - } else { - ctx.write(new EntityEntityWritable(otherItemID, itemID), new DoubleWritable(similarItem.getSimilarity())); - } - } - } - } - - static class MostSimilarItemPairsReducer - extends Reducer { - @Override - protected void reduce(EntityEntityWritable pair, Iterable values, Context ctx) - throws IOException, InterruptedException { - ctx.write(pair, values.iterator().next()); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItem.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItem.java deleted file mode 100644 index 859b3fb2b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItem.java +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.similarity.item; - -import java.io.Serializable; -import java.util.Comparator; - -class SimilarItem { - - static final Comparator COMPARE_BY_SIMILARITY = new BySimilaritySimilarItemComparator(); - - private final long itemID; - private final double similarity; - - SimilarItem(long itemID, double similarity) { - this.itemID = itemID; - this.similarity = similarity; - } - - public long getItemID() { - return itemID; - } - - public double getSimilarity() { - return similarity; - } - - static class BySimilaritySimilarItemComparator implements Comparator, Serializable { - @Override - public int compare(SimilarItem s1, SimilarItem s2) { - return s1.similarity == s2.similarity ? 0 : s1.similarity < s2.similarity ? -1 : 1; - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/ByItemIDComparator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/ByItemIDComparator.java deleted file mode 100644 index 9b116318d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/ByItemIDComparator.java +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.slopeone; - -import java.io.Serializable; -import java.util.Comparator; - -import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable; - -final class ByItemIDComparator implements Comparator, Serializable { - - private static final Comparator INSTANCE = new ByItemIDComparator(); - - public static Comparator getInstance() { - return INSTANCE; - } - - @Override - public int compare(EntityPrefWritable a, EntityPrefWritable b) { - long idA = a.getID(); - long idB = b.getID(); - return idA < idB ? -1 : idA > idB ? 1 : 0; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/FullRunningAverageAndStdDevWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/FullRunningAverageAndStdDevWritable.java deleted file mode 100644 index d71e507e3..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/FullRunningAverageAndStdDevWritable.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.slopeone; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev; -import org.apache.mahout.math.Varint; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -public final class FullRunningAverageAndStdDevWritable implements Writable { - - private FullRunningAverageAndStdDev average; - - public FullRunningAverageAndStdDevWritable(FullRunningAverageAndStdDev average) { - this.average = average; - } - - public FullRunningAverageAndStdDev getAverage() { - return average; - } - - @Override - public String toString() { - return new StringBuilder() - .append(average.getAverage()).append('\t') - .append(average.getCount()).append('\t') - .append(average.getMk()).append('\t') - .append(average.getSk()).toString(); - } - - @Override - public void write(DataOutput dataOutput) throws IOException { - Varint.writeUnsignedVarInt(average.getCount(), dataOutput); - dataOutput.writeDouble(average.getAverage()); - dataOutput.writeDouble(average.getMk()); - dataOutput.writeDouble(average.getSk()); - } - - @Override - public void readFields(DataInput dataInput) throws IOException { - int count = Varint.readUnsignedVarInt(dataInput); - double diff = dataInput.readDouble(); - double mk = dataInput.readDouble(); - double sk = dataInput.readDouble(); - average = new FullRunningAverageAndStdDev(count, diff, mk, sk); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java deleted file mode 100644 index 57fa036f8..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.slopeone; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; - -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable; -import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable; -import org.apache.mahout.cf.taste.hadoop.ToItemPrefsMapper; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.math.VarLongWritable; - -public final class SlopeOneAverageDiffsJob extends AbstractJob { - - @Override - public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException { - - addInputOption(); - addOutputOption(); - - Map> parsedArgs = parseArguments(args); - if (parsedArgs == null) { - return -1; - } - - Path prefsFile = getInputPath(); - Path outputPath = getOutputPath(); - Path averagesOutputPath = new Path(getOption("--tempDir")); - - AtomicInteger currentPhase = new AtomicInteger(); - - if (shouldRunNextPhase(parsedArgs, currentPhase)) { - Job prefsToDiffsJob = prepareJob(prefsFile, - averagesOutputPath, - TextInputFormat.class, - ToItemPrefsMapper.class, - VarLongWritable.class, - EntityPrefWritable.class, - SlopeOnePrefsToDiffsReducer.class, - EntityEntityWritable.class, - FloatWritable.class, - SequenceFileOutputFormat.class); - boolean succeeded = prefsToDiffsJob.waitForCompletion(true); - if (!succeeded) - return -1; - } - - - if (shouldRunNextPhase(parsedArgs, currentPhase)) { - Job diffsToAveragesJob = prepareJob(averagesOutputPath, - outputPath, - SequenceFileInputFormat.class, - Mapper.class, - EntityEntityWritable.class, - FloatWritable.class, - SlopeOneDiffsToAveragesReducer.class, - EntityEntityWritable.class, - FullRunningAverageAndStdDevWritable.class, - TextOutputFormat.class); - FileOutputFormat.setOutputCompressorClass(diffsToAveragesJob, GzipCodec.class); - boolean succeeded = diffsToAveragesJob.waitForCompletion(true); - if (!succeeded) - return -1; - } - return 0; - } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new SlopeOneAverageDiffsJob(), args); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneDiffsToAveragesReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneDiffsToAveragesReducer.java deleted file mode 100644 index 7cc8594d8..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneDiffsToAveragesReducer.java +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license 
agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.hadoop.slopeone; - -import java.io.IOException; - -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev; - -public final class SlopeOneDiffsToAveragesReducer extends - Reducer { - - @Override - protected void reduce(EntityEntityWritable key, - Iterable values, - Context context) throws IOException, InterruptedException { - FullRunningAverageAndStdDev average = new FullRunningAverageAndStdDev(); - for (FloatWritable value : values) { - average.addDatum(value.get()); - } - context.write(key, new FullRunningAverageAndStdDevWritable(average)); - } -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOnePrefsToDiffsReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOnePrefsToDiffsReducer.java deleted file mode 100644 index aed2c1594..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOnePrefsToDiffsReducer.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.hadoop.slopeone; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; - -import com.google.common.collect.Lists; -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable; -import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable; -import org.apache.mahout.math.VarLongWritable; - -public final class SlopeOnePrefsToDiffsReducer extends - Reducer { - - @Override - protected void reduce(VarLongWritable key, - Iterable values, - Context context) throws IOException, InterruptedException { - List prefs = Lists.newArrayList(); - for (EntityPrefWritable writable : values) { - prefs.add(new EntityPrefWritable(writable)); - } - Collections.sort(prefs, ByItemIDComparator.getInstance()); - int size = prefs.size(); - for (int i = 0; i < size; i++) { - EntityPrefWritable first = prefs.get(i); - long itemAID = first.getID(); - float itemAValue = first.getPrefValue(); - for (int j = i + 1; j < size; j++) { - EntityPrefWritable second = prefs.get(j); - long itemBID = second.getID(); - float itemBValue = second.getPrefValue(); - context.write(new EntityEntityWritable(itemAID, itemBID), new FloatWritable(itemBValue - itemAValue)); - } - } - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/AbstractLongPrimitiveIterator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/AbstractLongPrimitiveIterator.java deleted file mode 100644 index f46785c88..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/AbstractLongPrimitiveIterator.java +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.common; - -public abstract class AbstractLongPrimitiveIterator implements LongPrimitiveIterator { - - @Override - public Long next() { - return nextLong(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/BitSet.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/BitSet.java deleted file mode 100644 index 6f6464879..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/BitSet.java +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -import java.io.Serializable; - -/** A simplified and streamlined version of {@link java.util.BitSet}. */ -final class BitSet implements Serializable, Cloneable { - - private final long[] bits; - - BitSet(int numBits) { - int numLongs = numBits >>> 6; - if ((numBits & 0x3F) != 0) { - numLongs++; - } - bits = new long[numLongs]; - } - - private BitSet(long[] bits) { - this.bits = bits; - } - - boolean get(int index) { - // skipping range check for speed - return (bits[index >>> 6] & 1L << (index & 0x3F)) != 0L; - } - - void set(int index) { - // skipping range check for speed - bits[index >>> 6] |= 1L << (index & 0x3F); - } - - void clear(int index) { - // skipping range check for speed - bits[index >>> 6] &= ~(1L << (index & 0x3F)); - } - - void clear() { - int length = bits.length; - for (int i = 0; i < length; i++) { - bits[i] = 0L; - } - } - - @Override - public BitSet clone() { - return new BitSet(bits); - } - - @Override - public String toString() { - StringBuilder result = new StringBuilder(64 * bits.length); - for (long l : bits) { - for (int j = 0; j < 64; j++) { - result.append((l & 1L << j) == 0 ? 
'0' : '1'); - } - result.append(' '); - } - return result.toString(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/Cache.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/Cache.java deleted file mode 100755 index 2dfd9879f..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/Cache.java +++ /dev/null @@ -1,178 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -import com.google.common.base.Preconditions; -import org.apache.mahout.cf.taste.common.TasteException; - -import java.util.Iterator; - -/** - *
<p>
- * An efficient Map-like class which caches values for keys. Values are not "put" into a {@code Cache};
- * instead the caller supplies the instance with an implementation of {@link Retriever} which can load the
- * value for a given key.
- * </p>
- *
- * <p>
- * The cache does not support {@code null} keys.
- * </p>
- *
- * <p>
- * Thanks to Amila Jayasooriya for helping evaluate performance of the rewrite of this class, as part of a
- * Google Summer of Code 2007 project.
- * </p>
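- *
- * <p>
- * A minimal usage sketch (editor's illustration, not part of the original file; the loader logic is
- * hypothetical):
- * </p>
- *
- * <pre>{@code
- * Retriever<Long,String> retriever = new Retriever<Long,String>() {
- *   public String get(Long key) { return "value-" + key; } // hypothetical loader
- * };
- * Cache<Long,String> cache = new Cache<Long,String>(retriever, 100); // evicts past 100 entries
- * String v = cache.get(7L); // loaded once via the retriever, then served from the cache
- * // note: get(...) declares TasteException, so callers must handle it
- * }</pre>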
- */ -public final class Cache implements Retriever { - - private static final Object NULL = new Object(); - - private final FastMap cache; - private final Retriever retriever; - - /** - *
<p>
- * Creates a new cache based on the given {@link Retriever}.
- * </p>
- * - * @param retriever - * object which can retrieve values for keys - */ - public Cache(Retriever retriever) { - this(retriever, FastMap.NO_MAX_SIZE); - } - - /** - *
<p>
- * Creates a new cache based on the given {@link Retriever} and with given maximum size.
- * </p>
- * - * @param retriever - * object which can retrieve values for keys - * @param maxEntries - * maximum number of entries the cache will store before evicting some - */ - public Cache(Retriever retriever, int maxEntries) { - Preconditions.checkArgument(retriever != null, "retriever is null"); - Preconditions.checkArgument(maxEntries >= 1, "maxEntries must be at least 1"); - cache = new FastMap(11, maxEntries); - this.retriever = retriever; - } - - /** - *
<p>
- * Returns cached value for a key. If it does not exist, it is loaded using a {@link Retriever}.
- * </p>
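- *
- * <p>
- * Editor's note, inferred from the implementation below (not in the original Javadoc): a private
- * {@code NULL} sentinel object is stored in place of a {@code null} result, so a key whose retriever
- * legitimately returned {@code null} is remembered rather than re-fetched on every call.
- * </p>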
- * - * @param key - * cache key - * @return value for that key - * @throws TasteException - * if an exception occurs while retrieving a new cached value - */ - @Override - public V get(K key) throws TasteException { - V value; - synchronized (cache) { - value = cache.get(key); - } - if (value == null) { - return getAndCacheValue(key); - } - return value == NULL ? null : value; - } - - /** - *
<p>
- * Uncaches any existing value for a given key.
- * </p>
- * - * @param key - * cache key - */ - public void remove(K key) { - synchronized (cache) { - cache.remove(key); - } - } - - /** - * Clears all cache entries whose key matches the given predicate. - */ - public void removeKeysMatching(MatchPredicate predicate) { - synchronized (cache) { - Iterator it = cache.keySet().iterator(); - while (it.hasNext()) { - K key = it.next(); - if (predicate.matches(key)) { - it.remove(); - } - } - } - } - - /** - * Clears all cache entries whose value matches the given predicate. - */ - public void removeValueMatching(MatchPredicate predicate) { - synchronized (cache) { - Iterator it = cache.values().iterator(); - while (it.hasNext()) { - V value = it.next(); - if (predicate.matches(value)) { - it.remove(); - } - } - } - } - - /** - *
<p>
- * Clears the cache.
- * </p>
- */ - public void clear() { - synchronized (cache) { - cache.clear(); - } - } - - private V getAndCacheValue(K key) throws TasteException { - V value = retriever.get(key); - if (value == null) { - value = (V) NULL; - } - synchronized (cache) { - cache.put(key, value); - } - return value; - } - - @Override - public String toString() { - return "Cache[retriever:" + retriever + ']'; - } - - /** - * Used by {#link #removeKeysMatching(Object)} to decide things that are matching. - */ - public interface MatchPredicate { - boolean matches(T thing); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FastByIDMap.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FastByIDMap.java deleted file mode 100644 index 19452ba43..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FastByIDMap.java +++ /dev/null @@ -1,571 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -import java.io.Serializable; -import java.util.AbstractSet; -import java.util.Arrays; -import java.util.Collection; -import java.util.Iterator; -import java.util.Map; -import java.util.NoSuchElementException; -import java.util.Set; - -import org.apache.mahout.common.RandomUtils; - -import com.google.common.base.Preconditions; - -/** - * @see FastMap - * @see FastIDSet - */ -public final class FastByIDMap implements Serializable, Cloneable { - - public static final int NO_MAX_SIZE = Integer.MAX_VALUE; - private static final float DEFAULT_LOAD_FACTOR = 1.5f; - - /** Dummy object used to represent a key that has been removed. */ - private static final long REMOVED = Long.MAX_VALUE; - private static final long NULL = Long.MIN_VALUE; - - private long[] keys; - private V[] values; - private float loadFactor; - private int numEntries; - private int numSlotsUsed; - private final int maxSize; - private BitSet recentlyAccessed; - private final boolean countingAccesses; - - /** Creates a new with default capacity. */ - public FastByIDMap() { - this(2, NO_MAX_SIZE); - } - - public FastByIDMap(int size) { - this(size, NO_MAX_SIZE); - } - - public FastByIDMap(int size, float loadFactor) { - this(size, NO_MAX_SIZE, loadFactor); - } - - public FastByIDMap(int size, int maxSize) { - this(size, maxSize, DEFAULT_LOAD_FACTOR); - } - - /** - * Creates a new whose capacity can accommodate the given number of entries without rehash. 
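- * <p>
- * Editor's illustration (hypothetical figures): with the default load factor of 1.5, requesting
- * {@code size = 100} allocates roughly 150 hash slots, rounded to a twin prime by
- * {@code RandomUtils.nextTwinPrime()}, so 100 entries fit without any rehash.
- * </p>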
- * - * @param size desired capacity - * @param maxSize max capacity - * @param loadFactor ratio of internal hash table size to current size - * @throws IllegalArgumentException if size is less than 0, maxSize is less than 1 - * or at least half of {@link RandomUtils#MAX_INT_SMALLER_TWIN_PRIME}, or - * loadFactor is less than 1 - */ - public FastByIDMap(int size, int maxSize, float loadFactor) { - Preconditions.checkArgument(size >= 0, "size must be at least 0"); - Preconditions.checkArgument(loadFactor >= 1.0f, "loadFactor must be at least 1.0"); - this.loadFactor = loadFactor; - int max = (int) (RandomUtils.MAX_INT_SMALLER_TWIN_PRIME / loadFactor); - Preconditions.checkArgument(size < max, "size must be less than " + max); - Preconditions.checkArgument(maxSize >= 1, "maxSize must be at least 1"); - int hashSize = RandomUtils.nextTwinPrime((int) (loadFactor * size)); - keys = new long[hashSize]; - Arrays.fill(keys, NULL); - values = (V[]) new Object[hashSize]; - this.maxSize = maxSize; - this.countingAccesses = maxSize != Integer.MAX_VALUE; - this.recentlyAccessed = countingAccesses ? new BitSet(hashSize) : null; - } - - /** - * @see #findForAdd(long) - */ - private int find(long key) { - int theHashCode = (int) key & 0x7FFFFFFF; // make sure it's positive - long[] keys = this.keys; - int hashSize = keys.length; - int jump = 1 + theHashCode % (hashSize - 2); - int index = theHashCode % hashSize; - long currentKey = keys[index]; - while (currentKey != NULL && key != currentKey) { - if (index < jump) { - index += hashSize - jump; - } else { - index -= jump; - } - currentKey = keys[index]; - } - return index; - } - - /** - * @see #find(long) - */ - private int findForAdd(long key) { - int theHashCode = (int) key & 0x7FFFFFFF; // make sure it's positive - long[] keys = this.keys; - int hashSize = keys.length; - int jump = 1 + theHashCode % (hashSize - 2); - int index = theHashCode % hashSize; - long currentKey = keys[index]; - while (currentKey != NULL && currentKey != REMOVED && key != currentKey) { // Different - // here - if (index < jump) { - index += hashSize - jump; - } else { - index -= jump; - } - currentKey = keys[index]; - } - return index; - } - - public V get(long key) { - if (key == NULL) { - return null; - } - int index = find(key); - if (countingAccesses) { - recentlyAccessed.set(index); - } - return values[index]; - } - - public int size() { - return numEntries; - } - - public boolean isEmpty() { - return numEntries == 0; - } - - public boolean containsKey(long key) { - return key != NULL && key != REMOVED && keys[find(key)] != NULL; - } - - public boolean containsValue(Object value) { - if (value == null) { - return false; - } - for (V theValue : values) { - if (theValue != null && value.equals(theValue)) { - return true; - } - } - return false; - } - - public V put(long key, V value) { - Preconditions.checkArgument(key != NULL && key != REMOVED); - if (value == null) { - throw new NullPointerException(); - } - // If less than half the slots are open, let's clear it up - if (numSlotsUsed * loadFactor >= keys.length) { - // If over half the slots used are actual entries, let's grow - if (numEntries * loadFactor >= numSlotsUsed) { - growAndRehash(); - } else { - // Otherwise just rehash to clear REMOVED entries and don't grow - rehash(); - } - } - // Here we may later consider implementing Brent's variation described on page 532 - int index = findForAdd(key); - long keyIndex = keys[index]; - if (keyIndex == key) { - V oldValue = values[index]; - values[index] = value; - return 
oldValue; - } else { - // If size is limited, - if (countingAccesses && numEntries >= maxSize) { - // and we're too large, clear some old-ish entry - clearStaleEntry(index); - } - keys[index] = key; - values[index] = value; - numEntries++; - if (keyIndex == NULL) { - numSlotsUsed++; - } - return null; - } - } - - private void clearStaleEntry(int index) { - while (true) { - long currentKey; - do { - if (index == 0) { - index = keys.length - 1; - } else { - index--; - } - currentKey = keys[index]; - } while (currentKey == NULL || currentKey == REMOVED); - if (recentlyAccessed.get(index)) { - recentlyAccessed.clear(index); - } else { - break; - } - } - // Delete the entry - keys[index] = REMOVED; - numEntries--; - values[index] = null; - } - - public V remove(long key) { - if (key == NULL || key == REMOVED) { - return null; - } - int index = find(key); - if (keys[index] == NULL) { - return null; - } else { - keys[index] = REMOVED; - numEntries--; - V oldValue = values[index]; - values[index] = null; - // don't decrement numSlotsUsed - return oldValue; - } - // Could un-set recentlyAccessed's bit but doesn't matter - } - - public void clear() { - numEntries = 0; - numSlotsUsed = 0; - Arrays.fill(keys, NULL); - Arrays.fill(values, null); - if (countingAccesses) { - recentlyAccessed.clear(); - } - } - - public LongPrimitiveIterator keySetIterator() { - return new KeyIterator(); - } - - public Set> entrySet() { - return new EntrySet(); - } - - public void rehash() { - rehash(RandomUtils.nextTwinPrime((int) (loadFactor * numEntries))); - } - - private void growAndRehash() { - if (keys.length * loadFactor >= RandomUtils.MAX_INT_SMALLER_TWIN_PRIME) { - throw new IllegalStateException("Can't grow any more"); - } - rehash(RandomUtils.nextTwinPrime((int) (loadFactor * keys.length))); - } - - private void rehash(int newHashSize) { - long[] oldKeys = keys; - V[] oldValues = values; - numEntries = 0; - numSlotsUsed = 0; - if (countingAccesses) { - recentlyAccessed = new BitSet(newHashSize); - } - keys = new long[newHashSize]; - Arrays.fill(keys, NULL); - values = (V[]) new Object[newHashSize]; - int length = oldKeys.length; - for (int i = 0; i < length; i++) { - long key = oldKeys[i]; - if (key != NULL && key != REMOVED) { - put(key, oldValues[i]); - } - } - } - - void iteratorRemove(int lastNext) { - if (lastNext >= values.length) { - throw new NoSuchElementException(); - } - if (lastNext < 0) { - throw new IllegalStateException(); - } - values[lastNext] = null; - keys[lastNext] = REMOVED; - numEntries--; - } - - @Override - public FastByIDMap clone() { - FastByIDMap clone; - try { - clone = (FastByIDMap) super.clone(); - } catch (CloneNotSupportedException cnse) { - throw new AssertionError(); - } - clone.keys = keys.clone(); - clone.values = values.clone(); - clone.recentlyAccessed = countingAccesses ? 
new BitSet(keys.length) : null; - return clone; - } - - @Override - public String toString() { - if (isEmpty()) { - return "{}"; - } - StringBuilder result = new StringBuilder(); - result.append('{'); - for (int i = 0; i < keys.length; i++) { - long key = keys[i]; - if (key != NULL && key != REMOVED) { - result.append(key).append('=').append(values[i]).append(','); - } - } - result.setCharAt(result.length() - 1, '}'); - return result.toString(); - } - - @Override - public int hashCode() { - int hash = 0; - long[] keys = this.keys; - int max = keys.length; - for (int i = 0; i < max; i++) { - long key = keys[i]; - if (key != NULL && key != REMOVED) { - hash = 31 * hash + ((int) (key >> 32) ^ (int) key); - hash = 31 * hash + values[i].hashCode(); - } - } - return hash; - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof FastByIDMap)) { - return false; - } - FastByIDMap otherMap = (FastByIDMap) other; - long[] otherKeys = otherMap.keys; - V[] otherValues = otherMap.values; - int length = keys.length; - int otherLength = otherKeys.length; - int max = Math.min(length, otherLength); - - int i = 0; - while (i < max) { - long key = keys[i]; - long otherKey = otherKeys[i]; - if (key == NULL || key == REMOVED) { - if (otherKey != NULL && otherKey != REMOVED) { - return false; - } - } else { - if (key != otherKey || !values[i].equals(otherValues[i])) { - return false; - } - } - i++; - } - while (i < length) { - long key = keys[i]; - if (key != NULL && key != REMOVED) { - return false; - } - i++; - } - while (i < otherLength) { - long key = otherKeys[i]; - if (key != NULL && key != REMOVED) { - return false; - } - i++; - } - return true; - } - - private final class KeyIterator extends AbstractLongPrimitiveIterator { - - private int position; - private int lastNext = -1; - - @Override - public boolean hasNext() { - goToNext(); - return position < keys.length; - } - - @Override - public long nextLong() { - goToNext(); - lastNext = position; - if (position >= keys.length) { - throw new NoSuchElementException(); - } - return keys[position++]; - } - - @Override - public long peek() { - goToNext(); - if (position >= keys.length) { - throw new NoSuchElementException(); - } - return keys[position]; - } - - private void goToNext() { - int length = values.length; - while (position < length && values[position] == null) { - position++; - } - } - - @Override - public void remove() { - iteratorRemove(lastNext); - } - - @Override - public void skip(int n) { - position += n; - } - - } - - private final class EntrySet extends AbstractSet> { - - @Override - public int size() { - return FastByIDMap.this.size(); - } - - @Override - public boolean isEmpty() { - return FastByIDMap.this.isEmpty(); - } - - @Override - public boolean contains(Object o) { - return containsKey((Long) o); - } - - @Override - public Iterator> iterator() { - return new EntryIterator(); - } - - @Override - public boolean add(Map.Entry t) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean remove(Object o) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean addAll(Collection> ts) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean retainAll(Collection objects) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean removeAll(Collection objects) { - throw new UnsupportedOperationException(); - } - - @Override - public void clear() { - FastByIDMap.this.clear(); - } - - private final class MapEntry implements 
Map.Entry { - - private final int index; - - private MapEntry(int index) { - this.index = index; - } - - @Override - public Long getKey() { - return keys[index]; - } - - @Override - public V getValue() { - return values[index]; - } - - @Override - public V setValue(V value) { - Preconditions.checkArgument(value != null); - - V oldValue = values[index]; - values[index] = value; - return oldValue; - } - } - - private final class EntryIterator implements Iterator> { - - private int position; - private int lastNext = -1; - - @Override - public boolean hasNext() { - goToNext(); - return position < keys.length; - } - - @Override - public Map.Entry next() { - goToNext(); - lastNext = position; - if (position >= keys.length) { - throw new NoSuchElementException(); - } - return new MapEntry(position++); - } - - private void goToNext() { - int length = values.length; - while (position < length && values[position] == null) { - position++; - } - } - - @Override - public void remove() { - iteratorRemove(lastNext); - } - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FastIDSet.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FastIDSet.java deleted file mode 100644 index 2c0c86bd0..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FastIDSet.java +++ /dev/null @@ -1,428 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.Iterator; -import java.util.NoSuchElementException; - -import org.apache.mahout.common.RandomUtils; - -import com.google.common.base.Preconditions; - -/** - * @see FastByIDMap - */ -public final class FastIDSet implements Serializable, Cloneable, Iterable { - - private static final float DEFAULT_LOAD_FACTOR = 1.5f; - - /** Dummy object used to represent a key that has been removed. */ - private static final long REMOVED = Long.MAX_VALUE; - private static final long NULL = Long.MIN_VALUE; - - private long[] keys; - private float loadFactor; - private int numEntries; - private int numSlotsUsed; - - /** Creates a new with default capacity. 
*/ - public FastIDSet() { - this(2); - } - - public FastIDSet(long[] initialKeys) { - this(initialKeys.length); - addAll(initialKeys); - } - - public FastIDSet(int size) { - this(size, DEFAULT_LOAD_FACTOR); - } - - public FastIDSet(int size, float loadFactor) { - Preconditions.checkArgument(size >= 0, "size must be at least 0"); - Preconditions.checkArgument(loadFactor >= 1.0f, "loadFactor must be at least 1.0"); - this.loadFactor = loadFactor; - int max = (int) (RandomUtils.MAX_INT_SMALLER_TWIN_PRIME / loadFactor); - Preconditions.checkArgument(size < max, "size must be less than %d", max); - int hashSize = RandomUtils.nextTwinPrime((int) (loadFactor * size)); - keys = new long[hashSize]; - Arrays.fill(keys, NULL); } - - /** - * @see #findForAdd(long) - */ - private int find(long key) { - int theHashCode = (int) key & 0x7FFFFFFF; // make sure it's positive - long[] keys = this.keys; - int hashSize = keys.length; - int jump = 1 + theHashCode % (hashSize - 2); - int index = theHashCode % hashSize; - long currentKey = keys[index]; - while (currentKey != NULL && key != currentKey) { // note: true when currentKey == REMOVED - if (index < jump) { - index += hashSize - jump; - } else { - index -= jump; - } - currentKey = keys[index]; - } - return index; - } - - /** - * @see #find(long) - */ - private int findForAdd(long key) { - int theHashCode = (int) key & 0x7FFFFFFF; // make sure it's positive - long[] keys = this.keys; - int hashSize = keys.length; - int jump = 1 + theHashCode % (hashSize - 2); - int index = theHashCode % hashSize; - long currentKey = keys[index]; - while (currentKey != NULL && currentKey != REMOVED && key != currentKey) { // Different here - if (index < jump) { - index += hashSize - jump; - } else { - index -= jump; - } - currentKey = keys[index]; - } - return index; - } - - public int size() { - return numEntries; - } - - public boolean isEmpty() { - return numEntries == 0; - } - - public boolean contains(long key) { - return key != NULL && key != REMOVED && keys[find(key)] != NULL; - } - - public boolean add(long key) { - Preconditions.checkArgument(key != NULL && key != REMOVED); - - // If less than half the slots are open, let's clear it up - if (numSlotsUsed * loadFactor >= keys.length) { - // If over half the slots used are actual entries, let's grow - if (numEntries * loadFactor >= numSlotsUsed) { - growAndRehash(); - } else { - // Otherwise just rehash to clear REMOVED entries and don't grow - rehash(); - } - } - // Here we may later consider implementing Brent's variation described on page 532 - int index = findForAdd(key); - long keyIndex = keys[index]; - if (keyIndex != key) { - keys[index] = key; - numEntries++; - if (keyIndex == NULL) { - numSlotsUsed++; - } - return true; - } - return false; - } - - @Override - public LongPrimitiveIterator iterator() { - return new KeyIterator(); - } - - public long[] toArray() { - long[] result = new long[numEntries]; - for (int i = 0, position = 0; i < result.length; i++) { - while (keys[position] == NULL || keys[position] == REMOVED) { - position++; - } - result[i] = keys[position++]; - } - return result; - } - - public boolean remove(long key) { - if (key == NULL || key == REMOVED) { - return false; - } - int index = find(key); - if (keys[index] == NULL) { - return false; - } else { - keys[index] = REMOVED; - numEntries--; - return true; - } - } - - public boolean addAll(long[] c) { - boolean changed = false; - for (long k : c) { - if (add(k)) { - changed = true; - } - } - return changed; - } - - public boolean 
addAll(FastIDSet c) { - boolean changed = false; - for (long k : c.keys) { - if (k != NULL && k != REMOVED && add(k)) { - changed = true; - } - } - return changed; - } - - public boolean removeAll(long[] c) { - boolean changed = false; - for (long o : c) { - if (remove(o)) { - changed = true; - } - } - return changed; - } - - public boolean removeAll(FastIDSet c) { - boolean changed = false; - for (long k : c.keys) { - if (k != NULL && k != REMOVED && remove(k)) { - changed = true; - } - } - return changed; - } - - public boolean retainAll(FastIDSet c) { - boolean changed = false; - for (int i = 0; i < keys.length; i++) { - long k = keys[i]; - if (k != NULL && k != REMOVED && !c.contains(k)) { - keys[i] = REMOVED; - numEntries--; - changed = true; - } - } - return changed; - } - - public void clear() { - numEntries = 0; - numSlotsUsed = 0; - Arrays.fill(keys, NULL); - } - - private void growAndRehash() { - if (keys.length * loadFactor >= RandomUtils.MAX_INT_SMALLER_TWIN_PRIME) { - throw new IllegalStateException("Can't grow any more"); - } - rehash(RandomUtils.nextTwinPrime((int) (loadFactor * keys.length))); - } - - public void rehash() { - rehash(RandomUtils.nextTwinPrime((int) (loadFactor * numEntries))); - } - - private void rehash(int newHashSize) { - long[] oldKeys = keys; - numEntries = 0; - numSlotsUsed = 0; - keys = new long[newHashSize]; - Arrays.fill(keys, NULL); - int length = oldKeys.length; - for (int i = 0; i < length; i++) { - long key = oldKeys[i]; - if (key != NULL && key != REMOVED) { - add(key); - } - } - } - - /** - * Convenience method to quickly compute just the size of the intersection with another . - * - * @param other - * to intersect with - * @return number of elements in intersection - */ - public int intersectionSize(FastIDSet other) { - int count = 0; - for (long key : other.keys) { - if (key != NULL && key != REMOVED && keys[find(key)] != NULL) { - count++; - } - } - return count; - } - - @Override - public FastIDSet clone() { - FastIDSet clone; - try { - clone = (FastIDSet) super.clone(); - } catch (CloneNotSupportedException cnse) { - throw new AssertionError(); - } - clone.keys = keys.clone(); - return clone; - } - - @Override - public int hashCode() { - int hash = 0; - long[] keys = this.keys; - int max = keys.length; - for (int i = 0; i < max; i++) { - long key = keys[i]; - if (key != NULL && key != REMOVED) { - hash = 31 * hash + ((int) (key >> 32) ^ (int) key); - } - } - return hash; - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof FastIDSet)) { - return false; - } - FastIDSet otherMap = (FastIDSet) other; - long[] otherKeys = otherMap.keys; - int length = keys.length; - int otherLength = otherKeys.length; - int max = Math.min(length, otherLength); - - int i = 0; - while (i < max) { - long key = keys[i]; - long otherKey = otherKeys[i]; - if (key == NULL || key == REMOVED) { - if (otherKey != NULL && otherKey != REMOVED) { - return false; - } - } else { - if (key != otherKey) { - return false; - } - } - i++; - } - while (i < length) { - long key = keys[i]; - if (key != NULL && key != REMOVED) { - return false; - } - i++; - } - while (i < otherLength) { - long key = otherKeys[i]; - if (key != NULL && key != REMOVED) { - return false; - } - i++; - } - return true; - } - - @Override - public String toString() { - if (isEmpty()) { - return "[]"; - } - StringBuilder result = new StringBuilder(); - result.append('['); - for (long key : keys) { - if (key != NULL && key != REMOVED) { - result.append(key).append(','); - } - } - 
result.setCharAt(result.length() - 1, ']'); - return result.toString(); - } - - private final class KeyIterator extends AbstractLongPrimitiveIterator { - - private int position; - private int lastNext = -1; - - @Override - public boolean hasNext() { - goToNext(); - return position < keys.length; - } - - @Override - public long nextLong() { - goToNext(); - lastNext = position; - if (position >= keys.length) { - throw new NoSuchElementException(); - } - return keys[position++]; - } - - @Override - public long peek() { - goToNext(); - if (position >= keys.length) { - throw new NoSuchElementException(); - } - return keys[position]; - } - - private void goToNext() { - int length = keys.length; - while (position < length - && (keys[position] == NULL || keys[position] == REMOVED)) { - position++; - } - } - - @Override - public void remove() { - if (lastNext >= keys.length) { - throw new NoSuchElementException(); - } - if (lastNext < 0) { - throw new IllegalStateException(); - } - keys[lastNext] = REMOVED; - numEntries--; - } - - public Iterator iterator() { - return new KeyIterator(); - } - - @Override - public void skip(int n) { - position += n; - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FastMap.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FastMap.java deleted file mode 100644 index df2a47023..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FastMap.java +++ /dev/null @@ -1,712 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -import java.io.Serializable; -import java.util.AbstractCollection; -import java.util.AbstractSet; -import java.util.Arrays; -import java.util.Collection; -import java.util.Iterator; -import java.util.Map; -import java.util.NoSuchElementException; -import java.util.Set; - -import org.apache.mahout.common.RandomUtils; - -import com.google.common.base.Preconditions; - -/** - *
<p>
- * This is an optimized {@link Map} implementation, based on algorithms described in Knuth's "Art of Computer
- * Programming", Vol. 3, p. 529.
- * </p>
- *
- * <p>
- * It should be faster than {@link java.util.HashMap} in some cases, but not all. Its main feature is a
- * "max size" and the ability to transparently, efficiently and semi-intelligently evict old entries when max
- * size is exceeded.
- * </p>
- *
- * <p>
- * This class is not thread-safe.
- * </p>
- *
- * <p>
- * This implementation does not allow {@code null} as a key or value.
- * </p>
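- *
- * <p>
- * A minimal usage sketch (editor's illustration with hypothetical values, not part of the original file):
- * </p>
- *
- * <pre>{@code
- * FastMap<String,Integer> map = new FastMap<String,Integer>(10, 3); // capacity hint 10, max size 3
- * map.put("a", 1);
- * map.put("b", 2);
- * map.put("c", 3);
- * map.get("a");    // marks "a" as recently accessed
- * map.put("d", 4); // over max size: some not-recently-accessed entry is evicted first
- * }</pre>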
- */ -public final class FastMap implements Map, Serializable, Cloneable { - - public static final int NO_MAX_SIZE = Integer.MAX_VALUE; - private static final float DEFAULT_LOAD_FACTOR = 1.5f; - - /** Dummy object used to represent a key that has been removed. */ - private static final Object REMOVED = new Object(); - - private K[] keys; - private V[] values; - private float loadFactor; - private int numEntries; - private int numSlotsUsed; - private final int maxSize; - private BitSet recentlyAccessed; - private final boolean countingAccesses; - - /** Creates a new with default capacity. */ - public FastMap() { - this(2, NO_MAX_SIZE); - } - - public FastMap(int size) { - this(size, NO_MAX_SIZE); - } - - public FastMap(Map other) { - this(other.size()); - putAll(other); - } - - public FastMap(int size, float loadFactor) { - this(size, NO_MAX_SIZE, loadFactor); - } - - public FastMap(int size, int maxSize) { - this(size, maxSize, DEFAULT_LOAD_FACTOR); - } - - /** - * Creates a new whose capacity can accommodate the given number of entries without rehash. - * - * @param size desired capacity - * @param maxSize max capacity - * @throws IllegalArgumentException if size is less than 0, maxSize is less than 1 - * or at least half of {@link RandomUtils#MAX_INT_SMALLER_TWIN_PRIME}, or - * loadFactor is less than 1 - */ - public FastMap(int size, int maxSize, float loadFactor) { - Preconditions.checkArgument(size >= 0, "size must be at least 0"); - Preconditions.checkArgument(loadFactor >= 1.0f, "loadFactor must be at least 1.0"); - this.loadFactor = loadFactor; - int max = (int) (RandomUtils.MAX_INT_SMALLER_TWIN_PRIME / loadFactor); - Preconditions.checkArgument(size < max, "size must be less than " + max); - Preconditions.checkArgument(maxSize >= 1, "maxSize must be at least 1"); - int hashSize = RandomUtils.nextTwinPrime((int) (loadFactor * size)); - keys = (K[]) new Object[hashSize]; - values = (V[]) new Object[hashSize]; - this.maxSize = maxSize; - this.countingAccesses = maxSize != Integer.MAX_VALUE; - this.recentlyAccessed = countingAccesses ? 
new BitSet(hashSize) : null; - } - - private int find(Object key) { - int theHashCode = key.hashCode() & 0x7FFFFFFF; // make sure it's positive - K[] keys = this.keys; - int hashSize = keys.length; - int jump = 1 + theHashCode % (hashSize - 2); - int index = theHashCode % hashSize; - K currentKey = keys[index]; - while (currentKey != null && (currentKey == REMOVED || !key.equals(currentKey))) { - if (index < jump) { - index += hashSize - jump; - } else { - index -= jump; - } - currentKey = keys[index]; - } - return index; - } - - @Override - public V get(Object key) { - if (key == null) { - return null; - } - int index = find(key); - if (countingAccesses) { - recentlyAccessed.set(index); - } - return values[index]; - } - - @Override - public int size() { - return numEntries; - } - - @Override - public boolean isEmpty() { - return numEntries == 0; - } - - @Override - public boolean containsKey(Object key) { - return key != null && keys[find(key)] != null; - } - - @Override - public boolean containsValue(Object value) { - if (value == null) { - return false; - } - for (V theValue : values) { - if (theValue != null && value.equals(theValue)) { - return true; - } - } - return false; - } - - /** - * @throws NullPointerException - * if key or value is null - */ - @Override - public V put(K key, V value) { - if (key == null || value == null) { - throw new NullPointerException(); - } - // If less than half the slots are open, let's clear it up - if (numSlotsUsed * loadFactor >= keys.length) { - // If over half the slots used are actual entries, let's grow - if (numEntries * loadFactor >= numSlotsUsed) { - growAndRehash(); - } else { - // Otherwise just rehash to clear REMOVED entries and don't grow - rehash(); - } - } - // Here we may later consider implementing Brent's variation described on page 532 - int index = find(key); - if (keys[index] == null) { - // If size is limited, - if (countingAccesses && numEntries >= maxSize) { - // and we're too large, clear some old-ish entry - clearStaleEntry(index); - } - keys[index] = key; - values[index] = value; - numEntries++; - numSlotsUsed++; - return null; - } else { - V oldValue = values[index]; - values[index] = value; - return oldValue; - } - } - - private void clearStaleEntry(int index) { - while (true) { - K currentKey; - do { - if (index == 0) { - index = keys.length - 1; - } else { - index--; - } - currentKey = keys[index]; - } while (currentKey == null || currentKey == REMOVED); - if (recentlyAccessed.get(index)) { - recentlyAccessed.clear(index); - } else { - break; - } - } - // Delete the entry - ((Object[])keys)[index] = REMOVED; - numEntries--; - values[index] = null; - } - - @Override - public void putAll(Map map) { - for (Entry entry : map.entrySet()) { - put(entry.getKey(), entry.getValue()); - } - } - - @Override - public V remove(Object key) { - if (key == null) { - return null; - } - int index = find(key); - if (keys[index] == null) { - return null; - } else { - ((Object[])keys)[index] = REMOVED; - numEntries--; - V oldValue = values[index]; - values[index] = null; - // don't decrement numSlotsUsed - return oldValue; - } - // Could un-set recentlyAccessed's bit but doesn't matter - } - - @Override - public void clear() { - numEntries = 0; - numSlotsUsed = 0; - Arrays.fill(keys, null); - Arrays.fill(values, null); - if (countingAccesses) { - recentlyAccessed.clear(); - } - } - - @Override - public Set keySet() { - return new KeySet(); - } - - @Override - public Collection values() { - return new ValueCollection(); - } - - @Override - 
public Set> entrySet() { - return new EntrySet(); - } - - public void rehash() { - rehash(RandomUtils.nextTwinPrime((int) (loadFactor * numEntries))); - } - - private void growAndRehash() { - if (keys.length * loadFactor >= RandomUtils.MAX_INT_SMALLER_TWIN_PRIME) { - throw new IllegalStateException("Can't grow any more"); - } - rehash(RandomUtils.nextTwinPrime((int) (loadFactor * keys.length))); - } - - private void rehash(int newHashSize) { - K[] oldKeys = keys; - V[] oldValues = values; - numEntries = 0; - numSlotsUsed = 0; - if (countingAccesses) { - recentlyAccessed = new BitSet(newHashSize); - } - keys = (K[]) new Object[newHashSize]; - values = (V[]) new Object[newHashSize]; - int length = oldKeys.length; - for (int i = 0; i < length; i++) { - K key = oldKeys[i]; - if (key != null && key != REMOVED) { - put(key, oldValues[i]); - } - } - } - - void iteratorRemove(int lastNext) { - if (lastNext >= values.length) { - throw new NoSuchElementException(); - } - if (lastNext < 0) { - throw new IllegalStateException(); - } - values[lastNext] = null; - ((Object[])keys)[lastNext] = REMOVED; - numEntries--; - } - - @Override - public FastMap clone() { - FastMap clone; - try { - clone = (FastMap) super.clone(); - } catch (CloneNotSupportedException cnse) { - throw new AssertionError(); - } - clone.keys = keys.clone(); - clone.values = values.clone(); - clone.recentlyAccessed = countingAccesses ? new BitSet(keys.length) : null; - return clone; - } - - @Override - public int hashCode() { - int hash = 0; - K[] keys = this.keys; - int max = keys.length; - for (int i = 0; i < max; i++) { - K key = keys[i]; - if (key != null && key != REMOVED) { - hash = 31 * hash + key.hashCode(); - hash = 31 * hash + values[i].hashCode(); - } - } - return hash; - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof FastMap)) { - return false; - } - FastMap otherMap = (FastMap) other; - K[] otherKeys = otherMap.keys; - V[] otherValues = otherMap.values; - int length = keys.length; - int otherLength = otherKeys.length; - int max = Math.min(length, otherLength); - - int i = 0; - while (i < max) { - K key = keys[i]; - K otherKey = otherKeys[i]; - if (key == null || key == REMOVED) { - if (otherKey != null && otherKey != REMOVED) { - return false; - } - } else { - if (key != otherKey || !values[i].equals(otherValues[i])) { - return false; - } - } - i++; - } - while (i < length) { - K key = keys[i]; - if (key != null && key != REMOVED) { - return false; - } - i++; - } - while (i < otherLength) { - K key = otherKeys[i]; - if (key != null && key != REMOVED) { - return false; - } - i++; - } - return true; - } - - @Override - public String toString() { - if (isEmpty()) { - return "{}"; - } - StringBuilder result = new StringBuilder(); - result.append('{'); - for (int i = 0; i < keys.length; i++) { - K key = keys[i]; - if (key != null && key != REMOVED) { - result.append(key).append('=').append(values[i]).append(','); - } - } - result.setCharAt(result.length() - 1, '}'); - return result.toString(); - } - - private final class EntrySet extends AbstractSet> { - - @Override - public int size() { - return FastMap.this.size(); - } - - @Override - public boolean isEmpty() { - return FastMap.this.isEmpty(); - } - - @Override - public boolean contains(Object o) { - return containsKey(o); - } - - @Override - public Iterator> iterator() { - return new EntryIterator(); - } - - @Override - public boolean add(Entry t) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean remove(Object 
o) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean addAll(Collection> ts) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean retainAll(Collection objects) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean removeAll(Collection objects) { - throw new UnsupportedOperationException(); - } - - @Override - public void clear() { - FastMap.this.clear(); - } - - private final class MapEntry implements Entry { - - private final int index; - - private MapEntry(int index) { - this.index = index; - } - - @Override - public K getKey() { - return keys[index]; - } - - @Override - public V getValue() { - return values[index]; - } - - @Override - public V setValue(V value) { - Preconditions.checkArgument(value != null); - V oldValue = values[index]; - values[index] = value; - return oldValue; - } - } - - private final class EntryIterator implements Iterator> { - - private int position; - private int lastNext = -1; - - @Override - public boolean hasNext() { - goToNext(); - return position < keys.length; - } - - @Override - public Entry next() { - goToNext(); - lastNext = position; - if (position >= keys.length) { - throw new NoSuchElementException(); - } - return new MapEntry(position++); - } - - private void goToNext() { - int length = values.length; - while (position < length && values[position] == null) { - position++; - } - } - - @Override - public void remove() { - iteratorRemove(lastNext); - } - } - - } - - private final class KeySet extends AbstractSet { - - @Override - public int size() { - return FastMap.this.size(); - } - - @Override - public boolean isEmpty() { - return FastMap.this.isEmpty(); - } - - @Override - public boolean contains(Object o) { - return containsKey(o); - } - - @Override - public Iterator iterator() { - return new KeyIterator(); - } - - @Override - public boolean add(K t) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean remove(Object o) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean addAll(Collection ts) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean retainAll(Collection objects) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean removeAll(Collection objects) { - throw new UnsupportedOperationException(); - } - - @Override - public void clear() { - FastMap.this.clear(); - } - - private final class KeyIterator implements Iterator { - - private int position; - private int lastNext = -1; - - @Override - public boolean hasNext() { - goToNext(); - return position < keys.length; - } - - @Override - public K next() { - goToNext(); - lastNext = position; - if (position >= keys.length) { - throw new NoSuchElementException(); - } - return keys[position++]; - } - - private void goToNext() { - int length = values.length; - while (position < length && values[position] == null) { - position++; - } - } - - @Override - public void remove() { - iteratorRemove(lastNext); - } - } - - } - - private final class ValueCollection extends AbstractCollection { - - @Override - public int size() { - return FastMap.this.size(); - } - - @Override - public boolean isEmpty() { - return FastMap.this.isEmpty(); - } - - @Override - public boolean contains(Object o) { - return containsValue(o); - } - - @Override - public Iterator iterator() { - return new ValueIterator(); - } - - @Override - public boolean add(V v) { - throw new UnsupportedOperationException(); - } - - @Override - 
public boolean remove(Object o) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean addAll(Collection vs) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean removeAll(Collection objects) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean retainAll(Collection objects) { - throw new UnsupportedOperationException(); - } - - @Override - public void clear() { - FastMap.this.clear(); - } - - private final class ValueIterator implements Iterator { - - private int position; - private int lastNext = -1; - - @Override - public boolean hasNext() { - goToNext(); - return position < values.length; - } - - @Override - public V next() { - goToNext(); - lastNext = position; - if (position >= values.length) { - throw new NoSuchElementException(); - } - return values[position++]; - } - - private void goToNext() { - int length = values.length; - while (position < length && values[position] == null) { - position++; - } - } - - @Override - public void remove() { - iteratorRemove(lastNext); - } - - } - - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverage.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverage.java deleted file mode 100644 index 9c68e111b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverage.java +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -import java.io.Serializable; - -/** - *
<p>
- * A simple class that represents a fixed value of an average and count. This is useful
- * when an API needs to return {@link RunningAverage} but is not in a position to accept
- * updates to it.
- * </p>
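- *
- * <p>
- * A short sketch (editor's illustration, hypothetical figures):
- * </p>
- *
- * <pre>{@code
- * RunningAverage avg = new FixedRunningAverage(3.25, 12); // fixed: average 3.25 over 12 data points
- * double a = avg.getAverage(); // 3.25
- * avg.addDatum(1.0);           // throws UnsupportedOperationException
- * }</pre>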
- */ -public class FixedRunningAverage implements RunningAverage, Serializable { - - private final double average; - private final int count; - - public FixedRunningAverage(double average, int count) { - this.average = average; - this.count = count; - } - - /** - * @throws UnsupportedOperationException - */ - @Override - public synchronized void addDatum(double datum) { - throw new UnsupportedOperationException(); - } - - /** - * @throws UnsupportedOperationException - */ - @Override - public synchronized void removeDatum(double datum) { - throw new UnsupportedOperationException(); - } - - /** - * @throws UnsupportedOperationException - */ - @Override - public synchronized void changeDatum(double delta) { - throw new UnsupportedOperationException(); - } - - @Override - public synchronized int getCount() { - return count; - } - - @Override - public synchronized double getAverage() { - return average; - } - - @Override - public RunningAverage inverse() { - return new InvertedRunningAverage(this); - } - - @Override - public synchronized String toString() { - return String.valueOf(average); - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverageAndStdDev.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverageAndStdDev.java deleted file mode 100644 index 0447ac956..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverageAndStdDev.java +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -/** - *
<p>
- * A simple class that represents a fixed value of an average, count and standard deviation. This is useful
- * when an API needs to return {@link RunningAverageAndStdDev} but is not in a position to accept
- * updates to it.
- * </p>
- */ -public final class FixedRunningAverageAndStdDev extends FixedRunningAverage implements RunningAverageAndStdDev { - - private final double stdDev; - - public FixedRunningAverageAndStdDev(double average, double stdDev, int count) { - super(average, count); - this.stdDev = stdDev; - } - - @Override - public RunningAverageAndStdDev inverse() { - return new InvertedRunningAverageAndStdDev(this); - } - - @Override - public synchronized String toString() { - return super.toString() + ',' + stdDev; - } - - @Override - public double getStandardDeviation() { - return stdDev; - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverage.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverage.java deleted file mode 100644 index 04ff312d2..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverage.java +++ /dev/null @@ -1,109 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -import java.io.Serializable; - -/** - *
<p>
- * A simple class that can keep track of a running average of a series of numbers. One can add to or remove
- * from the series, as well as update a datum in the series. The class does not actually keep track of the
- * series of values, just its running average, so it doesn't even matter if you remove/change a value that
- * wasn't added.
- * </p>
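- *
- * <p>
- * A worked example (editor's illustration, hypothetical figures), tracing {@code addDatum} below:
- * </p>
- *
- * <pre>{@code
- * RunningAverage avg = new FullRunningAverage();
- * avg.addDatum(1.0); // count = 1, average = 1.0
- * avg.addDatum(2.0); // average = 1.0 * 1/2 + 2.0 / 2 = 1.5
- * avg.addDatum(6.0); // average = 1.5 * 2/3 + 6.0 / 3 = 3.0, i.e. (1 + 2 + 6) / 3
- * }</pre>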
- */ -public class FullRunningAverage implements RunningAverage, Serializable { - - private int count; - private double average; - - public FullRunningAverage() { - this(0, Double.NaN); - } - - public FullRunningAverage(int count, double average) { - this.count = count; - this.average = average; - } - - /** - * @param datum - * new item to add to the running average - */ - @Override - public synchronized void addDatum(double datum) { - if (++count == 1) { - average = datum; - } else { - average = average * (count - 1) / count + datum / count; - } - } - - /** - * @param datum - * item to remove to the running average - * @throws IllegalStateException - * if count is 0 - */ - @Override - public synchronized void removeDatum(double datum) { - if (count == 0) { - throw new IllegalStateException(); - } - if (--count == 0) { - average = Double.NaN; - } else { - average = average * (count + 1) / count - datum / count; - } - } - - /** - * @param delta - * amount by which to change a datum in the running average - * @throws IllegalStateException - * if count is 0 - */ - @Override - public synchronized void changeDatum(double delta) { - if (count == 0) { - throw new IllegalStateException(); - } - average += delta / count; - } - - @Override - public synchronized int getCount() { - return count; - } - - @Override - public synchronized double getAverage() { - return average; - } - - @Override - public RunningAverage inverse() { - return new InvertedRunningAverage(this); - } - - @Override - public synchronized String toString() { - return String.valueOf(average); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverageAndStdDev.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverageAndStdDev.java deleted file mode 100644 index 6212e6616..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverageAndStdDev.java +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -/** - *
<p>
- * Extends {@link FullRunningAverage} to add a running standard deviation computation.
- * Uses Welford's method, as described at http://www.johndcook.com/standard_deviation.html
- * </p>
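- *
- * <p>
- * Editor's summary of the update step (see {@code addDatum} below): with {@code count = k} after adding
- * datum {@code x}, the code maintains {@code mk += (x - mkOld) / k} and
- * {@code sk += (x - mkOld) * (x - mk)}, and reports the standard deviation as
- * {@code sqrt(sk / (k - 1))}.
- * </p>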
- */ -public final class FullRunningAverageAndStdDev extends FullRunningAverage implements RunningAverageAndStdDev { - - private double stdDev; - private double mk; - private double sk; - - public FullRunningAverageAndStdDev() { - mk = 0.0; - sk = 0.0; - recomputeStdDev(); - } - - public FullRunningAverageAndStdDev(int count, double average, double mk, double sk) { - super(count, average); - this.mk = mk; - this.sk = sk; - recomputeStdDev(); - } - - public double getMk() { - return mk; - } - - public double getSk() { - return sk; - } - - @Override - public synchronized double getStandardDeviation() { - return stdDev; - } - - @Override - public synchronized void addDatum(double datum) { - super.addDatum(datum); - int count = getCount(); - if (count == 1) { - mk = datum; - sk = 0.0; - } else { - double oldmk = mk; - double diff = datum - oldmk; - mk += diff / count; - sk += diff * (datum - mk); - } - recomputeStdDev(); - } - - @Override - public synchronized void removeDatum(double datum) { - int oldCount = getCount(); - super.removeDatum(datum); - double oldmk = mk; - mk = (oldCount * oldmk - datum) / (oldCount - 1); - sk -= (datum - mk) * (datum - oldmk); - recomputeStdDev(); - } - - /** - * @throws UnsupportedOperationException - */ - @Override - public void changeDatum(double delta) { - throw new UnsupportedOperationException(); - } - - private synchronized void recomputeStdDev() { - int count = getCount(); - stdDev = count > 1 ? Math.sqrt(sk / (count - 1)) : Double.NaN; - } - - @Override - public RunningAverageAndStdDev inverse() { - return new InvertedRunningAverageAndStdDev(this); - } - - @Override - public synchronized String toString() { - return String.valueOf(String.valueOf(getAverage()) + ',' + stdDev); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java deleted file mode 100644 index ffd9b2271..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.common; - -public final class InvertedRunningAverage implements RunningAverage { - - private final RunningAverage delegate; - - public InvertedRunningAverage(RunningAverage delegate) { - this.delegate = delegate; - } - - @Override - public void addDatum(double datum) { - throw new UnsupportedOperationException(); - } - - @Override - public void removeDatum(double datum) { - throw new UnsupportedOperationException(); - } - - @Override - public void changeDatum(double delta) { - throw new UnsupportedOperationException(); - } - - @Override - public int getCount() { - return delegate.getCount(); - } - - @Override - public double getAverage() { - return -delegate.getAverage(); - } - - @Override - public RunningAverage inverse() { - return delegate; - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java deleted file mode 100644 index 3b2b6d886..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
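A short sketch of the inverse() contract this class implements: the returned view negates the delegate's average, is read-only, and inverting twice hands back the original object.

RunningAverage avg = new FullRunningAverage();
avg.addDatum(3.0);
RunningAverage neg = avg.inverse();  // live view backed by avg
// neg.getAverage() == -3.0, and neg.inverse() == avg
// neg.addDatum(1.0) would throw UnsupportedOperationException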
- */ - -package org.apache.mahout.cf.taste.impl.common; - -public final class InvertedRunningAverageAndStdDev implements RunningAverageAndStdDev { - - private final RunningAverageAndStdDev delegate; - - public InvertedRunningAverageAndStdDev(RunningAverageAndStdDev delegate) { - this.delegate = delegate; - } - - @Override - public void addDatum(double datum) { - throw new UnsupportedOperationException(); - } - - @Override - public void removeDatum(double datum) { - throw new UnsupportedOperationException(); - } - - @Override - public void changeDatum(double delta) { - throw new UnsupportedOperationException(); - } - - @Override - public int getCount() { - return delegate.getCount(); - } - - @Override - public double getAverage() { - return -delegate.getAverage(); - } - - @Override - public double getStandardDeviation() { - return delegate.getStandardDeviation(); - } - - @Override - public RunningAverageAndStdDev inverse() { - return delegate; - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveArrayIterator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveArrayIterator.java deleted file mode 100644 index 96e317c43..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveArrayIterator.java +++ /dev/null @@ -1,93 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -import java.util.NoSuchElementException; - -import com.google.common.base.Preconditions; - -/** - * While long[] is an Iterable, it is not an Iterable<Long>. This adapter class addresses that. - */ -public final class LongPrimitiveArrayIterator implements LongPrimitiveIterator { - - private final long[] array; - private int position; - private final int max; - - /** - *
- * Creates an iterator over an entire array. - *
- * - * @param array - * array to iterate over - */ - public LongPrimitiveArrayIterator(long[] array) { - this.array = Preconditions.checkNotNull(array); // yeah, not going to copy the array here, for performance - this.position = 0; - this.max = array.length; - } - - @Override - public boolean hasNext() { - return position < max; - } - - @Override - public Long next() { - return nextLong(); - } - - @Override - public long nextLong() { - if (position >= array.length) { - throw new NoSuchElementException(); - } - return array[position++]; - } - - @Override - public long peek() { - if (position >= array.length) { - throw new NoSuchElementException(); - } - return array[position]; - } - - /** - * @throws UnsupportedOperationException - */ - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - @Override - public void skip(int n) { - if (n > 0) { - position += n; - } - } - - @Override - public String toString() { - return "LongPrimitiveArrayIterator"; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveIterator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveIterator.java deleted file mode 100644 index 7776361dc..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveIterator.java +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -/** - * Adds notion of iterating over {@code long} primitives in the style of an {@link java.util.Iterator} -- as - * opposed to iterating over {@link Long}. Implementations of this interface however also implement - * {@link java.util.Iterator} and {@link Iterable} over {@link Long} for convenience. 
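A short sketch of the boxing-free iteration idiom this interface enables, using the array-backed implementation above:

LongPrimitiveIterator it = new LongPrimitiveArrayIterator(new long[] {3L, 1L, 4L});
while (it.hasNext()) {
  long id = it.nextLong();  // primitive access; next() would box to Long
  System.out.println(id);
}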
- */ -public interface LongPrimitiveIterator extends SkippingIterator { - - /** - * @return next {@code long} in iteration - * @throws java.util.NoSuchElementException - * if no more elements exist in the iteration - */ - long nextLong(); - - /** - * @return next {@code long} in iteration without advancing iteration - */ - long peek(); - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RefreshHelper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RefreshHelper.java deleted file mode 100644 index 87e54993c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RefreshHelper.java +++ /dev/null @@ -1,122 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.locks.ReentrantLock; - -import com.google.common.collect.Lists; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A helper class for implementing {@link Refreshable}. This object is typically included in an implementation - * {@link Refreshable} to implement {@link Refreshable#refresh(Collection)}. It execute the class's own - * supplied update logic, after updating all the object's dependencies. This also ensures that dependencies - * are not updated multiple times. - */ -public final class RefreshHelper implements Refreshable { - - private static final Logger log = LoggerFactory.getLogger(RefreshHelper.class); - - private final List dependencies; - private final ReentrantLock refreshLock; - private final Callable refreshRunnable; - - /** - * @param refreshRunnable - * encapsulates the containing object's own refresh logic - */ - public RefreshHelper(Callable refreshRunnable) { - this.dependencies = Lists.newArrayListWithCapacity(3); - this.refreshLock = new ReentrantLock(); - this.refreshRunnable = refreshRunnable; - } - - /** Add a dependency to be refreshed first when the encapsulating object does. 
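A sketch of the delegation pattern the RefreshHelper javadoc above describes; the component and its rebuildCaches() method are hypothetical, and the Callable carries the component's own reload logic:

import java.util.Collection;
import java.util.concurrent.Callable;
import org.apache.mahout.cf.taste.common.Refreshable;
import org.apache.mahout.cf.taste.impl.common.RefreshHelper;

public final class CachingComponent implements Refreshable {  // hypothetical component

  private final RefreshHelper refreshHelper;

  public CachingComponent(Refreshable dataModel) {
    this.refreshHelper = new RefreshHelper(new Callable<Object>() {
      @Override
      public Object call() {
        rebuildCaches();  // this component's own refresh logic
        return null;
      }
    });
    refreshHelper.addDependency(dataModel);  // refreshed first, and at most once
  }

  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    refreshHelper.refresh(alreadyRefreshed);  // typically the entire method body
  }

  private void rebuildCaches() { /* rebuild whatever this component caches */ }
}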
*/ - public void addDependency(Refreshable refreshable) { - if (refreshable != null) { - dependencies.add(refreshable); - } - } - - public void removeDependency(Refreshable refreshable) { - if (refreshable != null) { - dependencies.remove(refreshable); - } - } - - /** - * Typically this is called in and is the entire body of - * that method. - */ - @Override - public void refresh(Collection alreadyRefreshed) { - if (refreshLock.tryLock()) { - try { - alreadyRefreshed = buildRefreshed(alreadyRefreshed); - for (Refreshable dependency : dependencies) { - maybeRefresh(alreadyRefreshed, dependency); - } - if (refreshRunnable != null) { - try { - refreshRunnable.call(); - } catch (Exception e) { - log.warn("Unexpected exception while refreshing", e); - } - } - } finally { - refreshLock.unlock(); - } - } - } - - /** - * Creates a new and empty {@link Collection} if the method parameter is {@code null}. - * - * @param currentAlreadyRefreshed - * {@link Refreshable}s to refresh later on - * @return an empty {@link Collection} if the method param was {@code null} or the unmodified method - * param. - */ - public static Collection buildRefreshed(Collection currentAlreadyRefreshed) { - return currentAlreadyRefreshed == null ? new HashSet(3) : currentAlreadyRefreshed; - } - - /** - * Adds the specified {@link Refreshable} to the given collection of {@link Refreshable}s if it is not - * already there and immediately refreshes it. - * - * @param alreadyRefreshed - * the collection of {@link Refreshable}s - * @param refreshable - * the {@link Refreshable} to potentially add and refresh - */ - public static void maybeRefresh(Collection alreadyRefreshed, Refreshable refreshable) { - if (!alreadyRefreshed.contains(refreshable)) { - alreadyRefreshed.add(refreshable); - log.info("Added refreshable: {}", refreshable); - refreshable.refresh(alreadyRefreshed); - log.info("Refreshed: {}", alreadyRefreshed); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/Retriever.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/Retriever.java deleted file mode 100644 index 40da9de62..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/Retriever.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -import org.apache.mahout.cf.taste.common.TasteException; - -/** - *
- * Implementations can retrieve a value for a given key. - *
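A sketch of a typical implementation of the interface declared just below, keyed by user ID and backed by a DataModel assumed to be in scope:

Retriever<Long, PreferenceArray> userPrefs = new Retriever<Long, PreferenceArray>() {
  @Override
  public PreferenceArray get(Long userID) throws TasteException {
    return dataModel.getPreferencesFromUser(userID);  // assumes a dataModel variable
  }
};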
- */ -public interface Retriever { - - /** - * @param key key for which a value should be retrieved - * @return value for key - * @throws TasteException if an error occurs while retrieving the value - */ - V get(K key) throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverage.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverage.java deleted file mode 100644 index bf8e39c62..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverage.java +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -/** - *
- * Interface for classes that can keep track of a running average of a series of numbers. One can add to or - * remove from the series, as well as update a datum in the series. The class does not actually keep track of - * the series of values, just its running average, so it doesn't even matter if you remove/change a value that - * wasn't added. - *
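A worked example of that contract, using the FullRunningAverage implementation from earlier in this diff:

RunningAverage avg = new FullRunningAverage();
avg.addDatum(1.0);
avg.addDatum(2.0);
avg.addDatum(3.0);     // average == 2.0, count == 3
avg.removeDatum(3.0);  // average == 1.5; the series itself is never stored
avg.changeDatum(1.0);  // shifts one datum by +1: average == 2.0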
- */ -public interface RunningAverage { - - /** - * @param datum - * new item to add to the running average - * @throws IllegalArgumentException - * if datum is {@link Double#NaN} - */ - void addDatum(double datum); - - /** - * @param datum - * item to remove to the running average - * @throws IllegalArgumentException - * if datum is {@link Double#NaN} - * @throws IllegalStateException - * if count is 0 - */ - void removeDatum(double datum); - - /** - * @param delta - * amount by which to change a datum in the running average - * @throws IllegalArgumentException - * if delta is {@link Double#NaN} - * @throws IllegalStateException - * if count is 0 - */ - void changeDatum(double delta); - - int getCount(); - - double getAverage(); - - /** - * @return a (possibly immutable) object whose average is the negative of this object's - */ - RunningAverage inverse(); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverageAndStdDev.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverageAndStdDev.java deleted file mode 100644 index 4ac610897..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverageAndStdDev.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -/** - *
- * Extends {@link RunningAverage} to also track the standard deviation of the series. - *
- */ -public interface RunningAverageAndStdDev extends RunningAverage { - - /** @return standard deviation of data */ - double getStandardDeviation(); - - /** - * @return a (possibly immutable) object whose average is the negative of this object's - */ - @Override - RunningAverageAndStdDev inverse(); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/SamplingLongPrimitiveIterator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/SamplingLongPrimitiveIterator.java deleted file mode 100644 index 390ab86a1..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/SamplingLongPrimitiveIterator.java +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -import java.util.NoSuchElementException; -import java.util.Random; - -import org.apache.mahout.common.RandomUtils; - -/** - * Wraps a {@link LongPrimitiveIterator} and returns only some subset of the elements that it would, - * as determined by a sampling rate parameter. 
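A usage sketch for the wrapper defined just below; maybeWrapIterator skips the wrapper entirely when the rate would keep every element. The dataModel and process() are assumed for illustration:

LongPrimitiveIterator all = dataModel.getUserIDs();
LongPrimitiveIterator sampled =
    SamplingLongPrimitiveIterator.maybeWrapIterator(all, 0.1);  // keep ~10% of IDs
while (sampled.hasNext()) {
  process(sampled.nextLong());  // hypothetical per-ID work
}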
- */ -public final class SamplingLongPrimitiveIterator extends AbstractLongPrimitiveIterator { - - private final Random random; - private final LongPrimitiveIterator delegate; - private final double samplingRate; - private long next; - private boolean hasNext; - - public SamplingLongPrimitiveIterator(LongPrimitiveIterator delegate, double samplingRate) { - random = RandomUtils.getRandom(); - this.delegate = delegate; - this.samplingRate = samplingRate; - this.hasNext = true; - doNext(); - } - - @Override - public boolean hasNext() { - return hasNext; - } - - @Override - public long nextLong() { - if (hasNext) { - long result = next; - doNext(); - return result; - } - throw new NoSuchElementException(); - } - - @Override - public long peek() { - if (hasNext) { - return next; - } - throw new NoSuchElementException(); - } - - private void doNext() { - int toSkip = 0; - while (random.nextDouble() >= samplingRate) { - toSkip++; - } - // Really, would be nicer to select value from geometric distribution, for small values of samplingRate - if (toSkip > 0) { - delegate.skip(toSkip); - } - if (delegate.hasNext()) { - next = delegate.next(); - } else { - hasNext = false; - } - } - - /** - * @throws UnsupportedOperationException - */ - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - @Override - public void skip(int n) { - delegate.skip((int) (n / samplingRate)); // Kind of an approximation, but this is expected skip - if (delegate.hasNext()) { - next = delegate.next(); - } else { - hasNext = false; - } - } - - public static LongPrimitiveIterator maybeWrapIterator(LongPrimitiveIterator delegate, double samplingRate) { - return samplingRate >= 1.0 ? delegate : new SamplingLongPrimitiveIterator(delegate, samplingRate); - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/SkippingIterator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/SkippingIterator.java deleted file mode 100644 index e88f98a49..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/SkippingIterator.java +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common; - -import java.util.Iterator; - -/** - * Adds ability to skip ahead in an iterator, perhaps more efficiently than by calling {@link #next()} - * repeatedly. 
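A small sketch of the skip() contract described above, using the array iterator from earlier in this diff; skipping past the end simply exhausts the iterator instead of throwing:

LongPrimitiveIterator it = new LongPrimitiveArrayIterator(new long[] {10L, 20L, 30L, 40L});
it.skip(2);                  // same effect as two next() calls
long third = it.nextLong();  // 30
it.skip(5);                  // past the end: no exception, iteration just ends
// it.hasNext() == false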
- */ -public interface SkippingIterator extends Iterator { - - /** - * Skip the next n elements supplied by this {@link Iterator}. If there are less than n elements remaining, - * this skips all remaining elements in the {@link Iterator}. This method has the same effect as calling - * {@link #next()} n times, except that it will never throw {@link java.util.NoSuchElementException}. - */ - void skip(int n); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverage.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverage.java deleted file mode 100644 index 78a32d458..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverage.java +++ /dev/null @@ -1,100 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.common; - -import java.io.Serializable; - -import com.google.common.base.Preconditions; - -public class WeightedRunningAverage implements RunningAverage, Serializable { - - private double totalWeight; - private double average; - - public WeightedRunningAverage() { - totalWeight = 0.0; - average = Double.NaN; - } - - @Override - public synchronized void addDatum(double datum) { - addDatum(datum, 1.0); - } - - public synchronized void addDatum(double datum, double weight) { - double oldTotalWeight = totalWeight; - totalWeight += weight; - if (oldTotalWeight <= 0.0) { - average = datum; - } else { - average = average * oldTotalWeight / totalWeight + datum * weight / totalWeight; - } - } - - @Override - public synchronized void removeDatum(double datum) { - removeDatum(datum, 1.0); - } - - public synchronized void removeDatum(double datum, double weight) { - double oldTotalWeight = totalWeight; - totalWeight -= weight; - if (totalWeight <= 0.0) { - average = Double.NaN; - totalWeight = 0.0; - } else { - average = average * oldTotalWeight / totalWeight - datum * weight / totalWeight; - } - } - - @Override - public synchronized void changeDatum(double delta) { - changeDatum(delta, 1.0); - } - - public synchronized void changeDatum(double delta, double weight) { - Preconditions.checkArgument(weight <= totalWeight); - average += delta * weight / totalWeight; - } - - public synchronized double getTotalWeight() { - return totalWeight; - } - - /** @return {@link #getTotalWeight()} */ - @Override - public synchronized int getCount() { - return (int) totalWeight; - } - - @Override - public synchronized double getAverage() { - return average; - } - - @Override - public RunningAverage inverse() { - return new InvertedRunningAverage(this); - } - - @Override - public synchronized String toString() { - return String.valueOf(average); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverageAndStdDev.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverageAndStdDev.java deleted file mode 100644 index bed5812b2..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverageAndStdDev.java +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
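A worked example for the class above; weights act as fractional counts in the weighted-mean update:

WeightedRunningAverage avg = new WeightedRunningAverage();
avg.addDatum(1.0, 3.0);     // total weight 3.0, average 1.0
avg.addDatum(5.0, 1.0);     // (1.0*3.0 + 5.0*1.0) / 4.0 == 2.0
avg.removeDatum(5.0, 1.0);  // back to average 1.0, total weight 3.0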
- */ - -package org.apache.mahout.cf.taste.impl.common; - -/** - * This subclass also provides for a weighted estimate of the sample standard deviation. - * See estimate formulae here. - */ -public final class WeightedRunningAverageAndStdDev extends WeightedRunningAverage implements RunningAverageAndStdDev { - - private double totalSquaredWeight; - private double totalWeightedData; - private double totalWeightedSquaredData; - - public WeightedRunningAverageAndStdDev() { - totalSquaredWeight = 0.0; - totalWeightedData = 0.0; - totalWeightedSquaredData = 0.0; - } - - @Override - public synchronized void addDatum(double datum, double weight) { - super.addDatum(datum, weight); - totalSquaredWeight += weight * weight; - double weightedData = datum * weight; - totalWeightedData += weightedData; - totalWeightedSquaredData += weightedData * datum; - } - - @Override - public synchronized void removeDatum(double datum, double weight) { - super.removeDatum(datum, weight); - totalSquaredWeight -= weight * weight; - if (totalSquaredWeight <= 0.0) { - totalSquaredWeight = 0.0; - } - double weightedData = datum * weight; - totalWeightedData -= weightedData; - if (totalWeightedData <= 0.0) { - totalWeightedData = 0.0; - } - totalWeightedSquaredData -= weightedData * datum; - if (totalWeightedSquaredData <= 0.0) { - totalWeightedSquaredData = 0.0; - } - } - - /** - * @throws UnsupportedOperationException - */ - @Override - public synchronized void changeDatum(double delta, double weight) { - throw new UnsupportedOperationException(); - } - - - @Override - public synchronized double getStandardDeviation() { - double totalWeight = getTotalWeight(); - return Math.sqrt((totalWeightedSquaredData * totalWeight - totalWeightedData * totalWeightedData) - / (totalWeight * totalWeight - totalSquaredWeight)); - } - - @Override - public RunningAverageAndStdDev inverse() { - return new InvertedRunningAverageAndStdDev(this); - } - - @Override - public synchronized String toString() { - return String.valueOf(String.valueOf(getAverage()) + ',' + getStandardDeviation()); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/AbstractJDBCComponent.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/AbstractJDBCComponent.java deleted file mode 100644 index d1e93abe2..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/AbstractJDBCComponent.java +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common.jdbc; - -import javax.naming.Context; -import javax.naming.InitialContext; -import javax.naming.NamingException; -import javax.sql.DataSource; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - * A helper class with common elements for several JDBC-related components. - */ -public abstract class AbstractJDBCComponent { - - private static final Logger log = LoggerFactory.getLogger(AbstractJDBCComponent.class); - - private static final int DEFAULT_FETCH_SIZE = 1000; // A max, "big" number of rows to buffer at once - protected static final String DEFAULT_DATASOURCE_NAME = "jdbc/taste"; - - protected static void checkNotNullAndLog(String argName, Object value) { - Preconditions.checkArgument(value != null && !value.toString().isEmpty(), - argName + " is null or empty"); - log.debug("{}: {}", argName, value); - } - - protected static void checkNotNullAndLog(String argName, Object[] values) { - Preconditions.checkArgument(values != null && values.length != 0, argName + " is null or zero-length"); - for (Object value : values) { - checkNotNullAndLog(argName, value); - } - } - - /** - *
- * Looks up a {@link DataSource} by name from JNDI. "java:comp/env/" is prepended to the argument before - * looking up the name in JNDI. - *
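A minimal sketch, assuming the container has bound a pool under the default name these components use:

// Resolved internally as "java:comp/env/jdbc/taste"; throws TasteException on JNDI errors.
DataSource dataSource = AbstractJDBCComponent.lookupDataSource("jdbc/taste");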
- * - * @param dataSourceName - * JNDI name where a {@link DataSource} is bound (e.g. "jdbc/taste") - * @return {@link DataSource} under that JNDI name - * @throws TasteException - * if a JNDI error occurs - */ - public static DataSource lookupDataSource(String dataSourceName) throws TasteException { - Context context = null; - try { - context = new InitialContext(); - return (DataSource) context.lookup("java:comp/env/" + dataSourceName); - } catch (NamingException ne) { - throw new TasteException(ne); - } finally { - if (context != null) { - try { - context.close(); - } catch (NamingException ne) { - log.warn("Error while closing Context; continuing...", ne); - } - } - } - } - - protected int getFetchSize() { - return DEFAULT_FETCH_SIZE; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/EachRowIterator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/EachRowIterator.java deleted file mode 100644 index 3f024bc1d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/EachRowIterator.java +++ /dev/null @@ -1,92 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.common.jdbc; - -import javax.sql.DataSource; -import java.io.Closeable; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; - -import com.google.common.collect.AbstractIterator; -import org.apache.mahout.common.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Provides an {@link java.util.Iterator} over the result of an SQL query, as an iteration over the {@link ResultSet}. - * While the same object will be returned from the iteration each time, it will be returned once for each row - * of the result. 
- */ -final class EachRowIterator extends AbstractIterator implements Closeable { - - private static final Logger log = LoggerFactory.getLogger(EachRowIterator.class); - - private final Connection connection; - private final PreparedStatement statement; - private final ResultSet resultSet; - - EachRowIterator(DataSource dataSource, String sqlQuery) throws SQLException { - try { - connection = dataSource.getConnection(); - statement = connection.prepareStatement(sqlQuery, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY); - statement.setFetchDirection(ResultSet.FETCH_FORWARD); - //statement.setFetchSize(getFetchSize()); - log.debug("Executing SQL query: {}", sqlQuery); - resultSet = statement.executeQuery(); - } catch (SQLException sqle) { - close(); - throw sqle; - } - } - - @Override - protected ResultSet computeNext() { - try { - if (resultSet.next()) { - return resultSet; - } else { - close(); - return null; - } - } catch (SQLException sqle) { - close(); - throw new IllegalStateException(sqle); - } - } - - public void skip(int n) throws SQLException { - try { - resultSet.relative(n); - } catch (SQLException sqle) { - // Can't use relative on MySQL Connector/J; try advancing manually - int i = 0; - while (i < n && resultSet.next()) { - i++; - } - } - } - - @Override - public void close() { - IOUtils.quietClose(resultSet, statement, connection); - endOfData(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/ResultSetIterator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/ResultSetIterator.java deleted file mode 100644 index 09135d00a..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/ResultSetIterator.java +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.common.jdbc; - -import javax.sql.DataSource; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.Iterator; - -import com.google.common.base.Function; -import com.google.common.collect.ForwardingIterator; -import com.google.common.collect.Iterators; - -public abstract class ResultSetIterator extends ForwardingIterator { - - private final Iterator delegate; - private final EachRowIterator rowDelegate; - - protected ResultSetIterator(DataSource dataSource, String sqlQuery) throws SQLException { - this.rowDelegate = new EachRowIterator(dataSource, sqlQuery); - delegate = Iterators.transform(rowDelegate, - new Function() { - @Override - public T apply(ResultSet from) { - try { - return parseElement(from); - } catch (SQLException sqle) { - throw new IllegalStateException(sqle); - } - } - }); - } - - @Override - protected Iterator delegate() { - return delegate; - } - - protected abstract T parseElement(ResultSet resultSet) throws SQLException; - - public void skip(int n) { - if (n >= 1) { - try { - rowDelegate.skip(n); - } catch (SQLException sqle) { - throw new IllegalStateException(sqle); - } - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/AbstractDifferenceRecommenderEvaluator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/AbstractDifferenceRecommenderEvaluator.java deleted file mode 100644 index 96066d257..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/AbstractDifferenceRecommenderEvaluator.java +++ /dev/null @@ -1,269 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
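A sketch of a concrete subclass of the class above, mapping each row to a user ID; the table and column names are hypothetical:

final class UserIDResultSetIterator extends ResultSetIterator<Long> {

  UserIDResultSetIterator(DataSource dataSource) throws SQLException {
    super(dataSource, "SELECT user_id FROM taste_preferences");  // hypothetical schema
  }

  @Override
  protected Long parseElement(ResultSet resultSet) throws SQLException {
    return resultSet.getLong(1);  // one element per row
  }
}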
- */ - -package org.apache.mahout.cf.taste.impl.eval; - -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.atomic.AtomicInteger; - -import com.google.common.collect.Lists; -import org.apache.mahout.cf.taste.common.NoSuchItemException; -import org.apache.mahout.cf.taste.common.NoSuchUserException; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.eval.DataModelBuilder; -import org.apache.mahout.cf.taste.eval.RecommenderBuilder; -import org.apache.mahout.cf.taste.eval.RecommenderEvaluator; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev; -import org.apache.mahout.cf.taste.impl.model.GenericDataModel; -import org.apache.mahout.cf.taste.impl.model.GenericPreference; -import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.Recommender; -import org.apache.mahout.common.RandomUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - * Abstract superclass of a couple implementations, providing shared functionality. - */ -public abstract class AbstractDifferenceRecommenderEvaluator implements RecommenderEvaluator { - - private static final Logger log = LoggerFactory.getLogger(AbstractDifferenceRecommenderEvaluator.class); - - private final Random random; - private float maxPreference; - private float minPreference; - - protected AbstractDifferenceRecommenderEvaluator() { - random = RandomUtils.getRandom(); - maxPreference = Float.NaN; - minPreference = Float.NaN; - } - - @Override - public final float getMaxPreference() { - return maxPreference; - } - - @Override - public final void setMaxPreference(float maxPreference) { - this.maxPreference = maxPreference; - } - - @Override - public final float getMinPreference() { - return minPreference; - } - - @Override - public final void setMinPreference(float minPreference) { - this.minPreference = minPreference; - } - - @Override - public double evaluate(RecommenderBuilder recommenderBuilder, - DataModelBuilder dataModelBuilder, - DataModel dataModel, - double trainingPercentage, - double evaluationPercentage) throws TasteException { - Preconditions.checkNotNull(recommenderBuilder); - Preconditions.checkNotNull(dataModel); - Preconditions.checkArgument(trainingPercentage >= 0.0 && trainingPercentage <= 1.0, - "Invalid trainingPercentage: " + trainingPercentage); - Preconditions.checkArgument(evaluationPercentage >= 0.0 && evaluationPercentage <= 1.0, - "Invalid evaluationPercentage: " + evaluationPercentage); - - log.info("Beginning evaluation using {} of {}", trainingPercentage, dataModel); - - int numUsers = dataModel.getNumUsers(); - FastByIDMap trainingPrefs = new FastByIDMap( - 1 + (int) (evaluationPercentage * numUsers)); - FastByIDMap testPrefs = new FastByIDMap( - 1 + (int) (evaluationPercentage * 
numUsers)); - - LongPrimitiveIterator it = dataModel.getUserIDs(); - while (it.hasNext()) { - long userID = it.nextLong(); - if (random.nextDouble() < evaluationPercentage) { - splitOneUsersPrefs(trainingPercentage, trainingPrefs, testPrefs, userID, dataModel); - } - } - - DataModel trainingModel = dataModelBuilder == null ? new GenericDataModel(trainingPrefs) - : dataModelBuilder.buildDataModel(trainingPrefs); - - Recommender recommender = recommenderBuilder.buildRecommender(trainingModel); - - double result = getEvaluation(testPrefs, recommender); - log.info("Evaluation result: {}", result); - return result; - } - - private void splitOneUsersPrefs(double trainingPercentage, - FastByIDMap trainingPrefs, - FastByIDMap testPrefs, - long userID, - DataModel dataModel) throws TasteException { - List oneUserTrainingPrefs = null; - List oneUserTestPrefs = null; - PreferenceArray prefs = dataModel.getPreferencesFromUser(userID); - int size = prefs.length(); - for (int i = 0; i < size; i++) { - Preference newPref = new GenericPreference(userID, prefs.getItemID(i), prefs.getValue(i)); - if (random.nextDouble() < trainingPercentage) { - if (oneUserTrainingPrefs == null) { - oneUserTrainingPrefs = Lists.newArrayListWithCapacity(3); - } - oneUserTrainingPrefs.add(newPref); - } else { - if (oneUserTestPrefs == null) { - oneUserTestPrefs = Lists.newArrayListWithCapacity(3); - } - oneUserTestPrefs.add(newPref); - } - } - if (oneUserTrainingPrefs != null) { - trainingPrefs.put(userID, new GenericUserPreferenceArray(oneUserTrainingPrefs)); - if (oneUserTestPrefs != null) { - testPrefs.put(userID, new GenericUserPreferenceArray(oneUserTestPrefs)); - } - } - } - - private float capEstimatedPreference(float estimate) { - if (estimate > maxPreference) { - return maxPreference; - } - if (estimate < minPreference) { - return minPreference; - } - return estimate; - } - - private double getEvaluation(FastByIDMap testPrefs, Recommender recommender) - throws TasteException { - reset(); - Collection> estimateCallables = Lists.newArrayList(); - AtomicInteger noEstimateCounter = new AtomicInteger(); - for (Map.Entry entry : testPrefs.entrySet()) { - estimateCallables.add( - new PreferenceEstimateCallable(recommender, entry.getKey(), entry.getValue(), noEstimateCounter)); - } - log.info("Beginning evaluation of {} users", estimateCallables.size()); - RunningAverageAndStdDev timing = new FullRunningAverageAndStdDev(); - execute(estimateCallables, noEstimateCounter, timing); - return computeFinalEvaluation(); - } - - protected static void execute(Collection> callables, - AtomicInteger noEstimateCounter, - RunningAverageAndStdDev timing) throws TasteException { - - callables = wrapWithStatsCallables(callables, noEstimateCounter, timing); - int numProcessors = Runtime.getRuntime().availableProcessors(); - ExecutorService executor = Executors.newFixedThreadPool(numProcessors); - log.info("Starting timing of {} tasks in {} threads", callables.size(), numProcessors); - try { - List> futures = executor.invokeAll(callables); - // Go look for exceptions here, really - for (Future future : futures) { - future.get(); - } - } catch (InterruptedException ie) { - throw new TasteException(ie); - } catch (ExecutionException ee) { - throw new TasteException(ee.getCause()); - } - executor.shutdown(); - } - - private static Collection> wrapWithStatsCallables(Iterable> callables, - AtomicInteger noEstimateCounter, - RunningAverageAndStdDev timing) { - Collection> wrapped = Lists.newArrayList(); - int count = 0; - for (Callable callable : 
callables) { - boolean logStats = count++ % 1000 == 0; // log every 1000 or so iterations - wrapped.add(new StatsCallable(callable, logStats, timing, noEstimateCounter)); - } - return wrapped; - } - - protected abstract void reset(); - - protected abstract void processOneEstimate(float estimatedPreference, Preference realPref); - - protected abstract double computeFinalEvaluation(); - - public final class PreferenceEstimateCallable implements Callable { - - private final Recommender recommender; - private final long testUserID; - private final PreferenceArray prefs; - private final AtomicInteger noEstimateCounter; - - public PreferenceEstimateCallable(Recommender recommender, - long testUserID, - PreferenceArray prefs, - AtomicInteger noEstimateCounter) { - this.recommender = recommender; - this.testUserID = testUserID; - this.prefs = prefs; - this.noEstimateCounter = noEstimateCounter; - } - - @Override - public Void call() throws TasteException { - for (Preference realPref : prefs) { - float estimatedPreference = Float.NaN; - try { - estimatedPreference = recommender.estimatePreference(testUserID, realPref.getItemID()); - } catch (NoSuchUserException nsue) { - // It's possible that an item exists in the test data but not training data in which case - // NSEE will be thrown. Just ignore it and move on. - log.info("User exists in test data but not training data: {}", testUserID); - } catch (NoSuchItemException nsie) { - log.info("Item exists in test data but not training data: {}", realPref.getItemID()); - } - if (Float.isNaN(estimatedPreference)) { - noEstimateCounter.incrementAndGet(); - } else { - estimatedPreference = capEstimatedPreference(estimatedPreference); - processOneEstimate(estimatedPreference, realPref); - } - } - return null; - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/AverageAbsoluteDifferenceRecommenderEvaluator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/AverageAbsoluteDifferenceRecommenderEvaluator.java deleted file mode 100644 index 4dad0400d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/AverageAbsoluteDifferenceRecommenderEvaluator.java +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
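A usage sketch for the evaluate() method above, paired with the concrete subclass defined next in this diff; recommenderBuilder and dataModel are assumed to exist, and the call throws TasteException:

RecommenderEvaluator evaluator = new AverageAbsoluteDifferenceRecommenderEvaluator();
double score = evaluator.evaluate(
    recommenderBuilder,  // assumed: builds the Recommender under test
    null,                // null => a GenericDataModel is built over the training split
    dataModel,
    0.7,                 // trainingPercentage: ~70% of each sampled user's prefs train
    0.1);                // evaluationPercentage: evaluate on ~10% of users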
- */ - -package org.apache.mahout.cf.taste.impl.eval; - -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.model.Preference; - -/** - *
- * A {@link org.apache.mahout.cf.taste.eval.RecommenderEvaluator} which computes the average absolute - * difference between predicted and actual ratings for users. - *
- * - *
- * This algorithm is also called "mean absolute error". - *
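As a worked instance: with estimates 3.5 and 3.0 against actual ratings 4.0 and 2.0, the absolute differences are 0.5 and 1.0, so the evaluator's score is their mean, 0.75 (lower is better).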
- */ -public final class AverageAbsoluteDifferenceRecommenderEvaluator extends - AbstractDifferenceRecommenderEvaluator { - - private RunningAverage average; - - @Override - protected void reset() { - average = new FullRunningAverage(); - } - - @Override - protected void processOneEstimate(float estimatedPreference, Preference realPref) { - average.addDatum(Math.abs(realPref.getValue() - estimatedPreference)); - } - - @Override - protected double computeFinalEvaluation() { - return average.getAverage(); - } - - @Override - public String toString() { - return "AverageAbsoluteDifferenceRecommenderEvaluator"; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRecommenderIRStatsEvaluator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRecommenderIRStatsEvaluator.java deleted file mode 100644 index 00996edfc..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRecommenderIRStatsEvaluator.java +++ /dev/null @@ -1,237 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.eval; - -import java.util.List; -import java.util.Random; - -import org.apache.mahout.cf.taste.common.NoSuchUserException; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.eval.DataModelBuilder; -import org.apache.mahout.cf.taste.eval.IRStatistics; -import org.apache.mahout.cf.taste.eval.RecommenderBuilder; -import org.apache.mahout.cf.taste.eval.RecommenderIRStatsEvaluator; -import org.apache.mahout.cf.taste.eval.RelevantItemsDataSplitter; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev; -import org.apache.mahout.cf.taste.impl.model.GenericDataModel; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.cf.taste.recommender.Recommender; -import org.apache.mahout.common.RandomUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *
<p> - * For each user, this implementation determines the top {@code n} preferences, then evaluates the IR - * statistics based on a {@link DataModel} that does not have these values. This number {@code n} is the - * "at" value, as in "precision at 5". For example, this would mean precision evaluated by removing the top 5 - * preferences for a user and then finding the percentage of those 5 items included in the top 5 - * recommendations for that user. - * </p>
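A hedged usage sketch at cut-off 5, assuming a RecommenderBuilder and DataModel wired as in the MAE sketch above:

```java
import org.apache.mahout.cf.taste.eval.IRStatistics;
import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
import org.apache.mahout.cf.taste.eval.RecommenderIRStatsEvaluator;
import org.apache.mahout.cf.taste.impl.eval.GenericRecommenderIRStatsEvaluator;
import org.apache.mahout.cf.taste.model.DataModel;

public class IRStatsExample {
  static void printStats(RecommenderBuilder builder, DataModel model) throws Exception {
    RecommenderIRStatsEvaluator evaluator = new GenericRecommenderIRStatsEvaluator();
    IRStatistics stats = evaluator.evaluate(
        builder,
        null,  // null builder: training data goes into a default GenericDataModel
        model,
        null,  // no IDRescorer
        5,     // "at" = 5: hold out each user's top 5 prefs, recommend 5
        GenericRecommenderIRStatsEvaluator.CHOOSE_THRESHOLD, // derive threshold per user
        1.0);  // evaluate 100% of users
    System.out.println("precision@5 = " + stats.getPrecision());
    System.out.println("recall@5 = " + stats.getRecall());
    System.out.println("nDCG = " + stats.getNormalizedDiscountedCumulativeGain());
    System.out.println("reach = " + stats.getReach());
  }
}
```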
- */ -public final class GenericRecommenderIRStatsEvaluator implements RecommenderIRStatsEvaluator { - - private static final Logger log = LoggerFactory.getLogger(GenericRecommenderIRStatsEvaluator.class); - - private static final double LOG2 = Math.log(2.0); - - /** - * Pass as "relevanceThreshold" argument to - * {@link #evaluate(RecommenderBuilder, DataModelBuilder, DataModel, IDRescorer, int, double, double)} to - * have it attempt to compute a reasonable threshold. Note that this will impact performance. - */ - public static final double CHOOSE_THRESHOLD = Double.NaN; - - private final Random random; - private final RelevantItemsDataSplitter dataSplitter; - - public GenericRecommenderIRStatsEvaluator() { - this(new GenericRelevantItemsDataSplitter()); - } - - public GenericRecommenderIRStatsEvaluator(RelevantItemsDataSplitter dataSplitter) { - Preconditions.checkNotNull(dataSplitter); - random = RandomUtils.getRandom(); - this.dataSplitter = dataSplitter; - } - - @Override - public IRStatistics evaluate(RecommenderBuilder recommenderBuilder, - DataModelBuilder dataModelBuilder, - DataModel dataModel, - IDRescorer rescorer, - int at, - double relevanceThreshold, - double evaluationPercentage) throws TasteException { - - Preconditions.checkArgument(recommenderBuilder != null, "recommenderBuilder is null"); - Preconditions.checkArgument(dataModel != null, "dataModel is null"); - Preconditions.checkArgument(at >= 1, "at must be at least 1"); - Preconditions.checkArgument(evaluationPercentage > 0.0 && evaluationPercentage <= 1.0, - "Invalid evaluationPercentage: %s", evaluationPercentage); - - int numItems = dataModel.getNumItems(); - RunningAverage precision = new FullRunningAverage(); - RunningAverage recall = new FullRunningAverage(); - RunningAverage fallOut = new FullRunningAverage(); - RunningAverage nDCG = new FullRunningAverage(); - int numUsersRecommendedFor = 0; - int numUsersWithRecommendations = 0; - - LongPrimitiveIterator it = dataModel.getUserIDs(); - while (it.hasNext()) { - - long userID = it.nextLong(); - - if (random.nextDouble() >= evaluationPercentage) { - // Skipped - continue; - } - - long start = System.currentTimeMillis(); - - PreferenceArray prefs = dataModel.getPreferencesFromUser(userID); - - // List some most-preferred items that would count as (most) "relevant" results - double theRelevanceThreshold = Double.isNaN(relevanceThreshold) ? computeThreshold(prefs) : relevanceThreshold; - FastIDSet relevantItemIDs = dataSplitter.getRelevantItemsIDs(userID, at, theRelevanceThreshold, dataModel); - - int numRelevantItems = relevantItemIDs.size(); - if (numRelevantItems <= 0) { - continue; - } - - FastByIDMap trainingUsers = new FastByIDMap(dataModel.getNumUsers()); - LongPrimitiveIterator it2 = dataModel.getUserIDs(); - while (it2.hasNext()) { - dataSplitter.processOtherUser(userID, relevantItemIDs, trainingUsers, it2.nextLong(), dataModel); - } - - DataModel trainingModel = dataModelBuilder == null ? 
new GenericDataModel(trainingUsers) - : dataModelBuilder.buildDataModel(trainingUsers); - try { - trainingModel.getPreferencesFromUser(userID); - } catch (NoSuchUserException nsee) { - continue; // Oops we excluded all prefs for the user -- just move on - } - - int size = relevantItemIDs.size() + trainingModel.getItemIDsFromUser(userID).size(); - if (size < 2 * at) { - // Really not enough prefs to meaningfully evaluate this user - continue; - } - - Recommender recommender = recommenderBuilder.buildRecommender(trainingModel); - - int intersectionSize = 0; - List recommendedItems = recommender.recommend(userID, at, rescorer); - for (RecommendedItem recommendedItem : recommendedItems) { - if (relevantItemIDs.contains(recommendedItem.getItemID())) { - intersectionSize++; - } - } - - int numRecommendedItems = recommendedItems.size(); - - // Precision - if (numRecommendedItems > 0) { - precision.addDatum((double) intersectionSize / (double) numRecommendedItems); - } - - // Recall - recall.addDatum((double) intersectionSize / (double) numRelevantItems); - - // Fall-out - if (numRelevantItems < size) { - fallOut.addDatum((double) (numRecommendedItems - intersectionSize) - / (double) (numItems - numRelevantItems)); - } - - // nDCG - // In computing, assume relevant IDs have relevance 1 and others 0 - double cumulativeGain = 0.0; - double idealizedGain = 0.0; - for (int i = 0; i < recommendedItems.size(); i++) { - RecommendedItem item = recommendedItems.get(i); - double discount = i == 0 ? 1.0 : 1.0 / log2(i + 1); - if (relevantItemIDs.contains(item.getItemID())) { - cumulativeGain += discount; - } - // otherwise we're multiplying discount by relevance 0 so it doesn't do anything - - // Ideally results would be ordered with all relevant ones first, so this theoretical - // ideal list starts with number of relevant items equal to the total number of relevant items - if (i < relevantItemIDs.size()) { - idealizedGain += discount; - } - } - nDCG.addDatum(cumulativeGain / idealizedGain); - - // Reach - numUsersRecommendedFor++; - if (numRecommendedItems > 0) { - numUsersWithRecommendations++; - } - - long end = System.currentTimeMillis(); - - log.info("Evaluated with user {} in {}ms", userID, end - start); - log.info("Precision/recall/fall-out/nDCG: {} / {} / {} / {}", new Object[] { - precision.getAverage(), recall.getAverage(), fallOut.getAverage(), nDCG.getAverage() - }); - } - - double reach = (double) numUsersWithRecommendations / (double) numUsersRecommendedFor; - - return new IRStatisticsImpl( - precision.getAverage(), - recall.getAverage(), - fallOut.getAverage(), - nDCG.getAverage(), - reach); - } - - private static double computeThreshold(PreferenceArray prefs) { - if (prefs.length() < 2) { - // Not enough data points -- return a threshold that allows everything - return Double.NEGATIVE_INFINITY; - } - RunningAverageAndStdDev stdDev = new FullRunningAverageAndStdDev(); - int size = prefs.length(); - for (int i = 0; i < size; i++) { - stdDev.addDatum(prefs.getValue(i)); - } - return stdDev.getAverage() + stdDev.getStandardDeviation(); - } - - private static double log2(double value) { - return Math.log(value) / LOG2; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRelevantItemsDataSplitter.java 
b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRelevantItemsDataSplitter.java deleted file mode 100644 index fb4858c40..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRelevantItemsDataSplitter.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.eval; - -import com.google.common.collect.Lists; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.eval.RelevantItemsDataSplitter; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; - -import java.util.Iterator; -import java.util.List; - -/** - * Picks relevant items to be those with the strongest preference, and - * includes the other users' preferences in full. 
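A small sketch of calling the splitter directly; the user ID 123, cut-off 5, and threshold 4.0 are illustrative values:

```java
import org.apache.mahout.cf.taste.eval.RelevantItemsDataSplitter;
import org.apache.mahout.cf.taste.impl.common.FastIDSet;
import org.apache.mahout.cf.taste.impl.eval.GenericRelevantItemsDataSplitter;
import org.apache.mahout.cf.taste.model.DataModel;

public class SplitterExample {
  // Returns up to 5 of user 123's item IDs whose preference value is >= 4.0;
  // these become the held-out "relevant" set during IR evaluation.
  static FastIDSet relevantForUser123(DataModel model) throws Exception {
    RelevantItemsDataSplitter splitter = new GenericRelevantItemsDataSplitter();
    return splitter.getRelevantItemsIDs(123L, 5, 4.0, model);
  }
}
```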
- */ -public final class GenericRelevantItemsDataSplitter implements RelevantItemsDataSplitter { - - @Override - public FastIDSet getRelevantItemsIDs(long userID, - int at, - double relevanceThreshold, - DataModel dataModel) throws TasteException { - PreferenceArray prefs = dataModel.getPreferencesFromUser(userID); - FastIDSet relevantItemIDs = new FastIDSet(at); - prefs.sortByValueReversed(); - for (int i = 0; i < prefs.length() && relevantItemIDs.size() < at; i++) { - if (prefs.getValue(i) >= relevanceThreshold) { - relevantItemIDs.add(prefs.getItemID(i)); - } - } - return relevantItemIDs; - } - - @Override - public void processOtherUser(long userID, - FastIDSet relevantItemIDs, - FastByIDMap trainingUsers, - long otherUserID, - DataModel dataModel) throws TasteException { - PreferenceArray prefs2Array = dataModel.getPreferencesFromUser(otherUserID); - // If we're dealing with the very user that we're evaluating for precision/recall, - if (userID == otherUserID) { - // then must remove all the test IDs, the "relevant" item IDs - List prefs2 = Lists.newArrayListWithCapacity(prefs2Array.length()); - for (Preference pref : prefs2Array) { - prefs2.add(pref); - } - for (Iterator iterator = prefs2.iterator(); iterator.hasNext(); ) { - Preference pref = iterator.next(); - if (relevantItemIDs.contains(pref.getItemID())) { - iterator.remove(); - } - } - if (!prefs2.isEmpty()) { - trainingUsers.put(otherUserID, new GenericUserPreferenceArray(prefs2)); - } - } else { - // otherwise just add all those other user's prefs - trainingUsers.put(otherUserID, prefs2Array); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/IRStatisticsImpl.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/IRStatisticsImpl.java deleted file mode 100644 index 920935ebb..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/IRStatisticsImpl.java +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.eval; - -import java.io.Serializable; - -import org.apache.mahout.cf.taste.eval.IRStatistics; - -import com.google.common.base.Preconditions; - -public final class IRStatisticsImpl implements IRStatistics, Serializable { - - private final double precision; - private final double recall; - private final double fallOut; - private final double ndcg; - private final double reach; - - IRStatisticsImpl(double precision, double recall, double fallOut, double ndcg, double reach) { - Preconditions.checkArgument(precision >= 0.0 && precision <= 1.0, "Illegal precision: " + precision); - Preconditions.checkArgument(recall >= 0.0 && recall <= 1.0, "Illegal recall: " + recall); - Preconditions.checkArgument(fallOut >= 0.0 && fallOut <= 1.0, "Illegal fallOut: " + fallOut); - Preconditions.checkArgument(ndcg >= 0.0 && ndcg <= 1.0, "Illegal nDCG: " + ndcg); - Preconditions.checkArgument(reach >= 0.0 && reach <= 1.0, "Illegal reach: " + reach); - this.precision = precision; - this.recall = recall; - this.fallOut = fallOut; - this.ndcg = ndcg; - this.reach = reach; - } - - @Override - public double getPrecision() { - return precision; - } - - @Override - public double getRecall() { - return recall; - } - - @Override - public double getFallOut() { - return fallOut; - } - - @Override - public double getF1Measure() { - return getFNMeasure(1.0); - } - - @Override - public double getFNMeasure(double b) { - double b2 = b * b; - double sum = b2 * precision + recall; - return sum == 0.0 ? Double.NaN : (1.0 + b2) * precision * recall / sum; - } - - @Override - public double getNormalizedDiscountedCumulativeGain() { - return ndcg; - } - - @Override - public double getReach() { - return reach; - } - - @Override - public String toString() { - return "IRStatisticsImpl[precision:" + precision + ",recall:" + recall + ",fallOut:" - + fallOut + ",nDCG:" + ndcg + ",reach:" + reach + ']'; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadCallable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadCallable.java deleted file mode 100644 index 213f7f960..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadCallable.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.eval; - -import org.apache.mahout.cf.taste.recommender.Recommender; - -import java.util.concurrent.Callable; - -final class LoadCallable implements Callable { - - private final Recommender recommender; - private final long userID; - - LoadCallable(Recommender recommender, long userID) { - this.recommender = recommender; - this.userID = userID; - } - - @Override - public Void call() throws Exception { - recommender.recommend(userID, 10); - return null; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadEvaluator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadEvaluator.java deleted file mode 100644 index b606047cc..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadEvaluator.java +++ /dev/null @@ -1,61 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.eval; - -import java.util.Collection; -import java.util.concurrent.Callable; -import java.util.concurrent.atomic.AtomicInteger; - -import com.google.common.collect.Lists; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev; -import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.recommender.Recommender; - -/** - * Simple helper class for running load on a Recommender. 
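A minimal sketch of driving a load test, assuming an already-constructed Recommender:

```java
import org.apache.mahout.cf.taste.impl.eval.LoadEvaluator;
import org.apache.mahout.cf.taste.impl.eval.LoadStatistics;
import org.apache.mahout.cf.taste.recommender.Recommender;

public class LoadExample {
  // Samples roughly 1000 users, asks for 10 recommendations each, and reports
  // the running average time per recommend() call.
  static void measure(Recommender recommender) throws Exception {
    LoadStatistics stats = LoadEvaluator.runLoad(recommender);
    System.out.println("mean ms per recommendation: " + stats.getTiming().getAverage());
  }
}
```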
- */ -public final class LoadEvaluator { - - private LoadEvaluator() { } - - public static LoadStatistics runLoad(Recommender recommender) throws TasteException { - return runLoad(recommender, 10); - } - - public static LoadStatistics runLoad(Recommender recommender, int howMany) throws TasteException { - DataModel dataModel = recommender.getDataModel(); - int numUsers = dataModel.getNumUsers(); - double sampleRate = 1000.0 / numUsers; - LongPrimitiveIterator userSampler = - SamplingLongPrimitiveIterator.maybeWrapIterator(dataModel.getUserIDs(), sampleRate); - recommender.recommend(userSampler.next(), howMany); // Warm up - Collection> callables = Lists.newArrayList(); - while (userSampler.hasNext()) { - callables.add(new LoadCallable(recommender, userSampler.next())); - } - AtomicInteger noEstimateCounter = new AtomicInteger(); - RunningAverageAndStdDev timing = new FullRunningAverageAndStdDev(); - AbstractDifferenceRecommenderEvaluator.execute(callables, noEstimateCounter, timing); - return new LoadStatistics(timing); - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadStatistics.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadStatistics.java deleted file mode 100644 index f89160c23..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadStatistics.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.eval; - -import org.apache.mahout.cf.taste.impl.common.RunningAverage; - -public final class LoadStatistics { - - private final RunningAverage timing; - - LoadStatistics(RunningAverage timing) { - this.timing = timing; - } - - public RunningAverage getTiming() { - return timing; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/OrderBasedRecommenderEvaluator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/OrderBasedRecommenderEvaluator.java deleted file mode 100644 index 00a8b2fc9..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/OrderBasedRecommenderEvaluator.java +++ /dev/null @@ -1,431 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.eval; - -import java.util.Arrays; -import java.util.List; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.cf.taste.recommender.Recommender; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Evaluate recommender by comparing order of all raw prefs with order in - * recommender's output for that user. Can also compare data models. 
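A sketch comparing the top-100 orderings of two recommenders built over the same DataModel; the recommenders and the "A-vs-B" tag are assumptions for illustration:

```java
import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
import org.apache.mahout.cf.taste.impl.common.RunningAverage;
import org.apache.mahout.cf.taste.impl.eval.OrderBasedRecommenderEvaluator;
import org.apache.mahout.cf.taste.recommender.Recommender;

public class OrderComparisonExample {
  static double compare(Recommender a, Recommender b) throws Exception {
    RunningAverage tracker = new FullRunningAverage();
    OrderBasedRecommenderEvaluator.evaluate(a, b, 100, tracker, "A-vs-B");
    return tracker.getAverage(); // 0 for identical orderings; grows as they diverge
  }
}
```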
- */ -public final class OrderBasedRecommenderEvaluator { - - private static final Logger log = LoggerFactory.getLogger(OrderBasedRecommenderEvaluator.class); - - private OrderBasedRecommenderEvaluator() { - } - - public static void evaluate(Recommender recommender1, - Recommender recommender2, - int samples, - RunningAverage tracker, - String tag) throws TasteException { - printHeader(); - LongPrimitiveIterator users = recommender1.getDataModel().getUserIDs(); - - while (users.hasNext()) { - long userID = users.nextLong(); - List recs1 = recommender1.recommend(userID, samples); - List recs2 = recommender2.recommend(userID, samples); - FastIDSet commonSet = new FastIDSet(); - long maxItemID = setBits(commonSet, recs1, samples); - FastIDSet otherSet = new FastIDSet(); - maxItemID = Math.max(maxItemID, setBits(otherSet, recs2, samples)); - int max = mask(commonSet, otherSet, maxItemID); - max = Math.min(max, samples); - if (max < 2) { - continue; - } - Long[] items1 = getCommonItems(commonSet, recs1, max); - Long[] items2 = getCommonItems(commonSet, recs2, max); - double variance = scoreCommonSubset(tag, userID, samples, max, items1, items2); - tracker.addDatum(variance); - } - } - - public static void evaluate(Recommender recommender, - DataModel model, - int samples, - RunningAverage tracker, - String tag) throws TasteException { - printHeader(); - LongPrimitiveIterator users = recommender.getDataModel().getUserIDs(); - while (users.hasNext()) { - long userID = users.nextLong(); - List recs1 = recommender.recommend(userID, model.getNumItems()); - PreferenceArray prefs2 = model.getPreferencesFromUser(userID); - prefs2.sortByValueReversed(); - FastIDSet commonSet = new FastIDSet(); - long maxItemID = setBits(commonSet, recs1, samples); - FastIDSet otherSet = new FastIDSet(); - maxItemID = Math.max(maxItemID, setBits(otherSet, prefs2, samples)); - int max = mask(commonSet, otherSet, maxItemID); - max = Math.min(max, samples); - if (max < 2) { - continue; - } - Long[] items1 = getCommonItems(commonSet, recs1, max); - Long[] items2 = getCommonItems(commonSet, prefs2, max); - double variance = scoreCommonSubset(tag, userID, samples, max, items1, items2); - tracker.addDatum(variance); - } - } - - public static void evaluate(DataModel model1, - DataModel model2, - int samples, - RunningAverage tracker, - String tag) throws TasteException { - printHeader(); - LongPrimitiveIterator users = model1.getUserIDs(); - while (users.hasNext()) { - long userID = users.nextLong(); - PreferenceArray prefs1 = model1.getPreferencesFromUser(userID); - PreferenceArray prefs2 = model2.getPreferencesFromUser(userID); - prefs1.sortByValueReversed(); - prefs2.sortByValueReversed(); - FastIDSet commonSet = new FastIDSet(); - long maxItemID = setBits(commonSet, prefs1, samples); - FastIDSet otherSet = new FastIDSet(); - maxItemID = Math.max(maxItemID, setBits(otherSet, prefs2, samples)); - int max = mask(commonSet, otherSet, maxItemID); - max = Math.min(max, samples); - if (max < 2) { - continue; - } - Long[] items1 = getCommonItems(commonSet, prefs1, max); - Long[] items2 = getCommonItems(commonSet, prefs2, max); - double variance = scoreCommonSubset(tag, userID, samples, max, items1, items2); - tracker.addDatum(variance); - } - } - - /** - * This exists because FastIDSet has 'retainAll' as MASK, but there is - * no count of the number of items in the set. size() is supposed to do - * this but does not work. 
- */ - private static int mask(FastIDSet commonSet, FastIDSet otherSet, long maxItemID) { - int count = 0; - for (int i = 0; i <= maxItemID; i++) { - if (commonSet.contains(i)) { - if (otherSet.contains(i)) { - count++; - } else { - commonSet.remove(i); - } - } - } - return count; - } - - private static Long[] getCommonItems(FastIDSet commonSet, Iterable recs, int max) { - Long[] commonItems = new Long[max]; - int index = 0; - for (RecommendedItem rec : recs) { - Long item = rec.getItemID(); - if (commonSet.contains(item)) { - commonItems[index++] = item; - } - if (index == max) { - break; - } - } - return commonItems; - } - - private static Long[] getCommonItems(FastIDSet commonSet, PreferenceArray prefs1, int max) { - Long[] commonItems = new Long[max]; - int index = 0; - for (int i = 0; i < prefs1.length(); i++) { - Long item = prefs1.getItemID(i); - if (commonSet.contains(item)) { - commonItems[index++] = item; - } - if (index == max) { - break; - } - } - return commonItems; - } - - private static long setBits(FastIDSet modelSet, List items, int max) { - long maxItem = -1; - for (int i = 0; i < items.size() && i < max; i++) { - long itemID = items.get(i).getItemID(); - modelSet.add(itemID); - if (itemID > maxItem) { - maxItem = itemID; - } - } - return maxItem; - } - - private static long setBits(FastIDSet modelSet, PreferenceArray prefs, int max) { - long maxItem = -1; - for (int i = 0; i < prefs.length() && i < max; i++) { - long itemID = prefs.getItemID(i); - modelSet.add(itemID); - if (itemID > maxItem) { - maxItem = itemID; - } - } - return maxItem; - } - - private static void printHeader() { - log.info("tag,user,samples,common,hamming,bubble,rank,normal,score"); - } - - /** - * Common Subset Scoring - * - * These measurements are given the set of results that are common to both - * recommendation lists. They only get ordered lists. - * - * These measures all return raw numbers do not correlate among the tests. - * The numbers are not corrected against the total number of samples or the - * number of common items. - * The one contract is that all measures are 0 for an exact match and an - * increasing positive number as differences increase. 
- */ - private static double scoreCommonSubset(String tag, - long userID, - int samples, - int subset, - Long[] itemsL, - Long[] itemsR) { - int[] vectorZ = new int[subset]; - int[] vectorZabs = new int[subset]; - - long bubble = sort(itemsL, itemsR); - int hamming = slidingWindowHamming(itemsR, itemsL); - if (hamming > samples) { - throw new IllegalStateException(); - } - getVectorZ(itemsR, itemsL, vectorZ, vectorZabs); - double normalW = normalWilcoxon(vectorZ, vectorZabs); - double meanRank = getMeanRank(vectorZabs); - // case statement for requested value - double variance = Math.sqrt(meanRank); - log.info("{},{},{},{},{},{},{},{},{}", - new Object[] {tag, userID, samples, subset, hamming, bubble, meanRank, normalW, variance}); - return variance; - } - - // simple sliding-window hamming distance: a[i or plus/minus 1] == b[i] - private static int slidingWindowHamming(Long[] itemsR, Long[] itemsL) { - int count = 0; - int samples = itemsR.length; - - if (itemsR[0].equals(itemsL[0]) || itemsR[0].equals(itemsL[1])) { - count++; - } - for (int i = 1; i < samples - 1; i++) { - long itemID = itemsL[i]; - if (itemsR[i] == itemID || itemsR[i - 1] == itemID || itemsR[i + 1] == itemID) { - count++; - } - } - if (itemsR[samples - 1].equals(itemsL[samples - 1]) || itemsR[samples - 1].equals(itemsL[samples - 2])) { - count++; - } - return count; - } - - /** - * Normal-distribution probability value for matched sets of values. - * Based upon: - * http://comp9.psych.cornell.edu/Darlington/normscor.htm - * - * The Standard Wilcoxon is not used because it requires a lookup table. - */ - static double normalWilcoxon(int[] vectorZ, int[] vectorZabs) { - int nitems = vectorZ.length; - - double[] ranks = new double[nitems]; - double[] ranksAbs = new double[nitems]; - wilcoxonRanks(vectorZ, vectorZabs, ranks, ranksAbs); - return Math.min(getMeanWplus(ranks), getMeanWminus(ranks)); - } - - /** - * vector Z is a list of distances between the correct value and the recommended value - * Z[i] = position i of correct itemID - position of correct itemID in recommendation list - * can be positive or negative - * the smaller the better - means recommendations are closer - * both are the same length, and both sample from the same set - * - * destructive to items arrays - allows N log N instead of N^2 order - */ - private static void getVectorZ(Long[] itemsR, Long[] itemsL, int[] vectorZ, int[] vectorZabs) { - int nitems = itemsR.length; - int bottom = 0; - int top = nitems - 1; - for (int i = 0; i < nitems; i++) { - long itemID = itemsR[i]; - for (int j = bottom; j <= top; j++) { - if (itemsL[j] == null) { - continue; - } - long test = itemsL[j]; - if (itemID == test) { - vectorZ[i] = i - j; - vectorZabs[i] = Math.abs(i - j); - if (j == bottom) { - bottom++; - } else if (j == top) { - top--; - } else { - itemsL[j] = null; - } - break; - } - } - } - } - - /** - * Ranks are the position of the value from low to high, divided by the # of values. - * I had to walk through it a few times. 
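A worked illustration of the tied-rank rule above, using an assumed four-element Z vector rather than a call into the private method:

```java
public class WilcoxonRankExample {
  // For vectorZ = {0, -1, 1, 2}, the non-zero |Z| values sorted are {1, 1, 2}:
  // the two ties at |Z| = 1 share the average rank (1 + 2) / 2 = 1.5, |Z| = 2
  // takes rank 3, zero entries keep rank 0, and each rank carries Z's sign.
  static final int[] VECTOR_Z = {0, -1, 1, 2};
  static final double[] SIGNED_RANKS = {0.0, -1.5, 1.5, 3.0};
}
```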
- */ - private static void wilcoxonRanks(int[] vectorZ, int[] vectorZabs, double[] ranks, double[] ranksAbs) { - int nitems = vectorZ.length; - int[] sorted = vectorZabs.clone(); - Arrays.sort(sorted); - int zeros = 0; - for (; zeros < nitems; zeros++) { - if (sorted[zeros] > 0) { - break; - } - } - for (int i = 0; i < nitems; i++) { - double rank = 0.0; - int count = 0; - int score = vectorZabs[i]; - for (int j = 0; j < nitems; j++) { - if (score == sorted[j]) { - rank += j + 1 - zeros; - count++; - } else if (score < sorted[j]) { - break; - } - } - if (vectorZ[i] != 0) { - ranks[i] = (rank / count) * (vectorZ[i] < 0 ? -1 : 1); // better be at least 1 - ranksAbs[i] = Math.abs(ranks[i]); - } - } - } - - private static double getMeanRank(int[] ranks) { - int nitems = ranks.length; - double sum = 0.0; - for (int i = 0; i < nitems; i++) { - sum += ranks[i]; - } - return sum / nitems; - } - - private static double getMeanWplus(double[] ranks) { - int nitems = ranks.length; - double sum = 0.0; - for (int i = 0; i < nitems; i++) { - if (ranks[i] > 0) { - sum += ranks[i]; - } - } - return sum / nitems; - } - - private static double getMeanWminus(double[] ranks) { - int nitems = ranks.length; - double sum = 0.0; - for (int i = 0; i < nitems; i++) { - if (ranks[i] < 0) { - sum -= ranks[i]; - } - } - return sum / nitems; - } - - /** - * Do bubble sort and return number of swaps needed to match preference lists. - * Sort itemsR using itemsL as the reference order. - */ - static long sort(Long[] itemsL, Long[] itemsR) { - int length = itemsL.length; - if (length < 2) { - return 0; - } - if (length == 2) { - return itemsL[0].longValue() == itemsR[0].longValue() ? 0 : 1; - } - // 1) avoid changing originals; 2) primitive type is more efficient - long[] reference = new long[length]; - long[] sortable = new long[length]; - for (int i = 0; i < length; i++) { - reference[i] = itemsL[i]; - sortable[i] = itemsR[i]; - } - int sorted = 0; - long swaps = 0; - while (sorted < length - 1) { - // opportunistically trim back the top - while (length > 0 && reference[length - 1] == sortable[length - 1]) { - length--; - } - if (length == 0) { - break; - } - if (reference[sorted] == sortable[sorted]) { - sorted++; - } else { - for (int j = sorted; j < length - 1; j++) { - // do not swap anything already in place - int jump = 1; - if (reference[j] == sortable[j]) { - while (j + jump < length && reference[j + jump] == sortable[j + jump]) { - jump++; - } - } - if (j + jump < length && !(reference[j] == sortable[j] && reference[j + jump] == sortable[j + jump])) { - long tmp = sortable[j]; - sortable[j] = sortable[j + 1]; - sortable[j + 1] = tmp; - swaps++; - } - } - } - } - return swaps; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/RMSRecommenderEvaluator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/RMSRecommenderEvaluator.java deleted file mode 100644 index 97eda1018..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/RMSRecommenderEvaluator.java +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.eval; - -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.model.Preference; - -/** - *
<p> - * A {@link org.apache.mahout.cf.taste.eval.RecommenderEvaluator} which computes the "root mean squared" - * difference between predicted and actual ratings for users. This is the square root of the average of the - * squared differences. - * </p>
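A worked numeric sketch of the same computation, with two assumed prediction errors:

```java
public class RmseExample {
  // Errors of 1.0 and 2.0 give sqrt((1*1 + 2*2) / 2) = sqrt(2.5) ≈ 1.5811.
  static double rmse(double[] errors) {
    double sumOfSquares = 0.0;
    for (double e : errors) {
      sumOfSquares += e * e;
    }
    return Math.sqrt(sumOfSquares / errors.length);
  }

  public static void main(String[] args) {
    System.out.println(rmse(new double[] {1.0, 2.0})); // prints ~1.5811
  }
}
```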
- */ -public final class RMSRecommenderEvaluator extends AbstractDifferenceRecommenderEvaluator { - - private RunningAverage average; - - @Override - protected void reset() { - average = new FullRunningAverage(); - } - - @Override - protected void processOneEstimate(float estimatedPreference, Preference realPref) { - double diff = realPref.getValue() - estimatedPreference; - average.addDatum(diff * diff); - } - - @Override - protected double computeFinalEvaluation() { - return Math.sqrt(average.getAverage()); - } - - @Override - public String toString() { - return "RMSRecommenderEvaluator"; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/StatsCallable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/StatsCallable.java deleted file mode 100644 index 036d0b428..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/eval/StatsCallable.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.eval; - -import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.concurrent.Callable; -import java.util.concurrent.atomic.AtomicInteger; - -final class StatsCallable implements Callable { - - private static final Logger log = LoggerFactory.getLogger(StatsCallable.class); - - private final Callable delegate; - private final boolean logStats; - private final RunningAverageAndStdDev timing; - private final AtomicInteger noEstimateCounter; - - StatsCallable(Callable delegate, - boolean logStats, - RunningAverageAndStdDev timing, - AtomicInteger noEstimateCounter) { - this.delegate = delegate; - this.logStats = logStats; - this.timing = timing; - this.noEstimateCounter = noEstimateCounter; - } - - @Override - public Void call() throws Exception { - long start = System.currentTimeMillis(); - delegate.call(); - long end = System.currentTimeMillis(); - timing.addDatum(end - start); - if (logStats) { - Runtime runtime = Runtime.getRuntime(); - int average = (int) timing.getAverage(); - log.info("Average time per recommendation: {}ms", average); - long totalMemory = runtime.totalMemory(); - long memory = totalMemory - runtime.freeMemory(); - log.info("Approximate memory used: {}MB / {}MB", memory / 1000000L, totalMemory / 1000000L); - log.info("Unable to recommend in {} cases", noEstimateCounter.get()); - } - return null; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractDataModel.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractDataModel.java deleted file mode 100644 index a1a2a1f78..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractDataModel.java +++ /dev/null @@ -1,53 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.model; - -import org.apache.mahout.cf.taste.model.DataModel; - -/** - * Contains some features common to all implementations. 
- */ -public abstract class AbstractDataModel implements DataModel { - - private float maxPreference; - private float minPreference; - - protected AbstractDataModel() { - maxPreference = Float.NaN; - minPreference = Float.NaN; - } - - @Override - public float getMaxPreference() { - return maxPreference; - } - - protected void setMaxPreference(float maxPreference) { - this.maxPreference = maxPreference; - } - - @Override - public float getMinPreference() { - return minPreference; - } - - protected void setMinPreference(float minPreference) { - this.minPreference = minPreference; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java deleted file mode 100644 index a48bc90eb..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.model; - -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; - -import java.util.Collection; - -import com.google.common.base.Charsets; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.model.IDMigrator; - -public abstract class AbstractIDMigrator implements IDMigrator { - - private final MessageDigest md5Digest; - - protected AbstractIDMigrator() { - try { - md5Digest = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException nsae) { - // Can't happen - throw new IllegalStateException(nsae); - } - } - - /** - * @return most significant 8 bytes of the MD5 hash of the string, as a long - */ - protected final long hash(String value) { - byte[] md5hash; - synchronized (md5Digest) { - md5hash = md5Digest.digest(value.getBytes(Charsets.UTF_8)); - md5Digest.reset(); - } - long hash = 0L; - for (int i = 0; i < 8; i++) { - hash = hash << 8 | md5hash[i] & 0x00000000000000FFL; - } - return hash; - } - - @Override - public long toLongID(String stringID) { - return hash(stringID); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java deleted file mode 100644 index e28dd202a..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java +++ /dev/null @@ -1,108 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.model; - -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; - -import javax.sql.DataSource; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.model.UpdatableIDMigrator; -import org.apache.mahout.common.IOUtils; - -/** - * Implementation which stores the reverse long-to-String mapping in a database. Subclasses can override and - * configure the class to operate with particular databases by supplying appropriate SQL statements to the - * constructor. 
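A hypothetical subclass sketch supplying SQL that matches the documented contract (the SELECT takes one long parameter, the INSERT a long and a String); the statement text is an assumption built on the default table and column constants, not shipped code:

```java
import javax.sql.DataSource;

import org.apache.mahout.cf.taste.impl.model.AbstractJDBCIDMigrator;

public final class SimpleJDBCIDMigrator extends AbstractJDBCIDMigrator {
  public SimpleJDBCIDMigrator(DataSource dataSource) {
    super(dataSource,
        // look up the String ID for a long ID
        "SELECT " + DEFAULT_STRING_ID_COLUMN + " FROM " + DEFAULT_MAPPING_TABLE
            + " WHERE " + DEFAULT_LONG_ID_COLUMN + "=?",
        // store a long-to-String mapping
        "INSERT INTO " + DEFAULT_MAPPING_TABLE + " (" + DEFAULT_LONG_ID_COLUMN
            + ',' + DEFAULT_STRING_ID_COLUMN + ") VALUES (?,?)");
  }
}
```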
- */ -public abstract class AbstractJDBCIDMigrator extends AbstractIDMigrator implements UpdatableIDMigrator { - - public static final String DEFAULT_MAPPING_TABLE = "taste_id_mapping"; - public static final String DEFAULT_LONG_ID_COLUMN = "long_id"; - public static final String DEFAULT_STRING_ID_COLUMN = "string_id"; - - private final DataSource dataSource; - private final String getStringIDSQL; - private final String storeMappingSQL; - - /** - * @param getStringIDSQL - * SQL statement which selects one column, the String ID, from a mapping table. The statement - * should take one long parameter. - * @param storeMappingSQL - * SQL statement which saves a mapping from long to String. It should take two parameters, a long - * and a String. - */ - protected AbstractJDBCIDMigrator(DataSource dataSource, String getStringIDSQL, String storeMappingSQL) { - this.dataSource = dataSource; - this.getStringIDSQL = getStringIDSQL; - this.storeMappingSQL = storeMappingSQL; - } - - @Override - public final void storeMapping(long longID, String stringID) throws TasteException { - Connection conn = null; - PreparedStatement stmt = null; - try { - conn = dataSource.getConnection(); - stmt = conn.prepareStatement(storeMappingSQL); - stmt.setLong(1, longID); - stmt.setString(2, stringID); - stmt.executeUpdate(); - } catch (SQLException sqle) { - throw new TasteException(sqle); - } finally { - IOUtils.quietClose(null, stmt, conn); - } - } - - @Override - public final String toStringID(long longID) throws TasteException { - Connection conn = null; - PreparedStatement stmt = null; - ResultSet rs = null; - try { - conn = dataSource.getConnection(); - stmt = conn.prepareStatement(getStringIDSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY); - stmt.setFetchDirection(ResultSet.FETCH_FORWARD); - stmt.setFetchSize(1); - stmt.setLong(1, longID); - rs = stmt.executeQuery(); - if (rs.next()) { - return rs.getString(1); - } else { - return null; - } - } catch (SQLException sqle) { - throw new TasteException(sqle); - } finally { - IOUtils.quietClose(rs, stmt, conn); - } - } - - @Override - public void initialize(Iterable stringIDs) throws TasteException { - for (String stringID : stringIDs) { - storeMapping(toLongID(stringID), stringID); - } - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanItemPreferenceArray.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanItemPreferenceArray.java deleted file mode 100644 index 1fa639dae..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanItemPreferenceArray.java +++ /dev/null @@ -1,234 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.model; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; - -import com.google.common.base.Function; -import com.google.common.collect.Iterators; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.common.iterator.CountingIterator; - -/** - *
<p> - * Like {@link BooleanUserPreferenceArray} but stores preferences for one item (all item IDs the same) rather - * than one user. - * </p>
- * - * @see BooleanPreference - * @see BooleanUserPreferenceArray - * @see GenericItemPreferenceArray - */ -public final class BooleanItemPreferenceArray implements PreferenceArray { - - private final long[] ids; - private long id; - - public BooleanItemPreferenceArray(int size) { - this.ids = new long[size]; - this.id = Long.MIN_VALUE; // as a sort of 'unspecified' value - } - - public BooleanItemPreferenceArray(List prefs, boolean forOneUser) { - this(prefs.size()); - int size = prefs.size(); - for (int i = 0; i < size; i++) { - Preference pref = prefs.get(i); - ids[i] = forOneUser ? pref.getItemID() : pref.getUserID(); - } - if (size > 0) { - id = forOneUser ? prefs.get(0).getUserID() : prefs.get(0).getItemID(); - } - } - - /** - * This is a private copy constructor for clone(). - */ - private BooleanItemPreferenceArray(long[] ids, long id) { - this.ids = ids; - this.id = id; - } - - @Override - public int length() { - return ids.length; - } - - @Override - public Preference get(int i) { - return new PreferenceView(i); - } - - @Override - public void set(int i, Preference pref) { - id = pref.getItemID(); - ids[i] = pref.getUserID(); - } - - @Override - public long getUserID(int i) { - return ids[i]; - } - - @Override - public void setUserID(int i, long userID) { - ids[i] = userID; - } - - @Override - public long getItemID(int i) { - return id; - } - - /** - * {@inheritDoc} - * - * Note that this method will actually set the item ID for all preferences. - */ - @Override - public void setItemID(int i, long itemID) { - id = itemID; - } - - /** - * @return all user IDs - */ - @Override - public long[] getIDs() { - return ids; - } - - @Override - public float getValue(int i) { - return 1.0f; - } - - @Override - public void setValue(int i, float value) { - throw new UnsupportedOperationException(); - } - - @Override - public void sortByUser() { - Arrays.sort(ids); - } - - @Override - public void sortByItem() { } - - @Override - public void sortByValue() { } - - @Override - public void sortByValueReversed() { } - - @Override - public boolean hasPrefWithUserID(long userID) { - for (long id : ids) { - if (userID == id) { - return true; - } - } - return false; - } - - @Override - public boolean hasPrefWithItemID(long itemID) { - return id == itemID; - } - - @Override - public BooleanItemPreferenceArray clone() { - return new BooleanItemPreferenceArray(ids.clone(), id); - } - - @Override - public int hashCode() { - return (int) (id >> 32) ^ (int) id ^ Arrays.hashCode(ids); - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof BooleanItemPreferenceArray)) { - return false; - } - BooleanItemPreferenceArray otherArray = (BooleanItemPreferenceArray) other; - return id == otherArray.id && Arrays.equals(ids, otherArray.ids); - } - - @Override - public Iterator iterator() { - return Iterators.transform(new CountingIterator(length()), - new Function() { - @Override - public Preference apply(Integer from) { - return new PreferenceView(from); - } - }); - } - - @Override - public String toString() { - StringBuilder result = new StringBuilder(10 * ids.length); - result.append("BooleanItemPreferenceArray[itemID:"); - result.append(id); - result.append(",{"); - for (int i = 0; i < ids.length; i++) { - if (i > 0) { - result.append(','); - } - result.append(ids[i]); - } - result.append("}]"); - return result.toString(); - } - - private final class PreferenceView implements Preference { - - private final int i; - - private PreferenceView(int i) { - this.i = i; - } - - @Override - 
public long getUserID() { - return BooleanItemPreferenceArray.this.getUserID(i); - } - - @Override - public long getItemID() { - return BooleanItemPreferenceArray.this.getItemID(i); - } - - @Override - public float getValue() { - return 1.0f; - } - - @Override - public void setValue(float value) { - throw new UnsupportedOperationException(); - } - - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanPreference.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanPreference.java deleted file mode 100644 index 3c05cafe1..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanPreference.java +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.model; - -import java.io.Serializable; - -import org.apache.mahout.cf.taste.model.Preference; - -/** - * Encapsulates a simple boolean "preference" for an item whose value does not matter (is fixed at 1.0). This - * is appropriate in situations where users conceptually have only a general "yes" preference for items, - * rather than a spectrum of preference values. 
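A small sketch building a one-user preference array from boolean "likes"; the user and item IDs are illustrative:

```java
import java.util.List;

import com.google.common.collect.Lists;
import org.apache.mahout.cf.taste.impl.model.BooleanPreference;
import org.apache.mahout.cf.taste.impl.model.BooleanUserPreferenceArray;
import org.apache.mahout.cf.taste.model.Preference;
import org.apache.mahout.cf.taste.model.PreferenceArray;

public class BooleanPrefExample {
  // User 1 "liked" items 101 and 102; getValue() is a fixed 1.0 throughout.
  static PreferenceArray likesOfUser1() {
    List<Preference> likes = Lists.newArrayList();
    likes.add(new BooleanPreference(1L, 101L));
    likes.add(new BooleanPreference(1L, 102L));
    return new BooleanUserPreferenceArray(likes);
  }
}
```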
- */ -public final class BooleanPreference implements Preference, Serializable { - - private final long userID; - private final long itemID; - - public BooleanPreference(long userID, long itemID) { - this.userID = userID; - this.itemID = itemID; - } - - @Override - public long getUserID() { - return userID; - } - - @Override - public long getItemID() { - return itemID; - } - - @Override - public float getValue() { - return 1.0f; - } - - @Override - public void setValue(float value) { - throw new UnsupportedOperationException(); - } - - @Override - public String toString() { - return "BooleanPreference[userID: " + userID + ", itemID:" + itemID + ']'; - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanUserPreferenceArray.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanUserPreferenceArray.java deleted file mode 100644 index 931f60b4b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanUserPreferenceArray.java +++ /dev/null @@ -1,234 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.model; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; - -import com.google.common.base.Function; -import com.google.common.collect.Iterators; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.common.iterator.CountingIterator; - -/** - *

- * Like {@link GenericUserPreferenceArray} but stores, conceptually, {@link BooleanPreference} objects which - * have no associated preference value. - *
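- *
- * A construction sketch (hypothetical IDs; all preferences are assumed to belong to the same user,
- * whose ID is taken from the first element):
- *
- * List<Preference> prefs = new ArrayList<Preference>();
- * prefs.add(new BooleanPreference(3L, 10L));
- * prefs.add(new BooleanPreference(3L, 11L));
- * PreferenceArray userPrefs = new BooleanUserPreferenceArray(prefs); // userID 3, items {10, 11}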

- * - * @see BooleanPreference - * @see BooleanItemPreferenceArray - * @see GenericUserPreferenceArray - */ -public final class BooleanUserPreferenceArray implements PreferenceArray { - - private final long[] ids; - private long id; - - public BooleanUserPreferenceArray(int size) { - this.ids = new long[size]; - this.id = Long.MIN_VALUE; // as a sort of 'unspecified' value - } - - public BooleanUserPreferenceArray(List prefs) { - this(prefs.size()); - int size = prefs.size(); - for (int i = 0; i < size; i++) { - Preference pref = prefs.get(i); - ids[i] = pref.getItemID(); - } - if (size > 0) { - id = prefs.get(0).getUserID(); - } - } - - /** - * This is a private copy constructor for clone(). - */ - private BooleanUserPreferenceArray(long[] ids, long id) { - this.ids = ids; - this.id = id; - } - - @Override - public int length() { - return ids.length; - } - - @Override - public Preference get(int i) { - return new PreferenceView(i); - } - - @Override - public void set(int i, Preference pref) { - id = pref.getUserID(); - ids[i] = pref.getItemID(); - } - - @Override - public long getUserID(int i) { - return id; - } - - /** - * {@inheritDoc} - * - * Note that this method will actually set the user ID for all preferences. - */ - @Override - public void setUserID(int i, long userID) { - id = userID; - } - - @Override - public long getItemID(int i) { - return ids[i]; - } - - @Override - public void setItemID(int i, long itemID) { - ids[i] = itemID; - } - - /** - * @return all item IDs - */ - @Override - public long[] getIDs() { - return ids; - } - - @Override - public float getValue(int i) { - return 1.0f; - } - - @Override - public void setValue(int i, float value) { - throw new UnsupportedOperationException(); - } - - @Override - public void sortByUser() { } - - @Override - public void sortByItem() { - Arrays.sort(ids); - } - - @Override - public void sortByValue() { } - - @Override - public void sortByValueReversed() { } - - @Override - public boolean hasPrefWithUserID(long userID) { - return id == userID; - } - - @Override - public boolean hasPrefWithItemID(long itemID) { - for (long id : ids) { - if (itemID == id) { - return true; - } - } - return false; - } - - @Override - public BooleanUserPreferenceArray clone() { - return new BooleanUserPreferenceArray(ids.clone(), id); - } - - @Override - public int hashCode() { - return (int) (id >> 32) ^ (int) id ^ Arrays.hashCode(ids); - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof BooleanUserPreferenceArray)) { - return false; - } - BooleanUserPreferenceArray otherArray = (BooleanUserPreferenceArray) other; - return id == otherArray.id && Arrays.equals(ids, otherArray.ids); - } - - @Override - public Iterator iterator() { - return Iterators.transform(new CountingIterator(length()), - new Function() { - @Override - public Preference apply(Integer from) { - return new PreferenceView(from); - } - }); - } - - @Override - public String toString() { - StringBuilder result = new StringBuilder(10 * ids.length); - result.append("BooleanUserPreferenceArray[userID:"); - result.append(id); - result.append(",{"); - for (int i = 0; i < ids.length; i++) { - if (i > 0) { - result.append(','); - } - result.append(ids[i]); - } - result.append("}]"); - return result.toString(); - } - - private final class PreferenceView implements Preference { - - private final int i; - - private PreferenceView(int i) { - this.i = i; - } - - @Override - public long getUserID() { - return BooleanUserPreferenceArray.this.getUserID(i); - } - - 
@Override - public long getItemID() { - return BooleanUserPreferenceArray.this.getItemID(i); - } - - @Override - public float getValue() { - return 1.0f; - } - - @Override - public void setValue(float value) { - throw new UnsupportedOperationException(); - } - - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java deleted file mode 100644 index 56e7028dc..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java +++ /dev/null @@ -1,320 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.model; - -import java.util.Arrays; -import java.util.Collection; -import java.util.Map; - -import org.apache.mahout.cf.taste.common.NoSuchItemException; -import org.apache.mahout.cf.taste.common.NoSuchUserException; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveArrayIterator; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; - -import com.google.common.base.Preconditions; - -/** - *

- * A simple {@link DataModel} which uses given user data as its data source. This implementation - * is mostly useful for small experiments and is not recommended for contexts where performance is important. - *
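- *
- * A minimal construction sketch (hypothetical IDs): user data is a FastByIDMap from user ID to the
- * FastIDSet of item IDs that user is associated with:
- *
- * FastByIDMap<FastIDSet> userData = new FastByIDMap<FastIDSet>();
- * FastIDSet items = new FastIDSet();
- * items.add(10L);
- * items.add(11L);
- * userData.put(3L, items);
- * DataModel model = new GenericBooleanPrefDataModel(userData);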

- */ -public final class GenericBooleanPrefDataModel extends AbstractDataModel { - - private final long[] userIDs; - private final FastByIDMap<FastIDSet> preferenceFromUsers; - private final long[] itemIDs; - private final FastByIDMap<FastIDSet> preferenceForItems; - private final FastByIDMap<FastByIDMap<Long>> timestamps; - - /** - *

- * Creates a new {@link GenericBooleanPrefDataModel} from the given users (and their preferences). This - * {@link DataModel} retains all this information in memory and is effectively immutable. - *

- * - * @param userData users to include - */ - public GenericBooleanPrefDataModel(FastByIDMap<FastIDSet> userData) { - this(userData, null); - } - - /** - *

- * Creates a new {@link GenericBooleanPrefDataModel} from the given users (and their preferences). This - * {@link DataModel} retains all this information in memory and is effectively immutable. - *

- * - * @param userData users to include - * @param timestamps optionally, provided timestamps of preferences as milliseconds since the epoch. - * User IDs are mapped to maps of item IDs to Long timestamps. - */ - public GenericBooleanPrefDataModel(FastByIDMap userData, FastByIDMap> timestamps) { - Preconditions.checkArgument(userData != null, "userData is null"); - - this.preferenceFromUsers = userData; - this.preferenceForItems = new FastByIDMap(); - FastIDSet itemIDSet = new FastIDSet(); - for (Map.Entry entry : preferenceFromUsers.entrySet()) { - long userID = entry.getKey(); - FastIDSet itemIDs = entry.getValue(); - itemIDSet.addAll(itemIDs); - LongPrimitiveIterator it = itemIDs.iterator(); - while (it.hasNext()) { - long itemID = it.nextLong(); - FastIDSet userIDs = preferenceForItems.get(itemID); - if (userIDs == null) { - userIDs = new FastIDSet(2); - preferenceForItems.put(itemID, userIDs); - } - userIDs.add(userID); - } - } - - this.itemIDs = itemIDSet.toArray(); - itemIDSet = null; // Might help GC -- this is big - Arrays.sort(itemIDs); - - this.userIDs = new long[userData.size()]; - int i = 0; - LongPrimitiveIterator it = userData.keySetIterator(); - while (it.hasNext()) { - userIDs[i++] = it.next(); - } - Arrays.sort(userIDs); - - this.timestamps = timestamps; - } - - /** - *

- * Creates a new {@link GenericBooleanPrefDataModel} containing an immutable copy of the data from another given - * {@link DataModel}. - *
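- *
- * Per the deprecation note below, the equivalent non-deprecated construction is roughly
- * (the {@code otherModel} name is illustrative):
- *
- * DataModel copy = new GenericBooleanPrefDataModel(GenericBooleanPrefDataModel.toDataMap(otherModel));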

- * - * @param dataModel - * {@link DataModel} to copy - * @throws TasteException - * if an error occurs while retrieving the other {@link DataModel}'s users - * @deprecated without direct replacement. - * Consider {@link #toDataMap(DataModel)} with {@link #GenericBooleanPrefDataModel(FastByIDMap)} - */ - @Deprecated - public GenericBooleanPrefDataModel(DataModel dataModel) throws TasteException { - this(toDataMap(dataModel)); - } - - /** - * Exports the simple user IDs and associated item IDs in the data model. - * - * @return a {@link FastByIDMap} mapping user IDs to {@link FastIDSet}s representing - * that user's associated items - */ - public static FastByIDMap toDataMap(DataModel dataModel) throws TasteException { - FastByIDMap data = new FastByIDMap(dataModel.getNumUsers()); - LongPrimitiveIterator it = dataModel.getUserIDs(); - while (it.hasNext()) { - long userID = it.nextLong(); - data.put(userID, dataModel.getItemIDsFromUser(userID)); - } - return data; - } - - public static FastByIDMap toDataMap(FastByIDMap data) { - for (Map.Entry entry : ((FastByIDMap) (FastByIDMap) data).entrySet()) { - PreferenceArray prefArray = (PreferenceArray) entry.getValue(); - int size = prefArray.length(); - FastIDSet itemIDs = new FastIDSet(size); - for (int i = 0; i < size; i++) { - itemIDs.add(prefArray.getItemID(i)); - } - entry.setValue(itemIDs); - } - return (FastByIDMap) (FastByIDMap) data; - } - - /** - * This is used mostly internally to the framework, and shouldn't be relied upon otherwise. - */ - public FastByIDMap getRawUserData() { - return this.preferenceFromUsers; - } - - /** - * This is used mostly internally to the framework, and shouldn't be relied upon otherwise. - */ - public FastByIDMap getRawItemData() { - return this.preferenceForItems; - } - - @Override - public LongPrimitiveArrayIterator getUserIDs() { - return new LongPrimitiveArrayIterator(userIDs); - } - - /** - * @throws NoSuchUserException - * if there is no such user - */ - @Override - public PreferenceArray getPreferencesFromUser(long userID) throws NoSuchUserException { - FastIDSet itemIDs = preferenceFromUsers.get(userID); - if (itemIDs == null) { - throw new NoSuchUserException(userID); - } - PreferenceArray prefArray = new BooleanUserPreferenceArray(itemIDs.size()); - int i = 0; - LongPrimitiveIterator it = itemIDs.iterator(); - while (it.hasNext()) { - prefArray.setUserID(i, userID); - prefArray.setItemID(i, it.nextLong()); - i++; - } - return prefArray; - } - - @Override - public FastIDSet getItemIDsFromUser(long userID) throws TasteException { - FastIDSet itemIDs = preferenceFromUsers.get(userID); - if (itemIDs == null) { - throw new NoSuchUserException(userID); - } - return itemIDs; - } - - @Override - public LongPrimitiveArrayIterator getItemIDs() { - return new LongPrimitiveArrayIterator(itemIDs); - } - - @Override - public PreferenceArray getPreferencesForItem(long itemID) throws NoSuchItemException { - FastIDSet userIDs = preferenceForItems.get(itemID); - if (userIDs == null) { - throw new NoSuchItemException(itemID); - } - PreferenceArray prefArray = new BooleanItemPreferenceArray(userIDs.size()); - int i = 0; - LongPrimitiveIterator it = userIDs.iterator(); - while (it.hasNext()) { - prefArray.setUserID(i, it.nextLong()); - prefArray.setItemID(i, itemID); - i++; - } - return prefArray; - } - - @Override - public Float getPreferenceValue(long userID, long itemID) throws NoSuchUserException { - FastIDSet itemIDs = preferenceFromUsers.get(userID); - if (itemIDs == null) { - throw new 
NoSuchUserException(userID); - } - if (itemIDs.contains(itemID)) { - return 1.0f; - } - return null; - } - - @Override - public Long getPreferenceTime(long userID, long itemID) throws TasteException { - if (timestamps == null) { - return null; - } - FastByIDMap itemTimestamps = timestamps.get(userID); - if (itemTimestamps == null) { - throw new NoSuchUserException(userID); - } - return itemTimestamps.get(itemID); - } - - @Override - public int getNumItems() { - return itemIDs.length; - } - - @Override - public int getNumUsers() { - return userIDs.length; - } - - @Override - public int getNumUsersWithPreferenceFor(long itemID) { - FastIDSet userIDs1 = preferenceForItems.get(itemID); - return userIDs1 == null ? 0 : userIDs1.size(); - } - - @Override - public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) { - FastIDSet userIDs1 = preferenceForItems.get(itemID1); - if (userIDs1 == null) { - return 0; - } - FastIDSet userIDs2 = preferenceForItems.get(itemID2); - if (userIDs2 == null) { - return 0; - } - return userIDs1.size() < userIDs2.size() ? - userIDs2.intersectionSize(userIDs1) : - userIDs1.intersectionSize(userIDs2); - } - - @Override - public void removePreference(long userID, long itemID) { - throw new UnsupportedOperationException(); - } - - @Override - public void setPreference(long userID, long itemID, float value) { - throw new UnsupportedOperationException(); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - // Does nothing - } - - @Override - public boolean hasPreferenceValues() { - return false; - } - - @Override - public String toString() { - StringBuilder result = new StringBuilder(200); - result.append("GenericBooleanPrefDataModel[users:"); - for (int i = 0; i < Math.min(3, userIDs.length); i++) { - if (i > 0) { - result.append(','); - } - result.append(userIDs[i]); - } - if (userIDs.length > 3) { - result.append("..."); - } - result.append(']'); - return result.toString(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java deleted file mode 100644 index 47fc2e2c9..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java +++ /dev/null @@ -1,361 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.model; - -import java.util.Arrays; -import java.util.Collection; -import java.util.List; -import java.util.Map; - -import com.google.common.collect.Lists; -import org.apache.mahout.cf.taste.common.NoSuchItemException; -import org.apache.mahout.cf.taste.common.NoSuchUserException; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveArrayIterator; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *

- * A simple {@link DataModel} which uses a given {@link List} of users as its data source. This implementation - * is mostly useful for small experiments and is not recommended for contexts where performance is important. - *
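- *
- * A minimal construction sketch (hypothetical IDs and values), building a model holding one user
- * with two rated items:
- *
- * List<Preference> prefs = new ArrayList<Preference>();
- * prefs.add(new GenericPreference(3L, 10L, 4.5f));
- * prefs.add(new GenericPreference(3L, 11L, 2.0f));
- * FastByIDMap<PreferenceArray> userData = new FastByIDMap<PreferenceArray>();
- * userData.put(3L, new GenericUserPreferenceArray(prefs));
- * DataModel model = new GenericDataModel(userData);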

- */ -public final class GenericDataModel extends AbstractDataModel { - - private static final Logger log = LoggerFactory.getLogger(GenericDataModel.class); - - private final long[] userIDs; - private final FastByIDMap<PreferenceArray> preferenceFromUsers; - private final long[] itemIDs; - private final FastByIDMap<PreferenceArray> preferenceForItems; - private final FastByIDMap<FastByIDMap<Long>> timestamps; - - /** - *

- * Creates a new {@code GenericDataModel} from the given users (and their preferences). This - * {@link DataModel} retains all this information in memory and is effectively immutable. - *

- * - * @param userData users to include; (see also {@link #toDataMap(FastByIDMap, boolean)}) - */ - public GenericDataModel(FastByIDMap<PreferenceArray> userData) { - this(userData, null); - } - - /** - *

- * Creates a new {@code GenericDataModel} from the given users (and their preferences). This - * {@link DataModel} retains all this information in memory and is effectively immutable. - *

- * - * @param userData users to include; (see also {@link #toDataMap(FastByIDMap, boolean)}) - * @param timestamps optionally, provided timestamps of preferences as milliseconds since the epoch. - * User IDs are mapped to maps of item IDs to Long timestamps. - */ - public GenericDataModel(FastByIDMap userData, FastByIDMap> timestamps) { - Preconditions.checkArgument(userData != null, "userData is null"); - - this.preferenceFromUsers = userData; - FastByIDMap> prefsForItems = new FastByIDMap>(); - FastIDSet itemIDSet = new FastIDSet(); - int currentCount = 0; - float maxPrefValue = Float.NEGATIVE_INFINITY; - float minPrefValue = Float.POSITIVE_INFINITY; - for (Map.Entry entry : preferenceFromUsers.entrySet()) { - PreferenceArray prefs = entry.getValue(); - prefs.sortByItem(); - for (Preference preference : prefs) { - long itemID = preference.getItemID(); - itemIDSet.add(itemID); - Collection prefsForItem = prefsForItems.get(itemID); - if (prefsForItem == null) { - prefsForItem = Lists.newArrayListWithCapacity(2); - prefsForItems.put(itemID, prefsForItem); - } - prefsForItem.add(preference); - float value = preference.getValue(); - if (value > maxPrefValue) { - maxPrefValue = value; - } - if (value < minPrefValue) { - minPrefValue = value; - } - } - if (++currentCount % 10000 == 0) { - log.info("Processed {} users", currentCount); - } - } - log.info("Processed {} users", currentCount); - - setMinPreference(minPrefValue); - setMaxPreference(maxPrefValue); - - this.itemIDs = itemIDSet.toArray(); - itemIDSet = null; // Might help GC -- this is big - Arrays.sort(itemIDs); - - this.preferenceForItems = toDataMap(prefsForItems, false); - - for (Map.Entry entry : preferenceForItems.entrySet()) { - entry.getValue().sortByUser(); - } - - this.userIDs = new long[userData.size()]; - int i = 0; - LongPrimitiveIterator it = userData.keySetIterator(); - while (it.hasNext()) { - userIDs[i++] = it.next(); - } - Arrays.sort(userIDs); - - this.timestamps = timestamps; - } - - /** - *

- * Creates a new {@code GenericDataModel} containing an immutable copy of the data from another given - * {@link DataModel}. - *
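- *
- * Likewise, the deprecation note below points to the equivalent non-deprecated form
- * (the {@code otherModel} name is illustrative):
- *
- * DataModel copy = new GenericDataModel(GenericDataModel.toDataMap(otherModel));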

- * - * @param dataModel {@link DataModel} to copy - * @throws TasteException if an error occurs while retrieving the other {@link DataModel}'s users - * @deprecated without direct replacement. - * Consider {@link #toDataMap(DataModel)} with {@link #GenericDataModel(FastByIDMap)} - */ - @Deprecated - public GenericDataModel(DataModel dataModel) throws TasteException { - this(toDataMap(dataModel)); - } - - /** - * Swaps, in-place, {@link List}s for arrays in {@link Map} values . - * - * @return input value - */ - public static FastByIDMap toDataMap(FastByIDMap> data, - boolean byUser) { - for (Map.Entry entry : ((FastByIDMap) (FastByIDMap) data).entrySet()) { - List prefList = (List) entry.getValue(); - entry.setValue(byUser ? new GenericUserPreferenceArray(prefList) : new GenericItemPreferenceArray( - prefList)); - } - return (FastByIDMap) (FastByIDMap) data; - } - - /** - * Exports the simple user IDs and preferences in the data model. - * - * @return a {@link FastByIDMap} mapping user IDs to {@link PreferenceArray}s representing - * that user's preferences - */ - public static FastByIDMap toDataMap(DataModel dataModel) throws TasteException { - FastByIDMap data = new FastByIDMap(dataModel.getNumUsers()); - LongPrimitiveIterator it = dataModel.getUserIDs(); - while (it.hasNext()) { - long userID = it.nextLong(); - data.put(userID, dataModel.getPreferencesFromUser(userID)); - } - return data; - } - - /** - * This is used mostly internally to the framework, and shouldn't be relied upon otherwise. - */ - public FastByIDMap getRawUserData() { - return this.preferenceFromUsers; - } - - /** - * This is used mostly internally to the framework, and shouldn't be relied upon otherwise. - */ - public FastByIDMap getRawItemData() { - return this.preferenceForItems; - } - - @Override - public LongPrimitiveArrayIterator getUserIDs() { - return new LongPrimitiveArrayIterator(userIDs); - } - - /** - * @throws NoSuchUserException - * if there is no such user - */ - @Override - public PreferenceArray getPreferencesFromUser(long userID) throws NoSuchUserException { - PreferenceArray prefs = preferenceFromUsers.get(userID); - if (prefs == null) { - throw new NoSuchUserException(userID); - } - return prefs; - } - - @Override - public FastIDSet getItemIDsFromUser(long userID) throws TasteException { - PreferenceArray prefs = getPreferencesFromUser(userID); - int size = prefs.length(); - FastIDSet result = new FastIDSet(size); - for (int i = 0; i < size; i++) { - result.add(prefs.getItemID(i)); - } - return result; - } - - @Override - public LongPrimitiveArrayIterator getItemIDs() { - return new LongPrimitiveArrayIterator(itemIDs); - } - - @Override - public PreferenceArray getPreferencesForItem(long itemID) throws NoSuchItemException { - PreferenceArray prefs = preferenceForItems.get(itemID); - if (prefs == null) { - throw new NoSuchItemException(itemID); - } - return prefs; - } - - @Override - public Float getPreferenceValue(long userID, long itemID) throws TasteException { - PreferenceArray prefs = getPreferencesFromUser(userID); - int size = prefs.length(); - for (int i = 0; i < size; i++) { - if (prefs.getItemID(i) == itemID) { - return prefs.getValue(i); - } - } - return null; - } - - @Override - public Long getPreferenceTime(long userID, long itemID) throws TasteException { - if (timestamps == null) { - return null; - } - FastByIDMap itemTimestamps = timestamps.get(userID); - if (itemTimestamps == null) { - throw new NoSuchUserException(userID); - } - return itemTimestamps.get(itemID); - } - - @Override 
- public int getNumItems() { - return itemIDs.length; - } - - @Override - public int getNumUsers() { - return userIDs.length; - } - - @Override - public int getNumUsersWithPreferenceFor(long itemID) { - PreferenceArray prefs1 = preferenceForItems.get(itemID); - return prefs1 == null ? 0 : prefs1.length(); - } - - @Override - public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) { - PreferenceArray prefs1 = preferenceForItems.get(itemID1); - if (prefs1 == null) { - return 0; - } - PreferenceArray prefs2 = preferenceForItems.get(itemID2); - if (prefs2 == null) { - return 0; - } - - int size1 = prefs1.length(); - int size2 = prefs2.length(); - int count = 0; - int i = 0; - int j = 0; - long userID1 = prefs1.getUserID(0); - long userID2 = prefs2.getUserID(0); - while (true) { - if (userID1 < userID2) { - if (++i == size1) { - break; - } - userID1 = prefs1.getUserID(i); - } else if (userID1 > userID2) { - if (++j == size2) { - break; - } - userID2 = prefs2.getUserID(j); - } else { - count++; - if (++i == size1 || ++j == size2) { - break; - } - userID1 = prefs1.getUserID(i); - userID2 = prefs2.getUserID(j); - } - } - return count; - } - - @Override - public void removePreference(long userID, long itemID) { - throw new UnsupportedOperationException(); - } - - @Override - public void setPreference(long userID, long itemID, float value) { - throw new UnsupportedOperationException(); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - // Does nothing - } - - @Override - public boolean hasPreferenceValues() { - return true; - } - - @Override - public String toString() { - StringBuilder result = new StringBuilder(200); - result.append("GenericDataModel[users:"); - for (int i = 0; i < Math.min(3, userIDs.length); i++) { - if (i > 0) { - result.append(','); - } - result.append(userIDs[i]); - } - if (userIDs.length > 3) { - result.append("..."); - } - result.append(']'); - return result.toString(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericItemPreferenceArray.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericItemPreferenceArray.java deleted file mode 100644 index 43d2a6205..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericItemPreferenceArray.java +++ /dev/null @@ -1,301 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.model; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; - -import com.google.common.base.Function; -import com.google.common.collect.Iterators; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.common.iterator.CountingIterator; - -/** - *

- * Like {@link GenericUserPreferenceArray} but stores preferences for one item (all item IDs the same) rather - * than one user. - *
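- *
- * A construction sketch (hypothetical IDs and values); all preferences must share one item ID,
- * otherwise the constructor throws IllegalArgumentException:
- *
- * List<Preference> prefs = new ArrayList<Preference>();
- * prefs.add(new GenericPreference(3L, 10L, 4.5f));
- * prefs.add(new GenericPreference(4L, 10L, 3.0f));
- * PreferenceArray itemPrefs = new GenericItemPreferenceArray(prefs); // itemID 10, users {3, 4}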

- * - * @see BooleanItemPreferenceArray - * @see GenericUserPreferenceArray - * @see GenericPreference - */ -public final class GenericItemPreferenceArray implements PreferenceArray { - - private static final int USER = 0; - private static final int VALUE = 2; - private static final int VALUE_REVERSED = 3; - - private final long[] ids; - private long id; - private final float[] values; - - public GenericItemPreferenceArray(int size) { - this.ids = new long[size]; - values = new float[size]; - this.id = Long.MIN_VALUE; // as a sort of 'unspecified' value - } - - public GenericItemPreferenceArray(List prefs) { - this(prefs.size()); - int size = prefs.size(); - long itemID = Long.MIN_VALUE; - for (int i = 0; i < size; i++) { - Preference pref = prefs.get(i); - ids[i] = pref.getUserID(); - if (i == 0) { - itemID = pref.getItemID(); - } else { - if (itemID != pref.getItemID()) { - throw new IllegalArgumentException("Not all item IDs are the same"); - } - } - values[i] = pref.getValue(); - } - id = itemID; - } - - /** - * This is a private copy constructor for clone(). - */ - private GenericItemPreferenceArray(long[] ids, long id, float[] values) { - this.ids = ids; - this.id = id; - this.values = values; - } - - @Override - public int length() { - return ids.length; - } - - @Override - public Preference get(int i) { - return new PreferenceView(i); - } - - @Override - public void set(int i, Preference pref) { - id = pref.getItemID(); - ids[i] = pref.getUserID(); - values[i] = pref.getValue(); - } - - @Override - public long getUserID(int i) { - return ids[i]; - } - - @Override - public void setUserID(int i, long userID) { - ids[i] = userID; - } - - @Override - public long getItemID(int i) { - return id; - } - - /** - * {@inheritDoc} - * - * Note that this method will actually set the item ID for all preferences. 
- */ - @Override - public void setItemID(int i, long itemID) { - id = itemID; - } - - /** - * @return all user IDs - */ - @Override - public long[] getIDs() { - return ids; - } - - @Override - public float getValue(int i) { - return values[i]; - } - - @Override - public void setValue(int i, float value) { - values[i] = value; - } - - @Override - public void sortByUser() { - lateralSort(USER); - } - - @Override - public void sortByItem() { } - - @Override - public void sortByValue() { - lateralSort(VALUE); - } - - @Override - public void sortByValueReversed() { - lateralSort(VALUE_REVERSED); - } - - @Override - public boolean hasPrefWithUserID(long userID) { - for (long id : ids) { - if (userID == id) { - return true; - } - } - return false; - } - - @Override - public boolean hasPrefWithItemID(long itemID) { - return id == itemID; - } - - private void lateralSort(int type) { - //Comb sort: http://en.wikipedia.org/wiki/Comb_sort - int length = length(); - int gap = length; - boolean swapped = false; - while (gap > 1 || swapped) { - if (gap > 1) { - gap /= 1.247330950103979; // = 1 / (1 - 1/e^phi) - } - swapped = false; - int max = length - gap; - for (int i = 0; i < max; i++) { - int other = i + gap; - if (isLess(other, i, type)) { - swap(i, other); - swapped = true; - } - } - } - } - - private boolean isLess(int i, int j, int type) { - switch (type) { - case USER: - return ids[i] < ids[j]; - case VALUE: - return values[i] < values[j]; - case VALUE_REVERSED: - return values[i] > values[j]; - default: - throw new IllegalStateException(); - } - } - - private void swap(int i, int j) { - long temp1 = ids[i]; - float temp2 = values[i]; - ids[i] = ids[j]; - values[i] = values[j]; - ids[j] = temp1; - values[j] = temp2; - } - - @Override - public GenericItemPreferenceArray clone() { - return new GenericItemPreferenceArray(ids.clone(), id, values.clone()); - } - - @Override - public int hashCode() { - return (int) (id >> 32) ^ (int) id ^ Arrays.hashCode(ids) ^ Arrays.hashCode(values); - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof GenericItemPreferenceArray)) { - return false; - } - GenericItemPreferenceArray otherArray = (GenericItemPreferenceArray) other; - return id == otherArray.id && Arrays.equals(ids, otherArray.ids) && Arrays.equals(values, otherArray.values); - } - - @Override - public Iterator iterator() { - return Iterators.transform(new CountingIterator(length()), - new Function() { - @Override - public Preference apply(Integer from) { - return new PreferenceView(from); - } - }); - } - - @Override - public String toString() { - if (ids == null || ids.length == 0) { - return "GenericItemPreferenceArray[{}]"; - } - StringBuilder result = new StringBuilder(20 * ids.length); - result.append("GenericItemPreferenceArray[itemID:"); - result.append(id); - result.append(",{"); - for (int i = 0; i < ids.length; i++) { - if (i > 0) { - result.append(','); - } - result.append(ids[i]); - result.append('='); - result.append(values[i]); - } - result.append("}]"); - return result.toString(); - } - - private final class PreferenceView implements Preference { - - private final int i; - - private PreferenceView(int i) { - this.i = i; - } - - @Override - public long getUserID() { - return GenericItemPreferenceArray.this.getUserID(i); - } - - @Override - public long getItemID() { - return GenericItemPreferenceArray.this.getItemID(i); - } - - @Override - public float getValue() { - return values[i]; - } - - @Override - public void setValue(float value) { - values[i] = value; 
- } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericPreference.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericPreference.java deleted file mode 100644 index e6c7f430e..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericPreference.java +++ /dev/null @@ -1,70 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.model; - -import java.io.Serializable; - -import org.apache.mahout.cf.taste.model.Preference; - -import com.google.common.base.Preconditions; - -/** - *

- * A simple {@link Preference} encapsulating a user ID, an item ID and a preference value. - *
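- *
- * For example (hypothetical values), new GenericPreference(3L, 10L, 4.5f) represents user 3 rating
- * item 10 as 4.5; passing Float.NaN to the constructor or to setValue(float) is rejected by a
- * Preconditions check.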

- */ -public class GenericPreference implements Preference, Serializable { - - private final long userID; - private final long itemID; - private float value; - - public GenericPreference(long userID, long itemID, float value) { - Preconditions.checkArgument(!Float.isNaN(value), "NaN value"); - this.userID = userID; - this.itemID = itemID; - this.value = value; - } - - @Override - public long getUserID() { - return userID; - } - - @Override - public long getItemID() { - return itemID; - } - - @Override - public float getValue() { - return value; - } - - @Override - public void setValue(float value) { - Preconditions.checkArgument(!Float.isNaN(value), "NaN value"); - this.value = value; - } - - @Override - public String toString() { - return "GenericPreference[userID: " + userID + ", itemID:" + itemID + ", value:" + value + ']'; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericUserPreferenceArray.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericUserPreferenceArray.java deleted file mode 100644 index 8e95d1982..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericUserPreferenceArray.java +++ /dev/null @@ -1,307 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.model; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; - -import com.google.common.base.Function; -import com.google.common.collect.Iterators; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.common.iterator.CountingIterator; - -/** - *

- * Like {@link GenericItemPreferenceArray} but stores preferences for one user (all user IDs the same) rather - * than one item. - *

- * - *

- * This implementation maintains two parallel arrays, of item IDs and values. The idea is to avoid allocating - * {@link Preference} objects themselves. This saves the per-object overhead of {@link Preference} instances, as well as - * the cost of duplicating the user ID in each one. - *
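- *
- * Concretely (hypothetical figures): 1,000 preferences for one user are held as a single long[1000]
- * of item IDs plus a float[1000] of values and one user ID, instead of 1,000 Preference objects that
- * would each repeat the user ID and add per-object overhead.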

- * - * @see BooleanUserPreferenceArray - * @see GenericItemPreferenceArray - * @see GenericPreference - */ -public final class GenericUserPreferenceArray implements PreferenceArray { - - private static final int ITEM = 1; - private static final int VALUE = 2; - private static final int VALUE_REVERSED = 3; - - private final long[] ids; - private long id; - private final float[] values; - - public GenericUserPreferenceArray(int size) { - this.ids = new long[size]; - values = new float[size]; - this.id = Long.MIN_VALUE; // as a sort of 'unspecified' value - } - - public GenericUserPreferenceArray(List prefs) { - this(prefs.size()); - int size = prefs.size(); - long userID = Long.MIN_VALUE; - for (int i = 0; i < size; i++) { - Preference pref = prefs.get(i); - if (i == 0) { - userID = pref.getUserID(); - } else { - if (userID != pref.getUserID()) { - throw new IllegalArgumentException("Not all user IDs are the same"); - } - } - ids[i] = pref.getItemID(); - values[i] = pref.getValue(); - } - id = userID; - } - - /** - * This is a private copy constructor for clone(). - */ - private GenericUserPreferenceArray(long[] ids, long id, float[] values) { - this.ids = ids; - this.id = id; - this.values = values; - } - - @Override - public int length() { - return ids.length; - } - - @Override - public Preference get(int i) { - return new PreferenceView(i); - } - - @Override - public void set(int i, Preference pref) { - id = pref.getUserID(); - ids[i] = pref.getItemID(); - values[i] = pref.getValue(); - } - - @Override - public long getUserID(int i) { - return id; - } - - /** - * {@inheritDoc} - * - * Note that this method will actually set the user ID for all preferences. - */ - @Override - public void setUserID(int i, long userID) { - id = userID; - } - - @Override - public long getItemID(int i) { - return ids[i]; - } - - @Override - public void setItemID(int i, long itemID) { - ids[i] = itemID; - } - - /** - * @return all item IDs - */ - @Override - public long[] getIDs() { - return ids; - } - - @Override - public float getValue(int i) { - return values[i]; - } - - @Override - public void setValue(int i, float value) { - values[i] = value; - } - - @Override - public void sortByUser() { } - - @Override - public void sortByItem() { - lateralSort(ITEM); - } - - @Override - public void sortByValue() { - lateralSort(VALUE); - } - - @Override - public void sortByValueReversed() { - lateralSort(VALUE_REVERSED); - } - - @Override - public boolean hasPrefWithUserID(long userID) { - return id == userID; - } - - @Override - public boolean hasPrefWithItemID(long itemID) { - for (long id : ids) { - if (itemID == id) { - return true; - } - } - return false; - } - - private void lateralSort(int type) { - //Comb sort: http://en.wikipedia.org/wiki/Comb_sort - int length = length(); - int gap = length; - boolean swapped = false; - while (gap > 1 || swapped) { - if (gap > 1) { - gap /= 1.247330950103979; // = 1 / (1 - 1/e^phi) - } - swapped = false; - int max = length - gap; - for (int i = 0; i < max; i++) { - int other = i + gap; - if (isLess(other, i, type)) { - swap(i, other); - swapped = true; - } - } - } - } - - private boolean isLess(int i, int j, int type) { - switch (type) { - case ITEM: - return ids[i] < ids[j]; - case VALUE: - return values[i] < values[j]; - case VALUE_REVERSED: - return values[i] > values[j]; - default: - throw new IllegalStateException(); - } - } - - private void swap(int i, int j) { - long temp1 = ids[i]; - float temp2 = values[i]; - ids[i] = ids[j]; - values[i] = values[j]; - ids[j] = 
temp1; - values[j] = temp2; - } - - @Override - public GenericUserPreferenceArray clone() { - return new GenericUserPreferenceArray(ids.clone(), id, values.clone()); - } - - @Override - public int hashCode() { - return (int) (id >> 32) ^ (int) id ^ Arrays.hashCode(ids) ^ Arrays.hashCode(values); - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof GenericUserPreferenceArray)) { - return false; - } - GenericUserPreferenceArray otherArray = (GenericUserPreferenceArray) other; - return id == otherArray.id && Arrays.equals(ids, otherArray.ids) && Arrays.equals(values, otherArray.values); - } - - @Override - public Iterator iterator() { - return Iterators.transform(new CountingIterator(length()), - new Function() { - @Override - public Preference apply(Integer from) { - return new PreferenceView(from); - } - }); - } - - @Override - public String toString() { - if (ids == null || ids.length == 0) { - return "GenericUserPreferenceArray[{}]"; - } - StringBuilder result = new StringBuilder(20 * ids.length); - result.append("GenericUserPreferenceArray[userID:"); - result.append(id); - result.append(",{"); - for (int i = 0; i < ids.length; i++) { - if (i > 0) { - result.append(','); - } - result.append(ids[i]); - result.append('='); - result.append(values[i]); - } - result.append("}]"); - return result.toString(); - } - - private final class PreferenceView implements Preference { - - private final int i; - - private PreferenceView(int i) { - this.i = i; - } - - @Override - public long getUserID() { - return GenericUserPreferenceArray.this.getUserID(i); - } - - @Override - public long getItemID() { - return GenericUserPreferenceArray.this.getItemID(i); - } - - @Override - public float getValue() { - return values[i]; - } - - @Override - public void setValue(float value) { - values[i] = value; - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java deleted file mode 100644 index 97f033e02..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.model; - -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.model.UpdatableIDMigrator; - -/** - * Implementation which stores the reverse long-to-String mapping in memory. - */ -public final class MemoryIDMigrator extends AbstractIDMigrator implements UpdatableIDMigrator { - - private final FastByIDMap longToString; - - public MemoryIDMigrator() { - this.longToString = new FastByIDMap(100); - } - - @Override - public void storeMapping(long longID, String stringID) { - synchronized (longToString) { - longToString.put(longID, stringID); - } - } - - @Override - public String toStringID(long longID) { - synchronized (longToString) { - return longToString.get(longID); - } - } - - @Override - public void initialize(Iterable stringIDs) { - for (String stringID : stringIDs) { - storeMapping(toLongID(stringID), stringID); - } - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java deleted file mode 100644 index d4fd316e5..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.model; - -import javax.sql.DataSource; - -/** - *

- * An implementation for MySQL. The following statement would create a table suitable for use with this class: - *

- * - *

- * - *

- * CREATE TABLE taste_id_migration (
- *   long_id BIGINT NOT NULL PRIMARY KEY,
- *   string_id VARCHAR(255) NOT NULL UNIQUE
- * )
- * 
- * - *
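- *
- * A usage sketch (the DataSource construction is assumed here and is not part of the original Javadoc):
- *
- * DataSource dataSource = ...; // configured to point at the schema above
- * MySQLJDBCIDMigrator migrator = new MySQLJDBCIDMigrator(dataSource);
- * long longID = migrator.toLongID("user-abc");   // hash the string ID
- * migrator.storeMapping(longID, "user-abc");     // persist the reverse mapping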

- * - *

- * Separately, note that in a MySQL database, the following function calls will convert a string value into a - * numeric value in the same way that the standard implementations in this package do. This may be useful in - * writing SQL statements for use with - * {@code AbstractJDBCDataModel} subclasses which convert string - * column values to appropriate numeric values -- though this should be viewed as a temporary arrangement - * since it will impact performance: - *

- * - *

- * {@code cast(conv(substring(md5([column name]), 1, 16),16,10) as signed)} - *
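- *
- * For reference, a sketch of the Java-side conversion this mirrors (assuming the MD5-based scheme of the
- * standard implementations, where the first 8 bytes of the MD5 digest form the long ID; {@code stringID}
- * is a placeholder name):
- *
- * MessageDigest md5 = MessageDigest.getInstance("MD5");
- * byte[] hash = md5.digest(stringID.getBytes(Charset.forName("UTF-8")));
- * long longID = 0L;
- * for (int i = 0; i < 8; i++) {
- *   longID = longID << 8 | (hash[i] & 0xFF);
- * }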

- */ -public final class MySQLJDBCIDMigrator extends AbstractJDBCIDMigrator { - - public MySQLJDBCIDMigrator(DataSource dataSource) { - this(dataSource, DEFAULT_MAPPING_TABLE, - DEFAULT_LONG_ID_COLUMN, DEFAULT_STRING_ID_COLUMN); - } - - public MySQLJDBCIDMigrator(DataSource dataSource, - String mappingTable, - String longIDColumn, - String stringIDColumn) { - super(dataSource, - "SELECT " + stringIDColumn + " FROM " + mappingTable + " WHERE " + longIDColumn + "=?", - "INSERT IGNORE INTO " + mappingTable + " (" + longIDColumn + ',' + stringIDColumn + ") VALUES (?,?)"); - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java deleted file mode 100644 index 11eb295a7..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.model; - -import com.google.common.base.Preconditions; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Queue; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentLinkedQueue; -import org.apache.mahout.cf.taste.common.NoSuchItemException; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; - -/** - *

- * This is a special thread-safe version of {@link PlusAnonymousUserDataModel} - * which allows multiple concurrent anonymous requests. - *

- * - *

- * To use it, you have to estimate the maximum number of concurrent anonymous users in your application. - * A pool of users of that size will be created. For each anonymous recommendation request, - * a user has to be taken from the pool and returned immediately afterwards. - *

- * - *

- * If no more users are available in the pool, anonymous recommendations cannot be produced. - *
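- *
- * Callers should therefore check for null (the handling shown here is illustrative only):
- *
- * Long anonymousUserID = plusModel.takeAvailableUser();
- * if (anonymousUserID == null) {
- *   // pool exhausted: fail fast, queue, or fall back to non-personalized results
- * }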

- * - *

- * - * Setup: - *
- * int concurrentUsers = 100;
- * DataModel realModel = ..
- * PlusAnonymousConcurrentUserDataModel plusModel =
- *   new PlusAnonymousConcurrentUserDataModel(realModel, concurrentUsers);
- * Recommender recommender = ...;
- * 
- * - * Real-time recommendation: - *
- * PlusAnonymousConcurrentUserDataModel plusModel =
- *   (PlusAnonymousConcurrentUserDataModel) recommender.getDataModel();
- *
- * // Take the next available anonymous user from the pool
- * Long anonymousUserID = plusModel.takeAvailableUser();
- *
- * PreferenceArray tempPrefs = ..
- * tempPrefs.setUserID(0, anonymousUserID);
- * tempPrefs.setItemID(0, itemID);
- * plusModel.setTempPrefs(tempPrefs, anonymousUserID);
- *
- * // Produce recommendations
- * recommender.recommend(anonymousUserID, howMany);
- *
- * // It is very IMPORTANT to release the user back to the pool
- * plusModel.releaseUser(anonymousUserID);
- * 
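- *
- * Because releasing the user is essential, a safer shape for the same calls (illustrative only) is:
- *
- * Long anonymousUserID = plusModel.takeAvailableUser();
- * try {
- *   plusModel.setTempPrefs(tempPrefs, anonymousUserID);
- *   recommender.recommend(anonymousUserID, howMany);
- * } finally {
- *   plusModel.releaseUser(anonymousUserID);
- * }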
- * - *

- */
-public final class PlusAnonymousConcurrentUserDataModel extends PlusAnonymousUserDataModel {
-
-  /** Preferences for all anonymous users */
-  private final Map<Long,PreferenceArray> tempPrefs;
-  /** Item IDs set for all anonymous users */
-  private final Map<Long,FastIDSet> prefItemIDs;
-  /** Pool of the users (FIFO) */
-  private Queue<Long> usersPool;
-
-  /**
-   * @param delegate Real model where anonymous users will be added to
-   * @param maxConcurrentUsers Maximum allowed number of concurrent anonymous users
-   */
-  public PlusAnonymousConcurrentUserDataModel(DataModel delegate, int maxConcurrentUsers) {
-    super(delegate);
-
-    tempPrefs = new ConcurrentHashMap<Long,PreferenceArray>();
-    prefItemIDs = new ConcurrentHashMap<Long,FastIDSet>();
-
-    initializeUsersPools(maxConcurrentUsers);
-  }
-
-  /**
-   * Initialize the pool of concurrent anonymous users.
-   *
-   * @param usersPoolSize Maximum allowed number of concurrent anonymous users. Depends on the consumer system.
-   */
-  private void initializeUsersPools(int usersPoolSize) {
-    usersPool = new ConcurrentLinkedQueue<Long>();
-    for (int i = 0; i < usersPoolSize; i++) {
-      usersPool.add(TEMP_USER_ID + i);
-    }
-  }
-
-  /**
-   * Take the next available concurrent anonymous user from the pool.
-   *
-   * @return User ID or null if no more users are available
-   */
-  public Long takeAvailableUser() {
-    Long takenUserID = usersPool.poll();
-    if (takenUserID != null) {
-      // Initialize the preferences array to indicate that the user is taken.
-      tempPrefs.put(takenUserID, new GenericUserPreferenceArray(0));
-      return takenUserID;
-    }
-    return null;
-  }
-
-  /**
-   * Release a previously taken anonymous user and return it to the pool.
-   *
-   * @param userID ID of a previously taken anonymous user
-   * @return true if the user was previously taken, false otherwise
-   */
-  public boolean releaseUser(Long userID) {
-    if (tempPrefs.containsKey(userID)) {
-      this.clearTempPrefs(userID);
-      // Return the previously taken user to the pool
-      usersPool.offer(userID);
-      return true;
-    }
-    return false;
-  }
-
-  /**
-   * Checks whether a given user is a valid previously acquired anonymous user.
-   */
-  private boolean isAnonymousUser(long userID) {
-    return tempPrefs.containsKey(userID);
-  }
-
-  /**
-   * Sets temporary preferences for a given anonymous user.
-   */
-  public void setTempPrefs(PreferenceArray prefs, long anonymousUserID) {
-    Preconditions.checkArgument(prefs != null && prefs.length() > 0, "prefs is null or empty");
-
-    this.tempPrefs.put(anonymousUserID, prefs);
-    FastIDSet userPrefItemIDs = new FastIDSet();
-
-    for (int i = 0; i < prefs.length(); i++) {
-      userPrefItemIDs.add(prefs.getItemID(i));
-    }
-
-    this.prefItemIDs.put(anonymousUserID, userPrefItemIDs);
-  }
-
-  /**
-   * Clears temporary preferences for a given anonymous user.
-   */
-  public void clearTempPrefs(long anonymousUserID) {
-    this.tempPrefs.remove(anonymousUserID);
-    this.prefItemIDs.remove(anonymousUserID);
-  }
-
-  @Override
-  public LongPrimitiveIterator getUserIDs() throws TasteException {
-    // Anonymous users have short lifetimes and should not be included in the neighborhoods of the real users.
-    // Thus exclude them from the universe.
-    return getDelegate().getUserIDs();
-  }
-
-  @Override
-  public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
-    if (isAnonymousUser(userID)) {
-      return tempPrefs.get(userID);
-    }
-    return getDelegate().getPreferencesFromUser(userID);
-  }
-
-  @Override
-  public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
-    if (isAnonymousUser(userID)) {
-      return prefItemIDs.get(userID);
-    }
-    return getDelegate().getItemIDsFromUser(userID);
-  }
-
-  @Override
-  public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
-    if (tempPrefs.isEmpty()) {
-      return getDelegate().getPreferencesForItem(itemID);
-    }
-
-    PreferenceArray delegatePrefs = null;
-
-    try {
-      delegatePrefs = getDelegate().getPreferencesForItem(itemID);
-    } catch (NoSuchItemException nsie) {
-      // OK. Probably an item that only the anonymous user has
-    }
-
-    List<Preference> anonymousPreferences = new ArrayList<Preference>();
-
-    for (Map.Entry<Long,PreferenceArray> prefsMap : tempPrefs.entrySet()) {
-      PreferenceArray singleUserTempPrefs = prefsMap.getValue();
-      for (int i = 0; i < singleUserTempPrefs.length(); i++) {
-        if (singleUserTempPrefs.getItemID(i) == itemID) {
-          anonymousPreferences.add(singleUserTempPrefs.get(i));
-        }
-      }
-    }
-
-    int delegateLength = delegatePrefs == null ? 0 : delegatePrefs.length();
-    int anonymousPrefsLength = anonymousPreferences.size();
-    int prefsCounter = 0;
-
-    // Merge the delegate and anonymous preferences into a single array
-    PreferenceArray newPreferenceArray = new GenericItemPreferenceArray(delegateLength + anonymousPrefsLength);
-
-    for (int i = 0; i < delegateLength; i++) {
-      newPreferenceArray.set(prefsCounter++, delegatePrefs.get(i));
-    }
-
-    for (Preference anonymousPreference : anonymousPreferences) {
-      newPreferenceArray.set(prefsCounter++, anonymousPreference);
-    }
-
-    if (newPreferenceArray.length() == 0) {
-      // No, didn't find it among the anonymous user prefs
-      throw new NoSuchItemException(itemID);
-    }
-
-    return newPreferenceArray;
-  }
-
-  @Override
-  public Float getPreferenceValue(long userID, long itemID) throws TasteException {
-    if (isAnonymousUser(userID)) {
-      PreferenceArray singleUserTempPrefs = tempPrefs.get(userID);
-      for (int i = 0; i < singleUserTempPrefs.length(); i++) {
-        if (singleUserTempPrefs.getItemID(i) == itemID) {
-          return singleUserTempPrefs.getValue(i);
-        }
-      }
-      return null;
-    }
-    return getDelegate().getPreferenceValue(userID, itemID);
-  }
-
-  @Override
-  public Long getPreferenceTime(long userID, long itemID) throws TasteException {
-    if (isAnonymousUser(userID)) {
-      // Timestamps are not saved for anonymous preferences
-      return null;
-    }
-    return getDelegate().getPreferenceTime(userID, itemID);
-  }
-
-  @Override
-  public int getNumUsers() throws TasteException {
-    // Anonymous users have short lifetimes and should not be included in the neighborhoods of the real users.
-    // Thus exclude them from the universe.
-    return getDelegate().getNumUsers();
-  }
-
-  @Override
-  public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
-    if (tempPrefs.isEmpty()) {
-      return getDelegate().getNumUsersWithPreferenceFor(itemID);
-    }
-
-    int countAnonymousUsersWithPreferenceFor = 0;
-
-    for (Map.Entry<Long,PreferenceArray> singleUserTempPrefs : tempPrefs.entrySet()) {
-      for (int i = 0; i < singleUserTempPrefs.getValue().length(); i++) {
-        if (singleUserTempPrefs.getValue().getItemID(i) == itemID) {
-          countAnonymousUsersWithPreferenceFor++;
-          break;
-        }
-      }
-    }
-    return getDelegate().getNumUsersWithPreferenceFor(itemID) + countAnonymousUsersWithPreferenceFor;
-  }
-
-  @Override
-  public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
-    if (tempPrefs.isEmpty()) {
-      return getDelegate().getNumUsersWithPreferenceFor(itemID1, itemID2);
-    }
-
-    int countAnonymousUsersWithPreferenceFor = 0;
-
-    for (Map.Entry<Long,PreferenceArray> singleUserTempPrefs : tempPrefs.entrySet()) {
-      boolean found1 = false;
-      boolean found2 = false;
-      for (int i = 0; i < singleUserTempPrefs.getValue().length() && !(found1 && found2); i++) {
-        long itemID = singleUserTempPrefs.getValue().getItemID(i);
-        if (itemID == itemID1) {
-          found1 = true;
-        }
-        if (itemID == itemID2) {
-          found2 = true;
-        }
-      }
-
-      if (found1 && found2) {
-        countAnonymousUsersWithPreferenceFor++;
-      }
-    }
-
-    return getDelegate().getNumUsersWithPreferenceFor(itemID1, itemID2) + countAnonymousUsersWithPreferenceFor;
-  }
-
-  @Override
-  public void setPreference(long userID, long itemID, float value) throws TasteException {
-    if (isAnonymousUser(userID)) {
-      throw new UnsupportedOperationException();
-    }
-    getDelegate().setPreference(userID, itemID, value);
-  }
-
-  @Override
-  public void removePreference(long userID, long itemID) throws TasteException {
-    if (isAnonymousUser(userID)) {
-      throw new UnsupportedOperationException();
-    }
-    getDelegate().removePreference(userID, itemID);
-  }
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java
deleted file mode 100644
index e21fb6729..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java
+++ /dev/null
@@ -1,313 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model;
-
-import java.util.Collection;
-
-import org.apache.mahout.cf.taste.common.NoSuchItemException;
-import org.apache.mahout.cf.taste.common.NoSuchUserException;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-
-import com.google.common.base.Preconditions;
-
-/**
- *

- * This {@link DataModel} decorator class is useful in a situation where you wish to recommend to a user
- * that doesn't really exist yet in your actual {@link DataModel}. For example, maybe you wish to recommend
- * DVDs to a user who has browsed a few titles on your DVD store site but is not yet registered.
- *

- * - *

- * This class enables you to temporarily add an anonymous user to an existing {@link DataModel} so that
- * recommenders can produce recommendations for that user anyway. To do so, wrap your real implementation
- * in this class:
- *

- * - *

- * - *

- * DataModel realModel = ...;
- * DataModel plusModel = new PlusAnonymousUserDataModel(realModel);
- * ...
- * ItemSimilarity similarity = new LogLikelihoodSimilarity(realModel); // not plusModel
- * 
- * - *

- * - *

- * You may continue to use {@code realModel} as input to other components. To recommend, first construct
- * and set the temporary user information on the model, then simply call the recommender, as in the example
- * below. The {@code synchronized} block exists to remind you that this is of course not thread-safe: only
- * one set of temp data can be inserted into the model and used at one time.
- *

- * - *

- * - *

- * Recommender recommender = ...;
- * ...
- * synchronized(...) {
- *   PreferenceArray tempPrefs = ...;
- *   plusModel.setTempPrefs(tempPrefs);
- *   recommender.recommend(PlusAnonymousUserDataModel.TEMP_USER_ID, 10);
- *   plusModel.clearTempPrefs();
- * }
- * 
- * - *
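A slightly fuller sketch of the same flow, assuming the Taste API (the item IDs and values are illustrative only; note that setTempPrefs rejects null, so clearing must go through clearTempPrefs):

    // Sketch only: one-shot anonymous recommendation with the single-user model.
    PreferenceArray tempPrefs = new GenericUserPreferenceArray(2);
    tempPrefs.setUserID(0, PlusAnonymousUserDataModel.TEMP_USER_ID);
    tempPrefs.setItemID(0, 123L);
    tempPrefs.setValue(0, 5.0f);
    tempPrefs.setItemID(1, 456L);
    tempPrefs.setValue(1, 3.0f);
    synchronized (plusModel) {
      plusModel.setTempPrefs(tempPrefs);
      List<RecommendedItem> items = recommender.recommend(PlusAnonymousUserDataModel.TEMP_USER_ID, 10);
      plusModel.clearTempPrefs(); // never pass null to setTempPrefs
    }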

- */ -public class PlusAnonymousUserDataModel implements DataModel { - - public static final long TEMP_USER_ID = Long.MIN_VALUE; - - private final DataModel delegate; - private PreferenceArray tempPrefs; - private final FastIDSet prefItemIDs; - - public PlusAnonymousUserDataModel(DataModel delegate) { - this.delegate = delegate; - this.prefItemIDs = new FastIDSet(); - } - - protected DataModel getDelegate() { - return delegate; - } - - public void setTempPrefs(PreferenceArray prefs) { - Preconditions.checkArgument(prefs != null && prefs.length() > 0, "prefs is null or empty"); - this.tempPrefs = prefs; - this.prefItemIDs.clear(); - for (int i = 0; i < prefs.length(); i++) { - this.prefItemIDs.add(prefs.getItemID(i)); - } - } - - public void clearTempPrefs() { - tempPrefs = null; - prefItemIDs.clear(); - } - - @Override - public LongPrimitiveIterator getUserIDs() throws TasteException { - if (tempPrefs == null) { - return delegate.getUserIDs(); - } - return new PlusAnonymousUserLongPrimitiveIterator(delegate.getUserIDs(), TEMP_USER_ID); - } - - @Override - public PreferenceArray getPreferencesFromUser(long userID) throws TasteException { - if (userID == TEMP_USER_ID) { - if (tempPrefs == null) { - throw new NoSuchUserException(TEMP_USER_ID); - } - return tempPrefs; - } - return delegate.getPreferencesFromUser(userID); - } - - @Override - public FastIDSet getItemIDsFromUser(long userID) throws TasteException { - if (userID == TEMP_USER_ID) { - if (tempPrefs == null) { - throw new NoSuchUserException(TEMP_USER_ID); - } - return prefItemIDs; - } - return delegate.getItemIDsFromUser(userID); - } - - @Override - public LongPrimitiveIterator getItemIDs() throws TasteException { - return delegate.getItemIDs(); - // Yeah ignoring items that only the plus-one user knows about... can't really happen - } - - @Override - public PreferenceArray getPreferencesForItem(long itemID) throws TasteException { - if (tempPrefs == null) { - return delegate.getPreferencesForItem(itemID); - } - PreferenceArray delegatePrefs = null; - try { - delegatePrefs = delegate.getPreferencesForItem(itemID); - } catch (NoSuchItemException nsie) { - // OK. Probably an item that only the anonymous user has - } - for (int i = 0; i < tempPrefs.length(); i++) { - if (tempPrefs.getItemID(i) == itemID) { - return cloneAndMergeInto(delegatePrefs, itemID, tempPrefs.getUserID(i), tempPrefs.getValue(i)); - } - } - if (delegatePrefs == null) { - // No, didn't find it among the anonymous user prefs - throw new NoSuchItemException(itemID); - } - return delegatePrefs; - } - - private static PreferenceArray cloneAndMergeInto(PreferenceArray delegatePrefs, - long itemID, - long newUserID, - float value) { - - int length = delegatePrefs == null ? 
0 : delegatePrefs.length(); - int newLength = length + 1; - PreferenceArray newPreferenceArray = new GenericItemPreferenceArray(newLength); - - // Set item ID once - newPreferenceArray.setItemID(0, itemID); - - int positionToInsert = 0; - while (positionToInsert < length && newUserID > delegatePrefs.getUserID(positionToInsert)) { - positionToInsert++; - } - - for (int i = 0; i < positionToInsert; i++) { - newPreferenceArray.setUserID(i, delegatePrefs.getUserID(i)); - newPreferenceArray.setValue(i, delegatePrefs.getValue(i)); - } - newPreferenceArray.setUserID(positionToInsert, newUserID); - newPreferenceArray.setValue(positionToInsert, value); - for (int i = positionToInsert + 1; i < newLength; i++) { - newPreferenceArray.setUserID(i, delegatePrefs.getUserID(i - 1)); - newPreferenceArray.setValue(i, delegatePrefs.getValue(i - 1)); - } - - return newPreferenceArray; - } - - @Override - public Float getPreferenceValue(long userID, long itemID) throws TasteException { - if (userID == TEMP_USER_ID) { - if (tempPrefs == null) { - throw new NoSuchUserException(TEMP_USER_ID); - } - for (int i = 0; i < tempPrefs.length(); i++) { - if (tempPrefs.getItemID(i) == itemID) { - return tempPrefs.getValue(i); - } - } - return null; - } - return delegate.getPreferenceValue(userID, itemID); - } - - @Override - public Long getPreferenceTime(long userID, long itemID) throws TasteException { - if (userID == TEMP_USER_ID) { - if (tempPrefs == null) { - throw new NoSuchUserException(TEMP_USER_ID); - } - return null; - } - return delegate.getPreferenceTime(userID, itemID); - } - - @Override - public int getNumItems() throws TasteException { - return delegate.getNumItems(); - } - - @Override - public int getNumUsers() throws TasteException { - return delegate.getNumUsers() + (tempPrefs == null ? 0 : 1); - } - - @Override - public int getNumUsersWithPreferenceFor(long itemID) throws TasteException { - if (tempPrefs == null) { - return delegate.getNumUsersWithPreferenceFor(itemID); - } - boolean found = false; - for (int i = 0; i < tempPrefs.length(); i++) { - if (tempPrefs.getItemID(i) == itemID) { - found = true; - break; - } - } - return delegate.getNumUsersWithPreferenceFor(itemID) + (found ? 1 : 0); - } - - @Override - public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException { - if (tempPrefs == null) { - return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2); - } - boolean found1 = false; - boolean found2 = false; - for (int i = 0; i < tempPrefs.length() && !(found1 && found2); i++) { - long itemID = tempPrefs.getItemID(i); - if (itemID == itemID1) { - found1 = true; - } - if (itemID == itemID2) { - found2 = true; - } - } - return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2) + (found1 && found2 ? 
1 : 0); - } - - @Override - public void setPreference(long userID, long itemID, float value) throws TasteException { - if (userID == TEMP_USER_ID) { - if (tempPrefs == null) { - throw new NoSuchUserException(TEMP_USER_ID); - } - throw new UnsupportedOperationException(); - } - delegate.setPreference(userID, itemID, value); - } - - @Override - public void removePreference(long userID, long itemID) throws TasteException { - if (userID == TEMP_USER_ID) { - if (tempPrefs == null) { - throw new NoSuchUserException(TEMP_USER_ID); - } - throw new UnsupportedOperationException(); - } - delegate.removePreference(userID, itemID); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - delegate.refresh(alreadyRefreshed); - } - - @Override - public boolean hasPreferenceValues() { - return delegate.hasPreferenceValues(); - } - - @Override - public float getMaxPreference() { - return delegate.getMaxPreference(); - } - - @Override - public float getMinPreference() { - return delegate.getMinPreference(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java deleted file mode 100644 index ea4df856a..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-package org.apache.mahout.cf.taste.impl.model;
-
-import org.apache.mahout.cf.taste.impl.common.AbstractLongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-
-final class PlusAnonymousUserLongPrimitiveIterator extends AbstractLongPrimitiveIterator {
-
-  private final LongPrimitiveIterator delegate;
-  private final long extraDatum;
-  private boolean datumConsumed;
-
-  PlusAnonymousUserLongPrimitiveIterator(LongPrimitiveIterator delegate, long extraDatum) {
-    this.delegate = delegate;
-    this.extraDatum = extraDatum;
-    datumConsumed = false;
-  }
-
-  @Override
-  public long nextLong() {
-    if (datumConsumed) {
-      return delegate.nextLong();
-    } else {
-      if (delegate.hasNext()) {
-        long delegateNext = delegate.peek();
-        if (extraDatum <= delegateNext) {
-          datumConsumed = true;
-          return extraDatum;
-        } else {
-          return delegate.next();
-        }
-      } else {
-        datumConsumed = true;
-        return extraDatum;
-      }
-    }
-  }
-
-  @Override
-  public long peek() {
-    if (datumConsumed) {
-      return delegate.peek();
-    } else {
-      if (delegate.hasNext()) {
-        long delegateNext = delegate.peek();
-        if (extraDatum <= delegateNext) {
-          return extraDatum;
-        } else {
-          return delegateNext;
-        }
-      } else {
-        return extraDatum;
-      }
-    }
-  }
-
-  @Override
-  public boolean hasNext() {
-    return !datumConsumed || delegate.hasNext();
-  }
-
-  @Override
-  public void remove() {
-    throw new UnsupportedOperationException();
-  }
-
-  @Override
-  public void skip(int n) {
-    for (int i = 0; i < n; i++) {
-      nextLong();
-    }
-  }
-
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
deleted file mode 100644
index 78df37cc2..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
+++ /dev/null
@@ -1,728 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.file;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-import java.util.concurrent.locks.ReentrantLock;
-
-import com.google.common.base.Splitter;
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.model.AbstractDataModel;
-import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel;
-import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
-import org.apache.mahout.cf.taste.impl.model.GenericPreference;
-import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.iterator.FileLineIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.base.Preconditions;
-
-/**
- *

- * A {@link DataModel} backed by a delimited file. This class expects a file where each line
- * contains a user ID, followed by an item ID, followed by an optional preference value, followed by an
- * optional timestamp. Commas or tabs delimit fields:
- *

- * - *

- * {@code userID,itemID[,preference[,timestamp]]}

- * - *

- * The preference value is optional, to accommodate applications that have no notion of a
- * preference value (that is, the user simply expresses a
- * preference for an item, but no degree of preference).
- *
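As an illustration (file contents hypothetical), a Boolean-preference input simply omits the third field; per the constructor logic further below, such a file is served by a Boolean-preference delegate model:

    # userID,itemID -- no preference values at all
    1,101
    1,102
    2,101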

- * - *

- * The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are
- * parsed as {@code long}s. The timestamp, if present, is assumed to be parseable as a
- * {@code long}, though this can be overridden via {@link #readTimestampFromString(String)}.
- * The preference value may be empty, to indicate "no preference value", but it cannot be omitted
- * when a timestamp is present. That is, this is legal:
- *

- * - *

- * {@code 123,456,,129050099059}

- * - *

- * But this isn't:

- * - *

- * {@code 123,456,129050099059}

- * - *

- * It is also acceptable for the lines to contain additional fields. Fields beyond the optional
- * timestamp are ignored. An empty line, or one that begins with '#', is ignored as a comment.
- *
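Putting the format rules together, a small valid file and its use might look like this (path and contents hypothetical):

    # ratings.csv -- userID,itemID,preference[,timestamp]
    1,101,5.0,1290500990000
    1,102,3.0,1290500991000
    2,101,2.5,1290500992000

Reading it is then a one-liner (a sketch; the constructor throws IOException if the file is missing or unreadable):

    DataModel model = new FileDataModel(new File("ratings.csv"));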

- * - *

- * This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file
- * has been reloaded very recently already.
- *
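So a caller that knows the file changed can simply invoke refresh; per the implementation below, this is a no-op when the file is unchanged or was read within the minimum reload interval:

    model.refresh(null); // reloads only if the file (or an update file) is newer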

- * - *

- * This class will also look for update "delta" files in the same directory, with file names that start the
- * same way (up to the first period). These files have the same format, and provide updated data that
- * supersedes what is in the main data file. This is a mechanism that allows an application to push updates
- * without re-copying the entire data file.
- *

- * - *

- * One small format difference exists. Update files must also be able to express deletes.
- * This is done with a line that ends in a blank preference value, as in "123,456,".
- *
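A hypothetical layout of the delta mechanism (names follow the same-prefix-up-to-the-first-period rule described here):

    data.txt      main file:    1,101,5.0
    data.1.txt    update file:  1,101,4.0    (supersedes the rating above)
    data.2.txt    update file:  1,101,       (blank value: deletes the preference)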

- * - *

- * Note that it's all-or-nothing -- all of the items in the file must express no preference, or they all
- * must. These cannot be mixed. Put another way, there will always be the same number of delimiters on every
- * line of the file!
- *

- * - *

- * This class is not intended for use with very large amounts of data (over, say, tens of millions of rows).
- * For that, a JDBC-backed {@link DataModel} and a database are more appropriate.
- *

- * - *

- * It is possible and likely useful to subclass this class and customize its behavior to accommodate - * application-specific needs and input formats. See {@link #processLine(String, FastByIDMap, FastByIDMap, boolean)} and - * {@link #processLineWithoutID(String, FastByIDMap, FastByIDMap)} - */ -public class FileDataModel extends AbstractDataModel { - - private static final Logger log = LoggerFactory.getLogger(FileDataModel.class); - - public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute? - private static final char COMMENT_CHAR = '#'; - private static final char[] DELIMIETERS = {',', '\t'}; - - private final File dataFile; - private long lastModified; - private long lastUpdateFileModified; - private final char delimiter; - private final Splitter delimiterPattern; - private final boolean hasPrefValues; - private DataModel delegate; - private final ReentrantLock reloadLock; - private final boolean transpose; - private final long minReloadIntervalMS; - - /** - * @param dataFile - * file containing preferences data. If file is compressed (and name ends in .gz or .zip - * accordingly) it will be decompressed as it is read) - * @throws FileNotFoundException - * if dataFile does not exist - * @throws IOException - * if file can't be read - */ - public FileDataModel(File dataFile) throws IOException { - this(dataFile, false, DEFAULT_MIN_RELOAD_INTERVAL_MS); - } - - /** - * @param transpose - * transposes user IDs and item IDs -- convenient for 'flipping' the data model this way - * @param minReloadIntervalMS - * the minimum interval in milliseconds after which a full reload of the original datafile is done - * when refresh() is called - * @see #FileDataModel(File) - */ - public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS) throws IOException { - this.dataFile = Preconditions.checkNotNull(dataFile.getAbsoluteFile()); - if (!dataFile.exists() || dataFile.isDirectory()) { - throw new FileNotFoundException(dataFile.toString()); - } - Preconditions.checkArgument(dataFile.length() > 0L, "dataFile is empty"); - Preconditions.checkArgument(minReloadIntervalMS >= 0L, "minReloadIntervalMs must be non-negative"); - - log.info("Creating FileDataModel for file {}", dataFile); - - this.lastModified = dataFile.lastModified(); - this.lastUpdateFileModified = readLastUpdateFileModified(); - - FileLineIterator iterator = new FileLineIterator(dataFile, false); - String firstLine = iterator.peek(); - while (firstLine.isEmpty() || firstLine.charAt(0) == COMMENT_CHAR) { - iterator.next(); - firstLine = iterator.peek(); - } - Closeables.closeQuietly(iterator); - - delimiter = determineDelimiter(firstLine); - delimiterPattern = Splitter.on(delimiter); - List firstLineSplit = Lists.newArrayList(); - for (String token : delimiterPattern.split(firstLine)) { - firstLineSplit.add(token); - } - // If preference value exists and isn't empty then the file is specifying pref values - hasPrefValues = firstLineSplit.size() >= 3 && !firstLineSplit.get(2).isEmpty(); - - this.reloadLock = new ReentrantLock(); - this.transpose = transpose; - this.minReloadIntervalMS = minReloadIntervalMS; - - reload(); - } - - public File getDataFile() { - return dataFile; - } - - public char getDelimiter() { - return delimiter; - } - - protected void reload() { - if (reloadLock.tryLock()) { - try { - delegate = buildModel(); - } catch (IOException ioe) { - log.warn("Exception while reloading", ioe); - } finally { - reloadLock.unlock(); - } - } - } - - protected DataModel buildModel() throws 
IOException { - - long newLastModified = dataFile.lastModified(); - long newLastUpdateFileModified = readLastUpdateFileModified(); - - boolean loadFreshData = delegate == null || newLastModified > lastModified + minReloadIntervalMS; - - long oldLastUpdateFileModifieid = lastUpdateFileModified; - lastModified = newLastModified; - lastUpdateFileModified = newLastUpdateFileModified; - - FastByIDMap> timestamps = new FastByIDMap>(); - - if (hasPrefValues) { - - if (loadFreshData) { - - FastByIDMap> data = new FastByIDMap>(); - FileLineIterator iterator = new FileLineIterator(dataFile, false); - processFile(iterator, data, timestamps, false); - - for (File updateFile : findUpdateFilesAfter(newLastModified)) { - processFile(new FileLineIterator(updateFile, false), data, timestamps, false); - } - - return new GenericDataModel(GenericDataModel.toDataMap(data, true), timestamps); - - } else { - - FastByIDMap rawData = ((GenericDataModel) delegate).getRawUserData(); - - for (File updateFile : findUpdateFilesAfter(Math.max(oldLastUpdateFileModifieid, newLastModified))) { - processFile(new FileLineIterator(updateFile, false), rawData, timestamps, true); - } - - return new GenericDataModel(rawData, timestamps); - - } - - } else { - - if (loadFreshData) { - - FastByIDMap data = new FastByIDMap(); - FileLineIterator iterator = new FileLineIterator(dataFile, false); - processFileWithoutID(iterator, data, timestamps); - - for (File updateFile : findUpdateFilesAfter(newLastModified)) { - processFileWithoutID(new FileLineIterator(updateFile, false), data, timestamps); - } - - return new GenericBooleanPrefDataModel(data, timestamps); - - } else { - - FastByIDMap rawData = ((GenericBooleanPrefDataModel) delegate).getRawUserData(); - - for (File updateFile : findUpdateFilesAfter(Math.max(oldLastUpdateFileModifieid, newLastModified))) { - processFileWithoutID(new FileLineIterator(updateFile, false), rawData, timestamps); - } - - return new GenericBooleanPrefDataModel(rawData, timestamps); - - } - - } - } - - /** - * Finds update delta files in the same directory as the data file. This finds any file whose name starts - * the same way as the data file (up to first period) but isn't the data file itself. For example, if the - * data file is /foo/data.txt.gz, you might place update files at /foo/data.1.txt.gz, /foo/data.2.txt.gz, - * etc. - */ - private Iterable findUpdateFilesAfter(long minimumLastModified) { - String dataFileName = dataFile.getName(); - int period = dataFileName.indexOf('.'); - String startName = period < 0 ? 
dataFileName : dataFileName.substring(0, period); - File parentDir = dataFile.getParentFile(); - Map modTimeToUpdateFile = new TreeMap(); - for (File updateFile : parentDir.listFiles()) { - String updateFileName = updateFile.getName(); - if (updateFileName.startsWith(startName) - && !updateFileName.equals(dataFileName) - && updateFile.lastModified() >= minimumLastModified) { - modTimeToUpdateFile.put(updateFile.lastModified(), updateFile); - } - } - return modTimeToUpdateFile.values(); - } - - private long readLastUpdateFileModified() { - long mostRecentModification = Long.MIN_VALUE; - for (File updateFile : findUpdateFilesAfter(0L)) { - mostRecentModification = Math.max(mostRecentModification, updateFile.lastModified()); - } - return mostRecentModification; - } - - public static char determineDelimiter(String line) { - for (char possibleDelimieter : DELIMIETERS) { - if (line.indexOf(possibleDelimieter) >= 0) { - return possibleDelimieter; - } - } - throw new IllegalArgumentException("Did not find a delimiter in first line"); - } - - protected void processFile(FileLineIterator dataOrUpdateFileIterator, - FastByIDMap data, - FastByIDMap> timestamps, - boolean fromPriorData) { - log.info("Reading file info..."); - int count = 0; - while (dataOrUpdateFileIterator.hasNext()) { - String line = dataOrUpdateFileIterator.next(); - if (!line.isEmpty()) { - processLine(line, data, timestamps, fromPriorData); - if (++count % 1000000 == 0) { - log.info("Processed {} lines", count); - } - } - } - log.info("Read lines: {}", count); - } - - /** - *

- * Reads one line from the input file and adds the data to a {@link FastByIDMap} data structure which maps
- * user IDs to preferences. This assumes that each line of the input file corresponds to one preference. After
- * reading a line and determining which user and item the preference pertains to, the method should look to
- * see if the data already contains a mapping for the user ID, and if not, add an empty data structure of
- * preferences as appropriate to the data.
- *

- * - *

- * Note that if the line is empty or begins with '#' it will be ignored as a comment. - *

- * - * @param line - * line from input data file - * @param data - * all data read so far, as a mapping from user IDs to preferences - * @param fromPriorData an implementation detail -- if true, data will map IDs to - * {@link PreferenceArray} since the framework is attempting to read and update raw - * data that is already in memory. Otherwise it maps to {@link Collection}s of - * {@link Preference}s, since it's reading fresh data. Subclasses must be prepared - * to handle this wrinkle. - */ - protected void processLine(String line, - FastByIDMap data, - FastByIDMap> timestamps, - boolean fromPriorData) { - - // Ignore empty lines and comments - if (line.isEmpty() || line.charAt(0) == COMMENT_CHAR) { - return; - } - - Iterator tokens = delimiterPattern.split(line).iterator(); - String userIDString = tokens.next(); - String itemIDString = tokens.next(); - String preferenceValueString = tokens.next(); - boolean hasTimestamp = tokens.hasNext(); - String timestampString = hasTimestamp ? tokens.next() : null; - - long userID = readUserIDFromString(userIDString); - long itemID = readItemIDFromString(itemIDString); - - if (transpose) { - long tmp = userID; - userID = itemID; - itemID = tmp; - } - - // This is kind of gross but need to handle two types of storage - Object maybePrefs = data.get(userID); - if (fromPriorData) { - // Data are PreferenceArray - - PreferenceArray prefs = (PreferenceArray) maybePrefs; - if (!hasTimestamp && preferenceValueString.isEmpty()) { - // Then line is of form "userID,itemID,", meaning remove - if (prefs != null) { - boolean exists = false; - int length = prefs.length(); - for (int i = 0; i < length; i++) { - if (prefs.getItemID(i) == itemID) { - exists = true; - break; - } - } - if (exists) { - if (length == 1) { - data.remove(userID); - } else { - PreferenceArray newPrefs = new GenericUserPreferenceArray(length - 1); - for (int i = 0, j = 0; i < length; i++, j++) { - if (prefs.getItemID(i) == itemID) { - j--; - } else { - newPrefs.set(j, prefs.get(i)); - } - } - } - } - } - - removeTimestamp(userID, itemID, timestamps); - - } else { - - float preferenceValue = Float.parseFloat(preferenceValueString); - - boolean exists = false; - if (prefs != null) { - for (int i = 0; i < prefs.length(); i++) { - if (prefs.getItemID(i) == itemID) { - exists = true; - prefs.setValue(i, preferenceValue); - break; - } - } - } - - if (!exists) { - if (prefs == null) { - prefs = new GenericUserPreferenceArray(1); - } else { - PreferenceArray newPrefs = new GenericUserPreferenceArray(prefs.length() + 1); - for (int i = 0, j = 1; i < prefs.length(); i++, j++) { - newPrefs.set(j, prefs.get(i)); - } - prefs = newPrefs; - } - prefs.setUserID(0, userID); - prefs.setItemID(0, itemID); - prefs.setValue(0, preferenceValue); - ((FastByIDMap) data).put(userID, prefs); - } - } - - addTimestamp(userID, itemID, timestampString, timestamps); - - } else { - // Data are Collection - - Collection prefs = (Collection) maybePrefs; - - if (!hasTimestamp && preferenceValueString.isEmpty()) { - // Then line is of form "userID,itemID,", meaning remove - if (prefs != null) { - // remove pref - Iterator prefsIterator = prefs.iterator(); - while (prefsIterator.hasNext()) { - Preference pref = prefsIterator.next(); - if (pref.getItemID() == itemID) { - prefsIterator.remove(); - break; - } - } - } - - removeTimestamp(userID, itemID, timestamps); - - } else { - - float preferenceValue = Float.parseFloat(preferenceValueString); - - boolean exists = false; - if (prefs != null) { - for (Preference pref : prefs) { 
- if (pref.getItemID() == itemID) { - exists = true; - pref.setValue(preferenceValue); - break; - } - } - } - - if (!exists) { - if (prefs == null) { - prefs = Lists.newArrayListWithCapacity(2); - ((FastByIDMap>) data).put(userID, prefs); - } - prefs.add(new GenericPreference(userID, itemID, preferenceValue)); - } - - addTimestamp(userID, itemID, timestampString, timestamps); - - } - - } - } - - protected void processFileWithoutID(FileLineIterator dataOrUpdateFileIterator, - FastByIDMap data, - FastByIDMap> timestamps) { - log.info("Reading file info..."); - int count = 0; - while (dataOrUpdateFileIterator.hasNext()) { - String line = dataOrUpdateFileIterator.next(); - if (!line.isEmpty()) { - processLineWithoutID(line, data, timestamps); - if (++count % 100000 == 0) { - log.info("Processed {} lines", count); - } - } - } - log.info("Read lines: {}", count); - } - - protected void processLineWithoutID(String line, - FastByIDMap data, - FastByIDMap> timestamps) { - - if (line.isEmpty() || line.charAt(0) == COMMENT_CHAR) { - return; - } - - Iterator tokens = delimiterPattern.split(line).iterator(); - String userIDString = tokens.next(); - String itemIDString = tokens.next(); - boolean hasPreference = tokens.hasNext(); - String preferenceValueString = hasPreference ? tokens.next() : ""; - boolean hasTimestamp = tokens.hasNext(); - String timestampString = hasTimestamp ? tokens.next() : null; - - long userID = readUserIDFromString(userIDString); - long itemID = readItemIDFromString(itemIDString); - - if (transpose) { - long tmp = userID; - userID = itemID; - itemID = tmp; - } - - if (hasPreference && !hasTimestamp && preferenceValueString.isEmpty()) { - // Then line is of form "userID,itemID,", meaning remove - - FastIDSet itemIDs = data.get(userID); - if (itemIDs != null) { - itemIDs.remove(itemID); - } - - removeTimestamp(userID, itemID, timestamps); - - } else { - - FastIDSet itemIDs = data.get(userID); - if (itemIDs == null) { - itemIDs = new FastIDSet(2); - data.put(userID, itemIDs); - } - itemIDs.add(itemID); - - addTimestamp(userID, itemID, timestampString, timestamps); - - } - } - - private void addTimestamp(long userID, - long itemID, - String timestampString, - FastByIDMap> timestamps) { - if (timestampString != null) { - FastByIDMap itemTimestamps = timestamps.get(userID); - if (itemTimestamps == null) { - itemTimestamps = new FastByIDMap(); - timestamps.put(userID, itemTimestamps); - } - long timestamp = readTimestampFromString(timestampString); - itemTimestamps.put(itemID, timestamp); - } - } - - private static void removeTimestamp(long userID, - long itemID, - FastByIDMap> timestamps) { - FastByIDMap itemTimestamps = timestamps.get(userID); - if (itemTimestamps != null) { - itemTimestamps.remove(itemID); - } - } - - /** - * Subclasses may wish to override this if ID values in the file are not numeric. This provides a hook by - * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform - * translation. - */ - protected long readUserIDFromString(String value) { - return Long.parseLong(value); - } - - /** - * Subclasses may wish to override this if ID values in the file are not numeric. This provides a hook by - * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform - * translation. - */ - protected long readItemIDFromString(String value) { - return Long.parseLong(value); - } - - /** - * Subclasses may wish to override this to change how time values in the input file are parsed. 
- * By default they are expected to be numeric, expressing a time as milliseconds since the epoch. - */ - protected long readTimestampFromString(String value) { - return Long.parseLong(value); - } - - @Override - public LongPrimitiveIterator getUserIDs() throws TasteException { - return delegate.getUserIDs(); - } - - @Override - public PreferenceArray getPreferencesFromUser(long userID) throws TasteException { - return delegate.getPreferencesFromUser(userID); - } - - @Override - public FastIDSet getItemIDsFromUser(long userID) throws TasteException { - return delegate.getItemIDsFromUser(userID); - } - - @Override - public LongPrimitiveIterator getItemIDs() throws TasteException { - return delegate.getItemIDs(); - } - - @Override - public PreferenceArray getPreferencesForItem(long itemID) throws TasteException { - return delegate.getPreferencesForItem(itemID); - } - - @Override - public Float getPreferenceValue(long userID, long itemID) throws TasteException { - return delegate.getPreferenceValue(userID, itemID); - } - - @Override - public Long getPreferenceTime(long userID, long itemID) throws TasteException { - return delegate.getPreferenceTime(userID, itemID); - } - - @Override - public int getNumItems() throws TasteException { - return delegate.getNumItems(); - } - - @Override - public int getNumUsers() throws TasteException { - return delegate.getNumUsers(); - } - - @Override - public int getNumUsersWithPreferenceFor(long itemID) throws TasteException { - return delegate.getNumUsersWithPreferenceFor(itemID); - } - - @Override - public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException { - return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2); - } - - /** - * Note that this method only updates the in-memory preference data that this - * maintains; it does not modify any data on disk. Therefore any updates from this method are only - * temporary, and lost when data is reloaded from a file. This method should also be considered relatively - * slow. - */ - @Override - public void setPreference(long userID, long itemID, float value) throws TasteException { - delegate.setPreference(userID, itemID, value); - } - - /** See the warning at {@link #setPreference(long, long, float)}. 
*/ - @Override - public void removePreference(long userID, long itemID) throws TasteException { - delegate.removePreference(userID, itemID); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - if (dataFile.lastModified() > lastModified + minReloadIntervalMS - || readLastUpdateFileModified() > lastUpdateFileModified + minReloadIntervalMS) { - log.debug("File has changed; reloading..."); - reload(); - } - } - - @Override - public boolean hasPreferenceValues() { - return delegate.hasPreferenceValues(); - } - - @Override - public float getMaxPreference() { - return delegate.getMaxPreference(); - } - - @Override - public float getMinPreference() { - return delegate.getMinPreference(); - } - - @Override - public String toString() { - return "FileDataModel[dataFile:" + dataFile + ']'; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java deleted file mode 100644 index 6a60b5c06..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.model.file; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.Collection; -import java.util.concurrent.locks.ReentrantLock; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.model.AbstractIDMigrator; -import org.apache.mahout.common.iterator.FileLineIterable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *

- * An {@link org.apache.mahout.cf.taste.model.IDMigrator} backed by a file.
- * This class typically expects a file where each line
- * contains a single string ID to be stored in this migrator.
- *
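A minimal usage sketch, assuming a file with one string ID per line (path and ID hypothetical):

    FileIDMigrator migrator = new FileIDMigrator(new File("ids.txt"));
    long longID = migrator.toLongID("user-abc123"); // hashed, via AbstractIDMigrator
    String stringID = migrator.toStringID(longID);  // looked up from the file contents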

- * - *

- * This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file
- * has been reloaded very recently already.
- *

- */
-public class FileIDMigrator extends AbstractIDMigrator {
-
-  public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute?
-
-  private final File dataFile;
-  private FastByIDMap<String> longToString;
-  private final ReentrantLock reloadLock;
-
-  private long lastModified;
-  private final long minReloadIntervalMS;
-
-  private static final Logger log = LoggerFactory.getLogger(FileIDMigrator.class);
-
-  public FileIDMigrator(File dataFile) throws FileNotFoundException {
-    this(dataFile, DEFAULT_MIN_RELOAD_INTERVAL_MS);
-  }
-
-  public FileIDMigrator(File dataFile, long minReloadIntervalMS) throws FileNotFoundException {
-    longToString = new FastByIDMap<String>(100);
-    this.dataFile = Preconditions.checkNotNull(dataFile);
-    if (!dataFile.exists() || dataFile.isDirectory()) {
-      throw new FileNotFoundException(dataFile.toString());
-    }
-
-    log.info("Creating FileReadonlyIDMigrator for file {}", dataFile);
-
-    this.reloadLock = new ReentrantLock();
-    this.lastModified = dataFile.lastModified();
-    this.minReloadIntervalMS = minReloadIntervalMS;
-
-    reload();
-  }
-
-  @Override
-  public String toStringID(long longID) {
-    return longToString.get(longID);
-  }
-
-  private void reload() {
-    if (reloadLock.tryLock()) {
-      try {
-        longToString = buildMapping();
-      } catch (IOException ioe) {
-        throw new IllegalStateException(ioe);
-      } finally {
-        reloadLock.unlock();
-      }
-    }
-  }
-
-  private FastByIDMap<String> buildMapping() throws IOException {
-    FastByIDMap<String> mapping = new FastByIDMap<String>();
-    for (String line : new FileLineIterable(dataFile)) {
-      mapping.put(toLongID(line), line);
-    }
-    lastModified = dataFile.lastModified();
-    return mapping;
-  }
-
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    if (dataFile.lastModified() > lastModified + minReloadIntervalMS) {
-      log.debug("File has changed; reloading...");
-      reload();
-    }
-  }
-
-  @Override
-  public String toString() {
-    return "FileIDMigrator[dataFile:" + dataFile + ']';
-  }
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java
deleted file mode 100644
index 8d33f60f1..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.neighborhood;
-
-import java.util.Collection;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
-import org.apache.mahout.cf.taste.similarity.UserSimilarity;
-
-import com.google.common.base.Preconditions;
-
-/**
- *

- * Contains methods and resources useful to all classes in this package.
- *

- */
-abstract class AbstractUserNeighborhood implements UserNeighborhood {
-
-  private final UserSimilarity userSimilarity;
-  private final DataModel dataModel;
-  private final double samplingRate;
-  private final RefreshHelper refreshHelper;
-
-  AbstractUserNeighborhood(UserSimilarity userSimilarity, DataModel dataModel, double samplingRate) {
-    Preconditions.checkArgument(userSimilarity != null, "userSimilarity is null");
-    Preconditions.checkArgument(dataModel != null, "dataModel is null");
-    Preconditions.checkArgument(samplingRate > 0.0 && samplingRate <= 1.0, "samplingRate must be in (0,1]");
-    this.userSimilarity = userSimilarity;
-    this.dataModel = dataModel;
-    this.samplingRate = samplingRate;
-    this.refreshHelper = new RefreshHelper(null);
-    this.refreshHelper.addDependency(this.dataModel);
-    this.refreshHelper.addDependency(this.userSimilarity);
-  }
-
-  final UserSimilarity getUserSimilarity() {
-    return userSimilarity;
-  }
-
-  final DataModel getDataModel() {
-    return dataModel;
-  }
-
-  final double getSamplingRate() {
-    return samplingRate;
-  }
-
-  @Override
-  public final void refresh(Collection<Refreshable> alreadyRefreshed) {
-    refreshHelper.refresh(alreadyRefreshed);
-  }
-
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java
deleted file mode 100644
index 30a9f50fb..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.neighborhood;
-
-import java.util.Collection;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.Cache;
-import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
-import org.apache.mahout.cf.taste.impl.common.Retriever;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
-
-import com.google.common.base.Preconditions;
-
-/** A caching wrapper around an underlying {@link UserNeighborhood} implementation. */
-public final class CachingUserNeighborhood implements UserNeighborhood {
-
-  private final UserNeighborhood neighborhood;
-  private final Cache<Long,long[]> neighborhoodCache;
-
-  public CachingUserNeighborhood(UserNeighborhood neighborhood, DataModel dataModel) throws TasteException {
-    Preconditions.checkArgument(neighborhood != null, "neighborhood is null");
-    this.neighborhood = neighborhood;
-    int maxCacheSize = dataModel.getNumUsers(); // just a dumb heuristic for sizing
-    this.neighborhoodCache = new Cache<Long,long[]>(new NeighborhoodRetriever(neighborhood), maxCacheSize);
-  }
-
-  @Override
-  public long[] getUserNeighborhood(long userID) throws TasteException {
-    return neighborhoodCache.get(userID);
-  }
-
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    neighborhoodCache.clear();
-    alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
-    RefreshHelper.maybeRefresh(alreadyRefreshed, neighborhood);
-  }
-
-  private static final class NeighborhoodRetriever implements Retriever<Long,long[]> {
-    private final UserNeighborhood neighborhood;
-
-    private NeighborhoodRetriever(UserNeighborhood neighborhood) {
-      this.neighborhood = neighborhood;
-    }
-
-    @Override
-    public long[] get(Long key) throws TasteException {
-      return neighborhood.getUserNeighborhood(key);
-    }
-  }
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java
deleted file mode 100644
index 7f3a98aba..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java
+++ /dev/null
@@ -1,122 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.neighborhood;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.recommender.TopItems;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.similarity.UserSimilarity;
-
-import com.google.common.base.Preconditions;
-
-/**
- *

- * Computes a neighborhood consisting of the nearest n users to a given user. "Nearest" is defined by the
- * given {@link UserSimilarity}.
- *
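A typical wiring of this neighborhood into a user-based recommender (a sketch; the similarity choice is illustrative):

    UserSimilarity similarity = new PearsonCorrelationSimilarity(model);
    UserNeighborhood neighborhood = new NearestNUserNeighborhood(10, similarity, model);
    Recommender recommender = new GenericUserBasedRecommender(model, neighborhood, similarity);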

- */ -public final class NearestNUserNeighborhood extends AbstractUserNeighborhood { - - private final int n; - private final double minSimilarity; - - /** - * @param n neighborhood size; capped at the number of users in the data model - * @throws IllegalArgumentException - * if {@code n < 1}, or userSimilarity or dataModel are {@code null} - */ - public NearestNUserNeighborhood(int n, UserSimilarity userSimilarity, DataModel dataModel) throws TasteException { - this(n, Double.NEGATIVE_INFINITY, userSimilarity, dataModel, 1.0); - } - - /** - * @param n neighborhood size; capped at the number of users in the data model - * @param minSimilarity minimal similarity required for neighbors - * @throws IllegalArgumentException - * if {@code n < 1}, or userSimilarity or dataModel are {@code null} - */ - public NearestNUserNeighborhood(int n, - double minSimilarity, - UserSimilarity userSimilarity, - DataModel dataModel) throws TasteException { - this(n, minSimilarity, userSimilarity, dataModel, 1.0); - } - - /** - * @param n neighborhood size; capped at the number of users in the data model - * @param minSimilarity minimal similarity required for neighbors - * @param samplingRate percentage of users to consider when building neighborhood -- decrease to trade quality for - * performance - * @throws IllegalArgumentException - * if {@code n < 1} or samplingRate is NaN or not in (0,1], or userSimilarity or dataModel are - * {@code null} - */ - public NearestNUserNeighborhood(int n, - double minSimilarity, - UserSimilarity userSimilarity, - DataModel dataModel, - double samplingRate) throws TasteException { - super(userSimilarity, dataModel, samplingRate); - Preconditions.checkArgument(n >= 1, "n must be at least 1"); - int numUsers = dataModel.getNumUsers(); - this.n = n > numUsers ? numUsers : n; - this.minSimilarity = minSimilarity; - } - - @Override - public long[] getUserNeighborhood(long userID) throws TasteException { - - DataModel dataModel = getDataModel(); - UserSimilarity userSimilarityImpl = getUserSimilarity(); - - TopItems.Estimator estimator = new Estimator(userSimilarityImpl, userID, minSimilarity); - - LongPrimitiveIterator userIDs = SamplingLongPrimitiveIterator.maybeWrapIterator(dataModel.getUserIDs(), - getSamplingRate()); - - return TopItems.getTopUsers(n, userIDs, null, estimator); - } - - @Override - public String toString() { - return "NearestNUserNeighborhood"; - } - - private static final class Estimator implements TopItems.Estimator { - private final UserSimilarity userSimilarityImpl; - private final long theUserID; - private final double minSim; - - private Estimator(UserSimilarity userSimilarityImpl, long theUserID, double minSim) { - this.userSimilarityImpl = userSimilarityImpl; - this.theUserID = theUserID; - this.minSim = minSim; - } - - @Override - public double estimate(Long userID) throws TasteException { - if (userID == theUserID) { - return Double.NaN; - } - double sim = userSimilarityImpl.userSimilarity(theUserID, userID); - return sim >= minSim ? 
sim : Double.NaN; - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java deleted file mode 100644 index d5246e41b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.neighborhood; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.similarity.UserSimilarity; - -import com.google.common.base.Preconditions; - -/** - *
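The knobs on the constructor just deleted are worth a quick illustration. A fragment reusing the model and similarity from the sketch above, with arbitrary parameter values; GenericUserBasedRecommender lives elsewhere in this package.

    // 10 neighbors, ignore similarities below 0.1, and sample only half of all users.
    UserNeighborhood neighborhood =
        new NearestNUserNeighborhood(10, 0.1, similarity, model, 0.5);
    // A neighborhood is typically fed straight into a user-based recommender:
    Recommender recommender = new GenericUserBasedRecommender(model, neighborhood, similarity);
    List<RecommendedItem> top3 = recommender.recommend(42L, 3); // best three items for user 42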

- * Computes a neighborhood consisting of all users whose similarity to the given user meets or exceeds a - * certain threshold. Similarity is defined by the given {@link UserSimilarity}. - *

- */ -public final class ThresholdUserNeighborhood extends AbstractUserNeighborhood { - - private final double threshold; - - /** - * @param threshold - * similarity threshold - * @param userSimilarity - * similarity metric - * @param dataModel - * data model - * @throws IllegalArgumentException - * if threshold is {@link Double#NaN}, or if samplingRate is not positive and less than or equal - * to 1.0, or if userSimilarity or dataModel are {@code null} - */ - public ThresholdUserNeighborhood(double threshold, UserSimilarity userSimilarity, DataModel dataModel) { - this(threshold, userSimilarity, dataModel, 1.0); - } - - /** - * @param threshold - * similarity threshold - * @param userSimilarity - * similarity metric - * @param dataModel - * data model - * @param samplingRate - * percentage of users to consider when building neighborhood -- decrease to trade quality for - * performance - * @throws IllegalArgumentException - * if threshold or samplingRate is {@link Double#NaN}, or if samplingRate is not positive and less - * than or equal to 1.0, or if userSimilarity or dataModel are {@code null} - */ - public ThresholdUserNeighborhood(double threshold, - UserSimilarity userSimilarity, - DataModel dataModel, - double samplingRate) { - super(userSimilarity, dataModel, samplingRate); - Preconditions.checkArgument(!Double.isNaN(threshold), "threshold must not be NaN"); - this.threshold = threshold; - } - - @Override - public long[] getUserNeighborhood(long userID) throws TasteException { - - DataModel dataModel = getDataModel(); - FastIDSet neighborhood = new FastIDSet(); - LongPrimitiveIterator usersIterable = SamplingLongPrimitiveIterator.maybeWrapIterator(dataModel - .getUserIDs(), getSamplingRate()); - UserSimilarity userSimilarityImpl = getUserSimilarity(); - - while (usersIterable.hasNext()) { - long otherUserID = usersIterable.next(); - if (userID != otherUserID) { - double theSimilarity = userSimilarityImpl.userSimilarity(userID, otherUserID); - if (!Double.isNaN(theSimilarity) && theSimilarity >= threshold) { - neighborhood.add(otherUserID); - } - } - } - - return neighborhood.toArray(); - } - - @Override - public String toString() { - return "ThresholdUserNeighborhood"; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java deleted file mode 100644 index 9fc85a79d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
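The threshold variant deleted above is configured the same way; a fragment with an arbitrary 0.7 cutoff, again assuming the model and similarity from the earlier sketch.

    // All users whose similarity to the queried user is at least 0.7...
    UserNeighborhood strict = new ThresholdUserNeighborhood(0.7, similarity, model);
    // ...or the same, examining only a 20% sample of users for speed.
    UserNeighborhood sampled = new ThresholdUserNeighborhood(0.7, similarity, model, 0.2);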
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy; -import org.apache.mahout.cf.taste.recommender.MostSimilarItemsCandidateItemsStrategy; - -/** - * Abstract base implementation for retrieving candidate items to recommend - */ -public abstract class AbstractCandidateItemsStrategy implements CandidateItemsStrategy, - MostSimilarItemsCandidateItemsStrategy { - - @Override - public FastIDSet getCandidateItems(long userID, PreferenceArray preferencesFromUser, DataModel dataModel) - throws TasteException { - return doGetCandidateItems(preferencesFromUser.getIDs(), dataModel); - } - - @Override - public FastIDSet getCandidateItems(long[] itemIDs, DataModel dataModel) throws TasteException { - return doGetCandidateItems(itemIDs, dataModel); - } - - abstract FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel) throws TasteException; -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractRecommender.java deleted file mode 100644 index 443117903..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractRecommender.java +++ /dev/null @@ -1,114 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
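AbstractCandidateItemsStrategy, which ends above, leaves subclasses just one template method. A hypothetical subclass (not part of this diff) that offers every item as a candidate, including those already preferred; note that doGetCandidateItems is package-private, so such a class must sit in the same package.

    package org.apache.mahout.cf.taste.impl.recommender;

    import org.apache.mahout.cf.taste.common.TasteException;
    import org.apache.mahout.cf.taste.impl.common.FastIDSet;
    import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
    import org.apache.mahout.cf.taste.model.DataModel;

    // Hypothetical strategy: every item in the model is a candidate.
    public final class AllItemsCandidateItemsStrategy extends AbstractCandidateItemsStrategy {
      @Override
      FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel) throws TasteException {
        FastIDSet candidates = new FastIDSet(dataModel.getNumItems());
        LongPrimitiveIterator itemIDs = dataModel.getItemIDs();
        while (itemIDs.hasNext()) {
          candidates.add(itemIDs.nextLong());
        }
        return candidates; // unlike AllUnknownItemsCandidateItemsStrategy below, preferred items are kept
      }
    }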
- */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy; -import java.util.List; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.cf.taste.recommender.Recommender; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -public abstract class AbstractRecommender implements Recommender { - - private static final Logger log = LoggerFactory.getLogger(AbstractRecommender.class); - - private final DataModel dataModel; - private final CandidateItemsStrategy candidateItemsStrategy; - - protected AbstractRecommender(DataModel dataModel, CandidateItemsStrategy candidateItemsStrategy) { - this.dataModel = Preconditions.checkNotNull(dataModel); - this.candidateItemsStrategy = Preconditions.checkNotNull(candidateItemsStrategy); - } - - protected AbstractRecommender(DataModel dataModel) { - this(dataModel, getDefaultCandidateItemsStrategy()); - } - - protected static CandidateItemsStrategy getDefaultCandidateItemsStrategy() { - return new PreferredItemsNeighborhoodCandidateItemsStrategy(); - } - - /** - *

- * Default implementation which just calls - * {@link Recommender#recommend(long, int, org.apache.mahout.cf.taste.recommender.IDRescorer)} with a - * null {@link org.apache.mahout.cf.taste.recommender.IDRescorer}, so no rescoring is applied. - *

- */ - @Override - public List recommend(long userID, int howMany) throws TasteException { - return recommend(userID, howMany, null); - } - - /** - *

- * Default implementation which just calls {@link DataModel#setPreference(long, long, float)}. - *

- * - * @throws IllegalArgumentException - * if value is {@link Float#NaN} - */ - @Override - public void setPreference(long userID, long itemID, float value) throws TasteException { - Preconditions.checkArgument(!Float.isNaN(value), "NaN value"); - log.debug("Setting preference for user {}, item {}", userID, itemID); - dataModel.setPreference(userID, itemID, value); - } - - /** - *

- * Default implementation which just calls {@link DataModel#removePreference(long, long)}. - *

- * - * @throws IllegalArgumentException - * if userID or itemID is {@code null} - */ - @Override - public void removePreference(long userID, long itemID) throws TasteException { - log.debug("Remove preference for user '{}', item '{}'", userID, itemID); - dataModel.removePreference(userID, itemID); - } - - @Override - public DataModel getDataModel() { - return dataModel; - } - - /** - * @param userID - * ID of user being evaluated - * @param preferencesFromUser - * the preferences from the user - * @return all items in the {@link DataModel} for which the user has not expressed a preference and could - * possibly be recommended to the user - * @throws TasteException - * if an error occurs while listing items - */ - protected FastIDSet getAllOtherItems(long userID, PreferenceArray preferencesFromUser) throws TasteException { - return candidateItemsStrategy.getCandidateItems(userID, preferencesFromUser, dataModel); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllSimilarItemsCandidateItemsStrategy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllSimilarItemsCandidateItemsStrategy.java deleted file mode 100644 index d1bb91b16..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllSimilarItemsCandidateItemsStrategy.java +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import com.google.common.base.Preconditions; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.similarity.ItemSimilarity; - -/** - * returns the result of {@link ItemSimilarity#allSimilarItemIDs(long)} as candidate items - */ -public class AllSimilarItemsCandidateItemsStrategy extends AbstractCandidateItemsStrategy { - - private final ItemSimilarity similarity; - - public AllSimilarItemsCandidateItemsStrategy(ItemSimilarity similarity) { - Preconditions.checkArgument(similarity != null, "similarity is null"); - this.similarity = similarity; - } - - @Override - FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel) throws TasteException { - FastIDSet candidateItemIDs = new FastIDSet(); - for (long itemID : preferredItemIDs) { - candidateItemIDs.addAll(similarity.allSimilarItemIDs(itemID)); - } - candidateItemIDs.removeAll(preferredItemIDs); - return candidateItemIDs; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllUnknownItemsCandidateItemsStrategy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllUnknownItemsCandidateItemsStrategy.java deleted file mode 100644 index d7cfa2252..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllUnknownItemsCandidateItemsStrategy.java +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.model.DataModel; - -public final class AllUnknownItemsCandidateItemsStrategy extends AbstractCandidateItemsStrategy { - - /** - * return all items the user has not yet seen - */ - @Override - protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel) throws TasteException { - FastIDSet possibleItemIDs = new FastIDSet(dataModel.getNumItems()); - LongPrimitiveIterator allItemIDs = dataModel.getItemIDs(); - while (allItemIDs.hasNext()) { - possibleItemIDs.add(allItemIDs.nextLong()); - } - possibleItemIDs.removeAll(preferredItemIDs); - return possibleItemIDs; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByRescoreComparator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByRescoreComparator.java deleted file mode 100644 index 1677ea8e7..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByRescoreComparator.java +++ /dev/null @@ -1,65 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.io.Serializable; -import java.util.Comparator; - -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; - -/** - *
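Both concrete strategies above are meant to be handed to an item-based recommender's constructor. A fragment, assuming a DataModel 'model' and an ItemSimilarity 'itemSimilarity' are already built; GenericItemBasedRecommender's four-argument constructor appears later in this diff.

    CandidateItemsStrategy candidates = new AllUnknownItemsCandidateItemsStrategy();
    MostSimilarItemsCandidateItemsStrategy mostSimilar =
        new AllSimilarItemsCandidateItemsStrategy(itemSimilarity);
    ItemBasedRecommender recommender =
        new GenericItemBasedRecommender(model, itemSimilarity, candidates, mostSimilar);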

- * Defines ordering on {@link RecommendedItem} by the rescored value of the recommendations' estimated - * preference value, from high to low. - *

- */ -final class ByRescoreComparator implements Comparator, Serializable { - - private final IDRescorer rescorer; - - ByRescoreComparator(IDRescorer rescorer) { - this.rescorer = rescorer; - } - - @Override - public int compare(RecommendedItem o1, RecommendedItem o2) { - double rescored1; - double rescored2; - if (rescorer == null) { - rescored1 = o1.getValue(); - rescored2 = o2.getValue(); - } else { - rescored1 = rescorer.rescore(o1.getItemID(), o1.getValue()); - rescored2 = rescorer.rescore(o2.getItemID(), o2.getValue()); - } - if (rescored1 < rescored2) { - return 1; - } else if (rescored1 > rescored2) { - return -1; - } else { - return 0; - } - } - - @Override - public String toString() { - return "ByRescoreComparator[rescorer:" + rescorer + ']'; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByValueRecommendedItemComparator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByValueRecommendedItemComparator.java deleted file mode 100644 index 57c5f3de6..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByValueRecommendedItemComparator.java +++ /dev/null @@ -1,43 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.io.Serializable; -import java.util.Comparator; - -import org.apache.mahout.cf.taste.recommender.RecommendedItem; - -/** - * Defines a natural ordering from most-preferred item (highest value) to least-preferred. - */ -public final class ByValueRecommendedItemComparator implements Comparator, Serializable { - - private static final Comparator INSTANCE = new ByValueRecommendedItemComparator(); - - public static Comparator getInstance() { - return INSTANCE; - } - - @Override - public int compare(RecommendedItem o1, RecommendedItem o2) { - float value1 = o1.getValue(); - float value2 = o2.getValue(); - return value1 > value2 ? -1 : value1 < value2 ? 
1 : 0; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/CachingRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/CachingRecommender.java deleted file mode 100644 index 2f3aef724..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/CachingRecommender.java +++ /dev/null @@ -1,231 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.Callable; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.Cache; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.impl.common.Retriever; -import org.apache.mahout.cf.taste.impl.model.PlusAnonymousUserDataModel; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.cf.taste.recommender.Recommender; -import org.apache.mahout.common.LongPair; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *
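ByValueRecommendedItemComparator, which closes above, is handy outside the framework too. A small sketch; GenericRecommendedItem is the implementation deleted at the end of this diff, assuming its (long itemID, float value) constructor, and the IDs and values are made up.

    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    import org.apache.mahout.cf.taste.impl.recommender.ByValueRecommendedItemComparator;
    import org.apache.mahout.cf.taste.impl.recommender.GenericRecommendedItem;
    import org.apache.mahout.cf.taste.recommender.RecommendedItem;

    public class SortRecommendationsSketch {
      public static void main(String[] args) {
        List<RecommendedItem> items = Arrays.asList(
            new GenericRecommendedItem(1L, 2.5f),
            new GenericRecommendedItem(2L, 4.5f),
            new GenericRecommendedItem(3L, 3.0f));
        Collections.sort(items, ByValueRecommendedItemComparator.getInstance());
        // items is now ordered 2 (4.5), 3 (3.0), 1 (2.5): most-preferred first
      }
    }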

- * A {@link Recommender} which caches the results from another {@link Recommender} in memory. - *

- */ -public final class CachingRecommender implements Recommender { - - private static final Logger log = LoggerFactory.getLogger(CachingRecommender.class); - - private final Recommender recommender; - private final int[] maxHowMany; - private final Retriever recommendationsRetriever; - private final Cache recommendationCache; - private final Cache estimatedPrefCache; - private final RefreshHelper refreshHelper; - private IDRescorer currentRescorer; - - public CachingRecommender(Recommender recommender) throws TasteException { - Preconditions.checkArgument(recommender != null, "recommender is null"); - this.recommender = recommender; - maxHowMany = new int[]{1}; - // Use "num users" as an upper limit on cache size. Rough guess. - int numUsers = recommender.getDataModel().getNumUsers(); - recommendationsRetriever = new RecommendationRetriever(); - recommendationCache = new Cache(recommendationsRetriever, numUsers); - estimatedPrefCache = new Cache(new EstimatedPrefRetriever(), numUsers); - refreshHelper = new RefreshHelper(new Callable() { - @Override - public Object call() { - clear(); - return null; - } - }); - refreshHelper.addDependency(recommender); - } - - private void setCurrentRescorer(IDRescorer rescorer) { - if (rescorer == null) { - if (currentRescorer != null) { - currentRescorer = null; - clear(); - } - } else { - if (!rescorer.equals(currentRescorer)) { - currentRescorer = rescorer; - clear(); - } - } - } - - @Override - public List recommend(long userID, int howMany) throws TasteException { - return recommend(userID, howMany, null); - } - - @Override - public List recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { - Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1"); - synchronized (maxHowMany) { - if (howMany > maxHowMany[0]) { - maxHowMany[0] = howMany; - } - } - - // Special case, avoid caching an anonymous user - if (userID == PlusAnonymousUserDataModel.TEMP_USER_ID) { - return recommendationsRetriever.get(PlusAnonymousUserDataModel.TEMP_USER_ID).getItems(); - } - - setCurrentRescorer(rescorer); - - Recommendations recommendations = recommendationCache.get(userID); - if (recommendations.getItems().size() < howMany && !recommendations.isNoMoreRecommendableItems()) { - clear(userID); - recommendations = recommendationCache.get(userID); - if (recommendations.getItems().size() < howMany) { - recommendations.setNoMoreRecommendableItems(true); - } - } - - List recommendedItems = recommendations.getItems(); - return recommendedItems.size() > howMany ? recommendedItems.subList(0, howMany) : recommendedItems; - } - - @Override - public float estimatePreference(long userID, long itemID) throws TasteException { - return estimatedPrefCache.get(new LongPair(userID, itemID)); - } - - @Override - public void setPreference(long userID, long itemID, float value) throws TasteException { - recommender.setPreference(userID, itemID, value); - clear(userID); - } - - @Override - public void removePreference(long userID, long itemID) throws TasteException { - recommender.removePreference(userID, itemID); - clear(userID); - } - - @Override - public DataModel getDataModel() { - return recommender.getDataModel(); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - refreshHelper.refresh(alreadyRefreshed); - } - - /** - *

- * Clears cached recommendations for the given user. - *

- * - * @param userID - * clear cached data associated with this user ID - */ - public void clear(final long userID) { - log.debug("Clearing recommendations for user ID '{}'", userID); - recommendationCache.remove(userID); - estimatedPrefCache.removeKeysMatching(new Cache.MatchPredicate() { - @Override - public boolean matches(LongPair userItemPair) { - return userItemPair.getFirst() == userID; - } - }); - } - - /** - *

- * Clears all cached recommendations. - *

- */ - public void clear() { - log.debug("Clearing all recommendations..."); - recommendationCache.clear(); - estimatedPrefCache.clear(); - } - - @Override - public String toString() { - return "CachingRecommender[recommender:" + recommender + ']'; - } - - private final class RecommendationRetriever implements Retriever { - @Override - public Recommendations get(Long key) throws TasteException { - log.debug("Retrieving new recommendations for user ID '{}'", key); - int howMany = maxHowMany[0]; - IDRescorer rescorer = currentRescorer; - List recommendations = - rescorer == null ? recommender.recommend(key, howMany) : recommender.recommend(key, howMany, rescorer); - return new Recommendations(Collections.unmodifiableList(recommendations)); - } - } - - private final class EstimatedPrefRetriever implements Retriever { - @Override - public Float get(LongPair key) throws TasteException { - long userID = key.getFirst(); - long itemID = key.getSecond(); - log.debug("Retrieving estimated preference for user ID '{}' and item ID '{}'", userID, itemID); - return recommender.estimatePreference(userID, itemID); - } - } - - private static final class Recommendations { - - private final List items; - private boolean noMoreRecommendableItems; - - private Recommendations(List items) { - this.items = items; - } - - List getItems() { - return items; - } - - boolean isNoMoreRecommendableItems() { - return noMoreRecommendableItems; - } - - void setNoMoreRecommendableItems(boolean noMoreRecommendableItems) { - this.noMoreRecommendableItems = noMoreRecommendableItems; - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ClusterSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ClusterSimilarity.java deleted file mode 100644 index d8fb89ced..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ClusterSimilarity.java +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; - -/** - *
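CachingRecommender ends above; wiring it up is one line, and the clear methods matter when the underlying data changes. A fragment, assuming 'delegate' is any already-built Recommender.

    CachingRecommender caching = new CachingRecommender(delegate);
    List<RecommendedItem> recs = caching.recommend(42L, 5); // computed and cached
    recs = caching.recommend(42L, 5);                       // served from the cache
    caching.setPreference(42L, 7L, 4.0f);                   // also evicts user 42's cache entries
    caching.clear();                                        // or drop everything at once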

- * Returns the "similarity" between two clusters of users, according to some definition of similarity. - * Subclassses define different notions of similarity. - *

- * - * @see TreeClusteringRecommender - */ -public interface ClusterSimilarity extends Refreshable { - - /** - * @param cluster1 - * first cluster of user IDs - * @param cluster2 - * second cluster of user IDs - * @return "distance" between clusters; a bigger value means less similarity - * @throws TasteException - * if an error occurs while computing similarity, such as errors accessing an underlying - * {@link org.apache.mahout.cf.taste.model.DataModel} - * @throws IllegalArgumentException - * if either argument is null or empty - */ - double getSimilarity(FastIDSet cluster1, FastIDSet cluster2) throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/EstimatedPreferenceCapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/EstimatedPreferenceCapper.java deleted file mode 100644 index f0f389f7f..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/EstimatedPreferenceCapper.java +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import org.apache.mahout.cf.taste.model.DataModel; - -/** - * Simple class which encapsulates restricting a preference value - * to a predefined range. The simple logic is wrapped up here for - * performance reasons. 
- */ -public final class EstimatedPreferenceCapper { - - private final float min; - private final float max; - - public EstimatedPreferenceCapper(DataModel model) { - min = model.getMinPreference(); - max = model.getMaxPreference(); - } - - public float capEstimate(float estimate) { - if (estimate > max) { - estimate = max; - } else if (estimate < min) { - estimate = min; - } - return estimate; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/FarthestNeighborClusterSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/FarthestNeighborClusterSimilarity.java deleted file mode 100644 index 092a12e43..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/FarthestNeighborClusterSimilarity.java +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.util.Collection; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator; -import org.apache.mahout.cf.taste.similarity.UserSimilarity; - -import com.google.common.base.Preconditions; - -/** - *
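EstimatedPreferenceCapper, just closed, is pure clamping. With a model whose preference range is, say, [1.0, 5.0] (illustrative numbers):

    EstimatedPreferenceCapper capper = new EstimatedPreferenceCapper(model);
    capper.capEstimate(7.3f); // -> 5.0f, clamped to the model's max
    capper.capEstimate(0.2f); // -> 1.0f, clamped to the model's min
    capper.capEstimate(3.3f); // -> 3.3f, already in range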

- * Defines cluster similarity as the smallest similarity between any two users in the clusters -- - * that is, it says that clusters are close when all pairs of their members have relatively high - * similarity. - *

- */ -public final class FarthestNeighborClusterSimilarity implements ClusterSimilarity { - - private final UserSimilarity similarity; - private final double samplingRate; - - /** - *

- * Constructs a FarthestNeighborClusterSimilarity based on the given {@link UserSimilarity}. All - * user-user similarities are examined. - *

- */ - public FarthestNeighborClusterSimilarity(UserSimilarity similarity) { - this(similarity, 1.0); - } - - /** - *

- * Constructs a FarthestNeighborClusterSimilarity based on the given {@link UserSimilarity}. By - * setting {@code samplingRate} to a value less than 1.0, this implementation will only examine that - * fraction of all user-user similarities between two clusters, increasing performance at the expense of - * accuracy. - *

- */ - public FarthestNeighborClusterSimilarity(UserSimilarity similarity, double samplingRate) { - Preconditions.checkArgument(similarity != null, "similarity is null"); - Preconditions.checkArgument(!Double.isNaN(samplingRate) && samplingRate > 0.0 && samplingRate <= 1.0, - "samplingRate is invalid: %.4f", samplingRate); - this.similarity = similarity; - this.samplingRate = samplingRate; - } - - @Override - public double getSimilarity(FastIDSet cluster1, FastIDSet cluster2) throws TasteException { - if (cluster1.isEmpty() || cluster2.isEmpty()) { - return Double.NaN; - } - double leastSimilarity = Double.POSITIVE_INFINITY; - LongPrimitiveIterator someUsers = SamplingLongPrimitiveIterator.maybeWrapIterator(cluster1.iterator(), - samplingRate); - while (someUsers.hasNext()) { - long userID1 = someUsers.next(); - LongPrimitiveIterator it2 = cluster2.iterator(); - while (it2.hasNext()) { - double theSimilarity = similarity.userSimilarity(userID1, it2.nextLong()); - if (theSimilarity < leastSimilarity) { - leastSimilarity = theSimilarity; - } - } - } - // We skipped everything? well, at least try comparing the first Users to get some value - if (leastSimilarity == Double.POSITIVE_INFINITY) { - return similarity.userSimilarity(cluster1.iterator().next(), cluster2.iterator().next()); - } - return leastSimilarity; - } - - @Override - public void refresh(Collection alreadyRefreshed) { - alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); - RefreshHelper.maybeRefresh(alreadyRefreshed, similarity); - } - - @Override - public String toString() { - return "FarthestNeighborClusterSimilarity[similarity:" + similarity + ']'; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefItemBasedRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefItemBasedRecommender.java deleted file mode 100644 index 40e21a36f..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefItemBasedRecommender.java +++ /dev/null @@ -1,71 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
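FarthestNeighborClusterSimilarity, complete above, plugs into the clustering recommender named by ClusterSimilarity's @see tag. A sketch assuming the model and similarity from earlier, and assuming TreeClusteringRecommender's (DataModel, ClusterSimilarity, numClusters) constructor from elsewhere in this package.

    ClusterSimilarity clusterSimilarity =
        new FarthestNeighborClusterSimilarity(similarity, 0.3); // sample 30% of user pairs
    Recommender recommender = new TreeClusteringRecommender(model, clusterSimilarity, 10);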
- */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy; -import org.apache.mahout.cf.taste.recommender.MostSimilarItemsCandidateItemsStrategy; -import org.apache.mahout.cf.taste.similarity.ItemSimilarity; - -/** - * A variant on {@link GenericItemBasedRecommender} which is appropriate for use when no notion of preference - * value exists in the data. - * - * @see org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefUserBasedRecommender - */ -public final class GenericBooleanPrefItemBasedRecommender extends GenericItemBasedRecommender { - - public GenericBooleanPrefItemBasedRecommender(DataModel dataModel, ItemSimilarity similarity) { - super(dataModel, similarity); - } - - public GenericBooleanPrefItemBasedRecommender(DataModel dataModel, ItemSimilarity similarity, - CandidateItemsStrategy candidateItemsStrategy, MostSimilarItemsCandidateItemsStrategy - mostSimilarItemsCandidateItemsStrategy) { - super(dataModel, similarity, candidateItemsStrategy, mostSimilarItemsCandidateItemsStrategy); - } - - /** - * This computation is in a technical sense, wrong, since in the domain of "boolean preference users" where - * all preference values are 1, this method should only ever return 1.0 or NaN. This isn't terribly useful - * however since it means results can't be ranked by preference value (all are 1). So instead this returns a - * sum of similarities. - */ - @Override - protected float doEstimatePreference(long userID, PreferenceArray preferencesFromUser, long itemID) - throws TasteException { - double[] similarities = getSimilarity().itemSimilarities(itemID, preferencesFromUser.getIDs()); - boolean foundAPref = false; - double totalSimilarity = 0.0; - for (double theSimilarity : similarities) { - if (!Double.isNaN(theSimilarity)) { - foundAPref = true; - totalSimilarity += theSimilarity; - } - } - return foundAPref ? (float) totalSimilarity : Float.NaN; - } - - @Override - public String toString() { - return "GenericBooleanPrefItemBasedRecommender"; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefUserBasedRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefUserBasedRecommender.java deleted file mode 100644 index db7c42a61..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefUserBasedRecommender.java +++ /dev/null @@ -1,79 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
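GenericBooleanPrefItemBasedRecommender, which ends above, pairs naturally with a set-based similarity over unary data. A fragment; GenericBooleanPrefDataModel and TanimotoCoefficientSimilarity are assumed from elsewhere in the Mahout tree, not from this diff.

    // Collapse an existing model's ratings to plain user-item associations.
    DataModel booleanModel =
        new GenericBooleanPrefDataModel(GenericBooleanPrefDataModel.toDataMap(model));
    ItemSimilarity sim = new TanimotoCoefficientSimilarity(booleanModel);
    Recommender recommender = new GenericBooleanPrefItemBasedRecommender(booleanModel, sim);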
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; -import org.apache.mahout.cf.taste.similarity.UserSimilarity; - -/** - * A variant on {@link GenericUserBasedRecommender} which is appropriate for use when no notion of preference - * value exists in the data. - */ -public final class GenericBooleanPrefUserBasedRecommender extends GenericUserBasedRecommender { - - public GenericBooleanPrefUserBasedRecommender(DataModel dataModel, - UserNeighborhood neighborhood, - UserSimilarity similarity) { - super(dataModel, neighborhood, similarity); - } - - /** - * This computation is in a technical sense, wrong, since in the domain of "boolean preference users" where - * all preference values are 1, this method should only ever return 1.0 or NaN. This isn't terribly useful - * however since it means results can't be ranked by preference value (all are 1). So instead this returns a - * sum of similarities to any other user in the neighborhood who has also rated the item. - */ - @Override - protected float doEstimatePreference(long theUserID, long[] theNeighborhood, long itemID) throws TasteException { - if (theNeighborhood.length == 0) { - return Float.NaN; - } - DataModel dataModel = getDataModel(); - UserSimilarity similarity = getSimilarity(); - float totalSimilarity = 0.0f; - boolean foundAPref = false; - for (long userID : theNeighborhood) { - // See GenericItemBasedRecommender.doEstimatePreference() too - if (userID != theUserID && dataModel.getPreferenceValue(userID, itemID) != null) { - foundAPref = true; - totalSimilarity += (float) similarity.userSimilarity(theUserID, userID); - } - } - return foundAPref ? 
totalSimilarity : Float.NaN; - } - - @Override - protected FastIDSet getAllOtherItems(long[] theNeighborhood, long theUserID) throws TasteException { - DataModel dataModel = getDataModel(); - FastIDSet possibleItemIDs = new FastIDSet(); - for (long userID : theNeighborhood) { - possibleItemIDs.addAll(dataModel.getItemIDsFromUser(userID)); - } - possibleItemIDs.removeAll(dataModel.getItemIDsFromUser(theUserID)); - return possibleItemIDs; - } - - @Override - public String toString() { - return "GenericBooleanPrefUserBasedRecommender"; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericItemBasedRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericItemBasedRecommender.java deleted file mode 100644 index 2c225a702..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericItemBasedRecommender.java +++ /dev/null @@ -1,375 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.Callable; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender; -import org.apache.mahout.cf.taste.recommender.MostSimilarItemsCandidateItemsStrategy; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.cf.taste.recommender.Rescorer; -import org.apache.mahout.cf.taste.similarity.ItemSimilarity; -import org.apache.mahout.common.LongPair; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *
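To make the user-based boolean estimate above concrete with made-up numbers: if two neighbors with similarities 0.9 and 0.4 to the target user have both expressed a preference for an item, doEstimatePreference returns 0.9 + 0.4 = 1.3, so items touched by more, and more similar, neighbors rank higher even though every raw preference value is 1.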

- * A simple {@link org.apache.mahout.cf.taste.recommender.Recommender} which uses a given - * {@link org.apache.mahout.cf.taste.model.DataModel} and - * {@link org.apache.mahout.cf.taste.similarity.ItemSimilarity} to produce recommendations. This class - * represents Taste's support for item-based recommenders. - *

- * - *

- * The {@link org.apache.mahout.cf.taste.similarity.ItemSimilarity} is the most important point to discuss - * here. Item-based recommenders can be very fast because they base their computations on item similarity, - * not user similarity, and item similarity is relatively static: it can be precomputed instead of being - * re-computed in real time. - *

- * - *

- * Thus it's strongly recommended that you use - * {@link org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity} with pre-computed similarities if - * you're going to use this class. You can use - * {@link org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity} too, which computes - * similarities in real time, but you will probably find this painfully slow for large amounts of data. - *

- */ -public class GenericItemBasedRecommender extends AbstractRecommender implements ItemBasedRecommender { - - private static final Logger log = LoggerFactory.getLogger(GenericItemBasedRecommender.class); - - private final ItemSimilarity similarity; - private final MostSimilarItemsCandidateItemsStrategy mostSimilarItemsCandidateItemsStrategy; - private final RefreshHelper refreshHelper; - private EstimatedPreferenceCapper capper; - - private static final boolean EXCLUDE_ITEM_IF_NOT_SIMILAR_TO_ALL_BY_DEFAULT = true; - - public GenericItemBasedRecommender(DataModel dataModel, - ItemSimilarity similarity, - CandidateItemsStrategy candidateItemsStrategy, - MostSimilarItemsCandidateItemsStrategy mostSimilarItemsCandidateItemsStrategy) { - super(dataModel, candidateItemsStrategy); - Preconditions.checkArgument(similarity != null, "similarity is null"); - this.similarity = similarity; - Preconditions.checkArgument(mostSimilarItemsCandidateItemsStrategy != null, - "mostSimilarItemsCandidateItemsStrategy is null"); - this.mostSimilarItemsCandidateItemsStrategy = mostSimilarItemsCandidateItemsStrategy; - this.refreshHelper = new RefreshHelper(new Callable() { - @Override - public Void call() { - capper = buildCapper(); - return null; - } - }); - refreshHelper.addDependency(dataModel); - refreshHelper.addDependency(similarity); - capper = buildCapper(); - } - - public GenericItemBasedRecommender(DataModel dataModel, ItemSimilarity similarity) { - this(dataModel, - similarity, - AbstractRecommender.getDefaultCandidateItemsStrategy(), - getDefaultMostSimilarItemsCandidateItemsStrategy()); - } - - protected static MostSimilarItemsCandidateItemsStrategy getDefaultMostSimilarItemsCandidateItemsStrategy() { - return new PreferredItemsNeighborhoodCandidateItemsStrategy(); - } - - public ItemSimilarity getSimilarity() { - return similarity; - } - - @Override - public List recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { - Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1"); - log.debug("Recommending items for user ID '{}'", userID); - - PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID); - if (preferencesFromUser.length() == 0) { - return Collections.emptyList(); - } - - FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser); - - TopItems.Estimator estimator = new Estimator(userID, preferencesFromUser); - - List topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer, - estimator); - - log.debug("Recommendations are: {}", topItems); - return topItems; - } - - @Override - public float estimatePreference(long userID, long itemID) throws TasteException { - PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID); - Float actualPref = getPreferenceForItem(preferencesFromUser, itemID); - if (actualPref != null) { - return actualPref; - } - return doEstimatePreference(userID, preferencesFromUser, itemID); - } - - private static Float getPreferenceForItem(PreferenceArray preferencesFromUser, long itemID) { - int size = preferencesFromUser.length(); - for (int i = 0; i < size; i++) { - if (preferencesFromUser.getItemID(i) == itemID) { - return preferencesFromUser.getValue(i); - } - } - return null; - } - - @Override - public List mostSimilarItems(long itemID, int howMany) throws TasteException { - return mostSimilarItems(itemID, howMany, null); - } - - @Override - public List mostSimilarItems(long itemID, int howMany, - Rescorer rescorer) throws TasteException { - 
TopItems.Estimator estimator = new MostSimilarEstimator(itemID, similarity, rescorer); - return doMostSimilarItems(new long[] {itemID}, howMany, estimator); - } - - @Override - public List mostSimilarItems(long[] itemIDs, int howMany) throws TasteException { - TopItems.Estimator estimator = new MultiMostSimilarEstimator(itemIDs, similarity, null, - EXCLUDE_ITEM_IF_NOT_SIMILAR_TO_ALL_BY_DEFAULT); - return doMostSimilarItems(itemIDs, howMany, estimator); - } - - @Override - public List mostSimilarItems(long[] itemIDs, int howMany, - Rescorer rescorer) throws TasteException { - TopItems.Estimator estimator = new MultiMostSimilarEstimator(itemIDs, similarity, rescorer, - EXCLUDE_ITEM_IF_NOT_SIMILAR_TO_ALL_BY_DEFAULT); - return doMostSimilarItems(itemIDs, howMany, estimator); - } - - @Override - public List mostSimilarItems(long[] itemIDs, - int howMany, - boolean excludeItemIfNotSimilarToAll) throws TasteException { - TopItems.Estimator estimator = new MultiMostSimilarEstimator(itemIDs, similarity, null, - excludeItemIfNotSimilarToAll); - return doMostSimilarItems(itemIDs, howMany, estimator); - } - - @Override - public List mostSimilarItems(long[] itemIDs, int howMany, - Rescorer rescorer, - boolean excludeItemIfNotSimilarToAll) throws TasteException { - TopItems.Estimator estimator = new MultiMostSimilarEstimator(itemIDs, similarity, rescorer, - excludeItemIfNotSimilarToAll); - return doMostSimilarItems(itemIDs, howMany, estimator); - } - - @Override - public List recommendedBecause(long userID, long itemID, int howMany) throws TasteException { - Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1"); - - DataModel model = getDataModel(); - TopItems.Estimator estimator = new RecommendedBecauseEstimator(userID, itemID); - - PreferenceArray prefs = model.getPreferencesFromUser(userID); - int size = prefs.length(); - FastIDSet allUserItems = new FastIDSet(size); - for (int i = 0; i < size; i++) { - allUserItems.add(prefs.getItemID(i)); - } - allUserItems.remove(itemID); - - return TopItems.getTopItems(howMany, allUserItems.iterator(), null, estimator); - } - - private List doMostSimilarItems(long[] itemIDs, - int howMany, - TopItems.Estimator estimator) throws TasteException { - FastIDSet possibleItemIDs = mostSimilarItemsCandidateItemsStrategy.getCandidateItems(itemIDs, getDataModel()); - return TopItems.getTopItems(howMany, possibleItemIDs.iterator(), null, estimator); - } - - protected float doEstimatePreference(long userID, PreferenceArray preferencesFromUser, long itemID) - throws TasteException { - double preference = 0.0; - double totalSimilarity = 0.0; - int count = 0; - double[] similarities = similarity.itemSimilarities(itemID, preferencesFromUser.getIDs()); - for (int i = 0; i < similarities.length; i++) { - double theSimilarity = similarities[i]; - if (!Double.isNaN(theSimilarity)) { - // Weights can be negative! - preference += theSimilarity * preferencesFromUser.getValue(i); - totalSimilarity += theSimilarity; - count++; - } - } - // Throw out the estimate if it was based on no data points, of course, but also if based on - // just one. This is a bit of a band-aid on the 'stock' item-based algorithm for the moment. - // The reason is that in this case the estimate is, simply, the user's rating for one item - // that happened to have a defined similarity. The similarity score doesn't matter, and that - // seems like a bad situation. 
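// Worked example with made-up numbers: two prefs 4.0 and 2.0 on items whose
// similarities to the target item are 0.5 and 0.25 give
// preference = 4.0*0.5 + 2.0*0.25 = 2.5 and totalSimilarity = 0.75,
// so the estimate computed below is 2.5 / 0.75 = 3.33 (before any capping).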
- if (count <= 1) { - return Float.NaN; - } - float estimate = (float) (preference / totalSimilarity); - if (capper != null) { - estimate = capper.capEstimate(estimate); - } - return estimate; - } - - @Override - public void refresh(Collection alreadyRefreshed) { - refreshHelper.refresh(alreadyRefreshed); - } - - @Override - public String toString() { - return "GenericItemBasedRecommender[similarity:" + similarity + ']'; - } - - private EstimatedPreferenceCapper buildCapper() { - DataModel dataModel = getDataModel(); - if (Float.isNaN(dataModel.getMinPreference()) && Float.isNaN(dataModel.getMaxPreference())) { - return null; - } else { - return new EstimatedPreferenceCapper(dataModel); - } - } - - public static class MostSimilarEstimator implements TopItems.Estimator { - - private final long toItemID; - private final ItemSimilarity similarity; - private final Rescorer rescorer; - - public MostSimilarEstimator(long toItemID, ItemSimilarity similarity, Rescorer rescorer) { - this.toItemID = toItemID; - this.similarity = similarity; - this.rescorer = rescorer; - } - - @Override - public double estimate(Long itemID) throws TasteException { - LongPair pair = new LongPair(toItemID, itemID); - if (rescorer != null && rescorer.isFiltered(pair)) { - return Double.NaN; - } - double originalEstimate = similarity.itemSimilarity(toItemID, itemID); - return rescorer == null ? originalEstimate : rescorer.rescore(pair, originalEstimate); - } - } - - private final class Estimator implements TopItems.Estimator { - - private final long userID; - private final PreferenceArray preferencesFromUser; - - private Estimator(long userID, PreferenceArray preferencesFromUser) { - this.userID = userID; - this.preferencesFromUser = preferencesFromUser; - } - - @Override - public double estimate(Long itemID) throws TasteException { - return doEstimatePreference(userID, preferencesFromUser, itemID); - } - } - - private static final class MultiMostSimilarEstimator implements TopItems.Estimator { - - private final long[] toItemIDs; - private final ItemSimilarity similarity; - private final Rescorer rescorer; - private final boolean excludeItemIfNotSimilarToAll; - - private MultiMostSimilarEstimator(long[] toItemIDs, ItemSimilarity similarity, Rescorer rescorer, - boolean excludeItemIfNotSimilarToAll) { - this.toItemIDs = toItemIDs; - this.similarity = similarity; - this.rescorer = rescorer; - this.excludeItemIfNotSimilarToAll = excludeItemIfNotSimilarToAll; - } - - @Override - public double estimate(Long itemID) throws TasteException { - RunningAverage average = new FullRunningAverage(); - double[] similarities = similarity.itemSimilarities(itemID, toItemIDs); - for (int i = 0; i < toItemIDs.length; i++) { - long toItemID = toItemIDs[i]; - LongPair pair = new LongPair(toItemID, itemID); - if (rescorer != null && rescorer.isFiltered(pair)) { - continue; - } - double estimate = similarities[i]; - if (rescorer != null) { - estimate = rescorer.rescore(pair, estimate); - } - if (excludeItemIfNotSimilarToAll || !Double.isNaN(estimate)) { - average.addDatum(estimate); - } - } - double averageEstimate = average.getAverage(); - return averageEstimate == 0 ? 
Double.NaN : averageEstimate; - } - } - - private final class RecommendedBecauseEstimator implements TopItems.Estimator { - - private final long userID; - private final long recommendedItemID; - - private RecommendedBecauseEstimator(long userID, long recommendedItemID) { - this.userID = userID; - this.recommendedItemID = recommendedItemID; - } - - @Override - public double estimate(Long itemID) throws TasteException { - Float pref = getDataModel().getPreferenceValue(userID, itemID); - if (pref == null) { - return Float.NaN; - } - double similarityValue = similarity.itemSimilarity(recommendedItemID, itemID); - return (1.0 + similarityValue) * pref; - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericRecommendedItem.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericRecommendedItem.java deleted file mode 100644 index eb2b83803..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericRecommendedItem.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.io.Serializable; - -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.common.RandomUtils; - -import com.google.common.base.Preconditions; - -/** - *

- * A simple implementation of {@link RecommendedItem}. - *
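- * A minimal usage sketch (illustrative values, not part of the original javadoc):
- *   RecommendedItem item = new GenericRecommendedItem(123L, 4.5f); // item 123, estimate 4.5
- *   new GenericRecommendedItem(123L, Float.NaN);                   // throws IllegalArgumentException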

- */ -public final class GenericRecommendedItem implements RecommendedItem, Serializable { - - private final long itemID; - private final float value; - - /** - * @throws IllegalArgumentException - * if item is null or value is NaN - */ - public GenericRecommendedItem(long itemID, float value) { - Preconditions.checkArgument(!Float.isNaN(value), "value is NaN"); - this.itemID = itemID; - this.value = value; - } - - @Override - public long getItemID() { - return itemID; - } - - @Override - public float getValue() { - return value; - } - - @Override - public String toString() { - return "RecommendedItem[item:" + itemID + ", value:" + value + ']'; - } - - @Override - public int hashCode() { - return (int) itemID ^ RandomUtils.hashFloat(value); - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof GenericRecommendedItem)) { - return false; - } - RecommendedItem other = (RecommendedItem) o; - return itemID == other.getItemID() && value == other.getValue(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericUserBasedRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericUserBasedRecommender.java deleted file mode 100644 index 000b44dad..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericUserBasedRecommender.java +++ /dev/null @@ -1,243 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.Callable; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.cf.taste.recommender.Rescorer; -import org.apache.mahout.cf.taste.recommender.UserBasedRecommender; -import org.apache.mahout.cf.taste.similarity.UserSimilarity; -import org.apache.mahout.common.LongPair; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *

- * A simple {@link org.apache.mahout.cf.taste.recommender.Recommender} - * which uses a given {@link DataModel} and {@link UserNeighborhood} to produce recommendations. - *
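- * A typical assembly, sketched for illustration only (PearsonCorrelationSimilarity and
- * NearestNUserNeighborhood are assumed from the impl.similarity and impl.neighborhood
- * packages; the neighborhood size of 10 is an arbitrary choice):
- *   UserSimilarity sim = new PearsonCorrelationSimilarity(model);
- *   UserNeighborhood hood = new NearestNUserNeighborhood(10, sim, model);
- *   Recommender rec = new GenericUserBasedRecommender(model, hood, sim);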

- */ -public class GenericUserBasedRecommender extends AbstractRecommender implements UserBasedRecommender { - - private static final Logger log = LoggerFactory.getLogger(GenericUserBasedRecommender.class); - - private final UserNeighborhood neighborhood; - private final UserSimilarity similarity; - private final RefreshHelper refreshHelper; - private EstimatedPreferenceCapper capper; - - public GenericUserBasedRecommender(DataModel dataModel, - UserNeighborhood neighborhood, - UserSimilarity similarity) { - super(dataModel); - Preconditions.checkArgument(neighborhood != null, "neighborhood is null"); - this.neighborhood = neighborhood; - this.similarity = similarity; - this.refreshHelper = new RefreshHelper(new Callable() { - @Override - public Void call() { - capper = buildCapper(); - return null; - } - }); - refreshHelper.addDependency(dataModel); - refreshHelper.addDependency(similarity); - refreshHelper.addDependency(neighborhood); - capper = buildCapper(); - } - - public UserSimilarity getSimilarity() { - return similarity; - } - - @Override - public List recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { - Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1"); - - log.debug("Recommending items for user ID '{}'", userID); - - long[] theNeighborhood = neighborhood.getUserNeighborhood(userID); - - if (theNeighborhood.length == 0) { - return Collections.emptyList(); - } - - FastIDSet allItemIDs = getAllOtherItems(theNeighborhood, userID); - - TopItems.Estimator estimator = new Estimator(userID, theNeighborhood); - - List topItems = TopItems - .getTopItems(howMany, allItemIDs.iterator(), rescorer, estimator); - - log.debug("Recommendations are: {}", topItems); - return topItems; - } - - @Override - public float estimatePreference(long userID, long itemID) throws TasteException { - DataModel model = getDataModel(); - Float actualPref = model.getPreferenceValue(userID, itemID); - if (actualPref != null) { - return actualPref; - } - long[] theNeighborhood = neighborhood.getUserNeighborhood(userID); - return doEstimatePreference(userID, theNeighborhood, itemID); - } - - @Override - public long[] mostSimilarUserIDs(long userID, int howMany) throws TasteException { - return mostSimilarUserIDs(userID, howMany, null); - } - - @Override - public long[] mostSimilarUserIDs(long userID, int howMany, Rescorer rescorer) throws TasteException { - TopItems.Estimator estimator = new MostSimilarEstimator(userID, similarity, rescorer); - return doMostSimilarUsers(howMany, estimator); - } - - private long[] doMostSimilarUsers(int howMany, TopItems.Estimator estimator) throws TasteException { - DataModel model = getDataModel(); - return TopItems.getTopUsers(howMany, model.getUserIDs(), null, estimator); - } - - protected float doEstimatePreference(long theUserID, long[] theNeighborhood, long itemID) throws TasteException { - if (theNeighborhood.length == 0) { - return Float.NaN; - } - DataModel dataModel = getDataModel(); - double preference = 0.0; - double totalSimilarity = 0.0; - int count = 0; - for (long userID : theNeighborhood) { - if (userID != theUserID) { - // See GenericItemBasedRecommender.doEstimatePreference() too - Float pref = dataModel.getPreferenceValue(userID, itemID); - if (pref != null) { - double theSimilarity = similarity.userSimilarity(theUserID, userID); - if (!Double.isNaN(theSimilarity)) { - preference += theSimilarity * pref; - totalSimilarity += theSimilarity; - count++; - } - } - } - } - // Throw out the estimate if it was based on 
no data points, of course, but also if based on - // just one. This is a bit of a band-aid on the 'stock' item-based algorithm for the moment. - // The reason is that in this case the estimate is, simply, the user's rating for one item - // that happened to have a defined similarity. The similarity score doesn't matter, and that - // seems like a bad situation. - if (count <= 1) { - return Float.NaN; - } - float estimate = (float) (preference / totalSimilarity); - if (capper != null) { - estimate = capper.capEstimate(estimate); - } - return estimate; - } - - protected FastIDSet getAllOtherItems(long[] theNeighborhood, long theUserID) throws TasteException { - DataModel dataModel = getDataModel(); - FastIDSet possibleItemIDs = new FastIDSet(); - for (long userID : theNeighborhood) { - possibleItemIDs.addAll(dataModel.getItemIDsFromUser(userID)); - } - possibleItemIDs.removeAll(dataModel.getItemIDsFromUser(theUserID)); - return possibleItemIDs; - } - - @Override - public void refresh(Collection alreadyRefreshed) { - refreshHelper.refresh(alreadyRefreshed); - } - - @Override - public String toString() { - return "GenericUserBasedRecommender[neighborhood:" + neighborhood + ']'; - } - - private EstimatedPreferenceCapper buildCapper() { - DataModel dataModel = getDataModel(); - if (Float.isNaN(dataModel.getMinPreference()) && Float.isNaN(dataModel.getMaxPreference())) { - return null; - } else { - return new EstimatedPreferenceCapper(dataModel); - } - } - - private static final class MostSimilarEstimator implements TopItems.Estimator { - - private final long toUserID; - private final UserSimilarity similarity; - private final Rescorer rescorer; - - private MostSimilarEstimator(long toUserID, UserSimilarity similarity, Rescorer rescorer) { - this.toUserID = toUserID; - this.similarity = similarity; - this.rescorer = rescorer; - } - - @Override - public double estimate(Long userID) throws TasteException { - // Don't consider the user itself as a possible most similar user - if (userID == toUserID) { - return Double.NaN; - } - if (rescorer == null) { - return similarity.userSimilarity(toUserID, userID); - } else { - LongPair pair = new LongPair(toUserID, userID); - if (rescorer.isFiltered(pair)) { - return Double.NaN; - } - double originalEstimate = similarity.userSimilarity(toUserID, userID); - return rescorer.rescore(pair, originalEstimate); - } - } - } - - private final class Estimator implements TopItems.Estimator { - - private final long theUserID; - private final long[] theNeighborhood; - - Estimator(long theUserID, long[] theNeighborhood) { - this.theUserID = theUserID; - this.theNeighborhood = theNeighborhood; - } - - @Override - public double estimate(Long itemID) throws TasteException { - return doEstimatePreference(theUserID, theNeighborhood, itemID); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemAverageRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemAverageRecommender.java deleted file mode 100644 index 48d29e269..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemAverageRecommender.java +++ /dev/null @@ -1,198 +0,0 @@ -/** - * Licensed to the 
Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.util.Collection; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; - -import org.apache.mahout.cf.taste.common.NoSuchUserException; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *

- * A simple recommender that always estimates preference for an item to be the average of all known preference - * values for that item. No information about users is taken into account. This implementation is provided for - * experimentation; while simple and fast, it may not produce very good recommendations. - *
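- * Worked example (illustrative numbers): if item Y has been rated 3.0, 4.0 and 5.0, every
- * user's estimated preference for Y is (3.0 + 4.0 + 5.0) / 3 = 4.0.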

- */ -public final class ItemAverageRecommender extends AbstractRecommender { - - private static final Logger log = LoggerFactory.getLogger(ItemAverageRecommender.class); - - private final FastByIDMap itemAverages; - private final ReadWriteLock buildAveragesLock; - private final RefreshHelper refreshHelper; - - public ItemAverageRecommender(DataModel dataModel) throws TasteException { - super(dataModel); - this.itemAverages = new FastByIDMap(); - this.buildAveragesLock = new ReentrantReadWriteLock(); - this.refreshHelper = new RefreshHelper(new Callable() { - @Override - public Object call() throws TasteException { - buildAverageDiffs(); - return null; - } - }); - refreshHelper.addDependency(dataModel); - buildAverageDiffs(); - } - - @Override - public List recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { - Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1"); - log.debug("Recommending items for user ID '{}'", userID); - - PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID); - FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser); - - TopItems.Estimator estimator = new Estimator(); - - List topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer, - estimator); - - log.debug("Recommendations are: {}", topItems); - return topItems; - } - - @Override - public float estimatePreference(long userID, long itemID) throws TasteException { - DataModel dataModel = getDataModel(); - Float actualPref = dataModel.getPreferenceValue(userID, itemID); - if (actualPref != null) { - return actualPref; - } - return doEstimatePreference(itemID); - } - - private float doEstimatePreference(long itemID) { - buildAveragesLock.readLock().lock(); - try { - RunningAverage average = itemAverages.get(itemID); - return average == null ? Float.NaN : (float) average.getAverage(); - } finally { - buildAveragesLock.readLock().unlock(); - } - } - - private void buildAverageDiffs() throws TasteException { - try { - buildAveragesLock.writeLock().lock(); - DataModel dataModel = getDataModel(); - LongPrimitiveIterator it = dataModel.getUserIDs(); - while (it.hasNext()) { - PreferenceArray prefs = dataModel.getPreferencesFromUser(it.nextLong()); - int size = prefs.length(); - for (int i = 0; i < size; i++) { - long itemID = prefs.getItemID(i); - RunningAverage average = itemAverages.get(itemID); - if (average == null) { - average = new FullRunningAverage(); - itemAverages.put(itemID, average); - } - average.addDatum(prefs.getValue(i)); - } - } - } finally { - buildAveragesLock.writeLock().unlock(); - } - } - - @Override - public void setPreference(long userID, long itemID, float value) throws TasteException { - DataModel dataModel = getDataModel(); - double prefDelta; - try { - Float oldPref = dataModel.getPreferenceValue(userID, itemID); - prefDelta = oldPref == null ? 
value : value - oldPref; - } catch (NoSuchUserException nsee) { - prefDelta = value; - } - super.setPreference(userID, itemID, value); - try { - buildAveragesLock.writeLock().lock(); - RunningAverage average = itemAverages.get(itemID); - if (average == null) { - RunningAverage newAverage = new FullRunningAverage(); - newAverage.addDatum(prefDelta); - itemAverages.put(itemID, newAverage); - } else { - average.changeDatum(prefDelta); - } - } finally { - buildAveragesLock.writeLock().unlock(); - } - } - - @Override - public void removePreference(long userID, long itemID) throws TasteException { - DataModel dataModel = getDataModel(); - Float oldPref = dataModel.getPreferenceValue(userID, itemID); - super.removePreference(userID, itemID); - if (oldPref != null) { - try { - buildAveragesLock.writeLock().lock(); - RunningAverage average = itemAverages.get(itemID); - if (average == null) { - throw new IllegalStateException("No preferences exist for item ID: " + itemID); - } else { - average.removeDatum(oldPref); - } - } finally { - buildAveragesLock.writeLock().unlock(); - } - } - } - - @Override - public void refresh(Collection alreadyRefreshed) { - refreshHelper.refresh(alreadyRefreshed); - } - - @Override - public String toString() { - return "ItemAverageRecommender"; - } - - private final class Estimator implements TopItems.Estimator { - - @Override - public double estimate(Long itemID) { - return doEstimatePreference(itemID); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemUserAverageRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemUserAverageRecommender.java deleted file mode 100644 index e1ee3b301..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemUserAverageRecommender.java +++ /dev/null @@ -1,239 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.util.Collection; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; - -import org.apache.mahout.cf.taste.common.NoSuchUserException; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *

- * Like {@link ItemAverageRecommender}, except that estimated preferences are adjusted for the users' average - * preference value. For example, say user X has not rated item Y. Item Y's average preference value is 3.5. - * User X's average preference value is 4.2, and the average over all preference values is 4.0. User X prefers - * items 0.2 higher on average, so, the estimated preference for user X, item Y is 3.5 + 0.2 = 3.7. - *
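- * Restated as a formula (added for clarity, matching doEstimatePreference() below):
- *   estimate(user, item) = itemAverage(item) + (userAverage(user) - overallAverage)
- * i.e. 3.5 + (4.2 - 4.0) = 3.7 in the example above.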

- */ -public final class ItemUserAverageRecommender extends AbstractRecommender { - - private static final Logger log = LoggerFactory.getLogger(ItemUserAverageRecommender.class); - - private final FastByIDMap itemAverages; - private final FastByIDMap userAverages; - private final RunningAverage overallAveragePrefValue; - private final ReadWriteLock buildAveragesLock; - private final RefreshHelper refreshHelper; - - public ItemUserAverageRecommender(DataModel dataModel) throws TasteException { - super(dataModel); - this.itemAverages = new FastByIDMap(); - this.userAverages = new FastByIDMap(); - this.overallAveragePrefValue = new FullRunningAverage(); - this.buildAveragesLock = new ReentrantReadWriteLock(); - this.refreshHelper = new RefreshHelper(new Callable() { - @Override - public Object call() throws TasteException { - buildAverageDiffs(); - return null; - } - }); - refreshHelper.addDependency(dataModel); - buildAverageDiffs(); - } - - @Override - public List recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { - Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1"); - log.debug("Recommending items for user ID '{}'", userID); - - PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID); - FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser); - - TopItems.Estimator estimator = new Estimator(userID); - - List topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer, - estimator); - - log.debug("Recommendations are: {}", topItems); - return topItems; - } - - @Override - public float estimatePreference(long userID, long itemID) throws TasteException { - DataModel dataModel = getDataModel(); - Float actualPref = dataModel.getPreferenceValue(userID, itemID); - if (actualPref != null) { - return actualPref; - } - return doEstimatePreference(userID, itemID); - } - - private float doEstimatePreference(long userID, long itemID) { - buildAveragesLock.readLock().lock(); - try { - RunningAverage itemAverage = itemAverages.get(itemID); - if (itemAverage == null) { - return Float.NaN; - } - RunningAverage userAverage = userAverages.get(userID); - if (userAverage == null) { - return Float.NaN; - } - double userDiff = userAverage.getAverage() - overallAveragePrefValue.getAverage(); - return (float) (itemAverage.getAverage() + userDiff); - } finally { - buildAveragesLock.readLock().unlock(); - } - } - - private void buildAverageDiffs() throws TasteException { - try { - buildAveragesLock.writeLock().lock(); - DataModel dataModel = getDataModel(); - LongPrimitiveIterator it = dataModel.getUserIDs(); - while (it.hasNext()) { - long userID = it.nextLong(); - PreferenceArray prefs = dataModel.getPreferencesFromUser(userID); - int size = prefs.length(); - for (int i = 0; i < size; i++) { - long itemID = prefs.getItemID(i); - float value = prefs.getValue(i); - addDatumAndCreateIfNeeded(itemID, value, itemAverages); - addDatumAndCreateIfNeeded(userID, value, userAverages); - overallAveragePrefValue.addDatum(value); - } - } - } finally { - buildAveragesLock.writeLock().unlock(); - } - } - - private static void addDatumAndCreateIfNeeded(long itemID, float value, FastByIDMap averages) { - RunningAverage itemAverage = averages.get(itemID); - if (itemAverage == null) { - itemAverage = new FullRunningAverage(); - averages.put(itemID, itemAverage); - } - itemAverage.addDatum(value); - } - - @Override - public void setPreference(long userID, long itemID, float value) throws TasteException { - DataModel 
dataModel = getDataModel(); - double prefDelta; - try { - Float oldPref = dataModel.getPreferenceValue(userID, itemID); - prefDelta = oldPref == null ? value : value - oldPref; - } catch (NoSuchUserException nsee) { - prefDelta = value; - } - super.setPreference(userID, itemID, value); - try { - buildAveragesLock.writeLock().lock(); - RunningAverage itemAverage = itemAverages.get(itemID); - if (itemAverage == null) { - RunningAverage newItemAverage = new FullRunningAverage(); - newItemAverage.addDatum(prefDelta); - itemAverages.put(itemID, newItemAverage); - } else { - itemAverage.changeDatum(prefDelta); - } - RunningAverage userAverage = userAverages.get(userID); - if (userAverage == null) { - RunningAverage newUserAverage = new FullRunningAverage(); - newUserAverage.addDatum(prefDelta); - userAverages.put(userID, newUserAverage); - } else { - userAverage.changeDatum(prefDelta); - } - overallAveragePrefValue.changeDatum(prefDelta); - } finally { - buildAveragesLock.writeLock().unlock(); - } - } - - @Override - public void removePreference(long userID, long itemID) throws TasteException { - DataModel dataModel = getDataModel(); - Float oldPref = dataModel.getPreferenceValue(userID, itemID); - super.removePreference(userID, itemID); - if (oldPref != null) { - try { - buildAveragesLock.writeLock().lock(); - RunningAverage itemAverage = itemAverages.get(itemID); - if (itemAverage == null) { - throw new IllegalStateException("No preferences exist for item ID: " + itemID); - } - itemAverage.removeDatum(oldPref); - RunningAverage userAverage = userAverages.get(userID); - if (userAverage == null) { - throw new IllegalStateException("No preferences exist for user ID: " + userID); - } - userAverage.removeDatum(oldPref); - overallAveragePrefValue.removeDatum(oldPref); - } finally { - buildAveragesLock.writeLock().unlock(); - } - } - } - - @Override - public void refresh(Collection alreadyRefreshed) { - refreshHelper.refresh(alreadyRefreshed); - } - - @Override - public String toString() { - return "ItemUserAverageRecommender"; - } - - private final class Estimator implements TopItems.Estimator { - - private final long userID; - - private Estimator(long userID) { - this.userID = userID; - } - - @Override - public double estimate(Long itemID) { - return doEstimatePreference(userID, itemID); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NearestNeighborClusterSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NearestNeighborClusterSimilarity.java deleted file mode 100644 index e406bf7b9..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NearestNeighborClusterSimilarity.java +++ /dev/null @@ -1,105 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements.  See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.util.Collection; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator; -import org.apache.mahout.cf.taste.similarity.UserSimilarity; - -import com.google.common.base.Preconditions; - -/** - *

- * Defines cluster similarity as the largest similarity between any two users in the clusters -- that - * is, it says that clusters are close when some pair of their members has high similarity. - *
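- * Equivalently, as a formula (added for clarity):
- *   similarity(C1, C2) = max over all u in C1, v in C2 of userSimilarity(u, v)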

- */ -public final class NearestNeighborClusterSimilarity implements ClusterSimilarity { - - private final UserSimilarity similarity; - private final double samplingRate; - - /** - *

- * Constructs a NearestNeighborClusterSimilarity based on the given {@link UserSimilarity}. All - * user-user similarities are examined. - *

- */ - public NearestNeighborClusterSimilarity(UserSimilarity similarity) { - this(similarity, 1.0); - } - - /** - *

- * Constructs a NearestNeighborClusterSimilarity based on the given {@link UserSimilarity}. By - * setting {@code samplingRate} to a value less than 1.0, this implementation will only examine that - * fraction of all user-user similarities between two clusters, increasing performance at the expense of - * accuracy. - *
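- * For example (illustrative): a samplingRate of 0.25 examines, in expectation, only a quarter
- * of the first cluster's members when two clusters are compared in getSimilarity() below.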

- */ - public NearestNeighborClusterSimilarity(UserSimilarity similarity, double samplingRate) { - Preconditions.checkArgument(similarity != null, "similarity is null"); - Preconditions.checkArgument(samplingRate > 0.0 && samplingRate <= 1.0, "samplingRate is invalid: %f", samplingRate); - - this.similarity = similarity; - this.samplingRate = samplingRate; - } - - @Override - public double getSimilarity(FastIDSet cluster1, FastIDSet cluster2) throws TasteException { - if (cluster1.isEmpty() || cluster2.isEmpty()) { - return Double.NaN; - } - LongPrimitiveIterator someUsers = SamplingLongPrimitiveIterator.maybeWrapIterator(cluster1.iterator(), - samplingRate); - double greatestSimilarity = Double.NEGATIVE_INFINITY; - while (someUsers.hasNext()) { - long userID1 = someUsers.next(); - LongPrimitiveIterator it2 = cluster2.iterator(); - while (it2.hasNext()) { - double theSimilarity = similarity.userSimilarity(userID1, it2.nextLong()); - if (theSimilarity > greatestSimilarity) { - greatestSimilarity = theSimilarity; - } - } - } - // We skipped everything? well, at least try comparing the first Users to get some value - if (greatestSimilarity == Double.NEGATIVE_INFINITY) { - return similarity.userSimilarity(cluster1.iterator().next(), cluster2.iterator().next()); - } - return greatestSimilarity; - } - - @Override - public void refresh(Collection alreadyRefreshed) { - alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); - RefreshHelper.maybeRefresh(alreadyRefreshed, similarity); - } - - @Override - public String toString() { - return "NearestNeighborClusterSimilarity[similarity:" + similarity + ']'; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NullRescorer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NullRescorer.java deleted file mode 100644 index bc86ec377..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NullRescorer.java +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.Rescorer; -import org.apache.mahout.common.LongPair; - -/** - *

- * A simple {@link Rescorer} which always returns the original score. - *

- */ -public final class NullRescorer implements Rescorer, IDRescorer { - - private static final IDRescorer USER_OR_ITEM_INSTANCE = new NullRescorer(); - private static final Rescorer ITEM_ITEM_PAIR_INSTANCE = new NullRescorer(); - private static final Rescorer USER_USER_PAIR_INSTANCE = new NullRescorer(); - - private NullRescorer() { - } - - public static IDRescorer getItemInstance() { - return USER_OR_ITEM_INSTANCE; - } - - public static IDRescorer getUserInstance() { - return USER_OR_ITEM_INSTANCE; - } - - public static Rescorer getItemItemPairInstance() { - return ITEM_ITEM_PAIR_INSTANCE; - } - - public static Rescorer getUserUserPairInstance() { - return USER_USER_PAIR_INSTANCE; - } - - /** - * @param thing - * to rescore - * @param originalScore - * current score for item - * @return same originalScore as new score, always - */ - @Override - public double rescore(T thing, double originalScore) { - return originalScore; - } - - @Override - public boolean isFiltered(T thing) { - return false; - } - - @Override - public double rescore(long id, double originalScore) { - return originalScore; - } - - @Override - public boolean isFiltered(long id) { - return false; - } - - @Override - public String toString() { - return "NullRescorer"; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/PreferredItemsNeighborhoodCandidateItemsStrategy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/PreferredItemsNeighborhoodCandidateItemsStrategy.java deleted file mode 100644 index a84cbc096..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/PreferredItemsNeighborhoodCandidateItemsStrategy.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; - -public final class PreferredItemsNeighborhoodCandidateItemsStrategy extends AbstractCandidateItemsStrategy { - - /** - * returns all items that have not been rated by the user and that were preferred by another user - * that has preferred at least one item that the current user has preferred too - */ - @Override - protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel) throws TasteException { - FastIDSet possibleItemsIDs = new FastIDSet(); - for (long itemID : preferredItemIDs) { - PreferenceArray itemPreferences = dataModel.getPreferencesForItem(itemID); - int numUsersPreferringItem = itemPreferences.length(); - for (int index = 0; index < numUsersPreferringItem; index++) { - possibleItemsIDs.addAll(dataModel.getItemIDsFromUser(itemPreferences.getUserID(index))); - } - } - possibleItemsIDs.removeAll(preferredItemIDs); - return possibleItemsIDs; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/RandomRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/RandomRecommender.java deleted file mode 100644 index 6f74a6621..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/RandomRecommender.java +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.util.Collection; -import java.util.List; -import java.util.Random; - -import com.google.common.collect.Lists; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.common.RandomUtils; - -/** - * Produces random recommendations and preference estimates. This is likely only useful as a novelty and for - * benchmarking. 
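- * Estimates are drawn uniformly from the range of preference values observed in the DataModel;
- * restating randomPref() below: minPref + random.nextFloat() * (maxPref - minPref).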
- */ -public final class RandomRecommender extends AbstractRecommender { - - private final Random random = RandomUtils.getRandom(); - private final float minPref; - private final float maxPref; - - public RandomRecommender(DataModel dataModel) throws TasteException { - super(dataModel); - float maxPref = Float.NEGATIVE_INFINITY; - float minPref = Float.POSITIVE_INFINITY; - LongPrimitiveIterator userIterator = dataModel.getUserIDs(); - while (userIterator.hasNext()) { - long userID = userIterator.next(); - PreferenceArray prefs = dataModel.getPreferencesFromUser(userID); - for (int i = 0; i < prefs.length(); i++) { - float prefValue = prefs.getValue(i); - if (prefValue < minPref) { - minPref = prefValue; - } - if (prefValue > maxPref) { - maxPref = prefValue; - } - } - } - this.minPref = minPref; - this.maxPref = maxPref; - } - - @Override - public List recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { - DataModel dataModel = getDataModel(); - int numItems = dataModel.getNumItems(); - List result = Lists.newArrayListWithCapacity(howMany); - while (result.size() < howMany) { - LongPrimitiveIterator it = dataModel.getItemIDs(); - it.skip(random.nextInt(numItems)); - long itemID = it.next(); - if (dataModel.getPreferenceValue(userID, itemID) == null) { - result.add(new GenericRecommendedItem(itemID, randomPref())); - } - } - return result; - } - - @Override - public float estimatePreference(long userID, long itemID) { - return randomPref(); - } - - private float randomPref() { - return minPref + random.nextFloat() * (maxPref - minPref); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - getDataModel().refresh(alreadyRefreshed); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java deleted file mode 100644 index ad2d89f47..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import com.google.common.base.Preconditions; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveArrayIterator; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.common.iterator.FixedSizeSamplingIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Iterator; - -/** - *

Returns all items that have not been rated by the user (3) and that were preferred by another user - * (2) that has preferred at least one item (1) that the current user has preferred too.

- * - *

This strategy uses sampling to limit the number of items that are considered, by sampling three different - * things, noted above:

- * - *
    - *
- *   1. The items that the user has preferred
- *   2. The users who also prefer each of those items
- *   3. The items those users also prefer
- * - *

There is a maximum associated with each of these three things; if the number of items or users exceeds - * that max, it is sampled so that the expected number of items or users actually used in that part of the - * computation is equal to the max.

- * - *

Three arguments control these three maxima. Each is a "factor" f, which establishes the max at - * f * log2(n), where n is the number of users or items in the data. For example if factor #2 is 5, - * which controls the number of users sampled per item, then 5 * log2(# users) is the maximum for this - * part of the computation.
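- * Worked example (illustrative numbers): with a factor of 5 and 1,048,576 users, log2(n) = 20,
- * so at most about 5 * 20 = 100 users are sampled per item. (computeMaxFrom() below actually
- * computes factor * (1 + log2(n)), which only shifts this bound slightly.)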

- * - *

Each can be set to not do any limiting with value {@link #NO_LIMIT_FACTOR}.

- */ -public class SamplingCandidateItemsStrategy extends AbstractCandidateItemsStrategy { - - private static final Logger log = LoggerFactory.getLogger(SamplingCandidateItemsStrategy.class); - - /** - * Default factor used if not otherwise specified, for all limits. (30). - */ - public static final int DEFAULT_FACTOR = 30; - /** - * Specify this value as a factor to mean no limit. - */ - public static final int NO_LIMIT_FACTOR = Integer.MAX_VALUE; - private static final int MAX_LIMIT = Integer.MAX_VALUE; - private static final double LOG2 = Math.log(2.0); - - private final int maxItems; - private final int maxUsersPerItem; - private final int maxItemsPerUser; - - /** - * Defaults to using no limit ({@link #NO_LIMIT_FACTOR}) for all factors, except - * {@code candidatesPerUserFactor} which defaults to {@link #DEFAULT_FACTOR}. - * - * @see #SamplingCandidateItemsStrategy(int, int, int, int, int) - */ - public SamplingCandidateItemsStrategy(int numUsers, int numItems) { - this(DEFAULT_FACTOR, DEFAULT_FACTOR, DEFAULT_FACTOR, numUsers, numItems); - } - - /** - * @param itemsFactor factor controlling max items considered for a user - * @param usersPerItemFactor factor controlling max users considered for each of those items - * @param candidatesPerUserFactor factor controlling max candidate items considered from each of those users - * @param numUsers number of users currently in the data - * @param numItems number of items in the data - */ - public SamplingCandidateItemsStrategy(int itemsFactor, - int usersPerItemFactor, - int candidatesPerUserFactor, - int numUsers, - int numItems) { - Preconditions.checkArgument(itemsFactor > 0); - Preconditions.checkArgument(usersPerItemFactor > 0); - Preconditions.checkArgument(candidatesPerUserFactor > 0); - Preconditions.checkArgument(numUsers > 0); - Preconditions.checkArgument(numItems > 0); - maxItems = computeMaxFrom(itemsFactor, numItems); - maxUsersPerItem = computeMaxFrom(usersPerItemFactor, numUsers); - maxItemsPerUser = computeMaxFrom(candidatesPerUserFactor, numItems); - log.debug("maxItems {}, maxUsersPerItem {}, maxItemsPerUser {}", new Object[] {maxItems, maxUsersPerItem, maxItemsPerUser}); - } - - private static int computeMaxFrom(int factor, int numThings) { - if (factor == NO_LIMIT_FACTOR) { - return MAX_LIMIT; - } - long max = (long) (factor * (1.0 + Math.log(numThings) / LOG2)); - return max > MAX_LIMIT ? 
MAX_LIMIT : (int) max; - } - - @Override - protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel) throws TasteException { - FastIDSet possibleItemsIDs = new FastIDSet(); - LongPrimitiveIterator preferredItemIDsIterator = new LongPrimitiveArrayIterator(preferredItemIDs); - if (preferredItemIDs.length > maxItems) { - double samplingRate = (double) maxItems / preferredItemIDs.length; -// log.info("preferredItemIDs.length {}, samplingRate {}", preferredItemIDs.length, samplingRate); - preferredItemIDsIterator = - new SamplingLongPrimitiveIterator(preferredItemIDsIterator, samplingRate); - } - while (preferredItemIDsIterator.hasNext()) { - long itemID = preferredItemIDsIterator.nextLong(); - PreferenceArray prefs = dataModel.getPreferencesForItem(itemID); - int prefsLength = prefs.length(); - if (prefsLength > maxUsersPerItem) { - Iterator sampledPrefs = - new FixedSizeSamplingIterator(maxUsersPerItem, prefs.iterator()); - while (sampledPrefs.hasNext()) { - addSomeOf(possibleItemsIDs, dataModel.getItemIDsFromUser(sampledPrefs.next().getUserID())); - } - } else { - for (int i = 0; i < prefsLength; i++) { - addSomeOf(possibleItemsIDs, dataModel.getItemIDsFromUser(prefs.getUserID(i))); - } - } - } - possibleItemsIDs.removeAll(preferredItemIDs); - return possibleItemsIDs; - } - - private void addSomeOf(FastIDSet possibleItemIDs, FastIDSet itemIDs) { - if (itemIDs.size() > maxItemsPerUser) { - LongPrimitiveIterator it = - new SamplingLongPrimitiveIterator(itemIDs.iterator(), (double) maxItemsPerUser / itemIDs.size()); - while (it.hasNext()) { - possibleItemIDs.add(it.nextLong()); - } - } else { - possibleItemIDs.addAll(itemIDs); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SimilarUser.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SimilarUser.java deleted file mode 100644 index c6d417f49..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SimilarUser.java +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import org.apache.mahout.common.RandomUtils; - -/** Simply encapsulates a user and a similarity value. 
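Instances order from most similar to least similar (see compareTo below), which is what TopItems.getTopUsers relies on when sorting its results.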
*/ -public final class SimilarUser implements Comparable { - - private final long userID; - private final double similarity; - - public SimilarUser(long userID, double similarity) { - this.userID = userID; - this.similarity = similarity; - } - - long getUserID() { - return userID; - } - - double getSimilarity() { - return similarity; - } - - @Override - public int hashCode() { - return (int) userID ^ RandomUtils.hashDouble(similarity); - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof SimilarUser)) { - return false; - } - SimilarUser other = (SimilarUser) o; - return userID == other.getUserID() && similarity == other.getSimilarity(); - } - - @Override - public String toString() { - return "SimilarUser[user:" + userID + ", similarity:" + similarity + ']'; - } - - /** Defines an ordering from most similar to least similar. */ - @Override - public int compareTo(SimilarUser other) { - double otherSimilarity = other.getSimilarity(); - if (similarity > otherSimilarity) { - return -1; - } - if (similarity < otherSimilarity) { - return 1; - } - long otherUserID = other.getUserID(); - if (userID < otherUserID) { - return -1; - } - if (userID > otherUserID) { - return 1; - } - return 0; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TopItems.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TopItems.java deleted file mode 100644 index 1c448c2fb..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TopItems.java +++ /dev/null @@ -1,212 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.PriorityQueue; -import java.util.Queue; - -import com.google.common.collect.Lists; -import org.apache.mahout.cf.taste.common.NoSuchItemException; -import org.apache.mahout.cf.taste.common.NoSuchUserException; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity; -import org.apache.mahout.cf.taste.impl.similarity.GenericUserSimilarity; -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; - -import com.google.common.base.Preconditions; - -/** - *

<p>
- * A simple class that refactors the "find top N things" logic that is used in several places.
- * </p>
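- *
- * <p>
- * For illustration only, an editorial sketch (not part of the original class) of the bounded
- * top-N pattern that the methods below share: a min-heap of size howMany whose head is the
- * weakest retained candidate, evicted whenever a better one arrives.
- * </p>
- *
- * <pre>
- * import java.util.ArrayList;
- * import java.util.Collections;
- * import java.util.List;
- * import java.util.PriorityQueue;
- *
- * final class TopN {
- *   // Returns the howMany largest scores, best first.
- *   static List<Double> top(int howMany, Iterable<Double> scores) {
- *     PriorityQueue<Double> heap = new PriorityQueue<Double>(howMany + 1);
- *     for (double score : scores) {
- *       if (heap.size() < howMany) {
- *         heap.add(score);
- *       } else if (score > heap.peek()) {
- *         heap.add(score);    // admit the better candidate...
- *         heap.poll();        // ...and evict the current weakest
- *       }
- *     }
- *     List<Double> result = new ArrayList<Double>(heap);
- *     Collections.sort(result, Collections.reverseOrder());
- *     return result;
- *   }
- * }
- * </pre>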
- */ -public final class TopItems { - - private static final long[] NO_IDS = new long[0]; - - private TopItems() { } - - public static List getTopItems(int howMany, - LongPrimitiveIterator possibleItemIDs, - IDRescorer rescorer, - Estimator estimator) throws TasteException { - Preconditions.checkArgument(possibleItemIDs != null, "argument is null"); - Preconditions.checkArgument(estimator != null, "argument is null"); - - Queue topItems = new PriorityQueue(howMany + 1, - Collections.reverseOrder(ByValueRecommendedItemComparator.getInstance())); - boolean full = false; - double lowestTopValue = Double.NEGATIVE_INFINITY; - while (possibleItemIDs.hasNext()) { - long itemID = possibleItemIDs.next(); - if (rescorer == null || !rescorer.isFiltered(itemID)) { - double preference; - try { - preference = estimator.estimate(itemID); - } catch (NoSuchItemException nsie) { - continue; - } - double rescoredPref = rescorer == null ? preference : rescorer.rescore(itemID, preference); - if (!Double.isNaN(rescoredPref) && (!full || rescoredPref > lowestTopValue)) { - topItems.add(new GenericRecommendedItem(itemID, (float) rescoredPref)); - if (full) { - topItems.poll(); - } else if (topItems.size() > howMany) { - full = true; - topItems.poll(); - } - lowestTopValue = topItems.peek().getValue(); - } - } - } - int size = topItems.size(); - if (size == 0) { - return Collections.emptyList(); - } - List result = Lists.newArrayListWithCapacity(size); - result.addAll(topItems); - Collections.sort(result, ByValueRecommendedItemComparator.getInstance()); - return result; - } - - public static long[] getTopUsers(int howMany, - LongPrimitiveIterator allUserIDs, - IDRescorer rescorer, - Estimator estimator) throws TasteException { - Queue topUsers = new PriorityQueue(howMany + 1, Collections.reverseOrder()); - boolean full = false; - double lowestTopValue = Double.NEGATIVE_INFINITY; - while (allUserIDs.hasNext()) { - long userID = allUserIDs.next(); - if (rescorer != null && rescorer.isFiltered(userID)) { - continue; - } - double similarity; - try { - similarity = estimator.estimate(userID); - } catch (NoSuchUserException nsue) { - continue; - } - double rescoredSimilarity = rescorer == null ? similarity : rescorer.rescore(userID, similarity); - if (!Double.isNaN(rescoredSimilarity) && (!full || rescoredSimilarity > lowestTopValue)) { - topUsers.add(new SimilarUser(userID, rescoredSimilarity)); - if (full) { - topUsers.poll(); - } else if (topUsers.size() > howMany) { - full = true; - topUsers.poll(); - } - lowestTopValue = topUsers.peek().getSimilarity(); - } - } - int size = topUsers.size(); - if (size == 0) { - return NO_IDS; - } - List sorted = Lists.newArrayListWithCapacity(size); - sorted.addAll(topUsers); - Collections.sort(sorted); - long[] result = new long[size]; - int i = 0; - for (SimilarUser similarUser : sorted) { - result[i++] = similarUser.getUserID(); - } - return result; - } - - /** - *

<p>
- * Thanks to tsmorton for suggesting this functionality and writing part of the code.
- * </p>

- * - * @see GenericItemSimilarity#GenericItemSimilarity(Iterable, int) - * @see GenericItemSimilarity#GenericItemSimilarity(org.apache.mahout.cf.taste.similarity.ItemSimilarity, - * org.apache.mahout.cf.taste.model.DataModel, int) - */ - public static List getTopItemItemSimilarities( - int howMany, Iterator allSimilarities) { - - Queue topSimilarities - = new PriorityQueue(howMany + 1, Collections.reverseOrder()); - boolean full = false; - double lowestTopValue = Double.NEGATIVE_INFINITY; - while (allSimilarities.hasNext()) { - GenericItemSimilarity.ItemItemSimilarity similarity = allSimilarities.next(); - double value = similarity.getValue(); - if (!Double.isNaN(value) && (!full || value > lowestTopValue)) { - topSimilarities.add(similarity); - if (full) { - topSimilarities.poll(); - } else if (topSimilarities.size() > howMany) { - full = true; - topSimilarities.poll(); - } - lowestTopValue = topSimilarities.peek().getValue(); - } - } - int size = topSimilarities.size(); - if (size == 0) { - return Collections.emptyList(); - } - List result = Lists.newArrayListWithCapacity(size); - result.addAll(topSimilarities); - Collections.sort(result); - return result; - } - - public static List getTopUserUserSimilarities( - int howMany, Iterator allSimilarities) { - - Queue topSimilarities - = new PriorityQueue(howMany + 1, Collections.reverseOrder()); - boolean full = false; - double lowestTopValue = Double.NEGATIVE_INFINITY; - while (allSimilarities.hasNext()) { - GenericUserSimilarity.UserUserSimilarity similarity = allSimilarities.next(); - double value = similarity.getValue(); - if (!Double.isNaN(value) && (!full || value > lowestTopValue)) { - topSimilarities.add(similarity); - if (full) { - topSimilarities.poll(); - } else if (topSimilarities.size() > howMany) { - full = true; - topSimilarities.poll(); - } - lowestTopValue = topSimilarities.peek().getValue(); - } - } - int size = topSimilarities.size(); - if (size == 0) { - return Collections.emptyList(); - } - List result = Lists.newArrayListWithCapacity(size); - result.addAll(topSimilarities); - Collections.sort(result); - return result; - } - - public interface Estimator { - double estimate(T thing) throws TasteException; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TreeClusteringRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TreeClusteringRecommender.java deleted file mode 100644 index 6d3e8dc73..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TreeClusteringRecommender.java +++ /dev/null @@ -1,413 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Random; -import java.util.concurrent.Callable; - -import com.google.common.collect.Lists; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.recommender.ClusteringRecommender; -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.RandomUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *

<p>
- * A {@link org.apache.mahout.cf.taste.recommender.Recommender} that clusters users, then determines the
- * clusters' top recommendations. This implementation builds clusters by repeatedly merging clusters until
- * only a certain number remain, meaning that each cluster is sort of a tree of other clusters.
- * </p>
- *
- * <p>
- * This {@link org.apache.mahout.cf.taste.recommender.Recommender} therefore has a few properties to note:
- * </p>
- *
- * <ul>
- * <li>For all users in a cluster, recommendations will be the same</li>
- * <li>{@link #estimatePreference(long, long)} may well return {@link Double#NaN}; it does so when asked to
- * estimate preference for an item for which no preference is expressed in the users in the cluster.</li>
- * </ul>
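- *
- * <p>
- * For illustration only, an editorial sketch (assumed names, not Mahout API) of the bottom-up
- * merge loop this recommender performs: start with one cluster per user, then repeatedly merge
- * the most similar pair until the requested number of clusters remains.
- * </p>
- *
- * <pre>
- * import java.util.List;
- * import java.util.Set;
- *
- * interface Similarity { double between(Set<Long> a, Set<Long> b); }
- *
- * final class Agglomerate {
- *   static void mergeUntil(List<Set<Long>> clusters, int numClusters, Similarity sim) {
- *     while (clusters.size() > numClusters) {
- *       int bestI = -1;
- *       int bestJ = -1;
- *       double best = Double.NEGATIVE_INFINITY;
- *       for (int i = 0; i < clusters.size(); i++) {          // scan all pairs
- *         for (int j = i + 1; j < clusters.size(); j++) {
- *           double s = sim.between(clusters.get(i), clusters.get(j));
- *           if (s > best) { best = s; bestI = i; bestJ = j; }
- *         }
- *       }
- *       if (bestI < 0) { break; }                            // no comparable pair left
- *       clusters.get(bestI).addAll(clusters.remove(bestJ));  // merge closest pair
- *     }
- *   }
- * }
- * </pre>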
- */ -public final class TreeClusteringRecommender extends AbstractRecommender implements ClusteringRecommender { - - private static final Logger log = LoggerFactory.getLogger(TreeClusteringRecommender.class); - - private static final FastIDSet[] NO_CLUSTERS = new FastIDSet[0]; - - private final Random random; - private final ClusterSimilarity clusterSimilarity; - private final int numClusters; - private final double clusteringThreshold; - private final boolean clusteringByThreshold; - private final double samplingRate; - private FastByIDMap> topRecsByUserID; - private FastIDSet[] allClusters; - private FastByIDMap clustersByUserID; - private final RefreshHelper refreshHelper; - - /** - * @param dataModel - * {@link DataModel} which provdes users - * @param clusterSimilarity - * {@link ClusterSimilarity} used to compute cluster similarity - * @param numClusters - * desired number of clusters to create - * @throws IllegalArgumentException - * if arguments are {@code null}, or {@code numClusters} is less than 2 - */ - public TreeClusteringRecommender(DataModel dataModel, ClusterSimilarity clusterSimilarity, int numClusters) - throws TasteException { - this(dataModel, clusterSimilarity, numClusters, 1.0); - } - - /** - * @param dataModel - * {@link DataModel} which provdes users - * @param clusterSimilarity - * {@link ClusterSimilarity} used to compute cluster similarity - * @param numClusters - * desired number of clusters to create - * @param samplingRate - * percentage of all cluster-cluster pairs to consider when finding next-most-similar clusters. - * Decreasing this value from 1.0 can increase performance at the cost of accuracy - * @throws IllegalArgumentException - * if arguments are {@code null}, or {@code numClusters} is less than 2, or samplingRate - * is {@link Double#NaN} or nonpositive or greater than 1.0 - */ - public TreeClusteringRecommender(DataModel dataModel, - ClusterSimilarity clusterSimilarity, - int numClusters, - double samplingRate) throws TasteException { - super(dataModel); - Preconditions.checkArgument(numClusters >= 2, "numClusters must be at least 2"); - Preconditions.checkArgument(samplingRate > 0.0 && samplingRate <= 1.0, - "samplingRate is invalid: %f", samplingRate); - random = RandomUtils.getRandom(); - this.clusterSimilarity = Preconditions.checkNotNull(clusterSimilarity); - this.numClusters = numClusters; - this.clusteringThreshold = Double.NaN; - this.clusteringByThreshold = false; - this.samplingRate = samplingRate; - this.refreshHelper = new RefreshHelper(new Callable() { - @Override - public Object call() throws TasteException { - buildClusters(); - return null; - } - }); - refreshHelper.addDependency(dataModel); - refreshHelper.addDependency(clusterSimilarity); - buildClusters(); - } - - /** - * @param dataModel - * {@link DataModel} which provdes users - * @param clusterSimilarity - * {@link ClusterSimilarity} used to compute cluster similarity - * @param clusteringThreshold - * clustering similarity threshold; clusters will be aggregated into larger clusters until the next - * two nearest clusters' similarity drops below this threshold - * @throws IllegalArgumentException - * if arguments are {@code null}, or {@code clusteringThreshold} is {@link Double#NaN} - */ - public TreeClusteringRecommender(DataModel dataModel, - ClusterSimilarity clusterSimilarity, - double clusteringThreshold) throws TasteException { - this(dataModel, clusterSimilarity, clusteringThreshold, 1.0); - } - - /** - * @param dataModel - * {@link DataModel} which provides users - 
* @param clusterSimilarity - * {@link ClusterSimilarity} used to compute cluster similarity - * @param clusteringThreshold - * clustering similarity threshold; clusters will be aggregated into larger clusters until the next - * two nearest clusters' similarity drops below this threshold - * @param samplingRate - * percentage of all cluster-cluster pairs to consider when finding next-most-similar clusters. - * Decreasing this value from 1.0 can increase performance at the cost of accuracy - * @throws IllegalArgumentException - * if arguments are {@code null}, or {@code clusteringThreshold} is {@link Double#NaN}, - * or samplingRate is {@link Double#NaN} or nonpositive or greater than 1.0 - */ - public TreeClusteringRecommender(DataModel dataModel, - ClusterSimilarity clusterSimilarity, - double clusteringThreshold, - double samplingRate) throws TasteException { - super(dataModel); - Preconditions.checkArgument(!Double.isNaN(clusteringThreshold), "clusteringThreshold must not be NaN"); - Preconditions.checkArgument(samplingRate > 0.0 && samplingRate <= 1.0, "samplingRate is invalid: %f", samplingRate); - random = RandomUtils.getRandom(); - this.clusterSimilarity = Preconditions.checkNotNull(clusterSimilarity); - this.numClusters = Integer.MIN_VALUE; - this.clusteringThreshold = clusteringThreshold; - this.clusteringByThreshold = true; - this.samplingRate = samplingRate; - this.refreshHelper = new RefreshHelper(new Callable() { - @Override - public Object call() throws TasteException { - buildClusters(); - return null; - } - }); - refreshHelper.addDependency(dataModel); - refreshHelper.addDependency(clusterSimilarity); - buildClusters(); - } - - @Override - public List recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { - Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1"); - buildClusters(); - - log.debug("Recommending items for user ID '{}'", userID); - - List recommended = topRecsByUserID.get(userID); - if (recommended == null) { - return Collections.emptyList(); - } - - DataModel dataModel = getDataModel(); - List rescored = Lists.newArrayListWithCapacity(recommended.size()); - // Only add items the user doesn't already have a preference for. - // And that the rescorer doesn't "reject". - for (RecommendedItem recommendedItem : recommended) { - long itemID = recommendedItem.getItemID(); - if (rescorer != null && rescorer.isFiltered(itemID)) { - continue; - } - if (dataModel.getPreferenceValue(userID, itemID) == null - && (rescorer == null || !Double.isNaN(rescorer.rescore(itemID, recommendedItem.getValue())))) { - rescored.add(recommendedItem); - } - } - Collections.sort(rescored, new ByRescoreComparator(rescorer)); - - return rescored; - } - - @Override - public float estimatePreference(long userID, long itemID) throws TasteException { - DataModel model = getDataModel(); - Float actualPref = model.getPreferenceValue(userID, itemID); - if (actualPref != null) { - return actualPref; - } - buildClusters(); - List topRecsForUser = topRecsByUserID.get(userID); - if (topRecsForUser != null) { - for (RecommendedItem item : topRecsForUser) { - if (itemID == item.getItemID()) { - return item.getValue(); - } - } - } - // Hmm, we have no idea. The item is not in the user's cluster - return Float.NaN; - } - - @Override - public FastIDSet getCluster(long userID) throws TasteException { - buildClusters(); - FastIDSet cluster = clustersByUserID.get(userID); - return cluster == null ? 
new FastIDSet() : cluster; - } - - @Override - public FastIDSet[] getClusters() throws TasteException { - buildClusters(); - return allClusters; - } - - private void buildClusters() throws TasteException { - DataModel model = getDataModel(); - int numUsers = model.getNumUsers(); - if (numUsers > 0) { - List newClusters = Lists.newArrayListWithCapacity(numUsers); - // Begin with a cluster for each user: - LongPrimitiveIterator it = model.getUserIDs(); - while (it.hasNext()) { - FastIDSet newCluster = new FastIDSet(); - newCluster.add(it.nextLong()); - newClusters.add(newCluster); - } - if (numUsers > 1) { - findClusters(newClusters); - } - topRecsByUserID = computeTopRecsPerUserID(newClusters); - clustersByUserID = computeClustersPerUserID(newClusters); - allClusters = newClusters.toArray(new FastIDSet[newClusters.size()]); - } else { - topRecsByUserID = new FastByIDMap>(); - clustersByUserID = new FastByIDMap(); - allClusters = NO_CLUSTERS; - } - } - - private void findClusters(List newClusters) throws TasteException { - if (clusteringByThreshold) { - Pair nearestPair = findNearestClusters(newClusters); - if (nearestPair != null) { - FastIDSet cluster1 = nearestPair.getFirst(); - FastIDSet cluster2 = nearestPair.getSecond(); - while (clusterSimilarity.getSimilarity(cluster1, cluster2) >= clusteringThreshold) { - newClusters.remove(cluster1); - newClusters.remove(cluster2); - FastIDSet merged = new FastIDSet(cluster1.size() + cluster2.size()); - merged.addAll(cluster1); - merged.addAll(cluster2); - newClusters.add(merged); - nearestPair = findNearestClusters(newClusters); - if (nearestPair == null) { - break; - } - cluster1 = nearestPair.getFirst(); - cluster2 = nearestPair.getSecond(); - } - } - } else { - while (newClusters.size() > numClusters) { - Pair nearestPair = findNearestClusters(newClusters); - if (nearestPair == null) { - break; - } - FastIDSet cluster1 = nearestPair.getFirst(); - FastIDSet cluster2 = nearestPair.getSecond(); - newClusters.remove(cluster1); - newClusters.remove(cluster2); - FastIDSet merged = new FastIDSet(cluster1.size() + cluster2.size()); - merged.addAll(cluster1); - merged.addAll(cluster2); - newClusters.add(merged); - } - } - } - - private Pair findNearestClusters(List clusters) throws TasteException { - int size = clusters.size(); - Pair nearestPair = null; - double bestSimilarity = Double.NEGATIVE_INFINITY; - for (int i = 0; i < size; i++) { - FastIDSet cluster1 = clusters.get(i); - for (int j = i + 1; j < size; j++) { - if (samplingRate >= 1.0 || random.nextDouble() < samplingRate) { - FastIDSet cluster2 = clusters.get(j); - double similarity = clusterSimilarity.getSimilarity(cluster1, cluster2); - if (!Double.isNaN(similarity) && similarity > bestSimilarity) { - bestSimilarity = similarity; - nearestPair = new Pair(cluster1, cluster2); - } - } - } - } - return nearestPair; - } - - private FastByIDMap> computeTopRecsPerUserID(Iterable clusters) - throws TasteException { - FastByIDMap> recsPerUser = new FastByIDMap>(); - for (FastIDSet cluster : clusters) { - List recs = computeTopRecsForCluster(cluster); - LongPrimitiveIterator it = cluster.iterator(); - while (it.hasNext()) { - recsPerUser.put(it.nextLong(), recs); - } - } - return recsPerUser; - } - - private List computeTopRecsForCluster(FastIDSet cluster) throws TasteException { - DataModel dataModel = getDataModel(); - FastIDSet possibleItemIDs = new FastIDSet(); - LongPrimitiveIterator it = cluster.iterator(); - while (it.hasNext()) { - 
possibleItemIDs.addAll(dataModel.getItemIDsFromUser(it.nextLong())); - } - - TopItems.Estimator estimator = new Estimator(cluster); - - List topItems = - TopItems.getTopItems(possibleItemIDs.size(), possibleItemIDs.iterator(), null, estimator); - - log.debug("Recommendations are: {}", topItems); - return Collections.unmodifiableList(topItems); - } - - private static FastByIDMap computeClustersPerUserID(Collection clusters) { - FastByIDMap clustersPerUser = new FastByIDMap(clusters.size()); - for (FastIDSet cluster : clusters) { - LongPrimitiveIterator it = cluster.iterator(); - while (it.hasNext()) { - clustersPerUser.put(it.nextLong(), cluster); - } - } - return clustersPerUser; - } - - @Override - public void refresh(Collection alreadyRefreshed) { - refreshHelper.refresh(alreadyRefreshed); - } - - @Override - public String toString() { - return "TreeClusteringRecommender[clusterSimilarity:" + clusterSimilarity + ']'; - } - - private final class Estimator implements TopItems.Estimator { - - private final FastIDSet cluster; - - private Estimator(FastIDSet cluster) { - this.cluster = cluster; - } - - @Override - public double estimate(Long itemID) throws TasteException { - DataModel dataModel = getDataModel(); - RunningAverage average = new FullRunningAverage(); - LongPrimitiveIterator it = cluster.iterator(); - while (it.hasNext()) { - Float pref = dataModel.getPreferenceValue(it.nextLong(), itemID); - if (pref != null) { - average.addDatum(pref); - } - } - return average.getAverage(); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TreeClusteringRecommender2.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TreeClusteringRecommender2.java deleted file mode 100644 index 94133efab..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TreeClusteringRecommender2.java +++ /dev/null @@ -1,479 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.recommender; - -import java.util.Collection; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.ListIterator; -import java.util.PriorityQueue; -import java.util.Queue; -import java.util.concurrent.Callable; - -import com.google.common.collect.Lists; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.recommender.ClusteringRecommender; -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.common.RandomUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *

<p>
- * A {@link org.apache.mahout.cf.taste.recommender.Recommender} that clusters users, then determines the
- * clusters' top recommendations. This implementation builds clusters by repeatedly merging clusters until
- * only a certain number remain, meaning that each cluster is sort of a tree of other clusters.
- * </p>
- *
- * <p>
- * This {@link org.apache.mahout.cf.taste.recommender.Recommender} therefore has a few properties to note:
- * </p>
- *
- * <ul>
- * <li>For all users in a cluster, recommendations will be the same</li>
- * <li>{@link #estimatePreference(long, long)} may well return {@link Double#NaN}; it does so when asked to
- * estimate preference for an item for which no preference is expressed in the users in the cluster.</li>
- * </ul>
- *
- * <p>
- * This is an experimental implementation which tries to gain a lot of speed at the cost of accuracy
- * in building clusters, compared to {@link TreeClusteringRecommender}. It will sometimes cluster two other
- * clusters together that may not be the exact closest two clusters in existence. This may not affect the
- * recommendation quality much, but it potentially speeds up the clustering process dramatically.
- * </p>
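- *
- * <p>
- * For illustration only, an editorial sketch (assumed names such as ClusterPair and
- * findClosestClusters, not Mahout API) of the batch-merge shortcut: take the best pairs from a
- * single scan and merge down the list, skipping any pair whose clusters were already consumed
- * this round. Identity, not equality, tracks consumed clusters, since merged sets mutate.
- * </p>
- *
- * <pre>
- * List<ClusterPair> best = findClosestClusters(clusters);   // sorted, most similar first
- * Set<Set<Long>> consumed =
- *     Collections.newSetFromMap(new IdentityHashMap<Set<Long>, Boolean>());
- * for (ClusterPair pair : best) {
- *   if (consumed.contains(pair.first) || consumed.contains(pair.second)) {
- *     continue;                        // stale pair; one side was merged already
- *   }
- *   pair.first.addAll(pair.second);    // merge in place
- *   clusters.remove(pair.second);
- *   consumed.add(pair.first);          // conservatively retire both sides this round
- *   consumed.add(pair.second);
- * }
- * </pre>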

- */ -public final class TreeClusteringRecommender2 extends AbstractRecommender implements ClusteringRecommender { - - private static final Logger log = LoggerFactory.getLogger(TreeClusteringRecommender2.class); - - private static final int NUM_CLUSTER_RECS = 100; - - private final ClusterSimilarity clusterSimilarity; - private final int numClusters; - private final double clusteringThreshold; - private final boolean clusteringByThreshold; - private FastByIDMap> topRecsByUserID; - private FastIDSet[] allClusters; - private FastByIDMap clustersByUserID; - private final RefreshHelper refreshHelper; - - /** - * @param dataModel - * {@link DataModel} which provides users - * @param clusterSimilarity - * {@link ClusterSimilarity} used to compute cluster similarity - * @param numClusters - * desired number of clusters to create - * @throws IllegalArgumentException - * if arguments are {@code null}, or {@code numClusters} is less than 2 - */ - public TreeClusteringRecommender2(DataModel dataModel, ClusterSimilarity clusterSimilarity, int numClusters) - throws TasteException { - super(dataModel); - Preconditions.checkArgument(numClusters >= 2, "numClusters must be at least 2"); - this.clusterSimilarity = Preconditions.checkNotNull(clusterSimilarity); - this.numClusters = numClusters; - this.clusteringThreshold = Double.NaN; - this.clusteringByThreshold = false; - this.refreshHelper = new RefreshHelper(new Callable() { - @Override - public Object call() throws TasteException { - buildClusters(); - return null; - } - }); - refreshHelper.addDependency(dataModel); - refreshHelper.addDependency(clusterSimilarity); - buildClusters(); - } - - /** - * @param dataModel - * {@link DataModel} which provides users - * @param clusterSimilarity - * {@link ClusterSimilarity} used to compute cluster - * similarity - * @param clusteringThreshold - * clustering similarity threshold; clusters will be aggregated into larger clusters until the next - * two nearest clusters' similarity drops below this threshold - * @throws IllegalArgumentException - * if arguments are {@code null}, or {@code clusteringThreshold} is {@link Double#NaN} - */ - public TreeClusteringRecommender2(DataModel dataModel, - ClusterSimilarity clusterSimilarity, - double clusteringThreshold) throws TasteException { - super(dataModel); - Preconditions.checkArgument(!Double.isNaN(clusteringThreshold), "clusteringThreshold must not be NaN"); - this.clusterSimilarity = Preconditions.checkNotNull(clusterSimilarity); - this.numClusters = Integer.MIN_VALUE; - this.clusteringThreshold = clusteringThreshold; - this.clusteringByThreshold = true; - this.refreshHelper = new RefreshHelper(new Callable() { - @Override - public Object call() throws TasteException { - buildClusters(); - return null; - } - }); - refreshHelper.addDependency(dataModel); - refreshHelper.addDependency(clusterSimilarity); - buildClusters(); - } - - @Override - public List recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { - Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1"); - buildClusters(); - - log.debug("Recommending items for user ID '{}'", userID); - - List recommended = topRecsByUserID.get(userID); - if (recommended == null) { - return Collections.emptyList(); - } - - DataModel dataModel = getDataModel(); - List rescored = Lists.newArrayListWithCapacity(recommended.size()); - // Only add items the user doesn't already have a preference for. - // And that the rescorer doesn't "reject". 
- for (RecommendedItem recommendedItem : recommended) { - long itemID = recommendedItem.getItemID(); - if (rescorer != null && rescorer.isFiltered(itemID)) { - continue; - } - if (dataModel.getPreferenceValue(userID, itemID) == null - && (rescorer == null || !Double.isNaN(rescorer.rescore(itemID, recommendedItem.getValue())))) { - rescored.add(recommendedItem); - } - } - Collections.sort(rescored, new ByRescoreComparator(rescorer)); - - return rescored; - } - - @Override - public float estimatePreference(long userID, long itemID) throws TasteException { - Float actualPref = getDataModel().getPreferenceValue(userID, itemID); - if (actualPref != null) { - return actualPref; - } - buildClusters(); - List topRecsForUser = topRecsByUserID.get(userID); - if (topRecsForUser != null) { - for (RecommendedItem item : topRecsForUser) { - if (itemID == item.getItemID()) { - return item.getValue(); - } - } - } - // Hmm, we have no idea. The item is not in the user's cluster - return Float.NaN; - } - - @Override - public FastIDSet getCluster(long userID) throws TasteException { - buildClusters(); - FastIDSet cluster = clustersByUserID.get(userID); - return cluster == null ? new FastIDSet() : cluster; - } - - @Override - public FastIDSet[] getClusters() throws TasteException { - buildClusters(); - return allClusters; - } - - private static final class ClusterClusterPair implements Comparable { - - private final FastIDSet cluster1; - private final FastIDSet cluster2; - private final double similarity; - - private ClusterClusterPair(FastIDSet cluster1, FastIDSet cluster2, double similarity) { - this.cluster1 = cluster1; - this.cluster2 = cluster2; - this.similarity = similarity; - } - - FastIDSet getCluster1() { - return cluster1; - } - - FastIDSet getCluster2() { - return cluster2; - } - - double getSimilarity() { - return similarity; - } - - @Override - public int hashCode() { - return cluster1.hashCode() ^ cluster2.hashCode() ^ RandomUtils.hashDouble(similarity); - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof ClusterClusterPair)) { - return false; - } - ClusterClusterPair other = (ClusterClusterPair) o; - return cluster1.equals(other.getCluster1()) - && cluster2.equals(other.getCluster2()) - && similarity == other.getSimilarity(); - } - - @Override - public int compareTo(ClusterClusterPair other) { - double otherSimilarity = other.getSimilarity(); - if (similarity > otherSimilarity) { - return -1; - } else if (similarity < otherSimilarity) { - return 1; - } else { - return 0; - } - } - - } - - private void buildClusters() throws TasteException { - DataModel model = getDataModel(); - int numUsers = model.getNumUsers(); - - if (numUsers == 0) { - - topRecsByUserID = new FastByIDMap>(); - clustersByUserID = new FastByIDMap(); - - } else { - - List clusters = Lists.newArrayList(); - // Begin with a cluster for each user: - LongPrimitiveIterator it = model.getUserIDs(); - while (it.hasNext()) { - FastIDSet newCluster = new FastIDSet(); - newCluster.add(it.nextLong()); - clusters.add(newCluster); - } - - boolean done = false; - while (!done) { - done = mergeClosestClusters(numUsers, clusters, done); - } - - topRecsByUserID = computeTopRecsPerUserID(clusters); - clustersByUserID = computeClustersPerUserID(clusters); - allClusters = clusters.toArray(new FastIDSet[clusters.size()]); - - } - } - - private boolean mergeClosestClusters(int numUsers, List clusters, boolean done) throws TasteException { - // We find a certain number of closest clusters... 
- List queue = findClosestClusters(numUsers, clusters); - - // The first one is definitely the closest pair in existence so we can cluster - // the two together, put it back into the set of clusters, and start again. Instead - // we assume everything else in our list of closest cluster pairs is still pretty good, - // and we cluster them too. - - while (!queue.isEmpty()) { - - if (!clusteringByThreshold && clusters.size() <= numClusters) { - done = true; - break; - } - - ClusterClusterPair top = queue.remove(0); - - if (clusteringByThreshold && top.getSimilarity() < clusteringThreshold) { - done = true; - break; - } - - FastIDSet cluster1 = top.getCluster1(); - FastIDSet cluster2 = top.getCluster2(); - - // Pull out current two clusters from clusters - Iterator clusterIterator = clusters.iterator(); - boolean removed1 = false; - boolean removed2 = false; - while (clusterIterator.hasNext() && !(removed1 && removed2)) { - FastIDSet current = clusterIterator.next(); - // Yes, use == here - if (!removed1 && cluster1 == current) { - clusterIterator.remove(); - removed1 = true; - } else if (!removed2 && cluster2 == current) { - clusterIterator.remove(); - removed2 = true; - } - } - - // The only catch is if a cluster showed it twice in the list of best cluster pairs; - // have to remove the others. Pull out anything referencing these clusters from queue - for (Iterator queueIterator = queue.iterator(); queueIterator.hasNext();) { - ClusterClusterPair pair = queueIterator.next(); - FastIDSet pair1 = pair.getCluster1(); - FastIDSet pair2 = pair.getCluster2(); - if (pair1 == cluster1 || pair1 == cluster2 || pair2 == cluster1 || pair2 == cluster2) { - queueIterator.remove(); - } - } - - // Make new merged cluster - FastIDSet merged = new FastIDSet(cluster1.size() + cluster2.size()); - merged.addAll(cluster1); - merged.addAll(cluster2); - - // Compare against other clusters; update queue if needed - // That new pair we're just adding might be pretty close to something else, so - // catch that case here and put it back into our queue - for (FastIDSet cluster : clusters) { - double similarity = clusterSimilarity.getSimilarity(merged, cluster); - if (similarity > queue.get(queue.size() - 1).getSimilarity()) { - ListIterator queueIterator = queue.listIterator(); - while (queueIterator.hasNext()) { - if (similarity > queueIterator.next().getSimilarity()) { - queueIterator.previous(); - break; - } - } - queueIterator.add(new ClusterClusterPair(merged, cluster, similarity)); - } - } - - // Finally add new cluster to list - clusters.add(merged); - - } - return done; - } - - private List findClosestClusters(int numUsers, - List clusters) throws TasteException { - Queue queue = - new PriorityQueue(numUsers + 1, Collections.reverseOrder()); - int size = clusters.size(); - for (int i = 0; i < size; i++) { - FastIDSet cluster1 = clusters.get(i); - for (int j = i + 1; j < size; j++) { - FastIDSet cluster2 = clusters.get(j); - double similarity = clusterSimilarity.getSimilarity(cluster1, cluster2); - if (!Double.isNaN(similarity)) { - if (queue.size() < numUsers) { - queue.add(new ClusterClusterPair(cluster1, cluster2, similarity)); - } else if (similarity > queue.poll().getSimilarity()) { - queue.add(new ClusterClusterPair(cluster1, cluster2, similarity)); - queue.poll(); - } - } - } - } - List result = Lists.newArrayList(queue); - Collections.sort(result); - return result; - } - - private FastByIDMap> computeTopRecsPerUserID(Iterable clusters) - throws TasteException { - FastByIDMap> recsPerUser = new 
FastByIDMap>(); - for (FastIDSet cluster : clusters) { - List recs = computeTopRecsForCluster(cluster); - LongPrimitiveIterator it = cluster.iterator(); - while (it.hasNext()) { - recsPerUser.put(it.nextLong(), recs); - } - } - return recsPerUser; - } - - private List computeTopRecsForCluster(FastIDSet cluster) throws TasteException { - - DataModel dataModel = getDataModel(); - FastIDSet possibleItemIDs = new FastIDSet(); - LongPrimitiveIterator it = cluster.iterator(); - while (it.hasNext()) { - possibleItemIDs.addAll(dataModel.getItemIDsFromUser(it.nextLong())); - } - - TopItems.Estimator estimator = new Estimator(cluster); - - List topItems = TopItems.getTopItems(NUM_CLUSTER_RECS, - possibleItemIDs.iterator(), null, estimator); - - log.debug("Recommendations are: {}", topItems); - return Collections.unmodifiableList(topItems); - } - - private static FastByIDMap computeClustersPerUserID(Collection clusters) { - FastByIDMap clustersPerUser = new FastByIDMap(clusters.size()); - for (FastIDSet cluster : clusters) { - LongPrimitiveIterator it = cluster.iterator(); - while (it.hasNext()) { - clustersPerUser.put(it.nextLong(), cluster); - } - } - return clustersPerUser; - } - - @Override - public void refresh(Collection alreadyRefreshed) { - refreshHelper.refresh(alreadyRefreshed); - } - - @Override - public String toString() { - return "TreeClusteringRecommender2[clusterSimilarity:" + clusterSimilarity + ']'; - } - - private final class Estimator implements TopItems.Estimator { - - private final FastIDSet cluster; - - private Estimator(FastIDSet cluster) { - this.cluster = cluster; - } - - @Override - public double estimate(Long itemID) throws TasteException { - DataModel dataModel = getDataModel(); - RunningAverage average = new FullRunningAverage(); - LongPrimitiveIterator it = cluster.iterator(); - while (it.hasNext()) { - Float pref = dataModel.getPreferenceValue(it.nextLong(), itemID); - if (pref != null) { - average.addDatum(pref); - } - } - return average.getAverage(); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/knn/ConjugateGradientOptimizer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/knn/ConjugateGradientOptimizer.java deleted file mode 100644 index dd5daa655..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/knn/ConjugateGradientOptimizer.java +++ /dev/null @@ -1,142 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender.knn; - -import java.util.Arrays; - -public final class ConjugateGradientOptimizer implements Optimizer { - - private static final double CONVERGENCE_LIMIT = 0.1; - private static final int MAX_ITERATIONS = 1000; - - /** - *

<p>
- * Conjugate gradient optimization. Matlab code:
- * </p>
- *
- * <p>
- *
- * <pre>
-   * function [x] = conjgrad(A,b,x0)
-   *   x = x0;
-   *   r = b - A*x0;
-   *   w = -r;
-   *   for i = 1:size(A);
-   *      z = A*w;
-   *      a = (r'*w)/(w'*z);
-   *      x = x + a*w;
-   *      r = r - a*z;
-   *      if ( norm(r) < 1e-10 )
-   *           break;
-   *      end
-   *      B = (r'*z)/(w'*z);
-   *      w = -r + B*w;
-   *   end
-   * end
-   * </pre>
- *
- * </p>
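- *
- * <p>
- * Editorial usage sketch (values assumed for illustration): for a symmetric positive-definite
- * system A = [[4, 1], [1, 3]] and b = [1, 2], the returned weights approximate the solution of
- * A*x = b, i.e. x = [1/11, 7/11], roughly [0.091, 0.636].
- * </p>
- *
- * <pre>
- * Optimizer cg = new ConjugateGradientOptimizer();
- * double[][] a = { { 4.0, 1.0 }, { 1.0, 3.0 } };
- * double[] b = { 1.0, 2.0 };
- * double[] x = cg.optimize(a, b);   // roughly [0.091, 0.636]
- * </pre>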

- * - * @param matrix - * matrix nxn positions - * @param b - * vector b, n positions - * @return vector of n weights - */ - @Override - public double[] optimize(double[][] matrix, double[] b) { - - int k = b.length; - double[] x = new double[k]; - double[] r = new double[k]; - double[] w = new double[k]; - double[] z = new double[k]; - Arrays.fill(x, 3.0 / k); - - // r = b - A*x0; - // w = -r; - for (int i = 0; i < k; i++) { - double v = 0.0; - double[] ai = matrix[i]; - for (int j = 0; j < k; j++) { - v += ai[j] * x[j]; - } - double ri = b[i] - v; - r[i] = ri; - w[i] = -ri; - } - - for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) { - - // z = A*w; - for (int i = 0; i < k; i++) { - double v = 0.0; - double[] ai = matrix[i]; - for (int j = 0; j < k; j++) { - v += ai[j] * w[j]; - } - z[i] = v; - } - - // a = (r'*w)/(w'*z); - double anum = 0.0; - double aden = 0.0; - for (int i = 0; i < k; i++) { - anum += r[i] * w[i]; - aden += w[i] * z[i]; - } - double a = anum / aden; - - // x = x + a*w; - // r = r - a*z; - for (int i = 0; i < k; i++) { - x[i] += a * w[i]; - r[i] -= a * z[i]; - } - - // stop when residual is close to 0 - double rdot = 0.0; - for (int i = 0; i < k; i++) { - double value = r[i]; - rdot += value * value; - } - if (rdot <= CONVERGENCE_LIMIT) { - break; - } - - // B = (r'*z)/(w'*z); - double bnum = 0.0; - double bden = 0.0; - for (int i = 0; i < k; i++) { - double zi = z[i]; - bnum += r[i] * zi; - bden += w[i] * zi; - } - double B = bnum / bden; - - // w = -r + B*w; - for (int i = 0; i < k; i++) { - w[i] = -r[i] + B * w[i]; - } - - } - - return x; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/knn/KnnItemBasedRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/knn/KnnItemBasedRecommender.java deleted file mode 100644 index 2cface4e6..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/knn/KnnItemBasedRecommender.java +++ /dev/null @@ -1,251 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.recommender.knn; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender; -import org.apache.mahout.cf.taste.impl.recommender.TopItems; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy; -import org.apache.mahout.cf.taste.recommender.MostSimilarItemsCandidateItemsStrategy; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.cf.taste.recommender.Rescorer; -import org.apache.mahout.cf.taste.similarity.ItemSimilarity; -import org.apache.mahout.common.LongPair; - -/** - *

<p>
- * The weights to compute the final predicted preferences are calculated using linear interpolation,
- * through an {@link Optimizer}. This algorithm is based on the paper of Robert M. Bell and
- * Yehuda Koren in ICDM '07.
- * </p>

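- *
- * <p>
- * For illustration only, an editorial sketch mirroring the estimation step below: once the
- * {@link Optimizer} has solved A*w = b for the neighborhood's interpolation weights w, the
- * predicted preference is the weighted average of the user's ratings of the neighbor items.
- * </p>
- *
- * <pre>
- * double num = 0.0;
- * double den = 0.0;
- * for (int j = 0; j < neighborRatings.length; j++) {   // user's ratings of neighbor items
- *   num += weights[j] * neighborRatings[j];
- *   den += weights[j];
- * }
- * double estimate = den == 0.0 ? Double.NaN : num / den;
- * </pre>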
- */ -public final class KnnItemBasedRecommender extends GenericItemBasedRecommender { - - private static final double BETA = 500.0; - - private final Optimizer optimizer; - private final int neighborhoodSize; - - public KnnItemBasedRecommender(DataModel dataModel, - ItemSimilarity similarity, - Optimizer optimizer, - CandidateItemsStrategy candidateItemsStrategy, - MostSimilarItemsCandidateItemsStrategy mostSimilarItemsCandidateItemsStrategy, - int neighborhoodSize) { - super(dataModel, similarity, candidateItemsStrategy, mostSimilarItemsCandidateItemsStrategy); - this.optimizer = optimizer; - this.neighborhoodSize = neighborhoodSize; - } - - public KnnItemBasedRecommender(DataModel dataModel, - ItemSimilarity similarity, - Optimizer optimizer, - int neighborhoodSize) { - this(dataModel, similarity, optimizer, getDefaultCandidateItemsStrategy(), - getDefaultMostSimilarItemsCandidateItemsStrategy(), neighborhoodSize); - } - - private List mostSimilarItems(long itemID, - LongPrimitiveIterator possibleItemIDs, - int howMany, - Rescorer rescorer) throws TasteException { - TopItems.Estimator estimator = new MostSimilarEstimator(itemID, getSimilarity(), rescorer); - return TopItems.getTopItems(howMany, possibleItemIDs, null, estimator); - } - - private double[] getInterpolations(long itemID, - long[] itemNeighborhood, - Collection usersRatedNeighborhood) throws TasteException { - - int length = 0; - for (int i = 0; i < itemNeighborhood.length; i++) { - if (itemNeighborhood[i] == itemID) { - itemNeighborhood[i] = -1; - length = itemNeighborhood.length - 1; - break; - } - } - - int k = length; - double[][] aMatrix = new double[k][k]; - double[] b = new double[k]; - int i = 0; - - DataModel dataModel = getDataModel(); - - int numUsers = usersRatedNeighborhood.size(); - for (long iitem : itemNeighborhood) { - if (iitem == -1) { - break; - } - int j = 0; - double value = 0.0; - for (long jitem : itemNeighborhood) { - if (jitem == -1) { - continue; - } - for (long user : usersRatedNeighborhood) { - float prefVJ = dataModel.getPreferenceValue(user, iitem); - float prefVK = dataModel.getPreferenceValue(user, jitem); - value += prefVJ * prefVK; - } - aMatrix[i][j] = value/numUsers; - j++; - } - i++; - } - - i = 0; - for (long jitem : itemNeighborhood) { - if (jitem == -1) { - break; - } - double value = 0.0; - for (long user : usersRatedNeighborhood) { - float prefVJ = dataModel.getPreferenceValue(user, jitem); - float prefVI = dataModel.getPreferenceValue(user, itemID); - value += prefVJ * prefVI; - } - b[i] = value / numUsers; - i++; - } - - // Find the larger diagonal and calculate the average - double avgDiagonal = 0.0; - if (k > 1) { - double diagonalA = 0.0; - for (i = 0; i < k; i++) { - diagonalA += aMatrix[i][i]; - } - double diagonalB = 0.0; - for (i = k - 1; i >= 0; i--) { - for (int j = 0; j < k; j++) { - diagonalB += aMatrix[i--][j]; - } - } - avgDiagonal = Math.max(diagonalA, diagonalB) / k; - } - // Calculate the average of non-diagonal values - double avgMatrixA = 0.0; - double avgVectorB = 0.0; - for (i = 0; i < k; i++) { - for (int j = 0; j < k; j++) { - if (i != j || k <= 1) { - avgMatrixA += aMatrix[i][j]; - } - } - avgVectorB += b[i]; - } - if (k > 1) { - avgMatrixA /= k * k - k; - } - avgVectorB /= k; - - double numUsersPlusBeta = numUsers + BETA; - for (i = 0; i < k; i++) { - for (int j = 0; j < k; j++) { - double average; - if (i == j && k > 1) { - average = avgDiagonal; - } else { - average = avgMatrixA; - } - aMatrix[i][j] = (numUsers * aMatrix[i][j] + BETA * average) / 
numUsersPlusBeta; - } - b[i] = (numUsers * b[i] + BETA * avgVectorB) / numUsersPlusBeta; - } - - return optimizer.optimize(aMatrix, b); - } - - @Override - protected float doEstimatePreference(long theUserID, PreferenceArray preferencesFromUser, long itemID) - throws TasteException { - - DataModel dataModel = getDataModel(); - int size = preferencesFromUser.length(); - FastIDSet possibleItemIDs = new FastIDSet(size); - for (int i = 0; i < size; i++) { - possibleItemIDs.add(preferencesFromUser.getItemID(i)); - } - possibleItemIDs.remove(itemID); - - List mostSimilar = mostSimilarItems(itemID, possibleItemIDs.iterator(), - neighborhoodSize, null); - long[] theNeighborhood = new long[mostSimilar.size() + 1]; - theNeighborhood[0] = -1; - - List usersRatedNeighborhood = new ArrayList(); - int nOffset = 0; - for (RecommendedItem rec : mostSimilar) { - theNeighborhood[nOffset++] = rec.getItemID(); - } - - if (!mostSimilar.isEmpty()) { - theNeighborhood[mostSimilar.size()] = itemID; - for (int i = 0; i < theNeighborhood.length; i++) { - PreferenceArray usersNeighborhood = dataModel.getPreferencesForItem(theNeighborhood[i]); - int size1 = usersRatedNeighborhood.isEmpty() ? usersNeighborhood.length() : usersRatedNeighborhood.size(); - for (int j = 0; j < size1; j++) { - if (i == 0) { - usersRatedNeighborhood.add(usersNeighborhood.getUserID(j)); - } else { - if (j >= usersRatedNeighborhood.size()) { - break; - } - long index = usersRatedNeighborhood.get(j); - if (!usersNeighborhood.hasPrefWithUserID(index) || index == theUserID) { - usersRatedNeighborhood.remove(index); - j--; - } - } - } - } - } - - double[] weights = null; - if (!mostSimilar.isEmpty()) { - weights = getInterpolations(itemID, theNeighborhood, usersRatedNeighborhood); - } - - int i = 0; - double preference = 0.0; - double totalSimilarity = 0.0; - for (long jitem : theNeighborhood) { - - Float pref = dataModel.getPreferenceValue(theUserID, jitem); - - if (pref != null) { - double weight = weights[i]; - preference += pref * weight; - totalSimilarity += weight; - } - i++; - - } - return totalSimilarity == 0.0 ? Float.NaN : (float) (preference / totalSimilarity); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/knn/NonNegativeQuadraticOptimizer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/knn/NonNegativeQuadraticOptimizer.java deleted file mode 100644 index b25090183..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/knn/NonNegativeQuadraticOptimizer.java +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender.knn; - -import java.util.Arrays; - -/** - * Non-negative Quadratic Optimization. Based on the paper of Robert M. Bell and Yehuda Koren in ICDM '07. - * Thanks to Dan Tillberg for the hints in the implementation. - */ -public final class NonNegativeQuadraticOptimizer implements Optimizer { - - private static final double EPSILON = 1.0e-10; - private static final double CONVERGENCE_LIMIT = 0.1; - private static final int MAX_ITERATIONS = 1000; - private static final double DEFAULT_STEP = 0.001; - - /** - * Non-negative Quadratic Optimization. - * - * @param matrix - * matrix nxn positions - * @param b - * vector b, n positions - * @return vector of n weights - */ - @Override - public double[] optimize(double[][] matrix, double[] b) { - int k = b.length; - double[] r = new double[k]; - double[] x = new double[k]; - Arrays.fill(x, 3.0 / k); - - for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) { - - double rdot = 0.0; - for (int n = 0; n < k; n++) { - double sumAw = 0.0; - double[] rowAn = matrix[n]; - for (int i = 0; i < k; i++) { - sumAw += rowAn[i] * x[i]; - } - // r = b - Ax; // the residual, or 'steepest gradient' - double rn = b[n] - sumAw; - - // find active variables - those that are pinned due to - // nonnegativity constraints; set respective ri's to zero - if (x[n] < EPSILON && rn < 0.0) { - rn = 0.0; - } else { - // max step size numerator - rdot += rn * rn; - } - r[n] = rn; - } - - if (rdot <= CONVERGENCE_LIMIT) { - break; - } - - // max step size denominator - double rArdotSum = 0.0; - for (int n = 0; n < k; n++) { - double sumAr = 0.0; - double[] rowAn = matrix[n]; - for (int i = 0; i < k; i++) { - sumAr += rowAn[i] * r[i]; - } - rArdotSum += r[n] * sumAr; - } - - // max step size - double stepSize = rdot / rArdotSum; - - if (Double.isNaN(stepSize)) { - stepSize = DEFAULT_STEP; - } - - // adjust step size to prevent negative values - for (int n = 0; n < k; n++) { - if (r[n] < 0.0) { - double absStepSize = stepSize < 0.0 ? 
-stepSize : stepSize; - stepSize = Math.min(absStepSize, Math.abs(x[n] / r[n])) * stepSize / absStepSize; - } - } - - // update x values - for (int n = 0; n < k; n++) { - x[n] += stepSize * r[n]; - if (x[n] < EPSILON) { - x[n] = 0.0; - } - } - - // TODO: do something in case of divergence - } - - return x; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/knn/Optimizer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/knn/Optimizer.java deleted file mode 100644 index cba6dead6..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/knn/Optimizer.java +++ /dev/null @@ -1,24 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender.knn; - -public interface Optimizer { - - double[] optimize(double[][] matrix, double[] b); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/MemoryDiffStorage.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/MemoryDiffStorage.java deleted file mode 100644 index 30401e5ef..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/MemoryDiffStorage.java +++ /dev/null @@ -1,400 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.recommender.slopeone; - -import java.util.Collection; -import java.util.Iterator; -import java.util.Map; -import java.util.concurrent.Callable; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.common.Weighting; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.slopeone.DiffStorage; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *

- * An implementation of {@link DiffStorage} that merely stores item-item diffs in memory. It is fast, but can - * consume a great deal of memory. - *
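A rough usage sketch (not part of this file; the ratings path and the 10000-entry cap are invented for illustration, and the usual org.apache.mahout.cf.taste imports are assumed): the storage is built once over a DataModel and then queried for per-pair average diffs.

DataModel model = new FileDataModel(new File("ratings.csv"));
DiffStorage diffs = new MemoryDiffStorage(model, Weighting.UNWEIGHTED, 10000L);
// Average of pref(456) - pref(123) over all users who rated both items, if tracked.
RunningAverage d = diffs.getDiff(123L, 456L);
if (d != null) {
  System.out.println(d.getAverage() + " over " + d.getCount() + " co-ratings");
}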

- */ -public final class MemoryDiffStorage implements DiffStorage { - - private static final Logger log = LoggerFactory.getLogger(MemoryDiffStorage.class); - - private final DataModel dataModel; - private final boolean stdDevWeighted; - private final long maxEntries; - private final FastByIDMap> averageDiffs; - private final FastByIDMap averageItemPref; - private final FastIDSet allRecommendableItemIDs; - private final ReadWriteLock buildAverageDiffsLock; - private final RefreshHelper refreshHelper; - - /** - *

- * See {@link SlopeOneRecommender} for the meaning of - * {@code stdDevWeighted}. {@code maxEntries} controls the maximum number of - * item-item average preference differences that will be tracked internally. After the limit is reached, if - * a new item-item pair is observed in the data it will be ignored. This is recommended for large datasets. - * The first {@code maxEntries} item-item pairs observed in the data are tracked. Assuming that item - * ratings are reasonably distributed among users, this should only ignore item-item pairs that are very - * infrequently co-rated by a user. The intuition is that data on these infrequently co-rated item-item - * pairs is less reliable and should be the first that is ignored. This parameter can be used to limit the - * memory requirements of {@link SlopeOneRecommender}, which otherwise grow as the square of the number of - * items that exist in the {@link DataModel}. Memory requirements can reach gigabytes with only about 10000 - * items, so this may be necessary on larger datasets. - * - * @param stdDevWeighted - * see {@link SlopeOneRecommender} - * @param maxEntries - * maximum number of item-item average preference differences to track internally - * @throws IllegalArgumentException - * if {@code maxEntries} is not positive or {@code dataModel} is null - */ - public MemoryDiffStorage(DataModel dataModel, - Weighting stdDevWeighted, - long maxEntries) throws TasteException { - Preconditions.checkArgument(dataModel != null, "dataModel is null"); - Preconditions.checkArgument(dataModel.getNumItems() >= 1, "dataModel has no items"); - Preconditions.checkArgument(maxEntries > 0L, "maxEntries must be positive"); - this.dataModel = dataModel; - this.stdDevWeighted = stdDevWeighted == Weighting.WEIGHTED; - this.maxEntries = maxEntries; - this.averageDiffs = new FastByIDMap>(); - this.averageItemPref = new FastByIDMap(); - this.buildAverageDiffsLock = new ReentrantReadWriteLock(); - this.allRecommendableItemIDs = new FastIDSet(dataModel.getNumItems()); - this.refreshHelper = new RefreshHelper(new Callable() { - @Override - public Object call() throws TasteException { - buildAverageDiffs(); - return null; - } - }); - refreshHelper.addDependency(dataModel); - buildAverageDiffs(); - } - - @Override - public RunningAverage getDiff(long itemID1, long itemID2) { - - boolean inverted = false; - if (itemID1 > itemID2) { - inverted = true; - long temp = itemID1; - itemID1 = itemID2; - itemID2 = temp; - } - - FastByIDMap level2Map; - try { - buildAverageDiffsLock.readLock().lock(); - level2Map = averageDiffs.get(itemID1); - } finally { - buildAverageDiffsLock.readLock().unlock(); - } - RunningAverage average = null; - if (level2Map != null) { - average = level2Map.get(itemID2); - } - if (inverted) { - return average == null ? 
null : average.inverse(); - } else { - return average; - } - } - - @Override - public RunningAverage[] getDiffs(long userID, long itemID, PreferenceArray prefs) { - try { - buildAverageDiffsLock.readLock().lock(); - int size = prefs.length(); - RunningAverage[] result = new RunningAverage[size]; - for (int i = 0; i < size; i++) { - result[i] = getDiff(prefs.getItemID(i), itemID); - } - return result; - } finally { - buildAverageDiffsLock.readLock().unlock(); - } - } - - @Override - public RunningAverage getAverageItemPref(long itemID) { - return averageItemPref.get(itemID); - } - - @Override - public void addItemPref(long userID, long itemIDA, float prefValue) throws TasteException { - PreferenceArray userPreferences = dataModel.getPreferencesFromUser(userID); - try { - buildAverageDiffsLock.writeLock().lock(); - - FastByIDMap aMap = averageDiffs.get(itemIDA); - if (aMap == null) { - aMap = new FastByIDMap(); - averageDiffs.put(itemIDA, aMap); - } - - int length = userPreferences.length(); - for (int i = 0; i < length; i++) { - long itemIDB = userPreferences.getItemID(i); - float bValue = userPreferences.getValue(i); - if (itemIDA < itemIDB) { - RunningAverage average = aMap.get(itemIDB); - if (average == null) { - average = buildRunningAverage(); - aMap.put(itemIDB, average); - } - average.addDatum(bValue - prefValue); - } else { - FastByIDMap bMap = averageDiffs.get(itemIDB); - if (bMap == null) { - bMap = new FastByIDMap(); - averageDiffs.put(itemIDB, bMap); - } - RunningAverage average = bMap.get(itemIDA); - if (average == null) { - average = buildRunningAverage(); - bMap.put(itemIDA, average); - } - average.addDatum(prefValue - bValue); - } - } - - } finally { - buildAverageDiffsLock.writeLock().unlock(); - } - } - - @Override - public void updateItemPref(long itemID, float prefDelta) { - if (stdDevWeighted) { - throw new UnsupportedOperationException("Can't update only when stdDevWeighted is set"); - } - try { - buildAverageDiffsLock.readLock().lock(); - for (Map.Entry> entry : averageDiffs.entrySet()) { - boolean matchesItemID1 = itemID == entry.getKey(); - for (Map.Entry entry2 : entry.getValue().entrySet()) { - RunningAverage average = entry2.getValue(); - if (matchesItemID1) { - average.changeDatum(-prefDelta); - } else if (itemID == entry2.getKey()) { - average.changeDatum(prefDelta); - } - } - } - RunningAverage itemAverage = averageItemPref.get(itemID); - if (itemAverage != null) { - itemAverage.changeDatum(prefDelta); - } - } finally { - buildAverageDiffsLock.readLock().unlock(); - } - } - - @Override - public void removeItemPref(long userID, long itemIDA, float prefValue) throws TasteException { - PreferenceArray userPreferences = dataModel.getPreferencesFromUser(userID); - try { - buildAverageDiffsLock.writeLock().lock(); - - FastByIDMap aMap = averageDiffs.get(itemIDA); - - int length = userPreferences.length(); - for (int i = 0; i < length; i++) { - - long itemIDB = userPreferences.getItemID(i); - float bValue = userPreferences.getValue(i); - - if (itemIDA < itemIDB) { - - if (aMap != null) { - RunningAverage average = aMap.get(itemIDB); - if (average != null) { - if (average.getCount() <= 1) { - aMap.remove(itemIDB); - } else { - average.removeDatum(bValue - prefValue); - } - } - } - - } else if (itemIDA > itemIDB) { - - FastByIDMap bMap = averageDiffs.get(itemIDB); - if (bMap != null) { - RunningAverage average = bMap.get(itemIDA); - if (average != null) { - if (average.getCount() <= 1) { - aMap.remove(itemIDA); - } else { - average.removeDatum(prefValue - bValue); - } 
- } - } - - } - } - - } finally { - buildAverageDiffsLock.writeLock().unlock(); - } - } - - @Override - public FastIDSet getRecommendableItemIDs(long userID) throws TasteException { - FastIDSet result; - try { - buildAverageDiffsLock.readLock().lock(); - result = allRecommendableItemIDs.clone(); - } finally { - buildAverageDiffsLock.readLock().unlock(); - } - Iterator it = result.iterator(); - while (it.hasNext()) { - if (dataModel.getPreferenceValue(userID, it.next()) != null) { - it.remove(); - } - } - return result; - } - - private void buildAverageDiffs() throws TasteException { - log.info("Building average diffs..."); - try { - buildAverageDiffsLock.writeLock().lock(); - averageDiffs.clear(); - long averageCount = 0L; - LongPrimitiveIterator it = dataModel.getUserIDs(); - while (it.hasNext()) { - averageCount = processOneUser(averageCount, it.nextLong()); - } - - pruneInconsequentialDiffs(); - updateAllRecommendableItems(); - - } finally { - buildAverageDiffsLock.writeLock().unlock(); - } - } - - private void pruneInconsequentialDiffs() { - // Go back and prune inconsequential diffs. "Inconsequential" means, here, only represented by one - // data point, so possibly unreliable - Iterator>> it1 = averageDiffs.entrySet().iterator(); - while (it1.hasNext()) { - FastByIDMap map = it1.next().getValue(); - Iterator> it2 = map.entrySet().iterator(); - while (it2.hasNext()) { - RunningAverage average = it2.next().getValue(); - if (average.getCount() <= 1) { - it2.remove(); - } - } - if (map.isEmpty()) { - it1.remove(); - } else { - map.rehash(); - } - } - averageDiffs.rehash(); - } - - private void updateAllRecommendableItems() throws TasteException { - FastIDSet ids = new FastIDSet(dataModel.getNumItems()); - for (Map.Entry> entry : averageDiffs.entrySet()) { - ids.add(entry.getKey()); - LongPrimitiveIterator it = entry.getValue().keySetIterator(); - while (it.hasNext()) { - ids.add(it.next()); - } - } - allRecommendableItemIDs.clear(); - allRecommendableItemIDs.addAll(ids); - allRecommendableItemIDs.rehash(); - } - - private long processOneUser(long averageCount, long userID) throws TasteException { - log.debug("Processing prefs for user {}", userID); - // Save off prefs for the life of this loop iteration - PreferenceArray userPreferences = dataModel.getPreferencesFromUser(userID); - int length = userPreferences.length(); - for (int i = 0; i < length - 1; i++) { - float prefAValue = userPreferences.getValue(i); - long itemIDA = userPreferences.getItemID(i); - FastByIDMap aMap = averageDiffs.get(itemIDA); - if (aMap == null) { - aMap = new FastByIDMap(); - averageDiffs.put(itemIDA, aMap); - } - for (int j = i + 1; j < length; j++) { - // This is a performance-critical block - long itemIDB = userPreferences.getItemID(j); - RunningAverage average = aMap.get(itemIDB); - if (average == null && averageCount < maxEntries) { - average = buildRunningAverage(); - aMap.put(itemIDB, average); - averageCount++; - } - if (average != null) { - average.addDatum(userPreferences.getValue(j) - prefAValue); - } - } - RunningAverage itemAverage = averageItemPref.get(itemIDA); - if (itemAverage == null) { - itemAverage = buildRunningAverage(); - averageItemPref.put(itemIDA, itemAverage); - } - itemAverage.addDatum(prefAValue); - } - return averageCount; - } - - private RunningAverage buildRunningAverage() { - return stdDevWeighted ? 
new FullRunningAverageAndStdDev() : new FullRunningAverage(); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - refreshHelper.refresh(alreadyRefreshed); - } - - @Override - public String toString() { - return "MemoryDiffStorage"; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/SlopeOneRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/SlopeOneRecommender.java deleted file mode 100644 index d91ff4632..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/SlopeOneRecommender.java +++ /dev/null @@ -1,226 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender.slopeone; - -import java.util.Collection; -import java.util.List; - -import org.apache.mahout.cf.taste.common.NoSuchUserException; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.common.Weighting; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev; -import org.apache.mahout.cf.taste.impl.recommender.AbstractRecommender; -import org.apache.mahout.cf.taste.impl.recommender.TopItems; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.IDRescorer; -import org.apache.mahout.cf.taste.recommender.RecommendedItem; -import org.apache.mahout.cf.taste.recommender.slopeone.DiffStorage; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *

- * A basic "slope one" recommender. (See an - * excellent summary here for example.) This {@link org.apache.mahout.cf.taste.recommender.Recommender} is - * especially suitable when user preferences are updating frequently as it can incorporate this information - * without expensive recomputation. - *

- * - *

- * This implementation can also be used as a "weighted slope one" recommender. - *

- */ -public final class SlopeOneRecommender extends AbstractRecommender { - - private static final Logger log = LoggerFactory.getLogger(SlopeOneRecommender.class); - - private final boolean weighted; - private final boolean stdDevWeighted; - private final DiffStorage diffStorage; - - /** - *

- * Creates a default (weighted) {@code SlopeOneRecommender} based on the given {@link DataModel}. - *

- */ - public SlopeOneRecommender(DataModel dataModel) throws TasteException { - this(dataModel, - Weighting.WEIGHTED, - Weighting.WEIGHTED, - new MemoryDiffStorage(dataModel, Weighting.WEIGHTED, Long.MAX_VALUE)); - } - - /** - *

- * Creates a {@code SlopeOneRecommender} based on the given {@link DataModel}. - *

- * - *

- * If {@code weighted} is set, this acts as a weighted slope one recommender. This implementation also - * includes an experimental "standard deviation" weighting, which weights item-item rating diffs with lower - * standard deviation more highly, on the theory that they are more reliable (see the construction sketch just below). - *
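A construction sketch under those options ("model" is an assumed DataModel and the entry cap is invented; passing Weighting.WEIGHTED to MemoryDiffStorage makes it track the standard deviations this weighting needs):

DiffStorage storage = new MemoryDiffStorage(model, Weighting.WEIGHTED, 100000L);
SlopeOneRecommender recommender =
    new SlopeOneRecommender(model, Weighting.WEIGHTED, Weighting.WEIGHTED, storage);
List<RecommendedItem> top5 = recommender.recommend(42L, 5);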

- * - * @param weighting - * if {@link Weighting#WEIGHTED}, acts as a weighted slope one recommender - * @param stdDevWeighting - * use optional standard deviation weighting of diffs - * @throws IllegalArgumentException - * if {@code diffStorage} is null, or stdDevWeighted is set when weighted is not set - */ - public SlopeOneRecommender(DataModel dataModel, - Weighting weighting, - Weighting stdDevWeighting, - DiffStorage diffStorage) { - super(dataModel); - Preconditions.checkArgument(stdDevWeighting != Weighting.WEIGHTED || weighting != Weighting.UNWEIGHTED, - "weighted required when stdDevWeighted is set"); - Preconditions.checkArgument(diffStorage != null, "diffStorage is null"); - this.weighted = weighting == Weighting.WEIGHTED; - this.stdDevWeighted = stdDevWeighting == Weighting.WEIGHTED; - this.diffStorage = diffStorage; - } - - @Override - public List recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { - Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1"); - log.debug("Recommending items for user ID '{}'", userID); - - FastIDSet possibleItemIDs = diffStorage.getRecommendableItemIDs(userID); - - TopItems.Estimator estimator = new Estimator(userID); - - List topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer, - estimator); - - log.debug("Recommendations are: {}", topItems); - return topItems; - } - - @Override - public float estimatePreference(long userID, long itemID) throws TasteException { - DataModel model = getDataModel(); - Float actualPref = model.getPreferenceValue(userID, itemID); - if (actualPref != null) { - return actualPref; - } - return doEstimatePreference(userID, itemID); - } - - private float doEstimatePreference(long userID, long itemID) throws TasteException { - double count = 0.0; - double totalPreference = 0.0; - PreferenceArray prefs = getDataModel().getPreferencesFromUser(userID); - RunningAverage[] averages = diffStorage.getDiffs(userID, itemID, prefs); - int size = prefs.length(); - for (int i = 0; i < size; i++) { - RunningAverage averageDiff = averages[i]; - if (averageDiff != null) { - double averageDiffValue = averageDiff.getAverage(); - if (weighted) { - double weight = averageDiff.getCount(); - if (stdDevWeighted) { - double stdev = ((RunningAverageAndStdDev) averageDiff).getStandardDeviation(); - if (!Double.isNaN(stdev)) { - weight /= 1.0 + stdev; - } - // If stdev is NaN, then it is because count is 1. Because we're weighting by count, - // the weight is already relatively low. We effectively assume stdev is 0.0 here and - // that is reasonable enough. Otherwise, dividing by NaN would yield a weight of NaN - // and disqualify this pref entirely - // (Thanks Daemmon) - } - totalPreference += weight * (prefs.getValue(i) + averageDiffValue); - count += weight; - } else { - totalPreference += prefs.getValue(i) + averageDiffValue; - count += 1.0; - } - } - } - if (count <= 0.0) { - RunningAverage itemAverage = diffStorage.getAverageItemPref(itemID); - return itemAverage == null ? 
Float.NaN : (float) itemAverage.getAverage(); - } else { - return (float) (totalPreference / count); - } - } - - @Override - public void setPreference(long userID, long itemID, float value) throws TasteException { - DataModel dataModel = getDataModel(); - Float oldPref; - try { - oldPref = dataModel.getPreferenceValue(userID, itemID); - } catch (NoSuchUserException nsee) { - oldPref = null; - } - super.setPreference(userID, itemID, value); - if (oldPref == null) { - // Add new preference - diffStorage.addItemPref(userID, itemID, value); - } else { - // Update preference - diffStorage.updateItemPref(itemID, value - oldPref); - } - } - - @Override - public void removePreference(long userID, long itemID) throws TasteException { - DataModel dataModel = getDataModel(); - Float oldPref = dataModel.getPreferenceValue(userID, itemID); - super.removePreference(userID, itemID); - if (oldPref != null) { - diffStorage.removeItemPref(userID, itemID, oldPref); - } - } - - @Override - public void refresh(Collection alreadyRefreshed) { - alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); - RefreshHelper.maybeRefresh(alreadyRefreshed, diffStorage); - } - - @Override - public String toString() { - return "SlopeOneRecommender[weighted:" + weighted + ", stdDevWeighted:" + stdDevWeighted - + ", diffStorage:" + diffStorage + ']'; - } - - private final class Estimator implements TopItems.Estimator { - - private final long userID; - - private Estimator(long userID) { - this.userID = userID; - } - - @Override - public double estimate(Long itemID) throws TasteException { - return doEstimatePreference(userID, itemID); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/file/FileDiffStorage.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/file/FileDiffStorage.java deleted file mode 100644 index 6840ac922..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/file/FileDiffStorage.java +++ /dev/null @@ -1,324 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.recommender.slopeone.file; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.Collection; -import java.util.Iterator; -import java.util.Map; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; -import java.util.regex.Pattern; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.Weighting; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.impl.recommender.slopeone.SlopeOneRecommender; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.recommender.slopeone.DiffStorage; -import org.apache.mahout.common.iterator.FileLineIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *

- * {@link DiffStorage} which reads pre-computed diffs from a file and stores them in memory. The file should have - * one diff per line: - *

- * - * {@code itemID1,itemID2,diff[,count[,mk,sk]]} - * - *

- * The fourth column is optional; it is a count representing the number of occurrences of the item-item pair - * that contribute to the diff, and is assumed to be 1 if not present. The fifth and sixth columns are - * computed values used by {@link FullRunningAverageAndStdDev} implementations to compute a running standard deviation. - * They are required if using {@link Weighting#WEIGHTED} with {@link SlopeOneRecommender}; a sample file is sketched just below. - *
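For instance, a small hypothetical diffs file (all values invented) could mix the three allowed shapes; lines starting with '#' are skipped as comments:

# itemID1,itemID2,diff[,count[,mk,sk]]
101,102,0.5
101,103,-0.25,12
102,103,0.75,40,0.75,3.2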

- * - *

- * Commas or tabs can be delimiters. This is intended for use in conjunction with the output of - * {@link org.apache.mahout.cf.taste.hadoop.slopeone.SlopeOneAverageDiffsJob}. - *

- * - *

- * Note that the same item-item pair should not appear on multiple lines -- one line per item-item pair.
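Wiring such a file into a recommender might look like the following sketch ("diffs.csv", the entry cap, and "model" are assumptions for illustration):

DiffStorage storage = new FileDiffStorage(new File("diffs.csv"), 1000000L);
Recommender recommender =
    new SlopeOneRecommender(model, Weighting.UNWEIGHTED, Weighting.UNWEIGHTED, storage);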

- */ -public final class FileDiffStorage implements DiffStorage { - - private static final Logger log = LoggerFactory.getLogger(FileDiffStorage.class); - - private static final long MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute? - private static final char COMMENT_CHAR = '#'; - private static final Pattern SEPARATOR = Pattern.compile("[\t,]"); - - private final File dataFile; - private long lastModified; - private final long maxEntries; - private final FastByIDMap> averageDiffs; - private final FastIDSet allRecommendableItemIDs; - private final ReadWriteLock buildAverageDiffsLock; - - /** - * @param dataFile - * diffs file - * @param maxEntries - * maximum number of diffs to store - * @throws FileNotFoundException - * if data file does not exist or is a directory - */ - public FileDiffStorage(File dataFile, long maxEntries) throws FileNotFoundException { - Preconditions.checkArgument(dataFile != null, "dataFile is null"); - if (!dataFile.exists() || dataFile.isDirectory()) { - throw new FileNotFoundException(dataFile.toString()); - } - Preconditions.checkArgument(maxEntries > 0L, "maxEntries must be positive"); - log.info("Creating FileDataModel for file {}", dataFile); - this.dataFile = dataFile.getAbsoluteFile(); - this.lastModified = dataFile.lastModified(); - this.maxEntries = maxEntries; - this.averageDiffs = new FastByIDMap>(); - this.allRecommendableItemIDs = new FastIDSet(); - this.buildAverageDiffsLock = new ReentrantReadWriteLock(); - - buildDiffs(); - } - - private void buildDiffs() { - if (buildAverageDiffsLock.writeLock().tryLock()) { - try { - - averageDiffs.clear(); - allRecommendableItemIDs.clear(); - - FileLineIterator iterator = new FileLineIterator(dataFile, false); - String firstLine = iterator.peek(); - while (firstLine.isEmpty() || firstLine.charAt(0) == COMMENT_CHAR) { - iterator.next(); - firstLine = iterator.peek(); - } - long averageCount = 0L; - while (iterator.hasNext()) { - averageCount = processLine(iterator.next(), averageCount); - } - - pruneInconsequentialDiffs(); - updateAllRecommendableItems(); - - } catch (IOException ioe) { - log.warn("Exception while reloading", ioe); - } finally { - buildAverageDiffsLock.writeLock().unlock(); - } - } - } - - private long processLine(String line, long averageCount) { - - if (line.isEmpty() || line.charAt(0) == COMMENT_CHAR) { - return averageCount; - } - - String[] tokens = SEPARATOR.split(line); - Preconditions.checkArgument(tokens.length >=3 && tokens.length != 5, "Bad line: %s", line); - - long itemID1 = Long.parseLong(tokens[0]); - long itemID2 = Long.parseLong(tokens[1]); - double diff = Double.parseDouble(tokens[2]); - int count = tokens.length >= 4 ? 
Integer.parseInt(tokens[3]) : 1; - boolean hasMkSk = tokens.length >= 5; - - if (itemID1 > itemID2) { - long temp = itemID1; - itemID1 = itemID2; - itemID2 = temp; - } - - FastByIDMap level1Map = averageDiffs.get(itemID1); - if (level1Map == null) { - level1Map = new FastByIDMap(); - averageDiffs.put(itemID1, level1Map); - } - RunningAverage average = level1Map.get(itemID2); - if (average != null) { - throw new IllegalArgumentException("Duplicated line for item-item pair " + itemID1 + " / " + itemID2); - } - if (averageCount < maxEntries) { - if (hasMkSk) { - double mk = Double.parseDouble(tokens[4]); - double sk = Double.parseDouble(tokens[5]); - average = new FullRunningAverageAndStdDev(count, diff, mk, sk); - } else { - average = new FullRunningAverage(count, diff); - } - level1Map.put(itemID2, average); - averageCount++; - } - - allRecommendableItemIDs.add(itemID1); - allRecommendableItemIDs.add(itemID2); - - return averageCount; - } - - private void pruneInconsequentialDiffs() { - // Go back and prune inconsequential diffs. "Inconsequential" means, here, only represented by one - // data point, so possibly unreliable - Iterator>> it1 = averageDiffs.entrySet().iterator(); - while (it1.hasNext()) { - FastByIDMap map = it1.next().getValue(); - Iterator> it2 = map.entrySet().iterator(); - while (it2.hasNext()) { - RunningAverage average = it2.next().getValue(); - if (average.getCount() <= 1) { - it2.remove(); - } - } - if (map.isEmpty()) { - it1.remove(); - } else { - map.rehash(); - } - } - averageDiffs.rehash(); - } - - private void updateAllRecommendableItems() { - for (Map.Entry> entry : averageDiffs.entrySet()) { - allRecommendableItemIDs.add(entry.getKey()); - LongPrimitiveIterator it = entry.getValue().keySetIterator(); - while (it.hasNext()) { - allRecommendableItemIDs.add(it.next()); - } - } - allRecommendableItemIDs.rehash(); - } - - @Override - public RunningAverage getDiff(long itemID1, long itemID2) { - - boolean inverted = false; - if (itemID1 > itemID2) { - inverted = true; - long temp = itemID1; - itemID1 = itemID2; - itemID2 = temp; - } - - FastByIDMap level2Map; - try { - buildAverageDiffsLock.readLock().lock(); - level2Map = averageDiffs.get(itemID1); - } finally { - buildAverageDiffsLock.readLock().unlock(); - } - RunningAverage average = null; - if (level2Map != null) { - average = level2Map.get(itemID2); - } - if (inverted) { - return average == null ? null : average.inverse(); - } else { - return average; - } - } - - @Override - public RunningAverage[] getDiffs(long userID, long itemID, PreferenceArray prefs) { - try { - buildAverageDiffsLock.readLock().lock(); - int size = prefs.length(); - RunningAverage[] result = new RunningAverage[size]; - for (int i = 0; i < size; i++) { - result[i] = getDiff(prefs.getItemID(i), itemID); - } - return result; - } finally { - buildAverageDiffsLock.readLock().unlock(); - } - } - - @Override - public RunningAverage getAverageItemPref(long itemID) { - return null; // TODO can't do this without a DataModel - } - - @Override - public void addItemPref(long userID, long itemIDA, float prefValue) { - // Can't do this without a DataModel; should it just be a no-op? 
- throw new UnsupportedOperationException(); - } - - @Override - public void updateItemPref(long itemID, float prefDelta) { - try { - buildAverageDiffsLock.readLock().lock(); - for (Map.Entry> entry : averageDiffs.entrySet()) { - boolean matchesItemID1 = itemID == entry.getKey(); - for (Map.Entry entry2 : entry.getValue().entrySet()) { - RunningAverage average = entry2.getValue(); - if (matchesItemID1) { - average.changeDatum(-prefDelta); - } else if (itemID == entry2.getKey()) { - average.changeDatum(prefDelta); - } - } - } - // RunningAverage itemAverage = averageItemPref.get(itemID); - // if (itemAverage != null) { - // itemAverage.changeDatum(prefDelta); - // } - } finally { - buildAverageDiffsLock.readLock().unlock(); - } - } - - @Override - public void removeItemPref(long userID, long itemIDA, float prefValue) { - // Can't do this without a DataModel; should it just be a no-op? - throw new UnsupportedOperationException(); - } - - @Override - public FastIDSet getRecommendableItemIDs(long userID) { - try { - buildAverageDiffsLock.readLock().lock(); - return allRecommendableItemIDs.clone(); - } finally { - buildAverageDiffsLock.readLock().unlock(); - } - } - - @Override - public void refresh(Collection alreadyRefreshed) { - long mostRecentModification = dataFile.lastModified(); - if (mostRecentModification > lastModified + MIN_RELOAD_INTERVAL_MS) { - log.debug("File has changed; reloading..."); - lastModified = mostRecentModification; - buildDiffs(); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java deleted file mode 100644 index c1094e537..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java +++ /dev/null @@ -1,217 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.recommender.svd; - -import com.google.common.collect.Lists; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.als.AlternatingLeastSquaresSolver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.List; -import java.util.Random; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; - -/** - * factorizes the rating matrix using "Alternating-Least-Squares with Weighted-λ-Regularization" as described in - * the paper - * - * "Large-scale Collaborative Filtering for the Netflix Prize" - */ -public class ALSWRFactorizer extends AbstractFactorizer { - - private final DataModel dataModel; - - /** number of features used to compute this factorization */ - private final int numFeatures; - /** parameter to control the regularization */ - private final double lambda; - /** number of iterations */ - private final int numIterations; - - private static final Logger log = LoggerFactory.getLogger(ALSWRFactorizer.class); - - public ALSWRFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations) throws TasteException { - super(dataModel); - this.dataModel = dataModel; - this.numFeatures = numFeatures; - this.lambda = lambda; - this.numIterations = numIterations; - } - - static class Features { - - private final DataModel dataModel; - private final int numFeatures; - - private final double[][] M; - private final double[][] U; - - Features(ALSWRFactorizer factorizer) throws TasteException { - dataModel = factorizer.dataModel; - numFeatures = factorizer.numFeatures; - Random random = RandomUtils.getRandom(); - M = new double[dataModel.getNumItems()][numFeatures]; - LongPrimitiveIterator itemIDsIterator = dataModel.getItemIDs(); - while (itemIDsIterator.hasNext()) { - long itemID = itemIDsIterator.nextLong(); - int itemIDIndex = factorizer.itemIndex(itemID); - M[itemIDIndex][0] = averateRating(itemID); - for (int feature = 1; feature < numFeatures; feature++) { - M[itemIDIndex][feature] = random.nextDouble() * 0.1; - } - } - U = new double[dataModel.getNumUsers()][numFeatures]; - } - - double[][] getM() { - return M; - } - - double[][] getU() { - return U; - } - - Vector getUserFeatureColumn(int index) { - return new DenseVector(U[index]); - } - - Vector getItemFeatureColumn(int index) { - return new DenseVector(M[index]); - } - - void setFeatureColumnInU(int idIndex, Vector vector) { - setFeatureColumn(U, idIndex, vector); - } - - void setFeatureColumnInM(int idIndex, Vector vector) { - setFeatureColumn(M, idIndex, vector); - } - - protected void setFeatureColumn(double[][] matrix, int idIndex, Vector vector) { - for (int feature = 0; feature < numFeatures; feature++) { - matrix[idIndex][feature] = vector.get(feature); - } - } - - protected double averateRating(long itemID) throws TasteException { - PreferenceArray prefs = dataModel.getPreferencesForItem(itemID); - RunningAverage avg = new FullRunningAverage(); - for 
(Preference pref : prefs) { - avg.addDatum(pref.getValue()); - } - return avg.getAverage(); - } - } - - @Override - public Factorization factorize() throws TasteException { - log.info("starting to compute the factorization..."); - final AlternatingLeastSquaresSolver solver = new AlternatingLeastSquaresSolver(); - final Features features = new Features(this); - - for (int iteration = 0; iteration < numIterations; iteration++) { - log.info("iteration {}", iteration); - - /* fix M - compute U */ - ExecutorService queue = createQueue(); - LongPrimitiveIterator userIDsIterator = dataModel.getUserIDs(); - try { - while (userIDsIterator.hasNext()) { - final long userID = userIDsIterator.nextLong(); - final LongPrimitiveIterator itemIDsFromUser = dataModel.getItemIDsFromUser(userID).iterator(); - final PreferenceArray userPrefs = dataModel.getPreferencesFromUser(userID); - queue.execute(new Runnable() { - @Override - public void run() { - List featureVectors = Lists.newArrayList(); - while (itemIDsFromUser.hasNext()) { - long itemID = itemIDsFromUser.nextLong(); - featureVectors.add(features.getItemFeatureColumn(itemIndex(itemID))); - } - Vector userFeatures = solver.solve(featureVectors, ratingVector(userPrefs), lambda, numFeatures); - features.setFeatureColumnInU(userIndex(userID), userFeatures); - } - }); - } - } finally { - queue.shutdown(); - try { - queue.awaitTermination(dataModel.getNumUsers(), TimeUnit.SECONDS); - } catch (InterruptedException e) { - log.warn("Error when computing user features", e); - } - } - - /* fix U - compute M */ - queue = createQueue(); - LongPrimitiveIterator itemIDsIterator = dataModel.getItemIDs(); - try { - while (itemIDsIterator.hasNext()) { - final long itemID = itemIDsIterator.nextLong(); - final PreferenceArray itemPrefs = dataModel.getPreferencesForItem(itemID); - queue.execute(new Runnable() { - @Override - public void run() { - List featureVectors = Lists.newArrayList(); - for (Preference pref : itemPrefs) { - long userID = pref.getUserID(); - featureVectors.add(features.getUserFeatureColumn(userIndex(userID))); - } - Vector itemFeatures = solver.solve(featureVectors, ratingVector(itemPrefs), lambda, numFeatures); - features.setFeatureColumnInM(itemIndex(itemID), itemFeatures); - } - }); - } - } finally { - queue.shutdown(); - try { - queue.awaitTermination(dataModel.getNumItems(), TimeUnit.SECONDS); - } catch (InterruptedException e) { - log.warn("Error when computing item features", e); - } - } - } - - log.info("finished computation of the factorization..."); - return createFactorization(features.getU(), features.getM()); - } - - protected ExecutorService createQueue() { - return Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()); - } - - protected Vector ratingVector(PreferenceArray prefs) { - double[] ratings = new double[prefs.length()]; - for (int n = 0; n < prefs.length(); n++) { - ratings[n] = prefs.get(n).getValue(); - } - return new DenseVector(ratings); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/AbstractFactorizer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/AbstractFactorizer.java deleted file mode 100644 index 52252224a..000000000 --- 
a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/AbstractFactorizer.java +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender.svd; - -import java.util.Collection; -import java.util.concurrent.Callable; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.model.DataModel; - -/** - * base class for {@link Factorizer}s, provides ID to index mapping - */ -public abstract class AbstractFactorizer implements Factorizer { - - private final DataModel dataModel; - private FastByIDMap userIDMapping; - private FastByIDMap itemIDMapping; - private final RefreshHelper refreshHelper; - - protected AbstractFactorizer(DataModel dataModel) throws TasteException { - this.dataModel = dataModel; - buildMappings(); - refreshHelper = new RefreshHelper(new Callable() { - @Override - public Object call() throws TasteException { - buildMappings(); - return null; - } - }); - refreshHelper.addDependency(dataModel); - } - - private void buildMappings() throws TasteException { - userIDMapping = createIDMapping(dataModel.getNumUsers(), dataModel.getUserIDs()); - itemIDMapping = createIDMapping(dataModel.getNumItems(), dataModel.getItemIDs()); - } - - protected Factorization createFactorization(double[][] userFeatures, double[][] itemFeatures) { - return new Factorization(userIDMapping, itemIDMapping, userFeatures, itemFeatures); - } - - protected Integer userIndex(long userID) { - Integer userIndex = userIDMapping.get(userID); - if (userIndex == null) { - userIndex = userIDMapping.size(); - userIDMapping.put(userID, userIndex); - } - return userIndex; - } - - protected Integer itemIndex(long itemID) { - Integer itemIndex = itemIDMapping.get(itemID); - if (itemIndex == null) { - itemIndex = itemIDMapping.size(); - itemIDMapping.put(itemID, itemIndex); - } - return itemIndex; - } - - private static FastByIDMap createIDMapping(int size, LongPrimitiveIterator idIterator) { - FastByIDMap mapping = new FastByIDMap(size); - int index = 0; - while (idIterator.hasNext()) { - mapping.put(idIterator.nextLong(), index++); - } - return mapping; - } - - @Override - public void refresh(Collection alreadyRefreshed) { - refreshHelper.refresh(alreadyRefreshed); - } - -} diff --git 
a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ExpectationMaximizationSVDFactorizer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ExpectationMaximizationSVDFactorizer.java deleted file mode 100644 index ec8dc2fd3..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ExpectationMaximizationSVDFactorizer.java +++ /dev/null @@ -1,179 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender.svd; - -import java.util.Collections; -import java.util.List; -import java.util.Random; - -import com.google.common.collect.Lists; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.common.RandomUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Calculates the SVD using an Expectation Maximization algorithm. */ -public final class ExpectationMaximizationSVDFactorizer extends AbstractFactorizer { - - private static final Logger log = LoggerFactory.getLogger(ExpectationMaximizationSVDFactorizer.class); - - private final double learningRate; - /** Parameter used to prevent overfitting. 0.02 is a good value. 
*/ - private final double preventOverfitting; - /** number of features used to compute this factorization */ - private final int numFeatures; - /** number of iterations */ - private final int numIterations; - private final double randomNoise; - /** user singular vectors */ - private double[][] leftVectors; - /** item singular vectors */ - private double[][] rightVectors; - private final DataModel dataModel; - private List cachedPreferences; - private double defaultValue; - private double interval; - - public ExpectationMaximizationSVDFactorizer(DataModel dataModel, - int numFeatures, - int numIterations) throws TasteException { - // use the default parameters from the old SVDRecommender implementation - this(dataModel, numFeatures, 0.005, 0.02, 0.005, numIterations); - } - - public ExpectationMaximizationSVDFactorizer(DataModel dataModel, - int numFeatures, - double learningRate, - double preventOverfitting, - double randomNoise, - int numIterations) throws TasteException { - super(dataModel); - this.dataModel = dataModel; - this.numFeatures = numFeatures; - this.numIterations = numIterations; - - this.learningRate = learningRate; - this.preventOverfitting = preventOverfitting; - this.randomNoise = randomNoise; - - } - - @Override - public Factorization factorize() throws TasteException { - Random random = RandomUtils.getRandom(); - leftVectors = new double[dataModel.getNumUsers()][numFeatures]; - rightVectors = new double[dataModel.getNumItems()][numFeatures]; - - double average = getAveragePreference(); - - double prefInterval = dataModel.getMaxPreference() - dataModel.getMinPreference(); - defaultValue = Math.sqrt((average - prefInterval * 0.1) / numFeatures); - interval = prefInterval * 0.1 / numFeatures; - - for (int feature = 0; feature < numFeatures; feature++) { - for (int userIndex = 0; userIndex < dataModel.getNumUsers(); userIndex++) { - leftVectors[userIndex][feature] = defaultValue + (random.nextDouble() - 0.5) * interval * randomNoise; - } - for (int itemIndex = 0; itemIndex < dataModel.getNumItems(); itemIndex++) { - rightVectors[itemIndex][feature] = defaultValue + (random.nextDouble() - 0.5) * interval * randomNoise; - } - } - cachedPreferences = Lists.newArrayListWithCapacity(dataModel.getNumUsers()); - cachePreferences(); - double rmse = dataModel.getMaxPreference() - dataModel.getMinPreference(); - for (int ii = 0; ii < numFeatures; ii++) { - Collections.shuffle(cachedPreferences, random); - for (int i = 0; i < numIterations; i++) { - double err = 0.0; - for (SVDPreference pref : cachedPreferences) { - int useridx = userIndex(pref.getUserID()); - int itemidx = itemIndex(pref.getItemID()); - err += Math.pow(train(useridx, itemidx, ii, pref), 2.0); - } - rmse = Math.sqrt(err / cachedPreferences.size()); - } - if (ii < numFeatures - 1) { - for (SVDPreference pref : cachedPreferences) { - int useridx = userIndex(pref.getUserID()); - int itemidx = itemIndex(pref.getItemID()); - buildCache(useridx, itemidx, ii, pref); - } - } - log.info("Finished training feature {} with RMSE {}.", ii, rmse); - } - return createFactorization(leftVectors, rightVectors); - } - - double getAveragePreference() throws TasteException { - RunningAverage average = new FullRunningAverage(); - LongPrimitiveIterator it = dataModel.getUserIDs(); - while (it.hasNext()) { - for (Preference pref : dataModel.getPreferencesFromUser(it.nextLong())) { - average.addDatum(pref.getValue()); - } - } - return average.getAverage(); - } - - private double train(int i, int j, int f, SVDPreference pref) { - double[] 
leftVectorI = leftVectors[i]; - double[] rightVectorJ = rightVectors[j]; - double prediction = predictRating(i, j, f, pref, true); - double err = pref.getValue() - prediction; - double leftVectorIF = leftVectorI[f]; - leftVectorI[f] += learningRate * (err * rightVectorJ[f] - preventOverfitting * leftVectorI[f]); - rightVectorJ[f] += learningRate * (err * leftVectorIF - preventOverfitting * rightVectorJ[f]); - return err; - } - - private void buildCache(int i, int j, int k, SVDPreference pref) { - pref.setCache(predictRating(i, j, k, pref, false)); - } - - private double predictRating(int i, int j, int f, SVDPreference pref, boolean trailing) { - float minPreference = dataModel.getMinPreference(); - float maxPreference = dataModel.getMaxPreference(); - double sum = pref.getCache(); - sum += leftVectors[i][f] * rightVectors[j][f]; - if (trailing) { - sum += (numFeatures - f - 1) * (defaultValue + interval) * (defaultValue + interval); - if (sum > maxPreference) { - sum = maxPreference; - } else if (sum < minPreference) { - sum = minPreference; - } - } - return sum; - } - - private void cachePreferences() throws TasteException { - cachedPreferences.clear(); - LongPrimitiveIterator it = dataModel.getUserIDs(); - while (it.hasNext()) { - for (Preference pref : dataModel.getPreferencesFromUser(it.nextLong())) { - cachedPreferences.add(new SVDPreference(pref.getUserID(), pref.getItemID(), pref.getValue(), 0.0)); - } - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorization.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorization.java deleted file mode 100644 index bf13aac85..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorization.java +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.recommender.svd; - -import java.util.Arrays; -import java.util.Map; - -import com.google.common.base.Preconditions; -import org.apache.mahout.cf.taste.common.NoSuchItemException; -import org.apache.mahout.cf.taste.common.NoSuchUserException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; - -/** - * a factorization of the rating matrix - */ -public class Factorization { - - /** used to find the rows in the user features matrix by userID */ - private final FastByIDMap userIDMapping; - /** used to find the rows in the item features matrix by itemID */ - private final FastByIDMap itemIDMapping; - - /** user features matrix */ - private final double[][] userFeatures; - /** item features matrix */ - private final double[][] itemFeatures; - - public Factorization(FastByIDMap userIDMapping, FastByIDMap itemIDMapping, double[][] userFeatures, - double[][] itemFeatures) { - this.userIDMapping = Preconditions.checkNotNull(userIDMapping); - this.itemIDMapping = Preconditions.checkNotNull(itemIDMapping); - this.userFeatures = userFeatures; - this.itemFeatures = itemFeatures; - } - - public double[] getUserFeatures(long userID) throws NoSuchUserException { - Integer index = userIDMapping.get(userID); - if (index == null) { - throw new NoSuchUserException(userID); - } - return userFeatures[index]; - } - - public double[] getItemFeatures(long itemID) throws NoSuchItemException { - Integer index = itemIDMapping.get(itemID); - if (index == null) { - throw new NoSuchItemException(itemID); - } - return itemFeatures[index]; - } - - public Iterable> getUserIDMappings() { - return userIDMapping.entrySet(); - } - - public Iterable> getItemIDMappings() { - return itemIDMapping.entrySet(); - } - - public int numFeatures() { - return userFeatures[0].length; - } - - public int numUsers() { - return userIDMapping.size(); - } - - public int numItems() { - return itemIDMapping.size(); - } - - @Override - public boolean equals(Object o) { - if (o instanceof Factorization) { - Factorization other = (Factorization) o; - return userIDMapping.equals(other.userIDMapping) && itemIDMapping.equals(other.itemIDMapping) && - Arrays.deepEquals(userFeatures, other.userFeatures) && Arrays.deepEquals(itemFeatures, other.itemFeatures); - } - return false; - } - - @Override - public int hashCode() { - int hashCode = 31 * userIDMapping.hashCode() + itemIDMapping.hashCode(); - hashCode = 31 * hashCode + Arrays.deepHashCode(userFeatures); - hashCode = 31 * hashCode + Arrays.deepHashCode(itemFeatures); - return hashCode; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorizer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorizer.java deleted file mode 100644 index 2cabe7358..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorizer.java +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender.svd; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; - -/** - * Implementation must be able to create a factorization of a rating matrix - */ -public interface Factorizer extends Refreshable { - - Factorization factorize() throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/FilePersistenceStrategy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/FilePersistenceStrategy.java deleted file mode 100644 index 7e35fd1ac..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/FilePersistenceStrategy.java +++ /dev/null @@ -1,148 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender.svd; - -import com.google.common.base.Preconditions; -import com.google.common.io.Closeables; -import org.apache.mahout.cf.taste.common.NoSuchItemException; -import org.apache.mahout.cf.taste.common.NoSuchUserException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; -import java.io.DataInput; -import java.io.DataInputStream; -import java.io.DataOutput; -import java.io.DataOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.Map; - -/** Provides a file-based persistent store. */ -public class FilePersistenceStrategy implements PersistenceStrategy { - - private final File file; - - private static final Logger log = LoggerFactory.getLogger(FilePersistenceStrategy.class); - - /** - * @param file the file to use for storage. 
If the file does not exist it will be created when required.
-   */
-  public FilePersistenceStrategy(File file) {
-    this.file = Preconditions.checkNotNull(file);
-  }
-
-  @Override
-  public Factorization load() throws IOException {
-    if (!file.exists()) {
-      log.info("{} does not yet exist, no factorization found", file.getAbsolutePath());
-      return null;
-    }
-    DataInputStream in = null;
-    try {
-      log.info("Reading factorization from {}...", file.getAbsolutePath());
-      in = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
-      return readBinary(in);
-    } finally {
-      Closeables.closeQuietly(in);
-    }
-  }
-
-  @Override
-  public void maybePersist(Factorization factorization) throws IOException {
-    DataOutputStream out = null;
-    try {
-      log.info("Writing factorization to {}...", file.getAbsolutePath());
-      out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)));
-      writeBinary(factorization, out);
-    } finally {
-      Closeables.closeQuietly(out);
-    }
-  }
-
-  protected static void writeBinary(Factorization factorization, DataOutput out) throws IOException {
-    out.writeInt(factorization.numFeatures());
-    out.writeInt(factorization.numUsers());
-    out.writeInt(factorization.numItems());
-
-    for (Map.Entry<Long,Integer> mappingEntry : factorization.getUserIDMappings()) {
-      long userID = mappingEntry.getKey();
-      out.writeInt(mappingEntry.getValue());
-      out.writeLong(userID);
-      try {
-        double[] userFeatures = factorization.getUserFeatures(userID);
-        for (int feature = 0; feature < factorization.numFeatures(); feature++) {
-          out.writeDouble(userFeatures[feature]);
-        }
-      } catch (NoSuchUserException e) {
-        throw new IOException("Unable to persist factorization", e);
-      }
-    }
-
-    for (Map.Entry<Long,Integer> entry : factorization.getItemIDMappings()) {
-      long itemID = entry.getKey();
-      out.writeInt(entry.getValue());
-      out.writeLong(itemID);
-      try {
-        double[] itemFeatures = factorization.getItemFeatures(itemID);
-        for (int feature = 0; feature < factorization.numFeatures(); feature++) {
-          out.writeDouble(itemFeatures[feature]);
-        }
-      } catch (NoSuchItemException e) {
-        throw new IOException("Unable to persist factorization", e);
-      }
-    }
-  }
-
-  public static Factorization readBinary(DataInput in) throws IOException {
-    int numFeatures = in.readInt();
-    int numUsers = in.readInt();
-    int numItems = in.readInt();
-
-    FastByIDMap<Integer> userIDMapping = new FastByIDMap<Integer>(numUsers);
-    double[][] userFeatures = new double[numUsers][numFeatures];
-
-    for (int n = 0; n < numUsers; n++) {
-      int userIndex = in.readInt();
-      long userID = in.readLong();
-      userIDMapping.put(userID, userIndex);
-      for (int feature = 0; feature < numFeatures; feature++) {
-        userFeatures[userIndex][feature] = in.readDouble();
-      }
-    }
-
-    FastByIDMap<Integer> itemIDMapping = new FastByIDMap<Integer>(numItems);
-    double[][] itemFeatures = new double[numItems][numFeatures];
-
-    for (int n = 0; n < numItems; n++) {
-      int itemIndex = in.readInt();
-      long itemID = in.readLong();
-      itemIDMapping.put(itemID, itemIndex);
-      for (int feature = 0; feature < numFeatures; feature++) {
-        itemFeatures[itemIndex][feature] = in.readDouble();
-      }
-    }
-
-    return new Factorization(userIDMapping, itemIDMapping, userFeatures, itemFeatures);
-  }
-
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ImplicitLinearRegressionFactorizer.java
b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ImplicitLinearRegressionFactorizer.java deleted file mode 100644 index 6d538c410..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ImplicitLinearRegressionFactorizer.java +++ /dev/null @@ -1,391 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.cf.taste.impl.recommender.svd; - -import java.util.Collection; -import java.util.List; -import java.util.Random; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; - -import com.google.common.collect.Lists; -import org.apache.mahout.cf.taste.common.NoSuchUserException; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.DenseMatrix; -import org.apache.mahout.math.DiagonalMatrix; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.QRDecomposition; -import org.apache.mahout.math.SparseMatrix; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class ImplicitLinearRegressionFactorizer extends AbstractFactorizer { - - private static final Logger log = LoggerFactory.getLogger(ImplicitLinearRegressionFactorizer.class); - private final double preventOverfitting; - /** number of features used to compute this factorization */ - private final int numFeatures; - /** number of iterations */ - private final int numIterations; - private final DataModel dataModel; - /** User singular vector. */ - private double[][] userMatrix; - /** Item singular vector. 
*/
-  private double[][] itemMatrix;
-  private Matrix userTransUser;
-  private Matrix itemTransItem;
-  Collection<Callable<Void>> fVectorCallables;
-  private boolean recomputeUserFeatures;
-  private RunningAverage avrChange;
-
-  public ImplicitLinearRegressionFactorizer(DataModel dataModel) throws TasteException {
-    this(dataModel, 64, 10, 0.1);
-  }
-
-  public ImplicitLinearRegressionFactorizer(DataModel dataModel, int numFeatures, int numIterations,
-      double preventOverfitting) throws TasteException {
-
-    super(dataModel);
-    this.dataModel = dataModel;
-    this.numFeatures = numFeatures;
-    this.numIterations = numIterations;
-    this.preventOverfitting = preventOverfitting;
-    fVectorCallables = Lists.newArrayList();
-    avrChange = new FullRunningAverage();
-  }
-
-  @Override
-  public Factorization factorize() throws TasteException {
-    Random random = RandomUtils.getRandom();
-    userMatrix = new double[dataModel.getNumUsers()][numFeatures];
-    itemMatrix = new double[dataModel.getNumItems()][numFeatures];
-
-    /* start with the user side */
-    recomputeUserFeatures = true;
-
-    double average = getAveragePreference();
-
-    double prefInterval = dataModel.getMaxPreference() - dataModel.getMinPreference();
-    double defaultValue = Math.sqrt((average - prefInterval * 0.1) / numFeatures);
-    double interval = prefInterval * 0.1 / numFeatures;
-
-    for (int feature = 0; feature < numFeatures; feature++) {
-      for (int userIndex = 0; userIndex < dataModel.getNumUsers(); userIndex++) {
-        userMatrix[userIndex][feature] = defaultValue + (random.nextDouble() - 0.5) * interval * random.nextDouble();
-      }
-      for (int itemIndex = 0; itemIndex < dataModel.getNumItems(); itemIndex++) {
-        itemMatrix[itemIndex][feature] = defaultValue + (random.nextDouble() - 0.5) * interval * random.nextDouble();
-      }
-    }
-    train();
-    return createFactorization(userMatrix, itemMatrix);
-  }
-
-  public void train() throws TasteException {
-    for (int i = 0; i < numIterations; i++) {
-      if (recomputeUserFeatures) {
-        LongPrimitiveIterator userIds = dataModel.getUserIDs();
-        /* start with calculating Y^TY or X^TX */
-        log.info("Calculating Y^TY");
-        reCalculateTrans(recomputeUserFeatures);
-        log.info("Building callables for users.");
-        while (userIds.hasNext()) {
-          long userId = userIds.nextLong();
-          int useridx = userIndex(userId);
-          buildCallables(buildConfidenceMatrixForUser(userId), buildPreferenceVectorForUser(userId), useridx);
-        }
-        finishProcessing();
-      } else {
-        LongPrimitiveIterator itemIds = dataModel.getItemIDs();
-        /* start with calculating Y^TY or X^TX */
-        log.info("Calculating X^TX");
-        reCalculateTrans(recomputeUserFeatures);
-        log.info("Building callables for items.");
-        while (itemIds.hasNext()) {
-          long itemId = itemIds.nextLong();
-          int itemidx = itemIndex(itemId);
-          buildCallables(buildConfidenceMatrixForItem(itemId), buildPreferenceVectorForItem(itemId), itemidx);
-        }
-        finishProcessing();
-      }
-    }
-  }
-
-  public Matrix buildPreferenceVectorForUser(long realId) throws TasteException {
-    Matrix ids = new SparseMatrix(1, dataModel.getNumItems());
-    for (Preference pref : dataModel.getPreferencesFromUser(realId)) {
-      int itemidx = itemIndex(pref.getItemID());
-      ids.setQuick(0, itemidx, pref.getValue());
-    }
-    return ids;
-  }
-
-  private Matrix buildConfidenceMatrixForItem(long itemId) throws TasteException {
-    PreferenceArray prefs = dataModel.getPreferencesForItem(itemId);
-    Matrix confidenceMatrix = new SparseMatrix(dataModel.getNumUsers(), dataModel.getNumUsers());
-    for (Preference pref : prefs) {
-      long userId = pref.getUserID();
-      int
userIdx = userIndex(userId);
-      confidenceMatrix.setQuick(userIdx, userIdx, 1);
-    }
-    return new DiagonalMatrix(confidenceMatrix);
-  }
-
-  private Matrix buildConfidenceMatrixForUser(long userId) throws TasteException {
-    PreferenceArray prefs = dataModel.getPreferencesFromUser(userId);
-    Matrix confidenceMatrix = new SparseMatrix(dataModel.getNumItems(), dataModel.getNumItems());
-    for (Preference pref : prefs) {
-      long itemId = pref.getItemID();
-      int itemIdx = itemIndex(itemId);
-      confidenceMatrix.setQuick(itemIdx, itemIdx, 1);
-    }
-    return new DiagonalMatrix(confidenceMatrix);
-  }
-
-  private Matrix buildPreferenceVectorForItem(long realId) throws TasteException {
-    Matrix ids = new SparseMatrix(1, dataModel.getNumUsers());
-    for (Preference pref : dataModel.getPreferencesForItem(realId)) {
-      int useridx = userIndex(pref.getUserID());
-      ids.setQuick(0, useridx, pref.getValue());
-    }
-    return ids;
-  }
-
-  private Matrix ones(int size) {
-    double[] vector = new double[size];
-    for (int i = 0; i < size; i++) {
-      vector[i] = 1;
-    }
-    Matrix ones = new DiagonalMatrix(vector);
-    return ones;
-  }
-
-  private double getAveragePreference() throws TasteException {
-    RunningAverage average = new FullRunningAverage();
-    LongPrimitiveIterator it = dataModel.getUserIDs();
-    while (it.hasNext()) {
-      int count = 0;
-      PreferenceArray prefs;
-      try {
-        prefs = dataModel.getPreferencesFromUser(it.nextLong());
-        for (Preference pref : prefs) {
-          average.addDatum(pref.getValue());
-          count++;
-        }
-      } catch (NoSuchUserException ex) {
-        continue;
-      }
-      /* add the remaining zeros */
-      for (int i = 0; i < (dataModel.getNumItems() - count); i++) {
-        average.addDatum(0);
-      }
-    }
-    return average.getAverage();
-  }
-
-  /**
-   * Recalculates Y^TY or X^TX, which is needed for further calculations
-   * @param recomputeUserFeatures
-   */
-  public void reCalculateTrans(boolean recomputeUserFeatures) {
-    if (!recomputeUserFeatures) {
-      Matrix uMatrix = new DenseMatrix(userMatrix);
-      userTransUser = uMatrix.transpose().times(uMatrix);
-    } else {
-      Matrix iMatrix = new DenseMatrix(itemMatrix);
-      itemTransItem = iMatrix.transpose().times(iMatrix);
-    }
-  }
-
-  private synchronized void updateMatrix(int id, Matrix m) {
-    double normA = 0;
-    double normB = 0;
-    double aTb = 0;
-    for (int feature = 0; feature < numFeatures; feature++) {
-      if (recomputeUserFeatures) {
-        normA += userMatrix[id][feature] * userMatrix[id][feature];
-        normB += m.get(feature, 0) * m.get(feature, 0);
-        aTb += userMatrix[id][feature] * m.get(feature, 0);
-        userMatrix[id][feature] = m.get(feature, 0);
-      } else {
-        normA += itemMatrix[id][feature] * itemMatrix[id][feature];
-        normB += m.get(feature, 0) * m.get(feature, 0);
-        aTb += itemMatrix[id][feature] * m.get(feature, 0);
-        itemMatrix[id][feature] = m.get(feature, 0);
-      }
-    }
-    /* calculating cosine similarity to determine when to stop the algorithm; this could be used to detect convergence */
-    double cosine = (aTb) / (Math.sqrt(normA) * Math.sqrt(normB));
-    if (Double.isNaN(cosine)) {
-      log.info("Cosine similarity is NaN, recomputeUserFeatures={} id={}", recomputeUserFeatures, id);
-    } else {
-      avrChange.addDatum(cosine);
-    }
-  }
-
-  public void resetCallables() {
-    fVectorCallables = Lists.newArrayList();
-  }
-
-  private void resetAvrChange() {
-    log.info("Average change: {}", avrChange.getAverage());
-    avrChange = new FullRunningAverage();
-  }
-
-  public void buildCallables(Matrix C, Matrix prefVector, int id) throws TasteException {
-    fVectorCallables.add(new FeatureVectorCallable(C, prefVector, id));
-    if (fVectorCallables.size() % (200 * Runtime.getRuntime().availableProcessors()) == 0) {
-      execute(fVectorCallables);
-      resetCallables();
-    }
-  }
-
-  public void finishProcessing() throws TasteException {
-    /* run the remaining part */
-    if (fVectorCallables != null) {
-      execute(fVectorCallables);
-    }
-    resetCallables();
-    if ((recomputeUserFeatures && avrChange.getCount() != userMatrix.length)
-        || (!recomputeUserFeatures && avrChange.getCount() != itemMatrix.length)) {
-      log.info("Matrix length is not equal to count");
-    }
-    resetAvrChange();
-    recomputeUserFeatures = !recomputeUserFeatures;
-  }
-
-  public Matrix identityV(int size) {
-    return ones(size);
-  }
-
-  void execute(Collection<Callable<Void>> callables) throws TasteException {
-    callables = wrapWithStatsCallables(callables);
-    int numProcessors = Runtime.getRuntime().availableProcessors();
-    ExecutorService executor = Executors.newFixedThreadPool(numProcessors);
-    log.info("Starting timing of {} tasks in {} threads", callables.size(), numProcessors);
-    try {
-      List<Future<Void>> futures = executor.invokeAll(callables);
-      //TODO go look for exceptions here, really
-      for (Future<Void> future : futures) {
-        future.get();
-      }
-    } catch (InterruptedException ie) {
-      log.warn("error in factorization", ie);
-    } catch (ExecutionException ee) {
-      log.warn("error in factorization", ee);
-    }
-    executor.shutdown();
-  }
-
-  private Collection<Callable<Void>> wrapWithStatsCallables(Collection<Callable<Void>> callables) {
-    int size = callables.size();
-    Collection<Callable<Void>> wrapped = Lists.newArrayListWithExpectedSize(size);
-    int count = 1;
-    RunningAverageAndStdDev timing = new FullRunningAverageAndStdDev();
-    for (Callable<Void> callable : callables) {
-      boolean logStats = count++ % 1000 == 0;
-      wrapped.add(new StatsCallable(callable, logStats, timing));
-    }
-    return wrapped;
-  }
-
-  private class FeatureVectorCallable implements Callable<Void> {
-
-    private final Matrix C;
-    private final Matrix prefVector;
-    private final int id;
-
-    private FeatureVectorCallable(Matrix C, Matrix prefVector, int id) {
-      this.C = C;
-      this.prefVector = prefVector;
-      this.id = id;
-    }
-
-    @Override
-    public Void call() throws Exception {
-      Matrix XTCX;
-      if (recomputeUserFeatures) {
-        Matrix I = identityV(dataModel.getNumItems());
-        Matrix I2 = identityV(numFeatures);
-        Matrix iTi = itemTransItem.clone();
-        Matrix itemM = new DenseMatrix(itemMatrix);
-        XTCX = iTi.plus(itemM.transpose().times(C.minus(I)).times(itemM));
-
-        Matrix diag = solve(XTCX.plus(I2.times(preventOverfitting)), I2);
-        Matrix results = diag.times(itemM.transpose().times(C)).times(prefVector.transpose());
-        updateMatrix(id, results);
-      } else {
-        Matrix I = identityV(dataModel.getNumUsers());
-        Matrix I2 = identityV(numFeatures);
-        Matrix uTu = userTransUser.clone();
-        Matrix userM = new DenseMatrix(userMatrix);
-        XTCX = uTu.plus(userM.transpose().times(C.minus(I)).times(userM));
-
-        Matrix diag = solve(XTCX.plus(I2.times(preventOverfitting)), I2);
-        Matrix results = diag.times(userM.transpose().times(C)).times(prefVector.transpose());
-        updateMatrix(id, results);
-      }
-      return null;
-    }
-  }
-
-  private Matrix solve(Matrix A, Matrix y) {
-    return new QRDecomposition(A).solve(y);
-  }
-
-  private static class StatsCallable implements Callable<Void> {
-
-    private final Callable<Void> delegate;
-    private final boolean logStats;
-    private final RunningAverageAndStdDev timing;
-
-    private StatsCallable(Callable<Void> delegate, boolean logStats, RunningAverageAndStdDev timing) {
-      this.delegate = delegate;
-      this.logStats = logStats;
-      this.timing = timing;
-    }
-
-    @Override
-    public Void
call() throws Exception { - long start = System.currentTimeMillis(); - delegate.call(); - long end = System.currentTimeMillis(); - timing.addDatum(end - start); - if (logStats) { - Runtime runtime = Runtime.getRuntime(); - int average = (int) timing.getAverage(); - log.info("Average time per task: {}ms", average); - long totalMemory = runtime.totalMemory(); - long memory = totalMemory - runtime.freeMemory(); - log.info("Approximate memory used: {}MB / {}MB", memory / 1000000L, totalMemory / 1000000L); - } - return null; - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/NoPersistenceStrategy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/NoPersistenceStrategy.java deleted file mode 100644 index 0d1aab072..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/NoPersistenceStrategy.java +++ /dev/null @@ -1,37 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender.svd; - -import java.io.IOException; - -/** - * A {@link PersistenceStrategy} which does nothing. - */ -public class NoPersistenceStrategy implements PersistenceStrategy { - - @Override - public Factorization load() throws IOException { - return null; - } - - @Override - public void maybePersist(Factorization factorization) throws IOException { - // do nothing. - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/PersistenceStrategy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/PersistenceStrategy.java deleted file mode 100644 index abf3eca2c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/PersistenceStrategy.java +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
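Stepping back to the ImplicitLinearRegressionFactorizer that ends just above: each FeatureVectorCallable solves the regularized normal equation (Y^TY + Y^T(C - I)Y + lambda*I) x = Y^T C p^T for one user (and symmetrically with X for one item). The class inverts the left-hand side by solving against an identity right-hand side; the sketch below solves for x directly, which is equivalent. It uses only the Mahout math calls the class itself uses; the sizes and values (3 items, 2 features) are invented for illustration:

  Matrix Y = new DenseMatrix(new double[][] { {0.1, 0.2}, {0.3, 0.1}, {0.2, 0.4} }); // item features
  Matrix C = new DiagonalMatrix(new double[] {1, 1, 1});    // one user's confidence diagonal
  Matrix p = new DenseMatrix(new double[][] { {1, 0, 1} }); // that user's preference row vector
  Matrix I = new DiagonalMatrix(new double[] {1, 1, 1});    // numItems x numItems identity
  Matrix I2 = new DiagonalMatrix(new double[] {1, 1});      // numFeatures x numFeatures identity
  double lambda = 0.1;                                      // plays the role of preventOverfitting

  Matrix A = Y.transpose().times(Y)
      .plus(Y.transpose().times(C.minus(I)).times(Y))
      .plus(I2.times(lambda));
  Matrix b = Y.transpose().times(C).times(p.transpose());
  Matrix x = new QRDecomposition(A).solve(b);               // the user's updated feature vector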
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.recommender.svd; - -import java.io.IOException; - -/** - * Provides storage for {@link Factorization}s - */ -public interface PersistenceStrategy { - - /** - * Load a factorization from a persistent store. - * - * @return a Factorization or null if the persistent store is empty. - * - * @throws IOException - */ - Factorization load() throws IOException; - - /** - * Write a factorization to a persistent store unless it already - * contains an identical factorization. - * - * @param factorization - * - * @throws IOException - */ - void maybePersist(Factorization factorization) throws IOException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPreference.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPreference.java deleted file mode 100644 index 45c54dabf..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPreference.java +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
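Before the next file: the PersistenceStrategy contract above is easiest to read as a call pattern. A sketch, under the assumption that a Factorizer named factorizer is in scope; the file name is made up:

  PersistenceStrategy strategy = new FilePersistenceStrategy(new File("/tmp/factorization.bin"));
  Factorization factorization = strategy.load();  // null while the store is still empty
  if (factorization == null) {
    factorization = factorizer.factorize();       // the expensive step
    strategy.maybePersist(factorization);         // lets the next startup skip it
  }

The NoPersistenceStrategy shown earlier makes load() always return null, so a fresh factorization is computed every time.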
- */ - -package org.apache.mahout.cf.taste.impl.recommender.svd; - -import com.google.common.base.Preconditions; -import org.apache.mahout.cf.taste.impl.model.GenericPreference; - -final class SVDPreference extends GenericPreference { - - private double cache; - - SVDPreference(long userID, long itemID, float value, double cache) { - super(userID, itemID, value); - setCache(cache); - } - - public double getCache() { - return cache; - } - - public void setCache(double value) { - Preconditions.checkArgument(!Double.isNaN(value), "NaN cache value"); - this.cache = value; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDRecommender.java deleted file mode 100644 index 0d20e587b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDRecommender.java +++ /dev/null @@ -1,182 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-package org.apache.mahout.cf.taste.impl.recommender.svd;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.List;
-import java.util.concurrent.Callable;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
-import org.apache.mahout.cf.taste.impl.recommender.AbstractRecommender;
-import org.apache.mahout.cf.taste.impl.recommender.TopItems;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy;
-import org.apache.mahout.cf.taste.recommender.IDRescorer;
-import org.apache.mahout.cf.taste.recommender.RecommendedItem;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * A {@link org.apache.mahout.cf.taste.recommender.Recommender} that uses matrix factorization (a projection of users
- * and items onto a feature space)
- */
-public final class SVDRecommender extends AbstractRecommender {
-
-  private Factorization factorization;
-  private final Factorizer factorizer;
-  private final PersistenceStrategy persistenceStrategy;
-  private final RefreshHelper refreshHelper;
-
-  private static final Logger log = LoggerFactory.getLogger(SVDRecommender.class);
-
-  public SVDRecommender(DataModel dataModel, Factorizer factorizer) throws TasteException {
-    this(dataModel, factorizer, getDefaultCandidateItemsStrategy(), getDefaultPersistenceStrategy());
-  }
-
-  public SVDRecommender(DataModel dataModel, Factorizer factorizer, CandidateItemsStrategy candidateItemsStrategy)
-      throws TasteException {
-    this(dataModel, factorizer, candidateItemsStrategy, getDefaultPersistenceStrategy());
-  }
-
-  /**
-   * Create an SVDRecommender using a persistent store to cache factorizations. A factorization is loaded from the
-   * store if present, otherwise a new factorization is computed and saved in the store.
-   *
-   * The {@link #refresh(java.util.Collection) refresh} method recomputes the factorization and overwrites the store.
-   *
-   * @param dataModel
-   * @param factorizer
-   * @param persistenceStrategy
-   * @throws TasteException
-   */
-  public SVDRecommender(DataModel dataModel, Factorizer factorizer, PersistenceStrategy persistenceStrategy)
-      throws TasteException {
-    this(dataModel, factorizer, getDefaultCandidateItemsStrategy(), persistenceStrategy);
-  }
-
-  /**
-   * Create an SVDRecommender using a persistent store to cache factorizations. A factorization is loaded from the
-   * store if present, otherwise a new factorization is computed and saved in the store.
-   *
-   * The {@link #refresh(java.util.Collection) refresh} method recomputes the factorization and overwrites the store.
-   *
-   * @param dataModel
-   * @param factorizer
-   * @param candidateItemsStrategy
-   * @param persistenceStrategy
-   *
-   * @throws TasteException
-   */
-  public SVDRecommender(DataModel dataModel, Factorizer factorizer, CandidateItemsStrategy candidateItemsStrategy,
-      PersistenceStrategy persistenceStrategy) throws TasteException {
-    super(dataModel, candidateItemsStrategy);
-    this.factorizer = Preconditions.checkNotNull(factorizer);
-    this.persistenceStrategy = Preconditions.checkNotNull(persistenceStrategy);
-    try {
-      factorization = persistenceStrategy.load();
-    } catch (IOException e) {
-      throw new TasteException("Error loading factorization", e);
-    }
-
-    if (factorization == null) {
-      train();
-    }
-
-    refreshHelper = new RefreshHelper(new Callable<Object>() {
-      @Override
-      public Object call() throws TasteException {
-        train();
-        return null;
-      }
-    });
-    refreshHelper.addDependency(getDataModel());
-    refreshHelper.addDependency(factorizer);
-  }
-
-  static PersistenceStrategy getDefaultPersistenceStrategy() {
-    return new NoPersistenceStrategy();
-  }
-
-  private void train() throws TasteException {
-    factorization = factorizer.factorize();
-    try {
-      persistenceStrategy.maybePersist(factorization);
-    } catch (IOException e) {
-      throw new TasteException("Error persisting factorization", e);
-    }
-  }
-
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
-    Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
-    log.debug("Recommending items for user ID '{}'", userID);
-
-    PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID);
-    FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser);
-
-    List<RecommendedItem> topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer,
-        new Estimator(userID));
-    log.debug("Recommendations are: {}", topItems);
-
-    return topItems;
-  }
-
-  /**
-   * A preference is estimated by computing the dot product of the user and item feature vectors
-   */
-  @Override
-  public float estimatePreference(long userID, long itemID) throws TasteException {
-    double[] userFeatures = factorization.getUserFeatures(userID);
-    double[] itemFeatures = factorization.getItemFeatures(itemID);
-    double estimate = 0;
-    for (int feature = 0; feature < userFeatures.length; feature++) {
-      estimate += userFeatures[feature] * itemFeatures[feature];
-    }
-    return (float) estimate;
-  }
-
-  private final class Estimator implements TopItems.Estimator<Long> {
-
-    private final long theUserID;
-
-    private Estimator(long theUserID) {
-      this.theUserID = theUserID;
-    }
-
-    @Override
-    public double estimate(Long itemID) throws TasteException {
-      return estimatePreference(theUserID, itemID);
-    }
-  }
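As an aside before the class closes: wiring SVDRecommender together typically looks like the sketch below. FileDataModel, the file paths, and the factorizer settings are illustrative assumptions, not something this diff prescribes (exception handling omitted):

  DataModel model = new FileDataModel(new File("ratings.csv"));
  Factorizer factorizer = new ImplicitLinearRegressionFactorizer(model, 20, 5, 0.1);
  SVDRecommender recommender = new SVDRecommender(model, factorizer,
      new FilePersistenceStrategy(new File("factorization.bin")));
  List<RecommendedItem> top = recommender.recommend(42L, 10);  // ten items for user 42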
-  /**
-   * Refresh the data model and factorization.
-   */
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    refreshHelper.refresh(alreadyRefreshed);
-  }
-
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractItemSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractItemSimilarity.java
deleted file mode 100644
index e0d6f599f..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractItemSimilarity.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-
-import java.util.Collection;
-
-public abstract class AbstractItemSimilarity implements ItemSimilarity {
-
-  private final DataModel dataModel;
-  private final RefreshHelper refreshHelper;
-
-  protected AbstractItemSimilarity(DataModel dataModel) {
-    Preconditions.checkArgument(dataModel != null, "dataModel is null");
-    this.dataModel = dataModel;
-    this.refreshHelper = new RefreshHelper(null);
-    refreshHelper.addDependency(this.dataModel);
-  }
-
-  protected DataModel getDataModel() {
-    return dataModel;
-  }
-
-  @Override
-  public long[] allSimilarItemIDs(long itemID) throws TasteException {
-    FastIDSet allSimilarItemIDs = new FastIDSet();
-    LongPrimitiveIterator allItemIDs = dataModel.getItemIDs();
-    while (allItemIDs.hasNext()) {
-      long possiblySimilarItemID = allItemIDs.nextLong();
-      if (!Double.isNaN(itemSimilarity(itemID, possiblySimilarItemID))) {
-        allSimilarItemIDs.add(possiblySimilarItemID);
-      }
-    }
-    return allSimilarItemIDs.toArray();
-  }
-
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    refreshHelper.refresh(alreadyRefreshed);
-  }
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java
b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java deleted file mode 100644 index f6790b313..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java +++ /dev/null @@ -1,384 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.similarity; - -import java.util.Collection; -import java.util.concurrent.Callable; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.common.Weighting; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; -import org.apache.mahout.cf.taste.similarity.UserSimilarity; -import org.apache.mahout.cf.taste.transforms.PreferenceTransform; -import org.apache.mahout.cf.taste.transforms.SimilarityTransform; - -import com.google.common.base.Preconditions; - -/** Abstract superclass encapsulating functionality that is common to most implementations in this package. */ -abstract class AbstractSimilarity extends AbstractItemSimilarity implements UserSimilarity { - - private PreferenceInferrer inferrer; - private PreferenceTransform prefTransform; - private SimilarityTransform similarityTransform; - private final boolean weighted; - private final boolean centerData; - private int cachedNumItems; - private int cachedNumUsers; - private final RefreshHelper refreshHelper; - - /** - *
<p>
- * Creates a possibly weighted AbstractSimilarity. - *
</p>
- */
-  AbstractSimilarity(final DataModel dataModel, Weighting weighting, boolean centerData) throws TasteException {
-    super(dataModel);
-    this.weighted = weighting == Weighting.WEIGHTED;
-    this.centerData = centerData;
-    this.cachedNumItems = dataModel.getNumItems();
-    this.cachedNumUsers = dataModel.getNumUsers();
-    this.refreshHelper = new RefreshHelper(new Callable<Object>() {
-      @Override
-      public Object call() throws TasteException {
-        cachedNumItems = dataModel.getNumItems();
-        cachedNumUsers = dataModel.getNumUsers();
-        return null;
-      }
-    });
-  }
-
-  final PreferenceInferrer getPreferenceInferrer() {
-    return inferrer;
-  }
-
-  @Override
-  public final void setPreferenceInferrer(PreferenceInferrer inferrer) {
-    Preconditions.checkArgument(inferrer != null, "inferrer is null");
-    refreshHelper.addDependency(inferrer);
-    refreshHelper.removeDependency(this.inferrer);
-    this.inferrer = inferrer;
-  }
-
-  public final PreferenceTransform getPrefTransform() {
-    return prefTransform;
-  }
-
-  public final void setPrefTransform(PreferenceTransform prefTransform) {
-    refreshHelper.addDependency(prefTransform);
-    refreshHelper.removeDependency(this.prefTransform);
-    this.prefTransform = prefTransform;
-  }
-
-  public final SimilarityTransform getSimilarityTransform() {
-    return similarityTransform;
-  }
-
-  public final void setSimilarityTransform(SimilarityTransform similarityTransform) {
-    refreshHelper.addDependency(similarityTransform);
-    refreshHelper.removeDependency(this.similarityTransform);
-    this.similarityTransform = similarityTransform;
-  }
-
-  final boolean isWeighted() {
-    return weighted;
-  }
-
-  /**
-   *
<p>
- * Several subclasses in this package implement this method to actually compute the similarity from figures - * computed over users or items. Note that the computations in this class "center" the data, such that X and - * Y's mean are 0. - *
</p>
- * - *
<p>
- * Note that the sum of all X and Y values must then be 0. This value isn't passed down into the standard - * similarity computations as a result. - *
</p>
-   *
-   * @param n
-   *          total number of users or items
-   * @param sumXY
-   *          sum of product of user/item preference values, over all items/users preferred by both
-   *          users/items
-   * @param sumX2
-   *          sum of the square of user/item preference values, over the first item/user
-   * @param sumY2
-   *          sum of the square of the user/item preference values, over the second item/user
-   * @param sumXYdiff2
-   *          sum of squares of differences in X and Y values
-   * @return similarity value between -1.0 and 1.0, inclusive, or {@link Double#NaN} if no similarity can be
-   *         computed (e.g. when no items have been rated by both users)
-   */
-  abstract double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2);
-
-  @Override
-  public double userSimilarity(long userID1, long userID2) throws TasteException {
-    DataModel dataModel = getDataModel();
-    PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1);
-    PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2);
-    int xLength = xPrefs.length();
-    int yLength = yPrefs.length();
-
-    if (xLength == 0 || yLength == 0) {
-      return Double.NaN;
-    }
-
-    long xIndex = xPrefs.getItemID(0);
-    long yIndex = yPrefs.getItemID(0);
-    int xPrefIndex = 0;
-    int yPrefIndex = 0;
-
-    double sumX = 0.0;
-    double sumX2 = 0.0;
-    double sumY = 0.0;
-    double sumY2 = 0.0;
-    double sumXY = 0.0;
-    double sumXYdiff2 = 0.0;
-    int count = 0;
-
-    boolean hasInferrer = inferrer != null;
-    boolean hasPrefTransform = prefTransform != null;
-
-    while (true) {
-      int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
-      if (hasInferrer || compare == 0) {
-        double x;
-        double y;
-        if (xIndex == yIndex) {
-          // Both users expressed a preference for the item
-          if (hasPrefTransform) {
-            x = prefTransform.getTransformedValue(xPrefs.get(xPrefIndex));
-            y = prefTransform.getTransformedValue(yPrefs.get(yPrefIndex));
-          } else {
-            x = xPrefs.getValue(xPrefIndex);
-            y = yPrefs.getValue(yPrefIndex);
-          }
-        } else {
-          // Only one user expressed a preference, but infer the other one's preference and tally
-          // as if the other user expressed that preference
-          if (compare < 0) {
-            // X has a value; infer Y's
-            x = hasPrefTransform
-                ? prefTransform.getTransformedValue(xPrefs.get(xPrefIndex))
-                : xPrefs.getValue(xPrefIndex);
-            y = inferrer.inferPreference(userID2, xIndex);
-          } else {
-            // compare > 0
-            // Y has a value; infer X's
-            x = inferrer.inferPreference(userID1, yIndex);
-            y = hasPrefTransform
-                ? prefTransform.getTransformedValue(yPrefs.get(yPrefIndex))
-                : yPrefs.getValue(yPrefIndex);
-          }
-        }
-        sumXY += x * y;
-        sumX += x;
-        sumX2 += x * x;
-        sumY += y;
-        sumY2 += y * y;
-        double diff = x - y;
-        sumXYdiff2 += diff * diff;
-        count++;
-      }
-      if (compare <= 0) {
-        if (++xPrefIndex >= xLength) {
-          if (hasInferrer) {
-            // Must count other Ys; pretend next X is far away
-            if (yIndex == Long.MAX_VALUE) {
-              // ... but stop if both are done!
-              break;
-            }
-            xIndex = Long.MAX_VALUE;
-          } else {
-            break;
-          }
-        } else {
-          xIndex = xPrefs.getItemID(xPrefIndex);
-        }
-      }
-      if (compare >= 0) {
-        if (++yPrefIndex >= yLength) {
-          if (hasInferrer) {
-            // Must count other Xs; pretend next Y is far away
-            if (xIndex == Long.MAX_VALUE) {
-              // ... but stop if both are done!
-              break;
-            }
-            yIndex = Long.MAX_VALUE;
-          } else {
-            break;
-          }
-        } else {
-          yIndex = yPrefs.getItemID(yPrefIndex);
-        }
-      }
-    }
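An editorial note on the centering block just below: the commented-out lines there are the full textbook expansions, and the shorter forms used are equal to them because the cross terms cancel. With \bar{x} = \frac{1}{n}\sum_i x_i and \bar{y} = \frac{1}{n}\sum_i y_i,

  \sum_i (x_i - \bar{x})(y_i - \bar{y})
    = \sum_i x_i y_i - \bar{y}\sum_i x_i - \bar{x}\sum_i y_i + n\,\bar{x}\bar{y}
    = \sum_i x_i y_i - \bar{y}\sum_i x_i

since \bar{x}\sum_i y_i = n\bar{x}\bar{y}. The same cancellation reduces the centered sums of squares to sumX2 - meanX * sumX and sumY2 - meanY * sumY.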
-    // "Center" the data. If my math is correct, this'll do it.
-    double result;
-    if (centerData) {
-      double meanX = sumX / count;
-      double meanY = sumY / count;
-      // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
-      double centeredSumXY = sumXY - meanY * sumX;
-      // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
-      double centeredSumX2 = sumX2 - meanX * sumX;
-      // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
-      double centeredSumY2 = sumY2 - meanY * sumY;
-      result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
-    } else {
-      result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
-    }
-
-    if (similarityTransform != null) {
-      result = similarityTransform.transformSimilarity(userID1, userID2, result);
-    }
-
-    if (!Double.isNaN(result)) {
-      result = normalizeWeightResult(result, count, cachedNumItems);
-    }
-    return result;
-  }
-
-  @Override
-  public final double itemSimilarity(long itemID1, long itemID2) throws TasteException {
-    DataModel dataModel = getDataModel();
-    PreferenceArray xPrefs = dataModel.getPreferencesForItem(itemID1);
-    PreferenceArray yPrefs = dataModel.getPreferencesForItem(itemID2);
-    int xLength = xPrefs.length();
-    int yLength = yPrefs.length();
-
-    if (xLength == 0 || yLength == 0) {
-      return Double.NaN;
-    }
-
-    long xIndex = xPrefs.getUserID(0);
-    long yIndex = yPrefs.getUserID(0);
-    int xPrefIndex = 0;
-    int yPrefIndex = 0;
-
-    double sumX = 0.0;
-    double sumX2 = 0.0;
-    double sumY = 0.0;
-    double sumY2 = 0.0;
-    double sumXY = 0.0;
-    double sumXYdiff2 = 0.0;
-    int count = 0;
-
-    // No, pref inferrers and transforms don't apply here. I think.
-
-    while (true) {
-      int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
-      if (compare == 0) {
-        // Both users expressed a preference for the item
-        double x = xPrefs.getValue(xPrefIndex);
-        double y = yPrefs.getValue(yPrefIndex);
-        sumXY += x * y;
-        sumX += x;
-        sumX2 += x * x;
-        sumY += y;
-        sumY2 += y * y;
-        double diff = x - y;
-        sumXYdiff2 += diff * diff;
-        count++;
-      }
-      if (compare <= 0) {
-        if (++xPrefIndex == xLength) {
-          break;
-        }
-        xIndex = xPrefs.getUserID(xPrefIndex);
-      }
-      if (compare >= 0) {
-        if (++yPrefIndex == yLength) {
-          break;
-        }
-        yIndex = yPrefs.getUserID(yPrefIndex);
-      }
-    }
-
-    double result;
-    if (centerData) {
-      // See comments above on these computations
-      double n = (double) count;
-      double meanX = sumX / n;
-      double meanY = sumY / n;
-      // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
-      double centeredSumXY = sumXY - meanY * sumX;
-      // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
-      double centeredSumX2 = sumX2 - meanX * sumX;
-      // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
-      double centeredSumY2 = sumY2 - meanY * sumY;
-      result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
-    } else {
-      result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
-    }
-
-    if (similarityTransform != null) {
-      result = similarityTransform.transformSimilarity(itemID1, itemID2, result);
-    }
-
-    if (!Double.isNaN(result)) {
-      result = normalizeWeightResult(result, count, cachedNumUsers);
-    }
-    return result;
-  }
-
-  @Override
-  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
-    int length = itemID2s.length;
-    double[] result = new double[length];
-    for (int i = 0; i < length; i++) {
-      result[i] = itemSimilarity(itemID1, itemID2s[i]);
-    }
-    return result;
-  }
-
-  final double
normalizeWeightResult(double result, int count, int num) {
-    if (weighted) {
-      double scaleFactor = 1.0 - (double) count / (double) (num + 1);
-      if (result < 0.0) {
-        result = -1.0 + scaleFactor * (1.0 + result);
-      } else {
-        result = 1.0 - scaleFactor * (1.0 - result);
-      }
-    }
-    // Make sure the result is not accidentally a little outside [-1.0, 1.0] due to rounding:
-    if (result < -1.0) {
-      result = -1.0;
-    } else if (result > 1.0) {
-      result = 1.0;
-    }
-    return result;
-  }
-
-  @Override
-  public final void refresh(Collection<Refreshable> alreadyRefreshed) {
-    super.refresh(alreadyRefreshed);
-    refreshHelper.refresh(alreadyRefreshed);
-  }
-
-  @Override
-  public final String toString() {
-    return this.getClass().getSimpleName() + "[dataModel:" + getDataModel() + ",inferrer:" + inferrer + ']';
-  }
-
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java
deleted file mode 100644
index fa3a25932..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity;
-
-import java.util.Collection;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.Cache;
-import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
-import org.apache.mahout.cf.taste.impl.common.Retriever;
-import org.apache.mahout.cf.taste.impl.common.RunningAverage;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
-
-/**
- *
<p>
- * Implementations of this interface compute an inferred preference for a user and an item that the user has
- * not expressed any preference for. This might be an average of other preference scores from that user, for
- * example. This technique is sometimes called "default voting".
- *
</p>
- */
-public final class AveragingPreferenceInferrer implements PreferenceInferrer {
-
-  private static final Float ZERO = 0.0f;
-
-  private final DataModel dataModel;
-  private final Cache<Long,Float> averagePreferenceValue;
-
-  public AveragingPreferenceInferrer(DataModel dataModel) throws TasteException {
-    this.dataModel = dataModel;
-    Retriever<Long,Float> retriever = new PrefRetriever();
-    averagePreferenceValue = new Cache<Long,Float>(retriever, dataModel.getNumUsers());
-    refresh(null);
-  }
-
-  @Override
-  public float inferPreference(long userID, long itemID) throws TasteException {
-    return averagePreferenceValue.get(userID);
-  }
-
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    averagePreferenceValue.clear();
-  }
-
-  private final class PrefRetriever implements Retriever<Long,Float> {
-
-    @Override
-    public Float get(Long key) throws TasteException {
-      RunningAverage average = new FullRunningAverage();
-      PreferenceArray prefs = dataModel.getPreferencesFromUser(key);
-      int size = prefs.length();
-      if (size == 0) {
-        return ZERO;
-      }
-      for (int i = 0; i < size; i++) {
-        average.addDatum(prefs.getValue(i));
-      }
-      return (float) average.getAverage();
-    }
-  }
-
-  @Override
-  public String toString() {
-    return "AveragingPreferenceInferrer";
-  }
-
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java
deleted file mode 100644
index 2fbc13de6..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java
+++ /dev/null
@@ -1,111 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity;
-
-import java.util.Collection;
-import java.util.concurrent.Callable;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.Cache;
-import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
-import org.apache.mahout.cf.taste.impl.common.Retriever;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-import org.apache.mahout.common.LongPair;
-import com.google.common.base.Preconditions;
-
-/**
- * Caches the results from an underlying {@link ItemSimilarity} implementation.
- */
-public final class CachingItemSimilarity implements ItemSimilarity {
-
-  private final ItemSimilarity similarity;
-  private final Cache<LongPair,Double> similarityCache;
-  private final RefreshHelper refreshHelper;
-
-  /**
-   * Creates this on top of the given {@link ItemSimilarity}.
-   * The cache is sized according to properties of the given {@link DataModel}.
-   */
-  public CachingItemSimilarity(ItemSimilarity similarity, DataModel dataModel) throws TasteException {
-    this(similarity, dataModel.getNumItems());
-  }
-
-  /**
-   * Creates this on top of the given {@link ItemSimilarity}.
-   * The cache size is capped by the given size.
-   */
-  public CachingItemSimilarity(ItemSimilarity similarity, int maxCacheSize) {
-    Preconditions.checkArgument(similarity != null, "similarity is null");
-    this.similarity = similarity;
-    this.similarityCache = new Cache<LongPair,Double>(new SimilarityRetriever(similarity), maxCacheSize);
-    this.refreshHelper = new RefreshHelper(new Callable<Void>() {
-      @Override
-      public Void call() {
-        similarityCache.clear();
-        return null;
-      }
-    });
-    refreshHelper.addDependency(similarity);
-  }
-
-  @Override
-  public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
-    LongPair key = itemID1 < itemID2 ? new LongPair(itemID1, itemID2) : new LongPair(itemID2, itemID1);
-    return similarityCache.get(key);
-  }
-
-  @Override
-  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
-    int length = itemID2s.length;
-    double[] result = new double[length];
-    for (int i = 0; i < length; i++) {
-      result[i] = itemSimilarity(itemID1, itemID2s[i]);
-    }
-    return result;
-  }
-
-  @Override
-  public long[] allSimilarItemIDs(long itemID) throws TasteException {
-    return similarity.allSimilarItemIDs(itemID);
-  }
-
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    refreshHelper.refresh(alreadyRefreshed);
-  }
-
-  public void clearCacheForItem(long itemID) {
-    similarityCache.removeKeysMatching(new LongPairMatchPredicate(itemID));
-  }
-
-  private static final class SimilarityRetriever implements Retriever<LongPair,Double> {
-    private final ItemSimilarity similarity;
-
-    private SimilarityRetriever(ItemSimilarity similarity) {
-      this.similarity = similarity;
-    }
-
-    @Override
-    public Double get(LongPair key) throws TasteException {
-      return similarity.itemSimilarity(key.getFirst(), key.getSecond());
-    }
-  }
-
-}
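A usage sketch for the cache wrapper that just ended, assuming a DataModel named model is in scope (CityBlockSimilarity is defined later in this same diff). Note how the ordered LongPair key makes the cache symmetric in its arguments:

  ItemSimilarity cached = new CachingItemSimilarity(new CityBlockSimilarity(model), model);
  double s1 = cached.itemSimilarity(1L, 2L);  // computed by the delegate, then cached
  double s2 = cached.itemSimilarity(2L, 1L);  // cache hit: both orders map to LongPair(1, 2)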
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java
deleted file mode 100644
index a26d9df58..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity;
-
-import java.util.Collection;
-import java.util.concurrent.Callable;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.Cache;
-import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
-import org.apache.mahout.cf.taste.impl.common.Retriever;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
-import org.apache.mahout.cf.taste.similarity.UserSimilarity;
-import org.apache.mahout.common.LongPair;
-
-import com.google.common.base.Preconditions;
-
-/**
- * Caches the results from an underlying {@link UserSimilarity} implementation.
- */
-public final class CachingUserSimilarity implements UserSimilarity {
-
-  private final UserSimilarity similarity;
-  private final Cache<LongPair,Double> similarityCache;
-  private final RefreshHelper refreshHelper;
-
-  /**
-   * Creates this on top of the given {@link UserSimilarity}.
-   * The cache is sized according to properties of the given {@link DataModel}.
-   */
-  public CachingUserSimilarity(UserSimilarity similarity, DataModel dataModel) throws TasteException {
-    this(similarity, dataModel.getNumUsers());
-  }
-
-  /**
-   * Creates this on top of the given {@link UserSimilarity}.
-   * The cache size is capped by the given size.
-   */
-  public CachingUserSimilarity(UserSimilarity similarity, int maxCacheSize) {
-    Preconditions.checkArgument(similarity != null, "similarity is null");
-    this.similarity = similarity;
-    this.similarityCache = new Cache<LongPair,Double>(new SimilarityRetriever(similarity), maxCacheSize);
-    this.refreshHelper = new RefreshHelper(new Callable<Void>() {
-      @Override
-      public Void call() {
-        similarityCache.clear();
-        return null;
-      }
-    });
-    refreshHelper.addDependency(similarity);
-  }
-
-  @Override
-  public double userSimilarity(long userID1, long userID2) throws TasteException {
-    LongPair key = userID1 < userID2 ?
new LongPair(userID1, userID2) : new LongPair(userID2, userID1); - return similarityCache.get(key); - } - - @Override - public void setPreferenceInferrer(PreferenceInferrer inferrer) { - similarityCache.clear(); - similarity.setPreferenceInferrer(inferrer); - } - - public void clearCacheForUser(long userID) { - similarityCache.removeKeysMatching(new LongPairMatchPredicate(userID)); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - refreshHelper.refresh(alreadyRefreshed); - } - - private static final class SimilarityRetriever implements Retriever { - private final UserSimilarity similarity; - - private SimilarityRetriever(UserSimilarity similarity) { - this.similarity = similarity; - } - - @Override - public Double get(LongPair key) throws TasteException { - return similarity.userSimilarity(key.getFirst(), key.getSecond()); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java deleted file mode 100644 index 670521dff..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java +++ /dev/null @@ -1,98 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.cf.taste.impl.similarity; - -import java.util.Collection; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; -import org.apache.mahout.cf.taste.similarity.UserSimilarity; - -/** - * Implementation of City Block distance (also known as Manhattan distance) - the absolute value of the difference of - * each direction is summed. The resulting unbounded distance is then mapped between 0 and 1. 
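- *
- * <p>
- * A worked example of the mapping (hypothetical counts): if 3 users prefer item A, 4 prefer item B, and
- * 2 prefer both, the distance is 3 + 4 - 2*2 = 3, and the similarity is 1 / (1 + 3) = 0.25.
- * </p>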
- */ -public final class CityBlockSimilarity extends AbstractItemSimilarity implements UserSimilarity { - - public CityBlockSimilarity(DataModel dataModel) { - super(dataModel); - } - - /** - * @throws UnsupportedOperationException - */ - @Override - public void setPreferenceInferrer(PreferenceInferrer inferrer) { - throw new UnsupportedOperationException(); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); - RefreshHelper.maybeRefresh(alreadyRefreshed, getDataModel()); - } - - @Override - public double itemSimilarity(long itemID1, long itemID2) throws TasteException { - DataModel dataModel = getDataModel(); - int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1); - int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2); - int intersection = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2); - return doSimilarity(preferring1, preferring2, intersection); - } - - @Override - public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException { - DataModel dataModel = getDataModel(); - int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1); - double[] distance = new double[itemID2s.length]; - for (int i = 0; i < itemID2s.length; ++i) { - int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2s[i]); - int intersection = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2s[i]); - distance[i] = doSimilarity(preferring1, preferring2, intersection); - } - return distance; - } - - @Override - public double userSimilarity(long userID1, long userID2) throws TasteException { - DataModel dataModel = getDataModel(); - FastIDSet prefs1 = dataModel.getItemIDsFromUser(userID1); - FastIDSet prefs2 = dataModel.getItemIDsFromUser(userID2); - int prefs1Size = prefs1.size(); - int prefs2Size = prefs2.size(); - int intersectionSize = prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1.intersectionSize(prefs2); - return doSimilarity(prefs1Size, prefs2Size, intersectionSize); - } - - /** - * Calculate City Block Distance from total non-zero values and intersections and map to a similarity value. - * - * @param pref1 number of non-zero values in left vector - * @param pref2 number of non-zero values in right vector - * @param intersection number of overlapping non-zero values - */ - private static double doSimilarity(int pref1, int pref2, int intersection) { - int distance = pref1 + pref2 - 2 * intersection; - return 1.0 / (1.0 + distance); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java deleted file mode 100644 index 0f0d22089..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.similarity; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.common.Weighting; -import org.apache.mahout.cf.taste.model.DataModel; - -import com.google.common.base.Preconditions; - -/** - *

- * An implementation of a "similarity" based on the Euclidean "distance" between two users X and Y. Thinking - * of items as dimensions and preferences as points along those dimensions, a distance is computed using all - * items (dimensions) where both users have expressed a preference for that item. This is simply the square - * root of the sum of the squares of differences in position (preference) along each dimension.

- * - *

The similarity could be computed as 1 / (1 + distance), so the resulting values are in the range (0,1]. - * This would weight against pairs that overlap in more dimensions, which should indicate more similarity, - * since more dimensions offer more opportunities to be farther apart. Actually, it is computed as - * sqrt(n) / (1 + distance), where n is the number of dimensions, in order to help correct for this. - * sqrt(n) is chosen since randomly-chosen points have a distance that grows as sqrt(n).

- * - *

Note that this could cause a similarity to exceed 1; such values are capped at 1.

- * - *

Note that the distance isn't normalized in any way; it's not valid to compare similarities computed from - * different domains (different rating scales, for example). Within one domain, normalizing doesn't matter much as - * it doesn't change ordering.
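- *
- * <p>
- * A minimal usage sketch (assuming a {@code FileDataModel} over a hypothetical ratings.csv file):
- * </p>
- *
- * <pre>{@code
- * DataModel model = new FileDataModel(new File("ratings.csv"));
- * UserSimilarity similarity = new EuclideanDistanceSimilarity(model);
- * double s = similarity.userSimilarity(1L, 2L); // may be NaN if the users share no items
- * }</pre>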

- */ -public final class EuclideanDistanceSimilarity extends AbstractSimilarity { - - /** - * @throws IllegalArgumentException if {@link DataModel} does not have preference values - */ - public EuclideanDistanceSimilarity(DataModel dataModel) throws TasteException { - this(dataModel, Weighting.UNWEIGHTED); - } - - /** - * @throws IllegalArgumentException if {@link DataModel} does not have preference values - */ - public EuclideanDistanceSimilarity(DataModel dataModel, Weighting weighting) throws TasteException { - super(dataModel, weighting, false); - Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values"); - } - - @Override - double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) { - return 1.0 / (1.0 + Math.sqrt(sumXYdiff2) / Math.sqrt(n)); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java deleted file mode 100644 index 70e96a75c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java +++ /dev/null @@ -1,358 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.similarity; - -import java.util.Collection; -import java.util.Iterator; - -import com.google.common.collect.AbstractIterator; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.recommender.TopItems; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.similarity.ItemSimilarity; -import org.apache.mahout.common.RandomUtils; - -import com.google.common.base.Preconditions; - -/** - *

- * A "generic" {@link ItemSimilarity} which takes a static list of precomputed item similarities and bases its - * responses on that alone. The values may have been precomputed offline by another process, stored in a file, - * and then read and fed into an instance of this class. - *

- * - *

- * This is perhaps the best {@link ItemSimilarity} to use with - * {@link org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender}, for now, since the point - * of item-based recommenders is that they can take advantage of the fact that item similarity is relatively - * static, can be precomputed, and then used in computation to gain a significant performance advantage. - *
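- *
- * <p>
- * A minimal construction sketch, with hypothetical item IDs and similarity values:
- * </p>
- *
- * <pre>{@code
- * List<GenericItemSimilarity.ItemItemSimilarity> sims = new ArrayList<GenericItemSimilarity.ItemItemSimilarity>();
- * sims.add(new GenericItemSimilarity.ItemItemSimilarity(1L, 2L, 0.8));
- * sims.add(new GenericItemSimilarity.ItemItemSimilarity(1L, 3L, -0.2));
- * ItemSimilarity similarity = new GenericItemSimilarity(sims);
- * }</pre>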

- */ -public final class GenericItemSimilarity implements ItemSimilarity { - - private static final long[] NO_IDS = new long[0]; - - private final FastByIDMap> similarityMaps = new FastByIDMap>(); - private final FastByIDMap similarItemIDsIndex = new FastByIDMap(); - - /** - *

- * <p>
- * Creates a {@link GenericItemSimilarity} from a precomputed list of {@link ItemItemSimilarity}s. Each
- * represents the similarity between two distinct items. Since similarity is assumed to be symmetric, it is
- * not necessary to specify similarity between item1 and item2, and item2 and item1. Both are the same. It
- * is also not necessary to specify a similarity between any item and itself; these are assumed to be 1.0.
- * </p>
- *
- * <p>
- * Note that specifying a similarity between two items twice is not an error, but the later value will win.
- * </p>

- * - * @param similarities - * set of {@link ItemItemSimilarity}s on which to base this instance - */ - public GenericItemSimilarity(Iterable similarities) { - initSimilarityMaps(similarities.iterator()); - } - - /** - *

- * <p>
- * Like {@link #GenericItemSimilarity(Iterable)}, but will only keep the specified number of similarities
- * from the given {@link Iterable} of similarities. It will keep those with the highest similarity -- those
- * that are therefore most important.
- * </p>
- *
- * <p>
- * Thanks to tsmorton for suggesting this and providing part of the implementation.
- * </p>

- * - * @param similarities - * set of {@link ItemItemSimilarity}s on which to base this instance - * @param maxToKeep - * maximum number of similarities to keep - */ - public GenericItemSimilarity(Iterable similarities, int maxToKeep) { - Iterable keptSimilarities = - TopItems.getTopItemItemSimilarities(maxToKeep, similarities.iterator()); - initSimilarityMaps(keptSimilarities.iterator()); - } - - /** - *

- * <p>
- * Builds a list of item-item similarities given an {@link ItemSimilarity} implementation and a
- * {@link DataModel}, rather than a list of {@link ItemItemSimilarity}s.
- * </p>
- *
- * <p>
- * It's valid to build a {@link GenericItemSimilarity} this way, but perhaps missing some of the point of an
- * item-based recommender. Item-based recommenders use the assumption that item-item similarities are
- * relatively fixed, and might be known already independent of user preferences. Hence it is useful to
- * inject that information, using {@link #GenericItemSimilarity(Iterable)}.
- * </p>
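- *
- * <p>
- * For example, one might precompute and wrap another similarity like this (assuming an existing
- * {@code DataModel} named {@code model}):
- * </p>
- *
- * <pre>{@code
- * ItemSimilarity precomputed = new GenericItemSimilarity(new LogLikelihoodSimilarity(model), model);
- * }</pre>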

- * - * @param otherSimilarity - * other {@link ItemSimilarity} to get similarities from - * @param dataModel - * data model to get items from - * @throws TasteException - * if an error occurs while accessing the {@link DataModel} items - */ - public GenericItemSimilarity(ItemSimilarity otherSimilarity, DataModel dataModel) throws TasteException { - long[] itemIDs = GenericUserSimilarity.longIteratorToList(dataModel.getItemIDs()); - initSimilarityMaps(new DataModelSimilaritiesIterator(otherSimilarity, itemIDs)); - } - - /** - *

- * <p>
- * Like {@link #GenericItemSimilarity(ItemSimilarity, DataModel)}, but will only keep the specified
- * number of similarities from the given {@link DataModel}. It will keep those with the highest similarity
- * -- those that are therefore most important.
- * </p>
- *
- * <p>
- * Thanks to tsmorton for suggesting this and providing part of the implementation.
- * </p>

- * - * @param otherSimilarity - * other {@link ItemSimilarity} to get similarities from - * @param dataModel - * data model to get items from - * @param maxToKeep - * maximum number of similarities to keep - * @throws TasteException - * if an error occurs while accessing the {@link DataModel} items - */ - public GenericItemSimilarity(ItemSimilarity otherSimilarity, - DataModel dataModel, - int maxToKeep) throws TasteException { - long[] itemIDs = GenericUserSimilarity.longIteratorToList(dataModel.getItemIDs()); - Iterator it = new DataModelSimilaritiesIterator(otherSimilarity, itemIDs); - Iterable keptSimilarities = TopItems.getTopItemItemSimilarities(maxToKeep, it); - initSimilarityMaps(keptSimilarities.iterator()); - } - - private void initSimilarityMaps(Iterator similarities) { - while (similarities.hasNext()) { - ItemItemSimilarity iic = similarities.next(); - long similarityItemID1 = iic.getItemID1(); - long similarityItemID2 = iic.getItemID2(); - if (similarityItemID1 != similarityItemID2) { - // Order them -- first key should be the "smaller" one - long itemID1; - long itemID2; - if (similarityItemID1 < similarityItemID2) { - itemID1 = similarityItemID1; - itemID2 = similarityItemID2; - } else { - itemID1 = similarityItemID2; - itemID2 = similarityItemID1; - } - FastByIDMap map = similarityMaps.get(itemID1); - if (map == null) { - map = new FastByIDMap(); - similarityMaps.put(itemID1, map); - } - map.put(itemID2, iic.getValue()); - - doIndex(itemID1, itemID2); - doIndex(itemID2, itemID1); - } - // else similarity between item and itself already assumed to be 1.0 - } - } - - private void doIndex(long fromItemID, long toItemID) { - FastIDSet similarItemIDs = similarItemIDsIndex.get(fromItemID); - if (similarItemIDs == null) { - similarItemIDs = new FastIDSet(); - similarItemIDsIndex.put(fromItemID, similarItemIDs); - } - similarItemIDs.add(toItemID); - } - - /** - *

- * <p>
- * Returns the similarity between two items. Note that similarity is assumed to be symmetric, that
- * {@code itemSimilarity(item1, item2) == itemSimilarity(item2, item1)}, and that
- * {@code itemSimilarity(item1, item1) == 1.0} for all items.
- * </p>

- * - * @param itemID1 - * first item - * @param itemID2 - * second item - * @return similarity between the two - */ - @Override - public double itemSimilarity(long itemID1, long itemID2) { - if (itemID1 == itemID2) { - return 1.0; - } - long firstID; - long secondID; - if (itemID1 < itemID2) { - firstID = itemID1; - secondID = itemID2; - } else { - firstID = itemID2; - secondID = itemID1; - } - FastByIDMap nextMap = similarityMaps.get(firstID); - if (nextMap == null) { - return Double.NaN; - } - Double similarity = nextMap.get(secondID); - return similarity == null ? Double.NaN : similarity; - } - - @Override - public double[] itemSimilarities(long itemID1, long[] itemID2s) { - int length = itemID2s.length; - double[] result = new double[length]; - for (int i = 0; i < length; i++) { - result[i] = itemSimilarity(itemID1, itemID2s[i]); - } - return result; - } - - @Override - public long[] allSimilarItemIDs(long itemID) { - FastIDSet similarItemIDs = similarItemIDsIndex.get(itemID); - return similarItemIDs != null ? similarItemIDs.toArray() : NO_IDS; - } - - @Override - public void refresh(Collection alreadyRefreshed) { - // Do nothing - } - - /** Encapsulates a similarity between two items. Similarity must be in the range [-1.0,1.0]. */ - public static final class ItemItemSimilarity implements Comparable { - - private final long itemID1; - private final long itemID2; - private final double value; - - /** - * @param itemID1 - * first item - * @param itemID2 - * second item - * @param value - * similarity between the two - * @throws IllegalArgumentException - * if value is NaN, less than -1.0 or greater than 1.0 - */ - public ItemItemSimilarity(long itemID1, long itemID2, double value) { - Preconditions.checkArgument(value >= -1.0 && value <= 1.0, "Illegal value: %s", value); - this.itemID1 = itemID1; - this.itemID2 = itemID2; - this.value = value; - } - - public long getItemID1() { - return itemID1; - } - - public long getItemID2() { - return itemID2; - } - - public double getValue() { - return value; - } - - @Override - public String toString() { - return "ItemItemSimilarity[" + itemID1 + ',' + itemID2 + ':' + value + ']'; - } - - /** Defines an ordering from highest similarity to lowest. */ - @Override - public int compareTo(ItemItemSimilarity other) { - double otherValue = other.getValue(); - return value > otherValue ? -1 : value < otherValue ? 
1 : 0; - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof ItemItemSimilarity)) { - return false; - } - ItemItemSimilarity otherSimilarity = (ItemItemSimilarity) other; - return otherSimilarity.getItemID1() == itemID1 - && otherSimilarity.getItemID2() == itemID2 - && otherSimilarity.getValue() == value; - } - - @Override - public int hashCode() { - return (int) itemID1 ^ (int) itemID2 ^ RandomUtils.hashDouble(value); - } - - } - - private static final class DataModelSimilaritiesIterator extends AbstractIterator { - - private final ItemSimilarity otherSimilarity; - private final long[] itemIDs; - private int i; - private long itemID1; - private int j; - - private DataModelSimilaritiesIterator(ItemSimilarity otherSimilarity, long[] itemIDs) { - this.otherSimilarity = otherSimilarity; - this.itemIDs = itemIDs; - i = 0; - itemID1 = itemIDs[0]; - j = 1; - } - - @Override - protected ItemItemSimilarity computeNext() { - int size = itemIDs.length; - ItemItemSimilarity result = null; - while (result == null && i < size - 1) { - long itemID2 = itemIDs[j]; - double similarity; - try { - similarity = otherSimilarity.itemSimilarity(itemID1, itemID2); - } catch (TasteException te) { - // ugly: - throw new IllegalStateException(te); - } - if (!Double.isNaN(similarity)) { - result = new ItemItemSimilarity(itemID1, itemID2, similarity); - } - if (++j == size) { - itemID1 = itemIDs[++i]; - j = i + 1; - } - } - if (result == null) { - return endOfData(); - } else { - return result; - } - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java deleted file mode 100644 index ac9b65a1d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java +++ /dev/null @@ -1,238 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.similarity; - -import java.util.Collection; -import java.util.Iterator; - -import com.google.common.collect.AbstractIterator; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.recommender.TopItems; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; -import org.apache.mahout.cf.taste.similarity.UserSimilarity; -import org.apache.mahout.common.RandomUtils; - -import com.google.common.base.Preconditions; - -public final class GenericUserSimilarity implements UserSimilarity { - - private final FastByIDMap> similarityMaps = new FastByIDMap>(); - - public GenericUserSimilarity(Iterable similarities) { - initSimilarityMaps(similarities.iterator()); - } - - public GenericUserSimilarity(Iterable similarities, int maxToKeep) { - Iterable keptSimilarities = - TopItems.getTopUserUserSimilarities(maxToKeep, similarities.iterator()); - initSimilarityMaps(keptSimilarities.iterator()); - } - - public GenericUserSimilarity(UserSimilarity otherSimilarity, DataModel dataModel) throws TasteException { - long[] userIDs = longIteratorToList(dataModel.getUserIDs()); - initSimilarityMaps(new DataModelSimilaritiesIterator(otherSimilarity, userIDs)); - } - - public GenericUserSimilarity(UserSimilarity otherSimilarity, - DataModel dataModel, - int maxToKeep) throws TasteException { - long[] userIDs = longIteratorToList(dataModel.getUserIDs()); - Iterator it = new DataModelSimilaritiesIterator(otherSimilarity, userIDs); - Iterable keptSimilarities = TopItems.getTopUserUserSimilarities(maxToKeep, it); - initSimilarityMaps(keptSimilarities.iterator()); - } - - static long[] longIteratorToList(LongPrimitiveIterator iterator) { - long[] result = new long[5]; - int size = 0; - while (iterator.hasNext()) { - if (size == result.length) { - long[] newResult = new long[result.length << 1]; - System.arraycopy(result, 0, newResult, 0, result.length); - result = newResult; - } - result[size++] = iterator.next(); - } - if (size != result.length) { - long[] newResult = new long[size]; - System.arraycopy(result, 0, newResult, 0, size); - result = newResult; - } - return result; - } - - private void initSimilarityMaps(Iterator similarities) { - while (similarities.hasNext()) { - UserUserSimilarity uuc = similarities.next(); - long similarityUser1 = uuc.getUserID1(); - long similarityUser2 = uuc.getUserID2(); - if (similarityUser1 != similarityUser2) { - // Order them -- first key should be the "smaller" one - long user1; - long user2; - if (similarityUser1 < similarityUser2) { - user1 = similarityUser1; - user2 = similarityUser2; - } else { - user1 = similarityUser2; - user2 = similarityUser1; - } - FastByIDMap map = similarityMaps.get(user1); - if (map == null) { - map = new FastByIDMap(); - similarityMaps.put(user1, map); - } - map.put(user2, uuc.getValue()); - } - // else similarity between user and itself already assumed to be 1.0 - } - } - - @Override - public double userSimilarity(long userID1, long userID2) { - if (userID1 == userID2) { - return 1.0; - } - long first; - long second; - if (userID1 < userID2) { - first = userID1; - second = userID2; - } else { - first = userID2; - second = userID1; - } - FastByIDMap nextMap = similarityMaps.get(first); - if (nextMap == null) { - return 
Double.NaN; - } - Double similarity = nextMap.get(second); - return similarity == null ? Double.NaN : similarity; - } - - @Override - public void setPreferenceInferrer(PreferenceInferrer inferrer) { - throw new UnsupportedOperationException(); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - // Do nothing - } - - public static final class UserUserSimilarity implements Comparable { - - private final long userID1; - private final long userID2; - private final double value; - - public UserUserSimilarity(long userID1, long userID2, double value) { - Preconditions.checkArgument(value >= -1.0 && value <= 1.0, "Illegal value: %s", value); - this.userID1 = userID1; - this.userID2 = userID2; - this.value = value; - } - - public long getUserID1() { - return userID1; - } - - public long getUserID2() { - return userID2; - } - - public double getValue() { - return value; - } - - @Override - public String toString() { - return "UserUserSimilarity[" + userID1 + ',' + userID2 + ':' + value + ']'; - } - - /** Defines an ordering from highest similarity to lowest. */ - @Override - public int compareTo(UserUserSimilarity other) { - double otherValue = other.getValue(); - return value > otherValue ? -1 : value < otherValue ? 1 : 0; - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof UserUserSimilarity)) { - return false; - } - UserUserSimilarity otherSimilarity = (UserUserSimilarity) other; - return otherSimilarity.getUserID1() == userID1 - && otherSimilarity.getUserID2() == userID2 - && otherSimilarity.getValue() == value; - } - - @Override - public int hashCode() { - return (int) userID1 ^ (int) userID2 ^ RandomUtils.hashDouble(value); - } - - } - - private static final class DataModelSimilaritiesIterator extends AbstractIterator { - - private final UserSimilarity otherSimilarity; - private final long[] itemIDs; - private int i; - private long itemID1; - private int j; - - private DataModelSimilaritiesIterator(UserSimilarity otherSimilarity, long[] itemIDs) { - this.otherSimilarity = otherSimilarity; - this.itemIDs = itemIDs; - i = 0; - itemID1 = itemIDs[0]; - j = 1; - } - - @Override - protected UserUserSimilarity computeNext() { - int size = itemIDs.length; - while (i < size - 1) { - long itemID2 = itemIDs[j]; - double similarity; - try { - similarity = otherSimilarity.userSimilarity(itemID1, itemID2); - } catch (TasteException te) { - // ugly: - throw new IllegalStateException(te); - } - if (!Double.isNaN(similarity)) { - return new UserUserSimilarity(itemID1, itemID2, similarity); - } - if (++j == size) { - itemID1 = itemIDs[++i]; - j = i + 1; - } - } - return endOfData(); - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java deleted file mode 100644 index 7fa1fab46..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.similarity; - -import java.util.Collection; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; -import org.apache.mahout.cf.taste.similarity.UserSimilarity; -import org.apache.mahout.math.stats.LogLikelihood; - -/** - * See - * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.5962 and - * - * http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html. - */ -public final class LogLikelihoodSimilarity extends AbstractItemSimilarity implements UserSimilarity { - - public LogLikelihoodSimilarity(DataModel dataModel) { - super(dataModel); - } - - /** - * @throws UnsupportedOperationException - */ - @Override - public void setPreferenceInferrer(PreferenceInferrer inferrer) { - throw new UnsupportedOperationException(); - } - - @Override - public double userSimilarity(long userID1, long userID2) throws TasteException { - - DataModel dataModel = getDataModel(); - FastIDSet prefs1 = dataModel.getItemIDsFromUser(userID1); - FastIDSet prefs2 = dataModel.getItemIDsFromUser(userID2); - - long prefs1Size = prefs1.size(); - long prefs2Size = prefs2.size(); - long intersectionSize = - prefs1Size < prefs2Size ? 
prefs2.intersectionSize(prefs1) : prefs1.intersectionSize(prefs2); - if (intersectionSize == 0) { - return Double.NaN; - } - long numItems = dataModel.getNumItems(); - double logLikelihood = - LogLikelihood.logLikelihoodRatio(intersectionSize, - prefs2Size - intersectionSize, - prefs1Size - intersectionSize, - numItems - prefs1Size - prefs2Size + intersectionSize); - return 1.0 - 1.0 / (1.0 + logLikelihood); - } - - @Override - public double itemSimilarity(long itemID1, long itemID2) throws TasteException { - DataModel dataModel = getDataModel(); - long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1); - long numUsers = dataModel.getNumUsers(); - return doItemSimilarity(itemID1, itemID2, preferring1, numUsers); - } - - @Override - public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException { - DataModel dataModel = getDataModel(); - long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1); - long numUsers = dataModel.getNumUsers(); - int length = itemID2s.length; - double[] result = new double[length]; - for (int i = 0; i < length; i++) { - result[i] = doItemSimilarity(itemID1, itemID2s[i], preferring1, numUsers); - } - return result; - } - - private double doItemSimilarity(long itemID1, long itemID2, long preferring1, long numUsers) throws TasteException { - DataModel dataModel = getDataModel(); - long preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2); - if (preferring1and2 == 0) { - return Double.NaN; - } - long preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2); - double logLikelihood = - LogLikelihood.logLikelihoodRatio(preferring1and2, - preferring2 - preferring1and2, - preferring1 - preferring1and2, - numUsers - preferring1 - preferring2 + preferring1and2); - return 1.0 - 1.0 / (1.0 + logLikelihood); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); - RefreshHelper.maybeRefresh(alreadyRefreshed, getDataModel()); - } - - @Override - public String toString() { - return "LogLikelihoodSimilarity[dataModel:" + getDataModel() + ']'; - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java deleted file mode 100644 index 48dc4e089..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.similarity; - -import org.apache.mahout.cf.taste.impl.common.Cache; -import org.apache.mahout.common.LongPair; - -/** - * A {@link Cache.MatchPredicate} which will match an ID against either element of a - * {@link LongPair}. - */ -final class LongPairMatchPredicate implements Cache.MatchPredicate { - - private final long id; - - LongPairMatchPredicate(long id) { - this.id = id; - } - - @Override - public boolean matches(LongPair pair) { - return pair.getFirst() == id || pair.getSecond() == id; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java deleted file mode 100644 index 8ea166017..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java +++ /dev/null @@ -1,93 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.similarity; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.common.Weighting; -import org.apache.mahout.cf.taste.model.DataModel; - -import com.google.common.base.Preconditions; - -/** - *

- * <p>
- * An implementation of the Pearson correlation. For users X and Y, the following values are calculated:
- * </p>
- *
- * <ul>
- * <li>sumX2: sum of the square of all X's preference values</li>
- * <li>sumY2: sum of the square of all Y's preference values</li>
- * <li>sumXY: sum of the product of X and Y's preference value for all items for which both X and Y express a
- * preference</li>
- * </ul>
- *
- * <p>
- * The correlation is then:
- * </p>
- *
- * <p>
- * {@code sumXY / sqrt(sumX2 * sumY2)}
- * </p>
- *
- * <p>
- * Note that this correlation "centers" its data, shifts the user's preference values so that each of their
- * means is 0. This is necessary to achieve expected behavior on all data sets.
- * </p>
- *
- * <p>
- * This correlation implementation is equivalent to the cosine similarity since the data it receives
- * is assumed to be centered -- mean is 0. The correlation may be interpreted as the cosine of the angle
- * between the two vectors defined by the users' preference values.
- * </p>
- *
- * <p>
- * For cosine similarity on uncentered data, see {@link UncenteredCosineSimilarity}.
- * </p>
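- *
- * <p>
- * A minimal usage sketch (assuming a {@code FileDataModel} over a hypothetical ratings.csv file):
- * </p>
- *
- * <pre>{@code
- * DataModel model = new FileDataModel(new File("ratings.csv"));
- * UserSimilarity similarity = new PearsonCorrelationSimilarity(model);
- * double s = similarity.userSimilarity(1L, 2L); // in [-1,1], or NaN if undefined
- * }</pre>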

- */ -public final class PearsonCorrelationSimilarity extends AbstractSimilarity { - - /** - * @throws IllegalArgumentException if {@link DataModel} does not have preference values - */ - public PearsonCorrelationSimilarity(DataModel dataModel) throws TasteException { - this(dataModel, Weighting.UNWEIGHTED); - } - - /** - * @throws IllegalArgumentException if {@link DataModel} does not have preference values - */ - public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException { - super(dataModel, weighting, true); - Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values"); - } - - @Override - double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) { - if (n == 0) { - return Double.NaN; - } - // Note that sum of X and sum of Y don't appear here since they are assumed to be 0; - // the data is assumed to be centered. - double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2); - if (denominator == 0.0) { - // One or both parties has -all- the same ratings; - // can't really say much similarity under this measure - return Double.NaN; - } - return sumXY / denominator; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java deleted file mode 100644 index 111636824..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java +++ /dev/null @@ -1,135 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.similarity; - -import java.util.Collection; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; -import org.apache.mahout.cf.taste.similarity.UserSimilarity; - -import com.google.common.base.Preconditions; - -/** - *

- * <p>
- * Like {@link PearsonCorrelationSimilarity}, but compares relative ranking of preference values instead of
- * preference values themselves. That is, each user's preferences are sorted and each is then assigned a rank
- * as its preference value, with 1 being assigned to the least preferred item.
- * </p>
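- *
- * <p>
- * A minimal usage sketch (assuming an existing {@code DataModel} named {@code model}):
- * </p>
- *
- * <pre>{@code
- * UserSimilarity similarity = new SpearmanCorrelationSimilarity(model);
- * double s = similarity.userSimilarity(1L, 2L); // NaN unless the users have at least two items in common
- * }</pre>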

- */ -public final class SpearmanCorrelationSimilarity implements UserSimilarity { - - private final DataModel dataModel; - - public SpearmanCorrelationSimilarity(DataModel dataModel) { - this.dataModel = Preconditions.checkNotNull(dataModel); - } - - @Override - public double userSimilarity(long userID1, long userID2) throws TasteException { - PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1); - PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2); - int xLength = xPrefs.length(); - int yLength = yPrefs.length(); - - if (xLength <= 1 || yLength <= 1) { - return Double.NaN; - } - - // Copy prefs since we need to modify pref values to ranks - xPrefs = xPrefs.clone(); - yPrefs = yPrefs.clone(); - - // First sort by values from low to high - xPrefs.sortByValue(); - yPrefs.sortByValue(); - - // Assign ranks from low to high - float nextRank = 1.0f; - for (int i = 0; i < xLength; i++) { - // ... but only for items that are common to both pref arrays - if (yPrefs.hasPrefWithItemID(xPrefs.getItemID(i))) { - xPrefs.setValue(i, nextRank); - nextRank += 1.0f; - } - // Other values are bogus but don't matter - } - nextRank = 1.0f; - for (int i = 0; i < yLength; i++) { - if (xPrefs.hasPrefWithItemID(yPrefs.getItemID(i))) { - yPrefs.setValue(i, nextRank); - nextRank += 1.0f; - } - } - - xPrefs.sortByItem(); - yPrefs.sortByItem(); - - long xIndex = xPrefs.getItemID(0); - long yIndex = yPrefs.getItemID(0); - int xPrefIndex = 0; - int yPrefIndex = 0; - - double sumXYRankDiff2 = 0.0; - int count = 0; - - while (true) { - int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0; - if (compare == 0) { - double diff = xPrefs.getValue(xPrefIndex) - yPrefs.getValue(yPrefIndex); - sumXYRankDiff2 += diff * diff; - count++; - } - if (compare <= 0) { - if (++xPrefIndex >= xLength) { - break; - } - xIndex = xPrefs.getItemID(xPrefIndex); - } - if (compare >= 0) { - if (++yPrefIndex >= yLength) { - break; - } - yIndex = yPrefs.getItemID(yPrefIndex); - } - } - - if (count <= 1) { - return Double.NaN; - } - - // When ranks are unique, this formula actually gives the Pearson correlation - return 1.0 - 6.0 * sumXYRankDiff2 / (count * (count * count - 1)); - } - - @Override - public void setPreferenceInferrer(PreferenceInferrer inferrer) { - throw new UnsupportedOperationException(); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); - RefreshHelper.maybeRefresh(alreadyRefreshed, dataModel); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java deleted file mode 100644 index 5e2660ccb..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java +++ /dev/null @@ -1,126 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.similarity; - -import java.util.Collection; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; -import org.apache.mahout.cf.taste.similarity.UserSimilarity; - -/** - *

- * An implementation of a "similarity" based on the - * Tanimoto coefficient, or extended Jaccard - * coefficient. - *

- * - *

- * This is intended for "binary" data sets where a user either expresses a generic "yes" preference for an - * item or has no preference. The actual preference values do not matter here, only their presence or absence. - *

- * - *

- * The value returned is in [0,1]. - *
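- *
- * <p>
- * A minimal usage sketch (assuming an existing boolean-preference {@code DataModel} named {@code model}):
- * </p>
- *
- * <pre>{@code
- * UserSimilarity similarity = new TanimotoCoefficientSimilarity(model);
- * double s = similarity.userSimilarity(1L, 2L); // intersection size over union size of the users' item sets
- * }</pre>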

- */ -public final class TanimotoCoefficientSimilarity extends AbstractItemSimilarity implements UserSimilarity { - - public TanimotoCoefficientSimilarity(DataModel dataModel) { - super(dataModel); - } - - /** - * @throws UnsupportedOperationException - */ - @Override - public void setPreferenceInferrer(PreferenceInferrer inferrer) { - throw new UnsupportedOperationException(); - } - - @Override - public double userSimilarity(long userID1, long userID2) throws TasteException { - - DataModel dataModel = getDataModel(); - FastIDSet xPrefs = dataModel.getItemIDsFromUser(userID1); - FastIDSet yPrefs = dataModel.getItemIDsFromUser(userID2); - - int xPrefsSize = xPrefs.size(); - int yPrefsSize = yPrefs.size(); - if (xPrefsSize == 0 && yPrefsSize == 0) { - return Double.NaN; - } - if (xPrefsSize == 0 || yPrefsSize == 0) { - return 0.0; - } - - int intersectionSize = - xPrefsSize < yPrefsSize ? yPrefs.intersectionSize(xPrefs) : xPrefs.intersectionSize(yPrefs); - if (intersectionSize == 0) { - return Double.NaN; - } - - int unionSize = xPrefsSize + yPrefsSize - intersectionSize; - - return (double) intersectionSize / (double) unionSize; - } - - @Override - public double itemSimilarity(long itemID1, long itemID2) throws TasteException { - int preferring1 = getDataModel().getNumUsersWithPreferenceFor(itemID1); - return doItemSimilarity(itemID1, itemID2, preferring1); - } - - @Override - public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException { - int preferring1 = getDataModel().getNumUsersWithPreferenceFor(itemID1); - int length = itemID2s.length; - double[] result = new double[length]; - for (int i = 0; i < length; i++) { - result[i] = doItemSimilarity(itemID1, itemID2s[i], preferring1); - } - return result; - } - - private double doItemSimilarity(long itemID1, long itemID2, int preferring1) throws TasteException { - DataModel dataModel = getDataModel(); - int preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2); - if (preferring1and2 == 0) { - return Double.NaN; - } - int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2); - return (double) preferring1and2 / (double) (preferring1 + preferring2 - preferring1and2); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); - RefreshHelper.maybeRefresh(alreadyRefreshed, getDataModel()); - } - - @Override - public String toString() { - return "TanimotoCoefficientSimilarity[dataModel:" + getDataModel() + ']'; - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java deleted file mode 100644 index 626060613..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.similarity; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.common.Weighting; -import org.apache.mahout.cf.taste.model.DataModel; - -import com.google.common.base.Preconditions; - -/** - *

- * <p>
- * An implementation of the cosine similarity. The result is the cosine of the angle formed between
- * the two preference vectors.
- * </p>
- *
- * <p>
- * Note that this similarity does not "center" its data (shift the user's preference values so that each of
- * their means is 0). For this behavior, use {@link PearsonCorrelationSimilarity}, which is mathematically
- * equivalent for centered data.
- * </p>

- */ -public final class UncenteredCosineSimilarity extends AbstractSimilarity { - - /** - * @throws IllegalArgumentException if {@link DataModel} does not have preference values - */ - public UncenteredCosineSimilarity(DataModel dataModel) throws TasteException { - this(dataModel, Weighting.UNWEIGHTED); - } - - /** - * @throws IllegalArgumentException if {@link DataModel} does not have preference values - */ - public UncenteredCosineSimilarity(DataModel dataModel, Weighting weighting) throws TasteException { - super(dataModel, weighting, false); - Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values"); - } - - @Override - double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) { - if (n == 0) { - return Double.NaN; - } - double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2); - if (denominator == 0.0) { - // One or both parties has -all- the same ratings; - // can't really say much similarity under this measure - return Double.NaN; - } - return sumXY / denominator; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java deleted file mode 100644 index 1ae45c222..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.similarity.file; - -import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; - -/** - * {@link Iterable} to be able to read a file linewise into a {@link GenericItemSimilarity} - */ -final class FileItemItemSimilarityIterable implements Iterable { - - private final File similaritiesFile; - - FileItemItemSimilarityIterable(File similaritiesFile) { - this.similaritiesFile = similaritiesFile; - } - - @Override - public Iterator iterator() { - try { - return new FileItemItemSimilarityIterator(similaritiesFile); - } catch (IOException ioe) { - throw new IllegalStateException("Can't read " + similaritiesFile, ioe); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java deleted file mode 100644 index c0711598e..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.similarity.file; - -import com.google.common.base.Function; -import com.google.common.collect.ForwardingIterator; -import com.google.common.collect.Iterators; -import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity; -import org.apache.mahout.common.iterator.FileLineIterator; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.regex.Pattern; - -/** - * a simple iterator using a {@link FileLineIterator} internally, parsing each - * line into an {@link GenericItemSimilarity.ItemItemSimilarity}. 
- */ -final class FileItemItemSimilarityIterator extends ForwardingIterator { - - private static final Pattern SEPARATOR = Pattern.compile("[,\t]"); - - private final Iterator delegate; - - FileItemItemSimilarityIterator(File similaritiesFile) throws IOException { - delegate = Iterators.transform( - new FileLineIterator(similaritiesFile), - new Function() { - @Override - public GenericItemSimilarity.ItemItemSimilarity apply(String from) { - String[] tokens = SEPARATOR.split(from); - return new GenericItemSimilarity.ItemItemSimilarity(Long.parseLong(tokens[0]), - Long.parseLong(tokens[1]), - Double.parseDouble(tokens[2])); - } - }); - } - - @Override - protected Iterator delegate() { - return delegate; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java deleted file mode 100644 index 712b96ab3..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.similarity.file; - -import java.io.File; -import java.util.Collection; -import java.util.concurrent.locks.ReentrantLock; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity; -import org.apache.mahout.cf.taste.similarity.ItemSimilarity; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *

- * An {@link ItemSimilarity} backed by a comma-delimited file. This class typically expects a file where each line - * contains an item ID, followed by another item ID, followed by a similarity value, separated by commas. You may also - * use tabs.
- *
- * The similarity value is assumed to be parseable as a {@code double} having a value between -1 and 1. The - * item IDs are parsed as {@code long}s. Similarities are symmetric so for a pair of items you do not have to - * include 2 lines in the file.
- *
- * This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file - * has been reloaded very recently already.
- *
- * This class is not intended for use with very large amounts of data. For that, a JDBC-backed {@link ItemSimilarity} - * and a database are more appropriate.
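A minimal usage sketch (not part of the removed source; the file name and IDs are hypothetical, and the data file is assumed to contain lines such as "1,2,0.85"):

import java.io.File;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.similarity.file.FileItemSimilarity;
import org.apache.mahout.cf.taste.similarity.ItemSimilarity;

class FileItemSimilarityExample {
  public static void main(String[] args) throws TasteException {
    // Each line of the hypothetical file: itemID1,itemID2,similarity
    ItemSimilarity similarity = new FileItemSimilarity(new File("similarities.csv"));
    System.out.println(similarity.itemSimilarity(1L, 2L)); // 0.85, per the line above
    System.out.println(similarity.itemSimilarity(2L, 1L)); // same value: similarities are symmetric
  }
}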
- */ -public class FileItemSimilarity implements ItemSimilarity { - - public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute? - - private ItemSimilarity delegate; - private final ReentrantLock reloadLock; - private final File dataFile; - private long lastModified; - private final long minReloadIntervalMS; - - private static final Logger log = LoggerFactory.getLogger(FileItemSimilarity.class); - - /** - * @param dataFile - * file containing the similarity data - */ - public FileItemSimilarity(File dataFile) { - this(dataFile, DEFAULT_MIN_RELOAD_INTERVAL_MS); - } - - /** - * @param minReloadIntervalMS - * the minimum interval in milliseconds after which a full reload of the original datafile is done - * when refresh() is called - * @see #FileItemSimilarity(File) - */ - public FileItemSimilarity(File dataFile, long minReloadIntervalMS) { - Preconditions.checkArgument(dataFile != null, "dataFile is null"); - Preconditions.checkArgument(dataFile.exists() && !dataFile.isDirectory(), - "dataFile is missing or a directory: %s", dataFile); - - log.info("Creating FileItemSimilarity for file {}", dataFile); - - this.dataFile = dataFile.getAbsoluteFile(); - this.lastModified = dataFile.lastModified(); - this.minReloadIntervalMS = minReloadIntervalMS; - this.reloadLock = new ReentrantLock(); - - reload(); - } - - @Override - public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException { - return delegate.itemSimilarities(itemID1, itemID2s); - } - - @Override - public long[] allSimilarItemIDs(long itemID) throws TasteException { - return delegate.allSimilarItemIDs(itemID); - } - - @Override - public double itemSimilarity(long itemID1, long itemID2) throws TasteException { - return delegate.itemSimilarity(itemID1, itemID2); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - if (dataFile.lastModified() > lastModified + minReloadIntervalMS) { - log.debug("File has changed; reloading..."); - reload(); - } - } - - protected void reload() { - if (reloadLock.tryLock()) { - try { - long newLastModified = dataFile.lastModified(); - delegate = new GenericItemSimilarity(new FileItemItemSimilarityIterable(dataFile)); - lastModified = newLastModified; - } finally { - reloadLock.unlock(); - } - } - } - - @Override - public String toString() { - return "FileItemSimilarity[dataFile:" + dataFile + ']'; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/transforms/CaseAmplification.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/transforms/CaseAmplification.java deleted file mode 100644 index 7a272f4a8..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/transforms/CaseAmplification.java +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.transforms; - -import java.util.Collection; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.transforms.SimilarityTransform; - -import com.google.common.base.Preconditions; - -/** - *

- * Applies "case amplification" to similarities. This essentially makes big values bigger and small values - * smaller by raising each score to a power. It could however be used to achieve the opposite effect.
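A quick worked example (a sketch added for clarity; the ID arguments are unused, per the javadoc below):

CaseAmplification amp = new CaseAmplification(2.0);
amp.transformSimilarity(0L, 0L, 0.9);  // 0.81 = 0.9^2
amp.transformSimilarity(0L, 0L, 0.3);  // 0.09 = 0.3^2, so the gap between strong and weak scores widens
amp.transformSimilarity(0L, 0L, -0.5); // -0.25: negative values keep their sign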
- */ -public final class CaseAmplification implements SimilarityTransform { - - private final double factor; - - /** - *

- * Creates a transformation based on the given factor.
- * - * @param factor - * transformation factor - * @throws IllegalArgumentException - * if factor is 0.0 or {@link Double#NaN} - */ - public CaseAmplification(double factor) { - Preconditions.checkArgument(factor != 0.0 && !Double.isNaN(factor), "factor is 0 or NaN"); - this.factor = factor; - } - - /** - *

- * Transforms one similarity value. This implementation is such that it's possible to define this - * transformation on one value in isolation. The "thing" parameters are therefore unused.
- * - * @param id1 - * unused - * @param id2 - * unused - * @param value - * similarity to transform - * @return {@code value^factor} if value is nonnegative; - * {@code -(-value)^factor} otherwise - */ - @Override - public double transformSimilarity(long id1, long id2, double value) { - return value < 0.0 ? -Math.pow(-value, factor) : Math.pow(value, factor); - } - - @Override - public void refresh(Collection<Refreshable> alreadyRefreshed) { - // do nothing - } - - @Override - public String toString() { - return "CaseAmplification[factor:" + factor + ']'; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/transforms/Counters.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/transforms/Counters.java deleted file mode 100644 index 0ca2a56ec..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/transforms/Counters.java +++ /dev/null @@ -1,62 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.transforms; - -import java.util.Map; - -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; - -/** - *

- * A simple, fast utility class that maps keys to counts.
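A brief usage sketch (added for illustration; the key values are arbitrary):

Counters counters = new Counters();
counters.increment(42L);
counters.increment(42L);
counters.getCount(42L); // 2
counters.getCount(7L);  // 0: keys never incremented count as zero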
- */ -final class Counters { - - private final FastByIDMap counts = new FastByIDMap(); - - void increment(long key) { - int[] count = counts.get(key); - if (count == null) { - int[] newCount = new int[1]; - newCount[0] = 1; - counts.put(key, newCount); - } else { - count[0]++; - } - } - - int getCount(long key) { - int[] count = counts.get(key); - return count == null ? 0 : count[0]; - } - - int size() { - return counts.size(); - } - - Iterable> getEntrySet() { - return counts.entrySet(); - } - - @Override - public String toString() { - return "Counters[" + counts + ']'; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/transforms/InverseUserFrequency.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/transforms/InverseUserFrequency.java deleted file mode 100644 index a665184c0..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/transforms/InverseUserFrequency.java +++ /dev/null @@ -1,134 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.transforms; - -import java.util.Collection; -import java.util.Map; -import java.util.concurrent.Callable; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.transforms.PreferenceTransform; - -import com.google.common.base.Preconditions; - -/** - *

- * Implements an "inverse user frequency" transformation, which boosts preference values for items for which - * few users have expressed a preference, and reduces preference values for items for which many users have - * expressed a preference. The idea is that these "rare" items are more useful in deciding how similar two - * users' tastes are, and so should be emphasized in other calculations. This idea is mentioned in "Empirical Analysis of Predictive Algorithms for - * Collaborative Filtering".
- *
- * A scaling factor is computed for each item by dividing the total number of users by the number of users - * expressing a preference for that item, and taking the log of that value. The log base of this calculation - * can be controlled in the constructor. Intuitively, the right value for the base is equal to the average - * number of users who express a preference for each item in your model. If each item has about 100 - * preferences on average, 100.0 is a good log base.
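A worked example of the scaling factor (added for clarity, mirroring the recompute() arithmetic below; the counts are hypothetical):

int numUsers = 1000;    // users in the model
int numPrefs = 10;      // users who expressed a preference for this item
double logBase = 100.0;
double iuf = Math.log((double) numUsers / numPrefs) / Math.log(logBase); // log(100)/log(100) = 1.0
// An item every user has rated gets log(1)/log(100) = 0.0, removing its influence entirely.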
- */ -public final class InverseUserFrequency implements PreferenceTransform { - - private final DataModel dataModel; - private final RefreshHelper refreshHelper; - private final double logBase; - private FastByIDMap iufFactors; - - /** - *

- * Creates a transformation. Computations use the given log base.
- * - * @param dataModel - * {@link DataModel} from which to calculate user frequencies - * @param logBase - * calculation logarithm base - * @throws IllegalArgumentException - * if dataModel is {@code null} or logBase is {@link Double#NaN} or <= 1.0 - */ - public InverseUserFrequency(DataModel dataModel, double logBase) throws TasteException { - Preconditions.checkArgument(logBase > 1.0, "logBase should be > 1.0"); - this.dataModel = Preconditions.checkNotNull(dataModel); - this.logBase = logBase; - this.iufFactors = new FastByIDMap(); - this.refreshHelper = new RefreshHelper(new Callable() { - @Override - public Object call() throws TasteException { - recompute(); - return null; - } - }); - this.refreshHelper.addDependency(this.dataModel); - recompute(); - } - - /** @return log base used in this object's calculations */ - public double getLogBase() { - return logBase; - } - - @Override - public float getTransformedValue(Preference pref) { - Double factor = iufFactors.get(pref.getItemID()); - if (factor != null) { - return (float) (pref.getValue() * factor); - } - return pref.getValue(); - } - - @Override - public void refresh(Collection alreadyRefreshed) { - refreshHelper.refresh(alreadyRefreshed); - } - - private void recompute() throws TasteException { - Counters itemPreferenceCounts = new Counters(); - int numUsers = 0; - LongPrimitiveIterator it = dataModel.getUserIDs(); - while (it.hasNext()) { - PreferenceArray prefs = dataModel.getPreferencesFromUser(it.nextLong()); - int size = prefs.length(); - for (int i = 0; i < size; i++) { - itemPreferenceCounts.increment(prefs.getItemID(i)); - } - numUsers++; - } - FastByIDMap newIufFactors = new FastByIDMap(itemPreferenceCounts.size()); - double logFactor = Math.log(logBase); - for (Map.Entry entry : itemPreferenceCounts.getEntrySet()) { - newIufFactors.put(entry.getKey(), Math.log((double) numUsers / (double) entry.getValue()[0]) - / logFactor); - } - iufFactors = newIufFactors; - } - - @Override - public String toString() { - return "InverseUserFrequency[logBase:" + logBase + ']'; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/transforms/ZScore.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/transforms/ZScore.java deleted file mode 100644 index c09c7ba28..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/impl/transforms/ZScore.java +++ /dev/null @@ -1,97 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.cf.taste.impl.transforms; - -import java.util.Collection; - -import com.google.common.base.Preconditions; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.Cache; -import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev; -import org.apache.mahout.cf.taste.impl.common.RefreshHelper; -import org.apache.mahout.cf.taste.impl.common.Retriever; -import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.Preference; -import org.apache.mahout.cf.taste.model.PreferenceArray; -import org.apache.mahout.cf.taste.transforms.PreferenceTransform; - -/** - *

- * Normalizes preference values for a user by converting them to "z-scores". This process normalizes preference values - * to adjust for variation in mean and variance of a user's preferences.
- *
- * Imagine two users, one who tends to rate every movie he/she sees four or five stars, and another who uses - * the full one to five star range when assigning ratings. This transform normalizes away the difference in - * scale used by the two users so that both have a mean preference of 0.0 and a standard deviation of 1.0.
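A quick numeric check (added for illustration; assumes the sample, n-1, standard deviation that FullRunningAverageAndStdDev appears to compute):

double[] ratings = {4.0, 5.0, 4.0, 5.0}; // one user's preferences
double mean = 4.5;
double stdev = Math.sqrt((0.25 + 0.25 + 0.25 + 0.25) / 3); // ≈ 0.577
double z = (5.0 - mean) / stdev; // ≈ 0.87, what getTransformedValue would return for a 5-star rating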
- */ -public final class ZScore implements PreferenceTransform { - - private final DataModel dataModel; - private final Cache meanAndStdevs; - - public ZScore(DataModel dataModel) { - this.dataModel = Preconditions.checkNotNull(dataModel); - this.meanAndStdevs = new Cache(new MeanStdevRetriever()); - refresh(null); - } - - @Override - public float getTransformedValue(Preference pref) throws TasteException { - RunningAverageAndStdDev meanAndStdev = meanAndStdevs.get(pref.getUserID()); - if (meanAndStdev.getCount() > 1) { - double stdev = meanAndStdev.getStandardDeviation(); - if (stdev > 0.0) { - return (float) ((pref.getValue() - meanAndStdev.getAverage()) / stdev); - } - } - return 0.0f; - } - - @Override - public void refresh(Collection alreadyRefreshed) { - meanAndStdevs.clear(); - alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); - RefreshHelper.maybeRefresh(alreadyRefreshed, dataModel); - } - - @Override - public String toString() { - return "ZScore"; - } - - private class MeanStdevRetriever implements Retriever { - - @Override - public RunningAverageAndStdDev get(Long userID) throws TasteException { - RunningAverageAndStdDev running = new FullRunningAverageAndStdDev(); - PreferenceArray prefs = dataModel.getPreferencesFromUser(userID); - int size = prefs.length(); - for (int i = 0; i < size; i++) { - running.addDatum(prefs.getValue(i)); - } - return running; - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/DataModel.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/DataModel.java deleted file mode 100644 index d3a88446b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/DataModel.java +++ /dev/null @@ -1,199 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.model; - -import java.io.Serializable; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; - -/** - *

- * Implementations represent a repository of information about users and their associated {@link Preference}s - * for items.
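A minimal usage sketch (not part of the removed source; FileDataModel is the stock file-backed implementation, and the file name is hypothetical):

import java.io.File;
import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.model.PreferenceArray;

class DataModelExample {
  public static void main(String[] args) throws Exception {
    // Each line of the hypothetical file: userID,itemID,value
    DataModel model = new FileDataModel(new File("ratings.csv"));
    LongPrimitiveIterator userIDs = model.getUserIDs();
    while (userIDs.hasNext()) {
      long userID = userIDs.nextLong();
      PreferenceArray prefs = model.getPreferencesFromUser(userID);
      System.out.println(userID + " has " + prefs.length() + " preferences");
    }
  }
}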
- */ -public interface DataModel extends Refreshable, Serializable { - - /** - * @return all user IDs in the model, in order - * @throws TasteException - * if an error occurs while accessing the data - */ - LongPrimitiveIterator getUserIDs() throws TasteException; - - /** - * @param userID - * ID of user to get prefs for - * @return user's preferences, ordered by item ID - * @throws org.apache.mahout.cf.taste.common.NoSuchUserException - * if the user does not exist - * @throws TasteException - * if an error occurs while accessing the data - */ - PreferenceArray getPreferencesFromUser(long userID) throws TasteException; - - /** - * @param userID - * ID of user to get prefs for - * @return IDs of items user expresses a preference for - * @throws org.apache.mahout.cf.taste.common.NoSuchUserException - * if the user does not exist - * @throws TasteException - * if an error occurs while accessing the data - */ - FastIDSet getItemIDsFromUser(long userID) throws TasteException; - - /** - * @return a {@link LongPrimitiveIterator} of all item IDs in the model, in order - * @throws TasteException - * if an error occurs while accessing the data - */ - LongPrimitiveIterator getItemIDs() throws TasteException; - - /** - * @param itemID - * item ID - * @return all existing {@link Preference}s expressed for that item, ordered by user ID, as an array - * @throws org.apache.mahout.cf.taste.common.NoSuchItemException - * if the item does not exist - * @throws TasteException - * if an error occurs while accessing the data - */ - PreferenceArray getPreferencesForItem(long itemID) throws TasteException; - - /** - * Retrieves the preference value for a single user and item. - * - * @param userID - * user ID to get pref value from - * @param itemID - * item ID to get pref value for - * @return preference value from the given user for the given item or null if none exists - * @throws org.apache.mahout.cf.taste.common.NoSuchUserException - * if the user does not exist - * @throws TasteException - * if an error occurs while accessing the data - */ - Float getPreferenceValue(long userID, long itemID) throws TasteException; - - /** - * Retrieves the time at which a preference value from a user and item was set, if known. - * Time is expressed in the usual way, as a number of milliseconds since the epoch. - * - * @param userID user ID for preference in question - * @param itemID item ID for preference in question - * @return time at which preference was set or null if no preference exists or its time is not known - * @throws org.apache.mahout.cf.taste.common.NoSuchUserException if the user does not exist - * @throws TasteException if an error occurs while accessing the data - */ - Long getPreferenceTime(long userID, long itemID) throws TasteException; - - /** - * @return total number of items known to the model. This is generally the union of all items preferred by - * at least one user but could include more. - * @throws TasteException - * if an error occurs while accessing the data - */ - int getNumItems() throws TasteException; - - /** - * @return total number of users known to the model. 
- * @throws TasteException - * if an error occurs while accessing the data - */ - int getNumUsers() throws TasteException; - - /** - * @param itemID item ID to check for - * @return the number of users who have expressed a preference for the item - * @throws TasteException if an error occurs while accessing the data - */ - int getNumUsersWithPreferenceFor(long itemID) throws TasteException; - - /** - * @param itemID1 first item ID to check for - * @param itemID2 second item ID to check for - * @return the number of users who have expressed a preference for the items - * @throws TasteException if an error occurs while accessing the data - */ - int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException; - - /** - *

- * Sets a particular preference (item plus rating) for a user.
- * - * @param userID - * user to set preference for - * @param itemID - * item to set preference for - * @param value - * preference value - * @throws org.apache.mahout.cf.taste.common.NoSuchItemException - * if the item does not exist - * @throws org.apache.mahout.cf.taste.common.NoSuchUserException - * if the user does not exist - * @throws TasteException - * if an error occurs while accessing the data - */ - void setPreference(long userID, long itemID, float value) throws TasteException; - - /** - *

- * Removes a particular preference for a user.
- * - * @param userID - * user from which to remove preference - * @param itemID - * item to remove preference for - * @throws org.apache.mahout.cf.taste.common.NoSuchItemException - * if the item does not exist - * @throws org.apache.mahout.cf.taste.common.NoSuchUserException - * if the user does not exist - * @throws TasteException - * if an error occurs while accessing the data - */ - void removePreference(long userID, long itemID) throws TasteException; - - /** - * @return true iff this implementation actually stores and returns distinct preference values; - * that is, if it is not a 'boolean' DataModel - */ - boolean hasPreferenceValues(); - - /** - * @return the maximum preference value that is possible in the current problem domain being evaluated. For - * example, if the domain is movie ratings on a scale of 1 to 5, this should be 5. While a - * {@link org.apache.mahout.cf.taste.recommender.Recommender} may estimate a preference value above 5.0, it - * isn't "fair" to consider that the system is actually suggesting an impossible rating of, say, 5.4 stars. - * In practice the application would cap this estimate to 5.0. Since evaluators evaluate - * the difference between estimated and actual value, this at least prevents this effect from unfairly - * penalizing a {@link org.apache.mahout.cf.taste.recommender.Recommender} - */ - float getMaxPreference(); - - /** - * @see #getMaxPreference() - */ - float getMinPreference(); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java deleted file mode 100644 index dd951126b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java +++ /dev/null @@ -1,65 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.model; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; - -/** - *

- * Mahout 0.2 changed the framework to operate only in terms of numeric (long) ID values for users and items. - * This is, obviously, not compatible with applications that used other key types -- most commonly - * {@link String}. Implementations of this interface provide support for mapping Strings to longs and vice versa, in - * order to provide a smoother migration path to applications that must still use strings as IDs.
- *
- * The mapping from strings to 64-bit numeric values is fixed here, to provide a standard implementation that - * is 'portable' or reproducible outside the framework easily. See {@link #toLongID(String)}.
- *
- * Because this mapping is deterministically computable, it does not need to be stored. Indeed, subclasses' - * job is to store the reverse mapping. There are an infinite number of strings but only a fixed number of - * longs, so it is possible for two strings to map to the same value. Subclasses do not treat this as an - * error but rather retain only the most recent mapping, overwriting a previous mapping. The probability of - * collision in a 64-bit space is quite small, but not zero. However, in the context of a collaborative - * filtering problem, the consequence of a collision is small, at worst -- perhaps one user receives another - * user's recommendations.
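The documented hash is easy to reproduce (a sketch added for illustration, following one plausible reading of the toLongID(String) contract below: the top 8 bytes of the MD5 of the string's UTF-8 bytes, read big-endian):

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

class IdHashSketch {
  static long toLongID(String stringID) throws NoSuchAlgorithmException {
    byte[] md5 = MessageDigest.getInstance("MD5").digest(stringID.getBytes(StandardCharsets.UTF_8));
    long hash = 0L;
    for (int i = 0; i < 8; i++) {   // top 8 of MD5's 16 bytes
      hash = (hash << 8) | (md5[i] & 0xFF);
    }
    return hash;
  }
}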
- * - * @since 0.2 - */ -public interface IDMigrator extends Refreshable { - - /** - * @return the top 8 bytes of the MD5 hash of the bytes of the given {@link String}'s UTF-8 encoding as a - * long. - * @throws TasteException - * if an error occurs while storing the mapping - */ - long toLongID(String stringID); - - /** - * @return the string ID most recently associated with the given long ID, or null if doesn't exist - * @throws TasteException - * if an error occurs while retrieving the mapping - */ - String toStringID(long longID) throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/JDBCDataModel.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/JDBCDataModel.java deleted file mode 100644 index 1c5e80c72..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/JDBCDataModel.java +++ /dev/null @@ -1,43 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.model; - -import javax.sql.DataSource; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastByIDMap; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; - -public interface JDBCDataModel extends DataModel { - - /** - * @return {@link DataSource} underlying this model - */ - DataSource getDataSource(); - - /** - * Hmm, should this exist elsewhere? seems like most relevant for a DB implementation, which is not in - * memory, which might want to export to memory. - * - * @return all user preference data - */ - FastByIDMap exportWithPrefs() throws TasteException; - - FastByIDMap exportWithIDsOnly() throws TasteException; - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/Preference.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/Preference.java deleted file mode 100644 index 1e164ec0a..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/Preference.java +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.model; - -/** - *

- * A {@code Preference} encapsulates an item and a preference value, which indicates the strength of the - * preference for it. {@code Preference}s are associated with users.

- */ -public interface Preference { - - /** @return ID of user who prefers the item */ - long getUserID(); - - /** @return item ID that is preferred */ - long getItemID(); - - /** - * @return strength of the preference for that item. Zero should indicate "no preference either way"; - * positive values indicate preference and negative values indicate dislike - */ - float getValue(); - - /** - * Sets the strength of the preference for this item - * - * @param value - * new preference - */ - void setValue(float value); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/PreferenceArray.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/PreferenceArray.java deleted file mode 100644 index de67d0016..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/PreferenceArray.java +++ /dev/null @@ -1,143 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.model; - -import java.io.Serializable; - -/** - * An alternate representation of an array of {@link Preference}. Implementations, in theory, can produce a - * more memory-efficient representation. - */ -public interface PreferenceArray extends Cloneable, Serializable, Iterable { - - /** - * @return size of length of the "array" - */ - int length(); - - /** - * @param i - * index - * @return a materialized {@link Preference} representation of the preference at i - */ - Preference get(int i); - - /** - * Sets preference at i from information in the given {@link Preference} - * - * @param i - * @param pref - */ - void set(int i, Preference pref); - - /** - * @param i - * index - * @return user ID from preference at i - */ - long getUserID(int i); - - /** - * Sets user ID for preference at i. - * - * @param i - * index - * @param userID - * new user ID - */ - void setUserID(int i, long userID); - - /** - * @param i - * index - * @return item ID from preference at i - */ - long getItemID(int i); - - /** - * Sets item ID for preference at i. - * - * @param i - * index - * @param itemID - * new item ID - */ - void setItemID(int i, long itemID); - - /** - * @return all user or item IDs - */ - long[] getIDs(); - - /** - * @param i - * index - * @return preference value from preference at i - */ - float getValue(int i); - - /** - * Sets preference value for preference at i. 
- * - * @param i - * index - * @param value - * new preference value - */ - void setValue(int i, float value); - - /** - * @return independent copy of this object - */ - PreferenceArray clone(); - - /** - * Sorts underlying array by user ID, ascending. - */ - void sortByUser(); - - /** - * Sorts underlying array by item ID, ascending. - */ - void sortByItem(); - - /** - * Sorts underlying array by preference value, ascending. - */ - void sortByValue(); - - /** - * Sorts underlying array by preference value, descending. - */ - void sortByValueReversed(); - - /** - * @param userID - * user ID - * @return true if array contains a preference with given user ID - */ - boolean hasPrefWithUserID(long userID); - - /** - * @param itemID - * item ID - * @return true if array contains a preference with given item ID - */ - boolean hasPrefWithItemID(long itemID); - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/UpdatableIDMigrator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/UpdatableIDMigrator.java deleted file mode 100644 index ff29a34a6..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/model/UpdatableIDMigrator.java +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.model; - -import org.apache.mahout.cf.taste.common.TasteException; - -public interface UpdatableIDMigrator extends IDMigrator { - - /** - * Stores the reverse long-to-String mapping in some kind of backing store. Note that this must be called - * directly (or indirectly through {@link #initialize(Iterable)}) for every String that might be encountered - * in the application, or else the mapping will not be known. - * - * @param longID - * long ID - * @param stringID - * string ID that maps to/from that long ID - * @throws TasteException - * if an error occurs while saving the mapping - */ - void storeMapping(long longID, String stringID) throws TasteException; - - /** - * Make the mapping aware of the given string IDs. This must be called initially before the implementation - * is used, or else it will not be aware of reverse long-to-String mappings. 
- * - * @throws TasteException - * if an error occurs while storing the mappings - */ - void initialize(Iterable stringIDs) throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/neighborhood/UserNeighborhood.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/neighborhood/UserNeighborhood.java deleted file mode 100644 index 2a143e13d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/neighborhood/UserNeighborhood.java +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.neighborhood; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; - -/** - *

- * Implementations of this interface compute a "neighborhood" of users similar to a given user. This neighborhood - * can then be used to compute recommendations.

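A minimal usage sketch (not part of the removed source; NearestNUserNeighborhood and PearsonCorrelationSimilarity are stock implementations, and the DataModel is assumed to be built elsewhere):

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
import org.apache.mahout.cf.taste.similarity.UserSimilarity;

class NeighborhoodExample {
  static long[] neighborsOf(DataModel model, long userID) throws TasteException {
    UserSimilarity similarity = new PearsonCorrelationSimilarity(model);
    UserNeighborhood neighborhood = new NearestNUserNeighborhood(10, similarity, model);
    return neighborhood.getUserNeighborhood(userID); // up to the 10 most similar users
  }
}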
- */ -public interface UserNeighborhood extends Refreshable { - - /** - * @param userID - * ID of user for which a neighborhood will be computed - * @return IDs of users in the neighborhood - * @throws TasteException - * if an error occurs while accessing data - */ - long[] getUserNeighborhood(long userID) throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/CandidateItemsStrategy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/CandidateItemsStrategy.java deleted file mode 100644 index f87d9cc51..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/CandidateItemsStrategy.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.recommender; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; - -/** - * Used to retrieve all items that could possibly be recommended to the user - */ -public interface CandidateItemsStrategy { - - /** - * @return IDs of all items that could be recommended to the user - */ - FastIDSet getCandidateItems(long userID, PreferenceArray preferencesFromUser, DataModel dataModel) - throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/ClusteringRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/ClusteringRecommender.java deleted file mode 100644 index 75f1c4c80..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/ClusteringRecommender.java +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.recommender; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; - -/** - *

- * Interface implemented by "clustering" recommenders.

- */ -public interface ClusteringRecommender extends Recommender { - - /** - *

- * Returns the cluster of users to which the given user, denoted by user ID, belongs.

- * - * @param userID - * user ID for which to find a cluster - * @return {@link FastIDSet} of IDs of users in the requested user's cluster - * @throws TasteException - * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel} - */ - FastIDSet getCluster(long userID) throws TasteException; - - /** - *

- * Returns all clusters of users.

- * - * @return array of {@link FastIDSet}s of user IDs - * @throws TasteException - * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel} - */ - FastIDSet[] getClusters() throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/IDRescorer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/IDRescorer.java deleted file mode 100644 index 9fc6a8fa7..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/IDRescorer.java +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.recommender; - -/** - *

- * A {@link Rescorer} which operates on {@code long} primitive IDs, rather than arbitrary {@link Object}s. - * This is provided since most uses of this interface in the framework take IDs (as {@code long}) as an - * argument, and so this can be used to avoid unnecessary boxing/unboxing.

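A small sketch of a typical implementation (added for illustration; the excluded ID is hypothetical):

IDRescorer rescorer = new IDRescorer() {
  @Override
  public double rescore(long id, double originalScore) {
    return isFiltered(id) ? Double.NaN : originalScore; // NaN excludes the item entirely
  }
  @Override
  public boolean isFiltered(long id) {
    return id == 12345L; // never recommend this item
  }
};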
- */ -public interface IDRescorer { - - /** - * @param id - * ID of thing (user, item, etc.) to rescore - * @param originalScore - * original score - * @return modified score, or {@link Double#NaN} to indicate that this should be excluded entirely - */ - double rescore(long id, double originalScore); - - /** - * Returns {@code true} to exclude the given thing. - * - * @param id - * ID of thing (user, item, etc.) to rescore - * @return {@code true} to exclude, {@code false} otherwise - */ - boolean isFiltered(long id); - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/ItemBasedRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/ItemBasedRecommender.java deleted file mode 100644 index 570f85149..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/ItemBasedRecommender.java +++ /dev/null @@ -1,145 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.recommender; - -import java.util.List; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.common.LongPair; - -/** - *

- * Interface implemented by "item-based" recommenders.

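A minimal usage sketch (not part of the removed source; GenericItemBasedRecommender and PearsonCorrelationSimilarity are stock implementations, and the DataModel and item ID are assumed to come from elsewhere):

import java.util.List;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.cf.taste.similarity.ItemSimilarity;

class MostSimilarItemsExample {
  static List<RecommendedItem> similarTo(DataModel model, long itemID) throws TasteException {
    ItemSimilarity similarity = new PearsonCorrelationSimilarity(model);
    ItemBasedRecommender recommender = new GenericItemBasedRecommender(model, similarity);
    return recommender.mostSimilarItems(itemID, 5); // the 5 items most similar to itemID
  }
}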
- */ -public interface ItemBasedRecommender extends Recommender { - - /** - * @param itemID - * ID of item for which to find most similar other items - * @param howMany - * desired number of most similar items to find - * @return items most similar to the given item, ordered from most similar to least - * @throws TasteException - * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel} - */ - List mostSimilarItems(long itemID, int howMany) throws TasteException; - - /** - * @param itemID - * ID of item for which to find most similar other items - * @param howMany - * desired number of most similar items to find - * @param rescorer - * {@link Rescorer} which can adjust item-item similarity estimates used to determine most similar - * items - * @return itemss most similar to the given item, ordered from most similar to least - * @throws TasteException - * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel} - */ - List mostSimilarItems(long itemID, int howMany, Rescorer rescorer) throws TasteException; - - /** - * @param itemIDs - * IDs of item for which to find most similar other items - * @param howMany - * desired number of most similar items to find estimates used to determine most similar items - * @return items most similar to the given items, ordered from most similar to least - * @throws TasteException - * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel} - */ - List mostSimilarItems(long[] itemIDs, int howMany) throws TasteException; - - /** - * @param itemIDs - * IDs of item for which to find most similar other items - * @param howMany - * desired number of most similar items to find - * @param rescorer - * {@link Rescorer} which can adjust item-item similarity estimates used to determine most similar - * items - * @return items most similar to the given items, ordered from most similar to least - * @throws TasteException - * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel} - */ - List mostSimilarItems(long[] itemIDs, - int howMany, - Rescorer rescorer) throws TasteException; - - /** - * @param itemIDs - * IDs of item for which to find most similar other items - * @param howMany - * desired number of most similar items to find - * @param excludeItemIfNotSimilarToAll - * exclude an item if it is not similar to each of the input items - * @return items most similar to the given items, ordered from most similar to least - * @throws TasteException - * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel} - */ - List mostSimilarItems(long[] itemIDs, - int howMany, - boolean excludeItemIfNotSimilarToAll) throws TasteException; - - /** - * @param itemIDs - * IDs of item for which to find most similar other items - * @param howMany - * desired number of most similar items to find - * @param rescorer - * {@link Rescorer} which can adjust item-item similarity estimates used to determine most similar - * items - * @param excludeItemIfNotSimilarToAll - * exclude an item if it is not similar to each of the input items - * @return items most similar to the given items, ordered from most similar to least - * @throws TasteException - * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel} - */ - List mostSimilarItems(long[] itemIDs, - int howMany, - Rescorer rescorer, - boolean excludeItemIfNotSimilarToAll) throws TasteException; - - /** - *

- * Lists the items that were most influential in recommending a given item to a given user. Exactly how this - * is determined is left to the implementation, but generally this will return items that the user prefers - * and that are similar to the given item. - *

- * - *

- * This returns a {@link List} of {@link RecommendedItem}, which is a little misleading since it's returning - * the items that drove a recommendation rather than recommended items, but it seems more natural to just reuse this class since it - * encapsulates an item and a value. The value here does not necessarily have a consistent interpretation or - * expected range; it will be higher the more influential the item was in the recommendation. - *

- * - * @param userID - * ID of user who was recommended the item - * @param itemID - * ID of item that was recommended - * @param howMany - * maximum number of items to return - * @return {@link List} of {@link RecommendedItem}, ordered from most influential in recommending the given - * item to least - * @throws TasteException - * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel} - */ - List<RecommendedItem> recommendedBecause(long userID, long itemID, int howMany) throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/MostSimilarItemsCandidateItemsStrategy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/MostSimilarItemsCandidateItemsStrategy.java deleted file mode 100644 index f5cdc37a1..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/MostSimilarItemsCandidateItemsStrategy.java +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.recommender; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.model.DataModel; - -/** - * Used to retrieve all items that could possibly be similar. - */ -public interface MostSimilarItemsCandidateItemsStrategy { - - FastIDSet getCandidateItems(long[] itemIDs, DataModel dataModel) throws TasteException; -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/RecommendedItem.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/RecommendedItem.java deleted file mode 100644 index e5bd37585..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/RecommendedItem.java +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership.
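The interface above pairs naturally with GenericItemBasedRecommender from Mahout's impl packages. Below is a minimal usage sketch of mostSimilarItems() and recommendedBecause(); the preferences file path and the user/item IDs are hypothetical.

import java.io.File;
import java.util.List;

import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.cf.taste.similarity.ItemSimilarity;

public class ItemBasedExample {
  public static void main(String[] args) throws Exception {
    // "prefs.csv" is a hypothetical userID,itemID,value file.
    DataModel model = new FileDataModel(new File("prefs.csv"));
    ItemSimilarity similarity = new PearsonCorrelationSimilarity(model);
    ItemBasedRecommender recommender = new GenericItemBasedRecommender(model, similarity);
    // Five items most similar to item 123, most similar first.
    List<RecommendedItem> similar = recommender.mostSimilarItems(123L, 5);
    // The items that most influenced recommending item 123 to user 42.
    List<RecommendedItem> because = recommender.recommendedBecause(42L, 123L, 3);
    System.out.println(similar);
    System.out.println(because);
  }
}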
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.recommender; - -/** - *

- * Implementations encapsulate items that are recommended, and include the item recommended and a value - * expressing the strength of the preference. - *

- */ -public interface RecommendedItem { - - /** @return the recommended item ID */ - long getItemID(); - - /** - *

- * A value expressing the strength of the preference for the recommended item. The range of the values - * depends on the implementation. Implementations must use larger values to express stronger preference. - *

- * - * @return strength of the preference - */ - float getValue(); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/Recommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/Recommender.java deleted file mode 100644 index cdd7adcf0..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/Recommender.java +++ /dev/null @@ -1,100 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.recommender; - -import java.util.List; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.model.DataModel; - -/** - *

- * Implementations of this interface can recommend items for a user. Implementations will likely take - * advantage of several classes in other packages here to compute this. - *

- */ -public interface Recommender extends Refreshable { - - /** - * @param userID - * user for which recommendations are to be computed - * @param howMany - * desired number of recommendations - * @return {@link List} of recommended {@link RecommendedItem}s, ordered from most strongly recommended to - * least - * @throws TasteException - * if an error occurs while accessing the {@link DataModel} - */ - List<RecommendedItem> recommend(long userID, int howMany) throws TasteException; - - /** - * @param userID - * user for which recommendations are to be computed - * @param howMany - * desired number of recommendations - * @param rescorer - * rescoring function to apply before final list of recommendations is determined - * @return {@link List} of recommended {@link RecommendedItem}s, ordered from most strongly recommended to - * least - * @throws TasteException - * if an error occurs while accessing the {@link DataModel} - */ - List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException; - - /** - * @param userID - * user ID whose preference is to be estimated - * @param itemID - * item ID to estimate preference for - * @return an estimated preference if the user has not expressed a preference for the item, or else the - * user's actual preference for the item. If a preference cannot be estimated, returns - * {@link Double#NaN} - * @throws TasteException - * if an error occurs while accessing the {@link DataModel} - */ - float estimatePreference(long userID, long itemID) throws TasteException; - - /** - * @param userID - * user to set preference for - * @param itemID - * item to set preference for - * @param value - * preference value - * @throws TasteException - * if an error occurs while accessing the {@link DataModel} - */ - void setPreference(long userID, long itemID, float value) throws TasteException; - - /** - * @param userID - * user from which to remove preference - * @param itemID - * item for which to remove preference - * @throws TasteException - * if an error occurs while accessing the {@link DataModel} - */ - void removePreference(long userID, long itemID) throws TasteException; - - /** - * @return underlying {@link DataModel} used by this implementation - */ - DataModel getDataModel(); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/Rescorer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/Rescorer.java deleted file mode 100644 index 367723499..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/Rescorer.java +++ /dev/null @@ -1,52 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License.
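Any Recommender implementation can be exercised the same way. A minimal sketch against the interface above; user 42 and item 123 are hypothetical IDs.

import java.util.List;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.cf.taste.recommender.Recommender;

public class RecommenderUsage {
  static void show(Recommender rec) throws TasteException {
    // Ten recommendations for user 42, strongest first.
    List<RecommendedItem> top = rec.recommend(42L, 10);
    for (RecommendedItem item : top) {
      System.out.println(item.getItemID() + " -> " + item.getValue());
    }
    // NaN when the preference cannot be estimated.
    float est = rec.estimatePreference(42L, 123L);
    System.out.println("estimated preference: " + est);
  }
}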
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.recommender; - -/** - *

- * A {@code Rescorer} simply assigns a new "score" to a thing like an ID of an item or user which a - * {@link Recommender} is considering returning as a top recommendation. It may be used to arbitrarily re-rank - * the results according to application-specific logic before returning recommendations. For example, an - * application may want to boost the score of items in a certain category just for one request. - *

- * - *

- * A {@code Rescorer} can also exclude a thing from consideration entirely by returning {@code true} from - * {@link #isFiltered(Object)}. - *

- */ -public interface Rescorer<T> { - - /** - * @param thing - * thing to rescore - * @param originalScore - * original score - * @return modified score, or {@link Double#NaN} to indicate that this should be excluded entirely - */ - double rescore(T thing, double originalScore); - - /** - * Returns {@code true} to exclude the given thing. - * - * @param thing - * the thing to filter - * @return {@code true} to exclude, {@code false} otherwise - */ - boolean isFiltered(T thing); -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/UserBasedRecommender.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/UserBasedRecommender.java deleted file mode 100644 index b48593a94..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/UserBasedRecommender.java +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.cf.taste.recommender; - -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.common.LongPair; - -/** - *
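For Recommender.recommend(), Taste uses IDRescorer, the primitive-long specialization of the same idea. A sketch of a rescorer that boosts a hypothetical set of promoted item IDs; the 1.5x boost factor is arbitrary.

import org.apache.mahout.cf.taste.impl.common.FastIDSet;
import org.apache.mahout.cf.taste.recommender.IDRescorer;

public class CategoryBoostRescorer implements IDRescorer {

  private final FastIDSet promotedItems; // hypothetical IDs of items to promote

  public CategoryBoostRescorer(FastIDSet promotedItems) {
    this.promotedItems = promotedItems;
  }

  @Override
  public double rescore(long itemID, double originalScore) {
    // Any application-specific re-ranking logic works here.
    return promotedItems.contains(itemID) ? originalScore * 1.5 : originalScore;
  }

  @Override
  public boolean isFiltered(long itemID) {
    return false; // return true to exclude an ID from consideration entirely
  }
}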

- * Interface implemented by "user-based" recommenders. - *

- */ -public interface UserBasedRecommender extends Recommender { - - /** - * @param userID - * ID of user for which to find most similar other users - * @param howMany - * desired number of most similar users to find - * @return users most similar to the given user - * @throws TasteException - * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel} - */ - long[] mostSimilarUserIDs(long userID, int howMany) throws TasteException; - - /** - * @param userID - * ID of user for which to find most similar other users - * @param howMany - * desired number of most similar users to find - * @param rescorer - * {@link Rescorer} which can adjust user-user similarity estimates used to determine most similar - * users - * @return IDs of users most similar to the given user - * @throws TasteException - * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel} - */ - long[] mostSimilarUserIDs(long userID, int howMany, Rescorer rescorer) throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/slopeone/DiffStorage.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/slopeone/DiffStorage.java deleted file mode 100644 index 2fc11aa26..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/recommender/slopeone/DiffStorage.java +++ /dev/null @@ -1,91 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.recommender.slopeone; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.RunningAverage; -import org.apache.mahout.cf.taste.model.PreferenceArray; - -/** - *
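A minimal sketch wiring the standard Mahout 0.7 pieces together to obtain a UserBasedRecommender; the data file path, neighborhood size, and user ID are hypothetical.

import java.io.File;
import java.util.Arrays;

import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
import org.apache.mahout.cf.taste.recommender.UserBasedRecommender;
import org.apache.mahout.cf.taste.similarity.UserSimilarity;

public class UserBasedExample {
  public static void main(String[] args) throws Exception {
    DataModel model = new FileDataModel(new File("prefs.csv")); // hypothetical path
    UserSimilarity similarity = new PearsonCorrelationSimilarity(model);
    UserNeighborhood neighborhood = new NearestNUserNeighborhood(10, similarity, model);
    UserBasedRecommender recommender =
        new GenericUserBasedRecommender(model, neighborhood, similarity);
    // The five users whose preferences look most like user 42's.
    long[] neighbors = recommender.mostSimilarUserIDs(42L, 5);
    System.out.println(Arrays.toString(neighbors));
  }
}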

- * Implementations store item-item preference diffs for a - * {@link org.apache.mahout.cf.taste.impl.recommender.slopeone.SlopeOneRecommender}. The interface actually covers a bit - * more than diffs, like listing all items that may be considered for recommendation, in order to - * maximize what implementations can do to optimize the slope-one algorithm. - *

- * - * @see org.apache.mahout.cf.taste.impl.recommender.slopeone.SlopeOneRecommender - */ -public interface DiffStorage extends Refreshable { - - /** - * @return {@link RunningAverage} encapsulating the average difference in preferences between items - * corresponding to {@code itemID1} and {@code itemID2}, in that direction; that is, it's - * the average of item 2's preferences minus item 1's preferences - */ - RunningAverage getDiff(long itemID1, long itemID2) throws TasteException; - - /** - * @param userID - * user ID to get diffs for - * @param itemID - * itemID to assess - * @param prefs - * user's preferences - * @return {@link RunningAverage}s for that user's item-item diffs - */ - RunningAverage[] getDiffs(long userID, long itemID, PreferenceArray prefs) throws TasteException; - - /** @return {@link RunningAverage} encapsulating the average preference for the given item */ - RunningAverage getAverageItemPref(long itemID) throws TasteException; - - /** - *

Updates internal data structures to reflect a new preference value for an item.

- * - * @param userID user whose pref is being added - * @param itemID item to add preference value for - * @param prefValue new preference value - */ - void addItemPref(long userID, long itemID, float prefValue) throws TasteException; - - /** - *

Updates internal data structures to reflect an update in a preference value for an item.

- * - * @param itemID item to update preference value for - * @param prefDelta amount by which preference value changed - */ - void updateItemPref(long itemID, float prefDelta) throws TasteException; - - /** - *

Updates internal data structures to reflect the removal of a preference value for an item.

- * - * @param userID user whose pref is being removed - * @param itemID item to update preference value for - * @param prefValue old preference value - */ - void removeItemPref(long userID, long itemID, float prefValue) throws TasteException; - - /** - * @return item IDs that may possibly be recommended to the given user, which may not be all items since the - * item-item diff matrix may be sparse - */ - FastIDSet getRecommendableItemIDs(long userID) throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/similarity/ItemSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/similarity/ItemSimilarity.java deleted file mode 100644 index 814610bd5..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/similarity/ItemSimilarity.java +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.similarity; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; - -/** - *
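For orientation, the diffs that DiffStorage serves feed the usual slope-one estimate: each item i the user has rated predicts pref(u,i) + avgDiff(i, j) for a target item j, and the predictions are averaged (Mahout's implementation can also weight them by diff counts). A plain-Java sketch of that arithmetic, independent of the interface above:

import java.util.Map;

public final class SlopeOneSketch {
  // userPrefs: the user's ratings keyed by item ID.
  // diffsToTarget: avg(target item) - avg(rated item i), keyed by i.
  static double estimate(Map<Long, Double> userPrefs, Map<Long, Double> diffsToTarget) {
    double total = 0.0;
    int count = 0;
    for (Map.Entry<Long, Double> pref : userPrefs.entrySet()) {
      Double diff = diffsToTarget.get(pref.getKey());
      if (diff != null) { // the diff matrix may be sparse, as noted above
        total += pref.getValue() + diff;
        count++;
      }
    }
    return count == 0 ? Double.NaN : total / count;
  }
}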

- * Implementations of this interface define a notion of similarity between two items. Implementations should - * return values in the range -1.0 to 1.0, with 1.0 representing perfect similarity. - *

- * - * @see UserSimilarity - */ -public interface ItemSimilarity extends Refreshable { - - /** - *

- * Returns the degree of similarity of two items, based on the preferences that users have expressed for - * the items. - *

- * - * @param itemID1 first item ID - * @param itemID2 second item ID - * @return similarity between the items, in [-1,1], or {@link Double#NaN} if similarity is unknown - * @throws org.apache.mahout.cf.taste.common.NoSuchItemException - * if either item is known to be non-existent in the data - * @throws TasteException if an error occurs while accessing the data - */ - double itemSimilarity(long itemID1, long itemID2) throws TasteException; - - /** - *

A bulk-get version of {@link #itemSimilarity(long, long)}.

- * - * @param itemID1 first item ID - * @param itemID2s second item IDs to compute similarity with - * @return similarity between itemID1 and other items - * @throws org.apache.mahout.cf.taste.common.NoSuchItemException - * if any item is known to be non-existent in the data - * @throws TasteException if an error occurs while accessing the data - */ - double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException; - - /** - * @return all IDs of similar items, in no particular order - */ - long[] allSimilarItemIDs(long itemID) throws TasteException; -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/similarity/PreferenceInferrer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/similarity/PreferenceInferrer.java deleted file mode 100644 index 76bb32892..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/similarity/PreferenceInferrer.java +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.similarity; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; - -/** - *
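A minimal usage sketch with LogLikelihoodSimilarity, one of the stock ItemSimilarity implementations; the file path and item IDs are hypothetical.

import java.io.File;
import java.util.Arrays;

import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.similarity.ItemSimilarity;

public class ItemSimilarityExample {
  public static void main(String[] args) throws Exception {
    DataModel model = new FileDataModel(new File("prefs.csv")); // hypothetical path
    ItemSimilarity similarity = new LogLikelihoodSimilarity(model);
    double one = similarity.itemSimilarity(1L, 2L); // in [-1,1], NaN if unknown
    double[] bulk = similarity.itemSimilarities(1L, new long[] {2L, 3L, 4L});
    System.out.println(one + " " + Arrays.toString(bulk));
  }
}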

- * Implementations of this interface compute an inferred preference for a user and an item that the user has - * not expressed any preference for. This might be an average of the user's other preference scores, for - * example. This technique is sometimes called "default voting". - *

- */ -public interface PreferenceInferrer extends Refreshable { - - /** - *

- * Infers the given user's preference value for an item. - *

- * - * @param userID - * ID of user to infer preference for - * @param itemID - * item ID to infer preference for - * @return inferred preference - * @throws TasteException - * if an error occurs while inferring - */ - float inferPreference(long userID, long itemID) throws TasteException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/similarity/UserSimilarity.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/similarity/UserSimilarity.java deleted file mode 100644 index 929ff3d01..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/similarity/UserSimilarity.java +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.similarity; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; - -/** - *
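A sketch of a "default voting" inferrer in the spirit described above, returning the mean of the user's known preferences. Mahout ships a similar, caching AveragingPreferenceInferrer; this stripped-down version just shows the contract.

import java.util.Collection;

import org.apache.mahout.cf.taste.common.Refreshable;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.model.PreferenceArray;
import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;

public class MeanOfUserInferrer implements PreferenceInferrer {

  private final DataModel model;

  public MeanOfUserInferrer(DataModel model) {
    this.model = model;
  }

  @Override
  public float inferPreference(long userID, long itemID) throws TasteException {
    // Fall back to the mean of everything the user has rated.
    PreferenceArray prefs = model.getPreferencesFromUser(userID);
    if (prefs.length() == 0) {
      return 0.0f;
    }
    float total = 0.0f;
    for (int i = 0; i < prefs.length(); i++) {
      total += prefs.getValue(i);
    }
    return total / prefs.length();
  }

  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    // Stateless here; a production version would refresh or invalidate caches.
  }
}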

- * Implementations of this interface define a notion of similarity between two users. Implementations should - * return values in the range -1.0 to 1.0, with 1.0 representing perfect similarity. - *

- * - * @see ItemSimilarity - */ -public interface UserSimilarity extends Refreshable { - - /** - *

- * Returns the degree of similarity of two users, based on their preferences. - *

- * - * @param userID1 first user ID - * @param userID2 second user ID - * @return similarity between the users, in [-1,1], or {@link Double#NaN} if similarity is unknown - * @throws org.apache.mahout.cf.taste.common.NoSuchUserException - * if either user is known to be non-existent in the data - * @throws TasteException if an error occurs while accessing the data - */ - double userSimilarity(long userID1, long userID2) throws TasteException; - - // Should we implement userSimilarities() like ItemSimilarity.itemSimilarities()? - - /** - *

- * Attaches a {@link PreferenceInferrer} to the implementation. - *

- * - * @param inferrer {@link PreferenceInferrer} - */ - void setPreferenceInferrer(PreferenceInferrer inferrer); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/transforms/PreferenceTransform.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/transforms/PreferenceTransform.java deleted file mode 100644 index 75ccfa81e..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/transforms/PreferenceTransform.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.transforms; - -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.model.Preference; - -/** - *

- * Implementations encapsulate a transform on a {@link Preference}'s value. These transformations are - * typically applied to values before they are used to compute a similarity value. They are typically not - * applied elsewhere; in particular {@link org.apache.mahout.cf.taste.model.DataModel}s no longer use a - * transform like this to transform all of their preference values at the source. - *

- */ -public interface PreferenceTransform extends Refreshable { - - float getTransformedValue(Preference pref) throws TasteException; - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/transforms/SimilarityTransform.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/transforms/SimilarityTransform.java deleted file mode 100644 index 47234b719..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/cf/taste/transforms/SimilarityTransform.java +++ /dev/null @@ -1,37 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.transforms; - -import org.apache.mahout.cf.taste.common.Refreshable; - -/** - *
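A minimal PreferenceTransform sketch; the log1p damping is an arbitrary illustration, not a stock Mahout transform.

import java.util.Collection;

import org.apache.mahout.cf.taste.common.Refreshable;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.model.Preference;
import org.apache.mahout.cf.taste.transforms.PreferenceTransform;

public class LogDampingTransform implements PreferenceTransform {

  @Override
  public float getTransformedValue(Preference pref) throws TasteException {
    // Damp large raw values before they reach a similarity computation.
    return (float) Math.log1p(pref.getValue());
  }

  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    // Stateless, so nothing to refresh.
  }
}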

- * Implementations encapsulate some transformation on similarity values between two things, where things might - * be IDs of users or items or something else. - *

- */ -public interface SimilarityTransform extends Refreshable { - - /** - * @param value - * original similarity between thing1 and thing2 (should be in [-1,1]) - * @return transformed similarity (should be in [-1,1]) - */ - double transformSimilarity(long id1, long id2, double value); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java deleted file mode 100644 index 7afb4c8ad..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java +++ /dev/null @@ -1,184 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier; - -import org.apache.mahout.math.DenseMatrix; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.Vector; - -import com.google.common.base.Preconditions; - -/** - * Defines the interface for classifiers that take input as a vector. This is implemented - * as an abstract class so that it can implement a number of handy convenience methods - * related to classification of vectors. - */ -public abstract class AbstractVectorClassifier { - // ------ These are all that are necessary to define a vector classifier. - - /** - * Returns the number of categories for the target variable. A vector classifier - * will encode its output using a zero-based 1 of numCategories encoding. - * @return The number of categories. - */ - public abstract int numCategories(); - - /** - * Classify a vector returning a vector of numCategories-1 scores. It is assumed that - * the score for the missing category is one minus the sum of the scores that are returned. - * - * Note that the missing score is the 0-th score. - * @param instance A feature vector to be classified. - * @return A vector of probabilities in 1 of n-1 encoding. - */ - public abstract Vector classify(Vector instance); - - /** - * Classify a vector, but don't apply the inverse link function. For logistic regression - * and other generalized linear models, this is just the linear part of the classification. - * @param features A feature vector to be classified. - * @return A vector of scores. If transformed by the link function, these will become probabilities.
- */ - public Vector classifyNoLink(Vector features) { - throw new UnsupportedOperationException( - this.getClass().getName() + " doesn't support classification without a link"); - } - - /** - * Classifies a vector in the special case of a binary classifier where - * {@link #classify(Vector)} would return a vector with only one element. As such, - * using this method can avoid the allocation of a vector. - * @param instance The feature vector to be classified. - * @return The score for category 1. - * - * @see #classify(Vector) - */ - public abstract double classifyScalar(Vector instance); - - // ------- From here on, we have convenience methods that provide an easier API to use. - - /** - * Returns n probabilities, one for each category. If you can use an n-1 coding, and are touchy - * about allocation performance, then the classify method is probably better to use. The 0-th - * element of the score vector returned by this method is the missing score as computed by the - * classify method. - * - * @see #classify(Vector) - * @see #classifyFull(Vector r, Vector instance) - * - * @param instance A vector of features to be classified. - * @return A vector of probabilities, one for each category. - */ - public Vector classifyFull(Vector instance) { - return classifyFull(new DenseVector(numCategories()), instance); - } - - /** - * Returns n probabilities, one for each category into a pre-allocated vector. One - * vector allocation is still done in the process of multiplying by the coefficient - * matrix, but that is hard to avoid. The cost of such an ephemeral allocation is - * very small in any case compared to the multiplication itself. - * - * @param r Where to put the results. - * @param instance A vector of features to be classified. - * @return A vector of probabilities, one for each category. - */ - public Vector classifyFull(Vector r, Vector instance) { - r.viewPart(1, numCategories() - 1).assign(classify(instance)); - r.setQuick(0, 1.0 - r.zSum()); - return r; - } - - - /** - * Returns n-1 probabilities, one for each category but the first, for each row of a matrix. The - * probability of the missing 0-th category is 1 - rowSum(this result). - * - * @param data The matrix whose rows are vectors to classify - * @return A matrix of scores, one row per row of the input matrix, one column for each but the - * first (0-th) category. - */ - public Matrix classify(Matrix data) { - Matrix r = new DenseMatrix(data.numRows(), numCategories() - 1); - for (int row = 0; row < data.numRows(); row++) { - r.assignRow(row, classify(data.viewRow(row))); - } - return r; - } - - /** - * Returns n probabilities, one for each category, for each row of a matrix. - * - * @param data The matrix whose rows are vectors to classify - * @return A matrix of scores, one row per row of the input matrix, one column for each category. - */ - public Matrix classifyFull(Matrix data) { - Matrix r = new DenseMatrix(data.numRows(), numCategories()); - for (int row = 0; row < data.numRows(); row++) { - classifyFull(r.viewRow(row), data.viewRow(row)); - } - return r; - } - - /** - * Returns a vector of probabilities of the first category, one for each row of a matrix. This - * only makes sense if there are exactly two categories, but calling this method in that case can - * save a number of vector allocations. - * - * @param data The matrix whose rows are vectors to classify - * @return A vector of scores, with one value per row of the input matrix.
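The 1-of-(n-1) encoding above is easy to miss, so here is a plain-Java illustration of how classifyFull() reconstructs the missing 0-th score; the numbers are made up.

public final class EncodingSketch {
  public static void main(String[] args) {
    double[] partial = {0.2, 0.5}; // what classify() returns: scores for categories 1 and 2
    double[] full = new double[partial.length + 1];
    double sum = 0.0;
    for (int i = 0; i < partial.length; i++) {
      full[i + 1] = partial[i];
      sum += partial[i];
    }
    full[0] = 1.0 - sum; // missing 0-th category: 1 - 0.7 = 0.3
    // full is now {0.3, 0.2, 0.5}, mirroring classifyFull()'s viewPart/zSum reconstruction.
  }
}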
- */ - public Vector classifyScalar(Matrix data) { - Preconditions.checkArgument(numCategories() == 2, "Can only call classifyScalar with two categories"); - - Vector r = new DenseVector(data.numRows()); - for (int row = 0; row < data.numRows(); row++) { - r.set(row, classifyScalar(data.viewRow(row))); - } - return r; - } - - /** - * Returns a measure of how good the classification for a particular example actually is. - * - * @param actual The correct category for the example. - * @param data The vector to be classified. - * @return The log likelihood of the correct answer as estimated by the current model. This will - * always be <= 0 and larger (closer to 0) indicates better accuracy. In order to simplify - * code that maintains running averages, we bound this value at -100. - */ - public double logLikelihood(int actual, Vector data) { - if (numCategories() == 2) { - double p = classifyScalar(data); - if (actual > 0) { - return Math.max(-100.0, Math.log(p)); - } else { - return Math.max(-100.0, Math.log1p(-p)); - } - } else { - Vector p = classify(data); - if (actual > 0) { - return Math.max(-100.0, Math.log(p.get(actual - 1))); - } else { - return Math.max(-100.0, Math.log1p(-p.zSum())); - } - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/ClassifierResult.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/ClassifierResult.java deleted file mode 100644 index 00b475193..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/ClassifierResult.java +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier; - -import java.util.Comparator; - -/** - * Result of a document classification. The label and the associated score (usually a probability). - */ -public class ClassifierResult { - - public static final Comparator<ClassifierResult> COMPARE_BY_SCORE_AND_LABEL = - new Comparator<ClassifierResult>() { - @Override - public int compare(ClassifierResult cr1, ClassifierResult cr2) { - return cr1.score < cr2.score ? 1 : cr1.score > cr2.score ?
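A quick worked instance of logLikelihood()'s binary branch; p is a made-up classifyScalar() output.

public final class LogLikelihoodSketch {
  public static void main(String[] args) {
    double p = 0.9; // pretend score for category 1
    double llActual1 = Math.max(-100.0, Math.log(p));    // ~ -0.105: good prediction
    double llActual0 = Math.max(-100.0, Math.log1p(-p)); // log(0.1) ~ -2.303: poor prediction
    // With p == 1.0 and actual == 0, log1p(-1.0) is -Infinity, so the -100 floor applies.
    System.out.println(llActual1 + " " + llActual0);
  }
}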
-1 : cr1.label.compareTo(cr2.label); - } - }; - - private String label; - private double score; - private double logLikelihood = Double.MAX_VALUE; - - public ClassifierResult() { } - - public ClassifierResult(String label, double score) { - this.label = label; - this.score = score; - } - - public ClassifierResult(String label) { - this.label = label; - } - - public ClassifierResult(String label, double score, double logLikelihood) { - this.label = label; - this.score = score; - this.logLikelihood = logLikelihood; - } - - public double getLogLikelihood() { - return logLikelihood; - } - - public void setLogLikelihood(double logLikelihood) { - this.logLikelihood = logLikelihood; - } - - public String getLabel() { - return label; - } - - public double getScore() { - return score; - } - - public void setLabel(String label) { - this.label = label; - } - - public void setScore(double score) { - this.score = score; - } - - @Override - public String toString() { - return "ClassifierResult{" + "category='" + label + '\'' + ", score=" + score + '}'; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java deleted file mode 100644 index c5a9f68f8..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java +++ /dev/null @@ -1,256 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier; - -import java.util.Collection; -import java.util.Collections; -import java.util.Map; - -import org.apache.commons.lang.StringUtils; -import org.apache.mahout.math.DenseMatrix; -import org.apache.mahout.math.Matrix; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -/** - * The ConfusionMatrix class stores the result of classification of a test dataset. - * - * The fact of whether there is a default is not stored. A row of zeros is the only indicator that there is no default.
- * - * See http://en.wikipedia.org/wiki/Confusion_matrix for background - */ -public class ConfusionMatrix { - private final Map labelMap = Maps.newLinkedHashMap(); - private final int[][] confusionMatrix; - private String defaultLabel = "unknown"; - - public ConfusionMatrix(Collection labels, String defaultLabel) { - confusionMatrix = new int[labels.size() + 1][labels.size() + 1]; - this.defaultLabel = defaultLabel; - int i = 0; - for (String label : labels) { - labelMap.put(label, i++); - } - labelMap.put(defaultLabel, i); - } - - public ConfusionMatrix(Matrix m) { - confusionMatrix = new int[m.numRows()][m.numRows()]; - setMatrix(m); - } - - public int[][] getConfusionMatrix() { - return confusionMatrix; - } - - public Collection getLabels() { - return Collections.unmodifiableCollection(labelMap.keySet()); - } - - public double getAccuracy(String label) { - int labelId = labelMap.get(label); - int labelTotal = 0; - int correct = 0; - for (int i = 0; i < labelMap.size(); i++) { - labelTotal += confusionMatrix[labelId][i]; - if (i == labelId) { - correct = confusionMatrix[labelId][i]; - } - } - return 100.0 * correct / labelTotal; - } - - public int getCorrect(String label) { - int labelId = labelMap.get(label); - return confusionMatrix[labelId][labelId]; - } - - public int getTotal(String label) { - int labelId = labelMap.get(label); - int labelTotal = 0; - for (int i = 0; i < labelMap.size(); i++) { - labelTotal += confusionMatrix[labelId][i]; - } - return labelTotal; - } - - public void addInstance(String correctLabel, ClassifierResult classifiedResult) { - incrementCount(correctLabel, classifiedResult.getLabel()); - } - - public void addInstance(String correctLabel, String classifiedLabel) { - incrementCount(correctLabel, classifiedLabel); - } - - public int getCount(String correctLabel, String classifiedLabel) { - Preconditions.checkArgument(labelMap.containsKey(correctLabel), "Label not found: " + correctLabel); - Preconditions.checkArgument(labelMap.containsKey(classifiedLabel), "Label not found: " + classifiedLabel); - int correctId = labelMap.get(correctLabel); - int classifiedId = labelMap.get(classifiedLabel); - return confusionMatrix[correctId][classifiedId]; - } - - public void putCount(String correctLabel, String classifiedLabel, int count) { - Preconditions.checkArgument(labelMap.containsKey(correctLabel), "Label not found: " + correctLabel); - Preconditions.checkArgument(labelMap.containsKey(classifiedLabel), "Label not found: " + classifiedLabel); - int correctId = labelMap.get(correctLabel); - int classifiedId = labelMap.get(classifiedLabel); - confusionMatrix[correctId][classifiedId] = count; - } - - public String getDefaultLabel() { - return defaultLabel; - } - - public void incrementCount(String correctLabel, String classifiedLabel, int count) { - putCount(correctLabel, classifiedLabel, count + getCount(correctLabel, classifiedLabel)); - } - - public void incrementCount(String correctLabel, String classifiedLabel) { - incrementCount(correctLabel, classifiedLabel, 1); - } - - public ConfusionMatrix merge(ConfusionMatrix b) { - Preconditions.checkArgument(labelMap.size() == b.getLabels().size(), "The label sizes do not match"); - for (String correctLabel : this.labelMap.keySet()) { - for (String classifiedLabel : this.labelMap.keySet()) { - incrementCount(correctLabel, classifiedLabel, b.getCount(correctLabel, classifiedLabel)); - } - } - return this; - } - - public Matrix getMatrix() { - int length = confusionMatrix.length; - Matrix m = new DenseMatrix(length, 
length); - for (int r = 0; r < length; r++) { - for (int c = 0; c < length; c++) { - m.set(r, c, confusionMatrix[r][c]); - } - } - Map labels = Maps.newHashMap(); - for (Map.Entry entry : labelMap.entrySet()) { - labels.put(entry.getKey(), entry.getValue()); - } - m.setRowLabelBindings(labels); - m.setColumnLabelBindings(labels); - return m; - } - - public void setMatrix(Matrix m) { - int length = confusionMatrix.length; - if (m.numRows() != m.numCols()) { - throw new IllegalArgumentException( - "ConfusionMatrix: matrix(" + m.numRows() + ',' + m.numCols() + ") must be square"); - } - for (int r = 0; r < length; r++) { - for (int c = 0; c < length; c++) { - confusionMatrix[r][c] = (int) Math.round(m.get(r, c)); - } - } - Map labels = m.getRowLabelBindings(); - if (labels == null) { - labels = m.getColumnLabelBindings(); - } - if (labels != null) { - String[] sorted = sortLabels(labels); - verifyLabels(length, sorted); - labelMap.clear(); - for (int i = 0; i < length; i++) { - labelMap.put(sorted[i], i); - } - } - } - - private static String[] sortLabels(Map labels) { - String[] sorted = new String[labels.keySet().size()]; - for (String label: labels.keySet()) { - Integer index = labels.get(label); - sorted[index] = label; - } - return sorted; - } - - private static void verifyLabels(int length, String[] sorted) { - Preconditions.checkArgument(sorted.length == length, "One label, one row"); - for (int i = 0; i < length; i++) { - if (sorted[i] == null) { - Preconditions.checkArgument(false, "One label, one row"); - } - } - } - - /** - * This is overloaded. toString() is not a formatted report you print for a manager :) - * Assume that if there are no default assignments, the default feature was not used - */ - @Override - public String toString() { - StringBuilder returnString = new StringBuilder(200); - returnString.append("=======================================================").append('\n'); - returnString.append("Confusion Matrix\n"); - returnString.append("-------------------------------------------------------").append('\n'); - - int unclassified = getTotal(defaultLabel); - for (Map.Entry entry : this.labelMap.entrySet()) { - if (entry.getKey().equals(defaultLabel) && unclassified == 0) { - continue; - } - - returnString.append(StringUtils.rightPad(getSmallLabel(entry.getValue()), 5)).append('\t'); - } - - returnString.append("<--Classified as").append('\n'); - for (Map.Entry entry : this.labelMap.entrySet()) { - if (entry.getKey().equals(defaultLabel) && unclassified == 0) { - continue; - } - String correctLabel = entry.getKey(); - int labelTotal = 0; - for (String classifiedLabel : this.labelMap.keySet()) { - if (classifiedLabel.equals(defaultLabel) && unclassified == 0) { - continue; - } - returnString.append( - StringUtils.rightPad(Integer.toString(getCount(correctLabel, classifiedLabel)), 5)).append('\t'); - labelTotal += getCount(correctLabel, classifiedLabel); - } - returnString.append(" | ").append(StringUtils.rightPad(String.valueOf(labelTotal), 6)).append('\t') - .append(StringUtils.rightPad(getSmallLabel(entry.getValue()), 5)) - .append(" = ").append(correctLabel).append('\n'); - } - if (unclassified > 0) { - returnString.append("Default Category: ").append(defaultLabel).append(": ").append(unclassified).append('\n'); - } - returnString.append('\n'); - return returnString.toString(); - } - - static String getSmallLabel(int i) { - int val = i; - StringBuilder returnString = new StringBuilder(); - do { - int n = val % 26; - returnString.insert(0, (char) ('a' + n)); - val /= 
26; - } while (val > 0); - return returnString.toString(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java deleted file mode 100644 index af1d5e752..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier; - -import org.apache.mahout.math.Vector; - -import java.io.Closeable; - -/** - * The simplest interface for online learning algorithms. - */ -public interface OnlineLearner extends Closeable { - /** - * Updates the model using a particular target variable value and a feature vector. - *
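A minimal usage sketch of the ConfusionMatrix class above; the labels and counts are made up. Note that getSmallLabel() renders column labels in base 26, so 0 is "a", 1 is "b", and 26 comes out as "ba".

import java.util.Arrays;

import org.apache.mahout.classifier.ConfusionMatrix;

public class ConfusionMatrixExample {
  public static void main(String[] args) {
    ConfusionMatrix cm = new ConfusionMatrix(Arrays.asList("spam", "ham"), "unknown");
    cm.addInstance("spam", "spam"); // correct
    cm.addInstance("spam", "ham");  // spam misclassified as ham
    cm.addInstance("ham", "ham");   // correct
    System.out.println(cm.getAccuracy("spam")); // 50.0: one of the two spam rows was correct
    System.out.println(cm);                     // tabular report described in toString() above
  }
}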

- * There may be an assumption that if multiple passes through the training data are necessary, then - * the training examples will be presented in the same order. This is because the order of - * training examples may be used to assign records to different data splits for evaluation by - * cross-validation. Without the order invariance, records might be assigned inconsistently to training and test - * splits and error estimates could be seriously affected. - *

- * If re-ordering is necessary, then the alternative API, which allows a tracking key to be - * attached to each training example, can be used instead. - * - * @param actual The value of the target variable. This value should be in the half-open - * interval [0..n) where n is the number of target categories. - * @param instance The feature vector for this example. - */ - void train(int actual, Vector instance); - - /** - * Updates the model using a particular target variable value and a feature vector. - *

- * There may be an assumption that, if multiple passes through the training data are necessary, - * the tracking key for a record will be the same for each pass, that there will be a - * relatively large number of distinct tracking keys, and that the low-order bits of the tracking - * keys will not correlate with any of the input variables. This tracking key is used to assign - * training examples to different test/training splits. - *

- * Examples of useful tracking keys include id-numbers for the training records derived from - * a database id for the base table from which the record is derived, or the offset of - * the original data record in a data file. - * - * @param trackingKey The tracking key for this training example. - * @param groupKey An optional value that allows examples to be grouped in the computation of - * the update to the model. - * @param actual The value of the target variable. This value should be in the half-open - * interval [0..n) where n is the number of target categories. - * @param instance The feature vector for this example. - */ - void train(long trackingKey, String groupKey, int actual, Vector instance); - - /** - * Updates the model using a particular target variable value and a feature vector. - *

- * There may be an assumption that, if multiple passes through the training data are necessary, - * the tracking key for a record will be the same for each pass, that there will be a - * relatively large number of distinct tracking keys, and that the low-order bits of the tracking - * keys will not correlate with any of the input variables. This tracking key is used to assign - * training examples to different test/training splits. - *

- * Examples of useful tracking keys include id-numbers for the training records derived from - * a database id for the base table from which the record is derived, or the offset of - * the original data record in a data file. - * - * @param trackingKey The tracking key for this training example. - * @param actual The value of the target variable. This value should be in the half-open - * interval [0..n) where n is the number of target categories. - * @param instance The feature vector for this example. - */ - void train(long trackingKey, int actual, Vector instance); - - /** - * Prepares the classifier for classification and deallocates any temporary data structures. - * - * An online classifier should be able to accept more training after being closed, but - * closing the classifier may make classification more efficient. - */ - @Override - void close(); -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/RegressionResultAnalyzer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/RegressionResultAnalyzer.java deleted file mode 100644 index 10c54d8f8..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/RegressionResultAnalyzer.java +++ /dev/null @@ -1,127 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
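A minimal sketch of the interface in use via OnlineLogisticRegression from Mahout's SGD package, which implements OnlineLearner; the feature values, prior, and tracking key are all made up.

import org.apache.mahout.classifier.sgd.L1;
import org.apache.mahout.classifier.sgd.OnlineLogisticRegression;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class OnlineLearnerExample {
  public static void main(String[] args) {
    // Two target categories, three features, L1 prior.
    OnlineLogisticRegression learner = new OnlineLogisticRegression(2, 3, new L1());
    Vector features = new DenseVector(new double[] {1.0, 0.5, -0.2});
    learner.train(1, features);        // simplest overload: actual category plus features
    learner.train(1001L, 1, features); // tracking-key overload described above
    double p = learner.classifyScalar(features); // probability of category 1
    learner.close();
    System.out.println(p);
  }
}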
- */ - -package org.apache.mahout.classifier; - -import java.text.DecimalFormat; -import java.text.NumberFormat; -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.lang.StringUtils; - -/** - * RegressionResultAnalyzer captures the regression statistics and displays them in a tabular manner - */ -public class RegressionResultAnalyzer { - - private static class Result { - private final double actual; - private final double result; - Result(double actual, double result) { - this.actual = actual; - this.result = result; - } - double getActual() { - return actual; - } - double getResult() { - return result; - } - } - - private List<Result> results; - - /** - * - * @param actual - * The actual answer - * @param result - * The regression result - */ - public void addInstance(double actual, double result) { - if (results == null) { - results = new ArrayList<Result>(); - } - results.add(new Result(actual, result)); - } - - /** - * - * @param results - * The results table - */ - public void setInstances(double[][] results) { - for (double[] res : results) { - addInstance(res[0], res[1]); - } - } - - @Override - public String toString() { - double sumActual = 0.0; - double sumActualSquared = 0.0; - double sumResult = 0.0; - double sumResultSquared = 0.0; - double sumActualResult = 0.0; - double sumAbsolute = 0.0; - double sumAbsoluteSquared = 0.0; - - for (Result res : results) { - double actual = res.getActual(); - double result = res.getResult(); - sumActual += actual; - sumActualSquared += actual * actual; - sumResult += result; - sumResultSquared += result * result; - sumActualResult += actual * result; - double absolute = Math.abs(actual - result); - sumAbsolute += absolute; - sumAbsoluteSquared += absolute * absolute; - } - - double varActual = sumActualSquared - sumActual * sumActual / results.size(); - double varResult = sumResultSquared - sumResult * sumResult / results.size(); - // covariance of actual and predicted values, used as the correlation numerator - double covActualResult = sumActualResult - sumActual * sumResult / results.size(); - - double correlation; - if (varActual * varResult <= 0) { - correlation = 0.0; - } else { - correlation = covActualResult / Math.sqrt(varActual * varResult); - } - - StringBuilder returnString = new StringBuilder(); - - returnString.append("=======================================================\n"); - returnString.append("Summary\n"); - returnString.append("-------------------------------------------------------\n"); - - NumberFormat decimalFormatter = new DecimalFormat("0.####"); - - returnString.append(StringUtils.rightPad("Correlation coefficient", 40)).append(": ").append( - StringUtils.leftPad(decimalFormatter.format(correlation), 10)).append('\n'); - returnString.append(StringUtils.rightPad("Mean absolute error", 40)).append(": ").append( - StringUtils.leftPad(decimalFormatter.format(sumAbsolute / results.size()), 10)).append('\n'); - returnString.append(StringUtils.rightPad("Root mean squared error", 40)).append(": ").append( - StringUtils.leftPad(decimalFormatter.format(Math.sqrt(sumAbsoluteSquared / results.size())), - 10)).append('\n'); - returnString.append(StringUtils.rightPad("Total Regressed Instances", 40)).append(": ").append( - StringUtils.leftPad(Integer.toString(results.size()), 10)).append('\n'); - returnString.append('\n'); - - return returnString.toString(); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java
b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java deleted file mode 100644 index 34e80451b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java +++ /dev/null @@ -1,110 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier; - -import java.text.DecimalFormat; -import java.text.NumberFormat; -import java.util.Collection; - -import org.apache.commons.lang.StringUtils; -import org.apache.mahout.math.stats.OnlineSummarizer; - -/** - * ResultAnalyzer captures the classification statistics and displays them in a tabular manner - */ -public class ResultAnalyzer { - - private final ConfusionMatrix confusionMatrix; - private final OnlineSummarizer summarizer; - private boolean hasLL = false; - - /* - * === Summary === - * - * Correctly Classified Instances 635 92.9722 % Incorrectly Classified Instances 48 7.0278 % Kappa statistic - * 0.923 Mean absolute error 0.0096 Root mean squared error 0.0817 Relative absolute error 9.9344 % Root - * relative squared error 37.2742 % Total Number of Instances 683 - */ - private int correctlyClassified; - - private int incorrectlyClassified; - - public ResultAnalyzer(Collection<String> labelSet, String defaultLabel) { - confusionMatrix = new ConfusionMatrix(labelSet, defaultLabel); - summarizer = new OnlineSummarizer(); - } - - public ConfusionMatrix getConfusionMatrix() { - return this.confusionMatrix; - } - - /** - * - * @param correctLabel - * The correct label - * @param classifiedResult - * The classified result - * @return whether the instance was correctly classified - */ - public boolean addInstance(String correctLabel, ClassifierResult classifiedResult) { - boolean result = correctLabel.equals(classifiedResult.getLabel()); - if (result) { - correctlyClassified++; - } else { - incorrectlyClassified++; - } - confusionMatrix.addInstance(correctLabel, classifiedResult); - if (classifiedResult.getLogLikelihood() != Double.MAX_VALUE) { - summarizer.add(classifiedResult.getLogLikelihood()); - hasLL = true; - } - return result; - } - - @Override - public String toString() { - StringBuilder returnString = new StringBuilder(); - - returnString.append("=======================================================\n"); - returnString.append("Summary\n"); - returnString.append("-------------------------------------------------------\n"); - int totalClassified = correctlyClassified + incorrectlyClassified; - double percentageCorrect = (double) 100 * correctlyClassified / totalClassified; - double percentageIncorrect = (double) 100 *
incorrectlyClassified / totalClassified; - NumberFormat decimalFormatter = new DecimalFormat("0.####"); - - returnString.append(StringUtils.rightPad("Correctly Classified Instances", 40)).append(": ").append( - StringUtils.leftPad(Integer.toString(correctlyClassified), 10)).append('\t').append( - StringUtils.leftPad(decimalFormatter.format(percentageCorrect), 10)).append("%\n"); - returnString.append(StringUtils.rightPad("Incorrectly Classified Instances", 40)).append(": ").append( - StringUtils.leftPad(Integer.toString(incorrectlyClassified), 10)).append('\t').append( - StringUtils.leftPad(decimalFormatter.format(percentageIncorrect), 10)).append("%\n"); - returnString.append(StringUtils.rightPad("Total Classified Instances", 40)).append(": ").append( - StringUtils.leftPad(Integer.toString(totalClassified), 10)).append('\n'); - returnString.append('\n'); - - returnString.append(confusionMatrix); - if (hasLL) { - returnString.append("\n\n"); - returnString.append("Avg. Log-likelihood: ").append(summarizer.getMean()).append(" 25%-ile: ").append(summarizer.getQuartile(1)) - .append(" 75%-ile: ").append(summarizer.getQuartile(3)); - } - - return returnString.toString(); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/Bagging.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/Bagging.java deleted file mode 100644 index 0ec5b55f2..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/Bagging.java +++ /dev/null @@ -1,60 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
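A hedged usage sketch for the ResultAnalyzer above (toy labels and scores; it assumes ClassifierResult offers a (label, score) constructor):

    import java.util.Arrays;
    import java.util.Collection;

    public class ResultAnalyzerDemo {
      public static void main(String[] args) {
        Collection<String> labels = Arrays.asList("spam", "ham");
        ResultAnalyzer analyzer = new ResultAnalyzer(labels, "unknown");
        // one correct and one incorrect classification
        analyzer.addInstance("spam", new ClassifierResult("spam", 0.9));
        analyzer.addInstance("ham", new ClassifierResult("spam", 0.6));
        System.out.println(analyzer); // prints the summary table and confusion matrix
      }
    }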
- */ - -package org.apache.mahout.classifier.df; - -import org.apache.mahout.classifier.df.builder.TreeBuilder; -import org.apache.mahout.classifier.df.data.Data; -import org.apache.mahout.classifier.df.node.Node; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Arrays; -import java.util.Random; - -/** - * Builds a tree using bagging - */ -public class Bagging { - - private static final Logger log = LoggerFactory.getLogger(Bagging.class); - - private final TreeBuilder treeBuilder; - - private final Data data; - - private final boolean[] sampled; - - public Bagging(TreeBuilder treeBuilder, Data data) { - this.treeBuilder = treeBuilder; - this.data = data; - sampled = new boolean[data.size()]; - } - - /** - * Builds one tree - */ - public Node build(Random rng) { - log.debug("Bagging..."); - Arrays.fill(sampled, false); - Data bag = data.bagging(rng, sampled); - - log.debug("Building..."); - return treeBuilder.build(rng, bag); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java deleted file mode 100644 index 202ab3b71..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java +++ /dev/null @@ -1,160 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
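Bagging, as defined above, produces one tree per call from a fresh bootstrap sample. A sketch of a typical driver loop (trainingData is assumed to be loaded already; DefaultTreeBuilder and DecisionForest appear later in this diff):

    Random rng = RandomUtils.getRandom();   // org.apache.mahout.common.RandomUtils
    TreeBuilder treeBuilder = new DefaultTreeBuilder();
    Bagging bagging = new Bagging(treeBuilder, trainingData);

    List<Node> trees = Lists.newArrayList();
    for (int i = 0; i < 10; i++) {
      trees.add(bagging.build(rng));        // each call draws a new bootstrap sample
    }
    DecisionForest forest = new DecisionForest(trees);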
- */ - -package org.apache.mahout.classifier.df; - -import com.google.common.collect.Lists; -import com.google.common.io.Closeables; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.classifier.df.node.Node; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.List; - -/** - * Utility class that contains various helper methods - */ -public final class DFUtils { - private DFUtils() { } - - /** - * Writes an Node[] into a DataOutput - * @throws java.io.IOException - */ - public static void writeArray(DataOutput out, Node[] array) throws IOException { - out.writeInt(array.length); - for (Node w : array) { - w.write(out); - } - } - - /** - * Reads a Node[] from a DataInput - * @throws java.io.IOException - */ - public static Node[] readNodeArray(DataInput in) throws IOException { - int length = in.readInt(); - Node[] nodes = new Node[length]; - for (int index = 0; index < length; index++) { - nodes[index] = Node.read(in); - } - - return nodes; - } - - /** - * Writes a double[] into a DataOutput - * @throws java.io.IOException - */ - public static void writeArray(DataOutput out, double[] array) throws IOException { - out.writeInt(array.length); - for (double value : array) { - out.writeDouble(value); - } - } - - /** - * Reads a double[] from a DataInput - * @throws java.io.IOException - */ - public static double[] readDoubleArray(DataInput in) throws IOException { - int length = in.readInt(); - double[] array = new double[length]; - for (int index = 0; index < length; index++) { - array[index] = in.readDouble(); - } - - return array; - } - - /** - * Writes an int[] into a DataOutput - * @throws java.io.IOException - */ - public static void writeArray(DataOutput out, int[] array) throws IOException { - out.writeInt(array.length); - for (int value : array) { - out.writeInt(value); - } - } - - /** - * Reads an int[] from a DataInput - * @throws java.io.IOException - */ - public static int[] readIntArray(DataInput in) throws IOException { - int length = in.readInt(); - int[] array = new int[length]; - for (int index = 0; index < length; index++) { - array[index] = in.readInt(); - } - - return array; - } - - /** - * Return a list of all files in the output directory - * @throws IOException if no file is found - */ - public static Path[] listOutputFiles(FileSystem fs, Path outputPath) throws IOException { - List outputFiles = Lists.newArrayList(); - for (FileStatus s : fs.listStatus(outputPath, PathFilters.logsCRCFilter())) { - if (!s.isDir() && !s.getPath().getName().startsWith("_")) { - outputFiles.add(s.getPath()); - } - } - if (outputFiles.isEmpty()) { - throw new IOException("No output found !"); - } - return outputFiles.toArray(new Path[outputFiles.size()]); - } - - /** - * Formats a time interval in milliseconds to a String in the form "hours:minutes:seconds:millis" - */ - public static String elapsedTime(long milli) { - long seconds = milli / 1000; - milli %= 1000; - - long minutes = seconds / 60; - seconds %= 60; - - long hours = minutes / 60; - minutes %= 60; - - return hours + "h " + minutes + "m " + seconds + "s " + milli; - } - - public static void storeWritable(Configuration conf, Path path, Writable writable) throws IOException { - FileSystem fs 
= path.getFileSystem(conf); - - FSDataOutputStream out = fs.create(path); - try { - writable.write(out); - } finally { - Closeables.closeQuietly(out); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/DecisionForest.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/DecisionForest.java deleted file mode 100644 index d8b09e589..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/DecisionForest.java +++ /dev/null @@ -1,228 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; -import com.google.common.io.Closeables; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.classifier.df.data.Data; -import org.apache.mahout.classifier.df.data.DataUtils; -import org.apache.mahout.classifier.df.data.Dataset; -import org.apache.mahout.classifier.df.data.Instance; -import org.apache.mahout.classifier.df.node.Node; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.List; -import java.util.Random; - -/** - * Represents a forest of decision trees. 
- */ -public class DecisionForest implements Writable { - - private final List trees; - - private DecisionForest() { - trees = Lists.newArrayList(); - } - - public DecisionForest(List trees) { - Preconditions.checkArgument(trees != null && !trees.isEmpty(), "trees argument must not be null or empty"); - - this.trees = trees; - } - - List getTrees() { - return trees; - } - - /** - * Classifies the data and calls callback for each classification - */ - public void classify(Data data, double[] predictions) { - Preconditions.checkArgument(data.size() == predictions.length, "predictions.length must be equal to data.size()"); - - if (data.isEmpty()) { - return; // nothing to classify - } - - for (Node tree : trees) { - for (int index = 0; index < data.size(); index++) { - predictions[index] = tree.classify(data.get(index)); - } - } - } - - /** - * predicts the label for the instance - * - * @param rng - * Random number generator, used to break ties randomly - * @return -1 if the label cannot be predicted - */ - public double classify(Dataset dataset, Random rng, Instance instance) { - if (dataset.isNumerical(dataset.getLabelId())) { - double sum = 0; - int cnt = 0; - for (Node tree : trees) { - double prediction = tree.classify(instance); - if (prediction != -1) { - sum += prediction; - cnt++; - } - } - return sum / cnt; - } else { - int[] predictions = new int[dataset.nblabels()]; - for (Node tree : trees) { - double prediction = tree.classify(instance); - if (prediction != -1) { - predictions[(int) prediction]++; - } - } - - if (DataUtils.sum(predictions) == 0) { - return -1; // no prediction available - } - - return DataUtils.maxindex(rng, predictions); - } - } - - /** - * @return Mean number of nodes per tree - */ - public long meanNbNodes() { - long sum = 0; - - for (Node tree : trees) { - sum += tree.nbNodes(); - } - - return sum / trees.size(); - } - - /** - * @return Total number of nodes in all the trees - */ - public long nbNodes() { - long sum = 0; - - for (Node tree : trees) { - sum += tree.nbNodes(); - } - - return sum; - } - - /** - * @return Mean maximum depth per tree - */ - public long meanMaxDepth() { - long sum = 0; - - for (Node tree : trees) { - sum += tree.maxDepth(); - } - - return sum / trees.size(); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof DecisionForest)) { - return false; - } - - DecisionForest rf = (DecisionForest) obj; - - return trees.size() == rf.getTrees().size() && trees.containsAll(rf.getTrees()); - } - - @Override - public int hashCode() { - return trees.hashCode(); - } - - @Override - public void write(DataOutput dataOutput) throws IOException { - dataOutput.writeInt(trees.size()); - for (Node tree : trees) { - tree.write(dataOutput); - } - } - - /** - * Reads the trees from the input and adds them to the existing trees - */ - @Override - public void readFields(DataInput dataInput) throws IOException { - int size = dataInput.readInt(); - for (int i = 0; i < size; i++) { - trees.add(Node.read(dataInput)); - } - } - - private static DecisionForest read(DataInput dataInput) throws IOException { - DecisionForest forest = new DecisionForest(); - forest.readFields(dataInput); - return forest; - } - - /** - * Load the forest from a single file or a directory of files - * @throws java.io.IOException - */ - public static DecisionForest load(Configuration conf, Path forestPath) throws IOException { - FileSystem fs = forestPath.getFileSystem(conf); - Path[] files; - if 
(fs.getFileStatus(forestPath).isDir()) { - files = DFUtils.listOutputFiles(fs, forestPath); - } else { - files = new Path[]{forestPath}; - } - - DecisionForest forest = null; - for (Path path : files) { - FSDataInputStream dataInput = new FSDataInputStream(fs.open(path)); - try { - if (forest == null) { - forest = read(dataInput); - } else { - forest.readFields(dataInput); - } - } finally { - Closeables.closeQuietly(dataInput); - } - } - - return forest; - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/ErrorEstimate.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/ErrorEstimate.java deleted file mode 100644 index 2a7facc87..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/ErrorEstimate.java +++ /dev/null @@ -1,50 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df; - -import com.google.common.base.Preconditions; - -/** - * Various methods to compute from the output of a random forest - */ -public final class ErrorEstimate { - - private ErrorEstimate() { - } - - public static double errorRate(double[] labels, double[] predictions) { - Preconditions.checkArgument(labels.length == predictions.length, "labels.length != predictions.length"); - double nberrors = 0; // number of instance that got bad predictions - double datasize = 0; // number of classified instances - - for (int index = 0; index < labels.length; index++) { - if (predictions[index] == -1) { - continue; // instance not classified - } - - if (predictions[index] != labels[index]) { - nberrors++; - } - - datasize++; - } - - return nberrors / datasize; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/builder/DecisionTreeBuilder.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/builder/DecisionTreeBuilder.java deleted file mode 100644 index 37d890e7c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/builder/DecisionTreeBuilder.java +++ /dev/null @@ -1,423 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
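Combining DecisionForest.classify with ErrorEstimate.errorRate gives a simple held-out evaluation. A sketch, assuming forest and testData already exist:

    Random rng = RandomUtils.getRandom();
    double[] labels = testData.extractLabels();
    double[] predictions = new double[testData.size()];
    for (int i = 0; i < testData.size(); i++) {
      predictions[i] = forest.classify(testData.getDataset(), rng, testData.get(i));
    }
    // entries equal to -1 mean "no prediction" and are skipped by errorRate()
    System.out.println("error rate: " + ErrorEstimate.errorRate(labels, predictions));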
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.builder; - -import com.google.common.collect.Sets; -import org.apache.mahout.classifier.df.data.Data; -import org.apache.mahout.classifier.df.data.Dataset; -import org.apache.mahout.classifier.df.data.Instance; -import org.apache.mahout.classifier.df.data.conditions.Condition; -import org.apache.mahout.classifier.df.node.CategoricalNode; -import org.apache.mahout.classifier.df.node.Leaf; -import org.apache.mahout.classifier.df.node.Node; -import org.apache.mahout.classifier.df.node.NumericalNode; -import org.apache.mahout.classifier.df.split.IgSplit; -import org.apache.mahout.classifier.df.split.OptIgSplit; -import org.apache.mahout.classifier.df.split.RegressionSplit; -import org.apache.mahout.classifier.df.split.Split; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Collection; -import java.util.Random; - -/** - * Builds a classification tree or regression tree
- * A classification tree is built when the criterion variable is the categorical attribute.
- * A regression tree is built when the criterion variable is the numerical attribute. - */ -public class DecisionTreeBuilder implements TreeBuilder { - - private static final Logger log = LoggerFactory.getLogger(DecisionTreeBuilder.class); - - private static final int[] NO_ATTRIBUTES = new int[0]; - private static final double EPSILON = 1.0e-6; - - /** - * indicates which CATEGORICAL attributes have already been selected in the parent nodes - */ - private boolean[] selected; - /** - * number of attributes to select randomly at each node - */ - private int m; - /** - * IgSplit implementation - */ - private IgSplit igSplit; - /** - * tree is complemented - */ - private boolean complemented = true; - /** - * minimum number for split - */ - private double minSplitNum = 2.0; - /** - * minimum proportion of the total variance for split - */ - private double minVarianceProportion = 1.0e-3; - /** - * full set data - */ - private Data fullSet; - /** - * minimum variance for split - */ - private double minVariance = Double.NaN; - - public void setM(int m) { - this.m = m; - } - - public void setIgSplit(IgSplit igSplit) { - this.igSplit = igSplit; - } - - public void setComplemented(boolean complemented) { - this.complemented = complemented; - } - - public void setMinSplitNum(int minSplitNum) { - this.minSplitNum = minSplitNum; - } - - public void setMinVarianceProportion(double minVarianceProportion) { - this.minVarianceProportion = minVarianceProportion; - } - - @Override - public Node build(Random rng, Data data) { - if (selected == null) { - selected = new boolean[data.getDataset().nbAttributes()]; - selected[data.getDataset().getLabelId()] = true; // never select the label - } - if (m == 0) { - // set default m - double e = data.getDataset().nbAttributes() - 1; - if (data.getDataset().isNumerical(data.getDataset().getLabelId())) { - // regression - m = (int) Math.ceil(e / 3.0); - } else { - // classification - m = (int) Math.ceil(Math.sqrt(e)); - } - } - - if (data.isEmpty()) { - return new Leaf(-1); - } - - double sum = 0.0; - if (data.getDataset().isNumerical(data.getDataset().getLabelId())) { - // regression - // sum and sum squared of a label is computed - double sumSquared = 0.0; - for (int i = 0; i < data.size(); i++) { - double label = data.getDataset().getLabel(data.get(i)); - sum += label; - sumSquared += label * label; - } - - // computes the variance - double var = sumSquared - (sum * sum) / data.size(); - - // computes the minimum variance - if (Double.compare(minVariance, Double.NaN) == 0) { - minVariance = var / data.size() * minVarianceProportion; - log.debug("minVariance:{}", minVariance); - } - - // variance is compared with minimum variance - if ((var / data.size()) < minVariance) { - log.debug("variance(" + (var / data.size()) + ") < minVariance(" + minVariance + ") Leaf(" + - (sum / data.size()) + ')'); - return new Leaf(sum / data.size()); - } - } else { - // classification - if (isIdentical(data)) { - return new Leaf(data.majorityLabel(rng)); - } - if (data.identicalLabel()) { - return new Leaf(data.getDataset().getLabel(data.get(0))); - } - } - - // store full set data - if (fullSet == null) { - fullSet = data; - } - - int[] attributes = randomAttributes(rng, selected, m); - if (attributes == null || attributes.length == 0) { - // we tried all the attributes and could not split the data anymore - double label; - if (data.getDataset().isNumerical(data.getDataset().getLabelId())) { - // regression - label = sum / data.size(); - } else { - // classification - label = 
data.majorityLabel(rng); - } - log.warn("attribute which can be selected is not found Leaf({})", label); - return new Leaf(label); - } - - if (igSplit == null) { - if (data.getDataset().isNumerical(data.getDataset().getLabelId())) { - // regression - igSplit = new RegressionSplit(); - } else { - // classification - igSplit = new OptIgSplit(); - } - } - - // find the best split - Split best = null; - for (int attr : attributes) { - Split split = igSplit.computeSplit(data, attr); - if (best == null || best.getIg() < split.getIg()) { - best = split; - } - } - - // information gain is near to zero. - if (best.getIg() < EPSILON) { - double label; - if (data.getDataset().isNumerical(data.getDataset().getLabelId())) { - label = sum / data.size(); - } else { - label = data.majorityLabel(rng); - } - log.debug("ig is near to zero Leaf({})", label); - return new Leaf(label); - } - - log.debug("best split attr:" + best.getAttr() + ", split:" + best.getSplit() + ", ig:" - + best.getIg()); - - boolean alreadySelected = selected[best.getAttr()]; - if (alreadySelected) { - // attribute already selected - log.warn("attribute {} already selected in a parent node", best.getAttr()); - } - - Node childNode; - if (data.getDataset().isNumerical(best.getAttr())) { - boolean[] temp = null; - - Data loSubset = data.subset(Condition.lesser(best.getAttr(), best.getSplit())); - Data hiSubset = data.subset(Condition.greaterOrEquals(best.getAttr(), best.getSplit())); - - if (loSubset.isEmpty() || hiSubset.isEmpty()) { - // the selected attribute did not change the data, avoid using it in the child notes - selected[best.getAttr()] = true; - } else { - // the data changed, so we can unselect all previousely selected NUMERICAL attributes - temp = selected; - selected = cloneCategoricalAttributes(data.getDataset(), selected); - } - - // size of the subset is less than the minSpitNum - if (loSubset.size() < minSplitNum || hiSubset.size() < minSplitNum) { - // branch is not split - double label; - if (data.getDataset().isNumerical(data.getDataset().getLabelId())) { - label = sum / data.size(); - } else { - label = data.majorityLabel(rng); - } - log.debug("branch is not split Leaf({})", label); - return new Leaf(label); - } - - Node loChild = build(rng, loSubset); - Node hiChild = build(rng, hiSubset); - - // restore the selection state of the attributes - if (temp != null) { - selected = temp; - } else { - selected[best.getAttr()] = alreadySelected; - } - - childNode = new NumericalNode(best.getAttr(), best.getSplit(), loChild, hiChild); - } else { // CATEGORICAL attribute - double[] values = data.values(best.getAttr()); - - // tree is complemented - Collection subsetValues = null; - if (complemented) { - subsetValues = Sets.newHashSet(); - for (double value : values) { - subsetValues.add(value); - } - values = fullSet.values(best.getAttr()); - } - - int cnt = 0; - Data[] subsets = new Data[values.length]; - for (int index = 0; index < values.length; index++) { - if (complemented && !subsetValues.contains(values[index])) { - continue; - } - subsets[index] = data.subset(Condition.equals(best.getAttr(), values[index])); - if (subsets[index].size() >= minSplitNum) { - cnt++; - } - } - - // size of the subset is less than the minSpitNum - if (cnt < 2) { - // branch is not split - double label; - if (data.getDataset().isNumerical(data.getDataset().getLabelId())) { - label = sum / data.size(); - } else { - label = data.majorityLabel(rng); - } - log.debug("branch is not split Leaf({})", label); - return new Leaf(label); - } - - 
selected[best.getAttr()] = true; - - Node[] children = new Node[values.length]; - for (int index = 0; index < values.length; index++) { - if (complemented && (subsetValues == null || !subsetValues.contains(values[index]))) { - // tree is complemented - double label; - if (data.getDataset().isNumerical(data.getDataset().getLabelId())) { - label = sum / data.size(); - } else { - label = data.majorityLabel(rng); - } - log.debug("complemented Leaf({})", label); - children[index] = new Leaf(label); - continue; - } - children[index] = build(rng, subsets[index]); - } - - selected[best.getAttr()] = alreadySelected; - - childNode = new CategoricalNode(best.getAttr(), values, children); - } - - return childNode; - } - - /** - * checks if all the vectors have identical attribute values. Ignores selected attributes. - * - * @return true if all the vectors are identical or the data is empty
- * false otherwise - */ - private boolean isIdentical(Data data) { - if (data.isEmpty()) { - return true; - } - - Instance instance = data.get(0); - for (int attr = 0; attr < selected.length; attr++) { - if (selected[attr]) { - continue; - } - - for (int index = 1; index < data.size(); index++) { - if (data.get(index).get(attr) != instance.get(attr)) { - return false; - } - } - } - - return true; - } - - /** - * Make a copy of the selection state of the attributes, unselect all numerical attributes - * - * @param selected selection state to clone - * @return cloned selection state - */ - private static boolean[] cloneCategoricalAttributes(Dataset dataset, boolean[] selected) { - boolean[] cloned = new boolean[selected.length]; - - for (int i = 0; i < selected.length; i++) { - cloned[i] = !dataset.isNumerical(i) && selected[i]; - } - cloned[dataset.getLabelId()] = true; - - return cloned; - } - - /** - * Randomly selects m attributes to consider for split, excludes IGNORED and LABEL attributes - * - * @param rng random-numbers generator - * @param selected attributes' state (selected or not) - * @param m number of attributes to choose - * @return list of selected attributes' indices, or null if all attributes have already been selected - */ - private static int[] randomAttributes(Random rng, boolean[] selected, int m) { - int nbNonSelected = 0; // number of non selected attributes - for (boolean sel : selected) { - if (!sel) { - nbNonSelected++; - } - } - - if (nbNonSelected == 0) { - log.warn("All attributes are selected !"); - return NO_ATTRIBUTES; - } - - int[] result; - if (nbNonSelected <= m) { - // return all non selected attributes - result = new int[nbNonSelected]; - int index = 0; - for (int attr = 0; attr < selected.length; attr++) { - if (!selected[attr]) { - result[index++] = attr; - } - } - } else { - result = new int[m]; - for (int index = 0; index < m; index++) { - // randomly choose a "non selected" attribute - int rind; - do { - rind = rng.nextInt(selected.length); - } while (selected[rind]); - - result[index] = rind; - selected[rind] = true; // temporarily set the chosen attribute to be selected - } - - // the chosen attributes are not yet selected - for (int attr : result) { - selected[attr] = false; - } - } - - return result; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/builder/DefaultTreeBuilder.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/builder/DefaultTreeBuilder.java deleted file mode 100644 index f03698d2c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/builder/DefaultTreeBuilder.java +++ /dev/null @@ -1,252 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
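The knobs of the DecisionTreeBuilder above (m, minSplitNum, minVarianceProportion, complemented) are all setter-driven. A configuration sketch, with illustrative values and assuming data has already been loaded:

    DecisionTreeBuilder builder = new DecisionTreeBuilder();
    builder.setM(3);                          // attributes sampled per node; 0 means "use the default heuristic"
    builder.setMinSplitNum(2);                // subsets smaller than this become leaves
    builder.setMinVarianceProportion(1.0e-3); // regression-only stopping rule
    builder.setComplemented(true);            // grow branches for categorical values unseen in this bag
    Node tree = builder.build(RandomUtils.getRandom(), data);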
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.builder; - -import org.apache.mahout.classifier.df.data.Data; -import org.apache.mahout.classifier.df.data.Dataset; -import org.apache.mahout.classifier.df.data.Instance; -import org.apache.mahout.classifier.df.data.conditions.Condition; -import org.apache.mahout.classifier.df.node.CategoricalNode; -import org.apache.mahout.classifier.df.node.Leaf; -import org.apache.mahout.classifier.df.node.Node; -import org.apache.mahout.classifier.df.node.NumericalNode; -import org.apache.mahout.classifier.df.split.IgSplit; -import org.apache.mahout.classifier.df.split.OptIgSplit; -import org.apache.mahout.classifier.df.split.Split; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Random; - -/** - * Builds a Decision Tree
- * Based on the algorithm described in the "Decision Trees" tutorials by Andrew W. Moore, available at: - * http://www.cs.cmu.edu/~awm/tutorials - *
- * This class can be used when the criterion variable is the categorical attribute. - */ -public class DefaultTreeBuilder implements TreeBuilder { - - private static final Logger log = LoggerFactory.getLogger(DefaultTreeBuilder.class); - - private static final int[] NO_ATTRIBUTES = new int[0]; - - /** - * indicates which CATEGORICAL attributes have already been selected in the parent nodes - */ - private boolean[] selected; - /** - * number of attributes to select randomly at each node - */ - private int m = 1; - /** - * IgSplit implementation - */ - private final IgSplit igSplit; - - public DefaultTreeBuilder() { - igSplit = new OptIgSplit(); - } - - public void setM(int m) { - this.m = m; - } - - @Override - public Node build(Random rng, Data data) { - - if (selected == null) { - selected = new boolean[data.getDataset().nbAttributes()]; - selected[data.getDataset().getLabelId()] = true; // never select the label - } - - if (data.isEmpty()) { - return new Leaf(-1); - } - if (isIdentical(data)) { - return new Leaf(data.majorityLabel(rng)); - } - if (data.identicalLabel()) { - return new Leaf(data.getDataset().getLabel(data.get(0))); - } - - int[] attributes = randomAttributes(rng, selected, m); - if (attributes == null || attributes.length == 0) { - // we tried all the attributes and could not split the data anymore - return new Leaf(data.majorityLabel(rng)); - } - - // find the best split - Split best = null; - for (int attr : attributes) { - Split split = igSplit.computeSplit(data, attr); - if (best == null || best.getIg() < split.getIg()) { - best = split; - } - } - - boolean alreadySelected = selected[best.getAttr()]; - if (alreadySelected) { - // attribute already selected - log.warn("attribute {} already selected in a parent node", best.getAttr()); - } - - Node childNode; - if (data.getDataset().isNumerical(best.getAttr())) { - boolean[] temp = null; - - Data loSubset = data.subset(Condition.lesser(best.getAttr(), best.getSplit())); - Data hiSubset = data.subset(Condition.greaterOrEquals(best.getAttr(), best.getSplit())); - - if (loSubset.isEmpty() || hiSubset.isEmpty()) { - // the selected attribute did not change the data, avoid using it in the child notes - selected[best.getAttr()] = true; - } else { - // the data changed, so we can unselect all previousely selected NUMERICAL attributes - temp = selected; - selected = cloneCategoricalAttributes(data.getDataset(), selected); - } - - Node loChild = build(rng, loSubset); - Node hiChild = build(rng, hiSubset); - - // restore the selection state of the attributes - if (temp != null) { - selected = temp; - } else { - selected[best.getAttr()] = alreadySelected; - } - - childNode = new NumericalNode(best.getAttr(), best.getSplit(), loChild, hiChild); - } else { // CATEGORICAL attribute - selected[best.getAttr()] = true; - - double[] values = data.values(best.getAttr()); - Node[] children = new Node[values.length]; - - for (int index = 0; index < values.length; index++) { - Data subset = data.subset(Condition.equals(best.getAttr(), values[index])); - children[index] = build(rng, subset); - } - - selected[best.getAttr()] = alreadySelected; - - childNode = new CategoricalNode(best.getAttr(), values, children); - } - - return childNode; - } - - /** - * checks if all the vectors have identical attribute values. Ignore selected attributes. - * - * @return true is all the vectors are identical or the data is empty
- * false otherwise - */ - private boolean isIdentical(Data data) { - if (data.isEmpty()) { - return true; - } - - Instance instance = data.get(0); - for (int attr = 0; attr < selected.length; attr++) { - if (selected[attr]) { - continue; - } - - for (int index = 1; index < data.size(); index++) { - if (data.get(index).get(attr) != instance.get(attr)) { - return false; - } - } - } - - return true; - } - - - /** - * Make a copy of the selection state of the attributes, unselect all numerical attributes - * - * @param selected selection state to clone - * @return cloned selection state - */ - private static boolean[] cloneCategoricalAttributes(Dataset dataset, boolean[] selected) { - boolean[] cloned = new boolean[selected.length]; - - for (int i = 0; i < selected.length; i++) { - cloned[i] = !dataset.isNumerical(i) && selected[i]; - } - - return cloned; - } - - /** - * Randomly selects m attributes to consider for split, excludes IGNORED and LABEL attributes - * - * @param rng random-numbers generator - * @param selected attributes' state (selected or not) - * @param m number of attributes to choose - * @return list of selected attributes' indices, or null if all attributes have already been selected - */ - protected static int[] randomAttributes(Random rng, boolean[] selected, int m) { - int nbNonSelected = 0; // number of non selected attributes - for (boolean sel : selected) { - if (!sel) { - nbNonSelected++; - } - } - - if (nbNonSelected == 0) { - log.warn("All attributes are selected !"); - return NO_ATTRIBUTES; - } - - int[] result; - if (nbNonSelected <= m) { - // return all non selected attributes - result = new int[nbNonSelected]; - int index = 0; - for (int attr = 0; attr < selected.length; attr++) { - if (!selected[attr]) { - result[index++] = attr; - } - } - } else { - result = new int[m]; - for (int index = 0; index < m; index++) { - // randomly choose a "non selected" attribute - int rind; - do { - rind = rng.nextInt(selected.length); - } while (selected[rind]); - - result[index] = rind; - selected[rind] = true; // temporarily set the chosen attribute to be selected - } - - // the chosen attributes are not yet selected - for (int attr : result) { - selected[attr] = false; - } - } - - return result; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/builder/TreeBuilder.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/builder/TreeBuilder.java deleted file mode 100644 index 3d4c6d69d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/builder/TreeBuilder.java +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
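Both builders implement the TreeBuilder interface whose definition follows. The contract is small enough that a trivial implementation fits in a few lines (MajorityLeafBuilder is a hypothetical illustration, not part of Mahout):

    // Ignores tree structure entirely and returns a single majority-vote leaf.
    public class MajorityLeafBuilder implements TreeBuilder {
      @Override
      public Node build(Random rng, Data data) {
        if (data.isEmpty()) {
          return new Leaf(-1);
        }
        return new Leaf(data.majorityLabel(rng));
      }
    }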
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.builder; - -import org.apache.mahout.classifier.df.data.Data; -import org.apache.mahout.classifier.df.node.Node; - -import java.util.Random; - -/** - * Abstract base class for TreeBuilders - */ -public interface TreeBuilder { - - /** - * Builds a Decision tree using the training data - * - * @param rng - * random-numbers generator - * @param data - * training data - * @return root Node - */ - Node build(Random rng, Data data); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/Data.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/Data.java deleted file mode 100644 index 659bd8402..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/Data.java +++ /dev/null @@ -1,280 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.data; - -import com.google.common.collect.Lists; -import org.apache.mahout.classifier.df.data.conditions.Condition; - -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Random; - -/** - * Holds a list of vectors and their corresponding Dataset. contains various operations that deals with the - * vectors (subset, count,...) - * - */ -public class Data implements Cloneable { - - private final List instances; - - private final Dataset dataset; - - public Data(Dataset dataset) { - this.dataset = dataset; - this.instances = Lists.newArrayList(); - } - - public Data(Dataset dataset, List instances) { - this.dataset = dataset; - this.instances = Lists.newArrayList(instances); - } - - /** - * @return the number of elements - */ - public int size() { - return instances.size(); - } - - /** - * @return true if this data contains no element - */ - public boolean isEmpty() { - return instances.isEmpty(); - } - - /** - * @param v - * element whose presence in this list if to be searched - * @return true is this data contains the specified element. 
- */ - public boolean contains(Instance v) { - return instances.contains(v); - } - - /** - * Returns the element at the specified position - * - * @param index - * index of element to return - * @return the element at the specified position - * @throws IndexOutOfBoundsException - * if the index is out of range - */ - public Instance get(int index) { - return instances.get(index); - } - - /** - * @return the subset from this data that matches the given condition - */ - public Data subset(Condition condition) { - List<Instance> subset = Lists.newArrayList(); - - for (Instance instance : instances) { - if (condition.isTrueFor(instance)) { - subset.add(instance); - } - } - - return new Data(dataset, subset); - } - - /** - * if data has N cases, sample N cases at random, but with replacement - */ - public Data bagging(Random rng) { - int datasize = size(); - List<Instance> bag = Lists.newArrayListWithCapacity(datasize); - - for (int i = 0; i < datasize; i++) { - bag.add(instances.get(rng.nextInt(datasize))); - } - - return new Data(dataset, bag); - } - - /** - * if data has N cases, sample N cases at random, but with replacement - * - * @param sampled - * indicates which instances have been sampled - * - * @return sampled data - */ - public Data bagging(Random rng, boolean[] sampled) { - int datasize = size(); - List<Instance> bag = Lists.newArrayListWithCapacity(datasize); - - for (int i = 0; i < datasize; i++) { - int index = rng.nextInt(datasize); - bag.add(instances.get(index)); - sampled[index] = true; - } - - return new Data(dataset, bag); - } - - /** - * Splits the data in two, returns one part, and this gets the rest of the data. VERY SLOW! - */ - public Data rsplit(Random rng, int subsize) { - List<Instance> subset = Lists.newArrayListWithCapacity(subsize); - - for (int i = 0; i < subsize; i++) { - subset.add(instances.remove(rng.nextInt(instances.size()))); - } - - return new Data(dataset, subset); - } - - /** - * checks if all the vectors have identical attribute values - * - * @return true if all the vectors are identical or the data is empty
- * false otherwise - */ - public boolean isIdentical() { - if (isEmpty()) { - return true; - } - - Instance instance = get(0); - for (int attr = 0; attr < dataset.nbAttributes(); attr++) { - for (int index = 1; index < size(); index++) { - if (get(index).get(attr) != instance.get(attr)) { - return false; - } - } - } - - return true; - } - - /** - * checks if all the vectors have identical label values - */ - public boolean identicalLabel() { - if (isEmpty()) { - return true; - } - - double label = dataset.getLabel(get(0)); - for (int index = 1; index < size(); index++) { - if (dataset.getLabel(get(index)) != label) { - return false; - } - } - - return true; - } - - /** - * finds all distinct values of a given attribute - */ - public double[] values(int attr) { - Collection result = new HashSet(); - - for (Instance instance : instances) { - result.add(instance.get(attr)); - } - - double[] values = new double[result.size()]; - - int index = 0; - for (Double value : result) { - values[index++] = value; - } - - return values; - } - - @Override - public Data clone() { - return new Data(dataset, Lists.newArrayList(instances)); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof Data)) { - return false; - } - - Data data = (Data) obj; - - return instances.equals(data.instances) && dataset.equals(data.dataset); - } - - @Override - public int hashCode() { - return instances.hashCode() + dataset.hashCode(); - } - - /** - * extract the labels of all instances - */ - public double[] extractLabels() { - double[] labels = new double[size()]; - - for (int index = 0; index < labels.length; index++) { - labels[index] = dataset.getLabel(get(index)); - } - - return labels; - } - - /** - * finds the majority label, breaking ties randomly
- * This method can be used when the criterion variable is the categorical attribute. - * - * @return the majority label value - */ - public int majorityLabel(Random rng) { - // count the frequency of each label value - int[] counts = new int[dataset.nblabels()]; - - for (int index = 0; index < size(); index++) { - counts[(int) dataset.getLabel(get(index))]++; - } - - // find the label value that appears the most - return DataUtils.maxindex(rng, counts); - } - - /** - * Counts the number of occurrences of each label value
- * This method can be used when the criterion variable is the categorical attribute. - * - * @param counts - * will contain the results, supposed to be initialized at 0 - */ - public void countLabels(int[] counts) { - for (int index = 0; index < size(); index++) { - counts[(int) dataset.getLabel(get(index))]++; - } - } - - public Dataset getDataset() { - return dataset; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DataConverter.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DataConverter.java deleted file mode 100644 index 15e093841..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DataConverter.java +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
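The bagging(Random, boolean[]) overload of Data above also yields the out-of-bag set as a side effect. A sketch, assuming data is already loaded:

    Random rng = RandomUtils.getRandom();
    boolean[] sampled = new boolean[data.size()];
    Data bag = data.bagging(rng, sampled);    // bootstrap sample of the same size

    // instances never drawn into the bag form the out-of-bag evaluation set
    List<Instance> outOfBag = Lists.newArrayList();
    for (int i = 0; i < sampled.length; i++) {
      if (!sampled[i]) {
        outOfBag.add(data.get(i));
      }
    }
    Data outOfBagData = new Data(data.getDataset(), outOfBag);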
- */ - -package org.apache.mahout.classifier.df.data; - -import com.google.common.base.Preconditions; -import org.apache.commons.lang.ArrayUtils; -import org.apache.mahout.math.DenseVector; - -import java.util.regex.Pattern; - -/** - * Converts String to Instance using a Dataset - */ -public class DataConverter { - - private static final Pattern COMMA_SPACE = Pattern.compile("[, ]"); - - private final Dataset dataset; - - public DataConverter(Dataset dataset) { - this.dataset = dataset; - } - - public Instance convert(CharSequence string) { - // all attributes (categorical, numerical, label), ignored - int nball = dataset.nbAttributes() + dataset.getIgnored().length; - - String[] tokens = COMMA_SPACE.split(string); - Preconditions.checkArgument(tokens.length == nball, - "Wrong number of attributes in the string"); - - int nbattrs = dataset.nbAttributes(); - DenseVector vector = new DenseVector(nbattrs); - - int aId = 0; - for (int attr = 0; attr < nball; attr++) { - if (ArrayUtils.contains(dataset.getIgnored(), attr)) { - continue; // IGNORED - } - - String token = tokens[attr].trim(); - - if ("?".equals(token)) { - // missing value - return null; - } - - if (dataset.isNumerical(aId)) { - vector.set(aId++, Double.parseDouble(token)); - } else { // CATEGORICAL - vector.set(aId, dataset.valueOf(aId, token)); - aId++; - } - } - - return new Instance(vector); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java deleted file mode 100644 index feaad04c1..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java +++ /dev/null @@ -1,259 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.data; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.mahout.classifier.df.data.Dataset.Attribute; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.List; -import java.util.Scanner; -import java.util.Set; -import java.util.regex.Pattern; - -/** - * Converts the input data to a Vector Array using the information given by the Dataset.
- * Generates for each line a Vector that contains :
- * <ul>
- * <li>double parsed value for NUMERICAL attributes</li>
- * <li>int value for CATEGORICAL and LABEL attributes</li>
- * </ul>
- * adds an IGNORED first attribute that will contain a unique id for each instance, which is the line number - * of the instance in the input data - */ -public final class DataLoader { - - private static final Logger log = LoggerFactory.getLogger(DataLoader.class); - - private static final Pattern COMMA_SPACE = Pattern.compile("[, ]"); - - private DataLoader() { } - - /** - * Converts a comma-separated String to a Vector. - * - * @param attrs - * attributes description - * @param values - * used to convert CATEGORICAL attribute values to Integer - * @return false if there are missing values '?' or NUMERICAL attribute values is not numeric - */ - private static boolean parseString(Attribute[] attrs, Set[] values, CharSequence string, - boolean regression) { - String[] tokens = COMMA_SPACE.split(string); - Preconditions.checkArgument(tokens.length == attrs.length, "Wrong number of attributes in the string"); - - // extract tokens and check is there is any missing value - for (int attr = 0; attr < attrs.length; attr++) { - if (attrs[attr].isIgnored()) { - continue; - } - if ("?".equals(tokens[attr])) { - return false; // missing value - } - } - - for (int attr = 0; attr < attrs.length; attr++) { - if (attrs[attr].isIgnored()) { - continue; - } - - String token = tokens[attr]; - - if (attrs[attr].isCategorical() || (!regression && attrs[attr].isLabel())) { - // update values - if (values[attr] == null) { - values[attr] = Sets.newHashSet(); - } - values[attr].add(token); - } else { - try { - Double.parseDouble(token); - } catch (NumberFormatException e) { - return false; - } - } - } - - return true; - } - - /** - * Loads the data from a file - * - * @param fs - * file system - * @param fpath - * data file path - * @throws IOException - * if any problem is encountered - */ - - public static Data loadData(Dataset dataset, FileSystem fs, Path fpath) throws IOException { - FSDataInputStream input = fs.open(fpath); - Scanner scanner = new Scanner(input); - - List instances = Lists.newArrayList(); - - DataConverter converter = new DataConverter(dataset); - - while (scanner.hasNextLine()) { - String line = scanner.nextLine(); - if (line.isEmpty()) { - log.warn("{}: empty string", instances.size()); - continue; - } - - Instance instance = converter.convert(line); - if (instance == null) { - // missing values found - log.warn("{}: missing values", instances.size()); - continue; - } - - instances.add(instance); - } - - scanner.close(); - - return new Data(dataset, instances); - } - - /** - * Loads the data from a String array - */ - public static Data loadData(Dataset dataset, String[] data) { - List instances = Lists.newArrayList(); - - DataConverter converter = new DataConverter(dataset); - - for (String line : data) { - if (line.isEmpty()) { - log.warn("{}: empty string", instances.size()); - continue; - } - - Instance instance = converter.convert(line); - if (instance == null) { - // missing values found - log.warn("{}: missing values", instances.size()); - continue; - } - - instances.add(instance); - } - - return new Data(dataset, instances); - } - - /** - * Generates the Dataset by parsing the entire data - * - * @param descriptor - * attributes description - * @param regression - * if true, the label is numerical - * @param fs - * file system - * @param path - * data path - */ - public static Dataset generateDataset(CharSequence descriptor, - boolean regression, - FileSystem fs, - Path path) throws DescriptorException, IOException { - Attribute[] attrs = 
DescriptorUtils.parseDescriptor(descriptor); - - FSDataInputStream input = fs.open(path); - Scanner scanner = new Scanner(input); - - // used to convert CATEGORICAL attribute to Integer - @SuppressWarnings("unchecked") - Set[] valsets = new Set[attrs.length]; - - int size = 0; - while (scanner.hasNextLine()) { - String line = scanner.nextLine(); - if (line.isEmpty()) { - continue; - } - - if (parseString(attrs, valsets, line, regression)) { - size++; - } - } - - scanner.close(); - - @SuppressWarnings("unchecked") - List[] values = new List[attrs.length]; - for (int i = 0; i < valsets.length; i++) { - if (valsets[i] != null) { - values[i] = Lists.newArrayList(valsets[i]); - } - } - - return new Dataset(attrs, values, size, regression); - } - - /** - * Generates the Dataset by parsing the entire data - * - * @param descriptor - * attributes description - */ - public static Dataset generateDataset(CharSequence descriptor, - boolean regression, - String[] data) throws DescriptorException { - Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor); - - // used to convert CATEGORICAL attributes to Integer - @SuppressWarnings("unchecked") - Set[] valsets = new Set[attrs.length]; - - int size = 0; - for (String aData : data) { - if (aData.isEmpty()) { - continue; - } - - if (parseString(attrs, valsets, aData, regression)) { - size++; - } - } - - @SuppressWarnings("unchecked") - List[] values = new List[attrs.length]; - for (int i = 0; i < valsets.length; i++) { - if (valsets[i] != null) { - values[i] = Lists.newArrayList(valsets[i]); - } - } - - return new Dataset(attrs, values, size, regression); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DataUtils.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DataUtils.java deleted file mode 100644 index 856d452a7..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DataUtils.java +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
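A hedged sketch of how DataLoader and DataConverter fit together for in-memory data; the descriptor and rows below are invented for illustration, and DescriptorException handling is left to the caller:

    // Sketch: build a Dataset from a descriptor, then materialize instances.
    static Data loadExample() throws DescriptorException {
      String descriptor = "N N C L"; // hypothetical: two numerical, one categorical, one label
      String[] rows = {"1.0,2.5,red,yes", "0.3,4.1,blue,no"};

      Dataset dataset = DataLoader.generateDataset(descriptor, false, rows); // classification
      Data data = DataLoader.loadData(dataset, rows);

      // The same Dataset drives a DataConverter for single lines; convert()
      // returns null when it meets a missing value ('?').
      DataConverter converter = new DataConverter(dataset);
      Instance one = converter.convert("0.7,3.3,red,yes");
      return data;
    }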
- */ - -package org.apache.mahout.classifier.df.data; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -import java.util.List; -import java.util.Random; - -/** - * Helper methods that deals with data lists and arrays of values - */ -public final class DataUtils { - private DataUtils() { } - - /** - * Computes the sum of the values - * - */ - public static int sum(int[] values) { - int sum = 0; - for (int value : values) { - sum += value; - } - - return sum; - } - - /** - * foreach i : array1[i] += array2[i] - */ - public static void add(int[] array1, int[] array2) { - Preconditions.checkArgument(array1.length == array2.length, "array1.length != array2.length"); - for (int index = 0; index < array1.length; index++) { - array1[index] += array2[index]; - } - } - - /** - * foreach i : array1[i] -= array2[i] - */ - public static void dec(int[] array1, int[] array2) { - Preconditions.checkArgument(array1.length == array2.length, "array1.length != array2.length"); - for (int index = 0; index < array1.length; index++) { - array1[index] -= array2[index]; - } - } - - /** - * return the index of the maximum of the array, breaking ties randomly - * - * @param rng - * used to break ties - * @return index of the maximum - */ - public static int maxindex(Random rng, int[] values) { - int max = 0; - List maxindices = Lists.newArrayList(); - - for (int index = 0; index < values.length; index++) { - if (values[index] > max) { - max = values[index]; - maxindices.clear(); - maxindices.add(index); - } else if (values[index] == max) { - maxindices.add(index); - } - } - - return maxindices.size() > 1 ? maxindices.get(rng.nextInt(maxindices.size())) : maxindices.get(0); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java deleted file mode 100644 index 724741e09..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java +++ /dev/null @@ -1,356 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
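The DataUtils helpers are easiest to read by example (values are arbitrary; note that maxindex assumes at least one value is non-negative, since its running max starts at 0):

    // Sketch: element-wise add, sum, and argmax with random tie-breaking.
    int[] a = {1, 2, 3};
    int[] b = {0, 2, 1};
    DataUtils.add(a, b);                   // a is now {1, 4, 4}
    int total = DataUtils.sum(a);          // 9
    Random rng = RandomUtils.getRandom();  // Mahout's seedable Random, as used elsewhere here
    int best = DataUtils.maxindex(rng, a); // 1 or 2, picked at random: both hold the max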
- */ - -package org.apache.mahout.classifier.df.data; - -import com.google.common.base.Preconditions; -import com.google.common.io.Closeables; -import org.apache.commons.lang.ArrayUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableUtils; -import org.apache.mahout.classifier.df.DFUtils; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -/** - * Contains informations about the attributes. - */ -public class Dataset implements Writable { - - /** - * Attributes type - */ - public enum Attribute { - IGNORED, - NUMERICAL, - CATEGORICAL, - LABEL; - - public boolean isNumerical() { - return this == NUMERICAL; - } - - public boolean isCategorical() { - return this == CATEGORICAL; - } - - public boolean isLabel() { - return this == LABEL; - } - - public boolean isIgnored() { - return this == IGNORED; - } - } - - private Attribute[] attributes; - - /** - * list of ignored attributes - */ - private int[] ignored; - - /** - * distinct values (CATEGORIAL attributes only) - */ - private String[][] values; - - /** - * index of the label attribute in the loaded data (without ignored attributed) - */ - private int labelId; - - /** - * number of instances in the dataset - */ - private int nbInstances; - - private Dataset() { - } - - /** - * Should only be called by a DataLoader - * - * @param attrs attributes description - * @param values distinct values for all CATEGORICAL attributes - */ - Dataset(Attribute[] attrs, List[] values, int nbInstances, boolean regression) { - validateValues(attrs, values); - - int nbattrs = countAttributes(attrs); - - // the label values are set apart - attributes = new Attribute[nbattrs]; - this.values = new String[nbattrs][]; - ignored = new int[attrs.length - nbattrs]; // nbignored = total - nbattrs - - labelId = -1; - int ignoredId = 0; - int ind = 0; - for (int attr = 0; attr < attrs.length; attr++) { - if (attrs[attr].isIgnored()) { - ignored[ignoredId++] = attr; - continue; - } - - if (attrs[attr].isLabel()) { - if (labelId != -1) { - throw new IllegalStateException("Label found more than once"); - } - labelId = ind; - if (regression) { - attrs[attr] = Attribute.NUMERICAL; - } else { - attrs[attr] = Attribute.CATEGORICAL; - } - } - - if (attrs[attr].isCategorical() || (!regression && attrs[attr].isLabel())) { - this.values[ind] = new String[values[attr].size()]; - values[attr].toArray(this.values[ind]); - } - - attributes[ind++] = attrs[attr]; - } - - if (labelId == -1) { - throw new IllegalStateException("Label not found"); - } - - this.nbInstances = nbInstances; - } - - public int nbValues(int attr) { - return values[attr].length; - } - - public String[] labels() { - return Arrays.copyOf(values[labelId], nblabels()); - } - - public int nblabels() { - return values[labelId].length; - } - - public int getLabelId() { - return labelId; - } - - public double getLabel(Instance instance) { - return instance.get(getLabelId()); - } - - public int nbInstances() { - return nbInstances; - } - - /** - * Returns the code used to represent the label value in the data - * - * @param label label's value to code - * @return label's code - */ - public int labelCode(String label) { - return ArrayUtils.indexOf(values[labelId], label); - } - - /** - * Returns the label value in the data - * This method can 
be used when the criterion variable is the categorical attribute. - * - * @param code label's code - * @return label's value - */ - public String getLabelString(double code) { - // handle the case (prediction == -1) - if (code == -1) { - return "unknown"; - } - return values[labelId][(int) code]; - } - - /** - * Converts a token to its corresponding int code for a given attribute - * - * @param attr attribute's index - */ - public int valueOf(int attr, String token) { - Preconditions.checkArgument(!isNumerical(attr), "Only for CATEGORICAL attributes"); - Preconditions.checkArgument(values != null, "Values not found"); - return ArrayUtils.indexOf(values[attr], token); - } - - public int[] getIgnored() { - return ignored; - } - - - /** - * @return number of attributes that are not IGNORED - */ - private static int countAttributes(Attribute[] attrs) { - int nbattrs = 0; - - for (Attribute attr : attrs) { - if (!attr.isIgnored()) { - nbattrs++; - } - } - - return nbattrs; - } - - private static void validateValues(Attribute[] attrs, List[] values) { - Preconditions.checkArgument(attrs.length == values.length, "attrs.length != values.length"); - for (int attr = 0; attr < attrs.length; attr++) { - Preconditions.checkArgument(!attrs[attr].isCategorical() || values[attr] != null, - "values not found for attribute " + attr); - } - } - - /** - * @return number of attributes - */ - public int nbAttributes() { - return attributes.length; - } - - /** - * Is this a numerical attribute ? - * - * @param attr index of the attribute to check - * @return true if the attribute is numerical - */ - public boolean isNumerical(int attr) { - return attributes[attr].isNumerical(); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof Dataset)) { - return false; - } - - Dataset dataset = (Dataset) obj; - - if (!Arrays.equals(attributes, dataset.attributes)) { - return false; - } - - for (int attr = 0; attr < nbAttributes(); attr++) { - if (!Arrays.equals(values[attr], dataset.values[attr])) { - return false; - } - } - - return labelId == dataset.labelId && nbInstances == dataset.nbInstances; - } - - @Override - public int hashCode() { - int hashCode = labelId + 31 * nbInstances; - for (Attribute attr : attributes) { - hashCode = 31 * hashCode + attr.hashCode(); - } - for (String[] valueRow : values) { - if (valueRow == null) { - continue; - } - for (String value : valueRow) { - hashCode = 31 * hashCode + value.hashCode(); - } - } - return hashCode; - } - - /** - * Loads the dataset from a file - * - * @throws java.io.IOException - */ - public static Dataset load(Configuration conf, Path path) throws IOException { - FileSystem fs = path.getFileSystem(conf); - FSDataInputStream input = fs.open(path); - try { - return read(input); - } finally { - Closeables.closeQuietly(input); - } - } - - public static Dataset read(DataInput in) throws IOException { - Dataset dataset = new Dataset(); - - dataset.readFields(in); - return dataset; - } - - @Override - public void readFields(DataInput in) throws IOException { - int nbAttributes = in.readInt(); - attributes = new Attribute[nbAttributes]; - for (int attr = 0; attr < nbAttributes; attr++) { - String name = WritableUtils.readString(in); - attributes[attr] = Attribute.valueOf(name); - } - - ignored = DFUtils.readIntArray(in); - - // only CATEGORICAL attributes have values - values = new String[nbAttributes][]; - for (int attr = 0; attr < nbAttributes; attr++) { - if (attributes[attr].isCategorical()) { - 
values[attr] = WritableUtils.readStringArray(in); - } - } - - labelId = in.readInt(); - nbInstances = in.readInt(); - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(attributes.length); // nb attributes - for (Attribute attr : attributes) { - WritableUtils.writeString(out, attr.name()); - } - - DFUtils.writeArray(out, ignored); - - // only CATEGORICAL attributes have values - for (String[] vals : values) { - if (vals != null) { - WritableUtils.writeStringArray(out, vals); - } - } - - out.writeInt(labelId); - out.writeInt(nbInstances); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DescriptorException.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DescriptorException.java deleted file mode 100644 index f4419f0cf..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DescriptorException.java +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.data; - -/** - * Exception thrown when parsing a descriptor - */ -public class DescriptorException extends Exception { - public DescriptorException(String msg) { - super(msg); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DescriptorUtils.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DescriptorUtils.java deleted file mode 100644 index a2198b1ad..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/DescriptorUtils.java +++ /dev/null @@ -1,109 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
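Because Dataset implements Writable, persistence is a plain Hadoop round-trip. A sketch with a placeholder path and error handling omitted, assuming an existing Dataset named dataset:

    // Sketch: Dataset.load reads back exactly what write() produced.
    Configuration conf = new Configuration();
    Path infoPath = new Path("/tmp/example.info");   // placeholder
    FileSystem fs = infoPath.getFileSystem(conf);

    FSDataOutputStream out = fs.create(infoPath);
    dataset.write(out);                              // Writable serialization from above
    out.close();

    Dataset restored = Dataset.load(conf, infoPath); // delegates to readFields()
    assert restored.equals(dataset);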
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.data; - -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; -import org.apache.mahout.classifier.df.data.Dataset.Attribute; - -import java.util.List; -import java.util.Locale; - -/** - * Contains various methods that deal with descriptor strings - */ -public final class DescriptorUtils { - - private static final Splitter SPACE = Splitter.on(' ').omitEmptyStrings(); - - private DescriptorUtils() { } - - /** - * Parses a descriptor string and generates the corresponding array of Attributes - * - * @throws DescriptorException - * if a bad token is encountered - */ - public static Attribute[] parseDescriptor(CharSequence descriptor) throws DescriptorException { - List attributes = Lists.newArrayList(); - for (String token : SPACE.split(descriptor)) { - token = token.toUpperCase(Locale.ENGLISH); - if ("I".equals(token)) { - attributes.add(Attribute.IGNORED); - } else if ("N".equals(token)) { - attributes.add(Attribute.NUMERICAL); - } else if ("C".equals(token)) { - attributes.add(Attribute.CATEGORICAL); - } else if ("L".equals(token)) { - attributes.add(Attribute.LABEL); - } else { - throw new DescriptorException("Bad Token : " + token); - } - } - return attributes.toArray(new Attribute[attributes.size()]); - } - - /** - * Generates a valid descriptor string from a user-friendly representation.
- * for example "3 N I N N 2 C L 5 I" generates "N N N I N N C C L I I I I I".
- * this useful when describing datasets with a large number of attributes - * @throws DescriptorException - */ - public static String generateDescriptor(CharSequence description) throws DescriptorException { - return generateDescriptor(SPACE.split(description)); - } - - /** - * Generates a valid descriptor string from a list of tokens - * @throws DescriptorException - */ - public static String generateDescriptor(Iterable tokens) throws DescriptorException { - StringBuilder descriptor = new StringBuilder(); - - int multiplicator = 0; - - for (String token : tokens) { - try { - // try to parse an integer - int number = Integer.parseInt(token); - - if (number <= 0) { - throw new DescriptorException("Multiplicator (" + number + ") must be > 0"); - } - if (multiplicator > 0) { - throw new DescriptorException("A multiplicator cannot be followed by another multiplicator"); - } - - multiplicator = number; - } catch (NumberFormatException e) { - // token is not a number - if (multiplicator == 0) { - multiplicator = 1; - } - - for (int index = 0; index < multiplicator; index++) { - descriptor.append(token).append(' '); - } - - multiplicator = 0; - } - } - - return descriptor.toString().trim(); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/Instance.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/Instance.java deleted file mode 100644 index 3abf1249b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/Instance.java +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.data; - -import org.apache.mahout.math.Vector; - -/** - * Represents one data instance. 
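A short sketch exercising both DescriptorUtils helpers, reusing the descriptor from the Javadoc example (DescriptorException handling omitted):

    // Sketch: expand a compact descriptor, then parse it into attributes.
    String expanded = DescriptorUtils.generateDescriptor("3 N I N N 2 C L 5 I");
    // expanded is "N N N I N N C C L I I I I I"

    Dataset.Attribute[] attrs = DescriptorUtils.parseDescriptor(expanded);
    // attrs[0..2] NUMERICAL, attrs[3] IGNORED, attrs[4..5] NUMERICAL,
    // attrs[6..7] CATEGORICAL, attrs[8] LABEL, attrs[9..13] IGNORED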
- */ -public class Instance { - - /** attributes, except LABEL and IGNORED */ - private final Vector attrs; - - public Instance(Vector attrs) { - this.attrs = attrs; - } - - /** - * Return the attribute at the specified position - * - * @param index - * position of the attribute to retrieve - * @return value of the attribute - */ - public double get(int index) { - return attrs.getQuick(index); - } - - /** - * Set the value at the given index - * - * @param value - * a double value to set - */ - public void set(int index, double value) { - attrs.set(index, value); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof Instance)) { - return false; - } - - Instance instance = (Instance) obj; - - return /*id == instance.id &&*/ attrs.equals(instance.attrs); - - } - - @Override - public int hashCode() { - return /*id +*/ attrs.hashCode(); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/conditions/Condition.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/conditions/Condition.java deleted file mode 100644 index b19983495..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/conditions/Condition.java +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.df.data.conditions; - -import org.apache.mahout.classifier.df.data.Instance; - -/** - * Condition on Instance - */ -public abstract class Condition { - - /** - * Returns true is the checked instance matches the condition - * - * @param instance - * checked instance - * @return true is the checked instance matches the condition - */ - public abstract boolean isTrueFor(Instance instance); - - /** - * Condition that checks if the given attribute has a value "equal" to the given value - */ - public static Condition equals(int attr, double value) { - return new Equals(attr, value); - } - - /** - * Condition that checks if the given attribute has a value "lesser" than the given value - */ - public static Condition lesser(int attr, double value) { - return new Lesser(attr, value); - } - - /** - * Condition that checks if the given attribute has a value "greater or equal" than the given value - */ - public static Condition greaterOrEquals(int attr, double value) { - return new GreaterOrEquals(attr, value); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/conditions/Equals.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/conditions/Equals.java deleted file mode 100644 index 73f4ef6d3..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/conditions/Equals.java +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
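The factory methods above yield the predicates a decision node needs to route instances. A small illustration, where the attribute index and threshold are arbitrary and data is assumed to be a populated Data instance:

    // Sketch: complementary conditions split instances on attribute 2 at 10.0.
    Condition left = Condition.lesser(2, 10.0);           // attribute 2 < 10.0
    Condition right = Condition.greaterOrEquals(2, 10.0); // attribute 2 >= 10.0

    for (int i = 0; i < data.size(); i++) {
      Instance instance = data.get(i);
      // exactly one of the two predicates holds for any instance
      boolean goesLeft = left.isTrueFor(instance);
    }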
- */ - -package org.apache.mahout.classifier.df.data.conditions; - -import org.apache.mahout.classifier.df.data.Instance; - -/** - * True if a given attribute has a given value - */ -public class Equals extends Condition { - - private final int attr; - - private final double value; - - public Equals(int attr, double value) { - this.attr = attr; - this.value = value; - } - - @Override - public boolean isTrueFor(Instance instance) { - return instance.get(attr) == value; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/conditions/GreaterOrEquals.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/conditions/GreaterOrEquals.java deleted file mode 100644 index 2db3f2e5f..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/conditions/GreaterOrEquals.java +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.data.conditions; - -import org.apache.mahout.classifier.df.data.Instance; - -/** - * True if a given attribute has a value "greater or equal" than a given value - */ -public class GreaterOrEquals extends Condition { - - private final int attr; - - private final double value; - - public GreaterOrEquals(int attr, double value) { - this.attr = attr; - this.value = value; - } - - @Override - public boolean isTrueFor(Instance v) { - return v.get(attr) >= value; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/conditions/Lesser.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/conditions/Lesser.java deleted file mode 100644 index 4e49eb791..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/data/conditions/Lesser.java +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.data.conditions; - -import org.apache.mahout.classifier.df.data.Instance; - -/** - * True if a given attribute has a value "lesser" than a given value - */ -public class Lesser extends Condition { - - private final int attr; - - private final double value; - - public Lesser(int attr, double value) { - this.attr = attr; - this.value = value; - } - - @Override - public boolean isTrueFor(Instance instance) { - return instance.get(attr) < value; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/Builder.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/Builder.java deleted file mode 100644 index fb24e9b09..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/Builder.java +++ /dev/null @@ -1,341 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.mapreduce; - -import com.google.common.base.Preconditions; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.filecache.DistributedCache; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapreduce.InputSplit; -import org.apache.hadoop.mapreduce.Job; -import org.apache.mahout.classifier.df.DecisionForest; -import org.apache.mahout.classifier.df.builder.TreeBuilder; -import org.apache.mahout.classifier.df.data.Dataset; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.net.URI; -import java.util.Arrays; -import java.util.Comparator; - -/** - * Base class for Mapred DecisionForest builders. Takes care of storing the parameters common to the mapred - * implementations.
- * The child classes must implement at least :
- * <ul>
- * <li>void configureJob(Job) : to further configure the job before its launch; and</li>
- * <li>DecisionForest parseOutput(Job, PredictionCallback) : in order to convert the job outputs into a
- * DecisionForest and its corresponding oob predictions</li>
- * </ul>
- * - */ -public abstract class Builder { - - private static final Logger log = LoggerFactory.getLogger(Builder.class); - - private final TreeBuilder treeBuilder; - private final Path dataPath; - private final Path datasetPath; - private final Long seed; - private final Configuration conf; - private String outputDirName = "output"; - - protected Builder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed, Configuration conf) { - this.treeBuilder = treeBuilder; - this.dataPath = dataPath; - this.datasetPath = datasetPath; - this.seed = seed; - this.conf = new Configuration(conf); - } - - protected TreeBuilder getTreeBuilder() { - return treeBuilder; - } - - protected Path getDataPath() { - return dataPath; - } - - protected Long getSeed() { - return seed; - } - - /** - * Return the value of "mapred.map.tasks". - * - * @param conf - * configuration - * @return number of map tasks - */ - public static int getNumMaps(Configuration conf) { - return conf.getInt("mapred.map.tasks", -1); - } - - /** - * Used only for DEBUG purposes. if false, the mappers doesn't output anything, so the builder has nothing - * to process - * - * @param conf - * configuration - * @return true if the builder has to return output. false otherwise - */ - protected static boolean isOutput(Configuration conf) { - return conf.getBoolean("debug.mahout.rf.output", true); - } - - /** - * Returns the random seed - * - * @param conf - * configuration - * @return null if no seed is available - */ - public static Long getRandomSeed(Configuration conf) { - String seed = conf.get("mahout.rf.random.seed"); - if (seed == null) { - return null; - } - - return Long.valueOf(seed); - } - - /** - * Sets the random seed value - * - * @param conf - * configuration - * @param seed - * random seed - */ - private static void setRandomSeed(Configuration conf, long seed) { - conf.setLong("mahout.rf.random.seed", seed); - } - - public static TreeBuilder getTreeBuilder(Configuration conf) { - String string = conf.get("mahout.rf.treebuilder"); - if (string == null) { - return null; - } - - return StringUtils.fromString(string); - } - - private static void setTreeBuilder(Configuration conf, TreeBuilder treeBuilder) { - conf.set("mahout.rf.treebuilder", StringUtils.toString(treeBuilder)); - } - - /** - * Get the number of trees for the map-reduce job. - * - * @param conf - * configuration - * @return number of trees to build - */ - public static int getNbTrees(Configuration conf) { - return conf.getInt("mahout.rf.nbtrees", -1); - } - - /** - * Set the number of trees to grow for the map-reduce job - * - * @param conf - * configuration - * @param nbTrees - * number of trees to build - * @throws IllegalArgumentException - * if (nbTrees <= 0) - */ - public static void setNbTrees(Configuration conf, int nbTrees) { - Preconditions.checkArgument(nbTrees > 0, "nbTrees should be greater than 0"); - - conf.setInt("mahout.rf.nbtrees", nbTrees); - } - - /** - * Sets the Output directory name, will be creating in the working directory - * - * @param name - * output dir. name - */ - public void setOutputDirName(String name) { - outputDirName = name; - } - - /** - * Output Directory name - * - * @param conf - * configuration - * @return output dir. 
path (%WORKING_DIRECTORY%/OUTPUT_DIR_NAME%) - * @throws IOException - * if we cannot get the default FileSystem - */ - protected Path getOutputPath(Configuration conf) throws IOException { - // the output directory is accessed only by this class, so use the default - // file system - FileSystem fs = FileSystem.get(conf); - return new Path(fs.getWorkingDirectory(), outputDirName); - } - - /** - * Helper method. Get a path from the DistributedCache - * - * @param conf - * configuration - * @param index - * index of the path in the DistributedCache files - * @return path from the DistributedCache - * @throws IOException - * if no path is found - */ - public static Path getDistributedCacheFile(Configuration conf, int index) throws IOException { - URI[] files = DistributedCache.getCacheFiles(conf); - - if (files == null || files.length <= index) { - throw new IOException("path not found in the DistributedCache"); - } - - return new Path(files[index].getPath()); - } - - /** - * Helper method. Load a Dataset stored in the DistributedCache - * - * @param conf - * configuration - * @return loaded Dataset - * @throws IOException - * if we cannot retrieve the Dataset path from the DistributedCache, or the Dataset could not be - * loaded - */ - public static Dataset loadDataset(Configuration conf) throws IOException { - Path datasetPath = getDistributedCacheFile(conf, 0); - - return Dataset.load(conf, datasetPath); - } - - /** - * Used by the inheriting classes to configure the job - * - * - * @param job - * Hadoop's Job - * @throws IOException - * if anything goes wrong while configuring the job - */ - protected abstract void configureJob(Job job) throws IOException; - - /** - * Sequential implementation should override this method to simulate the job execution - * - * @param job - * Hadoop's job - * @return true is the job succeeded - */ - protected boolean runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException { - return job.waitForCompletion(true); - } - - /** - * Parse the output files to extract the trees and pass the predictions to the callback - * - * @param job - * Hadoop's job - * @return Built DecisionForest - * @throws IOException - * if anything goes wrong while parsing the output - */ - protected abstract DecisionForest parseOutput(Job job) throws IOException; - - public DecisionForest build(int nbTrees) - throws IOException, ClassNotFoundException, InterruptedException { - // int numTrees = getNbTrees(conf); - - Path outputPath = getOutputPath(conf); - FileSystem fs = outputPath.getFileSystem(conf); - - // check the output - if (fs.exists(outputPath)) { - throw new IOException("Output path already exists : " + outputPath); - } - - if (seed != null) { - setRandomSeed(conf, seed); - } - setNbTrees(conf, nbTrees); - setTreeBuilder(conf, treeBuilder); - - // put the dataset into the DistributedCache - DistributedCache.addCacheFile(datasetPath.toUri(), conf); - - Job job = new Job(conf, "decision forest builder"); - - log.debug("Configuring the job..."); - configureJob(job); - - log.debug("Running the job..."); - if (!runJob(job)) { - log.error("Job failed!"); - return null; - } - - if (isOutput(conf)) { - log.debug("Parsing the output..."); - DecisionForest forest = parseOutput(job); - HadoopUtil.delete(conf, outputPath); - return forest; - } - - return null; - } - - /** - * sort the splits into order based on size, so that the biggest go first.
- * This is the same code used by Hadoop's JobClient. - * - * @param splits - * input splits - */ - public static void sortSplits(InputSplit[] splits) { - Arrays.sort(splits, new Comparator() { - @Override - public int compare(InputSplit a, InputSplit b) { - try { - long left = a.getLength(); - long right = b.getLength(); - if (left == right) { - return 0; - } else if (left < right) { - return 1; - } else { - return -1; - } - } catch (IOException ie) { - throw new IllegalStateException("Problem getting input split size", ie); - } catch (InterruptedException ie) { - throw new IllegalStateException("Problem getting input split size", ie); - } - } - }); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/Classifier.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/Classifier.java deleted file mode 100644 index cbced7c55..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/Classifier.java +++ /dev/null @@ -1,240 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
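Note the inverted comparator in sortSplits: it returns 1 when the left split is shorter, so the array ends up in descending length order and the largest splits are scheduled first. A quick check of that contract (sketch; the splits would come from an InputFormat):

    // Sketch: after sortSplits, split lengths are non-increasing.
    static void checkDescending(InputSplit[] splits) throws IOException, InterruptedException {
      Builder.sortSplits(splits);
      for (int i = 1; i < splits.length; i++) {
        assert splits[i - 1].getLength() >= splits[i].getLength();
      }
    }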
- */ - -package org.apache.mahout.classifier.df.mapreduce; - -import com.google.common.io.Closeables; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.filecache.DistributedCache; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.DoubleWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.JobContext; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.FileSplit; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.mahout.classifier.df.DFUtils; -import org.apache.mahout.classifier.df.DecisionForest; -import org.apache.mahout.classifier.df.data.DataConverter; -import org.apache.mahout.classifier.df.data.Dataset; -import org.apache.mahout.classifier.df.data.Instance; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.net.URI; -import java.util.ArrayList; -import java.util.List; -import java.util.Random; - -/** - * Mapreduce implementation that classifies the Input data using a previousely built decision forest - */ -public class Classifier { - - private static final Logger log = LoggerFactory.getLogger(Classifier.class); - - private final Path forestPath; - private final Path inputPath; - private final Path datasetPath; - private final Configuration conf; - private final Path outputPath; // path that will containt the final output of the classifier - private final Path mappersOutputPath; // mappers will output here - private double[][] results; - - public double[][] getResults() { - return results; - } - - public Classifier(Path forestPath, - Path inputPath, - Path datasetPath, - Path outputPath, - Configuration conf) { - this.forestPath = forestPath; - this.inputPath = inputPath; - this.datasetPath = datasetPath; - this.outputPath = outputPath; - this.conf = conf; - - mappersOutputPath = new Path(outputPath, "mappers"); - } - - private void configureJob(Job job) throws IOException { - - job.setJarByClass(Classifier.class); - - FileInputFormat.setInputPaths(job, inputPath); - FileOutputFormat.setOutputPath(job, mappersOutputPath); - - job.setOutputKeyClass(DoubleWritable.class); - job.setOutputValueClass(Text.class); - - job.setMapperClass(CMapper.class); - job.setNumReduceTasks(0); // no reducers - - job.setInputFormatClass(CTextInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - - } - - public void run() throws IOException, ClassNotFoundException, InterruptedException { - FileSystem fs = FileSystem.get(conf); - - // check the output - if (fs.exists(outputPath)) { - throw new IOException("Output path already exists : " + outputPath); - } - - log.info("Adding the dataset to the DistributedCache"); - // put the dataset into the DistributedCache - DistributedCache.addCacheFile(datasetPath.toUri(), conf); - - log.info("Adding the decision forest to the DistributedCache"); - 
DistributedCache.addCacheFile(forestPath.toUri(), conf); - - Job job = new Job(conf, "decision forest classifier"); - - log.info("Configuring the job..."); - configureJob(job); - - log.info("Running the job..."); - if (!job.waitForCompletion(true)) { - throw new IllegalStateException("Job failed!"); - } - - parseOutput(job); - - HadoopUtil.delete(conf, mappersOutputPath); - } - - /** - * Extract the prediction for each mapper and write them in the corresponding output file. - * The name of the output file is based on the name of the corresponding input file. - * Will compute the ConfusionMatrix if necessary. - */ - private void parseOutput(JobContext job) throws IOException { - Configuration conf = job.getConfiguration(); - FileSystem fs = mappersOutputPath.getFileSystem(conf); - - Path[] outfiles = DFUtils.listOutputFiles(fs, mappersOutputPath); - - // read all the output - List resList = new ArrayList(); - for (Path path : outfiles) { - FSDataOutputStream ofile = null; - try { - for (Pair record : new SequenceFileIterable(path, true, conf)) { - double key = record.getFirst().get(); - String value = record.getSecond().toString(); - if (ofile == null) { - // this is the first value, it contains the name of the input file - ofile = fs.create(new Path(outputPath, value).suffix(".out")); - } else { - // The key contains the correct label of the data. The value contains a prediction - ofile.writeChars(value); // write the prediction - ofile.writeChar('\n'); - - resList.add(new double[]{key, Double.valueOf(value)}); - } - } - } finally { - Closeables.closeQuietly(ofile); - } - } - results = new double[resList.size()][2]; - resList.toArray(results); - } - - /** - * TextInputFormat that does not split the input files. This ensures that each input file is processed by one single - * mapper. - */ - private static class CTextInputFormat extends TextInputFormat { - @Override - protected boolean isSplitable(JobContext jobContext, Path path) { - return false; - } - } - - public static class CMapper extends Mapper { - - /** used to convert input values to data instances */ - private DataConverter converter; - private DecisionForest forest; - private final Random rng = RandomUtils.getRandom(); - private boolean first = true; - private final Text lvalue = new Text(); - private Dataset dataset; - private final DoubleWritable lkey = new DoubleWritable(); - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); //To change body of overridden methods use File | Settings | File Templates. 
- - Configuration conf = context.getConfiguration(); - - URI[] files = DistributedCache.getCacheFiles(conf); - - if (files == null || files.length < 2) { - throw new IOException("not enough paths in the DistributedCache"); - } - - dataset = Dataset.load(conf, new Path(files[0].getPath())); - - converter = new DataConverter(dataset); - - forest = DecisionForest.load(conf, new Path(files[1].getPath())); - if (forest == null) { - throw new InterruptedException("DecisionForest not found!"); - } - } - - @Override - protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { - if (first) { - FileSplit split = (FileSplit) context.getInputSplit(); - Path path = split.getPath(); // current split path - lvalue.set(path.getName()); - lkey.set(key.get()); - context.write(lkey, lvalue); - - first = false; - } - - String line = value.toString(); - if (!line.isEmpty()) { - Instance instance = converter.convert(line); - double prediction = forest.classify(dataset, rng, instance); - lkey.set(dataset.getLabel(instance)); - lvalue.set(Double.toString(prediction)); - context.write(lkey, lvalue); - } - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredMapper.java deleted file mode 100644 index 094ba9bef..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredMapper.java +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.mapreduce; - -import com.google.common.base.Preconditions; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.classifier.df.builder.TreeBuilder; -import org.apache.mahout.classifier.df.data.Dataset; - -import java.io.IOException; - -/** - * Base class for Mapred mappers. 
Loads common parameters from the job - */ -public class MapredMapper extends Mapper { - - private boolean noOutput; - - private TreeBuilder treeBuilder; - - private Dataset dataset; - - /** - * - * @return if false, the mapper does not estimate and output predictions - */ - protected boolean isNoOutput() { - return noOutput; - } - - protected TreeBuilder getTreeBuilder() { - return treeBuilder; - } - - protected Dataset getDataset() { - return dataset; - } - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - - Configuration conf = context.getConfiguration(); - - configure(!Builder.isOutput(conf), Builder.getTreeBuilder(conf), Builder - .loadDataset(conf)); - } - - /** - * Useful for testing - */ - protected void configure(boolean noOutput, TreeBuilder treeBuilder, Dataset dataset) { - Preconditions.checkArgument(treeBuilder != null, "TreeBuilder not found in the Job parameters"); - this.noOutput = noOutput; - this.treeBuilder = treeBuilder; - this.dataset = dataset; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredOutput.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredOutput.java deleted file mode 100644 index b177ce5f5..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredOutput.java +++ /dev/null @@ -1,119 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.mapreduce; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.classifier.df.DFUtils; -import org.apache.mahout.classifier.df.node.Node; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.Arrays; - -/** - * Used by various implementation to return the results of a build.
- * Contains a grown tree and its oob predictions. - */ -public class MapredOutput implements Writable, Cloneable { - - private Node tree; - - private int[] predictions; - - public MapredOutput() { - } - - public MapredOutput(Node tree, int[] predictions) { - this.tree = tree; - this.predictions = predictions; - } - - public MapredOutput(Node tree) { - this(tree, null); - } - - public Node getTree() { - return tree; - } - - int[] getPredictions() { - return predictions; - } - - @Override - public void readFields(DataInput in) throws IOException { - boolean readTree = in.readBoolean(); - if (readTree) { - tree = Node.read(in); - } - - boolean readPredictions = in.readBoolean(); - if (readPredictions) { - predictions = DFUtils.readIntArray(in); - } - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeBoolean(tree != null); - if (tree != null) { - tree.write(out); - } - - out.writeBoolean(predictions != null); - if (predictions != null) { - DFUtils.writeArray(out, predictions); - } - } - - @Override - public MapredOutput clone() { - return new MapredOutput(tree, predictions); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof MapredOutput)) { - return false; - } - - MapredOutput mo = (MapredOutput) obj; - - return ((tree == null && mo.getTree() == null) || (tree != null && tree.equals(mo.getTree()))) - && Arrays.equals(predictions, mo.getPredictions()); - } - - @Override - public int hashCode() { - int hashCode = tree == null ? 1 : tree.hashCode(); - if (predictions != null) { // guard: the tree-only constructor leaves predictions null - for (int prediction : predictions) { - hashCode = 31 * hashCode + prediction; - } - } - return hashCode; - } - - @Override - public String toString() { - return "{" + tree + " | " + Arrays.toString(predictions) + '}'; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemBuilder.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemBuilder.java deleted file mode 100644 index 573a1e088..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemBuilder.java +++ /dev/null @@ -1,113 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -package org.apache.mahout.classifier.df.mapreduce.inmem; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.filecache.DistributedCache; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.mahout.classifier.df.DFUtils; -import org.apache.mahout.classifier.df.DecisionForest; -import org.apache.mahout.classifier.df.builder.TreeBuilder; -import org.apache.mahout.classifier.df.mapreduce.Builder; -import org.apache.mahout.classifier.df.mapreduce.MapredOutput; -import org.apache.mahout.classifier.df.node.Node; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; - -import java.io.IOException; -import java.util.List; -import java.util.Map; - -/** - * MapReduce implementation where each mapper loads a full copy of the data in-memory. The forest trees are - * split across all the mappers - */ -public class InMemBuilder extends Builder { - - public InMemBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed, Configuration conf) { - super(treeBuilder, dataPath, datasetPath, seed, conf); - } - - public InMemBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath) { - this(treeBuilder, dataPath, datasetPath, null, new Configuration()); - } - - @Override - protected void configureJob(Job job) throws IOException { - Configuration conf = job.getConfiguration(); - - job.setJarByClass(InMemBuilder.class); - - FileOutputFormat.setOutputPath(job, getOutputPath(conf)); - - // put the data in the DistributedCache - DistributedCache.addCacheFile(getDataPath().toUri(), conf); - - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(MapredOutput.class); - - job.setMapperClass(InMemMapper.class); - job.setNumReduceTasks(0); // no reducers - - job.setInputFormatClass(InMemInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - - } - - @Override - protected DecisionForest parseOutput(Job job) throws IOException { - Configuration conf = job.getConfiguration(); - - Map<Integer,MapredOutput> output = Maps.newHashMap(); - - Path outputPath = getOutputPath(conf); - FileSystem fs = outputPath.getFileSystem(conf); - - Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath); - - // import the InMemOutputs - for (Path path : outfiles) { - for (Pair<IntWritable,MapredOutput> record : new SequenceFileIterable<IntWritable,MapredOutput>(path, conf)) { - output.put(record.getFirst().get(), record.getSecond()); - } - } - - return processOutput(output); - } - - /** - * Process the output, extracting the trees - */ - private static DecisionForest processOutput(Map<Integer,MapredOutput> output) { - List<Node> trees = Lists.newArrayList(); - - for (Map.Entry<Integer,MapredOutput> entry : output.entrySet()) { - MapredOutput value = entry.getValue(); - trees.add(value.getTree()); - } - - return new DecisionForest(trees); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemInputFormat.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemInputFormat.java deleted file mode 
100644 index ed38776c0..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemInputFormat.java +++ /dev/null @@ -1,284 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.mapreduce.inmem; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.InputFormat; -import org.apache.hadoop.mapreduce.InputSplit; -import org.apache.hadoop.mapreduce.JobContext; -import org.apache.hadoop.mapreduce.RecordReader; -import org.apache.hadoop.mapreduce.TaskAttemptContext; -import org.apache.mahout.classifier.df.mapreduce.Builder; -import org.apache.mahout.common.RandomUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.List; -import java.util.Locale; -import java.util.Random; - -/** - * Custom InputFormat that generates InputSplits given the desired number of trees.
- * Each input split contains a subset of the trees. - * The number of splits equals the number of requested map tasks. - */ -public class InMemInputFormat extends InputFormat<IntWritable,NullWritable> { - - private static final Logger log = LoggerFactory.getLogger(InMemInputFormat.class); - - private Random rng; - - private Long seed; - - private boolean isSingleSeed; - - /** - * Used for DEBUG purposes only. If true and a seed is available, all the mappers use the same seed, thus - * all the mappers should take the same time to build their trees. - */ - private static boolean isSingleSeed(Configuration conf) { - return conf.getBoolean("debug.mahout.rf.single.seed", false); - } - - @Override - public RecordReader<IntWritable,NullWritable> createRecordReader(InputSplit split, - TaskAttemptContext context) throws IOException, - InterruptedException { - Preconditions.checkArgument(split instanceof InMemInputSplit); - return new InMemRecordReader((InMemInputSplit) split); - } - - @Override - public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException { - Configuration conf = context.getConfiguration(); - int numSplits = conf.getInt("mapred.map.tasks", -1); - - return getSplits(conf, numSplits); - } - - public List<InputSplit> getSplits(Configuration conf, int numSplits) { - int nbTrees = Builder.getNbTrees(conf); - int splitSize = nbTrees / numSplits; - - seed = Builder.getRandomSeed(conf); - isSingleSeed = isSingleSeed(conf); - - if (rng != null && seed != null) { - log.warn("getSplits() was called more than once and the 'seed' is set, " - + "this can lead to non-repeatable behavior"); - } - - rng = seed == null || isSingleSeed ? null : RandomUtils.getRandom(seed); - - int id = 0; - - List<InputSplit> splits = Lists.newArrayListWithCapacity(numSplits); - - for (int index = 0; index < numSplits - 1; index++) { - splits.add(new InMemInputSplit(id, splitSize, nextSeed())); - id += splitSize; - } - - // take care of the remainder - splits.add(new InMemInputSplit(id, nbTrees - id, nextSeed())); - - return splits; - } - - /** - * @return the seed for the next InputSplit - */ - private Long nextSeed() { - if (seed == null) { - return null; - } else if (isSingleSeed) { - return seed; - } else { - return rng.nextLong(); - } - } - - public static class InMemRecordReader extends RecordReader<IntWritable,NullWritable> { - - private final InMemInputSplit split; - private int pos; - private IntWritable key; - private NullWritable value; - - public InMemRecordReader(InMemInputSplit split) { - this.split = split; - } - - @Override - public float getProgress() throws IOException { - return pos == 0 ? 
0.0f : (float) (pos - 1) / split.nbTrees; - } - - @Override - public IntWritable getCurrentKey() throws IOException, InterruptedException { - return key; - } - - @Override - public NullWritable getCurrentValue() throws IOException, InterruptedException { - return value; - } - - @Override - public void initialize(InputSplit arg0, TaskAttemptContext arg1) throws IOException, InterruptedException { - key = new IntWritable(); - value = NullWritable.get(); - } - - @Override - public boolean nextKeyValue() throws IOException, InterruptedException { - if (pos < split.nbTrees) { - key.set(split.firstId + pos); - pos++; - return true; - } else { - return false; - } - } - - @Override - public void close() throws IOException { - } - - } - - /** - * Custom InputSplit that indicates how many trees are built by each mapper - */ - public static class InMemInputSplit extends InputSplit implements Writable { - - private static final String[] NO_LOCATIONS = new String[0]; - - /** Id of the first tree of this split */ - private int firstId; - - private int nbTrees; - - private Long seed; - - public InMemInputSplit() { } - - public InMemInputSplit(int firstId, int nbTrees, Long seed) { - this.firstId = firstId; - this.nbTrees = nbTrees; - this.seed = seed; - } - - /** - * @return the Id of the first tree of this split - */ - public int getFirstId() { - return firstId; - } - - /** - * @return the number of trees - */ - public int getNbTrees() { - return nbTrees; - } - - /** - * @return the random seed or null if no seed is available - */ - public Long getSeed() { - return seed; - } - - @Override - public long getLength() throws IOException { - return nbTrees; - } - - @Override - public String[] getLocations() throws IOException { - return NO_LOCATIONS; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof InMemInputSplit)) { - return false; - } - - InMemInputSplit split = (InMemInputSplit) obj; - - if (firstId != split.firstId || nbTrees != split.nbTrees) { - return false; - } - if (seed == null) { - return split.seed == null; - } else { - return seed.equals(split.seed); - } - - } - - @Override - public int hashCode() { - return firstId + nbTrees + (seed == null ? 0 : seed.intValue()); - } - - @Override - public String toString() { - return String.format(Locale.ENGLISH, "[firstId:%d, nbTrees:%d, seed:%d]", firstId, nbTrees, seed); - } - - @Override - public void readFields(DataInput in) throws IOException { - firstId = in.readInt(); - nbTrees = in.readInt(); - boolean isSeed = in.readBoolean(); - seed = isSeed ? 
in.readLong() : null; - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(firstId); - out.writeInt(nbTrees); - out.writeBoolean(seed != null); - if (seed != null) { - out.writeLong(seed); - } - } - - public static InMemInputSplit read(DataInput in) throws IOException { - InMemInputSplit split = new InMemInputSplit(); - split.readFields(in); - return split; - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemMapper.java deleted file mode 100644 index 732e4a423..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemMapper.java +++ /dev/null @@ -1,105 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.mapreduce.inmem; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.mahout.classifier.df.Bagging; -import org.apache.mahout.classifier.df.data.Data; -import org.apache.mahout.classifier.df.data.DataLoader; -import org.apache.mahout.classifier.df.data.Dataset; -import org.apache.mahout.classifier.df.mapreduce.Builder; -import org.apache.mahout.classifier.df.mapreduce.MapredMapper; -import org.apache.mahout.classifier.df.mapreduce.MapredOutput; -import org.apache.mahout.classifier.df.mapreduce.inmem.InMemInputFormat.InMemInputSplit; -import org.apache.mahout.classifier.df.node.Node; -import org.apache.mahout.common.RandomUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Random; - -/** - * In-memory mapper that grows the trees using a full copy of the data loaded in-memory. The number of trees - * to grow is determined by the current InMemInputSplit. 
- */ -public class InMemMapper extends MapredMapper<IntWritable,NullWritable,IntWritable,MapredOutput> { - - private static final Logger log = LoggerFactory.getLogger(InMemMapper.class); - - private Bagging bagging; - - private Random rng; - - /** - * Load the training data - */ - private static Data loadData(Configuration conf, Dataset dataset) throws IOException { - Path dataPath = Builder.getDistributedCacheFile(conf, 1); - FileSystem fs = FileSystem.get(dataPath.toUri(), conf); - return DataLoader.loadData(dataset, fs, dataPath); - } - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - - Configuration conf = context.getConfiguration(); - - log.info("Loading the data..."); - Data data = loadData(conf, getDataset()); - log.info("Data loaded : {} instances", data.size()); - - bagging = new Bagging(getTreeBuilder(), data); - } - - @Override - protected void map(IntWritable key, - NullWritable value, - Context context) throws IOException, InterruptedException { - map(key, context); - } - - void map(IntWritable key, Context context) throws IOException, InterruptedException { - - initRandom((InMemInputSplit) context.getInputSplit()); - - log.debug("Building..."); - Node tree = bagging.build(rng); - - if (!isNoOutput()) { - log.debug("Outputting..."); - MapredOutput mrOut = new MapredOutput(tree); - - context.write(key, mrOut); - } - } - - void initRandom(InMemInputSplit split) { - if (rng == null) { // first execution of this mapper - Long seed = split.getSeed(); - log.debug("Initialising rng with seed : {}", seed); - rng = seed == null ? RandomUtils.getRandom() : RandomUtils.getRandom(seed); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/package-info.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/package-info.java deleted file mode 100644 index 7bcc8ad97..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/** - * In-memory mapreduce implementation of Random Decision Forests
- * - * Each mapper is responsible for growing a number of trees with a whole copy of the dataset loaded in memory; - * it uses the reference implementation's code to build each tree and estimate the oob error.
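A condensed sketch of that per-mapper loop, assuming the Mahout 0.7 classes removed in this diff (Bagging, TreeBuilder, Data, Node) are on the classpath; the class name InMemLoopSketch is invented for illustration:

    import java.util.List;
    import java.util.Random;
    import com.google.common.collect.Lists;
    import org.apache.mahout.classifier.df.Bagging;
    import org.apache.mahout.classifier.df.builder.TreeBuilder;
    import org.apache.mahout.classifier.df.data.Data;
    import org.apache.mahout.classifier.df.node.Node;

    final class InMemLoopSketch {
      /** Grows nbTrees trees the way InMemMapper does: one Bagging instance over
          the full in-memory data, one bootstrap sample and one tree per iteration. */
      static List<Node> growTrees(TreeBuilder treeBuilder, Data data, int nbTrees, Random rng) {
        Bagging bagging = new Bagging(treeBuilder, data);
        List<Node> trees = Lists.newArrayList();
        for (int i = 0; i < nbTrees; i++) {
          trees.add(bagging.build(rng)); // same call InMemMapper issues per assigned tree
        }
        return trees;
      }
    }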
- * - * The dataset is distributed to the slave nodes using the {@link org.apache.hadoop.filecache.DistributedCache}. - * A custom {@link org.apache.hadoop.mapreduce.InputFormat} - * ({@link org.apache.mahout.classifier.df.mapreduce.inmem.InMemInputFormat}) is configured with the - * desired number of trees and generates a number of {@link org.apache.hadoop.mapreduce.InputSplit}s - * equal to the configured number of maps.
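How many trees land in each split follows directly from getSplits() above; a self-contained sketch of the sizing rule (plain Java, class name invented):

    import java.util.Arrays;

    final class SplitSizingSketch {
      // Mirrors InMemInputFormat.getSplits(): every split gets nbTrees / numSplits
      // trees, and the last split also absorbs the remainder.
      static int[] splitSizes(int nbTrees, int numSplits) {
        int splitSize = nbTrees / numSplits;
        int[] sizes = new int[numSplits];
        Arrays.fill(sizes, 0, numSplits - 1, splitSize);
        sizes[numSplits - 1] = nbTrees - splitSize * (numSplits - 1);
        return sizes;
      }

      public static void main(String[] args) {
        // 100 trees over 8 map tasks: seven splits of 12 trees, one of 16
        System.out.println(Arrays.toString(splitSizes(100, 8)));
      }
    }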
- * - * There is no need for reducers: each map outputs the trees it built and, for each tree, the labels the - * tree predicted for each out-of-bag instance. This step has to be done in the mapper because only there - * do we know which instances are out-of-bag.
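The emitted records are MapredOutput values keyed by tree index. A small round-trip sketch of that Writable (RecordSketch is an invented name; Leaf is one of the node classes deleted later in this diff):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
    import org.apache.mahout.classifier.df.node.Leaf;

    final class RecordSketch {
      public static void main(String[] args) throws IOException {
        // a trivial one-leaf tree plus some oob predictions
        MapredOutput written = new MapredOutput(new Leaf(1.0), new int[] {0, 1, 1});
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        written.write(new DataOutputStream(bytes));

        MapredOutput read = new MapredOutput();
        read.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(written.equals(read)); // true: tree and predictions survive
      }
    }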
- * - * The Forest builder ({@link org.apache.mahout.classifier.df.mapreduce.inmem.InMemBuilder}) is responsible - * for configuring and launching the job. - * At the end of the job it parses the output files and builds the corresponding - * {@link org.apache.mahout.classifier.df.DecisionForest}.
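A hedged usage sketch of that flow: the paths are placeholders, and it assumes the Builder base class (deleted earlier in this diff) exposes the build(nbTrees) entry point it had in Mahout 0.7, with DefaultTreeBuilder as the stock TreeBuilder:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.mahout.classifier.df.DecisionForest;
    import org.apache.mahout.classifier.df.builder.DefaultTreeBuilder;
    import org.apache.mahout.classifier.df.mapreduce.inmem.InMemBuilder;

    final class InMemUsageSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path data = new Path("data.csv");      // placeholder training data
        Path dataset = new Path("data.info");  // placeholder dataset descriptor
        InMemBuilder builder = new InMemBuilder(new DefaultTreeBuilder(), data, dataset, 42L, conf);
        DecisionForest forest = builder.build(10); // run the job; parseOutput() assembles the forest
        System.out.println("built: " + forest);
      }
    }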
- */ -package org.apache.mahout.classifier.df.mapreduce.inmem; \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java deleted file mode 100644 index 5fb7d4d8b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java +++ /dev/null @@ -1,140 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.mapreduce.partial; - -import com.google.common.base.Preconditions; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.JobContext; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.mahout.classifier.df.DFUtils; -import org.apache.mahout.classifier.df.DecisionForest; -import org.apache.mahout.classifier.df.builder.TreeBuilder; -import org.apache.mahout.classifier.df.mapreduce.Builder; -import org.apache.mahout.classifier.df.mapreduce.MapredOutput; -import org.apache.mahout.classifier.df.node.Node; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; - -import java.io.IOException; -import java.util.Arrays; - -/** - * Builds a random forest using partial data. 
Each mapper uses only the data given by its InputSplit - */ -public class PartialBuilder extends Builder { - - public PartialBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed) { - this(treeBuilder, dataPath, datasetPath, seed, new Configuration()); - } - - public PartialBuilder(TreeBuilder treeBuilder, - Path dataPath, - Path datasetPath, - Long seed, - Configuration conf) { - super(treeBuilder, dataPath, datasetPath, seed, conf); - } - - @Override - protected void configureJob(Job job) throws IOException { - Configuration conf = job.getConfiguration(); - - job.setJarByClass(PartialBuilder.class); - - FileInputFormat.setInputPaths(job, getDataPath()); - FileOutputFormat.setOutputPath(job, getOutputPath(conf)); - - job.setOutputKeyClass(TreeID.class); - job.setOutputValueClass(MapredOutput.class); - - job.setMapperClass(Step1Mapper.class); - job.setNumReduceTasks(0); // no reducers - - job.setInputFormatClass(TextInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - } - - @Override - protected DecisionForest parseOutput(Job job) throws IOException { - Configuration conf = job.getConfiguration(); - - int numTrees = Builder.getNbTrees(conf); - - Path outputPath = getOutputPath(conf); - - TreeID[] keys = new TreeID[numTrees]; - Node[] trees = new Node[numTrees]; - - processOutput(job, outputPath, keys, trees); - - return new DecisionForest(Arrays.asList(trees)); - } - - /** - * Processes the output from the output path.
- * - * @param outputPath - * directory that contains the output of the job - * @param keys - * can be null - * @param trees - * can be null - * @throws java.io.IOException - */ - protected static void processOutput(JobContext job, - Path outputPath, - TreeID[] keys, - Node[] trees) throws IOException { - Preconditions.checkArgument(keys == null && trees == null || keys != null && trees != null, - "if keys is null, trees should also be null"); - Preconditions.checkArgument(keys == null || keys.length == trees.length, "keys.length != trees.length"); - - Configuration conf = job.getConfiguration(); - - FileSystem fs = outputPath.getFileSystem(conf); - - Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath); - - // read all the outputs - int index = 0; - for (Path path : outfiles) { - for (Pair record : new SequenceFileIterable(path, conf)) { - TreeID key = record.getFirst(); - MapredOutput value = record.getSecond(); - if (keys != null) { - keys[index] = key; - } - if (trees != null) { - trees[index] = value.getTree(); - } - index++; - } - } - - // make sure we got all the keys/values - if (keys != null && index != keys.length) { - throw new IllegalStateException("Some key/values are missing from the output"); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java deleted file mode 100644 index 4e269590d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java +++ /dev/null @@ -1,168 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.df.mapreduce.partial; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.mahout.classifier.df.Bagging; -import org.apache.mahout.classifier.df.data.Data; -import org.apache.mahout.classifier.df.data.DataConverter; -import org.apache.mahout.classifier.df.data.Instance; -import org.apache.mahout.classifier.df.mapreduce.Builder; -import org.apache.mahout.classifier.df.mapreduce.MapredMapper; -import org.apache.mahout.classifier.df.mapreduce.MapredOutput; -import org.apache.mahout.classifier.df.node.Node; -import org.apache.mahout.common.RandomUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.List; -import java.util.Random; - -/** - * First step of the Partial Data Builder. Builds the trees using the data available in the InputSplit. - * Predicts the oob classes for each tree in its growing partition (input split). - */ -public class Step1Mapper extends MapredMapper<LongWritable,Text,TreeID,MapredOutput> { - - private static final Logger log = LoggerFactory.getLogger(Step1Mapper.class); - - /** used to convert input values to data instances */ - private DataConverter converter; - - private Random rng; - - /** number of trees to be built by this mapper */ - private int nbTrees; - - /** id of the first tree */ - private int firstTreeId; - - /** mapper's partition */ - private int partition; - - /** will contain all the instances of this mapper's split */ - private final List<Instance> instances = Lists.newArrayList(); - - public int getFirstTreeId() { - return firstTreeId; - } - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - Configuration conf = context.getConfiguration(); - - configure(Builder.getRandomSeed(conf), conf.getInt("mapred.task.partition", -1), - Builder.getNumMaps(conf), Builder.getNbTrees(conf)); - } - - /** - * Useful when testing - * - * @param partition - * current mapper inputSplit partition - * @param numMapTasks - * number of running map tasks - * @param numTrees - * total number of trees in the forest - */ - protected void configure(Long seed, int partition, int numMapTasks, int numTrees) { - converter = new DataConverter(getDataset()); - - // prepare random-numbers generator - log.debug("seed : {}", seed); - if (seed == null) { - rng = RandomUtils.getRandom(); - } else { - rng = RandomUtils.getRandom(seed); - } - - // mapper's partition - Preconditions.checkArgument(partition >= 0, "Wrong partition ID"); - this.partition = partition; - - // compute number of trees to build - nbTrees = nbTrees(numMapTasks, numTrees, partition); - - // compute first tree id - firstTreeId = 0; - for (int p = 0; p < partition; p++) { - firstTreeId += nbTrees(numMapTasks, numTrees, p); - } - - log.debug("partition : {}", partition); - log.debug("nbTrees : {}", nbTrees); - log.debug("firstTreeId : {}", firstTreeId); - } - - /** - * Compute the number of trees for a given partition. The first partition (0) may be larger than the rest of the - * partitions because of the remainder. 
- * - * @param numMaps - * total number of maps (partitions) - * @param numTrees - * total number of trees to build - * @param partition - * partition to compute the number of trees for - */ - public static int nbTrees(int numMaps, int numTrees, int partition) { - int nbTrees = numTrees / numMaps; - if (partition == 0) { - nbTrees += numTrees - nbTrees * numMaps; - } - - return nbTrees; - } - - @Override - protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { - instances.add(converter.convert(value.toString())); - } - - @Override - protected void cleanup(Context context) throws IOException, InterruptedException { - // prepare the data - log.debug("partition: {} numInstances: {}", partition, instances.size()); - - Data data = new Data(getDataset(), instances); - Bagging bagging = new Bagging(getTreeBuilder(), data); - - TreeID key = new TreeID(); - - log.debug("Building {} trees", nbTrees); - for (int treeId = 0; treeId < nbTrees; treeId++) { - log.debug("Building tree number : {}", treeId); - - Node tree = bagging.build(rng); - - key.set(partition, firstTreeId + treeId); - - if (!isNoOutput()) { - MapredOutput emOut = new MapredOutput(tree); - context.write(key, emOut); - } - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/TreeID.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/TreeID.java deleted file mode 100644 index cea75a71d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/TreeID.java +++ /dev/null @@ -1,57 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.df.mapreduce.partial; - -import com.google.common.base.Preconditions; -import org.apache.hadoop.io.LongWritable; - -/** - * Indicates both the tree and the data partition used to grow the tree - */ -public class TreeID extends LongWritable implements Cloneable { - - public static final int MAX_TREEID = 100000; - - public TreeID() { } - - public TreeID(int partition, int treeId) { - Preconditions.checkArgument(partition >= 0, "partition < 0"); - Preconditions.checkArgument(treeId >= 0, "treeId < 0"); - set(partition, treeId); - } - - public void set(int partition, int treeId) { - set((long) partition * MAX_TREEID + treeId); - } - - /** - * Data partition (InputSplit's index) that was used to grow the tree - */ - public int partition() { - return (int) (get() / MAX_TREEID); - } - - public int treeId() { - return (int) (get() % MAX_TREEID); - } - - @Override - public TreeID clone() { - return new TreeID(partition(), treeId()); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/package-info.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/package-info.java deleted file mode 100644 index ae5bd7bff..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/package-info.java +++ /dev/null @@ -1,16 +0,0 @@ -/** - *
Partial-data mapreduce implementation of Random Decision Forests
- * - * The builder splits the data, using a FileInputSplit, among the mappers. - * Building the forest and estimating the oob error takes two job steps.
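The per-partition tree counts come from the nbTrees() rule in Step1Mapper above; a self-contained sketch of the arithmetic (plain Java, class name invented):

    final class TreeAllocationSketch {
      // Mirrors Step1Mapper.nbTrees(): each partition builds numTrees / numMaps
      // trees, and partition 0 additionally absorbs the remainder.
      static int nbTrees(int numMaps, int numTrees, int partition) {
        int n = numTrees / numMaps;
        if (partition == 0) {
          n += numTrees - n * numMaps;
        }
        return n;
      }

      public static void main(String[] args) {
        // 10 trees over 3 partitions: partition 0 builds 4, partitions 1 and 2 build 3
        for (int p = 0; p < 3; p++) {
          System.out.println("partition " + p + " -> " + nbTrees(3, 10, p) + " trees");
        }
      }
    }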
- * - * In the first step, each mapper is responsible for growing a number of trees with its partition's data, - * loading the data instances in its {@code map()} function, then building the trees in the {@code close()} method. It - * uses the reference implementation's code to build each tree and estimate the oob error.
- * - * The second step is needed when estimating the oob error. Each mapper loads all the trees that do not - * belong to its own partition (i.e. were not built using the partition's data) and uses them to classify the - * partition's data instances. The data instances are loaded in the {@code map()} method and the classification - * is performed in the {@code close()} method.
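Tree and partition bookkeeping across the two steps rides on the TreeID packing shown above; a minimal sketch of the encoding (class name invented, constant copied from TreeID):

    final class TreeIdSketch {
      static final int MAX_TREEID = 100000; // same constant as TreeID

      static long encode(int partition, int treeId) {
        return (long) partition * MAX_TREEID + treeId; // what TreeID.set() stores
      }

      static int partition(long id) {
        return (int) (id / MAX_TREEID);
      }

      static int treeId(long id) {
        return (int) (id % MAX_TREEID);
      }

      public static void main(String[] args) {
        long id = encode(3, 42);
        // prints: 300042 -> partition 3, tree 42
        System.out.println(id + " -> partition " + partition(id) + ", tree " + treeId(id));
      }
    }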
- */ -package org.apache.mahout.classifier.df.mapreduce.partial; \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/node/CategoricalNode.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/node/CategoricalNode.java deleted file mode 100644 index 5f2cc383d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/node/CategoricalNode.java +++ /dev/null @@ -1,134 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.node; - -import org.apache.commons.lang.ArrayUtils; -import org.apache.mahout.classifier.df.DFUtils; -import org.apache.mahout.classifier.df.data.Instance; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.Arrays; - -public class CategoricalNode extends Node { - private int attr; - - private double[] values; - - private Node[] childs; - - public CategoricalNode() { } - - public CategoricalNode(int attr, double[] values, Node[] childs) { - this.attr = attr; - this.values = values; - this.childs = childs; - } - - @Override - public double classify(Instance instance) { - int index = ArrayUtils.indexOf(values, instance.get(attr)); - if (index == -1) { - // value not available, we cannot predict - return -1; - } - return childs[index].classify(instance); - } - - @Override - public long maxDepth() { - long max = 0; - - for (Node child : childs) { - long depth = child.maxDepth(); - if (depth > max) { - max = depth; - } - } - - return 1 + max; - } - - @Override - public long nbNodes() { - long nbNodes = 1; - - for (Node child : childs) { - nbNodes += child.nbNodes(); - } - - return nbNodes; - } - - @Override - protected Type getType() { - return Type.CATEGORICAL; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof CategoricalNode)) { - return false; - } - - CategoricalNode node = (CategoricalNode) obj; - - return attr == node.attr && Arrays.equals(values, node.values) && Arrays.equals(childs, node.childs); - } - - @Override - public int hashCode() { - int hashCode = attr; - for (double value : values) { - hashCode = 31 * hashCode + (int) Double.doubleToLongBits(value); - } - for (Node node : childs) { - hashCode = 31 * hashCode + node.hashCode(); - } - return hashCode; - } - - @Override - protected String getString() { - StringBuilder buffer = new StringBuilder(); - - for (Node child : 
childs) { - buffer.append(child).append(','); - } - - return buffer.toString(); - } - - @Override - public void readFields(DataInput in) throws IOException { - attr = in.readInt(); - values = DFUtils.readDoubleArray(in); - childs = DFUtils.readNodeArray(in); - } - - @Override - protected void writeNode(DataOutput out) throws IOException { - out.writeInt(attr); - DFUtils.writeArray(out, values); - DFUtils.writeArray(out, childs); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/node/Leaf.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/node/Leaf.java deleted file mode 100644 index 285a134f9..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/node/Leaf.java +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.df.node; - -import org.apache.mahout.classifier.df.data.Instance; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Represents a Leaf node - */ -public class Leaf extends Node { - private static final double EPSILON = 1.0e-6; - - private double label; - - Leaf() { } - - public Leaf(double label) { - this.label = label; - } - - @Override - public double classify(Instance instance) { - return label; - } - - @Override - public long maxDepth() { - return 1; - } - - @Override - public long nbNodes() { - return 1; - } - - @Override - protected Type getType() { - return Type.LEAF; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof Leaf)) { - return false; - } - - Leaf leaf = (Leaf) obj; - - return Math.abs(label - leaf.label) < EPSILON; - } - - @Override - public int hashCode() { - long bits = Double.doubleToLongBits(label); - return (int)(bits ^ (bits >>> 32)); - } - - @Override - protected String getString() { - return ""; - } - - @Override - public void readFields(DataInput in) throws IOException { - label = in.readDouble(); - } - - @Override - protected void writeNode(DataOutput out) throws IOException { - out.writeDouble(label); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/node/Node.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/node/Node.java deleted file mode 100644 index cb6deb2a0..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/node/Node.java +++ /dev/null @@ -1,95 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.df.node; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.classifier.df.data.Instance; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Represents an abstract node of a decision tree - */ -public abstract class Node implements Writable { - - protected enum Type { - LEAF, - NUMERICAL, - CATEGORICAL - } - - /** - * predicts the label for the instance - * - * @return -1 if the label cannot be predicted - */ - public abstract double classify(Instance instance); - - /** - * @return the total number of nodes of the tree - */ - public abstract long nbNodes(); - - /** - * @return the maximum depth of the tree - */ - public abstract long maxDepth(); - - protected abstract Type getType(); - - public static Node read(DataInput in) throws IOException { - Type type = Type.values()[in.readInt()]; - Node node; - - switch (type) { - case LEAF: - node = new Leaf(); - break; - case NUMERICAL: - node = new NumericalNode(); - break; - case CATEGORICAL: - node = new CategoricalNode(); - break; - default: - throw new IllegalStateException("This implementation is not currently supported"); - } - - node.readFields(in); - - return node; - } - - @Override - public final String toString() { - return getType() + ":" + getString() + ';'; - } - - protected abstract String getString(); - - @Override - public final void write(DataOutput out) throws IOException { - out.writeInt(getType().ordinal()); - writeNode(out); - } - - protected abstract void writeNode(DataOutput out) throws IOException; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/node/NumericalNode.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/node/NumericalNode.java deleted file mode 100644 index 19b3e57dc..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/node/NumericalNode.java +++ /dev/null @@ -1,114 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.df.node; - -import org.apache.mahout.classifier.df.data.Instance; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Represents a node that splits using a numerical attribute - */ -public class NumericalNode extends Node { - /** numerical attribute to split for */ - private int attr; - - /** split value */ - private double split; - - /** child node when attribute's value < split value */ - private Node loChild; - - /** child node when attribute's value >= split value */ - private Node hiChild; - - public NumericalNode() { } - - public NumericalNode(int attr, double split, Node loChild, Node hiChild) { - this.attr = attr; - this.split = split; - this.loChild = loChild; - this.hiChild = hiChild; - } - - @Override - public double classify(Instance instance) { - if (instance.get(attr) < split) { - return loChild.classify(instance); - } else { - return hiChild.classify(instance); - } - } - - @Override - public long maxDepth() { - return 1 + Math.max(loChild.maxDepth(), hiChild.maxDepth()); - } - - @Override - public long nbNodes() { - return 1 + loChild.nbNodes() + hiChild.nbNodes(); - } - - @Override - protected Type getType() { - return Type.NUMERICAL; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof NumericalNode)) { - return false; - } - - NumericalNode node = (NumericalNode) obj; - - return attr == node.attr && split == node.split && loChild.equals(node.loChild) && hiChild.equals(node.hiChild); - } - - @Override - public int hashCode() { - return attr + (int) Double.doubleToLongBits(split) + loChild.hashCode() + hiChild.hashCode(); - } - - @Override - protected String getString() { - return loChild.toString() + ',' + hiChild.toString(); - } - - @Override - public void readFields(DataInput in) throws IOException { - attr = in.readInt(); - split = in.readDouble(); - loChild = Node.read(in); - hiChild = Node.read(in); - } - - @Override - protected void writeNode(DataOutput out) throws IOException { - out.writeInt(attr); - out.writeDouble(split); - loChild.write(out); - hiChild.write(out); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/ref/SequentialBuilder.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/ref/SequentialBuilder.java deleted file mode 100644 index 292b591e9..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/ref/SequentialBuilder.java +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.ref; - -import com.google.common.collect.Lists; -import org.apache.mahout.classifier.df.Bagging; -import org.apache.mahout.classifier.df.DecisionForest; -import org.apache.mahout.classifier.df.builder.TreeBuilder; -import org.apache.mahout.classifier.df.data.Data; -import org.apache.mahout.classifier.df.node.Node; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.List; -import java.util.Random; - -/** - * Builds a Random Decision Forest using a given TreeBuilder to grow the trees - */ -public class SequentialBuilder { - - private static final Logger log = LoggerFactory.getLogger(SequentialBuilder.class); - - private final Random rng; - - private final Bagging bagging; - - /** - * Constructor - * - * @param rng - * random-numbers generator - * @param treeBuilder - * tree builder - * @param data - * training data - */ - public SequentialBuilder(Random rng, TreeBuilder treeBuilder, Data data) { - this.rng = rng; - bagging = new Bagging(treeBuilder, data); - } - - public DecisionForest build(int nbTrees) { - List trees = Lists.newArrayList(); - - for (int treeId = 0; treeId < nbTrees; treeId++) { - trees.add(bagging.build(rng)); - logProgress(((float) treeId + 1) / nbTrees); - } - - return new DecisionForest(trees); - } - - private static void logProgress(float progress) { - int percent = (int) (progress * 100); - if (percent % 10 == 0) { - log.info("Building {}%", percent); - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/DefaultIgSplit.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/DefaultIgSplit.java deleted file mode 100644 index 38d300748..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/DefaultIgSplit.java +++ /dev/null @@ -1,117 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.df.split; - -import org.apache.mahout.classifier.df.data.Data; -import org.apache.mahout.classifier.df.data.conditions.Condition; - -import java.util.Arrays; - -/** - * Default, not optimized, implementation of IgSplit - */ -public class DefaultIgSplit extends IgSplit { - - /** used by entropy() */ - private int[] counts; - - @Override - public Split computeSplit(Data data, int attr) { - if (data.getDataset().isNumerical(attr)) { - double[] values = data.values(attr); - double bestIg = -1; - double bestSplit = 0.0; - - for (double value : values) { - double ig = numericalIg(data, attr, value); - if (ig > bestIg) { - bestIg = ig; - bestSplit = value; - } - } - - return new Split(attr, bestIg, bestSplit); - } else { - double ig = categoricalIg(data, attr); - - return new Split(attr, ig); - } - } - - /** - * Computes the Information Gain for a CATEGORICAL attribute - */ - double categoricalIg(Data data, int attr) { - double[] values = data.values(attr); - double hy = entropy(data); // H(Y) - double hyx = 0.0; // H(Y|X) - double invDataSize = 1.0 / data.size(); - - for (double value : values) { - Data subset = data.subset(Condition.equals(attr, value)); - hyx += subset.size() * invDataSize * entropy(subset); - } - - return hy - hyx; - } - - /** - * Computes the Information Gain for a NUMERICAL attribute given a splitting value - */ - double numericalIg(Data data, int attr, double split) { - double hy = entropy(data); - double invDataSize = 1.0 / data.size(); - - // LO subset - Data subset = data.subset(Condition.lesser(attr, split)); - hy -= subset.size() * invDataSize * entropy(subset); - - // HI subset - subset = data.subset(Condition.greaterOrEquals(attr, split)); - hy -= subset.size() * invDataSize * entropy(subset); - - return hy; - } - - /** - * Computes the Entropy - */ - protected double entropy(Data data) { - double invDataSize = 1.0 / data.size(); - - if (counts == null) { - counts = new int[data.getDataset().nblabels()]; - } - - Arrays.fill(counts, 0); - data.countLabels(counts); - - double entropy = 0.0; - for (int label = 0; label < data.getDataset().nblabels(); label++) { - int count = counts[label]; - if (count == 0) { - continue; // otherwise we get a NaN - } - double p = count * invDataSize; - entropy += -p * Math.log(p) / LOG2; - } - - return entropy; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/IgSplit.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/IgSplit.java deleted file mode 100644 index da37cf388..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/IgSplit.java +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.split; - -import org.apache.mahout.classifier.df.data.Data; - -/** - * Computes the best split using the Information Gain measure - */ -public abstract class IgSplit { - - static final double LOG2 = Math.log(2.0); - - /** - * Computes the best split for the given attribute - */ - public abstract Split computeSplit(Data data, int attr); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/OptIgSplit.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/OptIgSplit.java deleted file mode 100644 index 46396292d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/OptIgSplit.java +++ /dev/null @@ -1,178 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.split; - -import org.apache.commons.lang.ArrayUtils; -import org.apache.mahout.classifier.df.data.Data; -import org.apache.mahout.classifier.df.data.DataUtils; -import org.apache.mahout.classifier.df.data.Dataset; -import org.apache.mahout.classifier.df.data.Instance; - -import java.util.Arrays; - -/** - * Optimized implementation of IgSplit
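- * <p>A worked illustration (numbers invented for this note, not taken from the original source): the criterion is - * information gain, IG(Y;X) = H(Y) - H(Y|X). Four instances labelled {A, A, B, B} have H(Y) = 1 bit; a candidate - * split that sends both A's down one branch and both B's down the other gives H(Y|X) = 0, so IG = 1 bit, the best - * value achievable on this data.</p>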
- * This class can be used when the criterion variable is the categorical attribute. - */ -public class OptIgSplit extends IgSplit { - - private int[][] counts; - - private int[] countAll; - - private int[] countLess; - - @Override - public Split computeSplit(Data data, int attr) { - if (data.getDataset().isNumerical(attr)) { - return numericalSplit(data, attr); - } else { - return categoricalSplit(data, attr); - } - } - - /** - * Computes the split for a CATEGORICAL attribute - */ - private static Split categoricalSplit(Data data, int attr) { - double[] values = data.values(attr); - int[][] counts = new int[values.length][data.getDataset().nblabels()]; - int[] countAll = new int[data.getDataset().nblabels()]; - - Dataset dataset = data.getDataset(); - - // compute frequencies - for (int index = 0; index < data.size(); index++) { - Instance instance = data.get(index); - counts[ArrayUtils.indexOf(values, instance.get(attr))][(int) dataset.getLabel(instance)]++; - countAll[(int) dataset.getLabel(instance)]++; - } - - int size = data.size(); - double hy = entropy(countAll, size); // H(Y) - double hyx = 0.0; // H(Y|X) - double invDataSize = 1.0 / size; - - for (int index = 0; index < values.length; index++) { - size = DataUtils.sum(counts[index]); - hyx += size * invDataSize * entropy(counts[index], size); - } - - double ig = hy - hyx; - return new Split(attr, ig); - } - - /** - * Return the sorted list of distinct values for the given attribute - */ - private static double[] sortedValues(Data data, int attr) { - double[] values = data.values(attr); - Arrays.sort(values); - - return values; - } - - /** - * Instantiates the counting arrays - */ - void initCounts(Data data, double[] values) { - counts = new int[values.length][data.getDataset().nblabels()]; - countAll = new int[data.getDataset().nblabels()]; - countLess = new int[data.getDataset().nblabels()]; - } - - void computeFrequencies(Data data, int attr, double[] values) { - Dataset dataset = data.getDataset(); - - for (int index = 0; index < data.size(); index++) { - Instance instance = data.get(index); - counts[ArrayUtils.indexOf(values, instance.get(attr))][(int) dataset.getLabel(instance)]++; - countAll[(int) dataset.getLabel(instance)]++; - } - } - - /** - * Computes the best split for a NUMERICAL attribute - */ - Split numericalSplit(Data data, int attr) { - double[] values = sortedValues(data, attr); - - initCounts(data, values); - - computeFrequencies(data, attr, values); - - int size = data.size(); - double hy = entropy(countAll, size); - double invDataSize = 1.0 / size; - - int best = -1; - double bestIg = -1.0; - - // try each possible split value - for (int index = 0; index < values.length; index++) { - double ig = hy; - - // instance with attribute value < values[index] - size = DataUtils.sum(countLess); - ig -= size * invDataSize * entropy(countLess, size); - - // instance with attribute value >= values[index] - size = DataUtils.sum(countAll); - ig -= size * invDataSize * entropy(countAll, size); - - if (ig > bestIg) { - bestIg = ig; - best = index; - } - - DataUtils.add(countLess, counts[index]); - DataUtils.dec(countAll, counts[index]); - } - - if (best == -1) { - throw new IllegalStateException("no best split found !"); - } - return new Split(attr, bestIg, values[best]); - } - - /** - * Computes the Entropy - * - * @param counts counts[i] = numInstances with label i - * @param dataSize numInstances - */ - private static double entropy(int[] counts, int dataSize) { - if (dataSize == 0) { - return 0.0; - } - - double 
entropy = 0.0; - double invDataSize = 1.0 / dataSize; - - for (int count : counts) { - if (count == 0) { - continue; // otherwise we get a NaN - } - double p = count * invDataSize; - entropy += -p * Math.log(p) / LOG2; - } - - return entropy; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/RegressionSplit.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/RegressionSplit.java deleted file mode 100644 index 697c514f9..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/RegressionSplit.java +++ /dev/null @@ -1,177 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.split; - -import org.apache.mahout.classifier.df.data.Data; -import org.apache.mahout.classifier.df.data.Instance; - -import java.util.Arrays; - -/** - * Regression problem implementation of IgSplit. - * This class can be used when the criterion variable is the numerical attribute. 
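- * <p>A worked illustration (labels invented for this note): each branch contributes ss - s*s/n to the within-branch - * variance, for label sum s, squared-label sum ss and branch size n; the gain of a split is the total variance minus - * that sum. For labels {1, 1, 5, 5} the total variance is 52 - 12*12/4 = 16, and a split into {1, 1} and {5, 5} - * leaves zero within-branch variance, so the gain is 16.</p>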
- */ -public class RegressionSplit extends IgSplit { - - /** - * Comparator for Instance sort - */ - private static class InstanceComparator implements java.util.Comparator { - private final int attr; - - InstanceComparator(int attr) { - this.attr = attr; - } - - @Override - public int compare(Instance arg0, Instance arg1) { - return Double.compare(arg0.get(attr), arg1.get(attr)); - } - } - - @Override - public Split computeSplit(Data data, int attr) { - if (data.getDataset().isNumerical(attr)) { - return numericalSplit(data, attr); - } else { - return categoricalSplit(data, attr); - } - } - - /** - * Computes the split for a CATEGORICAL attribute - */ - private static Split categoricalSplit(Data data, int attr) { - double[] sums = new double[data.getDataset().nbValues(attr)]; - double[] sumSquared = new double[data.getDataset().nbValues(attr)]; - double[] counts = new double[data.getDataset().nbValues(attr)]; - double totalSum = 0; - double totalSumSquared = 0; - - // sum and sum of squares - for (int i = 0; i < data.size(); i++) { - Instance instance = data.get(i); - int value = (int) instance.get(attr); - double label = data.getDataset().getLabel(instance); - double square = label * label; - - sums[value] += label; - sumSquared[value] += square; - counts[value]++; - totalSum += label; - totalSumSquared += square; - } - - // computes the variance - double totalVar = totalSumSquared - (totalSum * totalSum) / data.size(); - double var = variance(sums, sumSquared, counts); - double ig = totalVar - var; - - return new Split(attr, ig); - } - - /** - * Computes the best split for a NUMERICAL attribute - */ - static Split numericalSplit(Data data, int attr) { - - // Instance sort - Instance[] instances = new Instance[data.size()]; - for (int i = 0; i < data.size(); i++) { - instances[i] = data.get(i); - } - Arrays.sort(instances, new InstanceComparator(attr)); - - // sum and sum of squares - double totalSum = 0.0; - double totalSumSquared = 0.0; - for (Instance instance : instances) { - double label = data.getDataset().getLabel(instance); - totalSum += label; - totalSumSquared += label * label; - } - double[] sums = new double[2]; - double[] curSums = new double[2]; - sums[1] = curSums[1] = totalSum; - double[] sumSquared = new double[2]; - double[] curSumSquared = new double[2]; - sumSquared[1] = curSumSquared[1] = totalSumSquared; - double[] counts = new double[2]; - double[] curCounts = new double[2]; - counts[1] = curCounts[1] = data.size(); - - // find the best split point - double curSplit = instances[0].get(attr); - double bestVal = Double.MAX_VALUE; - double split = Double.NaN; - for (Instance instance : instances) { - if (instance.get(attr) > curSplit) { - double curVal = variance(curSums, curSumSquared, curCounts); - if (curVal < bestVal) { - bestVal = curVal; - split = (instance.get(attr) + curSplit) / 2.0; - for (int j = 0; j < 2; j++) { - sums[j] = curSums[j]; - sumSquared[j] = curSumSquared[j]; - counts[j] = curCounts[j]; - } - } - } - - curSplit = instance.get(attr); - - double label = data.getDataset().getLabel(instance); - double square = label * label; - - curSums[0] += label; - curSumSquared[0] += square; - curCounts[0]++; - - curSums[1] -= label; - curSumSquared[1] -= square; - curCounts[1]--; - } - - // computes the variance - double totalVar = totalSumSquared - (totalSum * totalSum) / data.size(); - double var = variance(sums, sumSquared, counts); - double ig = totalVar - var; - - return new Split(attr, ig, split); - } - - /** - * Computes the variance - * - * @param s - * 
data - * @param ss - * squared data - * @param dataSize - * numInstances - */ - private static double variance(double[] s, double[] ss, double[] dataSize) { - double var = 0; - for (int i = 0; i < s.length; i++) { - if (dataSize[i] > 0) { - var += ss[i] - ((s[i] * s[i]) / dataSize[i]); - } - } - return var; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/Split.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/Split.java deleted file mode 100644 index bf079debf..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/split/Split.java +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.split; - -import java.util.Locale; - -/** - * Contains enough information to identify each split - */ -public final class Split { - - private final int attr; - private final double ig; - private final double split; - - public Split(int attr, double ig, double split) { - this.attr = attr; - this.ig = ig; - this.split = split; - } - - public Split(int attr, double ig) { - this(attr, ig, Double.NaN); - } - - /** - * @return attribute to split for - */ - public int getAttr() { - return attr; - } - - /** - * @return Information Gain of the split - */ - public double getIg() { - return ig; - } - - /** - * @return split value for NUMERICAL attributes - */ - public double getSplit() { - return split; - } - - @Override - public String toString() { - return String.format(Locale.ENGLISH, "attr: %d, ig: %f, split: %f", attr, ig, split); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java deleted file mode 100644 index cf7b1bfe9..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java +++ /dev/null @@ -1,148 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.tools; - -import com.google.common.collect.Lists; -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.mahout.classifier.df.DFUtils; -import org.apache.mahout.classifier.df.data.DataLoader; -import org.apache.mahout.classifier.df.data.Dataset; -import org.apache.mahout.classifier.df.data.DescriptorException; -import org.apache.mahout.classifier.df.data.DescriptorUtils; -import org.apache.mahout.common.CommandLineUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Collection; -import java.util.List; - -/** - * Generates a file descriptor for a given dataset - */ -public final class Describe { - - private static final Logger log = LoggerFactory.getLogger(Describe.class); - - private Describe() { - } - - public static void main(String[] args) throws IOException, DescriptorException { - - DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); - ArgumentBuilder abuilder = new ArgumentBuilder(); - GroupBuilder gbuilder = new GroupBuilder(); - - Option pathOpt = obuilder.withLongName("path").withShortName("p").withRequired(true).withArgument( - abuilder.withName("path").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create(); - - Option descriptorOpt = obuilder.withLongName("descriptor").withShortName("d").withRequired(true) - .withArgument(abuilder.withName("descriptor").withMinimum(1).create()).withDescription( - "data descriptor").create(); - - Option descPathOpt = obuilder.withLongName("file").withShortName("f").withRequired(true).withArgument( - abuilder.withName("file").withMinimum(1).withMaximum(1).create()).withDescription( - "Path to generated descriptor file").create(); - - Option regOpt = obuilder.withLongName("regression").withDescription("Regression Problem").withShortName("r") - .create(); - - Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") - .create(); - - Group group = gbuilder.withName("Options").withOption(pathOpt).withOption(descPathOpt).withOption( - descriptorOpt).withOption(regOpt).withOption(helpOpt).create(); - - try { - Parser parser = new Parser(); - parser.setGroup(group); - CommandLine cmdLine = parser.parse(args); - - if (cmdLine.hasOption(helpOpt)) { - CommandLineUtil.printHelp(group); - return; - } - - String dataPath = cmdLine.getValue(pathOpt).toString(); - String descPath = cmdLine.getValue(descPathOpt).toString(); - List 
descriptor = convert(cmdLine.getValues(descriptorOpt)); - boolean regression = cmdLine.hasOption(regOpt); - - log.debug("Data path : {}", dataPath); - log.debug("Descriptor path : {}", descPath); - log.debug("Descriptor : {}", descriptor); - log.debug("Regression : {}", regression); - - runTool(dataPath, descriptor, descPath, regression); - } catch (OptionException e) { - log.warn(e.toString()); - CommandLineUtil.printHelp(group); - } - } - - private static void runTool(String dataPath, Iterable description, String filePath, boolean regression) - throws DescriptorException, IOException { - log.info("Generating the descriptor..."); - String descriptor = DescriptorUtils.generateDescriptor(description); - - Path fPath = validateOutput(filePath); - - log.info("generating the dataset..."); - Dataset dataset = generateDataset(descriptor, dataPath, regression); - - log.info("storing the dataset description"); - DFUtils.storeWritable(new Configuration(), fPath, dataset); - } - - private static Dataset generateDataset(String descriptor, String dataPath, boolean regression) throws IOException, - DescriptorException { - Path path = new Path(dataPath); - FileSystem fs = path.getFileSystem(new Configuration()); - - return DataLoader.generateDataset(descriptor, regression, fs, path); - } - - private static Path validateOutput(String filePath) throws IOException { - Path path = new Path(filePath); - FileSystem fs = path.getFileSystem(new Configuration()); - if (fs.exists(path)) { - throw new IllegalStateException("Descriptor's file already exists"); - } - - return path; - } - - private static List convert(Collection values) { - List list = Lists.newArrayListWithCapacity(values.size()); - for (Object value : values) { - list.add(value.toString()); - } - return list; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/ForestVisualizer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/ForestVisualizer.java deleted file mode 100644 index dacc651a4..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/ForestVisualizer.java +++ /dev/null @@ -1,148 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.df.tools; - -import java.lang.reflect.Method; -import java.util.List; - -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.mahout.classifier.df.DecisionForest; -import org.apache.mahout.classifier.df.data.Dataset; -import org.apache.mahout.classifier.df.node.Node; -import org.apache.mahout.common.CommandLineUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This tool is to visualize the Decision Forest - */ -public final class ForestVisualizer { - - private static final Logger log = LoggerFactory.getLogger(ForestVisualizer.class); - - private ForestVisualizer() { - } - - public static String toString(DecisionForest forest, Dataset dataset, String[] attrNames) - throws Exception { - Method getTrees = forest.getClass().getDeclaredMethod("getTrees"); - getTrees.setAccessible(true); - @SuppressWarnings("unchecked") - List trees = (List) getTrees.invoke(forest); - - int cnt = 1; - StringBuilder buff = new StringBuilder(); - for (Node tree : trees) { - buff.append("Tree[" + cnt + "]:"); - buff.append(TreeVisualizer.toString(tree, dataset, attrNames)); - buff.append('\n'); - cnt++; - } - return buff.toString(); - } - - /** - * Decision Forest to String - * @param forestPath - * path to the Decision Forest - * @param datasetPath - * dataset path - * @param attrNames - * attribute names - */ - public static String toString(String forestPath, String datasetPath, String[] attrNames) - throws Exception { - Configuration conf = new Configuration(); - DecisionForest forest = DecisionForest.load(conf, new Path(forestPath)); - Dataset dataset = Dataset.load(conf, new Path(datasetPath)); - return toString(forest, dataset, attrNames); - } - - /** - * Print Decision Forest - * @param forestPath - * path to the Decision Forest - * @param datasetPath - * dataset path - * @param attrNames - * attribute names - */ - public static void print(String forestPath, String datasetPath, String[] attrNames) - throws Exception { - System.out.println(toString(forestPath, datasetPath, attrNames)); - } - - public static void main(String[] args) { - DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); - ArgumentBuilder abuilder = new ArgumentBuilder(); - GroupBuilder gbuilder = new GroupBuilder(); - - Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true) - .withArgument(abuilder.withName("dataset").withMinimum(1).withMaximum(1).create()) - .withDescription("Dataset path").create(); - - Option modelOpt = obuilder.withLongName("model").withShortName("m").withRequired(true) - .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create()) - .withDescription("Path to the Decision Forest").create(); - - Option attrNamesOpt = obuilder.withLongName("names").withShortName("n").withRequired(false) - .withArgument(abuilder.withName("names").withMinimum(1).create()) - .withDescription("Optional, Attribute names").create(); - - Option helpOpt = obuilder.withLongName("help").withShortName("h") - .withDescription("Print out help").create(); - - Group group = 
gbuilder.withName("Options").withOption(datasetOpt).withOption(modelOpt) - .withOption(attrNamesOpt).withOption(helpOpt).create(); - - try { - Parser parser = new Parser(); - parser.setGroup(group); - CommandLine cmdLine = parser.parse(args); - - if (cmdLine.hasOption("help")) { - CommandLineUtil.printHelp(group); - return; - } - - String datasetName = cmdLine.getValue(datasetOpt).toString(); - String modelName = cmdLine.getValue(modelOpt).toString(); - String[] attrNames = null; - if (cmdLine.hasOption(attrNamesOpt)) { - List names = (List) cmdLine.getValues(attrNamesOpt); - if (!names.isEmpty()) { - attrNames = new String[names.size()]; - names.toArray(attrNames); - } - } - - print(modelName, datasetName, attrNames); - } catch (Exception e) { - log.error("Exception", e); - CommandLineUtil.printHelp(group); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/Frequencies.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/Frequencies.java deleted file mode 100644 index 277f1baab..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/Frequencies.java +++ /dev/null @@ -1,128 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.tools; - -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configured; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.Tool; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.common.CommandLineUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Arrays; - -/** - * Compute the frequency distribution of the "class label"
- * This class can be used when the criterion variable is the categorical attribute. - */ -public final class Frequencies extends Configured implements Tool { - - private static final Logger log = LoggerFactory.getLogger(Frequencies.class); - - private Frequencies() { } - - @Override - public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException { - - DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); - ArgumentBuilder abuilder = new ArgumentBuilder(); - GroupBuilder gbuilder = new GroupBuilder(); - - Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true).withArgument( - abuilder.withName("path").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create(); - - Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true).withArgument( - abuilder.withName("path").withMinimum(1).create()).withDescription("dataset path").create(); - - Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") - .create(); - - Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(datasetOpt).withOption(helpOpt) - .create(); - - try { - Parser parser = new Parser(); - parser.setGroup(group); - CommandLine cmdLine = parser.parse(args); - - if (cmdLine.hasOption(helpOpt)) { - CommandLineUtil.printHelp(group); - return 0; - } - - String dataPath = cmdLine.getValue(dataOpt).toString(); - String datasetPath = cmdLine.getValue(datasetOpt).toString(); - - log.debug("Data path : {}", dataPath); - log.debug("Dataset path : {}", datasetPath); - - runTool(dataPath, datasetPath); - } catch (OptionException e) { - log.warn(e.toString(), e); - CommandLineUtil.printHelp(group); - } - - return 0; - } - - private void runTool(String data, String dataset) throws IOException, - ClassNotFoundException, - InterruptedException { - - FileSystem fs = FileSystem.get(getConf()); - Path workingDir = fs.getWorkingDirectory(); - - Path dataPath = new Path(data); - Path datasetPath = new Path(dataset); - - log.info("Computing the frequencies..."); - FrequenciesJob job = new FrequenciesJob(new Path(workingDir, "output"), dataPath, datasetPath); - - int[][] counts = job.run(getConf()); - - // compute the partitions' sizes - int numPartitions = counts.length; - // int[] sizes = new int[numPartitions]; // TODO this isn't used? 
- // for (int p = 0; p < numPartitions; p++) { - // sizes[p] = DataUtils.sum(counts[p]); - // } - - // outputing the frequencies - log.info("counts[partition][class]"); - for (int p = 0; p < numPartitions; p++) { - log.info(Arrays.toString(counts[p])); - } - } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new Frequencies(), args); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/FrequenciesJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/FrequenciesJob.java deleted file mode 100644 index d02d97418..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/FrequenciesJob.java +++ /dev/null @@ -1,296 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.tools; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.filecache.DistributedCache; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.JobContext; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.mahout.classifier.df.DFUtils; -import org.apache.mahout.classifier.df.data.DataConverter; -import org.apache.mahout.classifier.df.data.Dataset; -import org.apache.mahout.classifier.df.data.Instance; -import org.apache.mahout.classifier.df.mapreduce.Builder; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.net.URI; -import java.util.Arrays; - -/** - * Temporary class used to compute the frequency distribution of the "class attribute".
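- * <p>Programmatic use, a sketch only (paths are placeholders): - * <pre> - * Configuration conf = new Configuration(); - * FrequenciesJob job = new FrequenciesJob(new Path("/tmp/freq"), new Path("/data/data.csv"), new Path("/data/data.info")); - * int[][] counts = job.run(conf); // counts[partition][label] - * </pre></p>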
- * This class can be used when the criterion variable is the categorical attribute. - */ -public class FrequenciesJob { - - private static final Logger log = LoggerFactory.getLogger(FrequenciesJob.class); - - /** directory that will hold this job's output */ - private final Path outputPath; - - /** file that contains the serialized dataset */ - private final Path datasetPath; - - /** directory that contains the data used in the first step */ - private final Path dataPath; - - /** - * @param base - * base directory - * @param dataPath - * data used in the first step - * @param datasetPath - * path to the serialized dataset - */ - public FrequenciesJob(Path base, Path dataPath, Path datasetPath) { - this.outputPath = new Path(base, "frequencies.output"); - this.dataPath = dataPath; - this.datasetPath = datasetPath; - } - - /** - * @return counts[partition][label] = num tuples from 'partition' with class == label - */ - public int[][] run(Configuration conf) throws IOException, ClassNotFoundException, InterruptedException { - - // check the output - FileSystem fs = outputPath.getFileSystem(conf); - if (fs.exists(outputPath)) { - throw new IOException("Output path already exists : " + outputPath); - } - - // put the dataset into the DistributedCache - URI[] files = {datasetPath.toUri()}; - DistributedCache.setCacheFiles(files, conf); - - Job job = new Job(conf); - job.setJarByClass(FrequenciesJob.class); - - FileInputFormat.setInputPaths(job, dataPath); - FileOutputFormat.setOutputPath(job, outputPath); - - job.setMapOutputKeyClass(LongWritable.class); - job.setMapOutputValueClass(IntWritable.class); - job.setOutputKeyClass(LongWritable.class); - job.setOutputValueClass(Frequencies.class); - - job.setMapperClass(FrequenciesMapper.class); - job.setReducerClass(FrequenciesReducer.class); - - job.setInputFormatClass(TextInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - - // run the job - boolean succeeded = job.waitForCompletion(true); - if (!succeeded) { - throw new IllegalStateException("Job failed!"); - } - - int[][] counts = parseOutput(job); - - HadoopUtil.delete(conf, outputPath); - - return counts; - } - - /** - * Extracts the output and processes it - * - * @return counts[partition][label] = num tuples from 'partition' with class == label - */ - int[][] parseOutput(JobContext job) throws IOException { - Configuration conf = job.getConfiguration(); - - int numMaps = conf.getInt("mapred.map.tasks", -1); - log.info("mapred.map.tasks = {}", numMaps); - - FileSystem fs = outputPath.getFileSystem(conf); - - Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath); - - Frequencies[] values = new Frequencies[numMaps]; - - // read all the outputs - int index = 0; - for (Path path : outfiles) { - for (Frequencies value : new SequenceFileValueIterable<Frequencies>(path, conf)) { - values[index++] = value; - } - } - - if (index < numMaps) { - throw new IllegalStateException("number of output Frequencies (" + index - + ") is less than the number of mappers!"); - } - - // sort the frequencies using the firstIds - Arrays.sort(values); - return Frequencies.extractCounts(values); - } - - /** - * Outputs the first key and the label of each tuple - * - */ - private static class FrequenciesMapper extends Mapper<LongWritable, Text, LongWritable, IntWritable> { - - private LongWritable firstId; - - private DataConverter converter; - private Dataset dataset; - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - Configuration conf = context.getConfiguration(); - - dataset = Builder.loadDataset(conf); - setup(dataset); - } - - /** - * Useful when
testing - */ - void setup(Dataset dataset) { - converter = new DataConverter(dataset); - } - - @Override - protected void map(LongWritable key, Text value, Context context) throws IOException, - InterruptedException { - if (firstId == null) { - firstId = new LongWritable(key.get()); - } - - Instance instance = converter.convert(value.toString()); - - context.write(firstId, new IntWritable((int) dataset.getLabel(instance))); - } - - } - - private static class FrequenciesReducer extends Reducer { - - private int nblabels; - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - Configuration conf = context.getConfiguration(); - Dataset dataset = Builder.loadDataset(conf); - setup(dataset.nblabels()); - } - - /** - * Useful when testing - */ - void setup(int nblabels) { - this.nblabels = nblabels; - } - - @Override - protected void reduce(LongWritable key, Iterable values, Context context) - throws IOException, InterruptedException { - int[] counts = new int[nblabels]; - for (IntWritable value : values) { - counts[value.get()]++; - } - - context.write(key, new Frequencies(key.get(), counts)); - } - } - - /** - * Output of the job - * - */ - private static class Frequencies implements Writable, Comparable, Cloneable { - - /** first key of the partition used to sort the partitions */ - private long firstId; - - /** counts[c] = num tuples from the partition with label == c */ - private int[] counts; - - Frequencies() { } - - Frequencies(long firstId, int[] counts) { - this.firstId = firstId; - this.counts = Arrays.copyOf(counts, counts.length); - } - - @Override - public void readFields(DataInput in) throws IOException { - firstId = in.readLong(); - counts = DFUtils.readIntArray(in); - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeLong(firstId); - DFUtils.writeArray(out, counts); - } - - @Override - public boolean equals(Object other) { - return other instanceof Frequencies && firstId == ((Frequencies) other).firstId; - } - - @Override - public int hashCode() { - return (int) firstId; - } - - @Override - protected Frequencies clone() { - return new Frequencies(firstId, counts); - } - - @Override - public int compareTo(Frequencies obj) { - if (firstId < obj.firstId) { - return -1; - } else if (firstId > obj.firstId) { - return 1; - } else { - return 0; - } - } - - public static int[][] extractCounts(Frequencies[] partitions) { - int[][] counts = new int[partitions.length][]; - for (int p = 0; p < partitions.length; p++) { - counts[p] = partitions[p].counts; - } - return counts; - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java deleted file mode 100644 index ead6943d2..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java +++ /dev/null @@ -1,241 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.tools; - -import java.lang.reflect.Field; -import java.text.DecimalFormat; -import java.util.HashMap; -import java.util.Map; - -import org.apache.commons.lang.ArrayUtils; -import org.apache.mahout.classifier.df.data.Data; -import org.apache.mahout.classifier.df.data.Dataset; -import org.apache.mahout.classifier.df.data.Instance; -import org.apache.mahout.classifier.df.node.CategoricalNode; -import org.apache.mahout.classifier.df.node.Leaf; -import org.apache.mahout.classifier.df.node.Node; -import org.apache.mahout.classifier.df.node.NumericalNode; - -/** - * This tool is to visualize the Decision tree - */ -public final class TreeVisualizer { - - private TreeVisualizer() { - } - - private static String doubleToString(double value) { - DecimalFormat df = new DecimalFormat("0.##"); - return df.format(value); - } - - private static String toStringNode(Node node, Dataset dataset, String[] attrNames, - Map fields, int layer) throws IllegalAccessException { - StringBuilder buff = new StringBuilder(); - - if (node instanceof CategoricalNode) { - CategoricalNode cnode = (CategoricalNode) node; - int attr = (Integer) fields.get("CategoricalNode.attr").get(cnode); - double[] values = (double[]) fields.get("CategoricalNode.values").get(cnode); - Node[] childs = (Node[]) fields.get("CategoricalNode.childs").get(cnode); - String[][] attrValues = (String[][]) fields.get("Dataset.values").get(dataset); - for (int i = 0; i < childs.length; i++) { - buff.append('\n'); - for (int j = 0; j < layer; j++) { - buff.append("| "); - } - buff.append((attrNames == null ? attr : attrNames[attr]) + " = " + attrValues[attr][i]); - int index = ArrayUtils.indexOf(values, i); - if (index >= 0) { - buff.append(toStringNode(childs[index], dataset, attrNames, fields, layer + 1)); - } - } - } else if (node instanceof NumericalNode) { - NumericalNode nnode = (NumericalNode) node; - int attr = (Integer) fields.get("NumericalNode.attr").get(nnode); - double split = (Double) fields.get("NumericalNode.split").get(nnode); - Node loChild = (Node) fields.get("NumericalNode.loChild").get(nnode); - Node hiChild = (Node) fields.get("NumericalNode.hiChild").get(nnode); - buff.append('\n'); - for (int j = 0; j < layer; j++) { - buff.append("| "); - } - buff.append((attrNames == null ? attr : attrNames[attr]) + " < " + doubleToString(split)); - buff.append(toStringNode(loChild, dataset, attrNames, fields, layer + 1)); - buff.append('\n'); - for (int j = 0; j < layer; j++) { - buff.append("| "); - } - buff.append((attrNames == null ? 
attr : attrNames[attr]) + " >= " + doubleToString(split)); - buff.append(toStringNode(hiChild, dataset, attrNames, fields, layer + 1)); - } else if (node instanceof Leaf) { - Leaf leaf = (Leaf) node; - double label = (Double) fields.get("Leaf.label").get(leaf); - if (dataset.isNumerical(dataset.getLabelId())) { - buff.append(" : ").append(doubleToString(label)); - } else { - buff.append(" : ").append(dataset.getLabelString((int) label)); - } - } - - return buff.toString(); - } - - private static Map getReflectMap() throws Exception { - Map fields = new HashMap(); - - Field m = CategoricalNode.class.getDeclaredField("attr"); - m.setAccessible(true); - fields.put("CategoricalNode.attr", m); - m = CategoricalNode.class.getDeclaredField("values"); - m.setAccessible(true); - fields.put("CategoricalNode.values", m); - m = CategoricalNode.class.getDeclaredField("childs"); - m.setAccessible(true); - fields.put("CategoricalNode.childs", m); - m = NumericalNode.class.getDeclaredField("attr"); - m.setAccessible(true); - fields.put("NumericalNode.attr", m); - m = NumericalNode.class.getDeclaredField("split"); - m.setAccessible(true); - fields.put("NumericalNode.split", m); - m = NumericalNode.class.getDeclaredField("loChild"); - m.setAccessible(true); - fields.put("NumericalNode.loChild", m); - m = NumericalNode.class.getDeclaredField("hiChild"); - m.setAccessible(true); - fields.put("NumericalNode.hiChild", m); - m = Leaf.class.getDeclaredField("label"); - m.setAccessible(true); - fields.put("Leaf.label", m); - m = Dataset.class.getDeclaredField("values"); - m.setAccessible(true); - fields.put("Dataset.values", m); - - return fields; - } - - /** - * Decision tree to String - * @param tree - * Node of tree - * @param dataset - * @param attrNames - * attribute names - */ - public static String toString(Node tree, Dataset dataset, String[] attrNames) - throws Exception { - return toStringNode(tree, dataset, attrNames, getReflectMap(), 0); - } - - /** - * Print Decision tree - * @param tree - * Node of tree - * @param dataset - * @param attrNames - * attribute names - */ - public static void print(Node tree, Dataset dataset, String[] attrNames) throws Exception { - System.out.println(toString(tree, dataset, attrNames)); - } - - private static String toStringPredict(Node node, Instance instance, Dataset dataset, - String[] attrNames, Map fields) throws IllegalAccessException { - StringBuilder buff = new StringBuilder(); - - if (node instanceof CategoricalNode) { - CategoricalNode cnode = (CategoricalNode) node; - int attr = (Integer) fields.get("CategoricalNode.attr").get(cnode); - double[] values = (double[]) fields.get("CategoricalNode.values").get(cnode); - Node[] childs = (Node[]) fields.get("CategoricalNode.childs").get(cnode); - String[][] attrValues = (String[][]) fields.get("Dataset.values").get(dataset); - - int index = ArrayUtils.indexOf(values, instance.get(attr)); - if (index >= 0) { - buff.append((attrNames == null ? 
attr : attrNames[attr]) + " = " - + attrValues[attr][(int) instance.get(attr)]); - buff.append(" -> "); - buff.append(toStringPredict(childs[index], instance, dataset, attrNames, fields)); - } - } else if (node instanceof NumericalNode) { - NumericalNode nnode = (NumericalNode) node; - int attr = (Integer) fields.get("NumericalNode.attr").get(nnode); - double split = (Double) fields.get("NumericalNode.split").get(nnode); - Node loChild = (Node) fields.get("NumericalNode.loChild").get(nnode); - Node hiChild = (Node) fields.get("NumericalNode.hiChild").get(nnode); - - if (instance.get(attr) < split) { - buff.append("(" + (attrNames == null ? attr : attrNames[attr]) + " = " - + doubleToString(instance.get(attr)) + ") < " + doubleToString(split)); - buff.append(" -> "); - buff.append(toStringPredict(loChild, instance, dataset, attrNames, fields)); - } else { - buff.append("(" + (attrNames == null ? attr : attrNames[attr]) + " = " - + doubleToString(instance.get(attr)) + ") >= " + doubleToString(split)); - buff.append(" -> "); - buff.append(toStringPredict(hiChild, instance, dataset, attrNames, fields)); - } - } else if (node instanceof Leaf) { - Leaf leaf = (Leaf) node; - double label = (Double) fields.get("Leaf.label").get(leaf); - if (dataset.isNumerical(dataset.getLabelId())) { - buff.append(doubleToString(label)); - } else { - buff.append(dataset.getLabelString((int) label)); - } - } - - return buff.toString(); - } - - /** - * Predict trace to String - * @param tree - * Node of tree - * @param data - * @param attrNames - * attribute names - */ - public static String[] predictTrace(Node tree, Data data, String[] attrNames) - throws Exception { - Map reflectMap = getReflectMap(); - String[] prediction = new String[data.size()]; - for (int i = 0; i < data.size(); i++) { - prediction[i] = toStringPredict(tree, data.get(i), data.getDataset(), attrNames, reflectMap); - } - return prediction; - } - - /** - * Print predict trace - * @param tree - * Node of tree - * @param data - * @param attrNames - * attribute names - */ - public static void predictTracePrint(Node tree, Data data, String[] attrNames) - throws Exception { - Map reflectMap = getReflectMap(); - for (int i = 0; i < data.size(); i++) { - System.out.println(toStringPredict(tree, data.get(i), data.getDataset(), attrNames, - reflectMap)); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java deleted file mode 100644 index e2daf4c02..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java +++ /dev/null @@ -1,213 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.df.tools; - -import com.google.common.base.Preconditions; -import com.google.common.io.Closeables; -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.fs.Path; -import org.apache.mahout.classifier.df.data.DataConverter; -import org.apache.mahout.classifier.df.data.Dataset; -import org.apache.mahout.classifier.df.data.Instance; -import org.apache.mahout.common.CommandLineUtil; -import org.apache.mahout.common.RandomUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.util.Locale; -import java.util.Random; -import java.util.Scanner; - -/** - * This tool is used to uniformly distribute the class of all the tuples of the dataset over a given number of - * partitions.
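- * <p>Illustration (example invented for this note): with numpartitions = 2, successive tuples of the same class are - * dealt to alternating partition files; each class keeps its own cursor, started at a random partition, so every - * partition ends up holding roughly half of each class.</p>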
- * This class can be used when the criterion variable is the categorical attribute. - */ -public final class UDistrib { - - private static final Logger log = LoggerFactory.getLogger(UDistrib.class); - - private UDistrib() { - } - - /** - * Launch the uniform distribution tool. Requires the following command line arguments:
- * - * data : data path dataset : dataset path numpartitions : num partitions output : output path - * - * @throws java.io.IOException - */ - public static void main(String[] args) throws IOException { - - DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); - ArgumentBuilder abuilder = new ArgumentBuilder(); - GroupBuilder gbuilder = new GroupBuilder(); - - Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true).withArgument( - abuilder.withName("data").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create(); - - Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true).withArgument( - abuilder.withName("dataset").withMinimum(1).create()).withDescription("Dataset path").create(); - - Option outputOpt = obuilder.withLongName("output").withShortName("o").withRequired(true).withArgument( - abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription( - "Path to generated files").create(); - - Option partitionsOpt = obuilder.withLongName("numpartitions").withShortName("p").withRequired(true) - .withArgument(abuilder.withName("numparts").withMinimum(1).withMinimum(1).create()).withDescription( - "Number of partitions to create").create(); - Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") - .create(); - - Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(outputOpt).withOption( - datasetOpt).withOption(partitionsOpt).withOption(helpOpt).create(); - - try { - Parser parser = new Parser(); - parser.setGroup(group); - CommandLine cmdLine = parser.parse(args); - - if (cmdLine.hasOption(helpOpt)) { - CommandLineUtil.printHelp(group); - return; - } - - String data = cmdLine.getValue(dataOpt).toString(); - String dataset = cmdLine.getValue(datasetOpt).toString(); - int numPartitions = Integer.parseInt(cmdLine.getValue(partitionsOpt).toString()); - String output = cmdLine.getValue(outputOpt).toString(); - - runTool(data, dataset, output, numPartitions); - } catch (OptionException e) { - log.warn(e.toString(), e); - CommandLineUtil.printHelp(group); - } - - } - - private static void runTool(String dataStr, String datasetStr, String output, int numPartitions) throws IOException { - Configuration conf = new Configuration(); - - Preconditions.checkArgument(numPartitions > 0, "numPartitions <= 0"); - - // make sure the output file does not exist - Path outputPath = new Path(output); - FileSystem fs = outputPath.getFileSystem(conf); - - Preconditions.checkArgument(!fs.exists(outputPath), "Output path already exists"); - - // create a new file corresponding to each partition - // Path workingDir = fs.getWorkingDirectory(); - // FileSystem wfs = workingDir.getFileSystem(conf); - // File parentFile = new File(workingDir.toString()); - // File tempFile = FileUtil.createLocalTempFile(parentFile, "Parts", true); - // File tempFile = File.createTempFile("df.tools.UDistrib",""); - // tempFile.deleteOnExit(); - File tempFile = FileUtil.createLocalTempFile(new File(""), "df.tools.UDistrib", true); - Path partsPath = new Path(tempFile.toString()); - FileSystem pfs = partsPath.getFileSystem(conf); - - Path[] partPaths = new Path[numPartitions]; - FSDataOutputStream[] files = new FSDataOutputStream[numPartitions]; - for (int p = 0; p < numPartitions; p++) { - partPaths[p] = new Path(partsPath, String.format(Locale.ENGLISH, "part.%03d", p)); - files[p] = pfs.create(partPaths[p]); - } - - Path datasetPath = new Path(datasetStr); - 
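// currents[label], defined a few lines below, records for each class label which partition file receives that - // label's next tuple; the cursor advances modulo numPartitions, which is what spreads each class uniformly. -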
Dataset dataset = Dataset.load(conf, datasetPath); -     -    // currents[label] = next partition file where to place the tuple -    int[] currents = new int[dataset.nblabels()]; -     -    // currents is initialized randomly in the range [0, numPartitions) -    Random random = RandomUtils.getRandom(); -    for (int c = 0; c < currents.length; c++) { -      currents[c] = random.nextInt(numPartitions); -    } -     -    // for each tuple of the data -    Path dataPath = new Path(dataStr); -    FileSystem ifs = dataPath.getFileSystem(conf); -    FSDataInputStream input = ifs.open(dataPath); -    Scanner scanner = new Scanner(input); -    DataConverter converter = new DataConverter(dataset); -    int nbInstances = dataset.nbInstances(); -     -    int id = 0; -    while (scanner.hasNextLine()) { -      if (id % 1000 == 0) { -        log.info("progress : {} / {}", id, nbInstances); -      } -       -      String line = scanner.nextLine(); -      if (line.isEmpty()) { -        continue; // skip empty lines -      } -       -      // write the tuple in files[tuple.label] -      Instance instance = converter.convert(line); -      int label = (int) dataset.getLabel(instance); -      files[currents[label]].writeBytes(line); -      files[currents[label]].writeChar('\n'); -       -      // update currents -      currents[label]++; -      if (currents[label] == numPartitions) { -        currents[label] = 0; -      } -    } -     -    // close all the files. -    scanner.close(); -    for (FSDataOutputStream file : files) { -      Closeables.closeQuietly(file); -    } -     -    // merge all output files -    FileUtil.copyMerge(pfs, partsPath, fs, outputPath, true, conf, null); -    /* -     * FSDataOutputStream joined = fs.create(new Path(outputPath, "uniform.data")); for (int p = 0; p < -     * numPartitions; p++) {log.info("Joining part : {}", p); FSDataInputStream partStream = -     * fs.open(partPaths[p]); -     * -     * IOUtils.copyBytes(partStream, joined, conf, false); -     * -     * partStream.close(); } -     * -     * joined.close(); -     * -     * fs.delete(partsPath, true); -     */ -  } -   -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/LinearModel.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/LinearModel.java deleted file mode 100644 index 1329d2c85..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/LinearModel.java +++ /dev/null @@ -1,103 +0,0 @@ -/* Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements.  See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership.  The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.  You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.mahout.classifier.discriminative; - -import org.apache.mahout.math.Vector; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Classifies a data point using a hyperplane. - */ -public class LinearModel { -   -  private static final Logger log = LoggerFactory.getLogger(LinearModel.class); -   -  /** Represents the direction of the hyperplane found during training.*/ -  private Vector hyperplane; -  /** Displacement of hyperplane from origin.*/ -  private double bias; -  /** Classification threshold. */ -  private final double threshold; -   -  /** -   * Init a linear model with a hyperplane, displacement and threshold. -   */ -  public LinearModel(Vector hyperplane, double displacement, double threshold) { -    this.hyperplane = hyperplane; -    this.bias = displacement; -    this.threshold = threshold; -  } -   -  /** -   * Init a linear model with zero displacement and a threshold of 0.5. -   */ -  public LinearModel(Vector hyperplane) { -    this(hyperplane, 0, 0.5); -  } -   -  /** -   * Classify a point to either belong to the class modeled by this linear model or not. -   * @param dataPoint the data point to classify. -   * @return true if the data point should be classified as belonging to this model. -   */ -  public boolean classify(Vector dataPoint) { -    double product = this.hyperplane.dot(dataPoint); -    if (log.isDebugEnabled()) { -      log.debug("model: {} product: {} Bias: {} threshold: {}", -                new Object[] {this, product, bias, threshold}); -    } -    return product + this.bias > this.threshold; -  } -   -  /** -   * Update the hyperplane by adding delta. -   * @param delta the delta to add to the hyperplane vector. -   */ -  public void addDelta(Vector delta) { -    this.hyperplane = this.hyperplane.plus(delta); -  } -   -  @Override -  public String toString() { -    StringBuilder builder = new StringBuilder("Model: "); -    for (int i = 0; i < this.hyperplane.size(); i++) { -      builder.append(" ").append(this.hyperplane.get(i)); -    } -    builder.append(" C: ").append(this.bias); -    return builder.toString(); -  } -   -  /** -   * Shift the bias of the model. -   * @param factor value to add to the bias. -   */ -  public void shiftBias(double factor) { -    this.bias += factor; -  } -   -  /** -   * Multiply the weight at index by delta. -   * @param index the index of the element to update. -   * @param delta the delta to multiply the element with. -   */ -  public void timesDelta(int index, double delta) { -    double element = this.hyperplane.get(index); -    element *= delta; -    this.hyperplane.setQuick(index, element); -  } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/LinearTrainer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/LinearTrainer.java deleted file mode 100644 index e3ed432d2..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/LinearTrainer.java +++ /dev/null @@ -1,126 +0,0 @@ -/* Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements.  See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.  You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.classifier.discriminative; - -import org.apache.mahout.math.CardinalityException; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.Vector; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Subclasses of this class need to provide a way to train linear - * discriminative classifiers. - * - * As this is just the reference implementation we assume that the dataset fits - * into main memory - this should be the first thing to change when switching to - * Hadoop. - */ -public abstract class LinearTrainer { -   -  private static final Logger log = LoggerFactory.getLogger(LinearTrainer.class); -   -  /** The model to train. */ -  private final LinearModel model; -   -  /** -   * Initialize the trainer. All weights are initialized to the given initial -   * value and represented through a dense vector. -   * -   * -   * @param dimension -   *          number of expected features. -   * @param threshold -   *          threshold to use for classification. -   * @param init -   *          initial value of weight vector. -   * @param initBias -   *          initial classification bias. -   */ -  protected LinearTrainer(int dimension, double threshold, -                          double init, double initBias) { -    DenseVector initialWeights = new DenseVector(dimension); -    initialWeights.assign(init); -    this.model = new LinearModel(initialWeights, initBias, threshold); -  } -   -  /** -   * Trains the model: runs through all data points in the training set and -   * updates the weight vector whenever a classification error occurs. -   * -   * Can be called multiple times. -   * -   * @param dataset -   *          the dataset to train on. Each column is treated as a point. -   * @param labelset -   *          the set of labels, one for each data point. If the cardinalities -   *          of the dataset and labelset do not match, a CardinalityException is -   *          thrown -   */ -  public void train(Vector labelset, Matrix dataset) throws TrainingException { -    if (labelset.size() != dataset.columnSize()) { -      throw new CardinalityException(labelset.size(), dataset.columnSize()); -    } -     -    boolean converged = false; -    int iteration = 0; -    while (!converged) { -      if (iteration > 1000) { -        throw new TrainingException("Too many iterations needed to find hyperplane."); -      } -       -      converged = true; -      int columnCount = dataset.columnSize(); -      for (int i = 0; i < columnCount; i++) { -        Vector dataPoint = dataset.viewColumn(i); -        log.debug("Training point: {}", dataPoint); -         -        synchronized (this.model) { -          boolean prediction = model.classify(dataPoint); -          double label = labelset.get(i); -          if (label <= 0 && prediction || label > 0 && !prediction) { -            log.debug("updating"); -            converged = false; -            update(label, dataPoint, this.model); -          } -        } -      } -      iteration++; -    } -  } -   -  /** -   * Retrieves the trained model if called after train, otherwise the raw model. -   */ -  public LinearModel getModel() { -    return this.model; -  } -   -  /** -   * Implement this method to match your training strategy.
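-   * -   * A minimal sketch of one strategy (illustrative only; any small fixed learning -   * rate would do - this is essentially the perceptron rule implemented below): -   * <pre> -   * protected void update(double label, Vector dataPoint, LinearModel model) { -   *   double factor = label > 0 ? 1.0 : -1.0; -   *   model.addDelta(dataPoint.times(factor * 0.1)); // move toward or away from the point -   * } -   * </pre>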
- * -   * @param model -   *          the model to update. -   * @param label -   *          the target label of the wrongly classified data point. -   * @param dataPoint -   *          the data point that was classified incorrectly. -   */ -  protected abstract void update(double label, Vector dataPoint, LinearModel model); -   -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/PerceptronTrainer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/PerceptronTrainer.java deleted file mode 100644 index 76221c9f4..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/PerceptronTrainer.java +++ /dev/null @@ -1,62 +0,0 @@ -/* Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements.  See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership.  The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.  You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.classifier.discriminative; - -import org.apache.mahout.math.Vector; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Implements training according to the perceptron update rule. - */ -public class PerceptronTrainer extends LinearTrainer { -   -  private static final Logger log = LoggerFactory.getLogger(PerceptronTrainer.class); -   -  /** Rate the model is to be updated with at each step. */ -  private final double learningRate; -   -  public PerceptronTrainer(int dimension, double threshold, -                           double learningRate, double init, double initBias) { -    super(dimension, threshold, init, initBias); -    this.learningRate = learningRate; -  } -   -  /** -   * {@inheritDoc} Perceptron update works such that in case the predicted label -   * does not match the real label, the weight vector is updated as follows: In -   * case the prediction was negative but should have been positive, the weight vector -   * is set to the sum of weight vector and example (multiplied by the learning rate). -   * -   * In case the prediction was positive but should have been negative, the example -   * vector (multiplied by the learning rate) is subtracted from the weight vector.
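-   * -   * For example (illustrative numbers): with learningRate = 0.1 and a point x = (1, 2) -   * whose label is 0 but which the model scored positive, factor becomes -1.0, so the -   * hyperplane is shifted by -0.1 * (1, 2) = (-0.1, -0.2) and the bias by -0.1.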
- */ -  @Override -  protected void update(double label, Vector dataPoint, LinearModel model) { -    double factor = 1.0; -    if (label == 0.0) { -      factor = -1.0; -    } -     -    Vector updateVector = dataPoint.times(factor).times(this.learningRate); -    log.debug("Update vector: {}", updateVector); -     -    model.addDelta(updateVector); -    model.shiftBias(factor * this.learningRate); -    log.debug("{}", model); -  } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/TrainingException.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/TrainingException.java deleted file mode 100644 index 89d22cb10..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/TrainingException.java +++ /dev/null @@ -1,33 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements.  See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License.  You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.discriminative; - -/** - * This exception is thrown in case training fails, e.g. when an algorithm - * that can only find linear separating hyperplanes is trained on a set that is not - * linearly separable. - * */ -public class TrainingException extends Exception { -   -  /** -   * Init with message string describing the cause of the exception. -   * */ -  public TrainingException(String message) { -    super(message); -  } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/WinnowTrainer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/WinnowTrainer.java deleted file mode 100644 index cd36150d4..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/discriminative/WinnowTrainer.java +++ /dev/null @@ -1,92 +0,0 @@ -/* Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements.  See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership.  The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.classifier.discriminative; - -import java.util.Iterator; - -import org.apache.mahout.math.Vector; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This class implements training according to the winnow update algorithm. - */ -public class WinnowTrainer extends LinearTrainer { -   -  private static final Logger log = LoggerFactory.getLogger(WinnowTrainer.class); -   -  /** Promotion step to multiply weights with on update. */ -  private final double promotionStep; -   -  public WinnowTrainer(int dimension, double promotionStep, double threshold, double init, double initBias) { -    super(dimension, threshold, init, initBias); -    this.promotionStep = promotionStep; -  } -   -  public WinnowTrainer(int dimension, double promotionStep) { -    this(dimension, promotionStep, 0.5, 1, 0); -  } -   -  /** -   * Initializes with dimension and promotionStep of 2. -   * -   * @param dimension -   *          number of features. -   */ -  public WinnowTrainer(int dimension) { -    this(dimension, 2); -  } -   -  /** -   * {@inheritDoc} Winnow update works such that in case the predicted label -   * does not match the real label, the weight vector is updated as follows: In -   * case the prediction was negative but should have been positive, all entries -   * in the weight vector that correspond to non-null features in the example -   * are multiplied by the promotion step (doubled by default). -   * -   * In case the prediction was positive but should have been negative, all -   * entries in the weight vector that correspond to non-null features in the -   * example are divided by the promotion step (halved by default). -   */ -  @Override -  protected void update(double label, Vector dataPoint, LinearModel model) { -    if (label > 0) { -      // promote: multiply the weights of active features by the promotion step -      Vector updateVector = dataPoint.times(this.promotionStep); -      log.info("Winnow update positive: {}", updateVector); -      Iterator<Vector.Element> iter = updateVector.iterateNonZero(); -      while (iter.hasNext()) { -        Vector.Element element = iter.next(); -        if (element.get() != 0) { -          model.timesDelta(element.index(), element.get()); -        } -      } -    } else { -      // demote: divide the weights of active features by the promotion step -      Vector updateVector = dataPoint.times(1 / this.promotionStep); -      log.info("Winnow update negative: {}", updateVector); -      Iterator<Vector.Element> iter = updateVector.iterateNonZero(); -      while (iter.hasNext()) { -        Vector.Element element = iter.next(); -        if (element.get() != 0) { -          model.timesDelta(element.index(), element.get()); -        } -      } -    } -    log.info(model.toString()); -  } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java deleted file mode 100644 index a738969c3..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.evaluation; - -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.DenseMatrix; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.list.DoubleArrayList; - -import com.google.common.base.Preconditions; - -import java.util.Random; - -/** - * Computes AUC and a few other accuracy statistics without storing huge amounts of data. This is - * done by keeping uniform samples of the positive and negative scores. Then, when AUC is to be - * computed, the remaining scores are sorted and a rank-sum statistic is used to compute the AUC. - * Since AUC is invariant with respect to down-sampling of either positives or negatives, this is - * close to correct and is exactly correct if maxBufferSize or fewer positive and negative scores - * are examined. - */ -public class Auc { - - private int maxBufferSize = 10000; - private final DoubleArrayList[] scores = { new DoubleArrayList(), new DoubleArrayList() }; - private final Random rand; - private int samples; - private final double threshold; - private final Matrix confusion; - private final DenseMatrix entropy; - - private boolean probabilityScore = true; - - private boolean hasScore; - - /** - * Allocates a new data-structure for accumulating information about AUC and a few other accuracy - * measures. - * @param threshold The threshold to use in computing the confusion matrix. - */ - public Auc(double threshold) { - confusion = new DenseMatrix(2, 2); - entropy = new DenseMatrix(2, 2); - this.rand = RandomUtils.getRandom(); - this.threshold = threshold; - } - - public Auc() { - this(0.5); - } - - /** - * Adds a score to the AUC buffers. - * - * @param trueValue Whether this score is for a true-positive or a true-negative example. - * @param score The score for this example. - */ - public void add(int trueValue, double score) { - Preconditions.checkArgument(trueValue == 0 || trueValue == 1, "True value must be 0 or 1"); - hasScore = true; - - int predictedClass = score > threshold ? 1 : 0; - confusion.set(trueValue, predictedClass, confusion.get(trueValue, predictedClass) + 1); - - samples++; - if (isProbabilityScore()) { - double limited = Math.max(1.0e-20, Math.min(score, 1 - 1.0e-20)); - double v0 = entropy.get(trueValue, 0); - entropy.set(trueValue, 0, (Math.log1p(-limited) - v0) / samples + v0); - - double v1 = entropy.get(trueValue, 1); - entropy.set(trueValue, 1, (Math.log(limited) - v1) / samples + v1); - } - - // add to buffers - DoubleArrayList buf = scores[trueValue]; - if (buf.size() >= maxBufferSize) { - // but if too many points are seen, we insert into a random - // place and discard the predecessor. The random place could - // be anywhere, possibly not even in the buffer. 
- // this is a special case of Knuth's permutation algorithm -      // but since we don't ever shuffle the first maxBufferSize -      // samples, the result isn't just a fair sample of the prefixes -      // of all permutations.  The contents of the result, however, -      // will be a fair and uniform sample of maxBufferSize elements -      // chosen from all elements without replacement -      int index = rand.nextInt(samples); -      if (index < buf.size()) { -        buf.set(index, score); -      } -    } else { -      // for small buffers, we collect all points without permuting -      // since we sort the data later, permuting now would just be -      // pedantic -      buf.add(score); -    } -  } - -  public void add(int trueValue, int predictedClass) { -    hasScore = false; -    Preconditions.checkArgument(trueValue == 0 || trueValue == 1, "True value must be 0 or 1"); -    confusion.set(trueValue, predictedClass, confusion.get(trueValue, predictedClass) + 1); -  } - -  /** -   * Computes the AUC of points seen so far.  This can be moderately expensive since it requires -   * that all points that have been retained be sorted. -   * -   * @return The value of the area under the receiver operating characteristic curve. -   */ -  public double auc() { -    Preconditions.checkArgument(hasScore, "Can't compute AUC for classifier without a score"); -    scores[0].sort(); -    scores[1].sort(); - -    double n0 = scores[0].size(); -    double n1 = scores[1].size(); - -    if (n0 == 0 || n1 == 0) { -      return 0.5; -    } - -    // scan the data -    int i0 = 0; -    int i1 = 0; -    int rank = 1; -    double rankSum = 0; -    while (i0 < n0 && i1 < n1) { - -      double v0 = scores[0].get(i0); -      double v1 = scores[1].get(i1); - -      if (v0 < v1) { -        i0++; -        rank++; -      } else if (v1 < v0) { -        i1++; -        rankSum += rank; -        rank++; -      } else { -        // ties have to be handled delicately -        double tieScore = v0; - -        // how many negatives are tied? -        int k0 = 0; -        while (i0 < n0 && scores[0].get(i0) == tieScore) { -          k0++; -          i0++; -        } - -        // and how many positives -        int k1 = 0; -        while (i1 < n1 && scores[1].get(i1) == tieScore) { -          k1++; -          i1++; -        } - -        // we found k0 + k1 tied values which have -        // ranks in the half open interval [rank, rank + k0 + k1) -        // the average rank is assigned to all -        rankSum += (rank + (k0 + k1 - 1) / 2.0) * k1; -        rank += k0 + k1; -      } -    } - -    if (i1 < n1) { -      rankSum += (rank + (n1 - i1 - 1) / 2.0) * (n1 - i1); -      rank += (int) (n1 - i1); -    } - -    return (rankSum / n1 - (n1 + 1) / 2) / n0; -  } - -  /** -   * Returns the confusion matrix for the classifier supposing that we were to use a particular -   * threshold. -   * @return The confusion matrix. -   */ -  public Matrix confusion() { -    return confusion; -  } - -  /** -   * Returns a matrix related to the confusion matrix and to the log-likelihood.  For a -   * pretty accurate classifier, N + entropy is nearly the same as the confusion matrix -   * because log(1-eps) \approx -eps if eps is small. -   * -   * For lower accuracy classifiers, this measure will give us a better picture of how -   * things work out.
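-   * -   * For instance (a sketch of the arithmetic): the very first example added, a positive -   * scored 0.9, contributes log(0.9), about -0.105, to entropy[1][1] and log(0.1), about -   * -2.3, to entropy[1][0]; later examples are folded in as running averages over the -   * samples seen.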
- * - * Also, by definition, log-likelihood = sum(diag(entropy)) - * @return Returns a cell by cell break-down of the log-likelihood - */ - public Matrix entropy() { - if (!hasScore) { - // find a constant score that would optimize log-likelihood, but use a dash of Bayesian - // conservatism to avoid dividing by zero or taking log(0) - double p = (0.5 + confusion.get(1, 1)) / (1 + confusion.get(0, 0) + confusion.get(1, 1)); - entropy.set(0, 0, confusion.get(0, 0) * Math.log1p(-p)); - entropy.set(0, 1, confusion.get(0, 1) * Math.log(p)); - entropy.set(1, 0, confusion.get(1, 0) * Math.log1p(-p)); - entropy.set(1, 1, confusion.get(1, 1) * Math.log(p)); - } - return entropy; - } - - public void setMaxBufferSize(int maxBufferSize) { - this.maxBufferSize = maxBufferSize; - } - - public boolean isProbabilityScore() { - return probabilityScore; - } - - public void setProbabilityScore(boolean probabilityScore) { - this.probabilityScore = probabilityScore; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java deleted file mode 100644 index d0ca0d714..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.naivebayes; - -import java.util.Iterator; - -import org.apache.mahout.classifier.AbstractVectorClassifier; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.Vector.Element; - -/** Class implementing the Naive Bayes Classifier Algorithm */ -public abstract class AbstractNaiveBayesClassifier extends AbstractVectorClassifier { - -  private final NaiveBayesModel model; -   -  protected AbstractNaiveBayesClassifier(NaiveBayesModel model) { -    this.model = model; -  } - -  protected NaiveBayesModel getModel() { -    return model; -  } -   -  protected abstract double getScoreForLabelFeature(int label, int feature); - -  protected double getScoreForLabelInstance(int label, Vector instance) { -    double result = 0.0; -    Iterator<Element> elements = instance.iterateNonZero(); -    while (elements.hasNext()) { -      Element e = elements.next(); -      result += e.get() * getScoreForLabelFeature(label, e.index()); -    } -    return result; -  } -   -  @Override -  public int numCategories() { -    return model.numLabels(); -  } - -  @Override -  public Vector classifyFull(Vector instance) { -    Vector score = model.createScoringVector(); -    for (int label = 0; label < model.numLabels(); label++) { -      score.set(label, getScoreForLabelInstance(label, instance)); -    } -    return score; -  } - -  @Override -  public Vector classifyFull(Vector r, Vector instance) { -    return classifyFull(instance); -  } - -  @Override -  public double classifyScalar(Vector instance) { -    throw new UnsupportedOperationException("Not supported in Naive Bayes"); -  } -   -  @Override -  public Vector classify(Vector instance) { -    throw new UnsupportedOperationException("probabilities not supported in Naive Bayes"); -  } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java deleted file mode 100644 index 2a9cd7f70..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java +++ /dev/null @@ -1,159 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements.  See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License.  You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.naivebayes; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; -import com.google.common.io.Closeables; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.classifier.naivebayes.training.ThetaMapper; -import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.SparseMatrix; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.map.OpenObjectIntHashMap; - -import java.io.IOException; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; - -public final class BayesUtils { - -  private BayesUtils() {} - -  public static NaiveBayesModel readModelFromDir(Path base, Configuration conf) { - -    float alphaI = conf.getFloat(ThetaMapper.ALPHA_I, 1.0f); - -    // read feature sums and label sums -    Vector scoresPerLabel = null; -    Vector scoresPerFeature = null; -    for (Pair<Text,VectorWritable> record : new SequenceFileDirIterable<Text,VectorWritable>( -        new Path(base, TrainNaiveBayesJob.WEIGHTS), PathType.LIST, PathFilters.partFilter(), conf)) { -      String key = record.getFirst().toString(); -      VectorWritable value = record.getSecond(); -      if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE)) { -        scoresPerFeature = value.get(); -      } else if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_LABEL)) { -        scoresPerLabel = value.get(); -      } -    } - -    Preconditions.checkNotNull(scoresPerFeature); -    Preconditions.checkNotNull(scoresPerLabel); - -    Matrix scoresPerLabelAndFeature = new SparseMatrix(scoresPerLabel.size(), scoresPerFeature.size()); -    for (Pair<IntWritable,VectorWritable> entry : new SequenceFileDirIterable<IntWritable,VectorWritable>( -        new Path(base, TrainNaiveBayesJob.SUMMED_OBSERVATIONS), PathType.LIST, PathFilters.partFilter(), conf)) { -      scoresPerLabelAndFeature.assignRow(entry.getFirst().get(), entry.getSecond().get()); -    } - -    Vector perlabelThetaNormalizer = scoresPerLabel.like(); -    /* for (Pair<Text,VectorWritable> entry : new SequenceFileDirIterable<Text,VectorWritable>( -        new Path(base, TrainNaiveBayesJob.THETAS), PathType.LIST, PathFilters.partFilter(), conf)) { -      if (entry.getFirst().toString().equals(TrainNaiveBayesJob.LABEL_THETA_NORMALIZER)) { -        perlabelThetaNormalizer = entry.getSecond().get(); -      } -    } - -    Preconditions.checkNotNull(perlabelThetaNormalizer); -    */ -    return new NaiveBayesModel(scoresPerLabelAndFeature, scoresPerFeature, scoresPerLabel, perlabelThetaNormalizer, -        alphaI); -  } - -  /** Write the list of labels into a map file */ -  public static int writeLabelIndex(Configuration conf, Iterable<String> labels, Path indexPath) -    throws IOException { -    FileSystem fs = FileSystem.get(indexPath.toUri(), conf); -    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, indexPath, Text.class, IntWritable.class); -    int i = 0; -    try { -      for (String label : labels) { -        writer.append(new Text(label), new IntWritable(i++)); -      } -    } finally {
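-      // Note: closeQuietly swallows any IOException thrown on close, so a failure to
-      // flush the index file is not reported to the caller; the label count i is
-      // returned regardless.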
Closeables.closeQuietly(writer); -    } -    return i; -  } - -  public static int writeLabelIndex(Configuration conf, Path indexPath, -                                    Iterable<Pair<Text,IntWritable>> labels) throws IOException { -    FileSystem fs = FileSystem.get(indexPath.toUri(), conf); -    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, indexPath, Text.class, IntWritable.class); -    Collection<String> seen = new HashSet<String>(); -    int i = 0; -    try { -      for (Object label : labels) { -        String theLabel = ((Pair<?,?>) label).getFirst().toString().split("/")[1]; -        if (!seen.contains(theLabel)) { -          writer.append(new Text(theLabel), new IntWritable(i++)); -          seen.add(theLabel); -        } -      } -    } finally { -      Closeables.closeQuietly(writer); -    } -    return i; -  } - -  public static Map<Integer, String> readLabelIndex(Configuration conf, Path indexPath) { -    Map<Integer, String> labelMap = new HashMap<Integer, String>(); -    for (Pair<Text,IntWritable> pair : new SequenceFileIterable<Text,IntWritable>(indexPath, true, conf)) { -      labelMap.put(pair.getSecond().get(), pair.getFirst().toString()); -    } -    return labelMap; -  } - -  public static OpenObjectIntHashMap<String> readIndexFromCache(Configuration conf) throws IOException { -    OpenObjectIntHashMap<String> index = new OpenObjectIntHashMap<String>(); -    for (Pair<Writable,IntWritable> entry : -        new SequenceFileIterable<Writable,IntWritable>(HadoopUtil.cachedFile(conf), conf)) { -      index.put(entry.getFirst().toString(), entry.getSecond().get()); -    } -    return index; -  } - -  public static Map<String,Vector> readScoresFromCache(Configuration conf) throws IOException { -    Map<String,Vector> sumVectors = Maps.newHashMap(); -    for (Pair<Text,VectorWritable> entry : -        new SequenceFileDirIterable<Text,VectorWritable>(HadoopUtil.cachedFile(conf), -        PathType.LIST, PathFilters.partFilter(), conf)) { -      sumVectors.put(entry.getFirst().toString(), entry.getSecond().get()); -    } -    return sumVectors; -  } - - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/ComplementaryNaiveBayesClassifier.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/ComplementaryNaiveBayesClassifier.java deleted file mode 100644 index 70e8d1ac7..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/ComplementaryNaiveBayesClassifier.java +++ /dev/null @@ -1,43 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements.  See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License.  You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.naivebayes; - - -/** - * Class implementing the Naive Bayes Classifier Algorithm - * - */ -public class ComplementaryNaiveBayesClassifier extends AbstractNaiveBayesClassifier { - public ComplementaryNaiveBayesClassifier(NaiveBayesModel model) { - super(model); - } - - @Override - public double getScoreForLabelFeature(int label, int feature) { - NaiveBayesModel model = getModel(); - return computeWeight(model.featureWeight(feature), model.weight(label, feature), - model.totalWeightSum(), model.labelWeight(label), model.alphaI(), model.numFeatures()); - } - - public static double computeWeight(double featureWeight, double featureLabelWeight, - double totalWeight, double labelWeight, double alphaI, double numFeatures) { - double numerator = featureWeight - featureLabelWeight + alphaI; - double denominator = totalWeight - labelWeight + alphaI * numFeatures; - return -Math.log(numerator / denominator); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/NaiveBayesModel.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/NaiveBayesModel.java deleted file mode 100644 index e91e48cb5..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/NaiveBayesModel.java +++ /dev/null @@ -1,165 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.naivebayes; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.SparseRowMatrix; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -import com.google.common.base.Preconditions; -import com.google.common.io.Closeables; - -/** NaiveBayesModel holds the weight Matrix, the feature and label sums and the weight normalizer vectors.*/ -public class NaiveBayesModel { - - private final Vector weightsPerLabel; - private final Vector perlabelThetaNormalizer; - private final double minThetaNormalizer; - private final Vector weightsPerFeature; - private final Matrix weightsPerLabelAndFeature; - private final float alphaI; - private final double numFeatures; - private final double totalWeightSum; - - public NaiveBayesModel(Matrix weightMatrix, - Vector weightsPerFeature, - Vector weightsPerLabel, - Vector thetaNormalizer, - float alphaI) { - this.weightsPerLabelAndFeature = weightMatrix; - this.weightsPerFeature = weightsPerFeature; - this.weightsPerLabel = weightsPerLabel; - this.perlabelThetaNormalizer = thetaNormalizer; - this.numFeatures = weightsPerFeature.getNumNondefaultElements(); - this.totalWeightSum = weightsPerLabel.zSum(); - this.alphaI = alphaI; - this.minThetaNormalizer = thetaNormalizer.maxValue(); - } - - public double labelWeight(int label) { - return weightsPerLabel.getQuick(label); - } - - public double thetaNormalizer(int label) { - return perlabelThetaNormalizer.get(label) / minThetaNormalizer; - } - - public double featureWeight(int feature) { - return weightsPerFeature.getQuick(feature); - } - - public double weight(int label, int feature) { - return weightsPerLabelAndFeature.getQuick(label, feature); - } - - public float alphaI() { - return alphaI; - } - - public double numFeatures() { - return numFeatures; - } - - public double totalWeightSum() { - return totalWeightSum; - } - - public int numLabels() { - return weightsPerLabel.size(); - } - - public Vector createScoringVector() { - return weightsPerLabel.like(); - } - - public static NaiveBayesModel materialize(Path output, Configuration conf) throws IOException { - FileSystem fs = output.getFileSystem(conf); - - Vector weightsPerLabel = null; - Vector perLabelThetaNormalizer = null; - Vector weightsPerFeature = null; - Matrix weightsPerLabelAndFeature; - float alphaI; - - FSDataInputStream in = fs.open(new Path(output, "naiveBayesModel.bin")); - try { - alphaI = in.readFloat(); - weightsPerFeature = VectorWritable.readVector(in); - weightsPerLabel = new DenseVector(VectorWritable.readVector(in)); - perLabelThetaNormalizer = new DenseVector(VectorWritable.readVector(in)); - - weightsPerLabelAndFeature = new SparseRowMatrix(weightsPerLabel.size(), weightsPerFeature.size() ); - for (int label = 0; label < weightsPerLabelAndFeature.numRows(); label++) { - weightsPerLabelAndFeature.assignRow(label, VectorWritable.readVector(in)); - } - } finally { - Closeables.closeQuietly(in); - } - NaiveBayesModel model = new NaiveBayesModel(weightsPerLabelAndFeature, weightsPerFeature, weightsPerLabel, - perLabelThetaNormalizer, alphaI); - model.validate(); - return model; - } - - public void serialize(Path output, Configuration conf) throws IOException { 
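-    // On-disk layout, mirroring what materialize() reads back in order: alphaI (float),
-    // then weightsPerFeature, weightsPerLabel and perlabelThetaNormalizer as vectors,
-    // then one row of weightsPerLabelAndFeature per label.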
- FileSystem fs = output.getFileSystem(conf); - FSDataOutputStream out = fs.create(new Path(output, "naiveBayesModel.bin")); - try { - out.writeFloat(alphaI); - VectorWritable.writeVector(out, weightsPerFeature); - VectorWritable.writeVector(out, weightsPerLabel); - VectorWritable.writeVector(out, perlabelThetaNormalizer); - for (int row = 0; row < weightsPerLabelAndFeature.numRows(); row++) { - VectorWritable.writeVector(out, weightsPerLabelAndFeature.viewRow(row)); - } - } finally { - Closeables.closeQuietly(out); - } - } - - public void validate() { - Preconditions.checkState(alphaI > 0, "alphaI has to be greater than 0!"); - Preconditions.checkArgument(numFeatures > 0, "the vocab count has to be greater than 0!"); - Preconditions.checkArgument(totalWeightSum > 0, "the totalWeightSum has to be greater than 0!"); - Preconditions.checkArgument(weightsPerLabel != null, "the number of labels has to be defined!"); - Preconditions.checkArgument(weightsPerLabel.getNumNondefaultElements() > 0, - "the number of labels has to be greater than 0!"); - Preconditions.checkArgument(perlabelThetaNormalizer != null, "the theta normalizers have to be defined"); - // Preconditions.checkArgument(perlabelThetaNormalizer.getNumNondefaultElements() > 0, - // "the number of theta normalizers has to be greater than 0!"); - Preconditions.checkArgument(weightsPerFeature != null, "the feature sums have to be defined"); - Preconditions.checkArgument(weightsPerFeature.getNumNondefaultElements() > 0, - "the feature sums have to be greater than 0!"); - // Check if all thetas have same sign. - /*Iterator it = perlabelThetaNormalizer.iterateNonZero(); - while (it.hasNext()) { - Element e = it.next(); - Preconditions.checkArgument(Math.signum(e.get()) == Math.signum(minThetaNormalizer), e.get() - + " " + minThetaNormalizer); - }*/ - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/StandardNaiveBayesClassifier.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/StandardNaiveBayesClassifier.java deleted file mode 100644 index 09021fa3e..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/StandardNaiveBayesClassifier.java +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.naivebayes; - - -/** Class implementing the Naive Bayes Classifier Algorithm */ -public class StandardNaiveBayesClassifier extends AbstractNaiveBayesClassifier { - - public StandardNaiveBayesClassifier(NaiveBayesModel model) { - super(model); - } - - @Override - public double getScoreForLabelFeature(int label, int feature) { - NaiveBayesModel model = getModel(); - return computeWeight(model.weight(label, feature), model.labelWeight(label), model.alphaI(), - model.numFeatures()); - } - - public static double computeWeight(double featureLabelWeight, double labelWeight, double alphaI, - double numFeatures) { - double numerator = featureLabelWeight + alphaI; - double denominator = labelWeight + alphaI * numFeatures; - return Math.log(numerator / denominator); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java deleted file mode 100644 index 8eb66f827..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.naivebayes.test; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.classifier.naivebayes.AbstractNaiveBayesClassifier; -import org.apache.mahout.classifier.naivebayes.ComplementaryNaiveBayesClassifier; -import org.apache.mahout.classifier.naivebayes.NaiveBayesModel; -import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -import java.io.IOException; - -/** - * Run the input through the model and see if it matches. - *

- * The output key is the expected label (extracted from the input key); the output value - * is the vector of scores the classifier assigned to each label. - */ -public class BayesTestMapper extends Mapper<Text, VectorWritable, Text, VectorWritable> { - -  private AbstractNaiveBayesClassifier classifier; - -  @Override -  protected void setup(Context context) throws IOException, InterruptedException { -    super.setup(context); -    Configuration conf = context.getConfiguration(); -    Path modelPath = HadoopUtil.cachedFile(conf); -    NaiveBayesModel model = NaiveBayesModel.materialize(modelPath, conf); -    boolean compl = Boolean.parseBoolean(conf.get(TestNaiveBayesDriver.COMPLEMENTARY)); -    if (compl) { -      classifier = new ComplementaryNaiveBayesClassifier(model); -    } else { -      classifier = new StandardNaiveBayesClassifier(model); -    } -  } - -  @Override -  protected void map(Text key, VectorWritable value, Context context) throws IOException, InterruptedException { -    Vector result = classifier.classifyFull(value.get()); -    //the key is the expected value -    context.write(new Text(key.toString().split("/")[1]), new VectorWritable(result)); -  } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java deleted file mode 100644 index 9fe8b6a03..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java +++ /dev/null @@ -1,162 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements.  See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License.  You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.naivebayes.test; - -import java.io.IOException; -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.SequenceFile.Reader; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.classifier.ClassifierResult; -import org.apache.mahout.classifier.ResultAnalyzer; -import org.apache.mahout.classifier.naivebayes.AbstractNaiveBayesClassifier; -import org.apache.mahout.classifier.naivebayes.BayesUtils; -import org.apache.mahout.classifier.naivebayes.ComplementaryNaiveBayesClassifier; -import org.apache.mahout.classifier.naivebayes.NaiveBayesModel; -import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Test the (Complementary) Naive Bayes model that was built during training - * by iterating over the test set and comparing it to the model - */ -public class TestNaiveBayesDriver extends AbstractJob { - -  private static final Logger log = LoggerFactory.getLogger(TestNaiveBayesDriver.class); - -  public static final String LABEL_KEY = "labels"; -  public static final String COMPLEMENTARY = "class"; //b for bayes, c for complementary - -  public static void main(String[] args) throws Exception { -    ToolRunner.run(new Configuration(), new TestNaiveBayesDriver(), args); -  } - -  @Override -  public int run(String[] args) throws Exception { -    addInputOption(); -    addOutputOption(); -    addOption(DefaultOptionCreator.overwriteOption().create()); -    addOption("model", "m", "The path to the model built during training", true); -    addOption(buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false))); -    addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false))); -    addOption("labelIndex", "l", "The path to the location of the label index", true); -    Map<String, List<String>> parsedArgs = parseArguments(args); -    if (parsedArgs == null) { -      return -1; -    } -    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { -      HadoopUtil.delete(getConf(), getOutputPath()); -    } - -    boolean complementary = hasOption("testComplementary"); -    boolean sequential = hasOption("runSequential"); -    if (sequential) { -      FileSystem fs = FileSystem.get(getConf()); -      NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf()); -      AbstractNaiveBayesClassifier classifier; -      if (complementary) { -        classifier = new ComplementaryNaiveBayesClassifier(model); -      } else { -        classifier = new StandardNaiveBayesClassifier(model); -      } -      SequenceFile.Writer writer = -          new SequenceFile.Writer(fs, getConf(), 
getOutputPath(), Text.class, VectorWritable.class);
-      SequenceFile.Reader reader = new Reader(fs, getInputPath(), getConf());
-      Text key = new Text();
-      VectorWritable vw = new VectorWritable();
-      while (reader.next(key, vw)) {
-        writer.append(new Text(key.toString().split("/")[1]),
-            new VectorWritable(classifier.classifyFull(vw.get())));
-      }
-      writer.close();
-      reader.close();
-    } else {
-      boolean succeeded = runMapReduce(parsedArgs);
-      if (!succeeded) {
-        return -1;
-      }
-    }
-
-    //load the labels
-    Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex")));
-
-    //loop over the results and create the confusion matrix
-    SequenceFileDirIterable<Text, VectorWritable> dirIterable =
-        new SequenceFileDirIterable<Text, VectorWritable>(getOutputPath(),
-                                                          PathType.LIST,
-                                                          PathFilters.partFilter(),
-                                                          getConf());
-    ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT");
-    analyzeResults(labelMap, dirIterable, analyzer);
-
-    log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer);
-    return 0;
-  }
-
-  private boolean runMapReduce(Map<String, List<String>> parsedArgs) throws IOException,
-      InterruptedException, ClassNotFoundException {
-    Path model = new Path(getOption("model"));
-    HadoopUtil.cacheFiles(model, getConf());
-    //the output key is the expected label, the output values are the scores for all the labels
-    Job testJob = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class, BayesTestMapper.class,
-        Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
-    //testJob.getConfiguration().set(LABEL_KEY, getOption("--labels"));
-    boolean complementary = parsedArgs.containsKey("testComplementary");
-    testJob.getConfiguration().set(COMPLEMENTARY, String.valueOf(complementary));
-    boolean succeeded = testJob.waitForCompletion(true);
-    return succeeded;
-  }
-
-  private static void analyzeResults(Map<Integer, String> labelMap,
-                                     SequenceFileDirIterable<Text, VectorWritable> dirIterable,
-                                     ResultAnalyzer analyzer) {
-    for (Pair<Text, VectorWritable> pair : dirIterable) {
-      int bestIdx = Integer.MIN_VALUE;
-      double bestScore = Double.NEGATIVE_INFINITY;
-      for (Vector.Element element : pair.getSecond().get()) {
-        if (element.get() > bestScore) {
-          bestScore = element.get();
-          bestIdx = element.index();
-        }
-      }
-      if (bestIdx != Integer.MIN_VALUE) {
-        ClassifierResult classifierResult = new ClassifierResult(labelMap.get(bestIdx), bestScore);
-        analyzer.addInstance(pair.getFirst().toString(), classifierResult);
-      }
-    }
-  }
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/AbstractThetaTrainer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/AbstractThetaTrainer.java
deleted file mode 100644
index daf8e9a58..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/AbstractThetaTrainer.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.naivebayes.training; - -import com.google.common.base.Preconditions; -import org.apache.mahout.math.Vector; - -public abstract class AbstractThetaTrainer { - - private final Vector weightsPerFeature; - private final Vector weightsPerLabel; - private final Vector perLabelThetaNormalizer; - private final double alphaI; - private final double totalWeightSum; - private final double numFeatures; - - protected AbstractThetaTrainer(Vector weightsPerFeature, Vector weightsPerLabel, double alphaI) { - Preconditions.checkNotNull(weightsPerFeature); - Preconditions.checkNotNull(weightsPerLabel); - this.weightsPerFeature = weightsPerFeature; - this.weightsPerLabel = weightsPerLabel; - this.alphaI = alphaI; - perLabelThetaNormalizer = weightsPerLabel.like(); - totalWeightSum = weightsPerLabel.zSum(); - numFeatures = weightsPerFeature.getNumNondefaultElements(); - } - - public abstract void train(int label, Vector instance); - - protected double alphaI() { - return alphaI; - } - - protected double numFeatures() { - return numFeatures; - } - - protected double labelWeight(int label) { - return weightsPerLabel.get(label); - } - - protected double totalWeightSum() { - return totalWeightSum; - } - - protected double featureWeight(int feature) { - return weightsPerFeature.get(feature); - } - - protected void updatePerLabelThetaNormalizer(int label, double weight) { - perLabelThetaNormalizer.set(label, perLabelThetaNormalizer.get(label) + weight); - } - - public Vector retrievePerLabelThetaNormalizer() { - return perLabelThetaNormalizer.clone(); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/ComplementaryThetaTrainer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/ComplementaryThetaTrainer.java deleted file mode 100644 index 58e7e18fc..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/ComplementaryThetaTrainer.java +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
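
As a rough sketch of the contract AbstractThetaTrainer defines (the subclass name SummingThetaTrainer is invented): a concrete trainer folds one weight per non-zero feature into the per-label normalizer, and the accumulated vector is then read back via retrievePerLabelThetaNormalizer():

import java.util.Iterator;

import org.apache.mahout.math.Vector;

class SummingThetaTrainer extends AbstractThetaTrainer {

  SummingThetaTrainer(Vector weightsPerFeature, Vector weightsPerLabel, double alphaI) {
    super(weightsPerFeature, weightsPerLabel, alphaI);
  }

  // Toy trainer: accumulate the raw feature weights of each instance into the
  // normalizer of its label (the real trainers transform each weight first).
  @Override
  public void train(int label, Vector instance) {
    Iterator<Vector.Element> it = instance.iterateNonZero();
    while (it.hasNext()) {
      updatePerLabelThetaNormalizer(label, it.next().get());
    }
  }
}
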
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.naivebayes.training; - -import java.util.Iterator; - -import org.apache.mahout.classifier.naivebayes.ComplementaryNaiveBayesClassifier; -import org.apache.mahout.math.Vector; - -public class ComplementaryThetaTrainer extends AbstractThetaTrainer { - - public ComplementaryThetaTrainer(Vector weightsPerFeature, Vector weightsPerLabel, double alphaI) { - super(weightsPerFeature, weightsPerLabel, alphaI); - } - - @Override - public void train(int label, Vector perLabelWeight) { - double labelWeight = labelWeight(label); - Iterator it = perLabelWeight.iterateNonZero(); - while (it.hasNext()) { - Vector.Element e = it.next(); - updatePerLabelThetaNormalizer(label, - ComplementaryNaiveBayesClassifier.computeWeight(featureWeight(e.index()), e.get(), - totalWeightSum(), labelWeight, alphaI(), numFeatures())); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java deleted file mode 100644 index 272deb284..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java +++ /dev/null @@ -1,50 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
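
To get a feel for the numbers, a single complementary weight term can be computed directly with the same static helper the trainer calls above; every argument value below is made up:

// Arguments, in the order used by ComplementaryThetaTrainer.train():
// total weight of this feature across all labels, weight of the feature in
// the current per-label vector, total weight sum, weight of the current
// label, alphaI smoothing, and number of features.
double w = ComplementaryNaiveBayesClassifier.computeWeight(
    3.0, 1.0, 100.0, 10.0, 1.0, 50.0);
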
- */ - -package org.apache.mahout.classifier.naivebayes.training; - -import java.io.IOException; - -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.classifier.naivebayes.BayesUtils; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.map.OpenObjectIntHashMap; - -public class IndexInstancesMapper extends Mapper { - - public enum Counter { SKIPPED_INSTANCES } - - private OpenObjectIntHashMap labelIndex; - - @Override - protected void setup(Context ctx) throws IOException, InterruptedException { - super.setup(ctx); - labelIndex = BayesUtils.readIndexFromCache(ctx.getConfiguration()); - } - - @Override - protected void map(Text labelText, VectorWritable instance, Context ctx) throws IOException, InterruptedException { - String label = labelText.toString().split("/")[1]; - if (labelIndex.containsKey(label)) { - ctx.write(new IntWritable(labelIndex.get(label)), instance); - } else { - ctx.getCounter(Counter.SKIPPED_INSTANCES).increment(1); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/StandardThetaTrainer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/StandardThetaTrainer.java deleted file mode 100644 index 3fdee8eb7..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/StandardThetaTrainer.java +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - package org.apache.mahout.classifier.naivebayes.training; - -import java.util.Iterator; - -import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier; -import org.apache.mahout.math.Vector; - -public class StandardThetaTrainer extends AbstractThetaTrainer { - - public StandardThetaTrainer(Vector weightsPerFeature, Vector weightsPerLabel, double alphaI) { - super(weightsPerFeature, weightsPerLabel, alphaI); - } - - @Override - public void train(int label, Vector perLabelWeight) { - double labelWeight = labelWeight(label); - Iterator it = perLabelWeight.iterateNonZero(); - while (it.hasNext()) { - Vector.Element e = it.next(); - updatePerLabelThetaNormalizer(label, - StandardNaiveBayesClassifier.computeWeight(e.get(), labelWeight, alphaI(), numFeatures())); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/ThetaMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/ThetaMapper.java deleted file mode 100644 index 7e92e98f1..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/ThetaMapper.java +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
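
The standard trainer's per-term computation can likewise be exercised in isolation; the argument values here are invented, and the reading of the result as a smoothed log-likelihood of the feature given the label is an assumption to be checked against StandardNaiveBayesClassifier:

double theta = StandardNaiveBayesClassifier.computeWeight(
    2.0,   // weight of this feature under the current label
    10.0,  // total weight of the current label
    1.0,   // alphaI smoothing parameter
    50.0); // number of features
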
- */ - -package org.apache.mahout.classifier.naivebayes.training; - -import java.io.IOException; -import java.util.Map; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.classifier.naivebayes.BayesUtils; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -public class ThetaMapper extends Mapper { - - public static final String ALPHA_I = ThetaMapper.class.getName() + ".alphaI"; - static final String TRAIN_COMPLEMENTARY = ThetaMapper.class.getName() + ".trainComplementary"; - - private AbstractThetaTrainer trainer; - - @Override - protected void setup(Context ctx) throws IOException, InterruptedException { - super.setup(ctx); - Configuration conf = ctx.getConfiguration(); - - float alphaI = conf.getFloat(ALPHA_I, 1.0f); - Map scores = BayesUtils.readScoresFromCache(conf); - - if (conf.getBoolean(TRAIN_COMPLEMENTARY, false)) { - trainer = new ComplementaryThetaTrainer(scores.get(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE), - scores.get(TrainNaiveBayesJob.WEIGHTS_PER_LABEL), alphaI); - } else { - trainer = new StandardThetaTrainer(scores.get(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE), - scores.get(TrainNaiveBayesJob.WEIGHTS_PER_LABEL), alphaI); - } - } - - @Override - protected void map(IntWritable key, VectorWritable value, Context ctx) throws IOException, InterruptedException { - trainer.train(key.get(), value.get()); - } - - @Override - protected void cleanup(Context ctx) throws IOException, InterruptedException { - ctx.write(new Text(TrainNaiveBayesJob.LABEL_THETA_NORMALIZER), - new VectorWritable(trainer.retrievePerLabelThetaNormalizer())); - super.cleanup(ctx); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java deleted file mode 100644 index e840c068b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java +++ /dev/null @@ -1,157 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.naivebayes.training; - -import com.google.common.base.Splitter; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.classifier.naivebayes.BayesUtils; -import org.apache.mahout.classifier.naivebayes.NaiveBayesModel; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import org.apache.mahout.common.mapreduce.VectorSumReducer; -import org.apache.mahout.math.VectorWritable; - -import java.io.IOException; -import java.util.List; -import java.util.Map; - -/** - * This class trains a Naive Bayes Classifier (Parameters for both Naive Bayes and Complementary Naive Bayes) - */ -public final class TrainNaiveBayesJob extends AbstractJob { - private static final String TRAIN_COMPLEMENTARY = "trainComplementary"; - private static final String ALPHA_I = "alphaI"; - private static final String LABEL_INDEX = "labelIndex"; - private static final String EXTRACT_LABELS = "extractLabels"; - private static final String LABELS = "labels"; - public static final String WEIGHTS_PER_FEATURE = "__SPF"; - public static final String WEIGHTS_PER_LABEL = "__SPL"; - public static final String LABEL_THETA_NORMALIZER = "_LTN"; - - public static final String SUMMED_OBSERVATIONS = "summedObservations"; - public static final String WEIGHTS = "weights"; - public static final String THETAS = "thetas"; - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new TrainNaiveBayesJob(), args); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption(LABELS, "l", "comma-separated list of labels to include in training", false); - - addOption(buildOption(EXTRACT_LABELS, "el", "Extract the labels from the input", false, false, "")); - addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f)); - addOption(buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false))); - addOption(LABEL_INDEX, "li", "The path to store the label index in", false); - addOption(DefaultOptionCreator.overwriteOption().create()); - Map> parsedArgs = parseArguments(args); - if (parsedArgs == null) { - return -1; - } - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(getConf(), getOutputPath()); - HadoopUtil.delete(getConf(), getTempPath()); - } - Path labPath; - String labPathStr = getOption(LABEL_INDEX); - if (labPathStr != null) { - labPath = new Path(labPathStr); - } else { - labPath = getTempPath(LABEL_INDEX); - } - long labelSize = createLabelIndex(labPath); - float alphaI = Float.parseFloat(getOption(ALPHA_I)); - boolean trainComplementary = Boolean.parseBoolean(getOption(TRAIN_COMPLEMENTARY)); - - - HadoopUtil.setSerializations(getConf()); - HadoopUtil.cacheFiles(labPath, getConf()); - - //add up all the vectors with the same labels, while mapping the labels 
into our index - Job indexInstances = prepareJob(getInputPath(), getTempPath(SUMMED_OBSERVATIONS), SequenceFileInputFormat.class, - IndexInstancesMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class, IntWritable.class, - VectorWritable.class, SequenceFileOutputFormat.class); - indexInstances.setCombinerClass(VectorSumReducer.class); - boolean succeeded = indexInstances.waitForCompletion(true); - if (!succeeded) { - return -1; - } - //sum up all the weights from the previous step, per label and per feature - Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(WEIGHTS), - SequenceFileInputFormat.class, WeightsMapper.class, Text.class, VectorWritable.class, VectorSumReducer.class, - Text.class, VectorWritable.class, SequenceFileOutputFormat.class); - weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize)); - weightSummer.setCombinerClass(VectorSumReducer.class); - succeeded = weightSummer.waitForCompletion(true); - if (!succeeded) { - return -1; - } - - //put the per label and per feature vectors into the cache - HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf()); - - //calculate the Thetas, write out to LABEL_THETA_NORMALIZER vectors -- TODO: add reference here to the part of the Rennie paper that discusses this - Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(THETAS), - SequenceFileInputFormat.class, ThetaMapper.class, Text.class, VectorWritable.class, VectorSumReducer.class, - Text.class, VectorWritable.class, SequenceFileOutputFormat.class); - thetaSummer.setCombinerClass(VectorSumReducer.class); - thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI); - thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary); - /* TODO(robinanil): Enable this when thetanormalization works. 
- succeeded = thetaSummer.waitForCompletion(true); - if (!succeeded) { - return -1; - }*/ - - //validate our model and then write it out to the official output - NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf()); - naiveBayesModel.validate(); - naiveBayesModel.serialize(getOutputPath(), getConf()); - - return 0; - } - - private long createLabelIndex(Path labPath) throws IOException { - long labelSize = 0; - if (hasOption(LABELS)) { - Iterable labels = Splitter.on(",").split(getOption(LABELS)); - labelSize = BayesUtils.writeLabelIndex(getConf(), labels, labPath); - } else if (hasOption(EXTRACT_LABELS)) { - SequenceFileDirIterable iterable = - new SequenceFileDirIterable(getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), getConf()); - labelSize = BayesUtils.writeLabelIndex(getConf(), labPath, iterable); - } - return labelSize; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/WeightsMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/WeightsMapper.java deleted file mode 100644 index 31547371a..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/WeightsMapper.java +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
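
Putting the pieces together, a plausible programmatic invocation of the trainer; all paths are invented, while the long option names match the ones registered in run() above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob;

public class TrainNaiveBayesExample {
  public static void main(String[] args) throws Exception {
    String[] trainArgs = {
        "--input", "/tmp/nb/vectors",         // SequenceFile<Text,VectorWritable> of labeled vectors
        "--output", "/tmp/nb/model",          // where the serialized NaiveBayesModel is written
        "--extractLabels",                    // derive labels from the "/label/docId" keys
        "--labelIndex", "/tmp/nb/labelindex", // where the label index is stored
        "--overwrite"
    };
    ToolRunner.run(new Configuration(), new TrainNaiveBayesJob(), trainArgs);
  }
}
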
- */ - -package org.apache.mahout.classifier.naivebayes.training; - -import java.io.IOException; - -import com.google.common.base.Preconditions; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.function.Functions; - -public class WeightsMapper extends Mapper { - - static final String NUM_LABELS = WeightsMapper.class.getName() + ".numLabels"; - - private Vector weightsPerFeature; - private Vector weightsPerLabel; - - @Override - protected void setup(Context ctx) throws IOException, InterruptedException { - super.setup(ctx); - int numLabels = Integer.parseInt(ctx.getConfiguration().get(NUM_LABELS)); - Preconditions.checkArgument(numLabels > 0); - weightsPerLabel = new RandomAccessSparseVector(numLabels); - } - - @Override - protected void map(IntWritable index, VectorWritable value, Context ctx) throws IOException, InterruptedException { - Vector instance = value.get(); - if (weightsPerFeature == null) { - weightsPerFeature = new RandomAccessSparseVector(instance.size(), instance.getNumNondefaultElements()); - } - - int label = index.get(); - weightsPerFeature.assign(instance, Functions.PLUS); - weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum()); - } - - @Override - protected void cleanup(Context ctx) throws IOException, InterruptedException { - if (weightsPerFeature != null) { - ctx.write(new Text(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE), new VectorWritable(weightsPerFeature)); - ctx.write(new Text(TrainNaiveBayesJob.WEIGHTS_PER_LABEL), new VectorWritable(weightsPerLabel)); - } - super.cleanup(ctx); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java deleted file mode 100644 index 3ceeb0aad..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java +++ /dev/null @@ -1,168 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
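
A compressed view of what WeightsMapper accumulates, run on one toy instance; dimensions and values are made up:

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.function.Functions;

public class WeightsExample {
  public static void main(String[] args) {
    Vector weightsPerFeature = new RandomAccessSparseVector(3); // one slot per feature
    Vector weightsPerLabel = new RandomAccessSparseVector(2);   // one slot per label

    Vector instance = new RandomAccessSparseVector(3);          // toy instance with label 0
    instance.set(0, 1.0);
    instance.set(2, 2.0);

    int label = 0;
    weightsPerFeature.assign(instance, Functions.PLUS);         // element-wise feature sums
    weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum()); // zSum() == 3.0
  }
}
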
- */ - -package org.apache.mahout.classifier.sequencelearning.hmm; - -import com.google.common.io.Closeables; -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.mahout.common.CommandLineUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; - -import java.io.DataOutputStream; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Date; -import java.util.List; -import java.util.Scanner; - -/** - * A class for EM training of HMM from console - */ -public final class BaumWelchTrainer { - - private BaumWelchTrainer() { - } - - public static void main(String[] args) throws IOException { - DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder(); - ArgumentBuilder argumentBuilder = new ArgumentBuilder(); - - Option inputOption = DefaultOptionCreator.inputOption().create(); - - Option outputOption = DefaultOptionCreator.outputOption().create(); - - Option stateNumberOption = optionBuilder.withLongName("nrOfHiddenStates"). - withDescription("Number of hidden states"). - withShortName("nh").withArgument(argumentBuilder.withMaximum(1).withMinimum(1). - withName("number").create()).withRequired(true).create(); - - Option observedStateNumberOption = optionBuilder.withLongName("nrOfObservedStates"). - withDescription("Number of observed states"). - withShortName("no").withArgument(argumentBuilder.withMaximum(1).withMinimum(1). - withName("number").create()).withRequired(true).create(); - - Option epsilonOption = optionBuilder.withLongName("epsilon"). - withDescription("Convergence threshold"). - withShortName("e").withArgument(argumentBuilder.withMaximum(1).withMinimum(1). - withName("number").create()).withRequired(true).create(); - - Option iterationsOption = optionBuilder.withLongName("max-iterations"). - withDescription("Maximum iterations number"). - withShortName("m").withArgument(argumentBuilder.withMaximum(1).withMinimum(1). - withName("number").create()).withRequired(true).create(); - - Group optionGroup = new GroupBuilder().withOption(inputOption). - withOption(outputOption).withOption(stateNumberOption).withOption(observedStateNumberOption). - withOption(epsilonOption).withOption(iterationsOption). 
- withName("Options").create(); - - try { - Parser parser = new Parser(); - parser.setGroup(optionGroup); - CommandLine commandLine = parser.parse(args); - - String input = (String) commandLine.getValue(inputOption); - String output = (String) commandLine.getValue(outputOption); - - int nrOfHiddenStates = Integer.parseInt((String) commandLine.getValue(stateNumberOption)); - int nrOfObservedStates = Integer.parseInt((String) commandLine.getValue(observedStateNumberOption)); - - double epsilon = Double.parseDouble((String) commandLine.getValue(epsilonOption)); - int maxIterations = Integer.parseInt((String) commandLine.getValue(iterationsOption)); - - //constructing random-generated HMM - HmmModel model = new HmmModel(nrOfHiddenStates, nrOfObservedStates, new Date().getTime()); - List observations = new ArrayList(); - - //reading observations - Scanner scanner = new Scanner(new FileInputStream(input)); - try { - while (scanner.hasNextInt()) { - observations.add(scanner.nextInt()); - } - } finally { - scanner.close(); - } - - int[] observationsArray = new int[observations.size()]; - for (int i = 0; i < observations.size(); ++i) { - observationsArray[i] = observations.get(i); - } - - //training - HmmModel trainedModel = HmmTrainer.trainBaumWelch(model, - observationsArray, epsilon, maxIterations, true); - - //serializing trained model - DataOutputStream stream = new DataOutputStream(new FileOutputStream(output)); - try { - LossyHmmSerializer.serialize(trainedModel, stream); - } finally { - Closeables.closeQuietly(stream); - } - - //printing tranied model - System.out.println("Initial probabilities: "); - for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) { - System.out.print(i + " "); - } - System.out.println(); - for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) { - System.out.print(trainedModel.getInitialProbabilities().get(i) + " "); - } - System.out.println(); - - System.out.println("Transition matrix:"); - System.out.print(" "); - for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) { - System.out.print(i + " "); - } - System.out.println(); - for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) { - System.out.print(i + " "); - for (int j = 0; j < trainedModel.getNrOfHiddenStates(); ++j) { - System.out.print(trainedModel.getTransitionMatrix().get(i, j) + " "); - } - System.out.println(); - } - System.out.println("Emission matrix: "); - System.out.print(" "); - for (int i = 0; i < trainedModel.getNrOfOutputStates(); ++i) { - System.out.print(i + " "); - } - System.out.println(); - for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) { - System.out.print(i + " "); - for (int j = 0; j < trainedModel.getNrOfOutputStates(); ++j) { - System.out.print(trainedModel.getEmissionMatrix().get(i, j) + " "); - } - System.out.println(); - } - } catch (OptionException e) { - CommandLineUtil.printHelp(optionGroup); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmAlgorithms.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmAlgorithms.java deleted file mode 100644 index c1d328ed2..000000000 --- 
a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmAlgorithms.java +++ /dev/null @@ -1,306 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sequencelearning.hmm; - -import org.apache.mahout.math.DenseMatrix; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.Vector; - -/** - * Class containing implementations of the three major HMM algorithms: forward, - * backward and Viterbi - */ -public final class HmmAlgorithms { - - - /** - * No public constructors for utility classes. - */ - private HmmAlgorithms() { - // nothing to do here really - } - - /** - * External function to compute a matrix of alpha factors - * - * @param model model to run forward algorithm for. - * @param observations observation sequence to train on. - * @param scaled Should log-scaled beta factors be computed? - * @return matrix of alpha factors. - */ - public static Matrix forwardAlgorithm(HmmModel model, int[] observations, boolean scaled) { - Matrix alpha = new DenseMatrix(observations.length, model.getNrOfHiddenStates()); - forwardAlgorithm(alpha, model, observations, scaled); - - return alpha; - } - - /** - * Internal function to compute the alpha factors - * - * @param alpha matrix to store alpha factors in. - * @param model model to use for alpha factor computation. - * @param observations observation sequence seen. - * @param scaled set to true if log-scaled beta factors should be computed. 
- */ - static void forwardAlgorithm(Matrix alpha, HmmModel model, int[] observations, boolean scaled) { - - // fetch references to the model parameters - Vector ip = model.getInitialProbabilities(); - Matrix b = model.getEmissionMatrix(); - Matrix a = model.getTransitionMatrix(); - - if (scaled) { // compute log scaled alpha values - // Initialization - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - alpha.setQuick(0, i, Math.log(ip.getQuick(i) * b.getQuick(i, observations[0]))); - } - - // Induction - for (int t = 1; t < observations.length; t++) { - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - double sum = Double.NEGATIVE_INFINITY; // log(0) - for (int j = 0; j < model.getNrOfHiddenStates(); j++) { - double tmp = alpha.getQuick(t - 1, j) + Math.log(a.getQuick(j, i)); - if (tmp > Double.NEGATIVE_INFINITY) { - // make sure we handle log(0) correctly - sum = tmp + Math.log1p(Math.exp(sum - tmp)); - } - } - alpha.setQuick(t, i, sum + Math.log(b.getQuick(i, observations[t]))); - } - } - } else { - - // Initialization - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - alpha.setQuick(0, i, ip.getQuick(i) * b.getQuick(i, observations[0])); - } - - // Induction - for (int t = 1; t < observations.length; t++) { - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - double sum = 0.0; - for (int j = 0; j < model.getNrOfHiddenStates(); j++) { - sum += alpha.getQuick(t - 1, j) * a.getQuick(j, i); - } - alpha.setQuick(t, i, sum * b.getQuick(i, observations[t])); - } - } - } - } - - /** - * External function to compute a matrix of beta factors - * - * @param model model to use for estimation. - * @param observations observation sequence seen. - * @param scaled Set to true if log-scaled beta factors should be computed. - * @return beta factors based on the model and observation sequence. - */ - public static Matrix backwardAlgorithm(HmmModel model, int[] observations, boolean scaled) { - // initialize the matrix - Matrix beta = new DenseMatrix(observations.length, model.getNrOfHiddenStates()); - // compute the beta factors - backwardAlgorithm(beta, model, observations, scaled); - - return beta; - } - - /** - * Internal function to compute the beta factors - * - * @param beta Matrix to store resulting factors in. - * @param model model to use for factor estimation. - * @param observations sequence of observations to estimate. - * @param scaled set to true to compute log-scaled parameters. 
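
The scaled inductions above rely on the identity log(e^a + e^b) = max(a, b) + log1p(e^(min(a, b) - max(a, b))). A standalone restatement of that step (this helper is a sketch, not part of the Mahout API):

static double logAdd(double a, double b) {
  if (a == Double.NEGATIVE_INFINITY) {
    return b; // log(0) contributes nothing
  }
  if (b == Double.NEGATIVE_INFINITY) {
    return a;
  }
  double max = Math.max(a, b);
  double min = Math.min(a, b);
  // exp(min - max) <= 1, so the exponential never overflows
  return max + Math.log1p(Math.exp(min - max));
}
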
- */ - static void backwardAlgorithm(Matrix beta, HmmModel model, int[] observations, boolean scaled) { - // fetch references to the model parameters - Matrix b = model.getEmissionMatrix(); - Matrix a = model.getTransitionMatrix(); - - if (scaled) { // compute log-scaled factors - // initialization - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - beta.setQuick(observations.length - 1, i, 0); - } - - // induction - for (int t = observations.length - 2; t >= 0; t--) { - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - double sum = Double.NEGATIVE_INFINITY; // log(0) - for (int j = 0; j < model.getNrOfHiddenStates(); j++) { - double tmp = beta.getQuick(t + 1, j) + Math.log(a.getQuick(i, j)) - + Math.log(b.getQuick(j, observations[t + 1])); - if (tmp > Double.NEGATIVE_INFINITY) { - // handle log(0) - sum = tmp + Math.log1p(Math.exp(sum - tmp)); - } - } - beta.setQuick(t, i, sum); - } - } - } else { - // initialization - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - beta.setQuick(observations.length - 1, i, 1); - } - // induction - for (int t = observations.length - 2; t >= 0; t--) { - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - double sum = 0; - for (int j = 0; j < model.getNrOfHiddenStates(); j++) { - sum += beta.getQuick(t + 1, j) * a.getQuick(i, j) * b.getQuick(j, observations[t + 1]); - } - beta.setQuick(t, i, sum); - } - } - } - } - - /** - * Viterbi algorithm to compute the most likely hidden sequence for a given - * model and observed sequence - * - * @param model HmmModel for which the Viterbi path should be computed - * @param observations Sequence of observations - * @param scaled Use log-scaled computations, this requires higher computational - * effort but is numerically more stable for large observation - * sequences - * @return nrOfObservations 1D int array containing the most likely hidden - * sequence - */ - public static int[] viterbiAlgorithm(HmmModel model, int[] observations, boolean scaled) { - - // probability that the most probable hidden states ends at state i at - // time t - double[][] delta = new double[observations.length][model - .getNrOfHiddenStates()]; - - // previous hidden state in the most probable state leading up to state - // i at time t - int[][] phi = new int[observations.length - 1][model.getNrOfHiddenStates()]; - - // initialize the return array - int[] sequence = new int[observations.length]; - - viterbiAlgorithm(sequence, delta, phi, model, observations, scaled); - - return sequence; - } - - /** - * Internal version of the viterbi algorithm, allowing to reuse existing - * arrays instead of allocating new ones - * - * @param sequence NrOfObservations 1D int array for storing the viterbi sequence - * @param delta NrOfObservations x NrHiddenStates 2D double array for storing the - * delta factors - * @param phi NrOfObservations-1 x NrHiddenStates 2D int array for storing the - * phi values - * @param model HmmModel for which the viterbi path should be computed - * @param observations Sequence of observations - * @param scaled Use log-scaled computations, this requires higher computational - * effort but is numerically more stable for large observation - * sequences - */ - static void viterbiAlgorithm(int[] sequence, double[][] delta, int[][] phi, HmmModel model, int[] observations, - boolean scaled) { - // fetch references to the model parameters - Vector ip = model.getInitialProbabilities(); - Matrix b = model.getEmissionMatrix(); - Matrix a = model.getTransitionMatrix(); - - // Initialization - if (scaled) 
{ - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - delta[0][i] = Math.log(ip.getQuick(i) * b.getQuick(i, observations[0])); - } - } else { - - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - delta[0][i] = ip.getQuick(i) * b.getQuick(i, observations[0]); - } - } - - // Induction - // iterate over the time - if (scaled) { - for (int t = 1; t < observations.length; t++) { - // iterate over the hidden states - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - // find the maximum probability and most likely state - // leading up - // to this - int maxState = 0; - double maxProb = delta[t - 1][0] + Math.log(a.getQuick(0, i)); - for (int j = 1; j < model.getNrOfHiddenStates(); j++) { - double prob = delta[t - 1][j] + Math.log(a.getQuick(j, i)); - if (prob > maxProb) { - maxProb = prob; - maxState = j; - } - } - delta[t][i] = maxProb + Math.log(b.getQuick(i, observations[t])); - phi[t - 1][i] = maxState; - } - } - } else { - for (int t = 1; t < observations.length; t++) { - // iterate over the hidden states - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - // find the maximum probability and most likely state - // leading up - // to this - int maxState = 0; - double maxProb = delta[t - 1][0] * a.getQuick(0, i); - for (int j = 1; j < model.getNrOfHiddenStates(); j++) { - double prob = delta[t - 1][j] * a.getQuick(j, i); - if (prob > maxProb) { - maxProb = prob; - maxState = j; - } - } - delta[t][i] = maxProb * b.getQuick(i, observations[t]); - phi[t - 1][i] = maxState; - } - } - } - - // find the most likely end state for initialization - double maxProb; - if (scaled) { - maxProb = Double.NEGATIVE_INFINITY; - } else { - maxProb = 0.0; - } - for (int i = 0; i < model.getNrOfHiddenStates(); i++) { - if (delta[observations.length - 1][i] > maxProb) { - maxProb = delta[observations.length - 1][i]; - sequence[observations.length - 1] = i; - } - } - - // now backtrack to find the most likely hidden sequence - for (int t = observations.length - 2; t >= 0; t--) { - sequence[t] = phi[t][sequence[t + 1]]; - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmEvaluator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmEvaluator.java deleted file mode 100644 index d287d4d8f..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmEvaluator.java +++ /dev/null @@ -1,192 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
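
Tying the algorithms together, a small end-to-end decoding sketch; the 2-state model parameters are invented, but the HmmModel constructor and the viterbiAlgorithm signature are the ones defined above:

import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;

public class ViterbiExample {
  public static void main(String[] args) {
    HmmModel model = new HmmModel(
        new DenseMatrix(new double[][] {{0.9, 0.1}, {0.2, 0.8}}), // transition probabilities
        new DenseMatrix(new double[][] {{0.7, 0.3}, {0.1, 0.9}}), // emission probabilities
        new DenseVector(new double[] {0.5, 0.5}));                // initial probabilities
    int[] observations = {0, 0, 1, 1};
    // scaled = true uses log-space Viterbi for numerical stability
    int[] hiddenPath = HmmAlgorithms.viterbiAlgorithm(model, observations, true);
  }
}
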
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sequencelearning.hmm; - -import java.util.Random; - -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.Vector; - -/** - * The HMMEvaluator class offers several methods to evaluate an HMM Model. The - * following use-cases are covered: 1) Generate a sequence of output states from - * a given model (prediction). 2) Compute the likelihood that a given model - * generated a given sequence of output states (model likelihood). 3) Compute - * the most likely hidden sequence for a given model and a given observed - * sequence (decoding). - */ -public final class HmmEvaluator { - - /** - * No constructor for utility classes. - */ - private HmmEvaluator() { - // Nothing to do here. - } - - /** - * Predict a sequence of steps output states for the given HMM model - * - * @param model The Hidden Markov model used to generate the output sequence - * @param steps Size of the generated output sequence - * @return integer array containing a sequence of steps output state IDs, - * generated by the specified model - */ - public static int[] predict(HmmModel model, int steps) { - return predict(model, steps, 0); - } - - /** - * Predict a sequence of steps output states for the given HMM model using the - * given seed for probabilistic experiments - * - * @param model The Hidden Markov model used to generate the output sequence - * @param steps Size of the generated output sequence - * @param seed Seed to initialize the RNG - * @return integer array containing a sequence of steps output state IDs, - * generated by the specified model - */ - public static int[] predict(HmmModel model, int steps, long seed) { - // create the random number generator - Random rand; - if (seed == 0) { - rand = RandomUtils.getRandom(); - } else { - rand = RandomUtils.getRandom(seed); - } - // fetch the cumulative distributions - Vector cip = HmmUtils.getCumulativeInitialProbabilities(model); - Matrix ctm = HmmUtils.getCumulativeTransitionMatrix(model); - Matrix com = HmmUtils.getCumulativeOutputMatrix(model); - // allocate the result IntArrayList - int[] result = new int[steps]; - // choose the initial state - int hiddenState = 0; - - double randnr = rand.nextDouble(); - while (cip.get(hiddenState) < randnr) { - hiddenState++; - } - - // now draw steps output states according to the cumulative - // distributions - for (int step = 0; step < steps; ++step) { - // choose output state to given hidden state - randnr = rand.nextDouble(); - int outputState = 0; - while (com.get(hiddenState, outputState) < randnr) { - outputState++; - } - result[step] = outputState; - // choose the next hidden state - randnr = rand.nextDouble(); - int nextHiddenState = 0; - while (ctm.get(hiddenState, nextHiddenState) < randnr) { - nextHiddenState++; - } - hiddenState = nextHiddenState; - } - return result; - } - - /** - * Returns the likelihood that a given output sequence was produced by the - * given model. Internally, this function calls the forward algorithm to - * compute the alpha values and then uses the overloaded function to compute - * the actual model likelihood. - * - * @param model Model to base the likelihood on. - * @param outputSequence Sequence to compute likelihood for. - * @param scaled Use log-scaled parameters for computation. 
This is computationally
- * more expensive, but offers better numerical stability in the case of
- * long output sequences
- * @return Likelihood that the given model produced the given sequence
- */
-  public static double modelLikelihood(HmmModel model, int[] outputSequence,
-                                       boolean scaled) {
-    return modelLikelihood(HmmAlgorithms.forwardAlgorithm(model, outputSequence, scaled), scaled);
-  }
-
-  /**
-   * Computes the likelihood that a given output sequence was produced by a
-   * given model, using the alpha values computed by the forward algorithm.
-   * The output sequence is not passed in directly: it enters only through
-   * the alpha values, which the forward algorithm computed from it.
-   * @param alpha Matrix of alpha values
-   * @param scaled Set to true if the alpha values are log-scaled.
-   * @return model likelihood.
-   */
-  public static double modelLikelihood(Matrix alpha, boolean scaled) {
-    double likelihood = 0;
-    if (scaled) {
-      for (int i = 0; i < alpha.numCols(); ++i) {
-        likelihood += Math.exp(alpha.getQuick(alpha.numRows() - 1, i));
-      }
-    } else {
-      for (int i = 0; i < alpha.numCols(); ++i) {
-        likelihood += alpha.getQuick(alpha.numRows() - 1, i);
-      }
-    }
-    return likelihood;
-  }
-
-  /**
-   * Computes the likelihood that a given output sequence was produced by a
-   * given model.
-   *
-   * @param model model to compute sequence likelihood for.
-   * @param outputSequence sequence to base computation on.
-   * @param beta beta parameters.
-   * @param scaled set to true if betas are log-scaled.
-   * @return likelihood of the outputSequence given the model.
-   */
-  public static double modelLikelihood(HmmModel model, int[] outputSequence, Matrix beta, boolean scaled) {
-    double likelihood = 0;
-    // fetch the emission probabilities
-    Matrix e = model.getEmissionMatrix();
-    Vector pi = model.getInitialProbabilities();
-    int firstOutput = outputSequence[0];
-    if (scaled) {
-      for (int i = 0; i < model.getNrOfHiddenStates(); ++i) {
-        likelihood += pi.getQuick(i) * Math.exp(beta.getQuick(0, i)) * e.getQuick(i, firstOutput);
-      }
-    } else {
-      for (int i = 0; i < model.getNrOfHiddenStates(); ++i) {
-        likelihood += pi.getQuick(i) * beta.getQuick(0, i) * e.getQuick(i, firstOutput);
-      }
-    }
-    return likelihood;
-  }
-
-  /**
-   * Returns the most likely sequence of hidden states for the given model and
-   * observation
-   *
-   * @param model model to use for decoding.
- * @param observations integer Array containing a sequence of observed state IDs - * @param scaled Use log-scaled computations, this requires higher computational - * effort but is numerically more stable for large observation - * sequences - * @return integer array containing the most likely sequence of hidden state - * IDs - */ - public static int[] decode(HmmModel model, int[] observations, boolean scaled) { - return HmmAlgorithms.viterbiAlgorithm(model, observations, scaled); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmModel.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmModel.java deleted file mode 100644 index bc24884ab..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmModel.java +++ /dev/null @@ -1,383 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sequencelearning.hmm; - -import java.util.Map; -import java.util.Random; - -import com.google.common.collect.BiMap; -import com.google.common.collect.HashBiMap; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.DenseMatrix; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.Vector; - -/** - * Main class defining a Hidden Markov Model - */ -public class HmmModel implements Cloneable { - - /** Bi-directional Map for storing the observed state names */ - private BiMap outputStateNames; - - /** Bi-Directional Map for storing the hidden state names */ - private BiMap hiddenStateNames; - - /* Number of hidden states */ - private int nrOfHiddenStates; - - /** Number of output states */ - private int nrOfOutputStates; - - /** - * Transition matrix containing the transition probabilities between hidden - * states. TransitionMatrix(i,j) is the probability that we change from hidden - * state i to hidden state j In general: P(h(t+1)=h_j | h(t) = h_i) = - * transitionMatrix(i,j) Since we have to make sure that each hidden state can - * be "left", the following normalization condition has to hold: - * sum(transitionMatrix(i,j),j=1..hiddenStates) = 1 - */ - private Matrix transitionMatrix; - - /** - * Output matrix containing the probabilities that we observe a given output - * state given a hidden state. 
outputMatrix(i,j) is the probability that we - * observe output state j if we are in hidden state i Formally: P(o(t)=o_j | - * h(t)=h_i) = outputMatrix(i,j) Since we always have an observation for each - * hidden state, the following normalization condition has to hold: - * sum(outputMatrix(i,j),j=1..outputStates) = 1 - */ - private Matrix emissionMatrix; - - /** - * Vector containing the initial hidden state probabilities. That is - * P(h(0)=h_i) = initialProbabilities(i). Since we are dealing with - * probabilities the following normalization condition has to hold: - * sum(initialProbabilities(i),i=1..hiddenStates) = 1 - */ - private Vector initialProbabilities; - - - /** - * Get a copy of this model - */ - @Override - public HmmModel clone() { - HmmModel model = new HmmModel(transitionMatrix.clone(), emissionMatrix.clone(), initialProbabilities.clone()); - if (hiddenStateNames != null) { - model.hiddenStateNames = HashBiMap.create(hiddenStateNames); - } - if (outputStateNames != null) { - model.outputStateNames = HashBiMap.create(outputStateNames); - } - return model; - } - - /** - * Assign the content of another HMM model to this one - * - * @param model The HmmModel that will be assigned to this one - */ - public void assign(HmmModel model) { - this.nrOfHiddenStates = model.nrOfHiddenStates; - this.nrOfOutputStates = model.nrOfOutputStates; - this.hiddenStateNames = model.hiddenStateNames; - this.outputStateNames = model.outputStateNames; - // for now clone the matrix/vectors - this.initialProbabilities = model.initialProbabilities.clone(); - this.emissionMatrix = model.emissionMatrix.clone(); - this.transitionMatrix = model.transitionMatrix.clone(); - } - - /** - * Construct a valid random Hidden-Markov parameter set with the given number - * of hidden and output states using a given seed. - * - * @param nrOfHiddenStates Number of hidden states - * @param nrOfOutputStates Number of output states - * @param seed Seed for the random initialization, if set to 0 the current time - * is used - */ - public HmmModel(int nrOfHiddenStates, int nrOfOutputStates, long seed) { - this.nrOfHiddenStates = nrOfHiddenStates; - this.nrOfOutputStates = nrOfOutputStates; - this.transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates); - this.emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates); - this.initialProbabilities = new DenseVector(nrOfHiddenStates); - // initialize a random, valid parameter set - initRandomParameters(seed); - } - - /** - * Construct a valid random Hidden-Markov parameter set with the given number - * of hidden and output states. - * - * @param nrOfHiddenStates Number of hidden states - * @param nrOfOutputStates Number of output states - */ - public HmmModel(int nrOfHiddenStates, int nrOfOutputStates) { - this(nrOfHiddenStates, nrOfOutputStates, 0); - } - - /** - * Generates a Hidden Markov model using the specified parameters - * - * @param transitionMatrix transition probabilities. - * @param emissionMatrix emission probabilities. - * @param initialProbabilities initial start probabilities. 
- * @throws IllegalArgumentException If the given parameter set is invalid - */ - public HmmModel(Matrix transitionMatrix, Matrix emissionMatrix, Vector initialProbabilities) { - this.nrOfHiddenStates = initialProbabilities.size(); - this.nrOfOutputStates = emissionMatrix.numCols(); - this.transitionMatrix = transitionMatrix; - this.emissionMatrix = emissionMatrix; - this.initialProbabilities = initialProbabilities; - } - - /** - * Initialize a valid random set of HMM parameters - * - * @param seed seed to use for Random initialization. Use 0 to use Java-built-in-version. - */ - private void initRandomParameters(long seed) { - Random rand; - // initialize the random number generator - if (seed == 0) { - rand = RandomUtils.getRandom(); - } else { - rand = RandomUtils.getRandom(seed); - } - // initialize the initial Probabilities - double sum = 0; // used for normalization - for (int i = 0; i < nrOfHiddenStates; i++) { - double nextRand = rand.nextDouble(); - initialProbabilities.set(i, nextRand); - sum += nextRand; - } - // "normalize" the vector to generate probabilities - initialProbabilities = initialProbabilities.divide(sum); - - // initialize the transition matrix - double[] values = new double[nrOfHiddenStates]; - for (int i = 0; i < nrOfHiddenStates; i++) { - sum = 0; - for (int j = 0; j < nrOfHiddenStates; j++) { - values[j] = rand.nextDouble(); - sum += values[j]; - } - // normalize the random values to obtain probabilities - for (int j = 0; j < nrOfHiddenStates; j++) { - values[j] /= sum; - } - // set this row of the transition matrix - transitionMatrix.set(i, values); - } - - // initialize the output matrix - values = new double[nrOfOutputStates]; - for (int i = 0; i < nrOfHiddenStates; i++) { - sum = 0; - for (int j = 0; j < nrOfOutputStates; j++) { - values[j] = rand.nextDouble(); - sum += values[j]; - } - // normalize the random values to obtain probabilities - for (int j = 0; j < nrOfOutputStates; j++) { - values[j] /= sum; - } - // set this row of the output matrix - emissionMatrix.set(i, values); - } - } - - /** - * Getter Method for the number of hidden states - * - * @return Number of hidden states - */ - public int getNrOfHiddenStates() { - return nrOfHiddenStates; - } - - /** - * Getter Method for the number of output states - * - * @return Number of output states - */ - public int getNrOfOutputStates() { - return nrOfOutputStates; - } - - /** - * Getter function to get the hidden state transition matrix - * - * @return returns the model's transition matrix. - */ - public Matrix getTransitionMatrix() { - return transitionMatrix; - } - - /** - * Getter function to get the output state probability matrix - * - * @return returns the models emission matrix. - */ - public Matrix getEmissionMatrix() { - return emissionMatrix; - } - - /** - * Getter function to return the vector of initial hidden state probabilities - * - * @return returns the model's init probabilities. - */ - public Vector getInitialProbabilities() { - return initialProbabilities; - } - - /** - * Getter method for the hidden state Names map - * - * @return hidden state names. - */ - public Map getHiddenStateNames() { - return hiddenStateNames; - } - - /** - * Register an array of hidden state Names. We assume that the state name at - * position i has the ID i - * - * @param stateNames names of hidden states. 
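A hedged sketch of the matrix-based constructor above; the probabilities are invented but row-normalized as the field documentation requires, and the name-registration helpers that follow attach illustrative labels:

Matrix transitions = new DenseMatrix(new double[][] {{0.9, 0.1}, {0.2, 0.8}});
Matrix emissions = new DenseMatrix(new double[][] {{0.7, 0.2, 0.1}, {0.1, 0.3, 0.6}});
Vector initial = new DenseVector(new double[] {0.5, 0.5});
HmmModel twoState = new HmmModel(transitions, emissions, initial);
twoState.registerHiddenStateNames(new String[] {"rainy", "sunny"});   // hidden state i gets ID i
twoState.registerOutputStateNames(new String[] {"walk", "shop", "clean"});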
- */ - public void registerHiddenStateNames(String[] stateNames) { - if (stateNames != null) { - hiddenStateNames = HashBiMap.create(); - for (int i = 0; i < stateNames.length; ++i) { - hiddenStateNames.put(stateNames[i], i); - } - } - } - - /** - * Register a map of hidden state names/state IDs - * - * @param stateNames Map that assigns each state name an integer ID - */ - public void registerHiddenStateNames(Map stateNames) { - if (stateNames != null) { - hiddenStateNames = HashBiMap.create(stateNames); - } - } - - /** - * Lookup the name for the given hidden state ID - * - * @param id Integer id of the hidden state - * @return String containing the name for the given ID, null if this ID is not - * known or no hidden state names were specified - */ - public String getHiddenStateName(int id) { - if (hiddenStateNames == null) { - return null; - } - return hiddenStateNames.inverse().get(id); - } - - /** - * Lookup the ID for the given hidden state name - * - * @param name Name of the hidden state - * @return int containing the ID for the given name, -1 if this name is not - * known or no hidden state names were specified - */ - public int getHiddenStateID(String name) { - if (hiddenStateNames == null) { - return -1; - } - Integer tmp = hiddenStateNames.get(name); - return tmp == null ? -1 : tmp; - } - - /** - * Getter method for the output state names map - * - * @return names of output states. - */ - public Map getOutputStateNames() { - return outputStateNames; - } - - /** - * Register an array of output state names. We assume that the state name at - * position i has the ID i - * - * @param stateNames state names to register. - */ - public void registerOutputStateNames(String[] stateNames) { - if (stateNames != null) { - outputStateNames = HashBiMap.create(); - for (int i = 0; i < stateNames.length; ++i) { - outputStateNames.put(stateNames[i], i); - } - } - } - - /** - * Register a map of output state names/state IDs - * - * @param stateNames Map that assigns each state name an integer ID - */ - public void registerOutputStateNames(Map stateNames) { - if (stateNames != null) { - outputStateNames = HashBiMap.create(stateNames); - } - } - - /** - * Lookup the name for the given output state id - * - * @param id Integer id of the output state - * @return String containing the name for the given id, null if this id is not - * known or no output state names were specified - */ - public String getOutputStateName(int id) { - if (outputStateNames == null) { - return null; - } - return outputStateNames.inverse().get(id); - } - - /** - * Lookup the ID for the given output state name - * - * @param name Name of the output state - * @return int containing the ID for the given name, -1 if this name is not - * known or no output state names were specified - */ - public int getOutputStateID(String name) { - if (outputStateNames == null) { - return -1; - } - Integer tmp = outputStateNames.get(name); - return tmp == null ? 
-1 : tmp; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmTrainer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmTrainer.java deleted file mode 100644 index 1a5e07df4..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmTrainer.java +++ /dev/null @@ -1,488 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sequencelearning.hmm; - -import java.util.Collection; -import java.util.Iterator; - -import org.apache.mahout.math.DenseMatrix; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.Vector; - -/** - * Class containing several algorithms used to train a Hidden Markov Model. The - * three main algorithms are: supervised learning, unsupervised Viterbi and - * unsupervised Baum-Welch. - */ -public final class HmmTrainer { - - /** - * No public constructor for utility classes. - */ - private HmmTrainer() { - // nothing to do here really. - } - - /** - * Create an supervised initial estimate of an HMM Model based on a sequence - * of observed and hidden states. - * - * @param nrOfHiddenStates The total number of hidden states - * @param nrOfOutputStates The total number of output states - * @param observedSequence Integer array containing the observed sequence - * @param hiddenSequence Integer array containing the hidden sequence - * @param pseudoCount Value that is assigned to non-occurring transitions to avoid zero - * probabilities. - * @return An initial model using the estimated parameters - */ - public static HmmModel trainSupervised(int nrOfHiddenStates, int nrOfOutputStates, int[] observedSequence, - int[] hiddenSequence, double pseudoCount) { - // make sure the pseudo count is not zero - pseudoCount = pseudoCount == 0 ? 
Double.MIN_VALUE : pseudoCount; - - // initialize the parameters - DenseMatrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates); - DenseMatrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates); - // assign a small initial probability that is larger than zero, so - // unseen states will not get a zero probability - transitionMatrix.assign(pseudoCount); - emissionMatrix.assign(pseudoCount); - // given no prior knowledge, we have to assume that all initial hidden - // states are equally likely - DenseVector initialProbabilities = new DenseVector(nrOfHiddenStates); - initialProbabilities.assign(1.0 / (double) nrOfHiddenStates); - - // now loop over the sequences to count the number of transitions - countTransitions(transitionMatrix, emissionMatrix, observedSequence, - hiddenSequence); - - // make sure that probabilities are normalized - for (int i = 0; i < nrOfHiddenStates; i++) { - // compute sum of probabilities for current row of transition matrix - double sum = 0; - for (int j = 0; j < nrOfHiddenStates; j++) { - sum += transitionMatrix.getQuick(i, j); - } - // normalize current row of transition matrix - for (int j = 0; j < nrOfHiddenStates; j++) { - transitionMatrix.setQuick(i, j, transitionMatrix.getQuick(i, j) / sum); - } - // compute sum of probabilities for current row of emission matrix - sum = 0; - for (int j = 0; j < nrOfOutputStates; j++) { - sum += emissionMatrix.getQuick(i, j); - } - // normalize current row of emission matrix - for (int j = 0; j < nrOfOutputStates; j++) { - emissionMatrix.setQuick(i, j, emissionMatrix.getQuick(i, j) / sum); - } - } - - // return a new model using the parameter estimations - return new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities); - } - - /** - * Function that counts the number of state->state and state->output - * transitions for the given observed/hidden sequence. - * - * @param transitionMatrix transition matrix to use. - * @param emissionMatrix emission matrix to use for counting. - * @param observedSequence observation sequence to use. - * @param hiddenSequence sequence of hidden states to use. - */ - private static void countTransitions(Matrix transitionMatrix, - Matrix emissionMatrix, int[] observedSequence, int[] hiddenSequence) { - emissionMatrix.setQuick(hiddenSequence[0], observedSequence[0], - emissionMatrix.getQuick(hiddenSequence[0], observedSequence[0]) + 1); - for (int i = 1; i < observedSequence.length; ++i) { - transitionMatrix - .setQuick(hiddenSequence[i - 1], hiddenSequence[i], transitionMatrix - .getQuick(hiddenSequence[i - 1], hiddenSequence[i]) + 1); - emissionMatrix.setQuick(hiddenSequence[i], observedSequence[i], - emissionMatrix.getQuick(hiddenSequence[i], observedSequence[i]) + 1); - } - } - - /** - * Create an supervised initial estimate of an HMM Model based on a number of - * sequences of observed and hidden states. - * - * @param nrOfHiddenStates The total number of hidden states - * @param nrOfOutputStates The total number of output states - * @param hiddenSequences Collection of hidden sequences to use for training - * @param observedSequences Collection of observed sequences to use for training associated with hidden sequences. - * @param pseudoCount Value that is assigned to non-occurring transitions to avoid zero - * probabilities. 
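As a quick illustration of trainSupervised above, a hypothetical call might look like this; the sequence contents and pseudo count are invented:

int[] observed = {0, 1, 2, 1};   // observed-state IDs
int[] hidden = {0, 0, 1, 1};     // aligned hidden-state IDs
HmmModel supervised = HmmTrainer.trainSupervised(2, 3, observed, hidden, 1.0e-5);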
- * @return An initial model using the estimated parameters - */ - public static HmmModel trainSupervisedSequence(int nrOfHiddenStates, - int nrOfOutputStates, Collection hiddenSequences, - Collection observedSequences, double pseudoCount) { - - // make sure the pseudo count is not zero - pseudoCount = pseudoCount == 0 ? Double.MIN_VALUE : pseudoCount; - - // initialize parameters - DenseMatrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, - nrOfHiddenStates); - DenseMatrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, - nrOfOutputStates); - DenseVector initialProbabilities = new DenseVector(nrOfHiddenStates); - - // assign pseudo count to avoid zero probabilities - transitionMatrix.assign(pseudoCount); - emissionMatrix.assign(pseudoCount); - initialProbabilities.assign(pseudoCount); - - // now loop over the sequences to count the number of transitions - Iterator hiddenSequenceIt = hiddenSequences.iterator(); - Iterator observedSequenceIt = observedSequences.iterator(); - while (hiddenSequenceIt.hasNext() && observedSequenceIt.hasNext()) { - // fetch the current set of sequences - int[] hiddenSequence = hiddenSequenceIt.next(); - int[] observedSequence = observedSequenceIt.next(); - // increase the count for initial probabilities - initialProbabilities.setQuick(hiddenSequence[0], initialProbabilities - .getQuick(hiddenSequence[0]) + 1); - countTransitions(transitionMatrix, emissionMatrix, observedSequence, - hiddenSequence); - } - - // make sure that probabilities are normalized - double isum = 0; // sum of initial probabilities - for (int i = 0; i < nrOfHiddenStates; i++) { - isum += initialProbabilities.getQuick(i); - // compute sum of probabilities for current row of transition matrix - double sum = 0; - for (int j = 0; j < nrOfHiddenStates; j++) { - sum += transitionMatrix.getQuick(i, j); - } - // normalize current row of transition matrix - for (int j = 0; j < nrOfHiddenStates; j++) { - transitionMatrix.setQuick(i, j, transitionMatrix.getQuick(i, j) / sum); - } - // compute sum of probabilities for current row of emission matrix - sum = 0; - for (int j = 0; j < nrOfOutputStates; j++) { - sum += emissionMatrix.getQuick(i, j); - } - // normalize current row of emission matrix - for (int j = 0; j < nrOfOutputStates; j++) { - emissionMatrix.setQuick(i, j, emissionMatrix.getQuick(i, j) / sum); - } - } - // normalize the initial probabilities - for (int i = 0; i < nrOfHiddenStates; ++i) { - initialProbabilities.setQuick(i, initialProbabilities.getQuick(i) / isum); - } - - // return a new model using the parameter estimates - return new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities); - } - - /** - * Iteratively train the parameters of the given initial model wrt to the - * observed sequence using Viterbi training. - * - * @param initialModel The initial model that gets iterated - * @param observedSequence The sequence of observed states - * @param pseudoCount Value that is assigned to non-occurring transitions to avoid zero - * probabilities. 
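The multi-sequence variant takes parallel collections of aligned sequences; a sketch with invented data, where the generic types are assumptions and Arrays is java.util.Arrays:

Collection<int[]> hiddenSeqs = Arrays.asList(new int[] {0, 0, 1}, new int[] {1, 1, 0});
Collection<int[]> observedSeqs = Arrays.asList(new int[] {0, 2, 1}, new int[] {2, 1, 0});
HmmModel multi = HmmTrainer.trainSupervisedSequence(2, 3, hiddenSeqs, observedSeqs, 1.0e-5);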
- * @param epsilon Convergence criteria - * @param maxIterations The maximum number of training iterations - * @param scaled Use Log-scaled implementation, this is computationally more - * expensive but offers better numerical stability for large observed - * sequences - * @return The iterated model - */ - public static HmmModel trainViterbi(HmmModel initialModel, - int[] observedSequence, double pseudoCount, double epsilon, - int maxIterations, boolean scaled) { - - // make sure the pseudo count is not zero - pseudoCount = pseudoCount == 0 ? Double.MIN_VALUE : pseudoCount; - - // allocate space for iteration models - HmmModel lastIteration = initialModel.clone(); - HmmModel iteration = initialModel.clone(); - - // allocate space for Viterbi path calculation - int[] viterbiPath = new int[observedSequence.length]; - int[][] phi = new int[observedSequence.length - 1][initialModel - .getNrOfHiddenStates()]; - double[][] delta = new double[observedSequence.length][initialModel - .getNrOfHiddenStates()]; - - // now run the Viterbi training iteration - for (int i = 0; i < maxIterations; ++i) { - // compute the Viterbi path - HmmAlgorithms.viterbiAlgorithm(viterbiPath, delta, phi, lastIteration, - observedSequence, scaled); - // Viterbi iteration uses the viterbi path to update - // the probabilities - Matrix emissionMatrix = iteration.getEmissionMatrix(); - Matrix transitionMatrix = iteration.getTransitionMatrix(); - - // first, assign the pseudo count - emissionMatrix.assign(pseudoCount); - transitionMatrix.assign(pseudoCount); - - // now count the transitions - countTransitions(transitionMatrix, emissionMatrix, observedSequence, - viterbiPath); - - // and normalize the probabilities - for (int j = 0; j < iteration.getNrOfHiddenStates(); ++j) { - double sum = 0; - // normalize the rows of the transition matrix - for (int k = 0; k < iteration.getNrOfHiddenStates(); ++k) { - sum += transitionMatrix.getQuick(j, k); - } - for (int k = 0; k < iteration.getNrOfHiddenStates(); ++k) { - transitionMatrix - .setQuick(j, k, transitionMatrix.getQuick(j, k) / sum); - } - // normalize the rows of the emission matrix - sum = 0; - for (int k = 0; k < iteration.getNrOfOutputStates(); ++k) { - sum += emissionMatrix.getQuick(j, k); - } - for (int k = 0; k < iteration.getNrOfOutputStates(); ++k) { - emissionMatrix.setQuick(j, k, emissionMatrix.getQuick(j, k) / sum); - } - } - // check for convergence - if (checkConvergence(lastIteration, iteration, epsilon)) { - break; - } - // overwrite the last iterated model by the new iteration - lastIteration.assign(iteration); - } - // we are done :) - return iteration; - } - - /** - * Iteratively train the parameters of the given initial model wrt the - * observed sequence using Baum-Welch training. - * - * @param initialModel The initial model that gets iterated - * @param observedSequence The sequence of observed states - * @param epsilon Convergence criteria - * @param maxIterations The maximum number of training iterations - * @param scaled Use log-scaled implementations of forward/backward algorithm. This - * is computationally more expensive, but offers better numerical - * stability for long output sequences. 
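Under the same caveat, a sketch of trainViterbi with invented convergence settings:

HmmModel start = new HmmModel(2, 3);      // random initial parameter set
int[] observedSeq = {0, 1, 2, 2, 1, 0};
HmmModel refined = HmmTrainer.trainViterbi(start, observedSeq,
    1.0e-5 /* pseudoCount */, 1.0e-4 /* epsilon */, 100 /* maxIterations */, true /* scaled */);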
- * @return The iterated model - */ - public static HmmModel trainBaumWelch(HmmModel initialModel, - int[] observedSequence, double epsilon, int maxIterations, boolean scaled) { - // allocate space for the iterations - HmmModel lastIteration = initialModel.clone(); - HmmModel iteration = initialModel.clone(); - - // allocate space for baum-welch factors - int hiddenCount = initialModel.getNrOfHiddenStates(); - int visibleCount = observedSequence.length; - Matrix alpha = new DenseMatrix(visibleCount, hiddenCount); - Matrix beta = new DenseMatrix(visibleCount, hiddenCount); - - // now run the baum Welch training iteration - for (int it = 0; it < maxIterations; ++it) { - // fetch emission and transition matrix of current iteration - Vector initialProbabilities = iteration.getInitialProbabilities(); - Matrix emissionMatrix = iteration.getEmissionMatrix(); - Matrix transitionMatrix = iteration.getTransitionMatrix(); - - // compute forward and backward factors - HmmAlgorithms.forwardAlgorithm(alpha, iteration, observedSequence, scaled); - HmmAlgorithms.backwardAlgorithm(beta, iteration, observedSequence, scaled); - - if (scaled) { - logScaledBaumWelch(observedSequence, iteration, alpha, beta); - } else { - unscaledBaumWelch(observedSequence, iteration, alpha, beta); - } - // normalize transition/emission probabilities - // and normalize the probabilities - double isum = 0; - for (int j = 0; j < iteration.getNrOfHiddenStates(); ++j) { - double sum = 0; - // normalize the rows of the transition matrix - for (int k = 0; k < iteration.getNrOfHiddenStates(); ++k) { - sum += transitionMatrix.getQuick(j, k); - } - for (int k = 0; k < iteration.getNrOfHiddenStates(); ++k) { - transitionMatrix - .setQuick(j, k, transitionMatrix.getQuick(j, k) / sum); - } - // normalize the rows of the emission matrix - sum = 0; - for (int k = 0; k < iteration.getNrOfOutputStates(); ++k) { - sum += emissionMatrix.getQuick(j, k); - } - for (int k = 0; k < iteration.getNrOfOutputStates(); ++k) { - emissionMatrix.setQuick(j, k, emissionMatrix.getQuick(j, k) / sum); - } - // normalization parameter for initial probabilities - isum += initialProbabilities.getQuick(j); - } - // normalize initial probabilities - for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) { - initialProbabilities.setQuick(i, initialProbabilities.getQuick(i) - / isum); - } - // check for convergence - if (checkConvergence(lastIteration, iteration, epsilon)) { - break; - } - // overwrite the last iterated model by the new iteration - lastIteration.assign(iteration); - } - // we are done :) - return iteration; - } - - private static void unscaledBaumWelch(int[] observedSequence, HmmModel iteration, Matrix alpha, Matrix beta) { - Vector initialProbabilities = iteration.getInitialProbabilities(); - Matrix emissionMatrix = iteration.getEmissionMatrix(); - Matrix transitionMatrix = iteration.getTransitionMatrix(); - double modelLikelihood = HmmEvaluator.modelLikelihood(alpha, false); - - for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) { - initialProbabilities.setQuick(i, alpha.getQuick(0, i) - * beta.getQuick(0, i)); - } - - // recompute transition probabilities - for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) { - for (int j = 0; j < iteration.getNrOfHiddenStates(); ++j) { - double temp = 0; - for (int t = 0; t < observedSequence.length - 1; ++t) { - temp += alpha.getQuick(t, i) - * emissionMatrix.getQuick(j, observedSequence[t + 1]) - * beta.getQuick(t + 1, j); - } - transitionMatrix.setQuick(i, j, transitionMatrix.getQuick(i, j) - * 
temp / modelLikelihood); - } - } - // recompute emission probabilities - for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) { - for (int j = 0; j < iteration.getNrOfOutputStates(); ++j) { - double temp = 0; - for (int t = 0; t < observedSequence.length; ++t) { - // delta tensor - if (observedSequence[t] == j) { - temp += alpha.getQuick(t, i) * beta.getQuick(t, i); - } - } - emissionMatrix.setQuick(i, j, temp / modelLikelihood); - } - } - } - - private static void logScaledBaumWelch(int[] observedSequence, HmmModel iteration, Matrix alpha, Matrix beta) { - Vector initialProbabilities = iteration.getInitialProbabilities(); - Matrix emissionMatrix = iteration.getEmissionMatrix(); - Matrix transitionMatrix = iteration.getTransitionMatrix(); - double modelLikelihood = HmmEvaluator.modelLikelihood(alpha, true); - - for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) { - initialProbabilities.setQuick(i, Math.exp(alpha.getQuick(0, i) + beta.getQuick(0, i))); - } - - // recompute transition probabilities - for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) { - for (int j = 0; j < iteration.getNrOfHiddenStates(); ++j) { - double sum = Double.NEGATIVE_INFINITY; // log(0) - for (int t = 0; t < observedSequence.length - 1; ++t) { - double temp = alpha.getQuick(t, i) - + Math.log(emissionMatrix.getQuick(j, observedSequence[t + 1])) - + beta.getQuick(t + 1, j); - if (temp > Double.NEGATIVE_INFINITY) { - // handle 0-probabilities - sum = temp + Math.log1p(Math.exp(sum - temp)); - } - } - transitionMatrix.setQuick(i, j, transitionMatrix.getQuick(i, j) - * Math.exp(sum - modelLikelihood)); - } - } - // recompute emission probabilities - for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) { - for (int j = 0; j < iteration.getNrOfOutputStates(); ++j) { - double sum = Double.NEGATIVE_INFINITY; // log(0) - for (int t = 0; t < observedSequence.length; ++t) { - // delta tensor - if (observedSequence[t] == j) { - double temp = alpha.getQuick(t, i) + beta.getQuick(t, i); - if (temp > Double.NEGATIVE_INFINITY) { - // handle 0-probabilities - sum = temp + Math.log1p(Math.exp(sum - temp)); - } - } - } - emissionMatrix.setQuick(i, j, Math.exp(sum - modelLikelihood)); - } - } - } - - /** - * Check convergence of two HMM models by computing a simple distance between - * emission / transition matrices - * - * @param oldModel Old HMM Model - * @param newModel New HMM Model - * @param epsilon Convergence Factor - * @return true if training converged to a stable state. 
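The recurring temp + Math.log1p(Math.exp(sum - temp)) pattern above is the usual numerically stable log-sum-exp; isolated as a hypothetical helper it reads:

// Computes log(exp(a) + exp(b)) without overflow by factoring out the larger term.
static double logAdd(double a, double b) {
  double hi = Math.max(a, b);
  double lo = Math.min(a, b);
  return lo == Double.NEGATIVE_INFINITY ? hi : hi + Math.log1p(Math.exp(lo - hi));
}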
- */ - private static boolean checkConvergence(HmmModel oldModel, HmmModel newModel, - double epsilon) { - // check convergence of transitionProbabilities - Matrix oldTransitionMatrix = oldModel.getTransitionMatrix(); - Matrix newTransitionMatrix = newModel.getTransitionMatrix(); - double diff = 0; - for (int i = 0; i < oldModel.getNrOfHiddenStates(); ++i) { - for (int j = 0; j < oldModel.getNrOfHiddenStates(); ++j) { - double tmp = oldTransitionMatrix.getQuick(i, j) - - newTransitionMatrix.getQuick(i, j); - diff += tmp * tmp; - } - } - double norm = Math.sqrt(diff); - diff = 0; - // check convergence of emissionProbabilities - Matrix oldEmissionMatrix = oldModel.getEmissionMatrix(); - Matrix newEmissionMatrix = newModel.getEmissionMatrix(); - for (int i = 0; i < oldModel.getNrOfHiddenStates(); i++) { - for (int j = 0; j < oldModel.getNrOfOutputStates(); j++) { - - double tmp = oldEmissionMatrix.getQuick(i, j) - - newEmissionMatrix.getQuick(i, j); - diff += tmp * tmp; - } - } - norm += Math.sqrt(diff); - // iteration has converged :) - return norm < epsilon; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmUtils.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmUtils.java deleted file mode 100644 index 18b145988..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmUtils.java +++ /dev/null @@ -1,362 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sequencelearning.hmm; - -import java.util.Collection; -import java.util.Iterator; -import java.util.List; - -import com.google.common.collect.Lists; -import org.apache.mahout.math.DenseMatrix; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.SparseMatrix; -import org.apache.mahout.math.Vector; -import org.uncommons.maths.Maths; - -import com.google.common.base.Preconditions; - -/** - * A collection of utilities for handling HMMModel objects. - */ -public final class HmmUtils { - - /** - * No public constructor for utility classes. - */ - private HmmUtils() { - // nothing to do here really. - } - - /** - * Compute the cumulative transition probability matrix for the given HMM - * model. 
Matrix where each row i is the cumulative distribution of the - * transition probability distribution for hidden state i. - * - * @param model The HMM model for which the cumulative transition matrix should be - * computed - * @return The computed cumulative transition matrix. - */ - public static Matrix getCumulativeTransitionMatrix(HmmModel model) { - // fetch the needed parameters from the model - int hiddenStates = model.getNrOfHiddenStates(); - Matrix transitionMatrix = model.getTransitionMatrix(); - // now compute the cumulative transition matrix - Matrix resultMatrix = new DenseMatrix(hiddenStates, hiddenStates); - for (int i = 0; i < hiddenStates; ++i) { - double sum = 0; - for (int j = 0; j < hiddenStates; ++j) { - sum += transitionMatrix.get(i, j); - resultMatrix.set(i, j, sum); - } - resultMatrix.set(i, hiddenStates - 1, 1.0); - // make sure the last - // state has always a - // cumulative - // probability of - // exactly 1.0 - } - return resultMatrix; - } - - /** - * Compute the cumulative output probability matrix for the given HMM model. - * Matrix where each row i is the cumulative distribution of the output - * probability distribution for hidden state i. - * - * @param model The HMM model for which the cumulative output matrix should be - * computed - * @return The computed cumulative output matrix. - */ - public static Matrix getCumulativeOutputMatrix(HmmModel model) { - // fetch the needed parameters from the model - int hiddenStates = model.getNrOfHiddenStates(); - int outputStates = model.getNrOfOutputStates(); - Matrix outputMatrix = model.getEmissionMatrix(); - // now compute the cumulative output matrix - Matrix resultMatrix = new DenseMatrix(hiddenStates, outputStates); - for (int i = 0; i < hiddenStates; ++i) { - double sum = 0; - for (int j = 0; j < outputStates; ++j) { - sum += outputMatrix.get(i, j); - resultMatrix.set(i, j, sum); - } - resultMatrix.set(i, outputStates - 1, 1.0); - // make sure the last - // output state has - // always a cumulative - // probability of 1.0 - } - return resultMatrix; - } - - /** - * Compute the cumulative distribution of the initial hidden state - * probabilities for the given HMM model. - * - * @param model The HMM model for which the cumulative initial state probabilities - * should be computed - * @return The computed cumulative initial state probability vector. - */ - public static Vector getCumulativeInitialProbabilities(HmmModel model) { - // fetch the needed parameters from the model - int hiddenStates = model.getNrOfHiddenStates(); - Vector initialProbabilities = model.getInitialProbabilities(); - // now compute the cumulative output matrix - Vector resultVector = new DenseVector(initialProbabilities.size()); - double sum = 0; - for (int i = 0; i < hiddenStates; ++i) { - sum += initialProbabilities.get(i); - resultVector.set(i, sum); - } - resultVector.set(hiddenStates - 1, 1.0); // make sure the last initial - // hidden state probability - // has always a cumulative - // probability of 1.0 - return resultVector; - } - - /** - * Validates an HMM model set - * - * @param model model to sanity check. - */ - public static void validate(HmmModel model) { - if (model == null) { - return; // empty models are valid - } - - /* - * The number of hidden states is positive. - */ - Preconditions.checkArgument(model.getNrOfHiddenStates() > 0, - "Error: The number of hidden states has to be greater than 0"); - - /* - * The number of output states is positive. 
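The cumulative rows built above, with the last entry pinned to exactly 1.0, exist to support inverse-CDF sampling; a hypothetical helper, assuming org.apache.mahout.math.Matrix and java.util.Random:

// Draw a column index from row 'row' of a cumulative-probability matrix.
static int sampleFromCumulativeRow(Matrix cumulative, int row, Random rng) {
  double r = rng.nextDouble();
  for (int j = 0; j < cumulative.numCols(); ++j) {
    if (r <= cumulative.get(row, j)) {
      return j;
    }
  }
  return cumulative.numCols() - 1; // unreachable when the row ends at exactly 1.0
}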
- */ - Preconditions.checkArgument(model.getNrOfOutputStates() > 0, - "Error: The number of output states has to be greater than 0!"); - - /* - * The size of the vector of initial probabilities is equal to the number of - * hidden states. Each initial probability is non-negative. The sum of - * initial probabilities is equal to 1. - */ - Preconditions.checkArgument(model.getInitialProbabilities() != null - && model.getInitialProbabilities().size() == model.getNrOfHiddenStates(), - "Error: The vector of initial probabilities is not initialized!"); - - double sum = 0; - for (int i = 0; i < model.getInitialProbabilities().size(); i++) { - Preconditions.checkArgument(model.getInitialProbabilities().get(i) >= 0, - "Error: Initial probability of state %d is negative", i); - sum += model.getInitialProbabilities().get(i); - } - Preconditions.checkArgument(Maths.approxEquals(sum, 1, 0.00001), - "Error: Initial probabilities do not add up to 1"); - /* - * The row size of the output matrix is equal to the number of hidden - * states. The column size is equal to the number of output states. Each - * probability of the matrix is non-negative. The sum of each row is equal - * to 1. - */ - Preconditions.checkNotNull(model.getEmissionMatrix(), "Error: The output state matrix is not initialized!"); - Preconditions.checkArgument(model.getEmissionMatrix().numRows() == model.getNrOfHiddenStates() - && model.getEmissionMatrix().numCols() == model.getNrOfOutputStates(), - "Error: The output state matrix is not of the form nrOfHiddenStates x nrOfOutputStates"); - for (int i = 0; i < model.getEmissionMatrix().numRows(); i++) { - sum = 0; - for (int j = 0; j < model.getEmissionMatrix().numCols(); j++) { - Preconditions.checkArgument(model.getEmissionMatrix().get(i, j) >= 0, - "Error: The output state probability from hidden state " + i + " to output state " + j + " is negative"); - sum += model.getEmissionMatrix().get(i, j); - } - Preconditions.checkArgument(Maths.approxEquals(sum, 1, 0.00001), - "Error: The output state probabilities for hidden state %d don't add up to 1", i); - } - - /* - * Both dimensions of the transition matrix are equal to the - * number of hidden states. Each probability of the matrix is - * non-negative. The sum of each row in the transition matrix is equal to 1. - */ - Preconditions.checkArgument(model.getTransitionMatrix() != null, - "Error: The transition matrix is not initialized!"); - Preconditions.checkArgument(model.getTransitionMatrix().numRows() == model.getNrOfHiddenStates() - && model.getTransitionMatrix().numCols() == model.getNrOfHiddenStates(), - "Error: The transition matrix is not of the form nrOfHiddenStates x nrOfHiddenStates"); - for (int i = 0; i < model.getTransitionMatrix().numRows(); i++) { - sum = 0; - for (int j = 0; j < model.getTransitionMatrix().numCols(); j++) { - Preconditions.checkArgument(model.getTransitionMatrix().get(i, j) >= 0, - "Error: The transition probability from hidden state %d to hidden state %d is negative", i, j); - sum += model.getTransitionMatrix().get(i, j); - } - Preconditions.checkArgument(Maths.approxEquals(sum, 1, 0.00001), - "Error: The transition probabilities for hidden state " + i + " don't add up to 1."); - } - } - - /** - * Encodes a given collection of state names by the corresponding state IDs - * registered in a given model. 
- * - * @param model Model to provide the encoding for - * @param sequence Collection of state names - * @param observed If set, the sequence is encoded as a sequence of observed states, - * else it is encoded as sequence of hidden states - * @param defaultValue The default value in case a state is not known - * @return integer array containing the encoded state IDs - */ - public static int[] encodeStateSequence(HmmModel model, - Collection sequence, boolean observed, int defaultValue) { - int[] encoded = new int[sequence.size()]; - Iterator seqIter = sequence.iterator(); - for (int i = 0; i < sequence.size(); ++i) { - String nextState = seqIter.next(); - int nextID; - if (observed) { - nextID = model.getOutputStateID(nextState); - } else { - nextID = model.getHiddenStateID(nextState); - } - // if the ID is -1, use the default value - encoded[i] = nextID < 0 ? defaultValue : nextID; - } - return encoded; - } - - /** - * Decodes a given collection of state IDs into the corresponding state names - * registered in a given model. - * - * @param model model to use for retrieving state names - * @param sequence int array of state IDs - * @param observed If set, the sequence is encoded as a sequence of observed states, - * else it is encoded as sequence of hidden states - * @param defaultValue The default value in case a state is not known - * @return list containing the decoded state names - */ - public static List decodeStateSequence(HmmModel model, - int[] sequence, - boolean observed, - String defaultValue) { - List decoded = Lists.newArrayListWithCapacity(sequence.length); - for (int position : sequence) { - String nextState; - if (observed) { - nextState = model.getOutputStateName(position); - } else { - nextState = model.getHiddenStateName(position); - } - // if null was returned, use the default value - decoded.add(nextState == null ? defaultValue : nextState); - } - return decoded; - } - - /** - * Function used to normalize the probabilities of a given HMM model - * - * @param model model to normalize - */ - public static void normalizeModel(HmmModel model) { - Vector ip = model.getInitialProbabilities(); - Matrix emission = model.getEmissionMatrix(); - Matrix transition = model.getTransitionMatrix(); - // check normalization for all probabilities - double isum = 0; - for (int i = 0; i < model.getNrOfHiddenStates(); ++i) { - isum += ip.getQuick(i); - double sum = 0; - for (int j = 0; j < model.getNrOfHiddenStates(); ++j) { - sum += transition.getQuick(i, j); - } - if (sum != 1.0) { - for (int j = 0; j < model.getNrOfHiddenStates(); ++j) { - transition.setQuick(i, j, transition.getQuick(i, j) / sum); - } - } - sum = 0; - for (int j = 0; j < model.getNrOfOutputStates(); ++j) { - sum += emission.getQuick(i, j); - } - if (sum != 1.0) { - for (int j = 0; j < model.getNrOfOutputStates(); ++j) { - emission.setQuick(i, j, emission.getQuick(i, j) / sum); - } - } - } - if (isum != 1.0) { - for (int i = 0; i < model.getNrOfHiddenStates(); ++i) { - ip.setQuick(i, ip.getQuick(i) / isum); - } - } - } - - /** - * Method to reduce the size of an HMMmodel by converting the models - * DenseMatrix/DenseVectors to sparse implementations and setting every value - * < threshold to 0 - * - * @param model model to truncate - * @param threshold minimum value a model entry must have to be retained. 
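Taken together, the utilities above admit short call sites like the following; the names, threshold, and generic types are illustrative assumptions, and Arrays is java.util.Arrays:

Collection<String> names = Arrays.asList("rainy", "sunny", "rainy");
int[] ids = HmmUtils.encodeStateSequence(twoState, names, false, 0);            // hidden-state IDs
List<String> decoded = HmmUtils.decodeStateSequence(twoState, ids, false, "unknown");
HmmModel compact = HmmUtils.truncateModel(twoState, 1.0e-3);                    // sparsify tiny entries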
- * @return Truncated model - */ - public static HmmModel truncateModel(HmmModel model, double threshold) { - Vector ip = model.getInitialProbabilities(); - Matrix em = model.getEmissionMatrix(); - Matrix tr = model.getTransitionMatrix(); - // allocate the sparse data structures - RandomAccessSparseVector sparseIp = new RandomAccessSparseVector(model - .getNrOfHiddenStates()); - SparseMatrix sparseEm = new SparseMatrix(model.getNrOfHiddenStates(), model.getNrOfOutputStates()); - SparseMatrix sparseTr = new SparseMatrix(model.getNrOfHiddenStates(), model.getNrOfHiddenStates()); - // now transfer the values - for (int i = 0; i < model.getNrOfHiddenStates(); ++i) { - double value = ip.getQuick(i); - if (value > threshold) { - sparseIp.setQuick(i, value); - } - for (int j = 0; j < model.getNrOfHiddenStates(); ++j) { - value = tr.getQuick(i, j); - if (value > threshold) { - sparseTr.setQuick(i, j, value); - } - } - - for (int j = 0; j < model.getNrOfOutputStates(); ++j) { - value = em.getQuick(i, j); - if (value > threshold) { - sparseEm.setQuick(i, j, value); - } - } - } - // create a new model - HmmModel sparseModel = new HmmModel(sparseTr, sparseEm, sparseIp); - // normalize the model - normalizeModel(sparseModel); - // register the names - sparseModel.registerHiddenStateNames(model.getHiddenStateNames()); - sparseModel.registerOutputStateNames(model.getOutputStateNames()); - // and return - return sparseModel; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/LossyHmmSerializer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/LossyHmmSerializer.java deleted file mode 100644 index d0ae9c2af..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/LossyHmmSerializer.java +++ /dev/null @@ -1,62 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.sequencelearning.hmm; - -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.MatrixWritable; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Utils for serializing Writable parts of HmmModel (that means without hidden state names and so on) - */ -final class LossyHmmSerializer { - - private LossyHmmSerializer() { - } - - static void serialize(HmmModel model, DataOutput output) throws IOException { - MatrixWritable matrix = new MatrixWritable(model.getEmissionMatrix()); - matrix.write(output); - matrix.set(model.getTransitionMatrix()); - matrix.write(output); - - VectorWritable vector = new VectorWritable(model.getInitialProbabilities()); - vector.write(output); - } - - static HmmModel deserialize(DataInput input) throws IOException { - MatrixWritable matrix = new MatrixWritable(); - matrix.readFields(input); - Matrix emissionMatrix = matrix.get(); - - matrix.readFields(input); - Matrix transitionMatrix = matrix.get(); - - VectorWritable vector = new VectorWritable(); - vector.readFields(input); - Vector initialProbabilities = vector.get(); - - return new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java deleted file mode 100644 index b1a836964..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
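LossyHmmSerializer above is package-private, so a caller would live in the same package; a sketch with an invented file name:

DataOutputStream out = new DataOutputStream(new FileOutputStream("hmm.bin"));
try {
  LossyHmmSerializer.serialize(twoState, out);
} finally {
  out.close();
}
DataInputStream in = new DataInputStream(new FileInputStream("hmm.bin"));
try {
  HmmModel restored = LossyHmmSerializer.deserialize(in);  // state names are lost, hence "lossy"
} finally {
  in.close();
}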
- */ - - -package org.apache.mahout.classifier.sequencelearning.hmm; - -import com.google.common.io.Closeables; -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.mahout.common.CommandLineUtil; - -import java.io.DataInputStream; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.PrintWriter; - -/** - * Command-line tool for generating random sequences by given HMM - */ -public final class RandomSequenceGenerator { - - private RandomSequenceGenerator() { - } - - public static void main(String[] args) throws IOException { - DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder(); - ArgumentBuilder argumentBuilder = new ArgumentBuilder(); - - Option outputOption = optionBuilder.withLongName("output"). - withDescription("Output file with sequence of observed states"). - withShortName("o").withArgument(argumentBuilder.withMaximum(1).withMinimum(1). - withName("path").create()).withRequired(false).create(); - - Option modelOption = optionBuilder.withLongName("model"). - withDescription("Path to serialized HMM model"). - withShortName("m").withArgument(argumentBuilder.withMaximum(1).withMinimum(1). - withName("path").create()).withRequired(true).create(); - - Option lengthOption = optionBuilder.withLongName("length"). - withDescription("Length of generated sequence"). - withShortName("l").withArgument(argumentBuilder.withMaximum(1).withMinimum(1). - withName("number").create()).withRequired(true).create(); - - Group optionGroup = new GroupBuilder(). - withOption(outputOption).withOption(modelOption).withOption(lengthOption). 
- withName("Options").create(); - - try { - Parser parser = new Parser(); - parser.setGroup(optionGroup); - CommandLine commandLine = parser.parse(args); - - String output = (String) commandLine.getValue(outputOption); - - String modelPath = (String) commandLine.getValue(modelOption); - - int length = Integer.parseInt((String) commandLine.getValue(lengthOption)); - - //reading serialized HMM - DataInputStream modelStream = new DataInputStream(new FileInputStream(modelPath)); - HmmModel model; - try { - model = LossyHmmSerializer.deserialize(modelStream); - } finally { - Closeables.closeQuietly(modelStream); - } - - //generating observations - int[] observations = HmmEvaluator.predict(model, length, System.currentTimeMillis()); - - //writing output - PrintWriter writer = new PrintWriter(new FileOutputStream(output), true); - try { - for (int observation : observations) { - writer.print(observation); - writer.print(' '); - } - } finally { - Closeables.closeQuietly(writer); - } - } catch (OptionException e) { - CommandLineUtil.printHelp(optionGroup); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java deleted file mode 100644 index 3b778ac1f..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java +++ /dev/null @@ -1,128 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.sequencelearning.hmm; - -import com.google.common.io.Closeables; -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.mahout.common.CommandLineUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; - -import java.io.DataInputStream; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.List; -import java.util.Scanner; - -/** - * Command-line tool for Viterbi evaluating - */ -public final class ViterbiEvaluator { - - private ViterbiEvaluator() { - } - - public static void main(String[] args) throws IOException { - DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder(); - ArgumentBuilder argumentBuilder = new ArgumentBuilder(); - - Option inputOption = DefaultOptionCreator.inputOption().create(); - - Option outputOption = DefaultOptionCreator.outputOption().create(); - - Option modelOption = optionBuilder.withLongName("model"). - withDescription("Path to serialized HMM model"). - withShortName("m").withArgument(argumentBuilder.withMaximum(1).withMinimum(1). - withName("path").create()).withRequired(true).create(); - - Option likelihoodOption = optionBuilder.withLongName("likelihood"). - withDescription("Compute likelihood of observed sequence"). - withShortName("l").withRequired(false).create(); - - Group optionGroup = new GroupBuilder().withOption(inputOption). - withOption(outputOption).withOption(modelOption).withOption(likelihoodOption). 
- withName("Options").create(); - - try { - Parser parser = new Parser(); - parser.setGroup(optionGroup); - CommandLine commandLine = parser.parse(args); - - String input = (String) commandLine.getValue(inputOption); - String output = (String) commandLine.getValue(outputOption); - - String modelPath = (String) commandLine.getValue(modelOption); - - boolean computeLikelihood = commandLine.hasOption(likelihoodOption); - - //reading serialized HMM - DataInputStream modelStream = new DataInputStream(new FileInputStream(modelPath)); - HmmModel model; - try { - model = LossyHmmSerializer.deserialize(modelStream); - } finally { - Closeables.closeQuietly(modelStream); - } - - //reading observations - List observations = new ArrayList(); - Scanner scanner = new Scanner(new FileInputStream(input)); - try { - while (scanner.hasNextInt()) { - observations.add(scanner.nextInt()); - } - } finally { - scanner.close(); - } - - int[] observationsArray = new int[observations.size()]; - for (int i = 0; i < observations.size(); ++i) { - observationsArray[i] = observations.get(i); - } - - //decoding - int[] hiddenStates = HmmEvaluator.decode(model, observationsArray, true); - - //writing output - PrintWriter writer = new PrintWriter(new FileOutputStream(output), true); - try { - for (int hiddenState : hiddenStates) { - writer.print(hiddenState); - writer.print(' '); - } - } finally { - Closeables.closeQuietly(writer); - } - - if (computeLikelihood) { - System.out.println("Likelihood: " + HmmEvaluator.modelLikelihood(model, observationsArray, true)); - } - } catch (OptionException e) { - CommandLineUtil.printHelp(optionGroup); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java deleted file mode 100644 index 411b36f41..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.sgd; - -import com.google.common.base.Preconditions; -import org.apache.mahout.classifier.AbstractVectorClassifier; -import org.apache.mahout.classifier.OnlineLearner; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.function.DoubleFunction; -import org.apache.mahout.math.function.Functions; - -import java.util.Iterator; - -/** - * Generic definition of a 1 of n logistic regression classifier that returns probabilities in - * response to a feature vector. This classifier uses 1 of n-1 coding where the 0-th category - * is not stored explicitly. - *

- * Provides the SGD based algorithm for learning a logistic regression, but omits all - * annealing of learning rates. Any extension of this abstract class must define the overall - * and per-term annealing for themselves. - */ -public abstract class AbstractOnlineLogisticRegression extends AbstractVectorClassifier implements OnlineLearner { - // coefficients for the classification. This is a dense matrix - // that is (numCategories-1) x numFeatures - protected Matrix beta; - - // number of categories we are classifying. This should the number of rows of beta plus one. - protected int numCategories; - - protected int step; - - // information about how long since coefficient rows were updated. This allows lazy regularization. - protected Vector updateSteps; - - // information about how many updates we have had on a location. This allows per-term - // annealing a la confidence weighted learning. - protected Vector updateCounts; - - // weight of the prior on beta - private double lambda = 1.0e-5; - protected PriorFunction prior; - - // can we ignore any further regularization when doing classification? - private boolean sealed; - - // by default we don't do any fancy training - private Gradient gradient = new DefaultGradient(); - - /** - * Chainable configuration option. - * - * @param lambda New value of lambda, the weighting factor for the prior distribution. - * @return This, so other configurations can be chained. - */ - public AbstractOnlineLogisticRegression lambda(double lambda) { - this.lambda = lambda; - return this; - } - - /** - * Computes the inverse link function, by default the logistic link function. - * - * @param v The output of the linear combination in a GLM. Note that the value - * of v is disturbed. - * @return A version of v with the link function applied. - */ - public Vector link(Vector v) { - double max = v.maxValue(); - if (max >= 40) { - // if max > 40, we subtract the large offset first - // the size of the max means that 1+sum(exp(v)) = sum(exp(v)) to within round-off - v.assign(Functions.minus(max)).assign(Functions.EXP); - return v.divide(v.norm(1)); - } else { - v.assign(Functions.EXP); - return v.divide(1 + v.norm(1)); - } - } - - /** - * Computes the binomial logistic inverse link function. - * - * @param r The value to transform. - * @return The logit of r. - */ - public double link(double r) { - if (r < 0.0) { - double s = Math.exp(r); - return s / (1.0 + s); - } else { - double s = Math.exp(-r); - return 1.0 / (1.0 + s); - } - } - - @Override - public Vector classifyNoLink(Vector instance) { - // apply pending regularization to whichever coefficients matter - regularize(instance); - return beta.times(instance); - } - - public double classifyScalarNoLink(Vector instance) { - return beta.viewRow(0).dot(instance); - } - - /** - * Returns n-1 probabilities, one for each category but the 0-th. The probability of the 0-th - * category is 1 - sum(this result). - * - * @param instance A vector of features to be classified. - * @return A vector of probabilities, one for each of the first n-1 categories. - */ - @Override - public Vector classify(Vector instance) { - return link(classifyNoLink(instance)); - } - - /** - * Returns a single scalar probability in the case where we have two categories. Using this - * method avoids an extra vector allocation as opposed to calling classify() or an extra two - * vector allocations relative to classifyFull(). - * - * @param instance The vector of features to be classified. 
- * @return The probability of the first of two categories. - * @throws IllegalArgumentException If the classifier doesn't have two categories. - */ - @Override - public double classifyScalar(Vector instance) { - Preconditions.checkArgument(numCategories() == 2, "Can only call classifyScalar with two categories"); - - // apply pending regularization to whichever coefficients matter - regularize(instance); - - // result is a vector with one element so we can just use dot product - return link(classifyScalarNoLink(instance)); - } - - @Override - public void train(long trackingKey, String groupKey, int actual, Vector instance) { - unseal(); - - double learningRate = currentLearningRate(); - - // push coefficients back to zero based on the prior - regularize(instance); - - // update each row of coefficients according to result - Vector gradient = this.gradient.apply(groupKey, actual, instance, this); - for (int i = 0; i < numCategories - 1; i++) { - double gradientBase = gradient.get(i); - - // then we apply the gradientBase to the resulting element. - Iterator nonZeros = instance.iterateNonZero(); - while (nonZeros.hasNext()) { - Vector.Element updateLocation = nonZeros.next(); - int j = updateLocation.index(); - - double newValue = beta.getQuick(i, j) + gradientBase * learningRate * perTermLearningRate(j) * instance.get(j); - beta.setQuick(i, j, newValue); - } - } - - // remember that these elements got updated - Iterator i = instance.iterateNonZero(); - while (i.hasNext()) { - Vector.Element element = i.next(); - int j = element.index(); - updateSteps.setQuick(j, getStep()); - updateCounts.setQuick(j, updateCounts.getQuick(j) + 1); - } - nextStep(); - - } - - @Override - public void train(long trackingKey, int actual, Vector instance) { - train(trackingKey, null, actual, instance); - } - - @Override - public void train(int actual, Vector instance) { - train(0, null, actual, instance); - } - - public void regularize(Vector instance) { - if (updateSteps == null || isSealed()) { - return; - } - - // anneal learning rate - double learningRate = currentLearningRate(); - - // here we lazily apply the prior to make up for our neglect - for (int i = 0; i < numCategories - 1; i++) { - Iterator nonZeros = instance.iterateNonZero(); - while (nonZeros.hasNext()) { - Vector.Element updateLocation = nonZeros.next(); - int j = updateLocation.index(); - double missingUpdates = getStep() - updateSteps.get(j); - if (missingUpdates > 0) { - double rate = getLambda() * learningRate * perTermLearningRate(j); - double newValue = prior.age(beta.get(i, j), missingUpdates, rate); - beta.set(i, j, newValue); - updateSteps.set(j, getStep()); - } - } - } - } - - // these two abstract methods are how extensions can modify the basic learning behavior of this object. 
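A hedged editorial sketch of those two extension hooks (not from the original file): a subclass that disables annealing entirely by returning constant rates. The constructor sizing follows the field comments above; DenseMatrix, DenseVector, and L2 are the Mahout classes used elsewhere in this diff.

    public class FlatRateLogisticRegression extends AbstractOnlineLogisticRegression {
      private final double mu;

      public FlatRateLogisticRegression(int numCategories, int numFeatures, double mu) {
        this.mu = mu;
        this.numCategories = numCategories;
        this.prior = new L2(1);
        // beta is (numCategories - 1) x numFeatures; update bookkeeping is per feature
        this.beta = new DenseMatrix(numCategories - 1, numFeatures);
        this.updateSteps = new DenseVector(numFeatures);
        this.updateCounts = new DenseVector(numFeatures);
      }

      @Override
      public double perTermLearningRate(int j) {
        return 1.0;   // no per-term annealing
      }

      @Override
      public double currentLearningRate() {
        return mu;    // no overall annealing
      }
    }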
- - public abstract double perTermLearningRate(int j); - - public abstract double currentLearningRate(); - - public void setPrior(PriorFunction prior) { - this.prior = prior; - } - - public void setGradient(Gradient gradient) { - this.gradient = gradient; - } - - public PriorFunction getPrior() { - return prior; - } - - public Matrix getBeta() { - close(); - return beta; - } - - public void setBeta(int i, int j, double betaIJ) { - beta.set(i, j, betaIJ); - } - - @Override - public int numCategories() { - return numCategories; - } - - public int numFeatures() { - return beta.numCols(); - } - - public double getLambda() { - return lambda; - } - - public int getStep() { - return step; - } - - protected void nextStep() { - step++; - } - - public boolean isSealed() { - return sealed; - } - - protected void unseal() { - sealed = false; - } - - private void regularizeAll() { - Vector all = new DenseVector(beta.numCols()); - all.assign(1); - regularize(all); - } - - @Override - public void close() { - if (!sealed) { - step++; - regularizeAll(); - sealed = true; - } - } - - public void copyFrom(AbstractOnlineLogisticRegression other) { - // number of categories we are classifying. This should the number of rows of beta plus one. - Preconditions.checkArgument(numCategories == other.numCategories, - "Can't copy unless number of target categories is the same"); - - beta.assign(other.beta); - - step = other.step; - - updateSteps.assign(other.updateSteps); - updateCounts.assign(other.updateCounts); - } - - public boolean validModel() { - double k = beta.aggregate(Functions.PLUS, new DoubleFunction() { - @Override - public double apply(double v) { - return Double.isNaN(v) || Double.isInfinite(v) ? 1 : 0; - } - }); - return k < 1; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java deleted file mode 100644 index dd3960205..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java +++ /dev/null @@ -1,577 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.classifier.sgd; - -import com.google.common.collect.Lists; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.classifier.OnlineLearner; -import org.apache.mahout.ep.EvolutionaryProcess; -import org.apache.mahout.ep.Mapping; -import org.apache.mahout.ep.Payload; -import org.apache.mahout.ep.State; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.stats.OnlineAuc; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.List; -import java.util.Locale; -import java.util.concurrent.ExecutionException; - -/** - * This is a meta-learner that maintains a pool of ordinary {@link org.apache.mahout.classifier.sgd.OnlineLogisticRegression} learners. Each - * member of the pool has different learning rates. Whichever of the learners in the pool falls - * behind in terms of average log-likelihood will be tossed out and replaced with variants of the - * survivors. This will let us automatically derive an annealing schedule that optimizes learning - * speed. Since on-line learners tend to be IO bound anyway, it doesn't cost as much as it might - * seem that it would to maintain multiple learners in memory. Doing this adaptation on-line as we - * learn also decreases the number of learning rate parameters required and replaces the normal - * hyper-parameter search. - *

- * One wrinkle is that the pool of learners that we maintain is actually a pool of {@link org.apache.mahout.classifier.sgd.CrossFoldLearner} - * which themselves contain several OnlineLogisticRegression objects. These pools allow estimation - * of performance on the fly even if we make many passes through the data. This does, however, - * increase the cost of training since if we are using 5-fold cross-validation, each vector is used - * 4 times for training and once for classification. If this becomes a problem, then we should - * probably use a 2-way unbalanced train/test split rather than full cross validation. With the - * current default settings, we have 100 learners running. This is better than the alternative of - * running hundreds of training passes to find good hyper-parameters because we only have to parse - * and feature-ize our inputs once. If you already have good hyper-parameters, then you might - * prefer to just run one CrossFoldLearner with those settings. - *

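A hedged usage sketch assembled from the public methods in this file (FEATURES, data, and the Example type are placeholders, not part of Mahout):

    AdaptiveLogisticRegression alr = new AdaptiveLogisticRegression(2, FEATURES, new L1());
    alr.setInterval(800);            // evolutionary epoch length; values below 200 are raised to 200
    alr.setAveragingWindow(500);
    for (Example e : data) {         // hypothetical iterable of labeled vectors
      alr.train(e.label, e.features);
    }
    alr.close();
    CrossFoldLearner best = alr.getBest().getPayload().getLearner();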
- * The fitness used here is AUC. Another alternative would be to try log-likelihood, but it is much - * easier to get bogus values of log-likelihood than with AUC and the results seem to accord pretty - * well. It would be nice to allow the fitness function to be pluggable. This use of AUC means that - * AdaptiveLogisticRegression is mostly suited for binary target variables. This will be fixed - * before long by extending OnlineAuc to handle non-binary cases or by using a different fitness - * value in non-binary cases. - */ -public class AdaptiveLogisticRegression implements OnlineLearner, Writable { - public static final int DEFAULT_THREAD_COUNT = 20; - public static final int DEFAULT_POOL_SIZE = 20; - private static final int SURVIVORS = 2; - - private int record; - private int cutoff = 1000; - private int minInterval = 1000; - private int maxInterval = 1000; - private int currentStep = 1000; - private int bufferSize = 1000; - - private List buffer = Lists.newArrayList(); - private EvolutionaryProcess ep; - private State best; - private int threadCount = DEFAULT_THREAD_COUNT; - private int poolSize = DEFAULT_POOL_SIZE; - private State seed; - private int numFeatures; - - private boolean freezeSurvivors = true; - - public AdaptiveLogisticRegression() { - } - - /** - * Uses {@link #DEFAULT_THREAD_COUNT} and {@link #DEFAULT_POOL_SIZE} - * @param numCategories The number of categories (labels) to train on - * @param numFeatures The number of features used in creating the vectors (i.e. the cardinality of the vector) - * @param prior The {@link org.apache.mahout.classifier.sgd.PriorFunction} to use - * - * @see {@link #AdaptiveLogisticRegression(int, int, org.apache.mahout.classifier.sgd.PriorFunction, int, int)} - */ - public AdaptiveLogisticRegression(int numCategories, int numFeatures, PriorFunction prior) { - this(numCategories, numFeatures, prior, DEFAULT_THREAD_COUNT, DEFAULT_POOL_SIZE); - } - - /** - * - * @param numCategories The number of categories (labels) to train on - * @param numFeatures The number of features used in creating the vectors (i.e. the cardinality of the vector) - * @param prior The {@link org.apache.mahout.classifier.sgd.PriorFunction} to use - * @param threadCount The number of threads to use for training - * @param poolSize The number of {@link org.apache.mahout.classifier.sgd.CrossFoldLearner} to use. 
- */ - public AdaptiveLogisticRegression(int numCategories, int numFeatures, PriorFunction prior, int threadCount, int poolSize) { - this.numFeatures = numFeatures; - this.threadCount = threadCount; - this.poolSize = poolSize; - seed = new State(new double[2], 10); - Wrapper w = new Wrapper(numCategories, numFeatures, prior); - seed.setPayload(w); - - w.setMappings(seed); - seed.setPayload(w); - setPoolSize(this.poolSize); - } - - @Override - public void train(int actual, Vector instance) { - train(record, null, actual, instance); - } - - @Override - public void train(long trackingKey, int actual, Vector instance) { - train(trackingKey, null, actual, instance); - } - - @Override - public void train(long trackingKey, String groupKey, int actual, Vector instance) { - record++; - - buffer.add(new TrainingExample(trackingKey, groupKey, actual, instance)); - //don't train until we have enough examples - if (buffer.size() > bufferSize) { - trainWithBufferedExamples(); - } - } - - private void trainWithBufferedExamples() { - try { - this.best = ep.parallelDo(new EvolutionaryProcess.Function>() { - @Override - public double apply(Payload z, double[] params) { - Wrapper x = (Wrapper) z; - for (TrainingExample example : buffer) { - x.train(example); - } - if (x.getLearner().validModel()) { - if (x.getLearner().numCategories() == 2) { - return x.wrapped.auc(); - } else { - return x.wrapped.logLikelihood(); - } - } else { - return Double.NaN; - } - } - }); - } catch (InterruptedException e) { - // ignore ... shouldn't happen - } catch (ExecutionException e) { - throw new IllegalStateException(e.getCause()); - } - buffer.clear(); - - if (record > cutoff) { - cutoff = nextStep(record); - - // evolve based on new fitness - ep.mutatePopulation(SURVIVORS); - - if (freezeSurvivors) { - // now grossly hack the top survivors so they stick around. Set their - // mutation rates small and also hack their learning rate to be small - // as well. - for (State state : ep.getPopulation().subList(0, SURVIVORS)) { - state.getPayload().freeze(state); - } - } - } - - } - - public int nextStep(int recordNumber) { - int stepSize = stepSize(recordNumber, 2.6); - if (stepSize < minInterval) { - stepSize = minInterval; - } - - if (stepSize > maxInterval) { - stepSize = maxInterval; - } - - int newCutoff = stepSize * (recordNumber / stepSize + 1); - if (newCutoff < cutoff + currentStep) { - newCutoff = cutoff + currentStep; - } else { - this.currentStep = stepSize; - } - return newCutoff; - } - - public static int stepSize(int recordNumber, double multiplier) { - int[] bumps = {1, 2, 5}; - double log = Math.floor(multiplier * Math.log10(recordNumber)); - int bump = bumps[(int) log % bumps.length]; - int scale = (int) Math.pow(10, Math.floor(log / bumps.length)); - - return bump * scale; - } - - @Override - public void close() { - trainWithBufferedExamples(); - try { - ep.parallelDo(new EvolutionaryProcess.Function>() { - @Override - public double apply(Payload payload, double[] params) { - CrossFoldLearner learner = ((Wrapper) payload).getLearner(); - learner.close(); - return learner.logLikelihood(); - } - }); - ep.close(); - } catch (InterruptedException e) { - // ignore - } catch (ExecutionException e) { - throw new IllegalStateException(e); - } - } - - /** - * How often should the evolutionary optimization of learning parameters occur? - * - * @param interval Number of training examples to use in each epoch of optimization. 
- */ - public void setInterval(int interval) { - setInterval(interval, interval); - } - - /** - * Starts optimization using the shorter interval and progresses to the longer using the specified - * number of steps per decade. Note that values < 200 are not accepted. Values even that small - * are unlikely to be useful. - * - * @param minInterval The minimum epoch length for the evolutionary optimization - * @param maxInterval The maximum epoch length - */ - public void setInterval(int minInterval, int maxInterval) { - this.minInterval = Math.max(200, minInterval); - this.maxInterval = Math.max(200, maxInterval); - this.cutoff = minInterval * (record / minInterval + 1); - this.currentStep = minInterval; - bufferSize = Math.min(minInterval, bufferSize); - } - - public void setPoolSize(int poolSize) { - this.poolSize = poolSize; - setupOptimizer(poolSize); - } - - public void setThreadCount(int threadCount) { - this.threadCount = threadCount; - setupOptimizer(poolSize); - } - - public void setAucEvaluator(OnlineAuc auc) { - seed.getPayload().setAucEvaluator(auc); - setupOptimizer(poolSize); - } - - private void setupOptimizer(int poolSize) { - ep = new EvolutionaryProcess(threadCount, poolSize, seed); - } - - /** - * Returns the size of the internal feature vector. Note that this is not the same as the number - * of distinct features, especially if feature hashing is being used. - * - * @return The internal feature vector size. - */ - public int numFeatures() { - return numFeatures; - } - - /** - * What is the AUC for the current best member of the population. If no member is best, usually - * because we haven't done any training yet, then the result is set to NaN. - * - * @return The AUC of the best member of the population or NaN if we can't figure that out. - */ - public double auc() { - if (best == null) { - return Double.NaN; - } else { - Wrapper payload = best.getPayload(); - return payload.getLearner().auc(); - } - } - - public State getBest() { - return best; - } - - public void setBest(State best) { - this.best = best; - } - - public int getRecord() { - return record; - } - - public void setRecord(int record) { - this.record = record; - } - - public int getMinInterval() { - return minInterval; - } - - public int getMaxInterval() { - return maxInterval; - } - - public int getNumCategories() { - return seed.getPayload().getLearner().numCategories(); - } - - public PriorFunction getPrior() { - return seed.getPayload().getLearner().getPrior(); - } - - public void setBuffer(List buffer) { - this.buffer = buffer; - } - - public List getBuffer() { - return buffer; - } - - public EvolutionaryProcess getEp() { - return ep; - } - - public void setEp(EvolutionaryProcess ep) { - this.ep = ep; - } - - public State getSeed() { - return seed; - } - - public void setSeed(State seed) { - this.seed = seed; - } - - public int getNumFeatures() { - return numFeatures; - } - - public void setAveragingWindow(int averagingWindow) { - seed.getPayload().getLearner().setWindowSize(averagingWindow); - setupOptimizer(poolSize); - } - - public void setFreezeSurvivors(boolean freezeSurvivors) { - this.freezeSurvivors = freezeSurvivors; - } - - /** - * Provides a shim between the EP optimization stuff and the CrossFoldLearner. The most important - * interface has to do with the parameters of the optimization. These are taken from the double[] - * params in the following order

  • regularization constant lambda
  • learningRate
- * All other parameters are set so as to defeat annealing to the extent possible. - * This lets the evolutionary algorithm handle the annealing. - *

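For concreteness, a hedged sketch of that parameter order (illustrative values only; it mirrors Wrapper.update() below, which receives the values after the Mapping.logLimit transforms set up in setMappings()):

    double[] params = {1.0e-4, 0.05};                    // {lambda, learning rate}
    learner.lambda(params[0]).learningRate(params[1]);   // learner: a hypothetical CrossFoldLearner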
- * Note that per coefficient annealing is still done and no optimization of the per coefficient - * offset is done. - */ - public static class Wrapper implements Payload { - private CrossFoldLearner wrapped; - - public Wrapper() { - } - - public Wrapper(int numCategories, int numFeatures, PriorFunction prior) { - wrapped = new CrossFoldLearner(5, numCategories, numFeatures, prior); - } - - @Override - public Wrapper copy() { - Wrapper r = new Wrapper(); - r.wrapped = wrapped.copy(); - return r; - } - - @Override - public void update(double[] params) { - int i = 0; - wrapped.lambda(params[i++]); - wrapped.learningRate(params[i]); - - wrapped.stepOffset(1); - wrapped.alpha(1); - wrapped.decayExponent(0); - } - - public void freeze(State s) { - // radically decrease learning rate - s.getParams()[1] -= 10; - - // and cause evolution to hold (almost) - s.setOmni(s.getOmni() / 20); - double[] step = s.getStep(); - for (int i = 0; i < step.length; i++) { - step[i] /= 20; - } - } - - public void setMappings(State x) { - int i = 0; - // set the range for regularization (lambda) - x.setMap(i++, Mapping.logLimit(1.0e-8, 0.1)); - // set the range for learning rate (mu) - x.setMap(i, Mapping.logLimit(1.0e-8, 1)); - } - - public void train(TrainingExample example) { - wrapped.train(example.getKey(), example.getGroupKey(), example.getActual(), example.getInstance()); - } - - public CrossFoldLearner getLearner() { - return wrapped; - } - - @Override - public String toString() { - return String.format(Locale.ENGLISH, "auc=%.2f", wrapped.auc()); - } - - public void setAucEvaluator(OnlineAuc auc) { - wrapped.setAucEvaluator(auc); - } - - @Override - public void write(DataOutput out) throws IOException { - wrapped.write(out); - } - - @Override - public void readFields(DataInput input) throws IOException { - wrapped = new CrossFoldLearner(); - wrapped.readFields(input); - } - } - - public static class TrainingExample implements Writable { - private long key; - private String groupKey; - private int actual; - private Vector instance; - - private TrainingExample() { - } - - public TrainingExample(long key, String groupKey, int actual, Vector instance) { - this.key = key; - this.groupKey = groupKey; - this.actual = actual; - this.instance = instance; - } - - public long getKey() { - return key; - } - - public int getActual() { - return actual; - } - - public Vector getInstance() { - return instance; - } - - public String getGroupKey() { - return groupKey; - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeLong(key); - if (groupKey != null) { - out.writeBoolean(true); - out.writeUTF(groupKey); - } else { - out.writeBoolean(false); - } - out.writeInt(actual); - VectorWritable.writeVector(out, instance, true); - } - - @Override - public void readFields(DataInput in) throws IOException { - key = in.readLong(); - if (in.readBoolean()) { - groupKey = in.readUTF(); - } - actual = in.readInt(); - instance = VectorWritable.readVector(in); - } - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(record); - out.writeInt(cutoff); - out.writeInt(minInterval); - out.writeInt(maxInterval); - out.writeInt(currentStep); - out.writeInt(bufferSize); - - out.writeInt(buffer.size()); - for (TrainingExample example : buffer) { - example.write(out); - } - - ep.write(out); - - best.write(out); - - out.writeInt(threadCount); - out.writeInt(poolSize); - seed.write(out); - out.writeInt(numFeatures); - - out.writeBoolean(freezeSurvivors); - } - - @Override - public 
void readFields(DataInput in) throws IOException { - record = in.readInt(); - cutoff = in.readInt(); - minInterval = in.readInt(); - maxInterval = in.readInt(); - currentStep = in.readInt(); - bufferSize = in.readInt(); - - int n = in.readInt(); - buffer = Lists.newArrayList(); - for (int i = 0; i < n; i++) { - TrainingExample example = new TrainingExample(); - example.readFields(in); - buffer.add(example); - } - - ep = new EvolutionaryProcess(); - ep.readFields(in); - - best = new State(); - best.readFields(in); - - threadCount = in.readInt(); - poolSize = in.readInt(); - seed = new State(); - seed.readFields(in); - - numFeatures = in.readInt(); - freezeSurvivors = in.readBoolean(); - } -} - diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/CrossFoldLearner.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/CrossFoldLearner.java deleted file mode 100644 index 33f0266cf..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/CrossFoldLearner.java +++ /dev/null @@ -1,329 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import com.google.common.collect.Lists; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.classifier.AbstractVectorClassifier; -import org.apache.mahout.classifier.OnlineLearner; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.function.DoubleDoubleFunction; -import org.apache.mahout.math.function.Functions; -import org.apache.mahout.math.stats.GlobalOnlineAuc; -import org.apache.mahout.math.stats.OnlineAuc; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.List; - -/** - * Does cross-fold validation of log-likelihood and AUC on several online logistic regression - * models. Each record is passed to all but one of the models for training and to the remaining - * model for evaluation. In order to maintain proper segregation between the different folds across - * training data iterations, data should either be passed to this learner in the same order each - * time the training data is traversed or a tracking key such as the file offset of the training - * record should be passed with each training example. 
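A hedged sketch of the tracking-key discipline described above (encode and labelOf are hypothetical helpers): keying train() on a stable per-record value such as a file offset keeps each record in the same fold on every pass over the data.

    long offset = 0;
    for (String line : lines) {                    // lines: placeholder data source
      learner.train(offset, null, labelOf(line), encode(line));
      offset += line.length() + 1;                 // stable across passes over the file
    }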
- */ -public class CrossFoldLearner extends AbstractVectorClassifier implements OnlineLearner, Writable { - private int record; - // minimum score to be used for computing log likelihood - private static final double MIN_SCORE = 1.0e-50; - private OnlineAuc auc = new GlobalOnlineAuc(); - private double logLikelihood; - private final List models = Lists.newArrayList(); - - // lambda, learningRate, perTermOffset, perTermExponent - private double[] parameters = new double[4]; - private int numFeatures; - private PriorFunction prior; - private double percentCorrect; - - private int windowSize = Integer.MAX_VALUE; - - public CrossFoldLearner() { - } - - public CrossFoldLearner(int folds, int numCategories, int numFeatures, PriorFunction prior) { - this.numFeatures = numFeatures; - this.prior = prior; - for (int i = 0; i < folds; i++) { - OnlineLogisticRegression model = new OnlineLogisticRegression(numCategories, numFeatures, prior); - model.alpha(1).stepOffset(0).decayExponent(0); - models.add(model); - } - } - - // -------- builder-like configuration methods - - public CrossFoldLearner lambda(double v) { - for (OnlineLogisticRegression model : models) { - model.lambda(v); - } - return this; - } - - public CrossFoldLearner learningRate(double x) { - for (OnlineLogisticRegression model : models) { - model.learningRate(x); - } - return this; - } - - public CrossFoldLearner stepOffset(int x) { - for (OnlineLogisticRegression model : models) { - model.stepOffset(x); - } - return this; - } - - public CrossFoldLearner decayExponent(double x) { - for (OnlineLogisticRegression model : models) { - model.decayExponent(x); - } - return this; - } - - public CrossFoldLearner alpha(double alpha) { - for (OnlineLogisticRegression model : models) { - model.alpha(alpha); - } - return this; - } - - // -------- training methods - @Override - public void train(int actual, Vector instance) { - train(record, null, actual, instance); - } - - @Override - public void train(long trackingKey, int actual, Vector instance) { - train(trackingKey, null, actual, instance); - } - - @Override - public void train(long trackingKey, String groupKey, int actual, Vector instance) { - record++; - int k = 0; - for (OnlineLogisticRegression model : models) { - if (k == trackingKey % models.size()) { - Vector v = model.classifyFull(instance); - double score = Math.max(v.get(actual), MIN_SCORE); - logLikelihood += (Math.log(score) - logLikelihood) / Math.min(record, windowSize); - - int correct = v.maxValueIndex() == actual ? 
1 : 0; - percentCorrect += (correct - percentCorrect) / Math.min(record, windowSize); - if (numCategories() == 2) { - auc.addSample(actual, groupKey, v.get(1)); - } - } else { - model.train(trackingKey, groupKey, actual, instance); - } - k++; - } - } - - @Override - public void close() { - for (OnlineLogisticRegression m : models) { - m.close(); - } - } - - public void resetLineCounter() { - record = 0; - } - - public boolean validModel() { - boolean r = true; - for (OnlineLogisticRegression model : models) { - r &= model.validModel(); - } - return r; - } - - // -------- classification methods - - @Override - public Vector classify(Vector instance) { - Vector r = new DenseVector(numCategories() - 1); - DoubleDoubleFunction scale = Functions.plusMult(1.0 / models.size()); - for (OnlineLogisticRegression model : models) { - r.assign(model.classify(instance), scale); - } - return r; - } - - @Override - public Vector classifyNoLink(Vector instance) { - Vector r = new DenseVector(numCategories() - 1); - DoubleDoubleFunction scale = Functions.plusMult(1.0 / models.size()); - for (OnlineLogisticRegression model : models) { - r.assign(model.classifyNoLink(instance), scale); - } - return r; - } - - @Override - public double classifyScalar(Vector instance) { - double r = 0; - int n = 0; - for (OnlineLogisticRegression model : models) { - n++; - r += model.classifyScalar(instance); - } - return r / n; - } - - // -------- status reporting methods - - @Override - public int numCategories() { - return models.get(0).numCategories(); - } - - public double auc() { - return auc.auc(); - } - - public double logLikelihood() { - return logLikelihood; - } - - public double percentCorrect() { - return percentCorrect; - } - - // -------- evolutionary optimization - - public CrossFoldLearner copy() { - CrossFoldLearner r = new CrossFoldLearner(models.size(), numCategories(), numFeatures, prior); - r.models.clear(); - for (OnlineLogisticRegression model : models) { - model.close(); - OnlineLogisticRegression newModel = - new OnlineLogisticRegression(model.numCategories(), model.numFeatures(), model.prior); - newModel.copyFrom(model); - r.models.add(newModel); - } - return r; - } - - public int getRecord() { - return record; - } - - public void setRecord(int record) { - this.record = record; - } - - public OnlineAuc getAucEvaluator() { - return auc; - } - - public void setAucEvaluator(OnlineAuc auc) { - this.auc = auc; - } - - public double getLogLikelihood() { - return logLikelihood; - } - - public void setLogLikelihood(double logLikelihood) { - this.logLikelihood = logLikelihood; - } - - public List getModels() { - return models; - } - - public void addModel(OnlineLogisticRegression model) { - models.add(model); - } - - public double[] getParameters() { - return parameters; - } - - public void setParameters(double[] parameters) { - this.parameters = parameters; - } - - public int getNumFeatures() { - return numFeatures; - } - - public void setNumFeatures(int numFeatures) { - this.numFeatures = numFeatures; - } - - public void setWindowSize(int windowSize) { - this.windowSize = windowSize; - auc.setWindowSize(windowSize); - } - - public PriorFunction getPrior() { - return prior; - } - - public void setPrior(PriorFunction prior) { - this.prior = prior; - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(record); - PolymorphicWritable.write(out, auc); - out.writeDouble(logLikelihood); - out.writeInt(models.size()); - for (OnlineLogisticRegression model : models) { - 
model.write(out); - } - - for (double x : parameters) { - out.writeDouble(x); - } - out.writeInt(numFeatures); - PolymorphicWritable.write(out, prior); - out.writeDouble(percentCorrect); - out.writeInt(windowSize); - } - - @Override - public void readFields(DataInput in) throws IOException { - record = in.readInt(); - auc = PolymorphicWritable.read(in, OnlineAuc.class); - logLikelihood = in.readDouble(); - int n = in.readInt(); - for (int i = 0; i < n; i++) { - OnlineLogisticRegression olr = new OnlineLogisticRegression(); - olr.readFields(in); - models.add(olr); - } - parameters = new double[4]; - for (int i = 0; i < 4; i++) { - parameters[i] = in.readDouble(); - } - numFeatures = in.readInt(); - prior = PolymorphicWritable.read(in, PriorFunction.class); - percentCorrect = in.readDouble(); - windowSize = in.readInt(); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java deleted file mode 100644 index 2b49a1bc9..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java +++ /dev/null @@ -1,372 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import com.google.common.base.CharMatcher; -import com.google.common.base.Function; -import com.google.common.base.Preconditions; -import com.google.common.base.Splitter; -import com.google.common.collect.Collections2; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import org.apache.mahout.math.Vector; -import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder; -import org.apache.mahout.vectorizer.encoders.ContinuousValueEncoder; -import org.apache.mahout.vectorizer.encoders.Dictionary; -import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder; -import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder; -import org.apache.mahout.vectorizer.encoders.TextValueEncoder; - -import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Set; - -/** - * Converts csv data lines to vectors. - * - * Use of this class proceeds in a few steps. - *

    - *
  • At construction time, you tell the class about the target variable and provide - * a dictionary of the types of the predictor values. At this point, - * the class cannot yet decode inputs because it doesn't know the fields that are in the - * data records, nor their order. - *
  • Optionally, you tell the parser object about the possible values of the target - * variable. If you don't do this then you probably should set the number of distinct - * values so that the target variable values will be taken from a restricted range. - *
  • Later, when you get a list of the fields, typically from the first line of a CSV - * file, you tell the factory about these fields and it builds internal data structures - * that allow it to decode inputs. The most important internal state is the field numbers - * for various fields. After this point, you can use the factory for decoding data. - *
  • To encode data as a vector, you present a line of input to the factory and it - * mutates a vector that you provide. The factory also retains trace information so - * that it can approximately reverse engineer vectors later. - *
  • After converting data, you can ask for an explanation of the data in terms of - * terms and weights. In order to explain a vector accurately, the factory needs to - * have seen the particular values of categorical fields (typically during encoding vectors) - * and needs to have a reasonably small number of collisions in the vector encoding. - *
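A hedged end-to-end sketch of the steps above (column names, types, and sizes are invented; ImmutableMap and Lists are the Guava classes imported in this diff, and RandomAccessSparseVector is a Mahout vector class assumed here; any zeroed Vector would do):

    Map<String, String> typeMap = ImmutableMap.of("age", "numeric", "job", "word");
    CsvRecordFactory factory = new CsvRecordFactory("outcome", typeMap);
    factory.defineTargetCategories(Lists.newArrayList("no", "yes"));
    factory.firstLine(headerLine);                        // CSV header naming "outcome", "age", "job"
    Vector features = new RandomAccessSparseVector(100);  // zeroed before each call
    int target = factory.processLine(dataLine, features);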
- */ -public class CsvRecordFactory implements RecordFactory { - private static final String INTERCEPT_TERM = "Intercept Term"; - - // crude CSV value splitter. This will fail if any double quoted strings have - // commas inside. Also, escaped quotes will not be unescaped. Good enough for now. - private static final Splitter COMMA = Splitter.on(',').trimResults(CharMatcher.is('"')); - - private static final Map> TYPE_DICTIONARY = - ImmutableMap.>builder() - .put("continuous", ContinuousValueEncoder.class) - .put("numeric", ContinuousValueEncoder.class) - .put("n", ContinuousValueEncoder.class) - .put("word", StaticWordValueEncoder.class) - .put("w", StaticWordValueEncoder.class) - .put("text", TextValueEncoder.class) - .put("t", TextValueEncoder.class) - .build(); - - private final Map> traceDictionary = Maps.newTreeMap(); - - private int target; - private final Dictionary targetDictionary; - - //Which column is used for identify a CSV file line - private String idName; - private int id = -1; - - private List predictors; - private Map predictorEncoders; - private int maxTargetValue = Integer.MAX_VALUE; - private final String targetName; - private final Map typeMap; - private List variableNames; - private boolean includeBiasTerm; - private static final String CANNOT_CONSTRUCT_CONVERTER = - "Unable to construct type converter... shouldn't be possible"; - - /** - * Construct a parser for CSV lines that encodes the parsed data in vector form. - * @param targetName The name of the target variable. - * @param typeMap A map describing the types of the predictor variables. - */ - public CsvRecordFactory(String targetName, Map typeMap) { - this.targetName = targetName; - this.typeMap = typeMap; - targetDictionary = new Dictionary(); - } - - public CsvRecordFactory(String targetName, String idName, Map typeMap) { - this(targetName, typeMap); - this.idName = idName; - } - - /** - * Defines the values and thus the encoding of values of the target variables. Note - * that any values of the target variable not present in this list will be given the - * value of the last member of the list. - * @param values The values the target variable can have. - */ - @Override - public void defineTargetCategories(List values) { - Preconditions.checkArgument( - values.size() <= maxTargetValue, - "Must have less than or equal to " + maxTargetValue + " categories for target variable, but found " - + values.size()); - if (maxTargetValue == Integer.MAX_VALUE) { - maxTargetValue = values.size(); - } - - for (String value : values) { - targetDictionary.intern(value); - } - } - - /** - * Defines the number of target variable categories, but allows this parser to - * pick encodings for them as they appear. - * @param max The number of categories that will be excpeted. Once this many have been - * seen, all others will get the encoding max-1. - */ - @Override - public CsvRecordFactory maxTargetValue(int max) { - maxTargetValue = max; - return this; - } - - @Override - public boolean usesFirstLineAsSchema() { - return true; - } - - /** - * Processes the first line of a file (which should contain the variable names). The target and - * predictor column numbers are set from the names on this line. - * - * @param line Header line for the file. 
- */ - @Override - public void firstLine(String line) { - // read variable names, build map of name -> column - final Map vars = Maps.newHashMap(); - variableNames = Lists.newArrayList(COMMA.split(line)); - int column = 0; - for (String var : variableNames) { - vars.put(var, column++); - } - - // record target column and establish dictionary for decoding target - target = vars.get(targetName); - - // record id column - if (idName != null) { - id = vars.get(idName); - } - - // create list of predictor column numbers - predictors = Lists.newArrayList(Collections2.transform(typeMap.keySet(), new Function() { - @Override - public Integer apply(String from) { - Integer r = vars.get(from); - Preconditions.checkArgument(r != null, "Can't find variable %s, only know about %s", from, vars); - return r; - } - })); - - if (includeBiasTerm) { - predictors.add(-1); - } - Collections.sort(predictors); - - // and map from column number to type encoder for each column that is a predictor - predictorEncoders = Maps.newHashMap(); - for (Integer predictor : predictors) { - String name; - Class c; - if (predictor == -1) { - name = INTERCEPT_TERM; - c = ConstantValueEncoder.class; - } else { - name = variableNames.get(predictor); - c = TYPE_DICTIONARY.get(typeMap.get(name)); - } - try { - Preconditions.checkArgument(c != null, "Invalid type of variable %s, wanted one of %s", - typeMap.get(name), TYPE_DICTIONARY.keySet()); - Constructor constructor = c.getConstructor(String.class); - Preconditions.checkArgument(constructor != null, "Can't find correct constructor for %s", typeMap.get(name)); - FeatureVectorEncoder encoder = constructor.newInstance(name); - predictorEncoders.put(predictor, encoder); - encoder.setTraceDictionary(traceDictionary); - } catch (InstantiationException e) { - throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e); - } catch (IllegalAccessException e) { - throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e); - } catch (InvocationTargetException e) { - throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e); - } catch (NoSuchMethodException e) { - throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e); - } - } - } - - - /** - * Decodes a single line of csv data and records the target and predictor variables in a record. - * As a side effect, features are added into the featureVector. Returns the value of the target - * variable. - * - * @param line The raw data. - * @param featureVector Where to fill in the features. Should be zeroed before calling - * processLine. - * @return The value of the target variable. - */ - @Override - public int processLine(String line, Vector featureVector) { - List values = Lists.newArrayList(COMMA.split(line)); - - int targetValue = targetDictionary.intern(values.get(target)); - if (targetValue >= maxTargetValue) { - targetValue = maxTargetValue - 1; - } - - for (Integer predictor : predictors) { - String value; - if (predictor >= 0) { - value = values.get(predictor); - } else { - value = null; - } - predictorEncoders.get(predictor).addToVector(value, featureVector); - } - return targetValue; - } - - /*** - * Decodes a single line of csv data and records the target(if retrunTarget is true) - * and predictor variables in a record. As a side effect, features are added into the featureVector. - * Returns the value of the target variable. When used during classify against production data without - * target value, the method will be called with returnTarget = false. - * @param line The raw data. 
- * @param featureVector Where to fill in the features. Should be zeroed before calling - * processLine. - * @param returnTarget whether process and return target value, -1 will be returned if false. - * @return The value of the target variable. - */ - public int processLine(CharSequence line, Vector featureVector, boolean returnTarget) { - List values = Lists.newArrayList(COMMA.split(line)); - int targetValue = -1; - if (returnTarget) { - targetValue = targetDictionary.intern(values.get(target)); - if (targetValue >= maxTargetValue) { - targetValue = maxTargetValue - 1; - } - } - - for (Integer predictor : predictors) { - String value = predictor >= 0 ? values.get(predictor) : null; - predictorEncoders.get(predictor).addToVector(value, featureVector); - } - return targetValue; - } - - /*** - * Extract the raw target string from a line read from a CSV file. - * @param line the line of content read from CSV file - * @return the raw target value in the corresponding column of CSV line - */ - public String getTargetString(CharSequence line) { - List values = Lists.newArrayList(COMMA.split(line)); - return values.get(target); - - } - - /*** - * Extract the corresponding raw target label according to a code - * @param code the integer code encoded during training process - * @return the raw target label - */ - public String getTargetLabel(int code) { - for (String key: targetDictionary.values()) { - if (targetDictionary.intern(key) == code) { - return key; - } - } - return null; - } - - /*** - * Extract the id column value from the CSV record - * @param line the line of content read from CSV file - * @return the id value of the CSV record - */ - public String getIdString(CharSequence line) { - List values = Lists.newArrayList(COMMA.split(line)); - return values.get(id); - } - - /** - * Returns a list of the names of the predictor variables. - * - * @return A list of variable names. - */ - @Override - public Iterable getPredictors() { - return Lists.transform(predictors, new Function() { - @Override - public String apply(Integer v) { - if (v >= 0) { - return variableNames.get(v); - } else { - return INTERCEPT_TERM; - } - } - }); - } - - @Override - public Map> getTraceDictionary() { - return traceDictionary; - } - - @Override - public CsvRecordFactory includeBiasTerm(boolean useBias) { - includeBiasTerm = useBias; - return this; - } - - @Override - public List getTargetCategories() { - List r = targetDictionary.values(); - if (r.size() > maxTargetValue) { - r.subList(maxTargetValue, r.size()).clear(); - } - return r; - } - - public String getIdName() { - return idName; - } - - public void setIdName(String idName) { - this.idName = idName; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/DefaultGradient.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/DefaultGradient.java deleted file mode 100644 index f81d8ce76..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/DefaultGradient.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import org.apache.mahout.classifier.AbstractVectorClassifier; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.function.Functions; - -/** - * Implements the basic logistic training law. - */ -public class DefaultGradient implements Gradient { - /** - * Provides a default gradient computation useful for logistic regression. - * - * @param groupKey A grouping key to allow per-something AUC loss to be used for training. - * @param actual The target variable value. - * @param instance The current feature vector to use for gradient computation - * @param classifier The classifier that can compute scores - * @return The gradient to be applied to beta - */ - @Override - public final Vector apply(String groupKey, int actual, Vector instance, AbstractVectorClassifier classifier) { - // what does the current model say? - Vector v = classifier.classify(instance); - - Vector r = v.like(); - if (actual != 0) { - r.setQuick(actual - 1, 1); - } - r.assign(v, Functions.MINUS); - return r; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/ElasticBandPrior.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/ElasticBandPrior.java deleted file mode 100644 index 812837018..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/ElasticBandPrior.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Implements a linear combination of L1 and L2 priors. This can give an - * interesting mixture of sparsity and load-sharing between redundant predictors. 
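An illustrative editorial walk-through of age() as defined below, with invented numbers: starting from oldValue = 0.5 with generations = 2, learningRate = 0.1, and alphaByLambda = 0.01, the L2 factor gives 0.5 * (1 - 0.01 * 0.1)^2 ≈ 0.499, the L1 step then subtracts sign(0.5) * 0.1 * 2 to give 0.299, and since the sign did not flip, 0.299 is returned; had the subtraction crossed zero, the method would return 0 instead.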
- */ -public class ElasticBandPrior implements PriorFunction { - private double alphaByLambda; - private L1 l1; - private L2 l2; - - // Exists for Writable - public ElasticBandPrior() { - this(0.0); - } - - public ElasticBandPrior(double alphaByLambda) { - this.alphaByLambda = alphaByLambda; - l1 = new L1(); - l2 = new L2(1); - } - - @Override - public double age(double oldValue, double generations, double learningRate) { - oldValue *= Math.pow(1 - alphaByLambda * learningRate, generations); - double newValue = oldValue - Math.signum(oldValue) * learningRate * generations; - if (newValue * oldValue < 0.0) { - // don't allow the value to change sign - return 0.0; - } else { - return newValue; - } - } - - @Override - public double logP(double betaIJ) { - return l1.logP(betaIJ) + alphaByLambda * l2.logP(betaIJ); - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeDouble(alphaByLambda); - l1.write(out); - l2.write(out); - } - - @Override - public void readFields(DataInput in) throws IOException { - alphaByLambda = in.readDouble(); - l1 = new L1(); - l1.readFields(in); - l2 = new L2(); - l2.readFields(in); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/Gradient.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/Gradient.java deleted file mode 100644 index 524fc067a..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/Gradient.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import org.apache.mahout.classifier.AbstractVectorClassifier; -import org.apache.mahout.math.Vector; - -/** - * Provides the ability to inject a gradient into the SGD logistic regression. - * Typical uses of this are to use a ranking score such as AUC instead of a - * normal loss function.
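A hedged editorial sketch of a custom implementation of the interface below (the per-class weighting scheme is invented; it could be installed via the setGradient() method defined earlier in AbstractOnlineLogisticRegression):

    public class WeightedGradient implements Gradient {
      private final Gradient base = new DefaultGradient();
      private final double positiveWeight;

      public WeightedGradient(double positiveWeight) {
        this.positiveWeight = positiveWeight;
      }

      @Override
      public Vector apply(String groupKey, int actual, Vector instance,
                          AbstractVectorClassifier classifier) {
        Vector g = base.apply(groupKey, actual, instance, classifier);
        return actual != 0 ? g.times(positiveWeight) : g;   // up-weight non-zero classes
      }
    }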
- */ -public interface Gradient { - Vector apply(String groupKey, int actual, Vector instance, AbstractVectorClassifier classifier); -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/GradientMachine.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/GradientMachine.java deleted file mode 100644 index a2dbf21de..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/GradientMachine.java +++ /dev/null @@ -1,405 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.classifier.AbstractVectorClassifier; -import org.apache.mahout.classifier.OnlineLearner; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.function.Functions; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.Collection; -import java.util.HashSet; -import java.util.Random; - -/** - * Online gradient machine learner that tries to minimize the label ranking hinge loss. - * Implements a gradient machine with one sigmoid hidden layer. - * It tries to minimize the ranking loss of some given set of labels, - * so this can be used for multi-class, multi-label - * or auto-encoding of sparse data (e.g. text). - */ -public class GradientMachine extends AbstractVectorClassifier implements OnlineLearner, Writable { - - public static final int WRITABLE_VERSION = 1; - - // the learning rate of the algorithm - private double learningRate = 0.1; - - // the regularization term, a positive number that controls the size of the weight vector - private double regularization = 0.1; - - // the sparsity term, a positive number that controls the sparsity of the hidden layer. (0 - 1) - private double sparsity = 0.1; - - // the sparsity learning rate. - private double sparsityLearningRate = 0.1; - - // the number of features - private int numFeatures = 10; - // the number of hidden nodes - private int numHidden = 100; - // the number of output nodes - private int numOutput = 2; - - // coefficients for the input to hidden layer. - // There are numHidden Vectors of dimension numFeatures. - private Vector[] hiddenWeights; - - // coefficients for the hidden to output layer. - // There are numOutput Vectors of dimension numHidden.
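  // Editorial note (hedged, not from the original file): together with the fields
  // above and the inputToHidden/hiddenToOutput methods below, the layer shapes are
  //   hidden = sigmoid(H x + hiddenBias),  H: numHidden rows of length numFeatures
  //   output = O hidden + outputBias,      O: numOutput rows of length numHidden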
- private Vector[] outputWeights; - - // hidden unit bias - private Vector hiddenBias; - - // output unit bias - private Vector outputBias; - - private final Random rnd; - - public GradientMachine(int numFeatures, int numHidden, int numOutput) { - this.numFeatures = numFeatures; - this.numHidden = numHidden; - this.numOutput = numOutput; - hiddenWeights = new DenseVector[numHidden]; - for (int i = 0; i < numHidden; i++) { - hiddenWeights[i] = new DenseVector(numFeatures); - hiddenWeights[i].assign(0); - } - hiddenBias = new DenseVector(numHidden); - hiddenBias.assign(0); - outputWeights = new DenseVector[numOutput]; - for (int i = 0; i < numOutput; i++) { - outputWeights[i] = new DenseVector(numHidden); - outputWeights[i].assign(0); - } - outputBias = new DenseVector(numOutput); - outputBias.assign(0); - rnd = RandomUtils.getRandom(); - } - - /** - * Initialize weights. - * - * @param gen random number generator. - */ - public void initWeights(Random gen) { - double hiddenFanIn = 1.0f / Math.sqrt(numFeatures); - for (int i = 0; i < numHidden; i++) { - for (int j = 0; j < numFeatures; j++) { - double val = (2.0 * gen.nextDouble() - 1.0) * hiddenFanIn; - hiddenWeights[i].setQuick(j, val); - } - } - double outputFanIn = 1.0f / Math.sqrt(numHidden); - for (int i = 0; i < numOutput; i++) { - for (int j = 0; j < numHidden; j++) { - double val = (2.0 * gen.nextDouble() - 1.0) * outputFanIn; - outputWeights[i].setQuick(j, val); - } - } - } - - /** - * Chainable configuration option. - * - * @param learningRate New value of initial learning rate. - * @return This, so other configurations can be chained. - */ - public GradientMachine learningRate(double learningRate) { - this.learningRate = learningRate; - return this; - } - - /** - * Chainable configuration option. - * - * @param regularization A positive value that controls the weight vector size. - * @return This, so other configurations can be chained. - */ - public GradientMachine regularization(double regularization) { - this.regularization = regularization; - return this; - } - - /** - * Chainable configuration option. - * - * @param sparsity A value between zero and one that controls the fraction of hidden units - * that are activated on average. - * @return This, so other configurations can be chained. - */ - public GradientMachine sparsity(double sparsity) { - this.sparsity = sparsity; - return this; - } - - /** - * Chainable configuration option. - * - * @param sparsityLearningRate New value of initial learning rate for sparsity. - * @return This, so other configurations can be chained. 
- */ - public GradientMachine sparsityLearningRate(double sparsityLearningRate) { - this.sparsityLearningRate = sparsityLearningRate; - return this; - } - - public void copyFrom(GradientMachine other) { - numFeatures = other.numFeatures; - numHidden = other.numHidden; - numOutput = other.numOutput; - learningRate = other.learningRate; - regularization = other.regularization; - sparsity = other.sparsity; - sparsityLearningRate = other.sparsityLearningRate; - hiddenWeights = new DenseVector[numHidden]; - for (int i = 0; i < numHidden; i++) { - hiddenWeights[i] = other.hiddenWeights[i].clone(); - } - hiddenBias = other.hiddenBias.clone(); - outputWeights = new DenseVector[numOutput]; - for (int i = 0; i < numOutput; i++) { - outputWeights[i] = other.outputWeights[i].clone(); - } - outputBias = other.outputBias.clone(); - } - - @Override - public int numCategories() { - return numOutput; - } - - public int numFeatures() { - return numFeatures; - } - - public int numHidden() { - return numHidden; - } - - /** - * Feeds forward from the input to the hidden units. - * - * @return Hidden unit activations. - */ - public DenseVector inputToHidden(Vector input) { - DenseVector activations = new DenseVector(numHidden); - for (int i = 0; i < numHidden; i++) { - activations.setQuick(i, hiddenWeights[i].dot(input)); - } - activations.assign(hiddenBias, Functions.PLUS); - activations.assign(Functions.min(40.0)).assign(Functions.max(-40)); - activations.assign(Functions.SIGMOID); - return activations; - } - - /** - * Feeds forward from the hidden layer to the output. - * - * @return Output unit activations. - */ - public DenseVector hiddenToOutput(Vector hiddenActivation) { - DenseVector activations = new DenseVector(numOutput); - for (int i = 0; i < numOutput; i++) { - activations.setQuick(i, outputWeights[i].dot(hiddenActivation)); - } - activations.assign(outputBias, Functions.PLUS); - return activations; - } - - /** - * Updates using ranking loss. - * - * @param hiddenActivation the hidden unit's activation - * @param goodLabels the labels you want ranked above others. - * @param numTrials how many times you want to search for the highest scoring bad label. - * @param gen Random number generator. - */ - public void updateRanking(Vector hiddenActivation, - Collection<Integer> goodLabels, - int numTrials, - Random gen) { - // All the labels are good, do nothing. - if (goodLabels.size() >= numOutput) { - return; - } - for (Integer good : goodLabels) { - double goodScore = outputWeights[good].dot(hiddenActivation); - int highestBad = -1; - double highestBadScore = Double.NEGATIVE_INFINITY; - for (int i = 0; i < numTrials; i++) { - int bad = gen.nextInt(numOutput); - while (goodLabels.contains(bad)) { - bad = gen.nextInt(numOutput); - } - double badScore = outputWeights[bad].dot(hiddenActivation); - if (badScore > highestBadScore) { - highestBadScore = badScore; - highestBad = bad; - } - } - int bad = highestBad; - double loss = 1.0 - goodScore + highestBadScore; - if (loss < 0.0) { - continue; - } - // Note from the loss above the gradient dloss/dy, y being the label is -1 for good - // and +1 for bad. - // dy / dw is just w since y = x' * w + b. - // Hence by the chain rule, dloss / dw = dloss / dy * dy / dw = -w. - // For the regularization part, 0.5 * lambda * w' w, the gradient is lambda * w. - // dy / db = 1.
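// [Editor's illustration, not in the original source: suppose goodScore = 0.2 and
// highestBadScore = 0.5. Then loss = 1.0 - 0.2 + 0.5 = 1.3 > 0, so the code below
// moves the output weights and biases in favor of the good label and against the
// bad one, and backpropagates propHidden through the sigmoid gradient into the
// hidden-layer weights.]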
- Vector gradGood = outputWeights[good].clone(); - gradGood.assign(Functions.NEGATE); - Vector propHidden = gradGood.clone(); - Vector gradBad = outputWeights[bad].clone(); - propHidden.assign(gradBad, Functions.PLUS); - gradGood.assign(Functions.mult(-learningRate * (1.0 - regularization))); - outputWeights[good].assign(gradGood, Functions.PLUS); - gradBad.assign(Functions.mult(-learningRate * (1.0 + regularization))); - outputWeights[bad].assign(gradBad, Functions.PLUS); - outputBias.setQuick(good, outputBias.get(good) + learningRate); - outputBias.setQuick(bad, outputBias.get(bad) - learningRate); - // Gradient of sigmoid is s * (1 - s). - Vector gradSig = hiddenActivation.clone(); - gradSig.assign(Functions.SIGMOIDGRADIENT); - // Multiply by the change caused by the ranking loss. - for (int i = 0; i < numHidden; i++) { - gradSig.setQuick(i, gradSig.get(i) * propHidden.get(i)); - } - for (int i = 0; i < numHidden; i++) { - for (int j = 0; j < numFeatures; j++) { - double v = hiddenWeights[i].get(j); - v -= learningRate * (gradSig.get(i) + regularization * v); - hiddenWeights[i].setQuick(j, v); - } - } - } - } - - @Override - public Vector classify(Vector instance) { - Vector result = classifyNoLink(instance); - // Find the max value's index. - int max = result.maxValueIndex(); - result.assign(0); - result.setQuick(max, 1.0); - return result.viewPart(1, result.size() - 1); - } - - @Override - public Vector classifyNoLink(Vector instance) { - DenseVector hidden = inputToHidden(instance); - return hiddenToOutput(hidden); - } - - @Override - public double classifyScalar(Vector instance) { - Vector output = classifyNoLink(instance); - if (output.get(0) > output.get(1)) { - return 0; - } - return 1; - } - - public GradientMachine copy() { - close(); - GradientMachine r = new GradientMachine(numFeatures(), numHidden(), numCategories()); - r.copyFrom(this); - return r; - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(WRITABLE_VERSION); - out.writeDouble(learningRate); - out.writeDouble(regularization); - out.writeDouble(sparsity); - out.writeDouble(sparsityLearningRate); - out.writeInt(numFeatures); - out.writeInt(numHidden); - out.writeInt(numOutput); - VectorWritable.writeVector(out, hiddenBias); - for (int i = 0; i < numHidden; i++) { - VectorWritable.writeVector(out, hiddenWeights[i]); - } - VectorWritable.writeVector(out, outputBias); - for (int i = 0; i < numOutput; i++) { - VectorWritable.writeVector(out, outputWeights[i]); - } - } - - @Override - public void readFields(DataInput in) throws IOException { - int version = in.readInt(); - if (version == WRITABLE_VERSION) { - learningRate = in.readDouble(); - regularization = in.readDouble(); - sparsity = in.readDouble(); - sparsityLearningRate = in.readDouble(); - numFeatures = in.readInt(); - numHidden = in.readInt(); - numOutput = in.readInt(); - hiddenWeights = new DenseVector[numHidden]; - hiddenBias = VectorWritable.readVector(in); - for (int i = 0; i < numHidden; i++) { - hiddenWeights[i] = VectorWritable.readVector(in); - } - outputWeights = new DenseVector[numOutput]; - outputBias = VectorWritable.readVector(in); - for (int i = 0; i < numOutput; i++) { - outputWeights[i] = VectorWritable.readVector(in); - } - } else { - throw new IOException("Incorrect object version, wanted " + WRITABLE_VERSION + " got " + version); - } - } - - @Override - public void close() { - // This is an online classifier, nothing to do.
} - - @Override - public void train(long trackingKey, String groupKey, int actual, Vector instance) { - Vector hiddenActivation = inputToHidden(instance); - hiddenToOutput(hiddenActivation); - Collection<Integer> goodLabels = new HashSet<Integer>(); - goodLabels.add(actual); - updateRanking(hiddenActivation, goodLabels, 2, rnd); - } - - @Override - public void train(long trackingKey, int actual, Vector instance) { - train(trackingKey, null, actual, instance); - } - - @Override - public void train(int actual, Vector instance) { - train(0, null, actual, instance); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/L1.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/L1.java deleted file mode 100644 index 28a05f23d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/L1.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Implements the Laplacian or bi-exponential prior. This prior has a strong tendency to set coefficients to zero - * and thus is useful as an alternative to variable selection. This version implements truncation which prevents - * a coefficient from changing sign. If a correction would change the sign, the coefficient is truncated to zero. - * - * Note that it doesn't matter to have a scale for this distribution because after taking the derivative of the logP, - * the lambda coefficient used to combine the prior with the observations has the same effect. If we had a scale here, - * then it would be the same effect as just changing lambda.
- */ -public class L1 implements PriorFunction { - @Override - public double age(double oldValue, double generations, double learningRate) { - double newValue = oldValue - Math.signum(oldValue) * learningRate * generations; - if (newValue * oldValue < 0) { - // don't allow the value to change sign - return 0; - } else { - return newValue; - } - } - - @Override - public double logP(double betaIJ) { - return -Math.abs(betaIJ); - } - - @Override - public void write(DataOutput out) throws IOException { - // stateless class has nothing to serialize - } - - @Override - public void readFields(DataInput dataInput) throws IOException { - // stateless class has nothing to serialize - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/L2.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/L2.java deleted file mode 100644 index 9526e814c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/L2.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Implements the Gaussian prior. This prior has a tendency to decrease large coefficients toward zero, but - * doesn't tend to set them to exactly zero. 
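[Editor's sketch, not part of either original file: a quick numeric contrast of the two priors' age() behavior, with illustrative values.]

    PriorFunction l1 = new L1();
    PriorFunction l2 = new L2(1.0);
    l1.age(0.05, 1, 0.1);  // step 0.05 - 0.1 would cross zero, so L1 truncates to exactly 0.0
    l1.age(0.50, 1, 0.1);  // fixed step toward zero: returns 0.4
    l2.age(0.50, 1, 0.1);  // multiplicative shrink: 0.5 * (1 - 0.1 / 1.0) = 0.45, never exactly zero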
- */ -public class L2 implements PriorFunction { - - private static final double HALF_LOG_2PI = Math.log(2.0 * Math.PI) / 2.0; - - private double s2; - private double s; - - public L2(double scale) { - this.s = scale; - this.s2 = scale * scale; - } - - public L2() { - } - - @Override - public double age(double oldValue, double generations, double learningRate) { - return oldValue * Math.pow(1.0 - learningRate / s2, generations); - } - - @Override - public double logP(double betaIJ) { - return -betaIJ * betaIJ / s2 / 2.0 - Math.log(s) - HALF_LOG_2PI; - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeDouble(s2); - out.writeDouble(s); - } - - @Override - public void readFields(DataInput in) throws IOException { - s2 = in.readDouble(); - s = in.readDouble(); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/MixedGradient.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/MixedGradient.java deleted file mode 100644 index a290b226d..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/MixedGradient.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import org.apache.mahout.classifier.AbstractVectorClassifier; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.Vector; - -import java.util.Random; - -/** - *
Provides a stochastic mixture of ranking updates and normal logistic updates. This uses a - * combination of AUC driven learning to improve ranking performance and traditional log-loss driven - * learning to improve log-likelihood. - * <p/> - * See www.eecs.tufts.edu/~dsculley/papers/combined-ranking-and-regression.pdf - * <p/> - * This implementation only makes sense for the binomial case.
- */ -public class MixedGradient implements Gradient { - - private final double alpha; - private final RankingGradient rank; - private final Gradient basic; - private final Random random = RandomUtils.getRandom(); - private boolean hasZero; - private boolean hasOne; - - public MixedGradient(double alpha, int window) { - this.alpha = alpha; - this.rank = new RankingGradient(window); - this.basic = this.rank.getBaseGradient(); - } - - @Override - public Vector apply(String groupKey, int actual, Vector instance, AbstractVectorClassifier classifier) { - if (random.nextDouble() < alpha) { - // one option is to apply a ranking update relative to our recent history - if (!hasZero || !hasOne) { - throw new IllegalStateException(); - } - return rank.apply(groupKey, actual, instance, classifier); - } else { - hasZero |= actual == 0; - hasOne |= actual == 1; - // the other option is a normal update, but we have to update our history on the way - rank.addToHistory(actual, instance); - return basic.apply(groupKey, actual, instance, classifier); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/ModelDissector.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/ModelDissector.java deleted file mode 100644 index dc423ea9f..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/ModelDissector.java +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Ordering; -import org.apache.mahout.classifier.AbstractVectorClassifier; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.Vector; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.PriorityQueue; -import java.util.Queue; -import java.util.Set; - -/** - * Uses sample data to reverse engineer a feature-hashed model. - * - * The result gives approximate weights for features and interactions - * in the original space. - * - * The idea is that the hashed encoders have the option of having a trace dictionary. This - * tells us where each feature is hashed to, or each feature/value combination in the case - * of word-like values. Using this dictionary, we can put values into a synthetic feature - * vector in just the locations specified by a single feature or interaction. 
Then we can - * push this through a linear part of a model to see the contribution of that input. For - * any generalized linear model like logistic regression, there is a linear part of the - * model that allows this. - * - * What the ModelDissector does is to accept a trace dictionary and a model in an update - * method. It figures out the weights for the elements in the trace dictionary and stashes - * them. Then in a summary method, the biggest weights are returned. This update/flush - * style is used so that the trace dictionary doesn't have to grow to enormous levels, - * but instead can be cleared between updates. - */ -public class ModelDissector { - private final Map<String, Vector> weightMap; - - public ModelDissector() { - weightMap = Maps.newHashMap(); - } - - /** - * Probes a model to determine the effect of a particular variable. This is done - * with the aid of a trace dictionary which has recorded the locations in the feature - * vector that are modified by various variable values. We can set these locations to - * 1 and then look at the resulting score. This tells us the weight the model places - * on that variable. - * @param features A feature vector to use (destructively) - * @param traceDictionary A trace dictionary containing variables and what locations - * in the feature vector are affected by them - * @param learner The model that we are probing to find weights on features - */ - - public void update(Vector features, Map<String, Set<Integer>> traceDictionary, AbstractVectorClassifier learner) { - // zero out feature vector - features.assign(0); - for (Map.Entry<String, Set<Integer>> entry : traceDictionary.entrySet()) { - // get a feature and locations where it is stored in the feature vector - String key = entry.getKey(); - Set<Integer> value = entry.getValue(); - - // if we haven't looked at this feature yet - if (!weightMap.containsKey(key)) { - // put probe values in the feature vector - for (Integer where : value) { - features.set(where, 1); - } - - // see what the model says - Vector v = learner.classifyNoLink(features); - weightMap.put(key, v); - - // and zero out those locations again - for (Integer where : value) { - features.set(where, 0); - } - } - } - } - - /** - * Returns the n most important features with their - * weights, most important category and the top few - * categories that they affect. - * @param n How many results to return. - * @return A list of the top variables.
- */ - public List<Weight> summary(int n) { - Queue<Weight> pq = new PriorityQueue<Weight>(); - for (Map.Entry<String, Vector> entry : weightMap.entrySet()) { - pq.add(new Weight(entry.getKey(), entry.getValue())); - while (pq.size() > n) { - pq.poll(); - } - } - List<Weight> r = Lists.newArrayList(pq); - Collections.sort(r, Ordering.natural().reverse()); - return r; - } - - private static final class Category implements Comparable<Category> { - private final int index; - private final double weight; - - private Category(int index, double weight) { - this.index = index; - this.weight = weight; - } - - @Override - public int compareTo(Category o) { - int r = Double.compare(Math.abs(weight), Math.abs(o.weight)); - if (r == 0) { - if (o.index < index) { - return -1; - } - if (o.index > index) { - return 1; - } - return 0; - } - return r; - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof Category)) { - return false; - } - Category other = (Category) o; - return index == other.index && weight == other.weight; - } - - @Override - public int hashCode() { - return RandomUtils.hashDouble(weight) ^ index; - } - - } - - public static class Weight implements Comparable<Weight> { - private final String feature; - private final double value; - private final int maxIndex; - private final List<Category> categories; - - public Weight(String feature, Vector weights) { - this(feature, weights, 3); - } - - public Weight(String feature, Vector weights, int n) { - this.feature = feature; - // pick out the weight with the largest abs value, but don't forget the sign - Queue<Category> biggest = new PriorityQueue<Category>(n + 1, Ordering.natural()); - for (Vector.Element element : weights) { - biggest.add(new Category(element.index(), element.get())); - while (biggest.size() > n) { - biggest.poll(); - } - } - categories = Lists.newArrayList(biggest); - Collections.sort(categories, Ordering.natural().reverse()); - value = categories.get(0).weight; - maxIndex = categories.get(0).index; - } - - @Override - public int compareTo(Weight other) { - int r = Double.compare(Math.abs(this.value), Math.abs(other.value)); - if (r == 0) { - return feature.compareTo(other.feature); - } - return r; - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof Weight)) { - return false; - } - Weight other = (Weight) o; - return feature.equals(other.feature) - && value == other.value - && maxIndex == other.maxIndex - && categories.equals(other.categories); - } - - @Override - public int hashCode() { - return feature.hashCode() ^ RandomUtils.hashDouble(value) ^ maxIndex ^ categories.hashCode(); - } - - public String getFeature() { - return feature; - } - - public double getWeight() { - return value; - } - - public double getWeight(int n) { - return categories.get(n).weight; - } - - public double getCategory(int n) { - return categories.get(n).index; - } - - public int getMaxImpact() { - return maxIndex; - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java deleted file mode 100644 index e03e31033..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache
Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import com.google.common.io.Closeables; -import org.apache.hadoop.io.Writable; - -import java.io.DataInput; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; - -/** - * Provides the ability to store SGD model-related objects as binary files. - */ -public final class ModelSerializer { - - // static class ... don't instantiate - private ModelSerializer() { - } - - public static void writeBinary(String path, CrossFoldLearner model) throws IOException { - DataOutputStream out = new DataOutputStream(new FileOutputStream(path)); - try { - PolymorphicWritable.write(out, model); - } finally { - Closeables.closeQuietly(out); - } - } - - public static void writeBinary(String path, OnlineLogisticRegression model) throws IOException { - DataOutputStream out = new DataOutputStream(new FileOutputStream(path)); - try { - PolymorphicWritable.write(out, model); - } finally { - Closeables.closeQuietly(out); - } - } - - public static void writeBinary(String path, AdaptiveLogisticRegression model) throws IOException { - DataOutputStream out = new DataOutputStream(new FileOutputStream(path)); - try { - PolymorphicWritable.write(out, model); - } finally { - Closeables.closeQuietly(out); - } - } - - public static <T extends Writable> T readBinary(InputStream in, Class<T> clazz) throws IOException { - DataInput dataIn = new DataInputStream(in); - try { - return PolymorphicWritable.read(dataIn, clazz); - } finally { - Closeables.closeQuietly(in); - } - } - - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/OnlineLogisticRegression.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/OnlineLogisticRegression.java deleted file mode 100644 index a477b9ac0..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/OnlineLogisticRegression.java +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.math.DenseMatrix; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.MatrixWritable; -import org.apache.mahout.math.VectorWritable; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Extends the basic on-line logistic regression learner with a specific set of learning - * rate annealing schedules. - */ -public class OnlineLogisticRegression extends AbstractOnlineLogisticRegression implements Writable { - public static final int WRITABLE_VERSION = 1; - - // these next two control decayFactor^steps exponential type of annealing - // learning rate and decay factor - private double mu0 = 1; - private double decayFactor = 1 - 1.0e-3; - - // these next two control 1/steps^forget type annealing - private int stepOffset = 10; - // -1 equals even weighting of all examples, 0 means only use exponential annealing - private double forgettingExponent = -0.5; - - // controls how per term annealing works - private int perTermAnnealingOffset = 20; - - public OnlineLogisticRegression() { - // default constructor available for serialization, but not for normal use - } - - public OnlineLogisticRegression(int numCategories, int numFeatures, PriorFunction prior) { - this.numCategories = numCategories; - this.prior = prior; - - updateSteps = new DenseVector(numFeatures); - updateCounts = new DenseVector(numFeatures).assign(perTermAnnealingOffset); - beta = new DenseMatrix(numCategories - 1, numFeatures); - } - - /** - * Chainable configuration option. - * - * @param alpha New value of decayFactor, the exponential decay rate for the learning rate. - * @return This, so other configurations can be chained. - */ - public OnlineLogisticRegression alpha(double alpha) { - this.decayFactor = alpha; - return this; - } - - @Override - public OnlineLogisticRegression lambda(double lambda) { - // we only over-ride this to provide a more restrictive return type - super.lambda(lambda); - return this; - } - - /** - * Chainable configuration option. - * - * @param learningRate New value of initial learning rate. - * @return This, so other configurations can be chained.
- */ - public OnlineLogisticRegression learningRate(double learningRate) { - this.mu0 = learningRate; - return this; - } - - public OnlineLogisticRegression stepOffset(int stepOffset) { - this.stepOffset = stepOffset; - return this; - } - - public OnlineLogisticRegression decayExponent(double decayExponent) { - if (decayExponent > 0) { - decayExponent = -decayExponent; - } - this.forgettingExponent = decayExponent; - return this; - } - - - @Override - public double perTermLearningRate(int j) { - return Math.sqrt(perTermAnnealingOffset / updateCounts.get(j)); - } - - @Override - public double currentLearningRate() { - return mu0 * Math.pow(decayFactor, getStep()) * Math.pow(getStep() + stepOffset, forgettingExponent); - } - - public void copyFrom(OnlineLogisticRegression other) { - super.copyFrom(other); - mu0 = other.mu0; - decayFactor = other.decayFactor; - - stepOffset = other.stepOffset; - forgettingExponent = other.forgettingExponent; - - perTermAnnealingOffset = other.perTermAnnealingOffset; - } - - public OnlineLogisticRegression copy() { - close(); - OnlineLogisticRegression r = new OnlineLogisticRegression(numCategories(), numFeatures(), prior); - r.copyFrom(this); - return r; - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(WRITABLE_VERSION); - out.writeDouble(mu0); - out.writeDouble(decayFactor); - out.writeInt(stepOffset); - out.writeInt(step); - out.writeDouble(forgettingExponent); - out.writeInt(perTermAnnealingOffset); - out.writeInt(numCategories); - MatrixWritable.writeMatrix(out, beta); - PolymorphicWritable.write(out, prior); - VectorWritable.writeVector(out, updateCounts); - VectorWritable.writeVector(out, updateSteps); - } - - @Override - public void readFields(DataInput in) throws IOException { - int version = in.readInt(); - if (version == WRITABLE_VERSION) { - mu0 = in.readDouble(); - decayFactor = in.readDouble(); - stepOffset = in.readInt(); - step = in.readInt(); - forgettingExponent = in.readDouble(); - perTermAnnealingOffset = in.readInt(); - numCategories = in.readInt(); - beta = MatrixWritable.readMatrix(in); - prior = PolymorphicWritable.read(in, PriorFunction.class); - - updateCounts = VectorWritable.readVector(in); - updateSteps = VectorWritable.readVector(in); - } else { - throw new IOException("Incorrect object version, wanted " + WRITABLE_VERSION + " got " + version); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/PassiveAggressive.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/PassiveAggressive.java deleted file mode 100644 index c51361c17..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/PassiveAggressive.java +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.classifier.AbstractVectorClassifier; -import org.apache.mahout.classifier.OnlineLearner; -import org.apache.mahout.math.DenseMatrix; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.MatrixWritable; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.function.Functions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Online passive aggressive learner that tries to minimize the label ranking hinge loss. - * Implements a multi-class linear classifier minimizing rank loss, - * based on "Online Passive-Aggressive Algorithms" by Crammer et al., 2006. - * Note: It's better to use classifyNoLink because the loss function is based - * on ensuring that the score of the good label is larger than the next - * highest label by some margin. The conversion to probability is just done - * by exponentiating and dividing by the sum and is empirical at best. - * Your features should be pre-normalized in some sensible range, for example, - * by subtracting the mean and dividing by the standard deviation, if they are very - * different in magnitude from each other. - */ -public class PassiveAggressive extends AbstractVectorClassifier implements OnlineLearner, Writable { - - private static final Logger log = LoggerFactory.getLogger(PassiveAggressive.class); - - public static final int WRITABLE_VERSION = 1; - - // the learning rate of the algorithm - private double learningRate = 0.1; - - // loss statistics. - private int lossCount = 0; - private double lossSum = 0; - - // coefficients for the classification. This is a dense matrix - // that is (numCategories) x numFeatures - private Matrix weights; - - // number of categories we are classifying. - private int numCategories; - - public PassiveAggressive(int numCategories, int numFeatures) { - this.numCategories = numCategories; - weights = new DenseMatrix(numCategories, numFeatures); - weights.assign(0.0); - } - - /** - * Chainable configuration option. - * - * @param learningRate New value of initial learning rate. - * @return This, so other configurations can be chained. - */ - public PassiveAggressive learningRate(double learningRate) { - this.learningRate = learningRate; - return this; - } - - public void copyFrom(PassiveAggressive other) { - learningRate = other.learningRate; - numCategories = other.numCategories; - weights = other.weights; - } - - @Override - public int numCategories() { - return numCategories; - } - - @Override - public Vector classify(Vector instance) { - Vector result = classifyNoLink(instance); - // Convert to probabilities by exponentiation.
- double max = result.maxValue(); - result.assign(Functions.minus(max)).assign(Functions.EXP); - result = result.divide(result.norm(1)); - - return result.viewPart(1, result.size() - 1); - } - - @Override - public Vector classifyNoLink(Vector instance) { - Vector result = new DenseVector(weights.numRows()); - result.assign(0); - for (int i = 0; i < weights.numRows(); i++) { - result.setQuick(i, weights.viewRow(i).dot(instance)); - } - return result; - } - - @Override - public double classifyScalar(Vector instance) { - double v1 = weights.viewRow(0).dot(instance); - double v2 = weights.viewRow(1).dot(instance); - v1 = Math.exp(v1); - v2 = Math.exp(v2); - return v2 / (v1 + v2); - } - - public int numFeatures() { - return weights.numCols(); - } - - public PassiveAggressive copy() { - close(); - PassiveAggressive r = new PassiveAggressive(numCategories(), numFeatures()); - r.copyFrom(this); - return r; - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(WRITABLE_VERSION); - out.writeDouble(learningRate); - out.writeInt(numCategories); - MatrixWritable.writeMatrix(out, weights); - } - - @Override - public void readFields(DataInput in) throws IOException { - int version = in.readInt(); - if (version == WRITABLE_VERSION) { - learningRate = in.readDouble(); - numCategories = in.readInt(); - weights = MatrixWritable.readMatrix(in); - } else { - throw new IOException("Incorrect object version, wanted " + WRITABLE_VERSION + " got " + version); - } - } - - @Override - public void close() { - // This is an online classifier, nothing to do. - } - - @Override - public void train(long trackingKey, String groupKey, int actual, Vector instance) { - if (lossCount > 1000) { - log.info("Avg. Loss = {}", lossSum / lossCount); - lossCount = 0; - lossSum = 0; - } - Vector result = classifyNoLink(instance); - double myScore = result.get(actual); - // Find the highest score that is not actual. 
- int otherIndex = result.maxValueIndex(); - double otherValue = result.get(otherIndex); - if (otherIndex == actual) { - result.setQuick(otherIndex, Double.NEGATIVE_INFINITY); - otherIndex = result.maxValueIndex(); - otherValue = result.get(otherIndex); - } - double loss = 1.0 - myScore + otherValue; - lossCount += 1; - if (loss >= 0) { - lossSum += loss; - double tau = loss / (instance.dot(instance) + 0.5 / learningRate); - Vector delta = instance.clone(); - delta.assign(Functions.mult(tau)); - weights.viewRow(actual).assign(delta, Functions.PLUS); -// delta.addTo(weights.viewRow(actual)); - delta.assign(Functions.mult(-1)); - weights.viewRow(otherIndex).assign(delta, Functions.PLUS); -// delta.addTo(weights.viewRow(otherIndex)); - } - } - - @Override - public void train(long trackingKey, int actual, Vector instance) { - train(trackingKey, null, actual, instance); - } - - @Override - public void train(int actual, Vector instance) { - train(0, null, actual, instance); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/PolymorphicWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/PolymorphicWritable.java deleted file mode 100644 index 90062a674..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/PolymorphicWritable.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.common.ClassUtils; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Utilities that write a class name and then serialize using writables. 
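[Editor's sketch, not part of the original file: write() records the concrete class name via writeUTF before delegating to the instance's own serialization, so a field declared as an interface round-trips back to its concrete type. Hypothetical usage with in-memory java.io streams:]

    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    PolymorphicWritable.write(new DataOutputStream(buf), new L1());
    DataInput in = new DataInputStream(new ByteArrayInputStream(buf.toByteArray()));
    PriorFunction prior = PolymorphicWritable.read(in, PriorFunction.class);
    // prior is an L1 instance again, though this call site only sees PriorFunction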
- */ -public final class PolymorphicWritable { - - private PolymorphicWritable() { - } - - public static <T extends Writable> void write(DataOutput dataOutput, T value) throws IOException { - dataOutput.writeUTF(value.getClass().getName()); - value.write(dataOutput); - } - - public static <T extends Writable> T read(DataInput dataInput, Class<T> clazz) throws IOException { - String className = dataInput.readUTF(); - T r = ClassUtils.instantiateAs(className, clazz); - r.readFields(dataInput); - return r; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/PriorFunction.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/PriorFunction.java deleted file mode 100644 index 857f06139..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/PriorFunction.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import org.apache.hadoop.io.Writable; - -/** - * A prior is used to regularize the learning algorithm. This allows a trade-off to - * be made between complexity of the model being learned and the accuracy with which - * the model fits the training data. There are different definitions of complexity - * which can be approximated using different priors. For large sparse systems, such - * as text classification, the L1 prior is often used which favors sparse models. - */ -public interface PriorFunction extends Writable { - /** - * Applies the regularization to a coefficient. - * @param oldValue The previous value. - * @param generations The number of generations. - * @param learningRate The learning rate with lambda baked in. - * @return The new coefficient value after regularization. - */ - double age(double oldValue, double generations, double learningRate); - - /** - * Returns the log of the probability of a particular coefficient value according to the prior. - * @param betaIJ The coefficient. - * @return The log probability.
- */ - double logP(double betaIJ); -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/RankingGradient.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/RankingGradient.java deleted file mode 100644 index b52cb8cbd..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/RankingGradient.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import com.google.common.collect.Lists; -import org.apache.mahout.classifier.AbstractVectorClassifier; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.function.Functions; - -import java.util.ArrayDeque; -import java.util.Deque; -import java.util.List; - -/** - * Uses the difference between this instance and recent history to get a - * gradient that optimizes ranking performance. Essentially this is the - * same as directly optimizing AUC. It isn't expected that this would - * be used alone, but rather that a MixedGradient would use it and a - * DefaultGradient together to combine both ranking and log-likelihood - * goals. 
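[Editor's note, not in the original source: MixedGradient (above) is the intended consumer of this class. A hypothetical wiring using the constructor shown earlier:]

    // alpha = 0.5: half the updates are ranking updates against recent history;
    // window = 10: how many recent instances per class RankingGradient retains
    Gradient gradient = new MixedGradient(0.5, 10);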
- */ -public class RankingGradient implements Gradient { - - private static final Gradient BASIC = new DefaultGradient(); - - private int window = 10; - - private final List<Deque<Vector>> history = Lists.newArrayList(); - - public RankingGradient(int window) { - this.window = window; - } - - @Override - public final Vector apply(String groupKey, int actual, Vector instance, AbstractVectorClassifier classifier) { - addToHistory(actual, instance); - - // now compute average gradient versus saved vectors from the other side - Deque<Vector> otherSide = history.get(1 - actual); - int n = otherSide.size(); - - Vector r = null; - for (Vector other : otherSide) { - Vector g = BASIC.apply(groupKey, actual, instance.minus(other), classifier); - - if (r == null) { - r = g; - } else { - r.assign(g, Functions.plusMult(1.0 / n)); - } - } - return r; - } - - public void addToHistory(int actual, Vector instance) { - while (history.size() <= actual) { - history.add(new ArrayDeque<Vector>(window)); - } - // save this instance - Deque<Vector> ourSide = history.get(actual); - ourSide.add(instance); - while (ourSide.size() >= window) { - ourSide.pollFirst(); - } - } - - public Gradient getBaseGradient() { - return BASIC; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/RecordFactory.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/RecordFactory.java deleted file mode 100644 index fbc825d83..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/RecordFactory.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import org.apache.mahout.math.Vector; - -import java.util.List; -import java.util.Map; -import java.util.Set; - -/** - * A record factory understands how to convert a line of data into fields and then into a vector.
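[Editor's sketch of the intended call sequence; factory, headerLine, and dataLine are hypothetical stand-ins for any concrete implementation and its input:]

    RecordFactory factory = ...; // some concrete implementation, e.g. CSV-based
    factory.defineTargetCategories(Arrays.asList("spam", "ham"));
    if (factory.usesFirstLineAsSchema()) {
      factory.firstLine(headerLine);  // let the factory pick up the schema
    }
    Vector features = new RandomAccessSparseVector(1000);
    int target = factory.processLine(dataLine, features); // fills the vector, returns the target category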
- */ -public interface RecordFactory { - void defineTargetCategories(List<String> values); - - RecordFactory maxTargetValue(int max); - - boolean usesFirstLineAsSchema(); - - int processLine(String line, Vector featureVector); - - Iterable<String> getPredictors(); - - Map<String, Set<Integer>> getTraceDictionary(); - - RecordFactory includeBiasTerm(boolean useBias); - - List<String> getTargetCategories(); - - void firstLine(String line); -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/TPrior.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/TPrior.java deleted file mode 100644 index 334292434..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/TPrior.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import org.apache.commons.math.special.Gamma; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Provides a t-distribution as a prior.
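[Editor's note, not in the original file: the single df parameter spans a family of priors, from very heavy-tailed to nearly Gaussian:]

    PriorFunction cauchyLike = new TPrior(1.0);     // df = 1: Cauchy-like heavy tails, weak shrinkage of large coefficients
    PriorFunction nearGaussian = new TPrior(100.0); // large df: approaches the Gaussian (L2) prior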
- */ -public class TPrior implements PriorFunction { - private double df; - - public TPrior(double df) { - this.df = df; - } - - @Override - public double age(double oldValue, double generations, double learningRate) { - for (int i = 0; i < generations; i++) { - oldValue -= learningRate * oldValue * (df + 1.0) / (df + oldValue * oldValue); - } - return oldValue; - } - - @Override - public double logP(double betaIJ) { - return Gamma.logGamma((df + 1.0) / 2.0) - - Math.log(df * Math.PI) - - Gamma.logGamma(df / 2.0) - - (df + 1.0) / 2.0 * Math.log1p(betaIJ * betaIJ); - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeDouble(df); - } - - @Override - public void readFields(DataInput in) throws IOException { - df = in.readDouble(); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/UniformPrior.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/UniformPrior.java deleted file mode 100644 index 23c812ff5..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/UniformPrior.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.classifier.sgd; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * A uniform prior. This is an improper prior that corresponds to no regularization at all. 
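[Editor's sketch with hypothetical parameter values: since OnlineLogisticRegression takes a PriorFunction at construction (see OnlineLogisticRegression.java above), passing UniformPrior turns regularization off entirely:]

    // 2 categories, 1000 hashed features, no regularization at all
    OnlineLogisticRegression learner = new OnlineLogisticRegression(2, 1000, new UniformPrior());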
- */ -public class UniformPrior implements PriorFunction { - @Override - public double age(double oldValue, double generations, double learningRate) { - return oldValue; - } - - @Override - public double logP(double betaIJ) { - return 0; - } - - @Override - public void write(DataOutput dataOutput) throws IOException { - // nothing to write - } - - @Override - public void readFields(DataInput dataInput) throws IOException { - // stateless class is trivial to read - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/package-info.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/package-info.java deleted file mode 100644 index c93ef0164..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/sgd/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/** - *
Implements a variety of on-line logistic regression classifiers using SGD-based algorithms. - * SGD stands for Stochastic Gradient Descent and refers to a class of learning algorithms - * that make it relatively easy to build high speed on-line learning algorithms for a variety - * of problems, notably including supervised learning for classification. - * <p/> - * The primary class of interest in this package is - * {@link org.apache.mahout.classifier.sgd.CrossFoldLearner} which contains a - * number (typically 5) of sub-learners, each of which is given a different portion of the - * training data. Each of these sub-learners can then be evaluated on the data it was not - * trained on. This allows fully incremental learning while still getting cross-validated - * performance estimates. - * <p/> - * The CrossFoldLearner implements {@link org.apache.mahout.classifier.OnlineLearner} - * and thus expects to be fed input in the form - * of a target variable and a feature vector. The target variable is simply an integer in the - * half-open interval [0..numCategories) where numCategories is defined when the CrossFoldLearner - * is constructed. The creation of feature vectors is facilitated by the classes that inherit - * from {@link org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder}. - * These classes currently implement a form of feature hashing with - * multiple probes to limit feature ambiguity.
- */ -package org.apache.mahout.classifier.sgd; \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java deleted file mode 100644 index 2ceb01b80..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java +++ /dev/null @@ -1,376 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.Iterator; -import java.util.Locale; - -import org.apache.hadoop.conf.Configuration; -import org.apache.mahout.common.parameters.Parameter; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.function.Functions; -import org.apache.mahout.math.function.SquareRootFunction; - -public abstract class AbstractCluster implements Cluster { - - // cluster persistent state - private int id; - - private long numObservations; - - private long totalObservations; - - private Vector center; - - private Vector radius; - - // the observation statistics - private double s0; - - private Vector s1; - - private Vector s2; - - protected AbstractCluster() {} - - protected AbstractCluster(Vector point, int id2) { - setNumObservations(0); - setTotalObservations(0); - setCenter(new RandomAccessSparseVector(point)); - setRadius(center.like()); - setS0(0); - setS1(center.like()); - setS2(center.like()); - this.id = id2; - } - - protected AbstractCluster(Vector center2, Vector radius2, int id2) { - setNumObservations(0); - setTotalObservations(0); - setCenter(new RandomAccessSparseVector(center2)); - setRadius(new RandomAccessSparseVector(radius2)); - setS0(0); - setS1(center.like()); - setS2(center.like()); - this.id = id2; - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(id); - out.writeLong(getNumObservations()); - out.writeLong(getTotalObservations()); - VectorWritable.writeVector(out, getCenter()); - VectorWritable.writeVector(out, getRadius()); - out.writeDouble(s0); - VectorWritable.writeVector(out, s1); - VectorWritable.writeVector(out, s2); - } - - @Override - public void readFields(DataInput in) 
throws IOException { - this.id = in.readInt(); - this.setNumObservations(in.readLong()); - this.setTotalObservations(in.readLong()); - this.setCenter(VectorWritable.readVector(in)); - this.setRadius(VectorWritable.readVector(in)); - this.setS0(in.readDouble()); - this.setS1(VectorWritable.readVector(in)); - this.setS2(VectorWritable.readVector(in)); - } - - @Override - public void configure(Configuration job) { - // nothing to do - } - - @Override - public Collection<Parameter<?>> getParameters() { - return Collections.emptyList(); - } - - @Override - public void createParameters(String prefix, Configuration jobConf) { - // nothing to do - } - - @Override - public int getId() { - return id; - } - - /** - * @param id - * the id to set - */ - protected void setId(int id) { - this.id = id; - } - - public long getNumObservations() { - return numObservations; - } - - /** - * @param l - * the numPoints to set - */ - protected void setNumObservations(long l) { - this.numObservations = l; - } - - public long getTotalObservations() { - return totalObservations; - } - - protected void setTotalObservations(long totalPoints) { - this.totalObservations = totalPoints; - } - - @Override - public Vector getCenter() { - return center; - } - - /** - * @param center - * the center to set - */ - protected void setCenter(Vector center) { - this.center = center; - } - - @Override - public Vector getRadius() { - return radius; - } - - /** - * @param radius - * the radius to set - */ - protected void setRadius(Vector radius) { - this.radius = radius; - } - - /** - * @return the s0 - */ - protected double getS0() { - return s0; - } - - protected void setS0(double s0) { - this.s0 = s0; - } - - /** - * @return the s1 - */ - protected Vector getS1() { - return s1; - } - - protected void setS1(Vector s1) { - this.s1 = s1; - } - - /** - * @return the s2 - */ - protected Vector getS2() { - return s2; - } - - protected void setS2(Vector s2) { - this.s2 = s2; - } - - @Override - public void observe(Model<VectorWritable> x) { - AbstractCluster cl = (AbstractCluster) x; - setS0(getS0() + cl.getS0()); - setS1(getS1().plus(cl.getS1())); - setS2(getS2().plus(cl.getS2())); - } - - public void observe(ClusterObservations observations) { - setS0(getS0() + observations.getS0()); - if (getS1() == null) { - setS1(observations.getS1().clone()); - } else { - getS1().assign(observations.getS1(), Functions.PLUS); - } - if (getS2() == null) { - setS2(observations.getS2().clone()); - } else { - getS2().assign(observations.getS2(), Functions.PLUS); - } - } - - @Override - public void observe(VectorWritable x) { - observe(x.get()); - } - - @Override - public void observe(VectorWritable x, double weight) { - observe(x.get(), weight); - } - - public void observe(Vector x, double weight) { - if (weight == 1.0) { - observe(x); - } else { - setS0(getS0() + weight); - Vector weightedX = x.times(weight); - if (getS1() == null) { - setS1(weightedX); - } else { - getS1().assign(weightedX, Functions.PLUS); - } - Vector x2 = x.times(x).times(weight); - if (getS2() == null) { - setS2(x2); - } else { - getS2().assign(x2, Functions.PLUS); - } - } - } - - public void observe(Vector x) { - setS0(getS0() + 1); - if (getS1() == null) { - setS1(x.clone()); - } else { - getS1().assign(x, Functions.PLUS); - } - Vector x2 = x.times(x); - if (getS2() == null) { - setS2(x2); - } else { - getS2().assign(x2, Functions.PLUS); - } - } - - - public ClusterObservations getObservations() { - return new ClusterObservations(getS0(), getS1(), getS2()); - } - - @Override - public void 
computeParameters() { - if (getS0() == 0) { - return; - } - setNumObservations((long) getS0()); - setTotalObservations(getTotalObservations() + getNumObservations()); - setCenter(getS1().divide(getS0())); - // compute the component stds - if (getS0() > 1) { - setRadius(getS2().times(getS0()).minus(getS1().times(getS1())).assign(new SquareRootFunction()).divide(getS0())); - } - setS0(0); - setS1(center.like()); - setS2(center.like()); - } - - @Override - public String asFormatString(String[] bindings) { - StringBuilder buf = new StringBuilder(50); - buf.append(getIdentifier()).append("{n=").append(getNumObservations()); - if (getCenter() != null) { - buf.append(" c=").append(formatVector(getCenter(), bindings)); - } - if (getRadius() != null) { - buf.append(" r=").append(formatVector(getRadius(), bindings)); - } - buf.append('}'); - return buf.toString(); - } - - public abstract String getIdentifier(); - - /** - * Compute the centroid by averaging the pointTotals - * - * @return the new centroid - */ - public Vector computeCentroid() { - return getS0() == 0 ? getCenter() : getS1().divide(getS0()); - } - - /** - * Return a human-readable formatted string representation of the vector, not - * intended to be complete nor usable as an input/output representation - */ - public static String formatVector(Vector v, String[] bindings) { - StringBuilder buf = new StringBuilder(); - if (v instanceof NamedVector) { - buf.append(((NamedVector) v).getName()).append(" = "); - } - int nzero = 0; - Iterator iterateNonZero = v.iterateNonZero(); - while (iterateNonZero.hasNext()) { - iterateNonZero.next(); - nzero++; - } - // if vector is sparse or if we have bindings, use sparse notation - if (nzero < v.size() || bindings != null) { - buf.append('['); - for (int i = 0; i < v.size(); i++) { - double elem = v.get(i); - if (elem == 0.0) { - continue; - } - String label; - if (bindings != null && (label = bindings[i]) != null) { - buf.append(label).append(':'); - } else { - buf.append(i).append(':'); - } - buf.append(String.format(Locale.ENGLISH, "%.3f", elem)).append(", "); - } - } else { - buf.append('['); - for (int i = 0; i < v.size(); i++) { - double elem = v.get(i); - buf.append(String.format(Locale.ENGLISH, "%.3f", elem)).append(", "); - } - } - if (buf.length() > 1) { - buf.setLength(buf.length() - 2); - } - buf.append(']'); - return buf.toString(); - } - - @Override - public boolean isConverged() { - // Convergence has no meaning yet, perhaps in subclasses - return false; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/Cluster.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/Cluster.java deleted file mode 100644 index 2f8d4ddfc..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/Cluster.java +++ /dev/null @@ -1,81 +0,0 @@ -/* Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.clustering; - -import org.apache.mahout.common.parameters.Parametered; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -/** - * Implementations of this interface have a printable representation and certain - * attributes that are common across all clustering implementations - * - */ -public interface Cluster extends Model<VectorWritable>, Parametered { - - // default directory for all clustered points - String CLUSTERED_POINTS_DIR = "clusteredPoints"; - - // default directory for initial clusters to prime iterative clustering - // algorithms - String INITIAL_CLUSTERS_DIR = "clusters-0"; - - // default directory for output of clusters per iteration - String CLUSTERS_DIR = "clusters-"; - - // default suffix for output of clusters for final iteration - String FINAL_ITERATION_SUFFIX = "-final"; - - /** - * Get the id of the Cluster - * - * @return a unique integer - */ - int getId(); - - /** - * Get the "center" of the Cluster as a Vector - * - * @return a Vector - */ - Vector getCenter(); - - /** - * Get the "radius" of the Cluster as a Vector. Usually the radius is the - * standard deviation expressed as a Vector of size equal to the center. Some - * clusters may return zero values if not appropriate. - * - * @return a Vector - */ - Vector getRadius(); - - /** - * Produce a custom, human-friendly, printable representation of the Cluster. - * - * @param bindings - * an optional String[] containing labels used to format the primary - * Vector/s of this implementation. - * @return a String - */ - String asFormatString(String[] bindings); - - /** - * @return if the receiver has converged, or false if that has no meaning for - * the implementation - */ - boolean isConverged(); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/ClusterObservations.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/ClusterObservations.java deleted file mode 100644 index aa8ce8144..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/ClusterObservations.java +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -public class ClusterObservations implements Writable { - - private int combinerState; - - private double s0; - - private Vector s1; - - private Vector s2; - - public ClusterObservations(double s0, Vector s1, Vector s2) { - this.s0 = s0; - this.s1 = s1; - this.s2 = s2; - } - - public ClusterObservations(int combinerState, double s0, Vector s1, Vector s2) { - this.combinerState = combinerState; - this.s0 = s0; - this.s1 = s1; - this.s2 = s2; - } - - public ClusterObservations() { - } - - @Override - public void readFields(DataInput in) throws IOException { - this.combinerState = in.readInt(); - this.s0 = in.readDouble(); - VectorWritable temp = new VectorWritable(); - temp.readFields(in); - this.s1 = temp.get(); - temp.readFields(in); - this.s2 = temp.get(); - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(combinerState); - out.writeDouble(s0); - VectorWritable.writeVector(out, s1); - VectorWritable.writeVector(out, s2); - } - - /** - * @return the combinerState - */ - public int getCombinerState() { - return combinerState; - } - - /** - * @return the s0 - */ - public double getS0() { - return s0; - } - - /** - * @return the s1 - */ - public Vector getS1() { - return s1; - } - - /** - * @return the s2 - */ - public Vector getS2() { - return s2; - } - - @Override - public String toString() { - StringBuilder buf = new StringBuilder(50); - buf.append("co{s0=").append(s0).append(" s1="); - if (s1 != null) { - buf.append(AbstractCluster.formatVector(s1, null)); - } - buf.append(" s2="); - if (s2 != null) { - buf.append(AbstractCluster.formatVector(s2, null)); - } - buf.append('}'); - return buf.toString(); - } - - public ClusterObservations incrementCombinerState() { - combinerState++; - return this; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java deleted file mode 100644 index c25e03961..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java +++ /dev/null @@ -1,62 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering; - -import org.apache.mahout.math.Vector; - -public interface GaussianAccumulator { - - /** - * @return the number of observations - */ - double getN(); - - /** - * @return the mean of the observations - */ - Vector getMean(); - - /** - * @return the std of the observations - */ - Vector getStd(); - - /** - * @return the average of the vector std elements - */ - double getAverageStd(); - - /** - * @return the variance of the observations - */ - Vector getVariance(); - - /** - * Observe the vector - * - * @param x a Vector - * @param weight the double observation weight (usually 1.0) - */ - void observe(Vector x, double weight); - - /** - * Compute the mean, variance and standard deviation - */ - void compute(); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/Model.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/Model.java deleted file mode 100644 index 79dab3090..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/Model.java +++ /dev/null @@ -1,93 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.math.VectorWritable; - -/** - * A model is a probability distribution over observed data points and allows - * the probability of any data point to be computed. 
All Models have a - * persistent representation and extend - * Writable. - */ -public interface Model<O> extends Writable { - - /** - * Return the probability that the observation is described by this model - * - * @param x - * an Observation from the posterior - * @return the probability that x is in the receiver - */ - double pdf(O x); - - /** - * Observe the given observation, retaining information about it - * - * @param x - * an Observation from the posterior - */ - void observe(O x); - - /** - * Observe the given observation, retaining information about it - * - * @param x - * an Observation from the posterior - * @param weight - * a double weighting factor - */ - void observe(O x, double weight); - - /** - * Observe the given model, retaining information about its observations - * - * @param x - * a Model<O> - */ - void observe(Model<O> x); - - /** - * Compute a new set of posterior parameters based upon the Observations that - * have been observed since my creation - */ - void computeParameters(); - - /** - * Return the number of observations that this model has seen since its - * parameters were last computed - * - * @return a long - */ - long getNumObservations(); - - /** - * Return the number of observations that this model has seen over its - * lifetime - * - * @return a long - */ - long getTotalObservations(); - - /** - * @return a sample of my posterior model - */ - Model<O> sampleFromPosterior(); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/ModelDistribution.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/ModelDistribution.java deleted file mode 100644 index d77bf40c1..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/ModelDistribution.java +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering; - -/** A model distribution allows us to sample a model from its prior distribution. 
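Taken together, Model and the ModelDistribution interface that follows support a sample, observe, resample loop of the kind Mahout's Dirichlet-style clustering uses. A hedged sketch built only from the methods declared in these two interfaces; the method name, the distribution instance, the point source, and the loop bounds are all hypothetical:

    import org.apache.mahout.clustering.Model;
    import org.apache.mahout.clustering.ModelDistribution;
    import org.apache.mahout.math.VectorWritable;

    public class ModelLoopSketch {
      static Model<VectorWritable>[] fit(ModelDistribution<VectorWritable> distribution,
                                         Iterable<VectorWritable> points,
                                         int k, int maxIterations) {
        Model<VectorWritable>[] models = distribution.sampleFromPrior(k);
        for (int iter = 0; iter < maxIterations; iter++) {
          for (VectorWritable point : points) {
            Model<VectorWritable> best = null;           // assign to the most probable model
            for (Model<VectorWritable> m : models) {
              if (best == null || m.pdf(point) > best.pdf(point)) {
                best = m;
              }
            }
            best.observe(point);
          }
          for (Model<VectorWritable> m : models) {
            m.computeParameters();                        // fold in this pass's observations
          }
          models = distribution.sampleFromPosterior(models);
        }
        return models;
      }
    }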
 */ -public interface ModelDistribution<O> { - - /** - * Return a list of models sampled from the prior - * - * @param howMany - * the int number of models to return - * @return a Model<O>[] representing what is known a priori - */ - Model<O>[] sampleFromPrior(int howMany); - - /** - * Return a list of models sampled from the posterior - * - * @param posterior - * the Model<O>[] after observations - * @return a Model<O>[] representing what is known a posteriori - */ - Model<O>[] sampleFromPosterior(Model<O>[] posterior); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java deleted file mode 100644 index b76e00fe0..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.clustering; - -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.function.SquareRootFunction; - -/** - * An online Gaussian statistics accumulator based upon Knuth (who cites Welford) which is declared to be - * numerically-stable. 
See http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - */ -public class OnlineGaussianAccumulator implements GaussianAccumulator { - - private double sumWeight; - private Vector mean; - private Vector s; - private Vector variance; - - @Override - public double getN() { - return sumWeight; - } - - @Override - public Vector getMean() { - return mean; - } - - @Override - public Vector getStd() { - return variance.clone().assign(new SquareRootFunction()); - } - - /* from Wikipedia: http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - * - * Weighted incremental algorithm - * - * def weighted_incremental_variance(dataWeightPairs): - * mean = 0 - * S = 0 - * sumweight = 0 - * for x, weight in dataWeightPairs: # Alternately "for x in zip(data, weight):" - * temp = weight + sumweight - * Q = x - mean - * R = Q * weight / temp - * S = S + sumweight * Q * R - * mean = mean + R - * sumweight = temp - * Variance = S / (sumweight-1) # if sample is the population, omit -1 - * return Variance - */ - @Override - public void observe(Vector x, double weight) { - double temp = weight + sumWeight; - Vector q; - if (mean == null) { - mean = x.like(); - q = x.clone(); - } else { - q = x.minus(mean); - } - Vector r = q.times(weight).divide(temp); - if (s == null) { - s = q.times(sumWeight).times(r); - } else { - s = s.plus(q.times(sumWeight).times(r)); - } - mean = mean.plus(r); - sumWeight = temp; - variance = s.divide(sumWeight - 1); // # if sample is the population, omit -1 - } - - @Override - public void compute() { - // nothing to do here! - } - - @Override - public double getAverageStd() { - if (sumWeight == 0.0) { - return 0.0; - } else { - Vector std = getStd(); - return std.zSum() / std.size(); - } - } - - @Override - public Vector getVariance() { - return variance; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java deleted file mode 100644 index 138e83037..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.mahout.clustering; - -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.function.Functions; -import org.apache.mahout.math.function.SquareRootFunction; - -/** - * An online Gaussian accumulator that uses a running power sums approach as reported - * on http://en.wikipedia.org/wiki/Standard_deviation - * Suffers from overflow, underflow and roundoff error but has minimal observe-time overhead - */ -public class RunningSumsGaussianAccumulator implements GaussianAccumulator { - - private double s0; - private Vector s1; - private Vector s2; - private Vector mean; - private Vector std; - - @Override - public double getN() { - return s0; - } - - @Override - public Vector getMean() { - return mean; - } - - @Override - public Vector getStd() { - return std; - } - - @Override - public double getAverageStd() { - if (s0 == 0.0) { - return 0.0; - } else { - return std.zSum() / std.size(); - } - } - - @Override - public Vector getVariance() { - return std.times(std); - } - - @Override - public void observe(Vector x, double weight) { - s0 += weight; - Vector weightedX = x.times(weight); - if (s1 == null) { - s1 = weightedX; - } else { - s1.assign(weightedX, Functions.PLUS); - } - Vector x2 = x.times(x).times(weight); - if (s2 == null) { - s2 = x2; - } else { - s2.assign(x2, Functions.PLUS); - } - } - - @Override - public void compute() { - if (s0 != 0.0) { - mean = s1.divide(s0); - std = s2.times(s0).minus(s1.times(s1)).assign(new SquareRootFunction()).divide(s0); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java deleted file mode 100644 index 903875011..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.canopy; - -import org.apache.mahout.clustering.iterator.DistanceMeasureCluster; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.Vector; - -/** - * This class models a canopy as a center point, the number of points that are contained within it according - * to the application of some distance metric, and a point total which is the sum of all the points and is - * used to compute the centroid when needed. 
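That sums-based centroid is the same S0/S1/S2 bookkeeping used by RunningSumsGaussianAccumulator above and by AbstractCluster. It is easiest to see with concrete numbers; a small worked example (values chosen by hand, class name hypothetical):

    import org.apache.mahout.clustering.RunningSumsGaussianAccumulator;
    import org.apache.mahout.math.DenseVector;

    public class RunningSumsDemo {
      public static void main(String[] args) {
        RunningSumsGaussianAccumulator acc = new RunningSumsGaussianAccumulator();
        acc.observe(new DenseVector(new double[] {1, 2}), 1.0);
        acc.observe(new DenseVector(new double[] {3, 6}), 1.0);
        acc.compute();
        // s0 = 2, s1 = (4, 8), s2 = (10, 40)
        // mean = s1 / s0 = (2, 4)
        // std = sqrt(s0 * s2 - s1 * s1) / s0 = (1, 2), the population std per component
        System.out.println(acc.getMean() + " / " + acc.getStd());
      }
    }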
- */ -public class Canopy extends DistanceMeasureCluster { - - /** Used for deserialization as a writable */ - public Canopy() { } - - /** - * Create a new Canopy containing the given point and canopyId - * - * @param center a point in vector space - * @param canopyId an int identifying the canopy local to this process only - * @param measure a DistanceMeasure to use - */ - public Canopy(Vector center, int canopyId, DistanceMeasure measure) { - super(center, canopyId, measure); - observe(center); - } - - public String asFormatString() { - return "C" + this.getId() + ": " + this.computeCentroid().asFormatString(); - } - - @Override - public String toString() { - return getIdentifier() + ": " + getCenter().asFormatString(); - } - - @Override - public String getIdentifier() { - return "C-" + getId(); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java deleted file mode 100644 index bf26814b9..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java +++ /dev/null @@ -1,261 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.canopy; - -import java.util.Collection; -import java.util.Iterator; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.mahout.clustering.AbstractCluster; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.Vector; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.collect.Lists; - -public class CanopyClusterer { - - private static final Logger log = LoggerFactory.getLogger(CanopyClusterer.class); - - private int nextCanopyId; - - // the T1 distance threshold - private double t1; - - // the T2 distance threshold - private double t2; - - // the T3 distance threshold - private double t3; - - // the T4 distance threshold - private double t4; - - // the distance measure - private DistanceMeasure measure; - - public CanopyClusterer(DistanceMeasure measure, double t1, double t2) { - this.t1 = t1; - this.t2 = t2; - this.t3 = t1; - this.t4 = t2; - this.measure = measure; - } - - public double getT1() { - return t1; - } - - public double getT2() { - return t2; - } - - public double getT3() { - return t3; - } - - public double getT4() { - return t4; - } - - public CanopyClusterer(Configuration config) { - this.configure(config); - } - - /** - * Configure the Canopy and its distance measure - * - * @param configuration - * the Configuration - */ - public void configure(Configuration configuration) { - measure = ClassUtils.instantiateAs(configuration.get(CanopyConfigKeys.DISTANCE_MEASURE_KEY), - DistanceMeasure.class); - measure.configure(configuration); - t1 = Double.parseDouble(configuration.get(CanopyConfigKeys.T1_KEY)); - t2 = Double.parseDouble(configuration.get(CanopyConfigKeys.T2_KEY)); - t3 = t1; - String d = configuration.get(CanopyConfigKeys.T3_KEY); - if (d != null) { - t3 = Double.parseDouble(d); - } - t4 = t2; - d = configuration.get(CanopyConfigKeys.T4_KEY); - if (d != null) { - t4 = Double.parseDouble(d); - } - nextCanopyId = 0; - } - - /** - * Used by CanopyReducer to set t1=t3 and t2=t4 configuration values - */ - public void useT3T4() { - t1 = t3; - t2 = t4; - } - - /** - * Configure the Canopy for unit tests - * - * @param aMeasure - * the DistanceMeasure - * @param aT1 - * the T1 distance threshold - * @param aT2 - * the T2 distance threshold - * */ - public void config(DistanceMeasure aMeasure, double aT1, double aT2) { - measure = aMeasure; - t1 = aT1; - t2 = aT2; - t3 = t1; - t4 = t2; - } - - /** - * This is the same algorithm as the reference but inverted to iterate over - * existing canopies instead of the points. Because of this it does not need - * to actually store the points, instead storing a total points vector and - * the number of points. From this a centroid can be computed. - *

- * This method is used by the CanopyMapper, CanopyReducer and CanopyDriver. - * - * @param point - * the point to be added - * @param canopies - * the List<Canopy> to be appended - */ - public void addPointToCanopies(Vector point, Collection<Canopy> canopies) { - boolean pointStronglyBound = false; - for (Canopy canopy : canopies) { - double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point); - if (dist < t1) { - if (log.isDebugEnabled()) { - log.debug("Added point: {} to canopy: {}", AbstractCluster.formatVector(point, null), canopy.getIdentifier()); - } - canopy.observe(point); - } - pointStronglyBound = pointStronglyBound || dist < t2; - } - if (!pointStronglyBound) { - if (log.isDebugEnabled()) { - log.debug("Created new Canopy:{} at center:{}", nextCanopyId, AbstractCluster.formatVector(point, null)); - } - canopies.add(new Canopy(point, nextCanopyId++, measure)); - } - } - - /** - * Return if the point is covered by the canopy - * - * @param point - * a point - * @return if the point is covered - */ - public boolean canopyCovers(Canopy canopy, Vector point) { - return measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point) < t1; - } - - /** - * Iterate through the points, adding new canopies. Return the canopies. - * - * @param points - * a list defining the points to be clustered - * @param measure - * a DistanceMeasure to use - * @param t1 - * the T1 distance threshold - * @param t2 - * the T2 distance threshold - * @return the List<Canopy> created - */ - public static List<Canopy> createCanopies(List<Vector> points, - DistanceMeasure measure, - double t1, - double t2) { - List<Canopy> canopies = Lists.newArrayList(); - /** - * Reference Implementation: Given a distance metric, one can create - * canopies as follows: Start with a list of the data points in any - * order, and with two distance thresholds, T1 and T2, where T1 > T2. - * (These thresholds can be set by the user, or selected by - * cross-validation.) Pick a point on the list and measure its distance - * to all other points. Put all points that are within distance - * threshold T1 into a canopy. Remove from the list all points that are - * within distance threshold T2. Repeat until the list is empty. 
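A hedged sketch of driving the implementation that follows on a handful of one-dimensional points; the thresholds and values are invented for the illustration, and the driver class name is hypothetical:

    import java.util.List;
    import com.google.common.collect.Lists;
    import org.apache.mahout.clustering.canopy.Canopy;
    import org.apache.mahout.clustering.canopy.CanopyClusterer;
    import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;

    public class CanopyReferenceDemo {
      public static void main(String[] args) {
        List<Vector> points = Lists.newArrayList();
        for (double d : new double[] {0.0, 0.5, 3.0, 3.2, 9.0}) {
          points.add(new DenseVector(new double[] {d}));
        }
        // T1 = 2.0 (loose membership) > T2 = 0.6 (removal); this ordering yields three canopies
        List<Canopy> canopies =
            CanopyClusterer.createCanopies(points, new EuclideanDistanceMeasure(), 2.0, 0.6);
        for (Canopy c : canopies) {
          System.out.println(c.asFormatString());  // note: createCanopies consumes the input list
        }
      }
    }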
- */ - int nextCanopyId = 0; - while (!points.isEmpty()) { - Iterator ptIter = points.iterator(); - Vector p1 = ptIter.next(); - ptIter.remove(); - Canopy canopy = new Canopy(p1, nextCanopyId++, measure); - canopies.add(canopy); - while (ptIter.hasNext()) { - Vector p2 = ptIter.next(); - double dist = measure.distance(p1, p2); - // Put all points that are within distance threshold T1 into the - // canopy - if (dist < t1) { - canopy.observe(p2); - } - // Remove from the list all points that are within distance - // threshold T2 - if (dist < t2) { - ptIter.remove(); - } - } - for (Canopy c : canopies) { - c.computeParameters(); - } - } - return canopies; - } - - /** - * Iterate through the canopies, adding their centroids to a list - * - * @param canopies - * a List - * @return the List - */ - public static List getCenters(Iterable canopies) { - List result = Lists.newArrayList(); - for (Canopy canopy : canopies) { - result.add(canopy.getCenter()); - } - return result; - } - - /** - * Iterate through the canopies, resetting their center to their centroids - * - * @param canopies - * a List - */ - public static void updateCentroids(Iterable canopies) { - for (Canopy canopy : canopies) { - canopy.computeParameters(); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java deleted file mode 100644 index a5ec19359..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java +++ /dev/null @@ -1,37 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.canopy; - -public interface CanopyConfigKeys { - - String T1_KEY = "org.apache.mahout.clustering.canopy.t1"; - - String CANOPY_PATH_KEY = "org.apache.mahout.clustering.canopy.path"; - - String T2_KEY = "org.apache.mahout.clustering.canopy.t2"; - - String T3_KEY = "org.apache.mahout.clustering.canopy.t3"; - - String T4_KEY = "org.apache.mahout.clustering.canopy.t4"; - - // keys used by Driver, Mapper, Combiner & Reducer - String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure"; - - String CF_KEY = "org.apache.mahout.clustering.canopy.canopyFilter"; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java deleted file mode 100644 index ccc204ec2..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java +++ /dev/null @@ -1,376 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.canopy; - -import static org.apache.mahout.clustering.topdown.PathDirectory.CLUSTERED_POINTS_DIRECTORY; - -import java.io.IOException; -import java.util.Collection; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.clustering.AbstractCluster; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.ClusterClassificationDriver; -import org.apache.mahout.clustering.classify.ClusterClassifier; -import org.apache.mahout.clustering.iterator.CanopyClusteringPolicy; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.collect.Lists; -import com.google.common.io.Closeables; - -public class CanopyDriver extends AbstractJob { - - public static final String DEFAULT_CLUSTERED_POINTS_DIRECTORY = "clusteredPoints"; - - private static final Logger log = LoggerFactory.getLogger(CanopyDriver.class); - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new CanopyDriver(), args); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.distanceMeasureOption().create()); - addOption(DefaultOptionCreator.t1Option().create()); - addOption(DefaultOptionCreator.t2Option().create()); - addOption(DefaultOptionCreator.t3Option().create()); - addOption(DefaultOptionCreator.t4Option().create()); - addOption(DefaultOptionCreator.clusterFilterOption().create()); - addOption(DefaultOptionCreator.overwriteOption().create()); - addOption(DefaultOptionCreator.clusteringOption().create()); - addOption(DefaultOptionCreator.methodOption().create()); - addOption(DefaultOptionCreator.outlierThresholdOption().create()); - - if (parseArguments(args) == null) { - return -1; - } - - Path input = getInputPath(); - Path output = getOutputPath(); - Configuration conf = getConf(); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(conf, output); - } - String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); - double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION)); - double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION)); - double t3 = t1; - if (hasOption(DefaultOptionCreator.T3_OPTION)) { - t3 = Double.parseDouble(getOption(DefaultOptionCreator.T3_OPTION)); - } - double t4 = t2; - if 
(hasOption(DefaultOptionCreator.T4_OPTION)) { - t4 = Double.parseDouble(getOption(DefaultOptionCreator.T4_OPTION)); - } - int clusterFilter = 0; - if (hasOption(DefaultOptionCreator.CLUSTER_FILTER_OPTION)) { - clusterFilter = Integer - .parseInt(getOption(DefaultOptionCreator.CLUSTER_FILTER_OPTION)); - } - boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION); - boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION) - .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD); - DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); - double clusterClassificationThreshold = 0.0; - if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) { - clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD)); - } - run(conf, input, output, measure, t1, t2, t3, t4, clusterFilter, - runClustering, clusterClassificationThreshold, runSequential ); - return 0; - } - - /** - * Build a directory of Canopy clusters from the input arguments and, if - * requested, cluster the input vectors using these clusters - * - * @param conf - * the Configuration - * @param input - * the Path to the directory containing input vectors - * @param output - * the Path for all output directories - * @param measure - * the DistanceMeasure - * @param t1 - * the double T1 distance metric - * @param t2 - * the double T2 distance metric - * @param t3 - * the reducer's double T1 distance metric - * @param t4 - * the reducer's double T2 distance metric - * @param clusterFilter - * the minimum canopy size output by the mappers - * @param runClustering - * cluster the input vectors if true - * @param clusterClassificationThreshold - * vectors having pdf below this value will not be clustered. Its value should be between 0 and 1. - * @param runSequential - * execute sequentially if true - */ - public static void run(Configuration conf, Path input, Path output, - DistanceMeasure measure, double t1, double t2, double t3, double t4, - int clusterFilter, boolean runClustering, double clusterClassificationThreshold, boolean runSequential) - throws IOException, InterruptedException, ClassNotFoundException { - Path clustersOut = buildClusters(conf, input, output, measure, t1, t2, t3, - t4, clusterFilter, runSequential); - if (runClustering) { - clusterData(conf, input, clustersOut, output, clusterClassificationThreshold, runSequential); - } - } - - /** - * Convenience method to provide backward compatibility - * @param clusterClassificationThreshold TODO - */ - public static void run(Configuration conf, Path input, Path output, - DistanceMeasure measure, double t1, double t2, boolean runClustering, - double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException, - ClassNotFoundException { - run(conf, input, output, measure, t1, t2, t1, t2, 0, runClustering, - clusterClassificationThreshold, runSequential); - } - - /** - * Convenience method creates new Configuration() Build a directory of Canopy - * clusters from the input arguments and, if requested, cluster the input - * vectors using these clusters - * - * @param input - * the Path to the directory containing input vectors - * @param output - * the Path for all output directories - * @param t1 - * the double T1 distance metric - * @param t2 - * the double T2 distance metric - * @param runClustering - * cluster the input vectors if true - * @param clusterClassificationThreshold - * vectors having pdf below this value will not be clustered. 
Its value should be between 0 and 1. - * @param runSequential - * execute sequentially if true - */ - public static void run(Path input, Path output, DistanceMeasure measure, - double t1, double t2, boolean runClustering, double clusterClassificationThreshold, boolean runSequential) - throws IOException, InterruptedException, ClassNotFoundException { - run(new Configuration(), input, output, measure, t1, t2, runClustering, - clusterClassificationThreshold, runSequential); - } - - /** - * Convenience method for backwards compatibility - * - */ - public static Path buildClusters(Configuration conf, Path input, Path output, - DistanceMeasure measure, double t1, double t2, int clusterFilter, - boolean runSequential) throws IOException, InterruptedException, - ClassNotFoundException { - return buildClusters(conf, input, output, measure, t1, t2, t1, t2, - clusterFilter, runSequential); - } - - /** - * Build a directory of Canopy clusters from the input vectors and other - * arguments. Run sequential or mapreduce execution as requested - * - * @param conf - * the Configuration to use - * @param input - * the Path to the directory containing input vectors - * @param output - * the Path for all output directories - * @param measure - * the DistanceMeasure - * @param t1 - * the double T1 distance metric - * @param t2 - * the double T2 distance metric - * @param t3 - * the reducer's double T1 distance metric - * @param t4 - * the reducer's double T2 distance metric - * @param clusterFilter - * the int minimum size of canopies produced - * @param runSequential - * a boolean indicates to run the sequential (reference) algorithm - * @return the canopy output directory Path - */ - public static Path buildClusters(Configuration conf, Path input, Path output, - DistanceMeasure measure, double t1, double t2, double t3, double t4, - int clusterFilter, boolean runSequential) throws IOException, - InterruptedException, ClassNotFoundException { - log.info("Build Clusters Input: {} Out: {} Measure: {} t1: {} t2: {}", - new Object[] { input, output, measure, t1, t2 }); - if (runSequential) { - return buildClustersSeq(input, output, measure, t1, t2, clusterFilter); - } else { - return buildClustersMR(conf, input, output, measure, t1, t2, t3, t4, - clusterFilter); - } - } - - /** - * Build a directory of Canopy clusters from the input vectors and other - * arguments. 
Run sequential execution - * - * @param input - * the Path to the directory containing input vectors - * @param output - * the Path for all output directories - * @param measure - * the DistanceMeasure - * @param t1 - * the double T1 distance metric - * @param t2 - * the double T2 distance metric - * @param clusterFilter - * the int minimum size of canopies produced - * @return the canopy output directory Path - */ - private static Path buildClustersSeq(Path input, Path output, - DistanceMeasure measure, double t1, double t2, int clusterFilter) - throws IOException { - CanopyClusterer clusterer = new CanopyClusterer(measure, t1, t2); - Collection canopies = Lists.newArrayList(); - Configuration conf = new Configuration(); - FileSystem fs = FileSystem.get(input.toUri(), conf); - - for (VectorWritable vw : new SequenceFileDirValueIterable( - input, PathType.LIST, PathFilters.logsCRCFilter(), conf)) { - clusterer.addPointToCanopies(vw.get(), canopies); - } - - Path canopyOutputDir = new Path(output, Cluster.CLUSTERS_DIR + '0'+ Cluster.FINAL_ITERATION_SUFFIX); - Path path = new Path(canopyOutputDir, "part-r-00000"); - SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, - Text.class, ClusterWritable.class); - ClusterWritable clusterWritable = new ClusterWritable(); - try { - for (Canopy canopy : canopies) { - canopy.computeParameters(); - if (log.isDebugEnabled()) { - log.debug("Writing Canopy:{} center:{} numPoints:{} radius:{}", - new Object[] { canopy.getIdentifier(), - AbstractCluster.formatVector(canopy.getCenter(), null), - canopy.getNumObservations(), - AbstractCluster.formatVector(canopy.getRadius(), null) }); - } - if (canopy.getNumObservations() > clusterFilter) { - clusterWritable.setValue(canopy); - writer.append(new Text(canopy.getIdentifier()), clusterWritable); - } - } - } finally { - Closeables.closeQuietly(writer); - } - return canopyOutputDir; - } - - /** - * Build a directory of Canopy clusters from the input vectors and other - * arguments. 
Run mapreduce execution - * - * @param conf - * the Configuration - * @param input - * the Path to the directory containing input vectors - * @param output - * the Path for all output directories - * @param measure - * the DistanceMeasure - * @param t1 - * the double T1 distance metric - * @param t2 - * the double T2 distance metric - * @param t3 - * the reducer's double T1 distance metric - * @param t4 - * the reducer's double T2 distance metric - * @param clusterFilter - * the int minimum size of canopies produced - * @return the canopy output directory Path - */ - private static Path buildClustersMR(Configuration conf, Path input, - Path output, DistanceMeasure measure, double t1, double t2, double t3, - double t4, int clusterFilter) throws IOException, InterruptedException, - ClassNotFoundException { - conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass() - .getName()); - conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1)); - conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2)); - conf.set(CanopyConfigKeys.T3_KEY, String.valueOf(t3)); - conf.set(CanopyConfigKeys.T4_KEY, String.valueOf(t4)); - conf.set(CanopyConfigKeys.CF_KEY, String.valueOf(clusterFilter)); - - Job job = new Job(conf, "Canopy Driver running buildClusters over input: " - + input); - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setMapperClass(CanopyMapper.class); - job.setMapOutputKeyClass(Text.class); - job.setMapOutputValueClass(VectorWritable.class); - job.setReducerClass(CanopyReducer.class); - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(ClusterWritable.class); - job.setNumReduceTasks(1); - job.setJarByClass(CanopyDriver.class); - - FileInputFormat.addInputPath(job, input); - Path canopyOutputDir = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX); - FileOutputFormat.setOutputPath(job, canopyOutputDir); - if (!job.waitForCompletion(true)) { - throw new InterruptedException("Canopy Job failed processing " + input); - } - return canopyOutputDir; - } - - private static void clusterData(Configuration conf, Path points, Path canopies, Path output, - double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException, - ClassNotFoundException { - ClusterClassifier.writePolicy(new CanopyClusteringPolicy(), canopies); - ClusterClassificationDriver.run(points, output, new Path(output, CLUSTERED_POINTS_DIRECTORY), - clusterClassificationThreshold, true, runSequential); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java deleted file mode 100644 index 52fe8651c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java +++ /dev/null @@ -1,65 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.canopy;
-
-import java.io.IOException;
-import java.util.Collection;
-
-import com.google.common.collect.Lists;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.VectorWritable;
-
-class CanopyMapper extends
-    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable> {
-
-  private final Collection<Canopy> canopies = Lists.newArrayList();
-
-  private CanopyClusterer canopyClusterer;
-
-  private int clusterFilter;
-
-  @Override
-  protected void map(WritableComparable<?> key, VectorWritable point,
-      Context context) throws IOException, InterruptedException {
-    canopyClusterer.addPointToCanopies(point.get(), canopies);
-  }
-
-  @Override
-  protected void setup(Context context) throws IOException,
-      InterruptedException {
-    super.setup(context);
-    canopyClusterer = new CanopyClusterer(context.getConfiguration());
-    clusterFilter = Integer.parseInt(context.getConfiguration().get(
-        CanopyConfigKeys.CF_KEY));
-  }
-
-  @Override
-  protected void cleanup(Context context) throws IOException,
-      InterruptedException {
-    for (Canopy canopy : canopies) {
-      canopy.computeParameters();
-      if (canopy.getNumObservations() > clusterFilter) {
-        context.write(new Text("centroid"), new VectorWritable(canopy
-            .getCenter()));
-      }
-    }
-    super.cleanup(context);
-  }
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java
deleted file mode 100644
index 6487601f9..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
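Taken together, the mapper above and the reducer that follows implement a two-stage scheme: each map task builds canopies locally with T1/T2 and, in cleanup(), emits only the surviving centers under the single key "centroid"; the lone reducer then re-clusters those centers with the reducer-side thresholds T3/T4 (its setup() calls useT3T4()). A sequential sketch of that dataflow, where measure, t1..t4 and points stand in for the configured values:

// Editorial sketch of the map/reduce canopy dataflow; measure, t1..t4 and
// points are assumed to be in scope.
Collection<Canopy> mapperCanopies = Lists.newArrayList();
CanopyClusterer mapSide = new CanopyClusterer(measure, t1, t2);
for (Vector point : points) {
  mapSide.addPointToCanopies(point, mapperCanopies);            // local canopies per map task
}
Collection<Canopy> finalCanopies = Lists.newArrayList();
CanopyClusterer reduceSide = new CanopyClusterer(measure, t3, t4);
for (Canopy c : mapperCanopies) {
  c.computeParameters();
  reduceSide.addPointToCanopies(c.getCenter(), finalCanopies);  // the reducer sees centers only
}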
- */ - -package org.apache.mahout.clustering.canopy; - -import java.io.IOException; -import java.util.Collection; - -import com.google.common.collect.Lists; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -public class CanopyReducer extends Reducer { - - private final Collection canopies = Lists.newArrayList(); - - private CanopyClusterer canopyClusterer; - - private int clusterFilter; - - CanopyClusterer getCanopyClusterer() { - return canopyClusterer; - } - - @Override - protected void reduce(Text arg0, Iterable values, - Context context) throws IOException, InterruptedException { - for (VectorWritable value : values) { - Vector point = value.get(); - canopyClusterer.addPointToCanopies(point, canopies); - } - for (Canopy canopy : canopies) { - ClusterWritable clusterWritable = new ClusterWritable(); - canopy.computeParameters(); - if (canopy.getNumObservations() > clusterFilter) { - clusterWritable.setValue(canopy); - context.write(new Text(canopy.getIdentifier()), clusterWritable); - } - } - } - - @Override - protected void setup(Context context) throws IOException, - InterruptedException { - super.setup(context); - canopyClusterer = new CanopyClusterer(context.getConfiguration()); - canopyClusterer.useT3T4(); - clusterFilter = Integer.parseInt(context.getConfiguration().get( - CanopyConfigKeys.CF_KEY)); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java deleted file mode 100644 index af94f3aa7..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.classify; - -/** - * Constants used in Cluster Classification. 
- */ -public class ClusterClassificationConfigKeys { - - public static final String CLUSTERS_IN = "clusters_in"; - - public static final String OUTLIER_REMOVAL_THRESHOLD = "pdf_threshold"; - - public static final String EMIT_MOST_LIKELY = "emit_most_likely"; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java deleted file mode 100644 index 4f33bfb8a..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java +++ /dev/null @@ -1,291 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
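The three keys defined above are the contract between ClusterClassificationDriver and ClusterClassificationMapper later in this diff; the driver serializes its options into the job Configuration roughly as below (the threshold value and clustersIn path are placeholders):

// Editorial sketch; mirrors classifyClusterMR further down.
Configuration conf = new Configuration();
Path clustersIn = new Path("clusters");                                           // placeholder
conf.set(ClusterClassificationConfigKeys.CLUSTERS_IN, clustersIn.toUri().toString());
conf.setFloat(ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD, 0.8f);   // pdf cutoff
conf.setBoolean(ClusterClassificationConfigKeys.EMIT_MOST_LIKELY, true);          // hard assignment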
- */ - -package org.apache.mahout.clustering.classify; - -import static org.apache.mahout.clustering.classify.ClusterClassificationConfigKeys.CLUSTERS_IN; -import static org.apache.mahout.clustering.classify.ClusterClassificationConfigKeys.EMIT_MOST_LIKELY; -import static org.apache.mahout.clustering.classify.ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.clustering.iterator.ClusteringPolicy; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.Vector.Element; -import org.apache.mahout.math.VectorWritable; - -/** - * Classifies the vectors into different clusters found by the clustering - * algorithm. - */ -public class ClusterClassificationDriver extends AbstractJob { - - /** - * CLI to run Cluster Classification Driver. - */ - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.methodOption().create()); - addOption(DefaultOptionCreator.clustersInOption() - .withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy.") - .create()); - - if (parseArguments(args) == null) { - return -1; - } - - Path input = getInputPath(); - Path output = getOutputPath(); - - if (getConf() == null) { - setConf(new Configuration()); - } - Path clustersIn = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION)); - boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase( - DefaultOptionCreator.SEQUENTIAL_METHOD); - - double clusterClassificationThreshold = 0.0; - if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) { - clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD)); - } - - run(input, clustersIn, output, clusterClassificationThreshold, true, runSequential); - - return 0; - } - - /** - * Constructor to be used by the ToolRunner. - */ - private ClusterClassificationDriver() {} - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new ClusterClassificationDriver(), args); - } - - /** - * Uses {@link ClusterClassifier} to classify input vectors into their - * respective clusters. 
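Besides the CLI path above, the static run(...) documented next can be invoked directly. A hedged sketch with placeholder paths; a threshold of 0.0 keeps every vector, and the boolean flags select most-likely emission and sequential execution:

// Editorial sketch; placeholder paths.
Path input = new Path("vectors");             // SequenceFile(s) of VectorWritable
Path clusteringOutput = new Path("clusters"); // contains a clusters-*-final directory
Path output = new Path("classified");
ClusterClassificationDriver.run(input, clusteringOutput, output,
    0.0,    // clusterClassificationThreshold: disable outlier removal
    true,   // emitMostLikely: one cluster per vector
    true);  // runSequential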
-   *
-   * @param input
-   *          the input vectors
-   * @param clusteringOutputPath
-   *          the output path of the clustering (the clusters-*-final directory
-   *          is read from here)
-   * @param output
-   *          the location to store the classified vectors
-   * @param clusterClassificationThreshold
-   *          the threshold value of the probability distribution function, from
-   *          0.0 to 1.0. Any vector with a pdf less than this threshold will not
-   *          be classified for the cluster.
-   * @param emitMostLikely
-   *          true to emit each vector to only its most likely cluster
-   * @param runSequential
-   *          run the process sequentially or in a mapreduce way
-   * @throws IOException
-   * @throws InterruptedException
-   * @throws ClassNotFoundException
-   */
-  public static void run(Path input, Path clusteringOutputPath, Path output, Double clusterClassificationThreshold,
-      boolean emitMostLikely, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {
-    Configuration conf = new Configuration();
-    if (runSequential) {
-      classifyClusterSeq(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
-    } else {
-      classifyClusterMR(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
-    }
-  }
-
-  private static void classifyClusterSeq(Configuration conf, Path input, Path clusters, Path output,
-      Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException {
-    List<Cluster> clusterModels = populateClusterModels(clusters, conf);
-    ClusteringPolicy policy = ClusterClassifier.readPolicy(finalClustersPath(conf, clusters));
-    ClusterClassifier clusterClassifier = new ClusterClassifier(clusterModels, policy);
-    selectCluster(input, clusterModels, clusterClassifier, output, clusterClassificationThreshold, emitMostLikely);
-  }
-
-  /**
-   * Populates a list with the clusters present in the clusters-*-final directory.
-   *
-   * @param clusterOutputPath
-   *          The output path of the clustering.
-   * @param conf
-   *          The Hadoop Configuration
-   * @return The list of clusters found by the clustering.
-   * @throws IOException
-   */
-  private static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
-    List<Cluster> clusterModels = new ArrayList<Cluster>();
-    Path finalClustersPath = finalClustersPath(conf, clusterOutputPath);
-    Iterator<Writable> it = new SequenceFileDirValueIterator<Writable>(finalClustersPath, PathType.LIST,
-        PathFilters.partFilter(), null, false, conf);
-    while (it.hasNext()) {
-      ClusterWritable next = (ClusterWritable) it.next();
-      Cluster cluster = next.getValue();
-      cluster.configure(conf);
-      clusterModels.add(cluster);
-    }
-    return clusterModels;
-  }
-
-  private static Path finalClustersPath(Configuration conf, Path clusterOutputPath) throws IOException {
-    FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
-    FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
-    return clusterFiles[0].getPath();
-  }
-
-  /**
-   * Classifies the vector into its respective cluster.
-   *
-   * @param input
-   *          the path containing the input vectors.
- * @param clusterModels - * the clusters - * @param clusterClassifier - * used to classify the vectors into different clusters - * @param output - * the path to store classified data - * @param clusterClassificationThreshold - * @param emitMostLikely - * TODO - * @throws IOException - */ - private static void selectCluster(Path input, List clusterModels, ClusterClassifier clusterClassifier, - Path output, Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException { - Configuration conf = new Configuration(); - SequenceFile.Writer writer = new SequenceFile.Writer(input.getFileSystem(conf), conf, new Path(output, - "part-m-" + 0), IntWritable.class, WeightedVectorWritable.class); - for (VectorWritable vw : new SequenceFileDirValueIterable(input, PathType.LIST, - PathFilters.logsCRCFilter(), conf)) { - Vector pdfPerCluster = clusterClassifier.classify(vw.get()); - if (shouldClassify(pdfPerCluster, clusterClassificationThreshold)) { - classifyAndWrite(clusterModels, clusterClassificationThreshold, emitMostLikely, writer, vw, pdfPerCluster); - } - } - writer.close(); - } - - private static void classifyAndWrite(List clusterModels, Double clusterClassificationThreshold, - boolean emitMostLikely, SequenceFile.Writer writer, VectorWritable vw, Vector pdfPerCluster) throws IOException { - if (emitMostLikely) { - int maxValueIndex = pdfPerCluster.maxValueIndex(); - WeightedVectorWritable wvw = new WeightedVectorWritable(pdfPerCluster.maxValue(), vw.get()); - write(clusterModels, writer, wvw, maxValueIndex); - } else { - writeAllAboveThreshold(clusterModels, clusterClassificationThreshold, writer, vw, pdfPerCluster); - } - } - - private static void writeAllAboveThreshold(List clusterModels, Double clusterClassificationThreshold, - SequenceFile.Writer writer, VectorWritable vw, Vector pdfPerCluster) throws IOException { - Iterator iterateNonZero = pdfPerCluster.iterateNonZero(); - while (iterateNonZero.hasNext()) { - Element pdf = iterateNonZero.next(); - if (pdf.get() >= clusterClassificationThreshold) { - WeightedVectorWritable wvw = new WeightedVectorWritable(pdf.get(), vw.get()); - int clusterIndex = pdf.index(); - write(clusterModels, writer, wvw, clusterIndex); - } - } - } - - private static void write(List clusterModels, SequenceFile.Writer writer, WeightedVectorWritable wvw, - int maxValueIndex) throws IOException { - Cluster cluster = clusterModels.get(maxValueIndex); - writer.append(new IntWritable(cluster.getId()), wvw); - } - - /** - * Decides whether the vector should be classified or not based on the max pdf - * value of the clusters and threshold value. - * - * @return whether the vector should be classified or not. 
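To make the two emission modes concrete: if classify() returns the per-cluster pdf vector [0.1, 0.7, 0.2] and the threshold is 0.15, emitMostLikely writes the vector once, to cluster 1 with weight 0.7, while writeAllAboveThreshold writes it to cluster 1 (0.7) and cluster 2 (0.2) and drops cluster 0. The selection arithmetic in isolation, using only the Vector calls seen above (assumes the imports already present in this file, plus DenseVector):

// Editorial sketch of the selection logic only; no Hadoop I/O.
Vector pdfPerCluster = new DenseVector(new double[] {0.1, 0.7, 0.2});
double threshold = 0.15;
// emitMostLikely == true: single hard assignment
System.out.println(pdfPerCluster.maxValueIndex() + " @ " + pdfPerCluster.maxValue()); // 1 @ 0.7
// emitMostLikely == false: every cluster whose pdf clears the threshold
Iterator<Vector.Element> nonZero = pdfPerCluster.iterateNonZero();
while (nonZero.hasNext()) {
  Vector.Element pdf = nonZero.next();
  if (pdf.get() >= threshold) {
    System.out.println(pdf.index() + " @ " + pdf.get()); // 1 @ 0.7, then 2 @ 0.2
  }
}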
- */ - private static boolean shouldClassify(Vector pdfPerCluster, Double clusterClassificationThreshold) { - boolean isMaxPDFGreatherThanThreshold = pdfPerCluster.maxValue() >= clusterClassificationThreshold; - return isMaxPDFGreatherThanThreshold; - } - - private static void classifyClusterMR(Configuration conf, Path input, Path clustersIn, Path output, - Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException, InterruptedException, - ClassNotFoundException { - - conf.setFloat(OUTLIER_REMOVAL_THRESHOLD, clusterClassificationThreshold.floatValue()); - conf.setBoolean(EMIT_MOST_LIKELY, emitMostLikely); - conf.set(CLUSTERS_IN, clustersIn.toUri().toString()); - - Job job = new Job(conf, "Cluster Classification Driver running over input: " + input); - job.setJarByClass(ClusterClassificationDriver.class); - - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - - job.setMapperClass(ClusterClassificationMapper.class); - job.setNumReduceTasks(0); - - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(WeightedVectorWritable.class); - - FileInputFormat.addInputPath(job, input); - FileOutputFormat.setOutputPath(job, output); - if (!job.waitForCompletion(true)) { - throw new InterruptedException("Cluster Classification Driver Job failed processing " + input); - } - } - - public static void run(Configuration conf, Path input, Path clusteringOutputPath, Path output, - double clusterClassificationThreshold, boolean emitMostLikely, boolean runSequential) throws IOException, - InterruptedException, ClassNotFoundException { - if (runSequential) { - classifyClusterSeq(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely); - } else { - classifyClusterMR(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely); - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java deleted file mode 100644 index 171b5ff00..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java +++ /dev/null @@ -1,155 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-package org.apache.mahout.clustering.classify;
-
-import static org.apache.mahout.clustering.classify.ClusterClassificationConfigKeys.CLUSTERS_IN;
-import static org.apache.mahout.clustering.classify.ClusterClassificationConfigKeys.EMIT_MOST_LIKELY;
-import static org.apache.mahout.clustering.classify.ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.clustering.iterator.ClusteringPolicy;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.Vector.Element;
-import org.apache.mahout.math.VectorWritable;
-
-/**
- * Mapper for classifying vectors into clusters.
- */
-public class ClusterClassificationMapper extends
-    Mapper<WritableComparable<?>,VectorWritable,IntWritable,WeightedVectorWritable> {
-
-  // NOTE (editorial): static, yet assigned per task in setup(); an instance
-  // field would be safer under JVM reuse or in-process testing.
-  private static double threshold;
-  private List<Cluster> clusterModels;
-  private ClusterClassifier clusterClassifier;
-  private IntWritable clusterId;
-  private WeightedVectorWritable weightedVW;
-  private boolean emitMostLikely;
-
-  @Override
-  protected void setup(Context context) throws IOException,
-      InterruptedException {
-    super.setup(context);
-
-    Configuration conf = context.getConfiguration();
-    String clustersIn = conf.get(CLUSTERS_IN);
-    threshold = conf.getFloat(OUTLIER_REMOVAL_THRESHOLD, 0.0f);
-    emitMostLikely = conf.getBoolean(EMIT_MOST_LIKELY, false);
-
-    clusterModels = new ArrayList<Cluster>();
-
-    if (clustersIn != null && !clustersIn.isEmpty()) {
-      Path clustersInPath = new Path(clustersIn);
-      clusterModels = populateClusterModels(clustersInPath, conf);
-      ClusteringPolicy policy = ClusterClassifier
-          .readPolicy(finalClustersPath(clustersInPath));
-      clusterClassifier = new ClusterClassifier(clusterModels, policy);
-    }
-    clusterId = new IntWritable();
-    weightedVW = new WeightedVectorWritable(1, null);
-  }
-
-  /**
-   * Classifies each input vector to its respective cluster or clusters.
- */ - @Override - protected void map(WritableComparable key, VectorWritable vw, Context context) - throws IOException, InterruptedException { - if (!clusterModels.isEmpty()) { - Vector pdfPerCluster = clusterClassifier.classify(vw.get()); - if (shouldClassify(pdfPerCluster)) { - if (emitMostLikely) { - int maxValueIndex = pdfPerCluster.maxValueIndex(); - write(vw, context, maxValueIndex); - } else { - writeAllAboveThreshold(vw, context, pdfPerCluster); - } - } - } - } - - private void writeAllAboveThreshold(VectorWritable vw, Context context, - Vector pdfPerCluster) throws IOException, InterruptedException { - Iterator iterateNonZero = pdfPerCluster.iterateNonZero(); - while (iterateNonZero.hasNext()) { - Element pdf = iterateNonZero.next(); - if (pdf.get() >= threshold) { - int clusterIndex = pdf.index(); - write(vw, context, clusterIndex); - } - } - } - - private void write(VectorWritable vw, Context context, int clusterIndex) - throws IOException, InterruptedException { - Cluster cluster = clusterModels.get(clusterIndex); - clusterId.set(cluster.getId()); - weightedVW.setVector(vw.get()); - context.write(clusterId, weightedVW); - } - - public static List populateClusterModels(Path clusterOutputPath, Configuration conf) - throws IOException { - List clusters = new ArrayList(); - Cluster cluster = null; - FileSystem fileSystem = clusterOutputPath.getFileSystem(conf); - FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, - PathFilters.finalPartFilter()); - Iterator it = new SequenceFileDirValueIterator( - clusterFiles[0].getPath(), PathType.LIST, PathFilters.partFilter(), - null, false, conf); - while (it.hasNext()) { - ClusterWritable next = (ClusterWritable) it.next(); - cluster = next.getValue(); - cluster.configure(conf); - clusters.add(cluster); - } - return clusters; - } - - private static boolean shouldClassify(Vector pdfPerCluster) { - boolean isMaxPDFGreatherThanThreshold = pdfPerCluster.maxValue() >= threshold; - return isMaxPDFGreatherThanThreshold; - } - - private static Path finalClustersPath(Path clusterOutputPath) - throws IOException { - FileSystem fileSystem = clusterOutputPath - .getFileSystem(new Configuration()); - FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, - PathFilters.finalPartFilter()); - Path finalClustersPath = clusterFiles[0].getPath(); - return finalClustersPath; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassifier.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassifier.java deleted file mode 100644 index 70434827f..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassifier.java +++ /dev/null @@ -1,239 +0,0 @@ -/* Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.clustering.classify;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.List;
-import java.util.Locale;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.mahout.classifier.AbstractVectorClassifier;
-import org.apache.mahout.classifier.OnlineLearner;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.clustering.iterator.ClusteringPolicy;
-import org.apache.mahout.clustering.iterator.ClusteringPolicyWritable;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
-
-/**
- * This classifier works with any ClusteringPolicy and its associated Clusters.
- * It is initialized with a policy and a list of compatible clusters and
- * thereafter it can classify any new Vector into one or more of the clusters
- * based upon the pdf() function which each cluster supports.
- *
- * In addition, it is an OnlineLearner and can be trained. Training amounts to
- * asking the actual model to observe the vector, and closing the classifier
- * causes all the models to computeParameters.
- *
- * Because a ClusterClassifier implements Writable, it can be written to and
- * read from a sequence file as a single entity. For sequential and mapreduce
- * clustering in conjunction with a ClusterIterator, however, it utilizes an
- * exploded file format. In this format, the iterator writes the policy to a
- * single POLICY_FILE_NAME file in the clustersOut directory and the models are
- * written to one or more part-n files so that multiple reducers may be
- * employed to produce them.
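The exploded layout described above is therefore a directory holding a _policy SequenceFile next to one or more part-NNNNN model files. A sketch using the writeToSeqFiles/readFromSeqFiles helpers defined later in this class; clustersDir, models, policy, conf and someVector are placeholders:

// Editorial sketch; placeholder variables.
ClusterClassifier prior = new ClusterClassifier(models, policy);
prior.writeToSeqFiles(clustersDir);            // writes clustersDir/_policy plus part-00000, part-00001, ...
ClusterClassifier restored = new ClusterClassifier();
restored.readFromSeqFiles(conf, clustersDir);  // reloads the models and the policy
Vector pdf = restored.classify(someVector);    // per-cluster pdf under the policy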
- */ -public class ClusterClassifier extends AbstractVectorClassifier implements OnlineLearner, Writable { - - private static final String POLICY_FILE_NAME = "_policy"; - - private List models; - - private String modelClass; - - private ClusteringPolicy policy; - - /** - * The public constructor accepts a list of clusters to become the models - * - * @param models - * a List - * @param policy - * a ClusteringPolicy - */ - public ClusterClassifier(List models, ClusteringPolicy policy) { - this.models = models; - modelClass = models.get(0).getClass().getName(); - this.policy = policy; - } - - // needed for serialization/deserialization - public ClusterClassifier() {} - - // only used by MR ClusterIterator - protected ClusterClassifier(ClusteringPolicy policy) { - this.policy = policy; - } - - @Override - public Vector classify(Vector instance) { - return policy.classify(instance, this); - } - - @Override - public double classifyScalar(Vector instance) { - if (models.size() == 2) { - double pdf0 = models.get(0).pdf(new VectorWritable(instance)); - double pdf1 = models.get(1).pdf(new VectorWritable(instance)); - return pdf0 / (pdf0 + pdf1); - } - throw new IllegalStateException(); - } - - @Override - public int numCategories() { - return models.size(); - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(models.size()); - out.writeUTF(modelClass); - new ClusteringPolicyWritable(policy).write(out); - for (Cluster cluster : models) { - cluster.write(out); - } - } - - @Override - public void readFields(DataInput in) throws IOException { - int size = in.readInt(); - modelClass = in.readUTF(); - models = Lists.newArrayList(); - ClusteringPolicyWritable clusteringPolicyWritable = new ClusteringPolicyWritable(); - clusteringPolicyWritable.readFields(in); - policy = clusteringPolicyWritable.getValue(); - for (int i = 0; i < size; i++) { - Cluster element = ClassUtils.instantiateAs(modelClass, Cluster.class); - element.readFields(in); - models.add(element); - } - } - - @Override - public void train(int actual, Vector instance) { - models.get(actual).observe(new VectorWritable(instance)); - } - - /** - * Train the models given an additional weight. 
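Note the special case in classifyScalar() above: with exactly two models it returns model 0's share of the total density, so pdf values of 0.2 and 0.6 yield 0.2 / (0.2 + 0.6) = 0.25, and any other model count throws IllegalStateException:

// Editorial sketch; classifier and instance are placeholders.
double posterior0 = classifier.classifyScalar(instance); // pdf0 / (pdf0 + pdf1); requires exactly 2 models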
Unique to ClusterClassifier - * - * @param actual - * the int index of a model - * @param data - * a data Vector - * @param weight - * a double weighting factor - */ - public void train(int actual, Vector data, double weight) { - models.get(actual).observe(new VectorWritable(data), weight); - } - - @Override - public void train(long trackingKey, String groupKey, int actual, Vector instance) { - models.get(actual).observe(new VectorWritable(instance)); - } - - @Override - public void train(long trackingKey, int actual, Vector instance) { - models.get(actual).observe(new VectorWritable(instance)); - } - - @Override - public void close() { - policy.close(this); - } - - public List getModels() { - return models; - } - - public ClusteringPolicy getPolicy() { - return policy; - } - - public void writeToSeqFiles(Path path) throws IOException { - writePolicy(policy, path); - Configuration config = new Configuration(); - FileSystem fs = FileSystem.get(path.toUri(), config); - SequenceFile.Writer writer = null; - ClusterWritable cw = new ClusterWritable(); - for (int i = 0; i < models.size(); i++) { - try { - Cluster cluster = models.get(i); - cw.setValue(cluster); - writer = new SequenceFile.Writer(fs, config, - new Path(path, "part-" + String.format(Locale.ENGLISH, "%05d", i)), IntWritable.class, - ClusterWritable.class); - Writable key = new IntWritable(i); - writer.append(key, cw); - } finally { - Closeables.closeQuietly(writer); - } - } - } - - public void readFromSeqFiles(Configuration conf, Path path) throws IOException { - Configuration config = new Configuration(); - List clusters = Lists.newArrayList(); - for (ClusterWritable cw : new SequenceFileDirValueIterable(path, PathType.LIST, - PathFilters.logsCRCFilter(), config)) { - Cluster cluster = cw.getValue(); - cluster.configure(conf); - clusters.add(cluster); - } - this.models = clusters; - modelClass = models.get(0).getClass().getName(); - this.policy = readPolicy(path); - } - - public static ClusteringPolicy readPolicy(Path path) throws IOException { - Path policyPath = new Path(path, POLICY_FILE_NAME); - Configuration config = new Configuration(); - FileSystem fs = FileSystem.get(policyPath.toUri(), config); - SequenceFile.Reader reader = new SequenceFile.Reader(fs, policyPath, config); - Text key = new Text(); - ClusteringPolicyWritable cpw = new ClusteringPolicyWritable(); - reader.next(key, cpw); - return cpw.getValue(); - } - - public static void writePolicy(ClusteringPolicy policy, Path path) throws IOException { - Path policyPath = new Path(path, POLICY_FILE_NAME); - Configuration config = new Configuration(); - FileSystem fs = FileSystem.get(policyPath.toUri(), config); - SequenceFile.Writer writer = new SequenceFile.Writer(fs, config, policyPath, Text.class, - ClusteringPolicyWritable.class); - writer.append(new Text(), new ClusteringPolicyWritable(policy)); - writer.close(); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java deleted file mode 100644 index 6bc8b4ead..000000000 --- 
a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java +++ /dev/null @@ -1,95 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.classify; - -import org.apache.hadoop.io.Text; -import org.apache.mahout.clustering.AbstractCluster; -import org.apache.mahout.math.Vector; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -public class WeightedPropertyVectorWritable extends WeightedVectorWritable { - - private Map properties; - - public WeightedPropertyVectorWritable() { - } - - public WeightedPropertyVectorWritable(Map properties) { - this.properties = properties; - } - - public WeightedPropertyVectorWritable(double weight, Vector vector, Map properties) { - super(weight, vector); - this.properties = properties; - } - - public Map getProperties() { - return properties; - } - - public void setProperties(Map properties) { - this.properties = properties; - } - - @Override - public void readFields(DataInput in) throws IOException { - super.readFields(in); - int size = in.readInt(); - if (size > 0) { - properties = new HashMap(); - for (int i = 0; i < size; i++) { - Text key = new Text(in.readUTF()); - Text val = new Text(in.readUTF()); - properties.put(key, val); - } - } - } - - @Override - public void write(DataOutput out) throws IOException { - super.write(out); - out.writeInt(properties != null ? properties.size() : 0); - if (properties != null) { - for (Map.Entry entry : properties.entrySet()) { - out.writeUTF(entry.getKey().toString()); - out.writeUTF(entry.getValue().toString()); - } - } - } - - @Override - public String toString() { - Vector vector = getVector(); - StringBuilder bldr = new StringBuilder("wt: ").append(getWeight()).append(" "); - if (properties != null && !properties.isEmpty()) { - for (Map.Entry entry : properties.entrySet()) { - bldr.append(entry.getKey().toString()).append(": ").append(entry.getValue().toString()).append(' '); - } - } - bldr.append(" vec: ").append(vector == null ? 
"null" : AbstractCluster.formatVector(vector, null)); - return bldr.toString(); - } - - -} - diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/WeightedVectorWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/WeightedVectorWritable.java deleted file mode 100644 index 510dd3925..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/classify/WeightedVectorWritable.java +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.classify; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.clustering.AbstractCluster; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -public class WeightedVectorWritable implements Writable { - - private final VectorWritable vectorWritable = new VectorWritable(); - private double weight; - - public WeightedVectorWritable() { - } - - public WeightedVectorWritable(double weight, Vector vector) { - this.vectorWritable.set(vector); - this.weight = weight; - } - - public Vector getVector() { - return vectorWritable.get(); - } - - public void setVector(Vector vector) { - vectorWritable.set(vector); - } - - public double getWeight() { - return weight; - } - - @Override - public void readFields(DataInput in) throws IOException { - vectorWritable.readFields(in); - weight = in.readDouble(); - } - - @Override - public void write(DataOutput out) throws IOException { - vectorWritable.write(out); - out.writeDouble(weight); - } - - @Override - public String toString() { - Vector vector = vectorWritable.get(); - return weight + ": " + (vector == null ? 
"null" : AbstractCluster.formatVector(vector, null)); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java deleted file mode 100644 index 60159037a..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java +++ /dev/null @@ -1,240 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.dirichlet; - -import static org.apache.mahout.clustering.topdown.PathDirectory.CLUSTERED_POINTS_DIRECTORY; - -import java.io.IOException; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.Model; -import org.apache.mahout.clustering.ModelDistribution; -import org.apache.mahout.clustering.classify.ClusterClassificationDriver; -import org.apache.mahout.clustering.classify.ClusterClassifier; -import org.apache.mahout.clustering.dirichlet.models.DistributionDescription; -import org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution; -import org.apache.mahout.clustering.iterator.ClusterIterator; -import org.apache.mahout.clustering.iterator.DirichletClusteringPolicy; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.VectorWritable; - -import com.google.common.collect.Lists; - -public class DirichletDriver extends AbstractJob { - - public static final String STATE_IN_KEY = "org.apache.mahout.clustering.dirichlet.stateIn"; - public static final String MODEL_DISTRIBUTION_KEY = "org.apache.mahout.clustering.dirichlet.modelFactory"; - public static final String NUM_CLUSTERS_KEY = "org.apache.mahout.clustering.dirichlet.numClusters"; - public static final String ALPHA_0_KEY = "org.apache.mahout.clustering.dirichlet.alpha_0"; - public static final String EMIT_MOST_LIKELY_KEY = 
"org.apache.mahout.clustering.dirichlet.emitMostLikely"; - public static final String THRESHOLD_KEY = "org.apache.mahout.clustering.dirichlet.threshold"; - public static final String MODEL_PROTOTYPE_CLASS_OPTION = "modelPrototype"; - public static final String MODEL_DISTRIBUTION_CLASS_OPTION = "modelDist"; - public static final String ALPHA_OPTION = "alpha"; - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new DirichletDriver(), args); - } - - @Override - public int run(String[] args) throws Exception { - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.maxIterationsOption().create()); - addOption(DefaultOptionCreator.numClustersOption().withRequired(true).create()); - addOption(DefaultOptionCreator.overwriteOption().create()); - addOption(DefaultOptionCreator.clusteringOption().create()); - addOption(ALPHA_OPTION, "a0", "The alpha0 value for the DirichletDistribution. Defaults to 1.0", "1.0"); - addOption(MODEL_DISTRIBUTION_CLASS_OPTION, "md", - "The ModelDistribution class name. Defaults to GaussianClusterDistribution", - GaussianClusterDistribution.class.getName()); - addOption(MODEL_PROTOTYPE_CLASS_OPTION, "mp", - "The ModelDistribution prototype Vector class name. Defaults to RandomAccessSparseVector", - RandomAccessSparseVector.class.getName()); - addOption(DefaultOptionCreator.distanceMeasureOption().withRequired(false).create()); - addOption(DefaultOptionCreator.emitMostLikelyOption().create()); - addOption(DefaultOptionCreator.thresholdOption().create()); - addOption(DefaultOptionCreator.methodOption().create()); - - if (parseArguments(args) == null) { - return -1; - } - - Path input = getInputPath(); - Path output = getOutputPath(); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(getConf(), output); - } - String modelFactory = getOption(MODEL_DISTRIBUTION_CLASS_OPTION); - String modelPrototype = getOption(MODEL_PROTOTYPE_CLASS_OPTION); - String distanceMeasure = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); - int numModels = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)); - int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); - boolean emitMostLikely = Boolean.parseBoolean(getOption(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION)); - double threshold = Double.parseDouble(getOption(DefaultOptionCreator.THRESHOLD_OPTION)); - double alpha0 = Double.parseDouble(getOption(ALPHA_OPTION)); - boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION); - boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase( - DefaultOptionCreator.SEQUENTIAL_METHOD); - int prototypeSize = readPrototypeSize(input); - - DistributionDescription description = new DistributionDescription(modelFactory, modelPrototype, distanceMeasure, - prototypeSize); - - run(getConf(), input, output, description, numModels, maxIterations, alpha0, runClustering, emitMostLikely, - threshold, runSequential); - return 0; - } - - /** - * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to - * cluster the input vectors. 
- * - * @param conf - * the Configuration to use - * @param input - * the directory Path for input points - * @param output - * the directory Path for output points - * @param description - * model distribution parameters - * @param maxIterations - * the maximum number of iterations - * @param alpha0 - * the alpha_0 value for the DirichletDistribution - * @param runClustering - * true if clustering of points to be done after iterations - * @param emitMostLikely - * a boolean if true emit only most likely cluster for each point - * @param threshold - * a double threshold value emits all clusters having greater pdf (emitMostLikely = false) - * @param runSequential - * execute sequentially if true - */ - public static void run(Configuration conf, Path input, Path output, DistributionDescription description, - int numModels, int maxIterations, double alpha0, boolean runClustering, boolean emitMostLikely, double threshold, - boolean runSequential) throws IOException, ClassNotFoundException, InterruptedException { - Path clustersOut = buildClusters(conf, input, output, description, numModels, maxIterations, alpha0, runSequential); - if (runClustering) { - clusterData(conf, input, clustersOut, output, alpha0, numModels, emitMostLikely, threshold, runSequential); - } - } - - /** - * Read the first input vector to determine the prototype size for the modelPrototype - */ - public static int readPrototypeSize(Path input) throws IOException { - Configuration conf = new Configuration(); - FileSystem fs = FileSystem.get(input.toUri(), conf); - FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter()); - int protoSize = 0; - if (status.length > 0) { - FileStatus s = status[0]; - for (VectorWritable value : new SequenceFileValueIterable(s.getPath(), true, conf)) { - protoSize = value.get().size(); - } - } - return protoSize; - } - - /** - * Iterate over the input vectors to produce cluster directories for each iteration - * - * @param conf - * the hadoop configuration - * @param input - * the directory Path for input points - * @param output - * the directory Path for output points - * @param description - * model distribution parameters - * @param numClusters - * the number of models to iterate over - * @param maxIterations - * the maximum number of iterations - * @param alpha0 - * the alpha_0 value for the DirichletDistribution - * @param runSequential - * execute sequentially if true - * - * @return the Path of the final clusters directory - */ - public static Path buildClusters(Configuration conf, Path input, Path output, DistributionDescription description, - int numClusters, int maxIterations, double alpha0, boolean runSequential) throws IOException, - ClassNotFoundException, InterruptedException { - Path clustersIn = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); - ModelDistribution modelDist = description.createModelDistribution(conf); - - List models = Lists.newArrayList(); - for (Model cluster : modelDist.sampleFromPrior(numClusters)) { - models.add((Cluster) cluster); - } - - ClusterClassifier prior = new ClusterClassifier(models, new DirichletClusteringPolicy(numClusters, alpha0)); - prior.writeToSeqFiles(clustersIn); - - if (runSequential) { - new ClusterIterator().iterateSeq(conf, input, clustersIn, output, maxIterations); - } else { - new ClusterIterator().iterateMR(conf, input, clustersIn, output, maxIterations); - } - return output; - - } - - /** - * Run the job using supplied arguments - * - * @param conf - * @param input - * the directory pathname for input points - * 
@param stateIn - * the directory pathname for input state - * @param output - * the directory pathname for output points - * @param alpha0 - * TODO - * @param numModels - * TODO - * @param emitMostLikely - * a boolean if true emit only most likely cluster for each point - * @param threshold - * a double threshold value emits all clusters having greater pdf (emitMostLikely = false) - * @param runSequential - * execute sequentially if true - */ - public static void clusterData(Configuration conf, Path input, Path stateIn, Path output, double alpha0, - int numModels, boolean emitMostLikely, double threshold, boolean runSequential) throws IOException, - InterruptedException, ClassNotFoundException { - ClusterClassifier.writePolicy(new DirichletClusteringPolicy(numModels, alpha0), stateIn); - ClusterClassificationDriver.run(conf, input, output, new Path(output, CLUSTERED_POINTS_DIRECTORY), threshold, - emitMostLikely, runSequential); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/UncommonDistributions.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/UncommonDistributions.java deleted file mode 100644 index bdd21cabe..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/UncommonDistributions.java +++ /dev/null @@ -1,267 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.dirichlet; - -import java.util.Random; - -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; -import org.uncommons.maths.random.GaussianGenerator; - -public final class UncommonDistributions { - - public static final double SQRT2PI = Math.sqrt(2.0 * Math.PI); - - private static final Random RANDOM = RandomUtils.getRandom(); - - private UncommonDistributions() { - } - - // =============== start of BSD licensed code. See LICENSE.txt - /** - * Returns a double sampled according to this distribution. Uniformly fast for all k > 0. (Reference: - * Non-Uniform Random Variate Generation, Devroye http://cgm.cs.mcgill.ca/~luc/rnbookindex.html) Uses - * Cheng's rejection algorithm (GB) for k>=1, rejection from Weibull distribution for 0 < k < 1. 
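The implementation that follows uses Cheng's rejection algorithm for k >= 1 and Weibull-based rejection for 0 < k < 1; either way the returned variate has mean k/lambda under the rate parameterization, which a crude Monte Carlo average can confirm:

// Editorial sanity check, not part of the sources.
double k = 2.5;
double lambda = 0.5;
double sum = 0.0;
int trials = 100000;
for (int i = 0; i < trials; i++) {
  sum += UncommonDistributions.rGamma(k, lambda);
}
System.out.println("mean ~ " + (k / lambda) + ": " + sum / trials); // expect ~5.0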
- */ - public static double rGamma(double k, double lambda) { - boolean accept = false; - if (k >= 1.0) { - // Cheng's algorithm - double b = k - Math.log(4.0); - double c = k + Math.sqrt(2.0 * k - 1.0); - double lam = Math.sqrt(2.0 * k - 1.0); - double cheng = 1.0 + Math.log(4.5); - double x; - do { - double u = RANDOM.nextDouble(); - double v = RANDOM.nextDouble(); - double y = 1.0 / lam * Math.log(v / (1.0 - v)); - x = k * Math.exp(y); - double z = u * v * v; - double r = b + c * y - x; - if (r >= 4.5 * z - cheng || r >= Math.log(z)) { - accept = true; - } - } while (!accept); - return x / lambda; - } else { - // Weibull algorithm - double c = 1.0 / k; - double d = (1.0 - k) * Math.pow(k, k / (1.0 - k)); - double x; - do { - double u = RANDOM.nextDouble(); - double v = RANDOM.nextDouble(); - double z = -Math.log(u); - double e = -Math.log(v); - x = Math.pow(z, c); - if (z + e >= d + x) { - accept = true; - } - } while (!accept); - return x / lambda; - } - } - - // ============= end of BSD licensed code - - /** - * Returns a random sample from a beta distribution with the given shapes - * - * @param shape1 - * a double representing shape1 - * @param shape2 - * a double representing shape2 - * @return a Vector of samples - */ - public static double rBeta(double shape1, double shape2) { - double gam1 = rGamma(shape1, 1.0); - double gam2 = rGamma(shape2, 1.0); - return gam1 / (gam1 + gam2); - - } - - /** - * Returns a vector of random samples from a beta distribution with the given shapes - * - * @param k - * the number of samples to return - * @param shape1 - * a double representing shape1 - * @param shape2 - * a double representing shape2 - * @return a Vector of samples - */ - public static Vector rBeta(int k, double shape1, double shape2) { - // List params = new ArrayList(2); - // params.add(shape1); - // params.add(Math.max(0, shape2)); - Vector result = new DenseVector(k); - for (int i = 0; i < k; i++) { - result.set(i, rBeta(shape1, shape2)); - } - return result; - } - - /** - * Return a random sample from the chi-squared (chi^2) distribution with df degrees of freedom. - * - * @return a double sample - */ - public static double rChisq(double df) { - double result = 0.0; - for (int i = 0; i < df; i++) { - double sample = rNorm(0.0, 1.0); - result += sample * sample; - } - return result; - } - - /** - * Return a random value from a normal distribution with the given mean and standard deviation - * - * @param mean - * a double mean value - * @param sd - * a double standard deviation - * @return a double sample - */ - public static double rNorm(double mean, double sd) { - GaussianGenerator dist = new GaussianGenerator(mean, sd, RANDOM); - return dist.nextValue(); - } - - /** - * Return the normal density function value for the sample x - * - * pdf = 1/[sqrt(2*p)*s] * e^{-1/2*[(x-m)/s]^2} - * - * @param x - * a double sample value - * @param m - * a double mean value - * @param s - * a double standard deviation - * @return a double probability value - */ - public static double dNorm(double x, double m, double s) { - double xms = (x - m) / s; - double ex = xms * xms / 2.0; - double exp = Math.exp(-ex); - return exp / (SQRT2PI * s); - } - - /** Returns one sample from a multinomial. */ - public static int rMultinom(Vector probabilities) { - // our probability argument are not normalized. 
- double total = probabilities.zSum(); - double nextDouble = RANDOM.nextDouble(); - double p = nextDouble * total; - for (int i = 0; i < probabilities.size(); i++) { - double pi = probabilities.get(i); - if (p < pi) { - return i; - } else { - p -= pi; - } - } - // can't happen except for round-off error so we don't care what we return here - return 0; - } - - /** - * Returns a multinomial vector sampled from the given probabilities - * - * rmultinom should be implemented as successive binomial sampling. - * - * Keep a normalizing amount that starts with 1 (I call it total). - * - * For each i k[i] = rbinom(p[i] / total, size); total -= p[i]; size -= k[i]; - * - * @param size - * the size parameter of the binomial distribution - * @param probabilities - * a Vector of probabilities - * @return a multinomial distribution Vector - */ - public static Vector rMultinom(int size, Vector probabilities) { - // our probability argument may not be normalized. - double total = probabilities.zSum(); - int cardinality = probabilities.size(); - Vector result = new DenseVector(cardinality); - for (int i = 0; total > 0 && i < cardinality; i++) { - double p = probabilities.get(i); - int ki = rBinomial(size, p / total); - total -= p; - size -= ki; - result.set(i, ki); - } - return result; - } - - /** - * Returns an integer sampled according to this distribution. Takes time proportional to np + 1. (Reference: - * Non-Uniform Random Variate Generation, Devroye http://cgm.cs.mcgill.ca/~luc/rnbookindex.html) Second - * time-waiting algorithm. - */ - public static int rBinomial(int n, double p) { - if (p >= 1.0) { - return n; // needed to avoid infinite loops and negative results - } - double q = -Math.log1p(-p); - double sum = 0.0; - int x = 0; - while (sum <= q) { - double u = RANDOM.nextDouble(); - double e = -Math.log(u); - sum += e / (n - x); - x++; - } - if (x == 0) { - return 0; - } - return x - 1; - } - - /** - * Sample from a Dirichlet distribution, returning a vector of probabilities using a stick-breaking - * algorithm - * - * @param totalCounts - * an unnormalized count Vector - * @param alpha0 - * a double - * @return a Vector of probabilities - */ - public static Vector rDirichlet(Vector totalCounts, double alpha0) { - Vector pi = totalCounts.like(); - double total = totalCounts.zSum(); - double remainder = 1.0; - for (int k = 0; k < pi.size(); k++) { - double countK = totalCounts.get(k); - total -= countK; - double betaK = rBeta(1.0 + countK, Math.max(0.0, alpha0 + total)); - double piK = betaK * remainder; - pi.set(k, piK); - remainder -= piK; - } - return pi; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/AbstractVectorModelDistribution.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/AbstractVectorModelDistribution.java deleted file mode 100644 index ece74d32c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/AbstractVectorModelDistribution.java +++ /dev/null @@ -1,50 +0,0 @@ -/* Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
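A minimal sketch, not part of the Mahout sources, that exercises the samplers above; it assumes mahout-core and mahout-math on the classpath and checks two easily verified properties: the empirical mean of rGamma(k, lambda) approaches k/lambda, and a stick-breaking rDirichlet draw lies on the simplex, its entries summing to at most 1 (the final remainder is left unassigned).

import org.apache.mahout.clustering.dirichlet.UncommonDistributions;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class UncommonDistributionsCheck {
  public static void main(String[] args) {
    // E[Gamma(k, lambda)] = k / lambda, so this should print a value near 3.0 / 2.0 = 1.5
    int n = 100000;
    double sum = 0.0;
    for (int i = 0; i < n; i++) {
      sum += UncommonDistributions.rGamma(3.0, 2.0);
    }
    System.out.println("empirical gamma mean: " + (sum / n));

    // A Dirichlet draw: each entry in [0, 1], and zSum() <= 1 by stick-breaking
    Vector counts = new DenseVector(new double[] {5.0, 2.0, 1.0});
    Vector pi = UncommonDistributions.rDirichlet(counts, 1.0);
    System.out.println("dirichlet sample sums to " + pi.zSum());
  }
}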
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.dirichlet.models; - -import org.apache.mahout.clustering.ModelDistribution; -import org.apache.mahout.math.VectorWritable; - -public abstract class AbstractVectorModelDistribution implements ModelDistribution { - - // a prototype instance used for creating prior model distributions using like(). It - // should be of the class and cardinality desired for the particular application. - private VectorWritable modelPrototype; - - protected AbstractVectorModelDistribution() { - } - - protected AbstractVectorModelDistribution(VectorWritable modelPrototype) { - this.modelPrototype = modelPrototype; - } - - /** - * @return the modelPrototype - */ - public VectorWritable getModelPrototype() { - return modelPrototype; - } - - /** - * @param modelPrototype - * the modelPrototype to set - */ - public void setModelPrototype(VectorWritable modelPrototype) { - this.modelPrototype = modelPrototype; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistanceMeasureClusterDistribution.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistanceMeasureClusterDistribution.java deleted file mode 100644 index 712ae29a7..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistanceMeasureClusterDistribution.java +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.dirichlet.models; - -import org.apache.mahout.clustering.Model; -import org.apache.mahout.clustering.dirichlet.UncommonDistributions; -import org.apache.mahout.clustering.iterator.DistanceMeasureCluster; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.distance.ManhattanDistanceMeasure; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -/** - * An implementation of the ModelDistribution interface suitable for testing the - * DirichletCluster algorithm. Models use a DistanceMeasure to calculate pdf - * values. - */ -public class DistanceMeasureClusterDistribution extends AbstractVectorModelDistribution { - - private DistanceMeasure measure; - - public DistanceMeasureClusterDistribution() { - } - - public DistanceMeasureClusterDistribution(VectorWritable modelPrototype) { - super(modelPrototype); - this.measure = new ManhattanDistanceMeasure(); - } - - public DistanceMeasureClusterDistribution(VectorWritable modelPrototype, DistanceMeasure measure) { - super(modelPrototype); - this.measure = measure; - } - - @Override - public Model[] sampleFromPrior(int howMany) { - Model[] result = new DistanceMeasureCluster[howMany]; - Vector prototype = getModelPrototype().get().like(); - for (int i = 0; i < prototype.size(); i++) { - prototype.setQuick(i, UncommonDistributions.rNorm(0, 1)); - } - for (int i = 0; i < howMany; i++) { - result[i] = new DistanceMeasureCluster(prototype, i, measure); - } - return result; - } - - @Override - public Model[] sampleFromPosterior(Model[] posterior) { - Model[] result = new DistanceMeasureCluster[posterior.length]; - for (int i = 0; i < posterior.length; i++) { - result[i] = posterior[i].sampleFromPosterior(); - } - return result; - } - - public void setMeasure(DistanceMeasure measure) { - this.measure = measure; - } - - public DistanceMeasure getMeasure() { - return measure; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistributionDescription.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistributionDescription.java deleted file mode 100644 index 29b357f90..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistributionDescription.java +++ /dev/null @@ -1,115 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.dirichlet.models; - -import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; -import java.util.Iterator; - -import org.apache.hadoop.conf.Configuration; -import org.apache.mahout.clustering.ModelDistribution; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -import com.google.common.base.Splitter; - -/** - * Simply describes the parameters needed to create a {@link org.apache.mahout.clustering.ModelDistribution}. - */ -public final class DistributionDescription { - - private final String modelFactory; - private final String modelPrototype; - private final String distanceMeasure; - private final int prototypeSize; - - public DistributionDescription(String modelFactory, String modelPrototype, String distanceMeasure, int prototypeSize) { - this.modelFactory = modelFactory; - this.modelPrototype = modelPrototype; - this.distanceMeasure = distanceMeasure; - this.prototypeSize = prototypeSize; - } - - public String getModelFactory() { - return modelFactory; - } - - public String getModelPrototype() { - return modelPrototype; - } - - public String getDistanceMeasure() { - return distanceMeasure; - } - - public int getPrototypeSize() { - return prototypeSize; - } - - /** - * Create an instance of AbstractVectorModelDistribution from this description - * - * @param conf - * the Configuration - * @return the configured ModelDistribution - */ - public ModelDistribution<VectorWritable> createModelDistribution(Configuration conf) { - ClassLoader ccl = Thread.currentThread().getContextClassLoader(); - AbstractVectorModelDistribution modelDistribution; - try { - modelDistribution = ClassUtils.instantiateAs(modelFactory, AbstractVectorModelDistribution.class); - - Class<? extends Vector> vcl = ccl.loadClass(modelPrototype).asSubclass(Vector.class); - Constructor<? extends Vector> v = vcl.getConstructor(int.class); - modelDistribution.setModelPrototype(new VectorWritable(v.newInstance(prototypeSize))); - - if (modelDistribution instanceof DistanceMeasureClusterDistribution) { - DistanceMeasure measure = ClassUtils.instantiateAs(distanceMeasure, DistanceMeasure.class); - measure.configure(conf); - ((DistanceMeasureClusterDistribution) modelDistribution).setMeasure(measure); - } - } catch (ClassNotFoundException cnfe) { - throw new IllegalStateException(cnfe); - } catch (NoSuchMethodException nsme) { - throw new IllegalStateException(nsme); - } catch (InstantiationException ie) { - throw new IllegalStateException(ie); - } catch (IllegalAccessException iae) { - throw new IllegalStateException(iae); - } catch (InvocationTargetException ite) { - throw new IllegalStateException(ite); - } - return modelDistribution; - } - - @Override - public String toString() { - return modelFactory + ',' + modelPrototype + ',' + distanceMeasure + ',' + prototypeSize; - } - - public static DistributionDescription fromString(CharSequence s) { - Iterator<String> tokens = Splitter.on(',').split(s).iterator(); - String modelFactory = tokens.next(); - String modelPrototype = tokens.next(); - String distanceMeasure = tokens.next(); - int prototypeSize = Integer.parseInt(tokens.next()); - return new DistributionDescription(modelFactory, modelPrototype, distanceMeasure, prototypeSize); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/GaussianCluster.java
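The comma-separated form produced by toString() above is exactly what fromString() parses back. A sketch, not from the original sources, of round-tripping a description and materializing the distribution; the class names are taken from the files in this diff and the Configuration is a stock Hadoop one:

import org.apache.hadoop.conf.Configuration;
import org.apache.mahout.clustering.ModelDistribution;
import org.apache.mahout.clustering.dirichlet.models.DistributionDescription;
import org.apache.mahout.math.VectorWritable;

public class DistributionDescriptionExample {
  public static void main(String[] args) {
    // four tokens: factory, prototype vector class, distance measure, prototype cardinality
    DistributionDescription description = DistributionDescription.fromString(
        "org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution,"
        + "org.apache.mahout.math.DenseVector,"
        + "org.apache.mahout.common.distance.ManhattanDistanceMeasure,2");
    // reflection instantiates the factory and a DenseVector(2) prototype
    ModelDistribution<VectorWritable> distribution = description.createModelDistribution(new Configuration());
    System.out.println(description); // prints the same four-token string back
  }
}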
b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/GaussianCluster.java deleted file mode 100644 index 45766dbba..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/GaussianCluster.java +++ /dev/null @@ -1,97 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.dirichlet.models; - -import java.util.Iterator; - -import org.apache.mahout.clustering.AbstractCluster; -import org.apache.mahout.clustering.Model; -import org.apache.mahout.clustering.dirichlet.UncommonDistributions; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.Vector.Element; -import org.apache.mahout.math.VectorWritable; - -public class GaussianCluster extends AbstractCluster { - - public GaussianCluster() {} - - public GaussianCluster(Vector point, int id2) { - super(point, id2); - } - - public GaussianCluster(Vector center, Vector radius, int id) { - super(center, radius, id); - } - - @Override - public String getIdentifier() { - return "GC:" + getId(); - } - - @Override - public Model sampleFromPosterior() { - return new GaussianCluster(getCenter(), getRadius(), getId()); - } - - /* (non-Javadoc) - * @see org.apache.mahout.clustering.AbstractCluster#setRadius(org.apache.mahout.math.Vector) - */ - @Override - protected void setRadius(Vector s2) { - super.setRadius(s2); - computeProd2piR(); - } - - // the value of the zProduct(S*2pi) term. Calculated below. - private double zProd2piR; - - /** - * Compute the product(r[i]*SQRT2PI) over all i. Note that the cluster Radius - * corresponds to the Stdev of a Gaussian and the Center to its Mean. 
- */ - private void computeProd2piR() { - zProd2piR = 1.0; - for (Iterator<Element> it = getRadius().iterateNonZero(); it.hasNext();) { - Element radius = it.next(); - zProd2piR *= radius.get() * UncommonDistributions.SQRT2PI; - } - } - - @Override - public double pdf(VectorWritable vw) { - return Math.exp(-(sumXminusCdivRsquared(vw.get()) / 2)) / zProd2piR; - } - - /** - * @param x - * a Vector - * @return the zSum(((x[i]-c[i])/r[i])^2) over all i - */ - private double sumXminusCdivRsquared(Vector x) { - double result = 0; - for (Iterator<Element> it = getRadius().iterateNonZero(); it.hasNext();) { - Element radiusElem = it.next(); - int index = radiusElem.index(); - double quotient = (x.get(index) - getCenter().get(index)) - / radiusElem.get(); - result += quotient * quotient; - } - return result; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/GaussianClusterDistribution.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/GaussianClusterDistribution.java deleted file mode 100644 index a78ec7800..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/GaussianClusterDistribution.java +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.dirichlet.models; - -import org.apache.mahout.clustering.Model; -import org.apache.mahout.clustering.dirichlet.UncommonDistributions; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -/** - * An implementation of the ModelDistribution interface suitable for testing the DirichletCluster algorithm. - * Uses a Normal Distribution to sample the prior model values. Model values have a vector standard deviation, - * allowing asymmetrical regions to be covered by a model.
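Taken together, computeProd2piR and sumXminusCdivRsquared evaluate a diagonal-covariance Gaussian density, with the cluster radius playing the role of a per-dimension standard deviation. A self-contained plain-Java restatement for reference (a sketch, not part of the original file):

// pdf(x) = exp(-0.5 * sum_i ((x[i]-c[i])/r[i])^2) / prod_i (r[i] * sqrt(2*pi))
public final class DiagonalGaussianPdf {
  private DiagonalGaussianPdf() {}

  public static double pdf(double[] x, double[] center, double[] radius) {
    double exponent = 0.0;
    double norm = 1.0;
    for (int i = 0; i < x.length; i++) {
      double q = (x[i] - center[i]) / radius[i]; // radius acts as a per-dimension stddev
      exponent += q * q;
      norm *= radius[i] * Math.sqrt(2.0 * Math.PI);
    }
    return Math.exp(-exponent / 2.0) / norm;
  }
}

At x = center this reduces to 1 / prod(r[i] * sqrt(2*pi)), which matches pdf() above whenever every radius entry is non-zero.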
- */ -public class GaussianClusterDistribution extends AbstractVectorModelDistribution { - - public GaussianClusterDistribution() { - } - - public GaussianClusterDistribution(VectorWritable modelPrototype) { - super(modelPrototype); - } - - @Override - public Model[] sampleFromPrior(int howMany) { - Model[] result = new GaussianCluster[howMany]; - for (int i = 0; i < howMany; i++) { - Vector prototype = getModelPrototype().get(); - Vector mean = prototype.like(); - for (int j = 0; j < prototype.size(); j++) { - mean.set(j, UncommonDistributions.rNorm(0, 1)); - } - Vector sd = prototype.like(); - for (int j = 0; j < prototype.size(); j++) { - sd.set(j, UncommonDistributions.rNorm(1, 1)); - } - result[i] = new GaussianCluster(mean, sd, i); - } - return result; - } - - @Override - public Model[] sampleFromPosterior(Model[] posterior) { - Model[] result = new GaussianCluster[posterior.length]; - for (int i = 0; i < posterior.length; i++) { - result[i] = posterior[i].sampleFromPosterior(); - } - return result; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java deleted file mode 100644 index ff02a4c06..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.fuzzykmeans; - -import java.util.Collection; -import java.util.List; - -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; - -public class FuzzyKMeansClusterer { - - private static final double MINIMAL_VALUE = 0.0000000001; - - private double m = 2.0; // default value - - public Vector computePi(Collection clusters, List clusterDistanceList) { - Vector pi = new DenseVector(clusters.size()); - for (int i = 0; i < clusters.size(); i++) { - double probWeight = computeProbWeight(clusterDistanceList.get(i), clusterDistanceList); - pi.set(i, probWeight); - } - return pi; - } - - /** Computes the probability of a point belonging to a cluster */ - public double computeProbWeight(double clusterDistance, Iterable clusterDistanceList) { - if (clusterDistance == 0) { - clusterDistance = MINIMAL_VALUE; - } - double denom = 0.0; - for (double eachCDist : clusterDistanceList) { - if (eachCDist == 0.0) { - eachCDist = MINIMAL_VALUE; - } - denom += Math.pow(clusterDistance / eachCDist, 2.0 / (m - 1)); - } - return 1.0 / denom; - } - - public void setM(double m) { - this.m = m; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansConfigKeys.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansConfigKeys.java deleted file mode 100644 index 7e9e00c73..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansConfigKeys.java +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
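computeProbWeight implements the standard fuzzy c-means membership u_ij = 1 / sum_k (d_ij / d_ik)^(2/(m-1)); memberships of one point across all clusters sum to 1. A tiny self-contained check (a sketch, not from the original file):

public class FuzzyMembershipExample {
  public static void main(String[] args) {
    double m = 2.0;                  // the fuzziness exponent, as in FuzzyKMeansClusterer
    double[] distances = {1.0, 3.0}; // a point's distance to two cluster centers
    double total = 0.0;
    for (double di : distances) {
      double denom = 0.0;
      for (double dk : distances) {
        denom += Math.pow(di / dk, 2.0 / (m - 1));
      }
      double u = 1.0 / denom;
      total += u;
      System.out.println("membership for distance " + di + " = " + u); // 0.9, then 0.1
    }
    System.out.println("sum = " + total); // 1.0
  }
}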
- */ - -package org.apache.mahout.clustering.fuzzykmeans; - -public interface FuzzyKMeansConfigKeys { - - String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.kmeans.measure"; - - String CLUSTER_PATH_KEY = "org.apache.mahout.clustering.kmeans.path"; - - String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.kmeans.convergence"; - - String M_KEY = "org.apache.mahout.clustering.fuzzykmeans.m"; - - String EMIT_MOST_LIKELY_KEY = "org.apache.mahout.clustering.fuzzykmeans.emitMostLikely"; - - String THRESHOLD_KEY = "org.apache.mahout.clustering.fuzzykmeans.threshold"; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java deleted file mode 100644 index 39252416e..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java +++ /dev/null @@ -1,327 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.fuzzykmeans; - -import static org.apache.mahout.clustering.topdown.PathDirectory.CLUSTERED_POINTS_DIRECTORY; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.ClusterClassificationDriver; -import org.apache.mahout.clustering.classify.ClusterClassifier; -import org.apache.mahout.clustering.iterator.ClusterIterator; -import org.apache.mahout.clustering.iterator.ClusteringPolicy; -import org.apache.mahout.clustering.iterator.FuzzyKMeansClusteringPolicy; -import org.apache.mahout.clustering.kmeans.RandomSeedGenerator; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class FuzzyKMeansDriver extends AbstractJob { - - public static final String M_OPTION = "m"; - - private static final Logger log = LoggerFactory.getLogger(FuzzyKMeansDriver.class); - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new FuzzyKMeansDriver(), args); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.distanceMeasureOption().create()); - addOption(DefaultOptionCreator.clustersInOption() - .withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. " - + "If k is also specified, then a random set of vectors will be selected" - + " and written out to this path first") - .create()); - addOption(DefaultOptionCreator.numClustersOption() - .withDescription("The k in k-Means. 
If specified, then a random selection of k Vectors will be chosen" - + " as the Centroid and written to the clusters input path.").create()); - addOption(DefaultOptionCreator.convergenceOption().create()); - addOption(DefaultOptionCreator.maxIterationsOption().create()); - addOption(DefaultOptionCreator.overwriteOption().create()); - addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be greater than 1", true); - addOption(DefaultOptionCreator.clusteringOption().create()); - addOption(DefaultOptionCreator.emitMostLikelyOption().create()); - addOption(DefaultOptionCreator.thresholdOption().create()); - addOption(DefaultOptionCreator.methodOption().create()); - - if (parseArguments(args) == null) { - return -1; - } - - Path input = getInputPath(); - Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION)); - Path output = getOutputPath(); - String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); - if (measureClass == null) { - measureClass = SquaredEuclideanDistanceMeasure.class.getName(); - } - double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); - float fuzziness = Float.parseFloat(getOption(M_OPTION)); - - int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(getConf(), output); - } - boolean emitMostLikely = Boolean.parseBoolean(getOption(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION)); - double threshold = Double.parseDouble(getOption(DefaultOptionCreator.THRESHOLD_OPTION)); - DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); - - if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) { - clusters = RandomSeedGenerator.buildRandom(getConf(), - input, - clusters, - Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), - measure); - } - boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION); - boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase( - DefaultOptionCreator.SEQUENTIAL_METHOD); - run(getConf(), - input, - clusters, - output, - measure, - convergenceDelta, - maxIterations, - fuzziness, - runClustering, - emitMostLikely, - threshold, - runSequential); - return 0; - } - - /** - * Iterate over the input vectors to produce clusters and, if requested, use the - * results of the final iteration to cluster the input vectors. 
- * - * @param input - * the directory pathname for input points - * @param clustersIn - * the directory pathname for initial & computed clusters - * @param output - * the directory pathname for output points - * @param convergenceDelta - * the convergence delta value - * @param maxIterations - * the maximum number of iterations - * @param m - * the fuzzification factor, see - * http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering - * @param runClustering - * true if points are to be clustered after iterations complete - * @param emitMostLikely - * a boolean if true emit only most likely cluster for each point - * @param threshold - * a double threshold value emits all clusters having greater pdf (emitMostLikely = false) - * @param runSequential if true run in sequential execution mode - */ - public static void run(Path input, - Path clustersIn, - Path output, - DistanceMeasure measure, - double convergenceDelta, - int maxIterations, - float m, - boolean runClustering, - boolean emitMostLikely, - double threshold, - boolean runSequential) throws IOException, ClassNotFoundException, InterruptedException { - Path clustersOut = buildClusters(new Configuration(), - input, - clustersIn, - output, - measure, - convergenceDelta, - maxIterations, - m, - runSequential); - if (runClustering) { - log.info("Clustering "); - clusterData(input, - clustersOut, - output, - measure, - convergenceDelta, - m, - emitMostLikely, - threshold, - runSequential); - } - } - - /** - * Iterate over the input vectors to produce clusters and, if requested, use the - * results of the final iteration to cluster the input vectors. - * @param input - * the directory pathname for input points - * @param clustersIn - * the directory pathname for initial & computed clusters - * @param output - * the directory pathname for output points - * @param convergenceDelta - * the convergence delta value - * @param maxIterations - * the maximum number of iterations - * @param m - * the fuzzification factor, see - * http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering - * @param runClustering - * true if points are to be clustered after iterations complete - * @param emitMostLikely - * a boolean if true emit only most likely cluster for each point - * @param threshold - * a double threshold value emits all clusters having greater pdf (emitMostLikely = false) - * @param runSequential if true run in sequential execution mode - */ - public static void run(Configuration conf, - Path input, - Path clustersIn, - Path output, - DistanceMeasure measure, - double convergenceDelta, - int maxIterations, - float m, - boolean runClustering, - boolean emitMostLikely, - double threshold, - boolean runSequential) - throws IOException, ClassNotFoundException, InterruptedException { - Path clustersOut = - buildClusters(conf, input, clustersIn, output, measure, convergenceDelta, maxIterations, m, runSequential); - if (runClustering) { - log.info("Clustering"); - clusterData(input, - clustersOut, - output, - measure, - convergenceDelta, - m, - emitMostLikely, - threshold, - runSequential); - } - } - - /** - * Iterate over the input vectors to produce cluster directories for each iteration - * @param input - * the directory pathname for input points - * @param clustersIn - * the file pathname for initial cluster centers - * @param output - * the directory pathname for output points - * @param measure - * the classname of the DistanceMeasure - * @param convergenceDelta - * the convergence delta value - * @param maxIterations 
- * the maximum number of iterations - * @param m - * the fuzzification factor, see - * http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering - * @param runSequential if true run in sequential execution mode - * - * @return the Path of the final clusters directory - */ - public static Path buildClusters(Configuration conf, - Path input, - Path clustersIn, - Path output, - DistanceMeasure measure, - double convergenceDelta, - int maxIterations, - float m, - boolean runSequential) - throws IOException, InterruptedException, ClassNotFoundException { - - if (conf == null) { - conf = new Configuration(); - } - - List<Cluster> clusters = new ArrayList<Cluster>(); - FuzzyKMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters); - - if (clusters.isEmpty()) { - throw new IllegalStateException("No input clusters found in " + clustersIn + ". Check your -c argument."); - } - - Path priorClustersPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); - ClusteringPolicy policy = new FuzzyKMeansClusteringPolicy(m, convergenceDelta); - ClusterClassifier prior = new ClusterClassifier(clusters, policy); - prior.writeToSeqFiles(priorClustersPath); - - if (runSequential) { - new ClusterIterator().iterateSeq(conf, input, priorClustersPath, output, maxIterations); - } else { - new ClusterIterator().iterateMR(conf, input, priorClustersPath, output, maxIterations); - } - return output; - } - - /** - * Run the job using supplied arguments - * - * @param input - * the directory pathname for input points - * @param clustersIn - * the directory pathname for input clusters - * @param output - * the directory pathname for output points - * @param measure - * the DistanceMeasure to use - * @param convergenceDelta - * the convergence delta value - * @param m - * the fuzzification factor - * @param emitMostLikely - * if true, emit only the most likely cluster for each point - * @param threshold - * a pdf threshold; all clusters with a greater pdf are emitted (used when emitMostLikely = false) - * @param runSequential if true run in sequential execution mode - */ - public static void clusterData(Path input, - Path clustersIn, - Path output, - DistanceMeasure measure, - double convergenceDelta, - float m, - boolean emitMostLikely, - double threshold, - boolean runSequential) - throws IOException, ClassNotFoundException, InterruptedException { - - ClusterClassifier.writePolicy(new FuzzyKMeansClusteringPolicy(m, convergenceDelta), clustersIn); - ClusterClassificationDriver.run(input, output, new Path(output, CLUSTERED_POINTS_DIRECTORY), threshold, emitMostLikely, - runSequential); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java deleted file mode 100644 index 052e45e88..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership.
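For reference, a sketch of invoking the driver programmatically with the run signature above; the paths are placeholders and the parameter values are illustrative, not recommendations:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;

public class FuzzyKMeansExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FuzzyKMeansDriver.run(conf,
        new Path("vectors"),    // placeholder: SequenceFile of VectorWritable input
        new Path("clusters-0"), // placeholder: initial cluster centers
        new Path("output"),
        new SquaredEuclideanDistanceMeasure(),
        0.001,                  // convergenceDelta
        10,                     // maxIterations
        2.0f,                   // m, the fuzziness factor; must be > 1
        true,                   // runClustering: classify points after the last iteration
        true,                   // emitMostLikely
        0.0,                    // threshold, only used when emitMostLikely is false
        false);                 // runSequential: false selects the MapReduce path
  }
}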
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.fuzzykmeans; - -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.canopy.Canopy; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.clustering.kmeans.Kluster; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; - -final class FuzzyKMeansUtil { - - private FuzzyKMeansUtil() {} - - /** - * Create a list of SoftClusters from whatever type is passed in as the prior - * - * @param conf - * the Configuration - * @param clusterPath - * the path to the prior Clusters - * @param clusters - * a List to put values into - */ - public static void configureWithClusterInfo(Configuration conf, Path clusterPath, List clusters) { - for (Writable value : new SequenceFileDirValueIterable(clusterPath, PathType.LIST, - PathFilters.partFilter(), conf)) { - Class valueClass = value.getClass(); - - if (valueClass.equals(ClusterWritable.class)) { - ClusterWritable clusterWritable = (ClusterWritable) value; - value = clusterWritable.getValue(); - valueClass = value.getClass(); - } - - if (valueClass.equals(Kluster.class)) { - // get the cluster info - Kluster cluster = (Kluster) value; - clusters.add(new SoftCluster(cluster.getCenter(), cluster.getId(), cluster.getMeasure())); - } else if (valueClass.equals(SoftCluster.class)) { - // get the cluster info - clusters.add((SoftCluster) value); - } else if (valueClass.equals(Canopy.class)) { - // get the cluster info - Canopy canopy = (Canopy) value; - clusters.add(new SoftCluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure())); - } else { - throw new IllegalStateException("Bad value class: " + valueClass); - } - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java deleted file mode 100644 index 52fd764ad..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java +++ /dev/null @@ -1,60 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.fuzzykmeans; - -import org.apache.mahout.clustering.kmeans.Kluster; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -public class SoftCluster extends Kluster { - - // For Writable - public SoftCluster() {} - - /** - * Construct a new SoftCluster with the given point as its center - * - * @param center - * the center point - * @param measure - * the DistanceMeasure - */ - public SoftCluster(Vector center, int clusterId, DistanceMeasure measure) { - super(center, clusterId, measure); - } - - @Override - public String asFormatString() { - return this.getIdentifier() + ": " - + this.computeCentroid().asFormatString(); - } - - @Override - public String getIdentifier() { - return (isConverged() ? "SV-" : "SC-") + getId(); - } - - @Override - public double pdf(VectorWritable vw) { - // SoftCluster pdf cannot be calculated out of context. See - // FuzzyKMeansClusterer - throw new UnsupportedOperationException( - "SoftCluster pdf cannot be calculated out of context. See FuzzyKMeansClusterer"); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/AbstractClusteringPolicy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/AbstractClusteringPolicy.java deleted file mode 100644 index 07cc7e3c6..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/AbstractClusteringPolicy.java +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.mahout.clustering.iterator; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.List; - -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.ClusterClassifier; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.SequentialAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.function.TimesFunction; - -public abstract class AbstractClusteringPolicy implements ClusteringPolicy { - - @Override - public abstract void write(DataOutput out) throws IOException; - - @Override - public abstract void readFields(DataInput in) throws IOException; - - @Override - public Vector select(Vector probabilities) { - int maxValueIndex = probabilities.maxValueIndex(); - Vector weights = new SequentialAccessSparseVector(probabilities.size()); - weights.set(maxValueIndex, 1.0); - return weights; - } - - @Override - public void update(ClusterClassifier posterior) { - // nothing to do in general here - } - - @Override - public Vector classify(Vector data, ClusterClassifier prior) { - List<Cluster> models = prior.getModels(); - int i = 0; - Vector pdfs = new DenseVector(models.size()); - for (Cluster model : models) { - pdfs.set(i++, model.pdf(new VectorWritable(data))); - } - return pdfs.assign(new TimesFunction(), 1.0 / pdfs.zSum()); - } - - @Override - public void close(ClusterClassifier posterior) { - for (Cluster cluster : posterior.getModels()) { - cluster.computeParameters(); - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java deleted file mode 100644 index f4703a338..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java +++ /dev/null @@ -1,77 +0,0 @@ -package org.apache.mahout.clustering.iterator; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.ClusterClassifier; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.Vector.Element; -import org.apache.mahout.math.VectorWritable; - -public class CIMapper extends Mapper<WritableComparable<?>,VectorWritable,IntWritable,ClusterWritable> { - - private ClusterClassifier classifier; - - private ClusteringPolicy policy; - - /* - * (non-Javadoc) - * - * @see - * org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper - * .Context) - */ - @Override - protected void setup(Context context) throws IOException, InterruptedException { - Configuration conf = context.getConfiguration(); - String priorClustersPath = conf.get(ClusterIterator.PRIOR_PATH_KEY); - classifier = new ClusterClassifier(); - classifier.readFromSeqFiles(conf, new Path(priorClustersPath)); - policy = classifier.getPolicy(); - policy.update(classifier); - 
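// Refreshing the policy from the prior classifier here means any state the policy
// derives from the models (for example, Dirichlet mixture weights) reflects the
// previous iteration before any points are mapped.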
super.setup(context); - } - - /* - * (non-Javadoc) - * - * @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object, - * java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context) - */ - @Override - protected void map(WritableComparable key, VectorWritable value, Context context) throws IOException, - InterruptedException { - Vector probabilities = classifier.classify(value.get()); - Vector selections = policy.select(probabilities); - for (Iterator it = selections.iterateNonZero(); it.hasNext();) { - Element el = it.next(); - classifier.train(el.index(), value.get(), el.get()); - } - } - - /* - * (non-Javadoc) - * - * @see - * org.apache.hadoop.mapreduce.Mapper#cleanup(org.apache.hadoop.mapreduce. - * Mapper.Context) - */ - @Override - protected void cleanup(Context context) throws IOException, InterruptedException { - List clusters = classifier.getModels(); - ClusterWritable cw = new ClusterWritable(); - for (int index = 0; index < clusters.size(); index++) { - cw.setValue(clusters.get(index)); - context.write(new IntWritable(index), cw); - } - super.cleanup(context); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java deleted file mode 100644 index 5da7d1bf0..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.iterator; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.ClusterClassifier; - -public class CIReducer extends Reducer { - - private ClusterClassifier classifier; - private ClusteringPolicy policy; - - @Override - protected void reduce(IntWritable key, Iterable values, Context context) throws IOException, - InterruptedException { - Iterator iter = values.iterator(); - ClusterWritable first = null; - while (iter.hasNext()) { - ClusterWritable cw = iter.next(); - if (first == null) { - first = cw; - } else { - first.getValue().observe(cw.getValue()); - } - } - List models = new ArrayList(); - models.add(first.getValue()); - classifier = new ClusterClassifier(models, policy); - classifier.close(); - context.write(key, first); - } - - /* - * (non-Javadoc) - * - * @see - * org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper - * .Context) - */ - @Override - protected void setup(Context context) throws IOException, InterruptedException { - Configuration conf = context.getConfiguration(); - String priorClustersPath = conf.get(ClusterIterator.PRIOR_PATH_KEY); - classifier = new ClusterClassifier(); - classifier.readFromSeqFiles(conf, new Path(priorClustersPath)); - policy = classifier.getPolicy(); - policy.update(classifier); - super.setup(context); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java deleted file mode 100644 index 63d08e3d0..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.mahout.clustering.iterator; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.mahout.math.SequentialAccessSparseVector; -import org.apache.mahout.math.Vector; - -/** - * This is a simple maximum likelihood clustering policy, suitable for k-means - * clustering - * - */ -public class CanopyClusteringPolicy extends AbstractClusteringPolicy { - - public CanopyClusteringPolicy() { - super(); - } - - private double t1, t2; - - /* - * (non-Javadoc) - * - * @see - * org.apache.mahout.clustering.ClusteringPolicy#select(org.apache.mahout. - * math.Vector) - */ - @Override - public Vector select(Vector probabilities) { - int maxValueIndex = probabilities.maxValueIndex(); - Vector weights = new SequentialAccessSparseVector(probabilities.size()); - weights.set(maxValueIndex, 1.0); - return weights; - } - - /* - * (non-Javadoc) - * - * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) - */ - @Override - public void write(DataOutput out) throws IOException { - out.writeDouble(t1); - out.writeDouble(t2); - } - - /* - * (non-Javadoc) - * - * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) - */ - @Override - public void readFields(DataInput in) throws IOException { - this.t1 = in.readDouble(); - this.t2 = in.readDouble(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java deleted file mode 100644 index 9c5f2add5..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java +++ /dev/null @@ -1,222 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.mahout.clustering.iterator; - -import java.io.IOException; -import java.util.Iterator; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.ClusterClassifier; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -import com.google.common.io.Closeables; - -/** - * This is a clustering iterator which works with a set of Vector data and a prior ClusterClassifier which has been - * initialized with a set of models. Its implementation is algorithm-neutral and works for any iterative clustering - * algorithm (currently k-means, fuzzy-k-means and Dirichlet) that processes all the input vectors in each iteration. - * The cluster classifier is configured with a ClusteringPolicy to select the desired clustering algorithm. - */ -public class ClusterIterator { - - public static final String PRIOR_PATH_KEY = "org.apache.mahout.clustering.prior.path"; - - /** - * Iterate over data using a prior-trained ClusterClassifier, for a number of iterations - * - * @param data - * an {@code Iterable<Vector>} of input vectors - * @param classifier - * a prior ClusterClassifier - * @param numIterations - * the int number of iterations to perform - * - * @return the posterior ClusterClassifier - */ - public ClusterClassifier iterate(Iterable<Vector> data, ClusterClassifier classifier, int numIterations) { - ClusteringPolicy policy = classifier.getPolicy(); - for (int iteration = 1; iteration <= numIterations; iteration++) { - for (Vector vector : data) { - // update the policy based upon the prior - policy.update(classifier); - // classification yields probabilities - Vector probabilities = classifier.classify(vector); - // policy selects weights for models given those probabilities - Vector weights = policy.select(probabilities); - // training causes all models to observe data - for (Iterator<Vector.Element> it = weights.iterateNonZero(); it.hasNext();) { - int index = it.next().index(); - classifier.train(index, vector, weights.get(index)); - } - } - // compute the posterior models - classifier.close(); - } - return classifier; - } - - /** - * Iterate over data using a prior-trained ClusterClassifier, for a number of iterations using a sequential - * implementation - * - * @param conf - * the Configuration - * @param inPath - * a Path to input VectorWritables - * @param priorPath - * a Path to the prior classifier - * @param outPath - * a Path of output directory - * @param numIterations - * the int number of iterations to perform - * - * @throws IOException - */ - public void iterateSeq(Configuration conf, Path inPath, Path priorPath, Path outPath, int numIterations) - throws IOException { -
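- // One pass per iteration: classify each input vector against the current models, let the - // policy turn the probabilities into training weights, and train the weighted models; the - // classifier is then closed to compute the posterior models, written out for the iteration, - // and checked for convergence so the loop can stop early. -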
ClusterClassifier classifier = new ClusterClassifier(); - classifier.readFromSeqFiles(conf, priorPath); - Path clustersOut = null; - int iteration = 1; - while (iteration <= numIterations) { - for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(inPath, PathType.LIST, - PathFilters.logsCRCFilter(), conf)) { - Vector vector = vw.get(); - // classification yields probabilities - Vector probabilities = classifier.classify(vector); - // policy selects weights for models given those probabilities - Vector weights = classifier.getPolicy().select(probabilities); - // training causes all models to observe data - for (Iterator<Vector.Element> it = weights.iterateNonZero(); it.hasNext();) { - int index = it.next().index(); - classifier.train(index, vector, weights.get(index)); - } - } - // compute the posterior models - classifier.close(); - // update the policy - classifier.getPolicy().update(classifier); - // output the classifier - clustersOut = new Path(outPath, Cluster.CLUSTERS_DIR + iteration); - classifier.writeToSeqFiles(clustersOut); - FileSystem fs = FileSystem.get(outPath.toUri(), conf); - iteration++; - if (isConverged(clustersOut, conf, fs)) { - break; - } - } - Path finalClustersIn = new Path(outPath, Cluster.CLUSTERS_DIR + (iteration - 1) + Cluster.FINAL_ITERATION_SUFFIX); - FileSystem.get(clustersOut.toUri(), conf).rename(clustersOut, finalClustersIn); - } - - /** - * Iterate over data using a prior-trained ClusterClassifier, for a number of iterations using a mapreduce - * implementation - * - * @param conf - * the Configuration - * @param inPath - * a Path to input VectorWritables - * @param priorPath - * a Path to the prior classifier - * @param outPath - * a Path of output directory - * @param numIterations - * the int number of iterations to perform - */ - public void iterateMR(Configuration conf, Path inPath, Path priorPath, Path outPath, int numIterations) - throws IOException, InterruptedException, ClassNotFoundException { - ClusteringPolicy policy = ClusterClassifier.readPolicy(priorPath); - Path clustersOut = null; - int iteration = 1; - while (iteration <= numIterations) { - conf.set(PRIOR_PATH_KEY, priorPath.toString()); - - String jobName = "Cluster Iterator running iteration " + iteration + " over priorPath: " + priorPath; - System.out.println(jobName); - Job job = new Job(conf, jobName); - job.setMapOutputKeyClass(IntWritable.class); - job.setMapOutputValueClass(ClusterWritable.class); - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(ClusterWritable.class); - - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setMapperClass(CIMapper.class); - job.setReducerClass(CIReducer.class); - - FileInputFormat.addInputPath(job, inPath); - clustersOut = new Path(outPath, Cluster.CLUSTERS_DIR + iteration); - priorPath = clustersOut; - FileOutputFormat.setOutputPath(job, clustersOut); - - job.setJarByClass(ClusterIterator.class); - if (!job.waitForCompletion(true)) { - throw new InterruptedException("Cluster Iteration " + iteration + " failed processing " + priorPath); - } - ClusterClassifier.writePolicy(policy, clustersOut); - FileSystem fs = FileSystem.get(outPath.toUri(), conf); - iteration++; - if (isConverged(clustersOut, conf, fs)) { - break; - } - } - Path finalClustersIn = new Path(outPath, Cluster.CLUSTERS_DIR + (iteration - 1) + Cluster.FINAL_ITERATION_SUFFIX); - FileSystem.get(clustersOut.toUri(), conf).rename(clustersOut, finalClustersIn); - } -
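- // Illustrative use (the identifiers here are placeholders, not values from this file): - // drive ten map-reduce iterations over the vectors under inPath, starting from the prior - // classifier written under priorPath: - //   new ClusterIterator().iterateMR(conf, inPath, priorPath, outPath, 10); -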
- /** - * Return whether all of the Clusters in the parts under the given filePath have converged - * - * @param filePath - * the path to the directory containing the part files written for the iteration - * @return true if all Clusters are converged - * @throws IOException - * if there was an IO error - */ - private boolean isConverged(Path filePath, Configuration conf, FileSystem fs) throws IOException { - for (FileStatus part : fs.listStatus(filePath, PathFilters.partFilter())) { - SequenceFileValueIterator<ClusterWritable> iterator = new SequenceFileValueIterator<ClusterWritable>( - part.getPath(), true, conf); - while (iterator.hasNext()) { - ClusterWritable value = iterator.next(); - if (!value.getValue().isConverged()) { - Closeables.closeQuietly(iterator); - return false; - } - } - } - return true; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/ClusterWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/ClusterWritable.java deleted file mode 100644 index fabfcf7d9..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/ClusterWritable.java +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ -package org.apache.mahout.clustering.iterator; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.classifier.sgd.PolymorphicWritable; -import org.apache.mahout.clustering.Cluster; - -public class ClusterWritable implements Writable { - - private Cluster value; - - public Cluster getValue() { - return value; - } - - public void setValue(Cluster value) { - this.value = value; - } - - @Override - public void write(DataOutput out) throws IOException { - PolymorphicWritable.write(out, value); - } - - @Override - public void readFields(DataInput in) throws IOException { - value = PolymorphicWritable.read(in, Cluster.class); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java deleted file mode 100644 index ce9fce86c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.mahout.clustering.iterator; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.clustering.classify.ClusterClassifier; -import org.apache.mahout.math.Vector; - -/** - * A ClusteringPolicy captures the semantics of assignment of points to clusters - * - */ -public interface ClusteringPolicy extends Writable { - - /** - * Classify the data vector given the classifier's models - * - * @param data - * a data Vector - * @param prior - * a prior ClusterClassifier - * @return a Vector of probabilities that the data is described by each of the - * models - */ - public Vector classify(Vector data, ClusterClassifier prior); - - /** - * Return a vector of weights for each of the models given those probabilities - * - * @param probabilities - * a Vector of pdfs - * @return a Vector of weights - */ - public Vector select(Vector probabilities); - - /** - * Update the policy with the given classifier - * - * @param posterior - * a ClusterClassifier - */ - public void update(ClusterClassifier posterior); - - /** - * Close the policy using the classifier's models - * - * @param posterior - * a posterior ClusterClassifier - */ - public void close(ClusterClassifier posterior); - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicyWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicyWritable.java deleted file mode 100644 index f69442d88..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicyWritable.java +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.mahout.clustering.iterator; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.classifier.sgd.PolymorphicWritable; - -public class ClusteringPolicyWritable implements Writable { - - private ClusteringPolicy value; - - public ClusteringPolicyWritable(ClusteringPolicy policy) { - this.value = policy; - } - - public ClusteringPolicyWritable() { - } - - public ClusteringPolicy getValue() { - return value; - } - - public void setValue(ClusteringPolicy value) { - this.value = value; - } - - @Override - public void write(DataOutput out) throws IOException { - PolymorphicWritable.write(out, value); - } - - @Override - public void readFields(DataInput in) throws IOException { - value = PolymorphicWritable.read(in, ClusteringPolicy.class); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/DirichletClusteringPolicy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/DirichletClusteringPolicy.java deleted file mode 100644 index 78123da76..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/DirichletClusteringPolicy.java +++ /dev/null @@ -1,108 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.clustering.iterator; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.mahout.clustering.classify.ClusterClassifier; -import org.apache.mahout.clustering.dirichlet.UncommonDistributions; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.SequentialAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -public class DirichletClusteringPolicy extends AbstractClusteringPolicy { - - public DirichletClusteringPolicy() { - super(); - } - - /** - * - * @param k - * The number of models to create from prior - * @param alpha0 - * The alpha_0 parameter to the Dirichlet Distribution. 
- */ - public DirichletClusteringPolicy(int k, double alpha0) { - this.alpha0 = alpha0; - this.mixture = UncommonDistributions.rDirichlet(new DenseVector(k), alpha0); - } - - // The mixture is the Dirichlet distribution of the total Cluster counts over - // all iterations - private Vector mixture; - - // Alpha_0 primes the Dirichlet distribution - private double alpha0; - - /* - * (non-Javadoc) - * - * @see - * org.apache.mahout.clustering.ClusteringPolicy#select(org.apache.mahout. - * math.Vector) - */ - @Override - public Vector select(Vector probabilities) { - int rMultinom = UncommonDistributions.rMultinom(probabilities.times(mixture)); - Vector weights = new SequentialAccessSparseVector(probabilities.size()); - weights.set(rMultinom, 1.0); - return weights; - } - - // update the total counts and then the mixture - /* - * (non-Javadoc) - * - * @see - * org.apache.mahout.clustering.ClusteringPolicy#update(org.apache.mahout. - * clustering.ClusterClassifier) - */ - @Override - public void update(ClusterClassifier prior) { - Vector totalCounts = new DenseVector(prior.getModels().size()); - for (int i = 0; i < prior.getModels().size(); i++) { - totalCounts.set(i, prior.getModels().get(i).getTotalObservations()); - } - mixture = UncommonDistributions.rDirichlet(totalCounts, alpha0); - } - - /* - * (non-Javadoc) - * - * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) - */ - @Override - public void write(DataOutput out) throws IOException { - out.writeDouble(alpha0); - VectorWritable.writeVector(out, mixture); - } - - /* - * (non-Javadoc) - * - * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) - */ - @Override - public void readFields(DataInput in) throws IOException { - this.alpha0 = in.readDouble(); - this.mixture = VectorWritable.readVector(in); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/DistanceMeasureCluster.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/DistanceMeasureCluster.java deleted file mode 100644 index f61aa2737..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/DistanceMeasureCluster.java +++ /dev/null @@ -1,91 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.iterator; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.mahout.clustering.AbstractCluster; -import org.apache.mahout.clustering.Model; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -public class DistanceMeasureCluster extends AbstractCluster { - - private DistanceMeasure measure; - - public DistanceMeasureCluster(Vector point, int id, DistanceMeasure measure) { - super(point, id); - this.measure = measure; - } - - public DistanceMeasureCluster() { - } - - @Override - public void configure(Configuration job) { - if (measure != null) { - measure.configure(job); - } - } - - @Override - public void readFields(DataInput in) throws IOException { - String dm = in.readUTF(); - this.measure = ClassUtils.instantiateAs(dm, DistanceMeasure.class); - super.readFields(in); - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeUTF(measure.getClass().getName()); - super.write(out); - } - - @Override - public double pdf(VectorWritable vw) { - return 1 / (1 + measure.distance(vw.get(), getCenter())); - } - - @Override - public Model<VectorWritable> sampleFromPosterior() { - return new DistanceMeasureCluster(getCenter(), getId(), measure); - } - - public DistanceMeasure getMeasure() { - return measure; - } - - /** - * @param measure - * the measure to set - */ - public void setMeasure(DistanceMeasure measure) { - this.measure = measure; - } - - @Override - public String getIdentifier() { - return "DMC:" + getId(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java deleted file mode 100644 index 84f449f6b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java +++ /dev/null @@ -1,109 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ -package org.apache.mahout.clustering.iterator; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.Collection; -import java.util.List; - -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.ClusterClassifier; -import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansClusterer; -import org.apache.mahout.clustering.fuzzykmeans.SoftCluster; -import org.apache.mahout.math.Vector; - -import com.google.common.collect.Lists; - -/** - * This is a probability-weighted clustering policy, suitable for fuzzy k-means - * clustering - * - */ -public class FuzzyKMeansClusteringPolicy extends AbstractClusteringPolicy { - - public FuzzyKMeansClusteringPolicy() { - super(); - } - - private double m = 2; - - private double convergenceDelta = 0.05; - - public FuzzyKMeansClusteringPolicy(double m, double convergenceDelta) { - this.m = m; - this.convergenceDelta = convergenceDelta; - } - - /* - * (non-Javadoc) - * - * @see - * org.apache.mahout.clustering.ClusteringPolicy#select(org.apache.mahout. - * math.Vector) - */ - @Override - public Vector select(Vector probabilities) { - return probabilities; - } - - @Override - public Vector classify(Vector data, ClusterClassifier prior) { - Collection<SoftCluster> clusters = Lists.newArrayList(); - List<Double> distances = Lists.newArrayList(); - for (Cluster model : prior.getModels()) { - SoftCluster sc = (SoftCluster) model; - clusters.add(sc); - distances.add(sc.getMeasure().distance(data, sc.getCenter())); - } - FuzzyKMeansClusterer fuzzyKMeansClusterer = new FuzzyKMeansClusterer(); - fuzzyKMeansClusterer.setM(m); - return fuzzyKMeansClusterer.computePi(clusters, distances); - } - - /* - * (non-Javadoc) - * - * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) - */ - @Override - public void write(DataOutput out) throws IOException { - out.writeDouble(m); - out.writeDouble(convergenceDelta); - } - - /* - * (non-Javadoc) - * - * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) - */ - @Override - public void readFields(DataInput in) throws IOException { - this.m = in.readDouble(); - this.convergenceDelta = in.readDouble(); - } - - @Override - public void close(ClusterClassifier posterior) { - for (Cluster cluster : posterior.getModels()) { - ((org.apache.mahout.clustering.kmeans.Kluster) cluster).calculateConvergence(convergenceDelta); - cluster.computeParameters(); - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java deleted file mode 100644 index b809210bb..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.clustering.iterator; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.ClusterClassifier; - -/** - * This is a simple maximum likelihood clustering policy, suitable for k-means - * clustering - * - */ -public class KMeansClusteringPolicy extends AbstractClusteringPolicy { - - public KMeansClusteringPolicy() { - super(); - } - - public KMeansClusteringPolicy(double convergenceDelta) { - super(); - this.convergenceDelta = convergenceDelta; - } - - private double convergenceDelta = 0.001; - - /* - * (non-Javadoc) - * - * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) - */ - @Override - public void write(DataOutput out) throws IOException { - out.writeDouble(convergenceDelta); - } - - /* - * (non-Javadoc) - * - * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) - */ - @Override - public void readFields(DataInput in) throws IOException { - this.convergenceDelta = in.readDouble(); - } - - @Override - public void close(ClusterClassifier posterior) { - boolean allConverged = true; - for (Cluster cluster : posterior.getModels()) { - org.apache.mahout.clustering.kmeans.Kluster kluster = (org.apache.mahout.clustering.kmeans.Kluster) cluster; - boolean converged = kluster.calculateConvergence(convergenceDelta); - allConverged = allConverged && converged; - cluster.computeParameters(); - } - - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/MeanShiftClusteringPolicy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/MeanShiftClusteringPolicy.java deleted file mode 100644 index 051bfa131..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/iterator/MeanShiftClusteringPolicy.java +++ /dev/null @@ -1,62 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.mahout.clustering.iterator; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * This is a simple maximum likelihood clustering policy, suitable for mean-shift - * clustering - * - */ -public class MeanShiftClusteringPolicy extends AbstractClusteringPolicy { - - public MeanShiftClusteringPolicy() { - super(); - } - - private double t1, t2, t3, t4; - - /* - * (non-Javadoc) - * - * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) - */ - @Override - public void write(DataOutput out) throws IOException { - out.writeDouble(t1); - out.writeDouble(t2); - out.writeDouble(t3); - out.writeDouble(t4); - } - - /* - * (non-Javadoc) - * - * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) - */ - @Override - public void readFields(DataInput in) throws IOException { - this.t1 = in.readDouble(); - this.t2 = in.readDouble(); - this.t3 = in.readDouble(); - this.t4 = in.readDouble(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java deleted file mode 100644 index 6c02b1b6e..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.kmeans; - -/** - * This interface holds all config keys that are used in the KMeans MapReduce configuration. - */ -public interface KMeansConfigKeys { - /** Configuration key for distance measure to use. */ - String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.kmeans.measure"; - /** Configuration key for convergence threshold.
*/ - String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.kmeans.convergence"; - /** Configuration key for iteration cluster path */ - String CLUSTER_PATH_KEY = "org.apache.mahout.clustering.kmeans.path"; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java deleted file mode 100644 index 9ac38b942..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java +++ /dev/null @@ -1,264 +0,0 @@ -/* Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.clustering.kmeans; - -import static org.apache.mahout.clustering.topdown.PathDirectory.CLUSTERED_POINTS_DIRECTORY; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.ClusterClassificationDriver; -import org.apache.mahout.clustering.classify.ClusterClassifier; -import org.apache.mahout.clustering.iterator.ClusterIterator; -import org.apache.mahout.clustering.iterator.ClusteringPolicy; -import org.apache.mahout.clustering.iterator.KMeansClusteringPolicy; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class KMeansDriver extends AbstractJob { - - private static final Logger log = LoggerFactory.getLogger(KMeansDriver.class); - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new KMeansDriver(), args); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.distanceMeasureOption().create()); - addOption(DefaultOptionCreator - .clustersInOption() - .withDescription( - "The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. 
" - + "If k is also specified, then a random set of vectors will be selected" - + " and written out to this path first").create()); - addOption(DefaultOptionCreator - .numClustersOption() - .withDescription( - "The k in k-Means. If specified, then a random selection of k Vectors will be chosen" - + " as the Centroid and written to the clusters input path.").create()); - addOption(DefaultOptionCreator.convergenceOption().create()); - addOption(DefaultOptionCreator.maxIterationsOption().create()); - addOption(DefaultOptionCreator.overwriteOption().create()); - addOption(DefaultOptionCreator.clusteringOption().create()); - addOption(DefaultOptionCreator.methodOption().create()); - addOption(DefaultOptionCreator.outlierThresholdOption().create()); - - if (parseArguments(args) == null) { - return -1; - } - - Path input = getInputPath(); - Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION)); - Path output = getOutputPath(); - String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); - if (measureClass == null) { - measureClass = SquaredEuclideanDistanceMeasure.class.getName(); - } - double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); - int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(getConf(), output); - } - DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); - - if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) { - clusters = RandomSeedGenerator.buildRandom(getConf(), input, clusters, - Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure); - } - boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION); - boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase( - DefaultOptionCreator.SEQUENTIAL_METHOD); - if (getConf() == null) { - setConf(new Configuration()); - } - double clusterClassificationThreshold = 0.0; - if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) { - clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD)); - } - run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering, - clusterClassificationThreshold, runSequential); - return 0; - } - - /** - * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to - * cluster the input vectors. - * - * @param input - * the directory pathname for input points - * @param clustersIn - * the directory pathname for initial & computed clusters - * @param output - * the directory pathname for output points - * @param measure - * the DistanceMeasure to use - * @param convergenceDelta - * the convergence delta value - * @param maxIterations - * the maximum number of iterations - * @param runClustering - * true if points are to be clustered after iterations are completed - * @param clusterClassificationThreshold - * Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors - * having pdf below this value will not be clustered. 
- * @param runSequential - * if true execute sequential algorithm - */ - public static void run(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure, - double convergenceDelta, int maxIterations, boolean runClustering, double clusterClassificationThreshold, - boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { - - // iterate until the clusters converge - String delta = Double.toString(convergenceDelta); - if (log.isInfoEnabled()) { - log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output, - measure.getClass().getName()}); - log.info("convergence: {} max Iterations: {} Input Vectors: {}", new Object[] { - convergenceDelta, maxIterations, VectorWritable.class.getName()}); - } - Path clustersOut = buildClusters(conf, input, clustersIn, output, measure, maxIterations, delta, runSequential); - if (runClustering) { - log.info("Clustering data"); - clusterData(conf, input, clustersOut, output, measure, clusterClassificationThreshold, runSequential); - } - } -
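- // Illustrative call (the argument values are examples, not defaults taken from this class): - //   KMeansDriver.run(conf, input, clusters, output, new SquaredEuclideanDistanceMeasure(), - //       0.001, 10, true, 0.0, false); -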
- /** - * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to - * cluster the input vectors. - * - * @param input - * the directory pathname for input points - * @param clustersIn - * the directory pathname for initial & computed clusters - * @param output - * the directory pathname for output points - * @param measure - * the DistanceMeasure to use - * @param convergenceDelta - * the convergence delta value - * @param maxIterations - * the maximum number of iterations - * @param runClustering - * true if points are to be clustered after iterations are completed - * @param clusterClassificationThreshold - * Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors - * having pdf below this value will not be clustered. - * @param runSequential - * if true execute sequential algorithm - */ - public static void run(Path input, Path clustersIn, Path output, DistanceMeasure measure, double convergenceDelta, - int maxIterations, boolean runClustering, double clusterClassificationThreshold, boolean runSequential) - throws IOException, InterruptedException, ClassNotFoundException { - run(new Configuration(), input, clustersIn, output, measure, convergenceDelta, maxIterations, runClustering, - clusterClassificationThreshold, runSequential); - } - - /** - * Iterate over the input vectors to produce cluster directories for each iteration - * - * @param conf - * the Configuration to use - * @param input - * the directory pathname for input points - * @param clustersIn - * the directory pathname for initial & computed clusters - * @param output - * the directory pathname for output points - * @param measure - * the DistanceMeasure to use - * @param maxIterations - * the maximum number of iterations - * @param delta - * the convergence delta value - * @param runSequential - * if true execute sequential algorithm - * - * @return the Path of the final clusters directory - */ - public static Path buildClusters(Configuration conf, Path input, Path clustersIn, Path output, - DistanceMeasure measure, int maxIterations, String delta, boolean runSequential) throws IOException, - InterruptedException, ClassNotFoundException { - - double convergenceDelta = Double.parseDouble(delta); - List<Cluster> clusters = new ArrayList<Cluster>(); - KMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters); - - if (clusters.isEmpty()) { - throw new IllegalStateException("No input clusters found in " + clustersIn + ". Check your -c argument."); - } - - Path priorClustersPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); - ClusteringPolicy policy = new KMeansClusteringPolicy(convergenceDelta); - ClusterClassifier prior = new ClusterClassifier(clusters, policy); - prior.writeToSeqFiles(priorClustersPath); - - if (runSequential) { - new ClusterIterator().iterateSeq(conf, input, priorClustersPath, output, maxIterations); - } else { - new ClusterIterator().iterateMR(conf, input, priorClustersPath, output, maxIterations); - } - return output; - } - - /** - * Run the job using supplied arguments - * - * @param input - * the directory pathname for input points - * @param clustersIn - * the directory pathname for input clusters - * @param output - * the directory pathname for output points - * @param measure - * the DistanceMeasure to use - * @param clusterClassificationThreshold - * Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors - * having pdf below this value will not be clustered.
- * @param runSequential - * if true execute sequential algorithm - */ - public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure, - double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException, - ClassNotFoundException { - - if (log.isInfoEnabled()) { - log.info("Running Clustering"); - log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output, measure}); - } - ClusterClassifier.writePolicy(new KMeansClusteringPolicy(), clustersIn); - ClusterClassificationDriver.run(input, output, new Path(output, CLUSTERED_POINTS_DIRECTORY), - clusterClassificationThreshold, true, runSequential); - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java deleted file mode 100644 index b3ca50730..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.kmeans; - -import java.util.Collection; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.canopy.Canopy; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -final class KMeansUtil { - - private static final Logger log = LoggerFactory.getLogger(KMeansUtil.class); - - private KMeansUtil() {} - - /** - * Create a list of Klusters from whatever Cluster type is passed in as the prior - * - * @param conf - * the Configuration - * @param clusterPath - * the path to the prior Clusters - * @param clusters - * a Collection to put values into - */ - public static void configureWithClusterInfo(Configuration conf, Path clusterPath, Collection<Cluster> clusters) { - for (Writable value : new SequenceFileDirValueIterable<Writable>(clusterPath, PathType.LIST, - PathFilters.partFilter(), conf)) { - Class<? extends Writable> valueClass = value.getClass(); - if (valueClass.equals(ClusterWritable.class)) { - ClusterWritable clusterWritable = (ClusterWritable) value; - value = clusterWritable.getValue(); - valueClass = value.getClass(); - } - log.debug("Read 1 Cluster from {}", clusterPath); - - if (valueClass.equals(Kluster.class)) { - // get the cluster info - clusters.add((Kluster) value); - } else if (valueClass.equals(Canopy.class)) { - // get the cluster info - Canopy canopy = (Canopy) value; - clusters.add(new Kluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure())); - } else { - throw new IllegalStateException("Bad value class: " + valueClass); - } - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java deleted file mode 100644 index a29d35541..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java +++ /dev/null @@ -1,116 +0,0 @@ -/* Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ -package org.apache.mahout.clustering.kmeans; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.mahout.clustering.iterator.DistanceMeasureCluster; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.Vector; - -public class Kluster extends DistanceMeasureCluster { - - /** Has the centroid converged with the center? */ - private boolean converged; - - /** For (de)serialization as a Writable */ - public Kluster() {} - - /** - * Construct a new cluster with the given point as its center - * - * @param center - * the Vector center - * @param clusterId - * the int cluster id - * @param measure - * a DistanceMeasure - */ - public Kluster(Vector center, int clusterId, DistanceMeasure measure) { - super(center, clusterId, measure); - } - - /** - * Format the cluster for output - * - * @param cluster - * the Cluster - * @return the String representation of the Cluster - */ - public static String formatCluster(Kluster cluster) { - return cluster.getIdentifier() + ": " + cluster.computeCentroid().asFormatString(); - } - - public String asFormatString() { - return formatCluster(this); - } - - @Override - public void write(DataOutput out) throws IOException { - super.write(out); - out.writeBoolean(converged); - } - - @Override - public void readFields(DataInput in) throws IOException { - super.readFields(in); - this.converged = in.readBoolean(); - } - - @Override - public String toString() { - return asFormatString(null); - } - - @Override - public String getIdentifier() { - return (converged ? "VL-" : "CL-") + getId(); - } - - /** - * Return if the cluster is converged by comparing its center and centroid. - * - * @param measure - * The distance measure to use for cluster-point comparisons. - * @param convergenceDelta - * the convergence delta to use for stopping. - * @return if the cluster is converged - */ - public boolean computeConvergence(DistanceMeasure measure, double convergenceDelta) { - Vector centroid = computeCentroid(); - converged = measure.distance(centroid.getLengthSquared(), centroid, getCenter()) <= convergenceDelta; - return converged; - } - - @Override - public boolean isConverged() { - return converged; - } - - protected void setConverged(boolean converged) { - this.converged = converged; - } - - public boolean calculateConvergence(double convergenceDelta) { - Vector centroid = computeCentroid(); - converged = getMeasure().distance(centroid.getLengthSquared(), centroid, getCenter()) <= convergenceDelta; - return converged; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java deleted file mode 100644 index 1e7f2ee54..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java +++ /dev/null @@ -1,126 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.kmeans; - -import java.io.IOException; -import java.util.List; -import java.util.Random; - -import com.google.common.collect.Lists; -import com.google.common.io.Closeables; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Given an Input Path containing a {@link org.apache.hadoop.io.SequenceFile}, randomly select k vectors and - * write them to the output file as {@link org.apache.mahout.clustering.kmeans.Kluster}s representing the - * initial centroids to use.
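- * Selection is done in a single pass over the input, randomly replacing earlier choices as more - * candidates are seen, so the full input never has to be held in memory.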
- */ -public final class RandomSeedGenerator { - - private static final Logger log = LoggerFactory.getLogger(RandomSeedGenerator.class); - - public static final String K = "k"; - - private RandomSeedGenerator() { - } - - public static Path buildRandom(Configuration conf, - Path input, - Path output, - int k, - DistanceMeasure measure) throws IOException { - // delete the output directory - FileSystem fs = FileSystem.get(output.toUri(), conf); - HadoopUtil.delete(conf, output); - Path outFile = new Path(output, "part-randomSeed"); - boolean newFile = fs.createNewFile(outFile); - if (newFile) { - Path inputPathPattern; - - if (fs.getFileStatus(input).isDir()) { - inputPathPattern = new Path(input, "*"); - } else { - inputPathPattern = input; - } - - FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter()); - SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, ClusterWritable.class); - Random random = RandomUtils.getRandom(); - List<Text> chosenTexts = Lists.newArrayListWithCapacity(k); - List<ClusterWritable> chosenClusters = Lists.newArrayListWithCapacity(k); - int nextClusterId = 0; - - for (FileStatus fileStatus : inputFiles) { - if (fileStatus.isDir()) { - continue; - } - for (Pair<Writable,VectorWritable> record - : new SequenceFileIterable<Writable,VectorWritable>(fileStatus.getPath(), true, conf)) { - Writable key = record.getFirst(); - VectorWritable value = record.getSecond(); - Kluster newCluster = new Kluster(value.get(), nextClusterId++, measure); - newCluster.observe(value.get(), 1); - Text newText = new Text(key.toString()); - int currentSize = chosenTexts.size(); - if (currentSize < k) { - chosenTexts.add(newText); - ClusterWritable clusterWritable = new ClusterWritable(); - clusterWritable.setValue(newCluster); - chosenClusters.add(clusterWritable); - } else if (random.nextInt(currentSize + 1) != 0) { // with probability currentSize/(currentSize+1) replace one of the chosen elements - int indexToRemove = random.nextInt(currentSize); // evict one chosen randomly - chosenTexts.remove(indexToRemove); - chosenClusters.remove(indexToRemove); - chosenTexts.add(newText); - ClusterWritable clusterWritable = new ClusterWritable(); - clusterWritable.setValue(newCluster); - chosenClusters.add(clusterWritable); - } - } - } - - try { - for (int i = 0; i < chosenTexts.size(); i++) { - writer.append(chosenTexts.get(i), chosenClusters.get(i)); - } - log.info("Wrote {} Klusters to {}", k, outFile); - } finally { - Closeables.closeQuietly(writer); - } - } - - return outFile; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/package-info.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/package-info.java deleted file mode 100644 index 43f655ad0..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/kmeans/package-info.java +++ /dev/null @@ -1,5 +0,0 @@ -/** - * This package provides an implementation of the k-means clustering - * algorithm.
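- * The main entry point is {@link KMeansDriver}, which optionally seeds an initial set of - * {@link Kluster}s, iterates until they converge or a maximum number of iterations is reached, - * and can then classify the input points against the final clusters.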
- */ -package org.apache.mahout.clustering.kmeans; \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0DocInferenceMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0DocInferenceMapper.java deleted file mode 100644 index d60d99896..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0DocInferenceMapper.java +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.clustering.lda.cvb; - -import org.apache.hadoop.io.IntWritable; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.SparseRowMatrix; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -import java.io.IOException; - -public class CVB0DocInferenceMapper extends CachingCVB0Mapper { - - @Override - public void map(IntWritable docId, VectorWritable doc, Context context) - throws IOException, InterruptedException { - int numTopics = getNumTopics(); - Vector docTopics = new DenseVector(new double[numTopics]).assign(1.0 /numTopics); - Matrix docModel = new SparseRowMatrix(numTopics, doc.get().size()); - int maxIters = getMaxIters(); - ModelTrainer modelTrainer = getModelTrainer(); - for (int i = 0; i < maxIters; i++) { - modelTrainer.getReadModel().trainDocTopicModel(doc.get(), docTopics, docModel); - } - context.write(docId, new VectorWritable(docTopics)); - } - - @Override - protected void cleanup(Context context) { - getModelTrainer().stop(); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0Driver.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0Driver.java deleted file mode 100644 index a1e0aff9c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0Driver.java +++ /dev/null @@ -1,552 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.clustering.lda.cvb; - -import com.google.common.base.Joiner; -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.filecache.DistributedCache; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.DoubleWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import org.apache.mahout.common.mapreduce.VectorSumReducer; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.net.URI; -import java.util.List; - -/** - * See {@link CachingCVB0Mapper} for more details on scalability and room for improvement. - * To try out this LDA implementation without using Hadoop, check out - * {@link InMemoryCollapsedVariationalBayes0}. If you want to do training directly in java code - * with your own main(), then look to {@link ModelTrainer} and {@link TopicModel}. - * - * Usage: {@code ./bin/mahout cvb options} - *

- * Valid options include:
- * <dl>
- * <dt>{@code --input path}</dt>
- * <dd>Input path for {@code SequenceFile} document vectors. See
- * {@link SparseVectorsFromSequenceFiles} for details on how to generate this input format.</dd>
- * <dt>{@code --dictionary path}</dt>
- * <dd>Path to dictionary file(s) generated during construction of input document vectors (glob
- * expression supported). If set, this data is scanned to determine an appropriate value for option
- * {@code --num_terms}.</dd>
- * <dt>{@code --output path}</dt>
- * <dd>Output path for topic-term distributions.</dd>
- * <dt>{@code --doc_topic_output path}</dt>
- * <dd>Output path for doc-topic distributions.</dd>
- * <dt>{@code --num_topics k}</dt>
- * <dd>Number of latent topics.</dd>
- * <dt>{@code --num_terms nt}</dt>
- * <dd>Number of unique features defined by input document vectors. If option {@code --dictionary}
- * is defined and this option is unspecified, term count is calculated from dictionary.</dd>
- * <dt>{@code --topic_model_temp_dir path}</dt>
- * <dd>Path in which to store model state after each iteration.</dd>
- * <dt>{@code --maxIter i}</dt>
- * <dd>Maximum number of iterations to perform. If this value is less than or equal to the number of
- * iteration states found beneath the path specified by option {@code --topic_model_temp_dir}, no
- * further iterations are performed. Instead, output topic-term and doc-topic distributions are
- * generated using data from the specified iteration.</dd>
- * <dt>{@code --max_doc_topic_iters i}</dt>
- * <dd>Maximum number of iterations per doc for p(topic|doc) learning. Defaults to {@code 10}.</dd>
- * <dt>{@code --doc_topic_smoothing a}</dt>
- * <dd>Smoothing for doc-topic distribution. Defaults to {@code 0.0001}.</dd>
- * <dt>{@code --term_topic_smoothing e}</dt>
- * <dd>Smoothing for topic-term distribution. Defaults to {@code 0.0001}.</dd>
- * <dt>{@code --random_seed seed}</dt>
- * <dd>Integer seed for random number generation.</dd>
- * <dt>{@code --test_set_percentage p}</dt>
- * <dd>Fraction of data to hold out for testing. Defaults to {@code 0.0}.</dd>
- * <dt>{@code --iteration_block_size block}</dt>
- * <dd>Number of iterations between perplexity checks. Defaults to {@code 10}. This option is
- * ignored unless option {@code --test_set_percentage} is greater than zero.</dd>
- * </dl>
- *
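An illustrative invocation using the options documented above (all paths and values are placeholders, not taken from the original source):

    ./bin/mahout cvb --input /vectors/tfidf-vectors \
        --dictionary /vectors/dictionary.file-0 \
        --output /lda/topic-term --doc_topic_output /lda/doc-topic \
        --num_topics 20 --maxIter 25 --test_set_percentage 0.1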
- */ -public class CVB0Driver extends AbstractJob { - private static final Logger log = LoggerFactory.getLogger(CVB0Driver.class); - - public static final String NUM_TOPICS = "num_topics"; - public static final String NUM_TERMS = "num_terms"; - public static final String DOC_TOPIC_SMOOTHING = "doc_topic_smoothing"; - public static final String TERM_TOPIC_SMOOTHING = "term_topic_smoothing"; - public static final String DICTIONARY = "dictionary"; - public static final String DOC_TOPIC_OUTPUT = "doc_topic_output"; - public static final String MODEL_TEMP_DIR = "topic_model_temp_dir"; - public static final String ITERATION_BLOCK_SIZE = "iteration_block_size"; - public static final String RANDOM_SEED = "random_seed"; - public static final String TEST_SET_FRACTION = "test_set_fraction"; - public static final String NUM_TRAIN_THREADS = "num_train_threads"; - public static final String NUM_UPDATE_THREADS = "num_update_threads"; - public static final String MAX_ITERATIONS_PER_DOC = "max_doc_topic_iters"; - public static final String MODEL_WEIGHT = "prev_iter_mult"; - public static final String NUM_REDUCE_TASKS = "num_reduce_tasks"; - public static final String BACKFILL_PERPLEXITY = "backfill_perplexity"; - private static final String MODEL_PATHS = "mahout.lda.cvb.modelPath"; - - @Override - public int run(String[] args) throws Exception { - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.maxIterationsOption().create()); - addOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION, "cd", "The convergence delta value", "0"); - addOption(DefaultOptionCreator.overwriteOption().create()); - - addOption(NUM_TOPICS, "k", "Number of topics to learn", true); - addOption(NUM_TERMS, "nt", "Vocabulary size", false); - addOption(DOC_TOPIC_SMOOTHING, "a", "Smoothing for document/topic distribution", "0.0001"); - addOption(TERM_TOPIC_SMOOTHING, "e", "Smoothing for topic/term distribution", "0.0001"); - addOption(DICTIONARY, "dict", "Path to term-dictionary file(s) (glob expression supported)", - false); - addOption(DOC_TOPIC_OUTPUT, "dt", "Output path for the training doc/topic distribution", - false); - addOption(MODEL_TEMP_DIR, "mt", "Path to intermediate model path (useful for restarting)", - false); - addOption(ITERATION_BLOCK_SIZE, "block", "Number of iterations per perplexity check", "10"); - addOption(RANDOM_SEED, "seed", "Random seed", false); - addOption(TEST_SET_FRACTION, "tf", "Fraction of data to hold out for testing", "0"); - addOption(NUM_TRAIN_THREADS, "ntt", "number of threads per mapper to train with", "4"); - addOption(NUM_UPDATE_THREADS, "nut", "number of threads per mapper to update the model with", - "1"); - addOption(MAX_ITERATIONS_PER_DOC, "mipd", - "max number of iterations per doc for p(topic|doc) learning", "10"); - addOption(NUM_REDUCE_TASKS, null, - "number of reducers to use during model estimation", "10"); - addOption(buildOption(BACKFILL_PERPLEXITY, null, - "enable backfilling of missing perplexity values", false, false, null)); - - if (parseArguments(args) == null) { - return -1; - } - - int numTopics = Integer.parseInt(getOption(NUM_TOPICS)); - Path inputPath = getInputPath(); - Path topicModelOutputPath = getOutputPath(); - int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); - int iterationBlockSize = Integer.parseInt(getOption(ITERATION_BLOCK_SIZE)); - double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); - double alpha = 
Double.parseDouble(getOption(DOC_TOPIC_SMOOTHING)); - double eta = Double.parseDouble(getOption(TERM_TOPIC_SMOOTHING)); - int numTrainThreads = Integer.parseInt(getOption(NUM_TRAIN_THREADS)); - int numUpdateThreads = Integer.parseInt(getOption(NUM_UPDATE_THREADS)); - int maxItersPerDoc = Integer.parseInt(getOption(MAX_ITERATIONS_PER_DOC)); - Path dictionaryPath = hasOption(DICTIONARY) ? new Path(getOption(DICTIONARY)) : null; - int numTerms = hasOption(NUM_TERMS) - ? Integer.parseInt(getOption(NUM_TERMS)) - : getNumTerms(getConf(), dictionaryPath); - Path docTopicOutputPath = hasOption(DOC_TOPIC_OUTPUT) ? new Path(getOption(DOC_TOPIC_OUTPUT)) : null; - Path modelTempPath = hasOption(MODEL_TEMP_DIR) - ? new Path(getOption(MODEL_TEMP_DIR)) - : getTempPath("topicModelState"); - long seed = hasOption(RANDOM_SEED) - ? Long.parseLong(getOption(RANDOM_SEED)) - : System.nanoTime() % 10000; - float testFraction = hasOption(TEST_SET_FRACTION) - ? Float.parseFloat(getOption(TEST_SET_FRACTION)) - : 0.0f; - int numReduceTasks = Integer.parseInt(getOption(NUM_REDUCE_TASKS)); - boolean backfillPerplexity = hasOption(BACKFILL_PERPLEXITY); - - return run(getConf(), inputPath, topicModelOutputPath, numTopics, numTerms, alpha, eta, - maxIterations, iterationBlockSize, convergenceDelta, dictionaryPath, docTopicOutputPath, - modelTempPath, seed, testFraction, numTrainThreads, numUpdateThreads, maxItersPerDoc, - numReduceTasks, backfillPerplexity); - } - - private static int getNumTerms(Configuration conf, Path dictionaryPath) throws IOException { - FileSystem fs = dictionaryPath.getFileSystem(conf); - Text key = new Text(); - IntWritable value = new IntWritable(); - int maxTermId = -1; - for (FileStatus stat : fs.globStatus(dictionaryPath)) { - SequenceFile.Reader reader = new SequenceFile.Reader(fs, stat.getPath(), conf); - while (reader.next(key, value)) { - maxTermId = Math.max(maxTermId, value.get()); - } - } - return maxTermId + 1; - } - - public static int run(Configuration conf, - Path inputPath, - Path topicModelOutputPath, - int numTopics, - int numTerms, - double alpha, - double eta, - int maxIterations, - int iterationBlockSize, - double convergenceDelta, - Path dictionaryPath, - Path docTopicOutputPath, - Path topicModelStateTempPath, - long randomSeed, - float testFraction, - int numTrainThreads, - int numUpdateThreads, - int maxItersPerDoc, - int numReduceTasks, - boolean backfillPerplexity) - throws ClassNotFoundException, IOException, InterruptedException { - // verify arguments - Preconditions.checkArgument(testFraction >= 0.0 && testFraction <= 1.0, - "Expected 'testFraction' value in range [0, 1] but found value '%s'", testFraction); - Preconditions.checkArgument(!backfillPerplexity || testFraction > 0.0, - "Expected 'testFraction' value in range (0, 1] but found value '%s'", testFraction); - - String infoString = "Will run Collapsed Variational Bayes (0th-derivative approximation) " - + "learning for LDA on {} (numTerms: {}), finding {}-topics, with document/topic prior {}, " - + "topic/term prior {}. Maximum iterations to run will be {}, unless the change in " - + "perplexity is less than {}. Topic model output (p(term|topic) for each topic) will be " - + "stored {}. Random initialization seed is {}, holding out {} of the data for perplexity " - + "check\n"; - log.info(infoString, new Object[] {inputPath, numTerms, numTopics, alpha, eta, maxIterations, - convergenceDelta, topicModelOutputPath, randomSeed, testFraction}); - infoString = dictionaryPath == null - ? 
"" : "Dictionary to be used located " + dictionaryPath.toString() + '\n'; - infoString += docTopicOutputPath == null - ? "" : "p(topic|docId) will be stored " + docTopicOutputPath.toString() + '\n'; - log.info(infoString); - - FileSystem fs = FileSystem.get(topicModelStateTempPath.toUri(), conf); - int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations); - log.info("Current iteration number: {}", iterationNumber); - - conf.set(NUM_TOPICS, String.valueOf(numTopics)); - conf.set(NUM_TERMS, String.valueOf(numTerms)); - conf.set(DOC_TOPIC_SMOOTHING, String.valueOf(alpha)); - conf.set(TERM_TOPIC_SMOOTHING, String.valueOf(eta)); - conf.set(RANDOM_SEED, String.valueOf(randomSeed)); - conf.set(NUM_TRAIN_THREADS, String.valueOf(numTrainThreads)); - conf.set(NUM_UPDATE_THREADS, String.valueOf(numUpdateThreads)); - conf.set(MAX_ITERATIONS_PER_DOC, String.valueOf(maxItersPerDoc)); - conf.set(MODEL_WEIGHT, "1"); // TODO - conf.set(TEST_SET_FRACTION, String.valueOf(testFraction)); - - List perplexities = Lists.newArrayList(); - for (int i = 1; i <= iterationNumber; i++) { - // form path to model - Path modelPath = modelPath(topicModelStateTempPath, i); - - // read perplexity - double perplexity = readPerplexity(conf, topicModelStateTempPath, i); - if (Double.isNaN(perplexity)) { - if (!(backfillPerplexity && i % iterationBlockSize == 0)) { - continue; - } - log.info("Backfilling perplexity at iteration {}", i); - if (!fs.exists(modelPath)) { - log.error("Model path '{}' does not exist; Skipping iteration {} perplexity calculation", modelPath.toString(), i); - continue; - } - perplexity = calculatePerplexity(conf, inputPath, modelPath, i); - } - - // register and log perplexity - perplexities.add(perplexity); - log.info("Perplexity at iteration {} = {}", i, perplexity); - } - - long startTime = System.currentTimeMillis(); - while (iterationNumber < maxIterations) { - // test convergence - if (convergenceDelta > 0.0) { - double delta = rateOfChange(perplexities); - if (delta < convergenceDelta) { - log.info("Convergence achieved at iteration {} with perplexity {} and delta {}", - new Object[]{iterationNumber, perplexities.get(perplexities.size() - 1), delta}); - break; - } - } - - // update model - iterationNumber++; - log.info("About to run iteration {} of {}", iterationNumber, maxIterations); - Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1); - Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber); - runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber, - maxIterations, numReduceTasks); - - // calculate perplexity - if (testFraction > 0 && iterationNumber % iterationBlockSize == 0) { - perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber)); - log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1)); - log.info("(p_{} - p_{}) / p_0 = {}; target = {}", new Object[]{ - iterationNumber , iterationNumber - iterationBlockSize, rateOfChange(perplexities), convergenceDelta - }); - } - } - log.info("Completed {} iterations in {} seconds", iterationNumber, - (System.currentTimeMillis() - startTime)/1000); - log.info("Perplexities: ({})", Joiner.on(", ").join(perplexities)); - - // write final topic-term and doc-topic distributions - Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber); - Job topicModelOutputJob = topicModelOutputPath != null - ? 
writeTopicModel(conf, finalIterationData, topicModelOutputPath) - : null; - Job docInferenceJob = docTopicOutputPath != null - ? writeDocTopicInference(conf, inputPath, finalIterationData, docTopicOutputPath) - : null; - if (topicModelOutputJob != null && !topicModelOutputJob.waitForCompletion(true)) { - return -1; - } - if (docInferenceJob != null && !docInferenceJob.waitForCompletion(true)) { - return -1; - } - return 0; - } - - private static double rateOfChange(List perplexities) { - int sz = perplexities.size(); - if (sz < 2) { - return Double.MAX_VALUE; - } - return Math.abs(perplexities.get(sz - 1) - perplexities.get(sz - 2)) / perplexities.get(0); - } - - private static double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration) - throws IOException, - ClassNotFoundException, InterruptedException { - String jobName = "Calculating perplexity for " + modelPath; - log.info("About to run: " + jobName); - Job job = new Job(conf, jobName); - job.setJarByClass(CachingCVB0PerplexityMapper.class); - job.setMapperClass(CachingCVB0PerplexityMapper.class); - job.setCombinerClass(DualDoubleSumReducer.class); - job.setReducerClass(DualDoubleSumReducer.class); - job.setNumReduceTasks(1); - job.setOutputKeyClass(DoubleWritable.class); - job.setOutputValueClass(DoubleWritable.class); - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - FileInputFormat.addInputPath(job, corpusPath); - Path outputPath = perplexityPath(modelPath.getParent(), iteration); - FileOutputFormat.setOutputPath(job, outputPath); - setModelPaths(job, modelPath); - HadoopUtil.delete(conf, outputPath); - if (!job.waitForCompletion(true)) { - throw new InterruptedException("Failed to calculate perplexity for: " + modelPath); - } - return readPerplexity(conf, modelPath.getParent(), iteration); - } - - /** - * Sums keys and values independently. - */ - public static class DualDoubleSumReducer extends - Reducer { - private final DoubleWritable outKey = new DoubleWritable(); - private final DoubleWritable outValue = new DoubleWritable(); - - @Override - public void run(Context context) throws IOException, - InterruptedException { - double keySum = 0.0; - double valueSum = 0.0; - while (context.nextKey()) { - keySum += context.getCurrentKey().get(); - for (DoubleWritable value : context.getValues()) { - valueSum += value.get(); - } - } - outKey.set(keySum); - outValue.set(valueSum); - context.write(outKey, outValue); - } - } - - /** - * @param topicModelStateTemp - * @param iteration - * @return {@code double[2]} where first value is perplexity and second is model weight of those - * documents sampled during perplexity computation, or {@code null} if no perplexity data - * exists for the given iteration. 
- * @throws IOException - */ - public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration) - throws IOException { - Path perplexityPath = perplexityPath(topicModelStateTemp, iteration); - FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf); - if (!fs.exists(perplexityPath)) { - log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath); - return Double.NaN; - } - double perplexity = 0; - double modelWeight = 0; - long n = 0; - for (Pair pair : new SequenceFileDirIterable( - perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) { - modelWeight += pair.getFirst().get(); - perplexity += pair.getSecond().get(); - n++; - } - log.info("Read {} entries with total perplexity {} and model weight {}", new Object[] { n, - perplexity, modelWeight }); - return perplexity / modelWeight; - } - - private static Job writeTopicModel(Configuration conf, Path modelInput, Path output) throws IOException, - InterruptedException, ClassNotFoundException { - String jobName = String.format("Writing final topic/term distributions from %s to %s", modelInput, - output); - log.info("About to run: " + jobName); - Job job = new Job(conf, jobName); - job.setJarByClass(CVB0Driver.class); - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setMapperClass(CVB0TopicTermVectorNormalizerMapper.class); - job.setNumReduceTasks(0); - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(VectorWritable.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - FileInputFormat.addInputPath(job, modelInput); - FileOutputFormat.setOutputPath(job, output); - job.submit(); - return job; - } - - private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output) - throws IOException, ClassNotFoundException, InterruptedException { - String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, - output); - log.info("About to run: " + jobName); - Job job = new Job(conf, jobName); - job.setMapperClass(CVB0DocInferenceMapper.class); - job.setNumReduceTasks(0); - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(VectorWritable.class); - FileSystem fs = FileSystem.get(corpus.toUri(), conf); - if (modelInput != null && fs.exists(modelInput)) { - FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter()); - URI[] modelUris = new URI[statuses.length]; - for (int i = 0; i < statuses.length; i++) { - modelUris[i] = statuses[i].getPath().toUri(); - } - DistributedCache.setCacheFiles(modelUris, conf); - } - FileInputFormat.addInputPath(job, corpus); - FileOutputFormat.setOutputPath(job, output); - job.setJarByClass(CVB0Driver.class); - job.submit(); - return job; - } - - public static Path modelPath(Path topicModelStateTempPath, int iterationNumber) { - return new Path(topicModelStateTempPath, "model-" + iterationNumber); - } - - public static Path stage1OutputPath(Path topicModelStateTempPath, int iterationNumber) { - return new Path(topicModelStateTempPath, "tmp-" + iterationNumber); - } - - public static Path perplexityPath(Path topicModelStateTempPath, int iterationNumber) { - return new Path(topicModelStateTempPath, "perplexity-" + iterationNumber); - } - - private static int getCurrentIterationNumber(Configuration config, Path modelTempDir, int maxIterations) - throws IOException { - FileSystem fs = 
FileSystem.get(modelTempDir.toUri(), config); - int iterationNumber = 1; - Path iterationPath = modelPath(modelTempDir, iterationNumber); - while (fs.exists(iterationPath) && iterationNumber <= maxIterations) { - log.info("Found previous state: " + iterationPath); - iterationNumber++; - iterationPath = modelPath(modelTempDir, iterationNumber); - } - return iterationNumber - 1; - } - - public static void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput, - int iterationNumber, int maxIterations, int numReduceTasks) throws IOException, ClassNotFoundException, InterruptedException { - String jobName = String.format("Iteration %d of %d, input path: %s", - iterationNumber, maxIterations, modelInput); - log.info("About to run: " + jobName); - Job job = new Job(conf, jobName); - job.setJarByClass(CVB0Driver.class); - job.setMapperClass(CachingCVB0Mapper.class); - job.setCombinerClass(VectorSumReducer.class); - job.setReducerClass(VectorSumReducer.class); - job.setNumReduceTasks(numReduceTasks); - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(VectorWritable.class); - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - FileInputFormat.addInputPath(job, corpusInput); - FileOutputFormat.setOutputPath(job, modelOutput); - setModelPaths(job, modelInput); - HadoopUtil.delete(conf, modelOutput); - if (!job.waitForCompletion(true)) { - throw new InterruptedException(String.format("Failed to complete iteration %d stage 1", - iterationNumber)); - } - } - - private static void setModelPaths(Job job, Path modelPath) throws IOException { - Configuration conf = job.getConfiguration(); - if (modelPath == null || !FileSystem.get(modelPath.toUri(), conf).exists(modelPath)) { - return; - } - FileStatus[] statuses = FileSystem.get(modelPath.toUri(), conf).listStatus(modelPath, PathFilters.partFilter()); - Preconditions.checkState(statuses.length > 0, "No part files found in model path '%s'", modelPath.toString()); - String[] modelPaths = new String[statuses.length]; - for (int i = 0; i < statuses.length; i++) { - modelPaths[i] = statuses[i].getPath().toUri().toString(); - } - conf.setStrings(MODEL_PATHS, modelPaths); - } - - public static Path[] getModelPaths(Configuration conf) { - String[] modelPathNames = conf.getStrings(MODEL_PATHS); - if (modelPathNames == null || modelPathNames.length == 0) { - return null; - } - Path[] modelPaths = new Path[modelPathNames.length]; - for (int i = 0; i < modelPathNames.length; i++) { - modelPaths[i] = new Path(modelPathNames[i]); - } - return modelPaths; - } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new CVB0Driver(), args); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0TopicTermVectorNormalizerMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0TopicTermVectorNormalizerMapper.java deleted file mode 100644 index 1253942b1..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0TopicTermVectorNormalizerMapper.java +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Licensed to the Apache Software Foundation 
(ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.clustering.lda.cvb; - -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.function.Functions; - -import java.io.IOException; - -/** - * Performs L1 normalization of input vectors. - */ -public class CVB0TopicTermVectorNormalizerMapper extends - Mapper { - - @Override - protected void map(IntWritable key, VectorWritable value, Context context) throws IOException, - InterruptedException { - value.get().assign(Functions.div(value.get().norm(1.0))); - context.write(key, value); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0Mapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0Mapper.java deleted file mode 100644 index 0849eca32..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0Mapper.java +++ /dev/null @@ -1,130 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
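The normalizer mapper above is a one-liner over Mahout vectors (assign(Functions.div(norm(1)))). The equivalent plain-Java L1 normalization, shown as a sketch (illustration only), turns a topic's raw term counts into a distribution p(term|topic):

    /** Plain-Java sketch of the L1 normalization performed by the mapper above. */
    static double[] l1Normalize(double[] v) {
      double norm = 0.0;
      for (double x : v) {
        norm += Math.abs(x);      // L1 norm: sum of absolute values
      }
      double[] out = new double[v.length];
      for (int i = 0; i < v.length; i++) {
        out[i] = v[i] / norm;     // entries now sum to 1
      }
      return out;
    }

For example, l1Normalize(new double[] {2, 1, 1}) yields {0.5, 0.25, 0.25}.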
- */ -package org.apache.mahout.clustering.lda.cvb; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.MatrixSlice; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; - -/** - * Run ensemble learning via loading the {@link ModelTrainer} with two {@link TopicModel} instances: - * one from the previous iteration, the other empty. Inference is done on the first, and the - * learning updates are stored in the second, and only emitted at cleanup(). - * - * In terms of obvious performance improvements still available, the memory footprint in this - * Mapper could be dropped by half if we accumulated model updates onto the model we're using - * for inference, which might also speed up convergence, as we'd be able to take advantage of - * learning during iteration, not just after each one is done. Most likely we don't - * really need to accumulate double values in the model either, floats would most likely be - * sufficient. Between these two, we could squeeze another factor of 4 in memory efficiency. - * - * In terms of CPU, we're re-learning the p(topic|doc) distribution on every iteration, starting - * from scratch. This is usually only 10 fixed-point iterations per doc, but that's 10x more than - * only 1. To avoid having to do this, we would need to do a map-side join of the unchanging - * corpus with the continually-improving p(topic|doc) matrix, and then emit multiple outputs - * from the mappers to make sure we can do the reduce model averaging as well. Tricky, but - * possibly worth it. - * - * {@link ModelTrainer} already takes advantage (in maybe the not-nice way) of multi-core - * availability by doing multithreaded learning, see that class for details. 
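The read-model/write-model split described in the javadoc above reduces to a simple data flow. The sketch below uses hypothetical helpers (loadPreviousModel, emptyModel, inferDocTopics, accumulate, emit) purely to illustrate it; it is not code from this diff:

    // Inference reads only the frozen previous-iteration model; learned counts
    // accumulate in a separate write model, emitted once in cleanup().
    void trainOneIteration(Iterable<Vector> corpus) {
      Matrix readModel = loadPreviousModel();   // p(term|topic), read-only
      Matrix writeModel = emptyModel();         // this iteration's counts
      for (Vector doc : corpus) {
        Vector docTopics = inferDocTopics(readModel, doc); // parallelizable pass
        accumulate(writeModel, doc, docTopics);            // synchronized updates
      }
      emit(writeModel);                         // becomes the next read model
    }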
- */ -public class CachingCVB0Mapper - extends Mapper { - - private static final Logger log = LoggerFactory.getLogger(CachingCVB0Mapper.class); - - private ModelTrainer modelTrainer; - private int maxIters; - private int numTopics; - - protected ModelTrainer getModelTrainer() { - return modelTrainer; - } - - protected int getMaxIters() { - return maxIters; - } - - protected int getNumTopics() { - return numTopics; - } - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - log.info("Retrieving configuration"); - Configuration conf = context.getConfiguration(); - float eta = conf.getFloat(CVB0Driver.TERM_TOPIC_SMOOTHING, Float.NaN); - float alpha = conf.getFloat(CVB0Driver.DOC_TOPIC_SMOOTHING, Float.NaN); - long seed = conf.getLong(CVB0Driver.RANDOM_SEED, 1234L); - numTopics = conf.getInt(CVB0Driver.NUM_TOPICS, -1); - int numTerms = conf.getInt(CVB0Driver.NUM_TERMS, -1); - int numUpdateThreads = conf.getInt(CVB0Driver.NUM_UPDATE_THREADS, 1); - int numTrainThreads = conf.getInt(CVB0Driver.NUM_TRAIN_THREADS, 4); - maxIters = conf.getInt(CVB0Driver.MAX_ITERATIONS_PER_DOC, 10); - float modelWeight = conf.getFloat(CVB0Driver.MODEL_WEIGHT, 1.0f); - - log.info("Initializing read model"); - TopicModel readModel; - Path[] modelPaths = CVB0Driver.getModelPaths(conf); - if (modelPaths != null && modelPaths.length > 0) { - readModel = new TopicModel(conf, eta, alpha, null, numUpdateThreads, modelWeight, modelPaths); - } else { - log.info("No model files found"); - readModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(seed), null, - numTrainThreads, modelWeight); - } - - log.info("Initializing write model"); - TopicModel writeModel = modelWeight == 1 - ? new TopicModel(numTopics, numTerms, eta, alpha, null, numUpdateThreads) - : readModel; - - log.info("Initializing model trainer"); - modelTrainer = new ModelTrainer(readModel, writeModel, numTrainThreads, numTopics, numTerms); - modelTrainer.start(); - } - - @Override - public void map(IntWritable docId, VectorWritable document, Context context) - throws IOException, InterruptedException{ - /* where to get docTopics? */ - Vector topicVector = new DenseVector(new double[numTopics]).assign(1.0/numTopics); - modelTrainer.train(document.get(), topicVector, true, maxIters); - } - - @Override - protected void cleanup(Context context) throws IOException, InterruptedException { - log.info("Stopping model trainer"); - modelTrainer.stop(); - - log.info("Writing model"); - TopicModel model = modelTrainer.getReadModel(); - for (MatrixSlice topic : model) { - context.write(new IntWritable(topic.index()), new VectorWritable(topic.vector())); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0PerplexityMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0PerplexityMapper.java deleted file mode 100644 index d0688292c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0PerplexityMapper.java +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.clustering.lda.cvb; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.DoubleWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.common.MemoryUtil; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Random; - -public class CachingCVB0PerplexityMapper extends - Mapper { - /** - * Hadoop counters for {@link CachingCVB0PerplexityMapper}, to aid in debugging. - */ - public enum Counters { - SAMPLED_DOCUMENTS - } - - private static final Logger log = LoggerFactory.getLogger(CachingCVB0PerplexityMapper.class); - - private ModelTrainer modelTrainer; - private int maxIters; - private int numTopics; - private float testFraction; - private Random random; - private Vector topicVector; - private final DoubleWritable outKey = new DoubleWritable(); - private final DoubleWritable outValue = new DoubleWritable(); - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - MemoryUtil.startMemoryLogger(5000); - - log.info("Retrieving configuration"); - Configuration conf = context.getConfiguration(); - float eta = conf.getFloat(CVB0Driver.TERM_TOPIC_SMOOTHING, Float.NaN); - float alpha = conf.getFloat(CVB0Driver.DOC_TOPIC_SMOOTHING, Float.NaN); - long seed = conf.getLong(CVB0Driver.RANDOM_SEED, 1234L); - random = RandomUtils.getRandom(seed); - numTopics = conf.getInt(CVB0Driver.NUM_TOPICS, -1); - int numTerms = conf.getInt(CVB0Driver.NUM_TERMS, -1); - int numUpdateThreads = conf.getInt(CVB0Driver.NUM_UPDATE_THREADS, 1); - int numTrainThreads = conf.getInt(CVB0Driver.NUM_TRAIN_THREADS, 4); - maxIters = conf.getInt(CVB0Driver.MAX_ITERATIONS_PER_DOC, 10); - float modelWeight = conf.getFloat(CVB0Driver.MODEL_WEIGHT, 1.0f); - testFraction = conf.getFloat(CVB0Driver.TEST_SET_FRACTION, 0.1f); - - log.info("Initializing read model"); - TopicModel readModel; - Path[] modelPaths = CVB0Driver.getModelPaths(conf); - if (modelPaths != null && modelPaths.length > 0) { - readModel = new TopicModel(conf, eta, alpha, null, numUpdateThreads, modelWeight, modelPaths); - } else { - log.info("No model files found"); - readModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(seed), null, - numTrainThreads, modelWeight); - } - - log.info("Initializing model trainer"); - modelTrainer = new ModelTrainer(readModel, null, numTrainThreads, numTopics, numTerms); - - log.info("Initializing topic vector"); - topicVector = new DenseVector(new double[numTopics]); - } - - @Override - protected void 
cleanup(Context context) throws IOException, InterruptedException { - MemoryUtil.stopMemoryLogger(); - } - - @Override - public void map(IntWritable docId, VectorWritable document, Context context) - throws IOException, InterruptedException{ - if (1 > testFraction && random.nextFloat() >= testFraction) { - return; - } - context.getCounter(Counters.SAMPLED_DOCUMENTS).increment(1); - outKey.set(document.get().norm(1)); - outValue.set(modelTrainer.calculatePerplexity(document.get(), topicVector.assign(1.0 / numTopics), maxIters)); - context.write(outKey, outValue); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/InMemoryCollapsedVariationalBayes0.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/InMemoryCollapsedVariationalBayes0.java deleted file mode 100644 index dfc547647..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/InMemoryCollapsedVariationalBayes0.java +++ /dev/null @@ -1,504 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
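Putting the perplexity pieces above together: CachingCVB0PerplexityMapper emits one (document weight, perplexity contribution) pair per sampled document, DualDoubleSumReducer sums keys and values independently, and readPerplexity() reports the ratio. A compact restatement of that aggregation (illustration only; sampledDocs and score() are hypothetical):

    double totalWeight = 0.0;      // sum of per-document L1 norms (the keys)
    double totalPerplexity = 0.0;  // sum of per-document perplexity bounds (the values)
    for (Vector doc : sampledDocs) {
      totalWeight += doc.norm(1);
      totalPerplexity += score(doc);
    }
    double reported = totalPerplexity / totalWeight;  // what readPerplexity() returns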
- */ -package org.apache.mahout.clustering.lda.cvb; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.CommandLineUtil; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; -import org.apache.mahout.math.DenseMatrix; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.DistributedRowMatrixWriter; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.SparseRowMatrix; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.List; -import java.util.Map; - -/** - * Runs the same algorithm as {@link CVB0Driver}, but sequentially, in memory. Memory requirements - * are currently: the entire corpus is read into RAM, two copies of the model (each of size - * numTerms * numTopics), and another matrix of size numDocs * numTopics is held in memory - * (to store p(topic|doc) for all docs). - * - * But if all this fits in memory, this can be significantly faster than an iterative MR job. 
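A back-of-the-envelope instance of the memory model described above, assuming dense 8-byte doubles and hypothetical corpus dimensions:

    long numTerms  = 100000;
    long numTopics = 200;
    long numDocs   = 1000000;
    long modelBytes    = 2L * numTerms * numTopics * 8; // two model copies: ~320 MB
    long docTopicBytes = numDocs * numTopics * 8;       // p(topic|doc): ~1.6 GB
    // plus the corpus itself, which is also held entirely in RAM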
- */ -public class InMemoryCollapsedVariationalBayes0 extends AbstractJob { - - private static final Logger log = LoggerFactory.getLogger(InMemoryCollapsedVariationalBayes0.class); - - private int numTopics; - private int numTerms; - private int numDocuments; - private double alpha; - private double eta; - private int minDfCt; - private double maxDfPct; - private boolean verbose = false; - - private Map termIdMap; - private String[] terms; // of length numTerms; - private Matrix corpusWeights; // length numDocs; - private double totalCorpusWeight; - private double initialModelCorpusFraction; - private Matrix docTopicCounts; - private long seed; - private TopicModel topicModel; - private TopicModel updatedModel; - private int numTrainingThreads; - private int numUpdatingThreads; - private ModelTrainer modelTrainer; - - private InMemoryCollapsedVariationalBayes0() { - // only for main usage - } - - public void setVerbose(boolean verbose) { - this.verbose = verbose; - } - - public InMemoryCollapsedVariationalBayes0(Matrix corpus, String[] terms, int numTopics, - double alpha, double eta) { - this(corpus, terms, numTopics, alpha, eta, 1, 1, 0, 1234); - } - - public InMemoryCollapsedVariationalBayes0(Matrix corpus, String[] terms, int numTopics, - double alpha, double eta, int numTrainingThreads, int numUpdatingThreads, - double modelCorpusFraction, long seed) { - this.seed = seed; - this.numTopics = numTopics; - this.alpha = alpha; - this.eta = eta; - this.minDfCt = 0; - this.maxDfPct = 1.0f; - corpusWeights = corpus; - numDocuments = corpus.numRows(); - this.terms = terms; - this.initialModelCorpusFraction = modelCorpusFraction; - numTerms = terms != null ? terms.length : corpus.numCols(); - termIdMap = Maps.newHashMap(); - if (terms != null) { - for (int t=0; t minFractionalErrorChange) { - trainDocuments(); - if (verbose) { - log.info("model after: " + iter + ": " + modelTrainer.getReadModel().toString()); - } - newPerplexity = modelTrainer.calculatePerplexity(corpusWeights, docTopicCounts, - testFraction); - log.info(newPerplexity + " = perplexity"); - iter++; - fractionalChange = Math.abs(newPerplexity - oldPerplexity) / oldPerplexity; - log.info(fractionalChange + " = fractionalChange"); - oldPerplexity = newPerplexity; - } - if (iter < maxIterations) { - log.info(String.format("Converged! 
fractional error change: %f, error %f", - fractionalChange, newPerplexity)); - } else { - log.info(String.format("Reached max iteration count (%d), fractional error change: %f, error: %f", - maxIterations, fractionalChange, newPerplexity)); - } - return newPerplexity; - } - - public void writeModel(Path outputPath) throws IOException { - modelTrainer.persist(outputPath); - } - - private static void logTime(String label, long nanos) { - log.info("{} time: {}ms", label, (double) nanos / 1.0e6); - } - - public static int main2(String[] args, Configuration conf) throws Exception { - DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); - ArgumentBuilder abuilder = new ArgumentBuilder(); - GroupBuilder gbuilder = new GroupBuilder(); - - Option helpOpt = DefaultOptionCreator.helpOption(); - - Option inputDirOpt = obuilder.withLongName("input").withRequired(true).withArgument( - abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription( - "The Directory on HDFS containing the collapsed, properly formatted files having " - + "one doc per line").withShortName("i").create(); - - Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument( - abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription( - "The path to the term-dictionary format is ... ").withShortName("d").create(); - - Option dfsOpt = obuilder.withLongName("dfs").withRequired(false).withArgument( - abuilder.withName("dfs").withMinimum(1).withMaximum(1).create()).withDescription( - "HDFS namenode URI").withShortName("dfs").create(); - - Option numTopicsOpt = obuilder.withLongName("numTopics").withRequired(true).withArgument(abuilder - .withName("numTopics").withMinimum(1).withMaximum(1) - .create()).withDescription("Number of topics to learn").withShortName("top").create(); - - Option outputTopicFileOpt = obuilder.withLongName("topicOutputFile").withRequired(true).withArgument( - abuilder.withName("topicOutputFile").withMinimum(1).withMaximum(1).create()) - .withDescription("File to write out p(term | topic)").withShortName("to").create(); - - Option outputDocFileOpt = obuilder.withLongName("docOutputFile").withRequired(true).withArgument( - abuilder.withName("docOutputFile").withMinimum(1).withMaximum(1).create()) - .withDescription("File to write out p(topic | docid)").withShortName("do").create(); - - Option alphaOpt = obuilder.withLongName("alpha").withRequired(false).withArgument(abuilder - .withName("alpha").withMinimum(1).withMaximum(1).withDefault("0.1").create()) - .withDescription("Smoothing parameter for p(topic | document) prior").withShortName("a").create(); - - Option etaOpt = obuilder.withLongName("eta").withRequired(false).withArgument(abuilder - .withName("eta").withMinimum(1).withMaximum(1).withDefault("0.1").create()) - .withDescription("Smoothing parameter for p(term | topic)").withShortName("e").create(); - - Option maxIterOpt = obuilder.withLongName("maxIterations").withRequired(false).withArgument(abuilder - .withName("maxIterations").withMinimum(1).withMaximum(1).withDefault(10).create()) - .withDescription("Maximum number of training passes").withShortName("m").create(); - - Option modelCorpusFractionOption = obuilder.withLongName("modelCorpusFraction") - .withRequired(false).withArgument(abuilder.withName("modelCorpusFraction").withMinimum(1) - .withMaximum(1).withDefault(0.0).create()).withShortName("mcf") - .withDescription("For online updates, initial value of |model|/|corpus|").create(); - - Option burnInOpt = 
obuilder.withLongName("burnInIterations").withRequired(false).withArgument(abuilder - .withName("burnInIterations").withMinimum(1).withMaximum(1).withDefault(5).create()) - .withDescription("Minimum number of iterations").withShortName("b").create(); - - Option convergenceOpt = obuilder.withLongName("convergence").withRequired(false).withArgument(abuilder - .withName("convergence").withMinimum(1).withMaximum(1).withDefault("0.0").create()) - .withDescription("Fractional rate of perplexity to consider convergence").withShortName("c").create(); - - Option reInferDocTopicsOpt = obuilder.withLongName("reInferDocTopics").withRequired(false) - .withArgument(abuilder.withName("reInferDocTopics").withMinimum(1).withMaximum(1) - .withDefault("no").create()) - .withDescription("re-infer p(topic | doc) : [no | randstart | continue]") - .withShortName("rdt").create(); - - Option numTrainThreadsOpt = obuilder.withLongName("numTrainThreads").withRequired(false) - .withArgument(abuilder.withName("numTrainThreads").withMinimum(1).withMaximum(1) - .withDefault("1").create()) - .withDescription("number of threads to train with") - .withShortName("ntt").create(); - - Option numUpdateThreadsOpt = obuilder.withLongName("numUpdateThreads").withRequired(false) - .withArgument(abuilder.withName("numUpdateThreads").withMinimum(1).withMaximum(1) - .withDefault("1").create()) - .withDescription("number of threads to update the model with") - .withShortName("nut").create(); - - Option verboseOpt = obuilder.withLongName("verbose").withRequired(false) - .withArgument(abuilder.withName("verbose").withMinimum(1).withMaximum(1) - .withDefault("false").create()) - .withDescription("print verbose information, like top-terms in each topic, during iteration") - .withShortName("v").create(); - - Group group = gbuilder.withName("Options").withOption(inputDirOpt).withOption(numTopicsOpt) - .withOption(alphaOpt).withOption(etaOpt) - .withOption(maxIterOpt).withOption(burnInOpt).withOption(convergenceOpt) - .withOption(dictOpt).withOption(reInferDocTopicsOpt) - .withOption(outputDocFileOpt).withOption(outputTopicFileOpt).withOption(dfsOpt) - .withOption(numTrainThreadsOpt).withOption(numUpdateThreadsOpt) - .withOption(modelCorpusFractionOption).withOption(verboseOpt).create(); - - try { - Parser parser = new Parser(); - - parser.setGroup(group); - parser.setHelpOption(helpOpt); - CommandLine cmdLine = parser.parse(args); - if (cmdLine.hasOption(helpOpt)) { - CommandLineUtil.printHelp(group); - return -1; - } - - String inputDirString = (String) cmdLine.getValue(inputDirOpt); - String dictDirString = cmdLine.hasOption(dictOpt) ? 
(String)cmdLine.getValue(dictOpt) : null; - int numTopics = Integer.parseInt((String) cmdLine.getValue(numTopicsOpt)); - double alpha = Double.parseDouble((String)cmdLine.getValue(alphaOpt)); - double eta = Double.parseDouble((String)cmdLine.getValue(etaOpt)); - int maxIterations = Integer.parseInt((String)cmdLine.getValue(maxIterOpt)); - int burnInIterations = (Integer)cmdLine.getValue(burnInOpt); - double minFractionalErrorChange = Double.parseDouble((String) cmdLine.getValue(convergenceOpt)); - int numTrainThreads = Integer.parseInt((String)cmdLine.getValue(numTrainThreadsOpt)); - int numUpdateThreads = Integer.parseInt((String)cmdLine.getValue(numUpdateThreadsOpt)); - String topicOutFile = (String)cmdLine.getValue(outputTopicFileOpt); - String docOutFile = (String)cmdLine.getValue(outputDocFileOpt); - String reInferDocTopics = (String)cmdLine.getValue(reInferDocTopicsOpt); - boolean verbose = Boolean.parseBoolean((String) cmdLine.getValue(verboseOpt)); - double modelCorpusFraction = (Double) cmdLine.getValue(modelCorpusFractionOption); - - long start = System.nanoTime(); - - if (conf.get("fs.default.name") == null) { - String dfsNameNode = (String)cmdLine.getValue(dfsOpt); - conf.set("fs.default.name", dfsNameNode); - } - String[] terms = loadDictionary(dictDirString, conf); - logTime("dictionary loading", System.nanoTime() - start); - start = System.nanoTime(); - Matrix corpus = loadVectors(inputDirString, conf); - logTime("vector seqfile corpus loading", System.nanoTime() - start); - start = System.nanoTime(); - InMemoryCollapsedVariationalBayes0 cvb0 = - new InMemoryCollapsedVariationalBayes0(corpus, terms, numTopics, alpha, eta, - numTrainThreads, numUpdateThreads, modelCorpusFraction, 1234); - logTime("cvb0 init", System.nanoTime() - start); - - start = System.nanoTime(); - cvb0.setVerbose(verbose); - cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations); - logTime("total training time", System.nanoTime() - start); - - if ("randstart".equalsIgnoreCase(reInferDocTopics)) { - cvb0.inferDocuments(0.0, 100, true); - } else if ("continue".equalsIgnoreCase(reInferDocTopics)) { - cvb0.inferDocuments(0.0, 100, false); - } - - start = System.nanoTime(); - cvb0.writeModel(new Path(topicOutFile)); - DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts); - logTime("printTopics", System.nanoTime() - start); - } catch (OptionException e) { - log.error("Error while parsing options", e); - CommandLineUtil.printHelp(group); - } - return 0; - } - - /* - private static Map> loadCorpus(String path) throws IOException { - List lines = Resources.readLines(Resources.getResource(path), Charsets.UTF_8); - Map> corpus = Maps.newHashMap(); - for (int i=0; i doc = Maps.newHashMap(); - for (String s : line.split(" ")) { - s = s.replaceAll("\\W", "").toLowerCase().trim(); - if (s.length() == 0) { - continue; - } - if (!doc.containsKey(s)) { - doc.put(s, 0); - } - doc.put(s, doc.get(s) + 1); - } - corpus.put(i, doc); - } - return corpus; - } - */ - - private static String[] loadDictionary(String dictionaryPath, Configuration conf) { - if (dictionaryPath == null) { - return null; - } - Path dictionaryFile = new Path(dictionaryPath); - List> termList = Lists.newArrayList(); - int maxTermId = 0; - // key is word value is id - for (Pair record - : new SequenceFileIterable(dictionaryFile, true, conf)) { - termList.add(new Pair(record.getSecond().get(), - record.getFirst().toString())); - maxTermId = Math.max(maxTermId, record.getSecond().get()); - } - 
-    String[] terms = new String[maxTermId + 1];
-    for (Pair<Integer, String> pair : termList) {
-      terms[pair.getFirst()] = pair.getSecond();
-    }
-    return terms;
-  }
-
-  @Override
-  public Configuration getConf() {
-    if (super.getConf() == null) {
-      setConf(new Configuration());
-    }
-    return super.getConf();
-  }
-
-  private static Matrix loadVectors(String vectorPathString, Configuration conf)
-      throws IOException {
-    Path vectorPath = new Path(vectorPathString);
-    FileSystem fs = vectorPath.getFileSystem(conf);
-    List<Path> subPaths = Lists.newArrayList();
-    if (fs.isFile(vectorPath)) {
-      subPaths.add(vectorPath);
-    } else {
-      for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
-        subPaths.add(fileStatus.getPath());
-      }
-    }
-    List<Vector> vectorList = Lists.newArrayList();
-    for (Path subPath : subPaths) {
-      for (Pair<Writable, VectorWritable> record
-          : new SequenceFileIterable<Writable, VectorWritable>(subPath, true, conf)) {
-        vectorList.add(record.getSecond().get());
-      }
-    }
-    int numRows = vectorList.size();
-    int numCols = vectorList.get(0).size();
-    return new SparseRowMatrix(numRows, numCols,
-        vectorList.toArray(new Vector[vectorList.size()]), true,
-        vectorList.get(0).isSequentialAccess());
-  }
-
-  @Override
-  public int run(String[] strings) throws Exception {
-    return main2(strings, getConf());
-  }
-
-  public static void main(String[] args) throws Exception {
-    ToolRunner.run(new InMemoryCollapsedVariationalBayes0(), args);
-  }
-}
diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/ModelTrainer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/ModelTrainer.java
deleted file mode 100644
index 445c04ea3..000000000
--- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/ModelTrainer.java
+++ /dev/null
@@ -1,298 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ -package org.apache.mahout.clustering.lda.cvb; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import org.apache.hadoop.fs.Path; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.MatrixSlice; -import org.apache.mahout.math.SparseRowMatrix; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorIterable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.Callable; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; - -/** - * Multithreaded LDA model trainer class, which primarily operates by running a "map/reduce" - * operation, all in memory locally (ie not a hadoop job!) : the "map" operation is to take - * the "read-only" {@link TopicModel} and use it to iteratively learn the p(topic|term, doc) - * distribution for documents (this can be done in parallel across many documents, as the - * "read-only" model is, well, read-only. Then the outputs of this are "reduced" onto the - * "write" model, and these updates are not parallelizable in the same way: individual - * documents can't be added to the same entries in different threads at the same time, but - * updates across many topics to the same term from the same document can be done in parallel, - * so they are. - * - * Because computation is done asynchronously, when iteration is done, it's important to call - * the stop() method, which blocks until work is complete. - * - * Setting the read model and the write model to be the same object may not quite work yet, - * on account of parallelism badness. - */ -public class ModelTrainer { - - private static final Logger log = LoggerFactory.getLogger(ModelTrainer.class); - - private final int numTopics; - private final int numTerms; - private TopicModel readModel; - private TopicModel writeModel; - private ThreadPoolExecutor threadPool; - private BlockingQueue workQueue; - private final int numTrainThreads; - private final boolean isReadWrite; - - public ModelTrainer(TopicModel initialReadModel, TopicModel initialWriteModel, - int numTrainThreads, int numTopics, int numTerms) { - this.readModel = initialReadModel; - this.writeModel = initialWriteModel; - this.numTrainThreads = numTrainThreads; - this.numTopics = numTopics; - this.numTerms = numTerms; - isReadWrite = initialReadModel == initialWriteModel; - } - - /** - * WARNING: this constructor may not lead to good behavior. What should be verified is that - * the model updating process does not conflict with model reading. It might work, but then - * again, it might not! 
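 * Editor's note (not in the original javadoc): a minimal sketch of the intended call
 * pattern, assuming a corpus and docTopicCounts already materialized as VectorIterable,
 * and numTopics/numTerms matching the model:
 * <pre>
 *   ModelTrainer trainer = new ModelTrainer(model, 4, numTopics, numTerms);
 *   trainer.train(corpus, docTopicCounts);  // starts and stops its own thread pool
 *   trainer.persist(outputPath);            // writes the current read model
 * </pre>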
- * @param model to be used for both reading (inference) and accumulating (learning) - * @param numTrainThreads - * @param numTopics - * @param numTerms - */ - public ModelTrainer(TopicModel model, int numTrainThreads, int numTopics, int numTerms) { - this(model, model, numTrainThreads, numTopics, numTerms); - } - - public TopicModel getReadModel() { - return readModel; - } - - public void start() { - log.info("Starting training threadpool with " + numTrainThreads + " threads"); - workQueue = new ArrayBlockingQueue(numTrainThreads * 10); - threadPool = new ThreadPoolExecutor(numTrainThreads, numTrainThreads, 0, TimeUnit.SECONDS, - workQueue); - threadPool.allowCoreThreadTimeOut(false); - threadPool.prestartAllCoreThreads(); - } - - public void train(VectorIterable matrix, VectorIterable docTopicCounts) { - train(matrix, docTopicCounts, 1); - } - - public double calculatePerplexity(VectorIterable matrix, VectorIterable docTopicCounts) { - return calculatePerplexity(matrix, docTopicCounts, 0); - } - - public double calculatePerplexity(VectorIterable matrix, VectorIterable docTopicCounts, - double testFraction) { - Iterator docIterator = matrix.iterator(); - Iterator docTopicIterator = docTopicCounts.iterator(); - double perplexity = 0; - double matrixNorm = 0; - while (docIterator.hasNext() && docTopicIterator.hasNext()) { - MatrixSlice docSlice = docIterator.next(); - MatrixSlice topicSlice = docTopicIterator.next(); - int docId = docSlice.index(); - Vector document = docSlice.vector(); - Vector topicDist = topicSlice.vector(); - if (testFraction == 0 || docId % (1/testFraction) == 0) { - trainSync(document, topicDist, false, 10); - perplexity += readModel.perplexity(document, topicDist); - matrixNorm += document.norm(1); - } - } - return perplexity / matrixNorm; - } - - public void train(VectorIterable matrix, VectorIterable docTopicCounts, int numDocTopicIters) { - start(); - Iterator docIterator = matrix.iterator(); - Iterator docTopicIterator = docTopicCounts.iterator(); - long startTime = System.nanoTime(); - int i = 0; - double[] times = new double[100]; - Map batch = Maps.newHashMap(); - int numTokensInBatch = 0; - long batchStart = System.nanoTime(); - while (docIterator.hasNext() && docTopicIterator.hasNext()) { - i++; - Vector document = docIterator.next().vector(); - Vector topicDist = docTopicIterator.next().vector(); - if (isReadWrite) { - if (batch.size() < numTrainThreads) { - batch.put(document, topicDist); - if (log.isDebugEnabled()) { - numTokensInBatch += document.getNumNondefaultElements(); - } - } else { - batchTrain(batch, true, numDocTopicIters); - long time = System.nanoTime(); - log.debug("trained {} docs with {} tokens, start time {}, end time {}", - new Object[] {numTrainThreads, numTokensInBatch, batchStart, time}); - batchStart = time; - numTokensInBatch = 0; - } - } else { - long start = System.nanoTime(); - train(document, topicDist, true, numDocTopicIters); - if (log.isDebugEnabled()) { - times[i % times.length] = - (System.nanoTime() - start) /(1.0e6 * document.getNumNondefaultElements()); - if (i % 100 == 0) { - long time = System.nanoTime() - startTime; - log.debug("trained " + i + " documents in " + (time / 1.0e6) + "ms"); - if (i % 500 == 0) { - Arrays.sort(times); - log.debug("training took median " + times[times.length / 2] + "ms per token-instance"); - } - } - } - } - } - stop(); - } - - public void batchTrain(Map batch, boolean update, int numDocTopicsIters) { - while (true) { - try { - List runnables = Lists.newArrayList(); - for (Map.Entry entry : 
batch.entrySet()) { - runnables.add(new TrainerRunnable(readModel, null, entry.getKey(), - entry.getValue(), new SparseRowMatrix(numTopics, numTerms, true), - numDocTopicsIters)); - } - threadPool.invokeAll(runnables); - if (update) { - for (TrainerRunnable runnable : runnables) { - writeModel.update(runnable.docTopicModel); - } - } - break; - } catch (InterruptedException e) { - log.warn("Interrupted during batch training, retrying!", e); - } - } - } - - public void train(Vector document, Vector docTopicCounts, boolean update, int numDocTopicIters) { - while (true) { - try { - workQueue.put(new TrainerRunnable(readModel, - update ? writeModel : null, document, docTopicCounts, new SparseRowMatrix( - numTopics, numTerms, true), numDocTopicIters)); - return; - } catch (InterruptedException e) { - log.warn("Interrupted waiting to submit document to work queue: " + document, e); - } - } - } - - public void trainSync(Vector document, Vector docTopicCounts, boolean update, - int numDocTopicIters) { - new TrainerRunnable(readModel, - update ? writeModel : null, document, docTopicCounts, new SparseRowMatrix( - numTopics, numTerms, true), numDocTopicIters).run(); - } - - public double calculatePerplexity(Vector document, Vector docTopicCounts, int numDocTopicIters) { - TrainerRunnable runner = new TrainerRunnable(readModel, - null, document, docTopicCounts, new SparseRowMatrix( - numTopics, numTerms, true), numDocTopicIters); - return runner.call(); - } - - public void stop() { - long startTime = System.nanoTime(); - log.info("Initiating stopping of training threadpool"); - try { - threadPool.shutdown(); - if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) { - log.warn("Threadpool timed out on await termination - jobs still running!"); - } - long newTime = System.nanoTime(); - log.info("threadpool took: " + (newTime - startTime) / 1.0e6 + "ms"); - startTime = newTime; - writeModel.awaitTermination(); - newTime = System.nanoTime(); - log.info("writeModel.awaitTermination() took " + (newTime - startTime) / 1.0e6 + "ms"); - TopicModel tmpModel = writeModel; - writeModel = readModel; - readModel = tmpModel; - writeModel.reset(); - } catch (InterruptedException e) { - log.error("Interrupted shutting down!", e); - } - } - - public void persist(Path outputPath) throws IOException { - readModel.persist(outputPath, true); - } - - private static class TrainerRunnable implements Runnable, Callable { - private final TopicModel readModel; - private final TopicModel writeModel; - private final Vector document; - private final Vector docTopics; - private final Matrix docTopicModel; - private final int numDocTopicIters; - - private TrainerRunnable(TopicModel readModel, TopicModel writeModel, Vector document, - Vector docTopics, Matrix docTopicModel, int numDocTopicIters) { - this.readModel = readModel; - this.writeModel = writeModel; - this.document = document; - this.docTopics = docTopics; - this.docTopicModel = docTopicModel; - this.numDocTopicIters = numDocTopicIters; - } - - @Override - public void run() { - for (int i = 0; i < numDocTopicIters; i++) { - // synchronous read-only call: - readModel.trainDocTopicModel(document, docTopics, docTopicModel); - } - if (writeModel != null) { - // parallel call which is read-only on the docTopicModel, and write-only on the writeModel - // this method does not return until all rows of the docTopicModel have been submitted - // to write work queues - writeModel.update(docTopicModel); - } - } - - @Override - public Double call() { - run(); - return 
readModel.perplexity(document, docTopics); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/TopicModel.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/TopicModel.java deleted file mode 100644 index e1523b1f2..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/lda/cvb/TopicModel.java +++ /dev/null @@ -1,505 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.clustering.lda.cvb; - -import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; -import org.apache.mahout.math.DenseMatrix; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.DistributedRowMatrixWriter; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.MatrixSlice; -import org.apache.mahout.math.SequentialAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.function.Functions; -import org.apache.mahout.math.stats.Sampler; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; -import java.util.Random; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; - -/** - * Thin wrapper around a {@link Matrix} of counts of occurrences of (topic, term) pairs. Dividing - * {code topicTermCount.viewRow(topic).get(term)} by the sum over the values for all terms in that - * row yields p(term | topic). Instead dividing it by all topic columns for that term yields - * p(topic | term). - * - * Multithreading is enabled for the {@code update(Matrix)} method: this method is async, and - * merely submits the matrix to a work queue. When all work has been submitted, - * {@code awaitTermination()} should be called, which will block until updates have been - * accumulated. 
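 * Editor's note (not in the original javadoc): the implied lifecycle, sketched under
 * the assumption of a single producer thread:
 * <pre>
 *   model.update(docTopicModel);  // async: each row is queued to a per-thread Updater
 *   // ... submit further updates ...
 *   model.awaitTermination();     // drains the queues and applies remaining updates
 * </pre>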
- */ -public class TopicModel implements Configurable, Iterable { - - private static final Logger log = LoggerFactory.getLogger(TopicModel.class); - - private final String[] dictionary; - private final Matrix topicTermCounts; - private final Vector topicSums; - private final int numTopics; - private final int numTerms; - private final double eta; - private final double alpha; - - private Configuration conf; - - private final Sampler sampler; - private final int numThreads; - private Updater[] updaters; - - public int getNumTerms() { - return numTerms; - } - - public int getNumTopics() { - return numTopics; - } - - public TopicModel(int numTopics, int numTerms, double eta, double alpha, String[] dictionary, - double modelWeight) { - this(numTopics, numTerms, eta, alpha, null, dictionary, 1, modelWeight); - } - - public TopicModel(Configuration conf, double eta, double alpha, - String[] dictionary, int numThreads, double modelWeight, Path... modelpath) throws IOException { - this(loadModel(conf, modelpath), eta, alpha, dictionary, numThreads, modelWeight); - } - - public TopicModel(int numTopics, int numTerms, double eta, double alpha, String[] dictionary, - int numThreads, double modelWeight) { - this(new DenseMatrix(numTopics, numTerms), new DenseVector(numTopics), eta, alpha, dictionary, - numThreads, modelWeight); - } - - public TopicModel(int numTopics, int numTerms, double eta, double alpha, Random random, - String[] dictionary, int numThreads, double modelWeight) { - this(randomMatrix(numTopics, numTerms, random), eta, alpha, dictionary, numThreads, modelWeight); - } - - private TopicModel(Pair model, double eta, double alpha, String[] dict, - int numThreads, double modelWeight) { - this(model.getFirst(), model.getSecond(), eta, alpha, dict, numThreads, modelWeight); - } - - public TopicModel(Matrix topicTermCounts, Vector topicSums, double eta, double alpha, - String[] dictionary, double modelWeight) { - this(topicTermCounts, topicSums, eta, alpha, dictionary, 1, modelWeight); - } - - public TopicModel(Matrix topicTermCounts, double eta, double alpha, String[] dictionary, - int numThreads, double modelWeight) { - this(topicTermCounts, viewRowSums(topicTermCounts), - eta, alpha, dictionary, numThreads, modelWeight); - } - - public TopicModel(Matrix topicTermCounts, Vector topicSums, double eta, double alpha, - String[] dictionary, int numThreads, double modelWeight) { - this.dictionary = dictionary; - this.topicTermCounts = topicTermCounts; - this.topicSums = topicSums; - this.numTopics = topicSums.size(); - this.numTerms = topicTermCounts.numCols(); - this.eta = eta; - this.alpha = alpha; - this.sampler = new Sampler(RandomUtils.getRandom()); - this.numThreads = numThreads; - if (modelWeight != 1) { - topicSums.assign(Functions.mult(modelWeight)); - for (int x = 0; x < numTopics; x++) { - topicTermCounts.viewRow(x).assign(Functions.mult(modelWeight)); - } - } - initializeThreadPool(); - } - - private static Vector viewRowSums(Matrix m) { - Vector v = new DenseVector(m.numRows()); - for (MatrixSlice slice : m) { - v.set(slice.index(), slice.vector().norm(1)); - } - return v; - } - - private void initializeThreadPool() { - ThreadPoolExecutor threadPool = new ThreadPoolExecutor(numThreads, numThreads, 0, TimeUnit.SECONDS, - new ArrayBlockingQueue(numThreads * 10)); - threadPool.allowCoreThreadTimeOut(false); - updaters = new Updater[numThreads]; - for (int i = 0; i < numThreads; i++) { - updaters[i] = new Updater(); - threadPool.submit(updaters[i]); - } - } - - Matrix topicTermCounts() 
{ - return topicTermCounts; - } - - @Override - public Iterator iterator() { - return topicTermCounts.iterateAll(); - } - - public Vector topicSums() { - return topicSums; - } - - private static Pair randomMatrix(int numTopics, int numTerms, Random random) { - Matrix topicTermCounts = new DenseMatrix(numTopics, numTerms); - Vector topicSums = new DenseVector(numTopics); - if (random != null) { - for (int x = 0; x < numTopics; x++) { - for (int term = 0; term < numTerms; term++) { - topicTermCounts.viewRow(x).set(term, random.nextDouble()); - } - } - } - for (int x = 0; x < numTopics; x++) { - topicSums.set(x, random == null ? 1.0 : topicTermCounts.viewRow(x).norm(1)); - } - return Pair.of(topicTermCounts, topicSums); - } - - public static Pair loadModel(Configuration conf, Path... modelPaths) - throws IOException { - int numTopics = -1; - int numTerms = -1; - List> rows = Lists.newArrayList(); - for (Path modelPath : modelPaths) { - for (Pair row : - new SequenceFileIterable(modelPath, true, conf)) { - rows.add(Pair.of(row.getFirst().get(), row.getSecond().get())); - numTopics = Math.max(numTopics, row.getFirst().get()); - if (numTerms < 0) { - numTerms = row.getSecond().get().size(); - } - } - } - if (rows.isEmpty()) { - throw new IOException(Arrays.toString(modelPaths) + " have no vectors in it"); - } - numTopics++; - Matrix model = new DenseMatrix(numTopics, numTerms); - Vector topicSums = new DenseVector(numTopics); - for (Pair pair : rows) { - model.viewRow(pair.getFirst()).assign(pair.getSecond()); - topicSums.set(pair.getFirst(), pair.getSecond().norm(1)); - } - return Pair.of(model, topicSums); - } - - // NOTE: this is purely for debug purposes. It is not performant to "toString()" a real model - @Override - public String toString() { - StringBuilder buf = new StringBuilder(); - for (int x = 0; x < numTopics; x++) { - String v = dictionary != null - ? vectorToSortedString(topicTermCounts.viewRow(x).normalize(1), dictionary) - : topicTermCounts.viewRow(x).asFormatString(); - buf.append(v).append('\n'); - } - return buf.toString(); - } - - public int sampleTerm(Vector topicDistribution) { - return sampler.sample(topicTermCounts.viewRow(sampler.sample(topicDistribution))); - } - - public int sampleTerm(int topic) { - return sampler.sample(topicTermCounts.viewRow(topic)); - } - - public void reset() { - for (int x = 0; x < numTopics; x++) { - topicTermCounts.assignRow(x, new SequentialAccessSparseVector(numTerms)); - } - topicSums.assign(1.0); - initializeThreadPool(); - } - - public void awaitTermination() { - for (Updater updater : updaters) { - updater.shutdown(); - } - } - - public void renormalize() { - for (int x = 0; x < numTopics; x++) { - topicTermCounts.assignRow(x, topicTermCounts.viewRow(x).normalize(1)); - topicSums.assign(1.0); - } - } - - public void trainDocTopicModel(Vector original, Vector topics, Matrix docTopicModel) { - // first calculate p(topic|term,document) for all terms in original, and all topics, - // using p(term|topic) and p(topic|doc) - pTopicGivenTerm(original, topics, docTopicModel); - normalizeByTopic(docTopicModel); - // now multiply, term-by-term, by the document, to get the weighted distribution of - // term-topic pairs from this document. 
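      // Editor's note (not in the original source): at this point docTopicModel[x][a]
      // holds p(topic x | term a, doc), i.e. the per-term normalization over topics of
      //   (topicTermCounts[x][a] + eta) * (topics[x] + alpha) / (topicSums[x] + eta * numTerms)
      // (see pTopicGivenTerm below). The multiply that follows scales each row by the
      // term's count in the document, giving expected topic assignments per term.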
- Iterator it = original.iterateNonZero(); - while (it.hasNext()) { - Vector.Element e = it.next(); - for (int x = 0; x < numTopics; x++) { - Vector docTopicModelRow = docTopicModel.viewRow(x); - docTopicModelRow.setQuick(e.index(), docTopicModelRow.getQuick(e.index()) * e.get()); - } - } - // now recalculate p(topic|doc) by summing contributions from all of pTopicGivenTerm - topics.assign(0.0); - for (int x = 0; x < numTopics; x++) { - topics.set(x, docTopicModel.viewRow(x).norm(1)); - } - // now renormalize so that sum_x(p(x|doc)) = 1 - topics.assign(Functions.mult(1/topics.norm(1))); - } - - public Vector infer(Vector original, Vector docTopics) { - Vector pTerm = original.like(); - Iterator it = original.iterateNonZero(); - while (it.hasNext()) { - Vector.Element e = it.next(); - int term = e.index(); - // p(a) = sum_x (p(a|x) * p(x|i)) - double pA = 0; - for (int x = 0; x < numTopics; x++) { - pA += (topicTermCounts.viewRow(x).get(term) / topicSums.get(x)) * docTopics.get(x); - } - pTerm.set(term, pA); - } - return pTerm; - } - - public void update(Matrix docTopicCounts) { - for (int x = 0; x < numTopics; x++) { - updaters[x % updaters.length].update(x, docTopicCounts.viewRow(x)); - } - } - - public void updateTopic(int topic, Vector docTopicCounts) { - topicTermCounts.viewRow(topic).assign(docTopicCounts, Functions.PLUS); - topicSums.set(topic, topicSums.get(topic) + docTopicCounts.norm(1)); - } - - public void update(int termId, Vector topicCounts) { - for (int x = 0; x < numTopics; x++) { - Vector v = topicTermCounts.viewRow(x); - v.set(termId, v.get(termId) + topicCounts.get(x)); - } - topicSums.assign(topicCounts, Functions.PLUS); - } - - public void persist(Path outputDir, boolean overwrite) throws IOException { - FileSystem fs = outputDir.getFileSystem(conf); - if (overwrite) { - fs.delete(outputDir, true); // CHECK second arg - } - DistributedRowMatrixWriter.write(outputDir, conf, topicTermCounts); - } - - /** - * Computes {@code p(topic x|term a, document i)} distributions given input document {@code i}. - * {@code pTGT[x][a]} is the (un-normalized) {@code p(x|a,i)}, or if docTopics is {@code null}, - * {@code p(a|x)} (also un-normalized). - * - * @param document doc-term vector encoding {@code w(term a|document i)}. - * @param docTopics {@code docTopics[x]} is the overall weight of topic {@code x} in given - * document. If {@code null}, a topic weight of {@code 1.0} is used for all topics. - * @param termTopicDist storage for output {@code p(x|a,i)} distributions. - */ - private void pTopicGivenTerm(Vector document, Vector docTopics, Matrix termTopicDist) { - // for each topic x - for (int x = 0; x < numTopics; x++) { - // get p(topic x | document i), or 1.0 if docTopics is null - double topicWeight = docTopics == null ? 
1.0 : docTopics.get(x); - // get w(term a | topic x) - Vector topicTermRow = topicTermCounts.viewRow(x); - // get \sum_a w(term a | topic x) - double topicSum = topicSums.get(x); - // get p(topic x | term a) distribution to update - Vector termTopicRow = termTopicDist.viewRow(x); - - // for each term a in document i with non-zero weight - Iterator it = document.iterateNonZero(); - while (it.hasNext()) { - Vector.Element e = it.next(); - int termIndex = e.index(); - - // calc un-normalized p(topic x | term a, document i) - double termTopicLikelihood = (topicTermRow.get(termIndex) + eta) * (topicWeight + alpha) / (topicSum + eta * numTerms); - termTopicRow.set(termIndex, termTopicLikelihood); - } - } - } - - /** - * sum_x sum_a (c_ai * log(p(x|i) * p(a|x))) - */ - public double perplexity(Vector document, Vector docTopics) { - double perplexity = 0; - double norm = docTopics.norm(1) + (docTopics.size() * alpha); - Iterator it = document.iterateNonZero(); - while (it.hasNext()) { - Vector.Element e = it.next(); - int term = e.index(); - double prob = 0; - for (int x = 0; x < numTopics; x++) { - double d = (docTopics.get(x) + alpha) / norm; - double p = d * (topicTermCounts.viewRow(x).get(term) + eta) - / (topicSums.get(x) + eta * numTerms); - prob += p; - } - perplexity += e.get() * Math.log(prob); - } - return -perplexity; - } - - private void normalizeByTopic(Matrix perTopicSparseDistributions) { - Iterator it = perTopicSparseDistributions.viewRow(0).iterateNonZero(); - // then make sure that each of these is properly normalized by topic: sum_x(p(x|t,d)) = 1 - while (it.hasNext()) { - Vector.Element e = it.next(); - int a = e.index(); - double sum = 0; - for (int x = 0; x < numTopics; x++) { - sum += perTopicSparseDistributions.viewRow(x).get(a); - } - for (int x = 0; x < numTopics; x++) { - perTopicSparseDistributions.viewRow(x).set(a, - perTopicSparseDistributions.viewRow(x).get(a) / sum); - } - } - } - - public static String vectorToSortedString(Vector vector, String[] dictionary) { - List> vectorValues = - new ArrayList>(vector.getNumNondefaultElements()); - Iterator it = vector.iterateNonZero(); - while (it.hasNext()) { - Vector.Element e = it.next(); - vectorValues.add(Pair.of(dictionary != null ? 
dictionary[e.index()] : String.valueOf(e.index()), - e.get())); - } - Collections.sort(vectorValues, new Comparator>() { - @Override public int compare(Pair x, Pair y) { - return y.getSecond().compareTo(x.getSecond()); - } - }); - Iterator> listIt = vectorValues.iterator(); - StringBuilder bldr = new StringBuilder(2048); - bldr.append('{'); - int i = 0; - while (listIt.hasNext() && i < 25) { - i++; - Pair p = listIt.next(); - bldr.append(p.getFirst()); - bldr.append(':'); - bldr.append(p.getSecond()); - bldr.append(','); - } - if (bldr.length() > 1) { - bldr.setCharAt(bldr.length() - 1, '}'); - } - return bldr.toString(); - } - - @Override - public void setConf(Configuration configuration) { - this.conf = configuration; - } - - @Override - public Configuration getConf() { - return conf; - } - - private final class Updater implements Runnable { - private final ArrayBlockingQueue> queue = - new ArrayBlockingQueue>(100); - private boolean shutdown = false; - private boolean shutdownComplete = false; - - public void shutdown() { - try { - synchronized (this) { - while (!shutdownComplete) { - shutdown = true; - wait(10000L); // Arbitrarily, wait 10 seconds rather than forever for this - } - } - } catch (InterruptedException e) { - log.warn("Interrupted waiting to shutdown() : ", e); - } - } - - public boolean update(int topic, Vector v) { - if (shutdown) { // maybe don't do this? - throw new IllegalStateException("In SHUTDOWN state: cannot submit tasks"); - } - while (true) { // keep trying if interrupted - try { - // start async operation by submitting to the queue - queue.put(Pair.of(topic, v)); - // return once you got access to the queue - return true; - } catch (InterruptedException e) { - log.warn("Interrupted trying to queue update:", e); - } - } - } - - @Override public void run() { - while (!shutdown) { - try { - Pair pair = queue.poll(1, TimeUnit.SECONDS); - if (pair != null) { - updateTopic(pair.getFirst(), pair.getSecond()); - } - } catch (InterruptedException e) { - log.warn("Interrupted waiting to poll for update", e); - } - } - // in shutdown mode, finish remaining tasks! - for (Pair pair : queue) { - updateTopic(pair.getFirst(), pair.getSecond()); - } - synchronized (this) { - shutdownComplete = true; - notifyAll(); - } - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java deleted file mode 100644 index 28fc43b9b..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java +++ /dev/null @@ -1,182 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.meanshift; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.mahout.clustering.kmeans.Kluster; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.list.IntArrayList; - -/** - * This class models a canopy as a center point, the number of points that are - * contained within it according to the application of some distance metric, and - * a point total which is the sum of all the points and is used to compute the - * centroid when needed. - */ -public class MeanShiftCanopy extends Kluster { - - // TODO: this is still problematic from a scalability perspective, but how - // else to encode membership? - private IntArrayList boundPoints = new IntArrayList(); - - private int mass = 0; - - public int getMass() { - return mass; - } - - void setMass(int num) { - mass = num; - } - - /** - * Used for Writable - */ - public MeanShiftCanopy() { - } - - /** - * Create a new Canopy containing the given point - * - * @param point - * a Vector - * @param id - * an int canopy id - * @param measure - * a DistanceMeasure - */ - public MeanShiftCanopy(Vector point, int id, DistanceMeasure measure) { - super(point, id, measure); - boundPoints.add(id); - mass = 1; - } - - /** - * Create an initial Canopy, retaining the original type of the given point - * (e.g. NamedVector) - * - * @param point - * a Vector - * @param id - * an int - * @param measure - * a DistanceMeasure - * @return a MeanShiftCanopy - */ - public static MeanShiftCanopy initialCanopy(Vector point, int id, - DistanceMeasure measure) { - MeanShiftCanopy result = new MeanShiftCanopy(point, id, measure); - // overwrite center so original point type is retained - result.setCenter(point); - return result; - } - - public IntArrayList getBoundPoints() { - return boundPoints; - } - - /** - * The receiver overlaps the given canopy. Add my bound points to it. - * - * @param canopy - * an existing MeanShiftCanopy - * @param accumulateBoundPoints - * true to accumulate bound points from the canopy - */ - void merge(MeanShiftCanopy canopy, boolean accumulateBoundPoints) { - if (accumulateBoundPoints) { - boundPoints.addAllOf(canopy.boundPoints); - } - mass += canopy.mass; - } - - /** - * The receiver touches the given canopy. Add respective centers with the - * given weights. 
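 * Editor's note (not in the original javadoc): the exchange is symmetric. Each
 * canopy observes the other's center with a weight proportional to the mass of
 * the canopy whose center is observed, so heavier canopies pull their
 * neighbors' centroids harder.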
- * - * @param canopy - * an existing MeanShiftCanopy - * @param weight - * double weight of the touching - */ - void touch(MeanShiftCanopy canopy, double weight) { - canopy.observe(getCenter(), weight * mass); - observe(canopy.getCenter(), weight * canopy.mass); - } - - @Override - public void readFields(DataInput in) throws IOException { - super.readFields(in); - this.mass = in.readInt(); - int numpoints = in.readInt(); - this.boundPoints = new IntArrayList(); - for (int i = 0; i < numpoints; i++) { - this.boundPoints.add(in.readInt()); - } - this.mass = boundPoints.size(); - } - - @Override - public void write(DataOutput out) throws IOException { - super.write(out); - out.writeInt(mass); - out.writeInt(boundPoints.size()); - for (int v : boundPoints.elements()) { - out.writeInt(v); - } - } - - public MeanShiftCanopy shallowCopy() { - MeanShiftCanopy result = new MeanShiftCanopy(); - result.setMeasure(this.getMeasure()); - result.setId(this.getId()); - result.setCenter(this.getCenter()); - result.setRadius(this.getRadius()); - result.setNumObservations(this.getNumObservations()); - result.setBoundPoints(boundPoints); - result.setMass(mass); - return result; - } - - @Override - public String asFormatString() { - return toString(); - } - - public void setBoundPoints(IntArrayList boundPoints) { - this.boundPoints = boundPoints; - } - - @Override - public String getIdentifier() { - return (isConverged() ? "MSV-" : "MSC-") + getId(); - } - - @Override - public double pdf(VectorWritable vw) { - // MSCanopy membership is explicit via membership in boundPoints. Can't - // compute pdf for Arbitrary point - throw new UnsupportedOperationException(); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java deleted file mode 100644 index 6a99aa3b6..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.meanshift; - -import java.io.IOException; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.clustering.classify.WeightedVectorWritable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; - -import com.google.common.collect.Lists; - -public class MeanShiftCanopyClusterMapper - extends Mapper, ClusterWritable, IntWritable, WeightedVectorWritable> { - - private List canopies; - - @Override - protected void map(WritableComparable key, ClusterWritable clusterWritable, Context context) - throws IOException, InterruptedException { - // canopies use canopyIds assigned when input vectors are processed as vectorIds too - MeanShiftCanopy canopy = (MeanShiftCanopy)clusterWritable.getValue(); - int vectorId = canopy.getId(); - for (MeanShiftCanopy msc : canopies) { - for (int containedId : msc.getBoundPoints().toList()) { - if (vectorId == containedId) { - context.write(new IntWritable(msc.getId()), - new WeightedVectorWritable(1, canopy.getCenter())); - } - } - } - } - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - canopies = getCanopies(context.getConfiguration()); - } - - public static List getCanopies(Configuration conf) { - String statePath = conf.get(MeanShiftCanopyDriver.STATE_IN_KEY); - List canopies = Lists.newArrayList(); - Path path = new Path(statePath); - for (ClusterWritable clusterWritable - : new SequenceFileDirValueIterable(path, PathType.LIST, PathFilters.logsCRCFilter(), conf)) { - MeanShiftCanopy canopy = (MeanShiftCanopy)clusterWritable.getValue(); - canopies.add(canopy); - } - return canopies; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java deleted file mode 100644 index 9ad7155af..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java +++ /dev/null @@ -1,216 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.meanshift; - -import java.util.Collection; -import java.util.List; - -import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.kernel.IKernelProfile; -import org.apache.mahout.math.Vector; - -public class MeanShiftCanopyClusterer { - - private final double convergenceDelta; - - // the T1 distance threshold - private final double t1; - - // the T2 distance threshold - private final double t2; - - // the distance measure - private final DistanceMeasure measure; - - private final IKernelProfile kernelProfile; - - // if true accumulate clusters during merge so clusters can be produced later - private final boolean runClustering; - - public MeanShiftCanopyClusterer(Configuration configuration) { - measure = ClassUtils.instantiateAs(configuration.get(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY), - DistanceMeasure.class); - measure.configure(configuration); - runClustering = configuration.getBoolean(MeanShiftCanopyConfigKeys.CLUSTER_POINTS_KEY, true); - kernelProfile = ClassUtils.instantiateAs(configuration.get(MeanShiftCanopyConfigKeys.KERNEL_PROFILE_KEY), - IKernelProfile.class); - // nextCanopyId = 0; // never read? - t1 = Double - .parseDouble(configuration.get(MeanShiftCanopyConfigKeys.T1_KEY)); - t2 = Double - .parseDouble(configuration.get(MeanShiftCanopyConfigKeys.T2_KEY)); - convergenceDelta = Double.parseDouble(configuration - .get(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY)); - } - - public MeanShiftCanopyClusterer(DistanceMeasure aMeasure, - IKernelProfile aKernelProfileDerivative, double aT1, double aT2, - double aDelta, boolean runClustering) { - // nextCanopyId = 100; // so canopyIds will sort properly // never read? - measure = aMeasure; - t1 = aT1; - t2 = aT2; - convergenceDelta = aDelta; - kernelProfile = aKernelProfileDerivative; - this.runClustering = runClustering; - } - - public double getT1() { - return t1; - } - - public double getT2() { - return t2; - } - - /** - * Merge the given canopy into the canopies list. If it touches any existing - * canopy (norm to be appended - */ - public void mergeCanopy(MeanShiftCanopy aCanopy, - Collection canopies) { - MeanShiftCanopy closestCoveringCanopy = null; - double closestNorm = Double.MAX_VALUE; - for (MeanShiftCanopy canopy : canopies) { - double norm = measure.distance(canopy.getCenter(), aCanopy.getCenter()); - double weight = kernelProfile.calculateDerivativeValue(norm, t1); - if (weight > 0.0) { - aCanopy.touch(canopy, weight); - } - if (norm < t2 && (closestCoveringCanopy == null || norm < closestNorm)) { - closestNorm = norm; - closestCoveringCanopy = canopy; - } - } - if (closestCoveringCanopy == null) { - canopies.add(aCanopy); - } else { - closestCoveringCanopy.merge(aCanopy, runClustering); - } - } - - /** - * Shift the center to the new centroid of the cluster - * - * @param canopy - * the canopy to shift. - * @return if the cluster is converged - */ - public boolean shiftToMean(MeanShiftCanopy canopy) { - canopy.observe(canopy.getCenter(), canopy.getMass()); - canopy.computeConvergence(measure, convergenceDelta); - canopy.computeParameters(); - return canopy.isConverged(); - } - - /** - * Return if the point is covered by this canopy - * - * @param canopy - * a canopy. 
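 * (Editor's note, not in the original javadoc: "covered" here means within the
 * wider t1 radius; closelyBound below applies the tighter t2 radius, the same
 * threshold mergeCanopy uses when deciding whether to merge rather than touch.)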
- * @param point - * a Vector point - * @return if the point is covered - */ - boolean covers(MeanShiftCanopy canopy, Vector point) { - return measure.distance(canopy.getCenter(), point) < t1; - } - - /** - * Return if the point is closely covered by the canopy - * - * @param canopy - * a canopy. - * @param point - * a Vector point - * @return if the point is covered - */ - public boolean closelyBound(MeanShiftCanopy canopy, Vector point) { - return measure.distance(canopy.getCenter(), point) < t2; - } - - /** - * This is the reference mean-shift implementation. Given its inputs it - * iterates over the points and clusters until their centers converge or until - * the maximum number of iterations is exceeded. - * - * @param points - * the input List of points - * @param measure - * the DistanceMeasure to use - * @param numIter - * the maximum number of iterations - */ - public static List clusterPoints(Iterable points, - DistanceMeasure measure, IKernelProfile aKernelProfileDerivative, - double convergenceThreshold, double t1, double t2, int numIter) { - MeanShiftCanopyClusterer clusterer = new MeanShiftCanopyClusterer(measure, - aKernelProfileDerivative, t1, t2, convergenceThreshold, true); - int nextCanopyId = 0; - - List canopies = Lists.newArrayList(); - for (Vector point : points) { - clusterer.mergeCanopy( - new MeanShiftCanopy(point, nextCanopyId++, measure), canopies); - } - List newCanopies = canopies; - boolean[] converged = { false }; - for (int iter = 0; !converged[0] && iter < numIter; iter++) { - newCanopies = clusterer.iterate(newCanopies, converged); - } - return newCanopies; - } - - protected List iterate(Iterable canopies, - boolean[] converged) { - converged[0] = true; - List migratedCanopies = Lists.newArrayList(); - for (MeanShiftCanopy canopy : canopies) { - converged[0] = shiftToMean(canopy) && converged[0]; - mergeCanopy(canopy, migratedCanopies); - } - return migratedCanopies; - } - - protected static MeanShiftCanopy findCoveringCanopy(MeanShiftCanopy canopy, - Iterable clusters) { - // canopies use canopyIds assigned when input vectors are processed as - // vectorIds too - int vectorId = canopy.getId(); - for (MeanShiftCanopy msc : clusters) { - for (int containedId : msc.getBoundPoints().toList()) { - if (vectorId == containedId) { - return msc; - } - } - } - return null; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyConfigKeys.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyConfigKeys.java deleted file mode 100644 index 8c72266df..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyConfigKeys.java +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.meanshift; - -public interface MeanShiftCanopyConfigKeys { - - // keys used by Driver, Mapper, Combiner & Reducer - String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure"; - String KERNEL_PROFILE_KEY = "org.apache.mahout.clustering.canopy.kernelprofile"; - String T1_KEY = "org.apache.mahout.clustering.canopy.t1"; - String T2_KEY = "org.apache.mahout.clustering.canopy.t2"; - String CONTROL_PATH_KEY = "org.apache.mahout.clustering.control.path"; - String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.canopy.convergence"; - String CLUSTER_POINTS_KEY = "org.apache.mahout.clustering.meanshift.clusterPointsKey"; - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyCreatorMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyCreatorMapper.java deleted file mode 100644 index 5d003ee93..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyCreatorMapper.java +++ /dev/null @@ -1,68 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.meanshift; - -import java.io.IOException; -import java.util.regex.Pattern; - -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.clustering.kmeans.KMeansConfigKeys; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.VectorWritable; - -import com.google.common.base.Preconditions; - -public class MeanShiftCanopyCreatorMapper extends Mapper, VectorWritable, Text, ClusterWritable> { - - private static final Pattern UNDERSCORE_PATTERN = Pattern.compile("_"); - - private static int nextCanopyId = -1; - - private DistanceMeasure measure; - - @Override - protected void map(WritableComparable key, VectorWritable point, Context context) - throws IOException, InterruptedException { - MeanShiftCanopy canopy = MeanShiftCanopy.initialCanopy(point.get(), nextCanopyId++, measure); - ClusterWritable clusterWritable = new ClusterWritable(); - clusterWritable.setValue(canopy); - context.write(new Text(key.toString()), clusterWritable); - } - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - String measureClass = context.getConfiguration().get(KMeansConfigKeys.DISTANCE_MEASURE_KEY); - measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); - - if (nextCanopyId == -1) { - String taskId = context.getConfiguration().get("mapred.task.id"); - String[] parts = UNDERSCORE_PATTERN.split(taskId); - Preconditions.checkArgument(parts.length == 6 - && "attempt".equals(parts[0]) - && ("m".equals(parts[3]) || "r".equals(parts[3])), - "TaskAttemptId string: %d is not properly formed", taskId); - nextCanopyId = ((1 << 31) / 50000) * Integer.parseInt(parts[4]); - //each mapper has 42,949 ids to give. - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java deleted file mode 100644 index eaa747bd7..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java +++ /dev/null @@ -1,532 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.meanshift; - -import java.io.IOException; -import java.util.Collection; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.clustering.AbstractCluster; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.WeightedVectorWritable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.clustering.kmeans.KMeansConfigKeys; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable; -import org.apache.mahout.common.kernel.IKernelProfile; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.collect.Lists; -import com.google.common.io.Closeables; - -/** - * This class implements the driver for Mean Shift Canopy clustering - * - */ -public class MeanShiftCanopyDriver extends AbstractJob { - - public static final String MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"; - - private static final Logger log = LoggerFactory - .getLogger(MeanShiftCanopyDriver.class); - - public static final String INPUT_IS_CANOPIES_OPTION = "inputIsCanopies"; - - public static final String STATE_IN_KEY = "org.apache.mahout.clustering.meanshift.stateInKey"; - - private static final String CONTROL_CONVERGED = "control/converged"; - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new MeanShiftCanopyDriver(), args); - } - - @Override - public int run(String[] args) throws Exception { - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.convergenceOption().create()); - addOption(DefaultOptionCreator.maxIterationsOption().create()); - addOption(DefaultOptionCreator.overwriteOption().create()); - addOption(DefaultOptionCreator.inputIsCanopiesOption().create()); - addOption(DefaultOptionCreator.distanceMeasureOption().create()); - addOption(DefaultOptionCreator.kernelProfileOption().create()); - addOption(DefaultOptionCreator.t1Option().create()); - addOption(DefaultOptionCreator.t2Option().create()); - addOption(DefaultOptionCreator.clusteringOption().create()); - addOption(DefaultOptionCreator.methodOption().create()); - - if (parseArguments(args) == null) { - return -1; - } - - Path input = 
getInputPath(); - Path output = getOutputPath(); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(getConf(), output); - } - String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); - String kernelProfileClass = getOption(DefaultOptionCreator.KERNEL_PROFILE_OPTION); - double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION)); - double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION)); - boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION); - double convergenceDelta = Double - .parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); - int maxIterations = Integer - .parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); - boolean inputIsCanopies = hasOption(INPUT_IS_CANOPIES_OPTION); - boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION) - .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD); - DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); - IKernelProfile kernelProfile = ClassUtils.instantiateAs(kernelProfileClass, IKernelProfile.class); - run(getConf(), input, output, measure, kernelProfile, t1, t2, - convergenceDelta, maxIterations, inputIsCanopies, runClustering, - runSequential); - - return 0; - } - - /** - * Run the job where the input format can be either Vectors or Canopies. If - * requested, cluster the input data using the computed Canopies - * - * @param conf - * the Configuration to use - * @param input - * the input pathname String - * @param output - * the output pathname String - * @param measure - * the DistanceMeasure - * @param kernelProfile - * the IKernelProfile - * @param t1 - * the T1 distance threshold - * @param t2 - * the T2 distance threshold - * @param convergenceDelta - * the double convergence criteria - * @param maxIterations - * an int number of iterations - * @param inputIsCanopies - * true if the input path already contains MeanShiftCanopies and does - * not need to be converted from Vectors - * @param runClustering - * true if the input points are to be clustered once the iterations - * complete - * @param runSequential - * if true run in sequential execution mode - */ - public static void run(Configuration conf, Path input, Path output, - DistanceMeasure measure, IKernelProfile kernelProfile, double t1, - double t2, double convergenceDelta, int maxIterations, - boolean inputIsCanopies, boolean runClustering, boolean runSequential) - throws IOException, InterruptedException, ClassNotFoundException { - Path clustersIn = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); - if (inputIsCanopies) { - clustersIn = input; - } else { - createCanopyFromVectors(conf, input, clustersIn, measure, runSequential); - } - - Path clustersOut = buildClusters(conf, clustersIn, output, measure, - kernelProfile, t1, t2, convergenceDelta, maxIterations, runSequential, - runClustering); - if (runClustering) { - clusterData(inputIsCanopies ? 
input : new Path(output, - Cluster.INITIAL_CLUSTERS_DIR), clustersOut, new Path(output, - Cluster.CLUSTERED_POINTS_DIR), runSequential); - } - } - - /** - * Convert input vectors to MeanShiftCanopies for further processing - */ - public static void createCanopyFromVectors(Configuration conf, Path input, - Path output, DistanceMeasure measure, boolean runSequential) - throws IOException, InterruptedException, ClassNotFoundException { - if (runSequential) { - createCanopyFromVectorsSeq(input, output, measure); - } else { - createCanopyFromVectorsMR(conf, input, output, measure); - } - } - - /** - * Convert vectors to MeanShiftCanopies sequentially - * - * @param input - * the Path to the input VectorWritable data - * @param output - * the Path to the initial clusters directory - * @param measure - * the DistanceMeasure - */ - private static void createCanopyFromVectorsSeq(Path input, Path output, - DistanceMeasure measure) throws IOException { - Configuration conf = new Configuration(); - FileSystem fs = FileSystem.get(input.toUri(), conf); - FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter()); - int part = 0; - int id = 0; - for (FileStatus s : status) { - SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path( - output, "part-m-" + part++), Text.class, ClusterWritable.class); - try { - for (VectorWritable value : new SequenceFileValueIterable( - s.getPath(), conf)) { - MeanShiftCanopy initialCanopy = MeanShiftCanopy.initialCanopy(value.get(), - id++, measure); - ClusterWritable clusterWritable = new ClusterWritable(); - clusterWritable.setValue(initialCanopy); - writer.append(new Text(), clusterWritable); - } - } finally { - Closeables.closeQuietly(writer); - } - } - } - - /** - * Convert vectors to MeanShiftCanopies using Hadoop - */ - private static void createCanopyFromVectorsMR(Configuration conf, Path input, - Path output, DistanceMeasure measure) throws IOException, - InterruptedException, ClassNotFoundException { - conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass() - .getName()); - Job job = new Job(conf); - job.setJarByClass(MeanShiftCanopyDriver.class); - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(ClusterWritable.class); - job.setMapperClass(MeanShiftCanopyCreatorMapper.class); - job.setNumReduceTasks(0); - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - - FileInputFormat.setInputPaths(job, input); - FileOutputFormat.setOutputPath(job, output); - - if (!job.waitForCompletion(true)) { - throw new InterruptedException( - "Mean Shift createCanopyFromVectorsMR failed on input " + input); - } - } - - /** - * Iterate over the input clusters to produce the next cluster directories for - * each iteration - * - * @param conf - * the Configuration to use - * @param clustersIn - * the input directory Path - * @param output - * the output Path - * @param measure - * the DistanceMeasure - * @param kernelProfile - * the IKernelProfile - * @param t1 - * the T1 distance threshold - * @param t2 - * the T2 distance threshold - * @param convergenceDelta - * the double convergence criteria - * @param maxIterations - * an int number of iterations - * @param runSequential - * if true run in sequential execution mode - * @param runClustering - * if true accumulate merged clusters for subsequent clustering step - */ - public static Path buildClusters(Configuration conf, Path clustersIn, - Path output, DistanceMeasure measure, IKernelProfile kernelProfile, - double 
t1, double t2, double convergenceDelta, int maxIterations, - boolean runSequential, boolean runClustering) throws IOException, - InterruptedException, ClassNotFoundException { - if (runSequential) { - return buildClustersSeq(clustersIn, output, measure, kernelProfile, t1, - t2, convergenceDelta, maxIterations, runClustering); - } else { - return buildClustersMR(conf, clustersIn, output, measure, kernelProfile, - t1, t2, convergenceDelta, maxIterations, runClustering); - } - } - - /** - * Build new clusters sequentially - * - */ - private static Path buildClustersSeq(Path clustersIn, Path output, - DistanceMeasure measure, IKernelProfile aKernelProfile, double t1, - double t2, double convergenceDelta, int maxIterations, - boolean runClustering) throws IOException { - MeanShiftCanopyClusterer clusterer = new MeanShiftCanopyClusterer(measure, - aKernelProfile, t1, t2, convergenceDelta, runClustering); - List clusters = Lists.newArrayList(); - Configuration conf = new Configuration(); - FileSystem fs = FileSystem.get(clustersIn.toUri(), conf); - for (ClusterWritable clusterWritable : new SequenceFileDirValueIterable( - clustersIn, PathType.LIST, PathFilters.logsCRCFilter(), conf)) { - MeanShiftCanopy canopy = (MeanShiftCanopy)clusterWritable.getValue(); - clusterer.mergeCanopy(canopy, clusters); - } - boolean[] converged = { false }; - int iteration = 1; - while (!converged[0] && iteration <= maxIterations) { - log.info("Mean Shift Iteration: {}", iteration); - clusters = clusterer.iterate(clusters, converged); - Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration); - SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path( - clustersOut, "part-r-00000"), Text.class, ClusterWritable.class); - try { - for (MeanShiftCanopy cluster : clusters) { - if (log.isDebugEnabled()) { - log.debug( - "Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}", - new Object[] { cluster.getId(), - AbstractCluster.formatVector(cluster.getCenter(), null), - cluster.getNumObservations(), - AbstractCluster.formatVector(cluster.getRadius(), null), - clustersOut.getName() }); - } - ClusterWritable clusterWritable = new ClusterWritable(); - clusterWritable.setValue(cluster); - writer.append(new Text(cluster.getIdentifier()), clusterWritable); - } - } finally { - Closeables.closeQuietly(writer); - } - clustersIn = clustersOut; - iteration++; - } - Path fromPath = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1)); - Path finalClustersIn = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1) + "-final"); - FileSystem.get(fromPath.toUri(), conf).rename(fromPath, finalClustersIn); - return finalClustersIn; - } - - /** - * Build new clusters using Hadoop - * - */ - private static Path buildClustersMR(Configuration conf, Path clustersIn, - Path output, DistanceMeasure measure, IKernelProfile aKernelProfile, - double t1, double t2, double convergenceDelta, int maxIterations, - boolean runClustering) throws IOException, InterruptedException, - ClassNotFoundException { - // iterate until the clusters converge - boolean converged = false; - int iteration = 1; - while (!converged && iteration <= maxIterations) { - int numReducers = Integer.valueOf(conf.get(MAPRED_REDUCE_TASKS, "1")); - log.info("Mean Shift Iteration: {}, numReducers {}", new Object[] { - iteration, numReducers }); - // point the output to a new directory per iteration - Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration); - Path controlOut = new Path(output, CONTROL_CONVERGED); - runIterationMR(conf, 
clustersIn, clustersOut, controlOut, measure - .getClass().getName(), aKernelProfile.getClass().getName(), t1, t2, - convergenceDelta, runClustering); - converged = FileSystem.get(controlOut.toUri(), conf).exists(controlOut); - // now point the input to the old output directory - clustersIn = clustersOut; - iteration++; - // decrease the number of reducers if it is > 1 to cross-pollenate - // map sets - if (numReducers > 1) { - numReducers--; - conf.set(MAPRED_REDUCE_TASKS, String.valueOf(numReducers)); - } - } - Path fromPath = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1)); - Path finalClustersIn = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1) + Cluster.FINAL_ITERATION_SUFFIX); - FileSystem.get(fromPath.toUri(), conf).rename(fromPath, finalClustersIn); - return finalClustersIn; - } - - /** - * Run an iteration using Hadoop - * - * @param conf - * the Configuration to use - * @param input - * the input pathname String - * @param output - * the output pathname String - * @param control - * the control path - * @param measureClassName - * the DistanceMeasure class name - * @param kernelProfileClassName - * an IKernel class name - * @param t1 - * the T1 distance threshold - * @param t2 - * the T2 distance threshold - * @param convergenceDelta - * the double convergence criteria - * @param runClustering - * if true accumulate merged clusters for subsequent clustering step - */ - private static void runIterationMR(Configuration conf, Path input, - Path output, Path control, String measureClassName, - String kernelProfileClassName, double t1, double t2, - double convergenceDelta, boolean runClustering) throws IOException, - InterruptedException, ClassNotFoundException { - - conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName); - conf.set(MeanShiftCanopyConfigKeys.KERNEL_PROFILE_KEY, - kernelProfileClassName); - conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, String - .valueOf(convergenceDelta)); - conf.set(MeanShiftCanopyConfigKeys.T1_KEY, String.valueOf(t1)); - conf.set(MeanShiftCanopyConfigKeys.T2_KEY, String.valueOf(t2)); - conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, control.toString()); - conf.set(MeanShiftCanopyConfigKeys.CLUSTER_POINTS_KEY, String - .valueOf(runClustering)); - Job job = new Job(conf, - "Mean Shift Driver running runIteration over input: " + input); - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(ClusterWritable.class); - - FileInputFormat.setInputPaths(job, input); - FileOutputFormat.setOutputPath(job, output); - - job.setMapperClass(MeanShiftCanopyMapper.class); - job.setReducerClass(MeanShiftCanopyReducer.class); - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setJarByClass(MeanShiftCanopyDriver.class); - if (!job.waitForCompletion(true)) { - throw new InterruptedException("Mean Shift Iteration failed on input " - + input); - } - } - - /** - * Run the job using supplied arguments - * - * @param input - * the directory pathname for input points - * @param clustersIn - * the directory pathname for input clusters - * @param output - * the directory pathname for output clustered points - * @param runSequential - * if true run in sequential execution mode - */ - public static void clusterData(Path input, Path clustersIn, Path output, - boolean runSequential) throws IOException, InterruptedException, - ClassNotFoundException { - if (runSequential) { - clusterDataSeq(input, clustersIn, output); - } else { - clusterDataMR(input, 
clustersIn, output); - } - } - - /** - * Cluster the data sequentially - */ - private static void clusterDataSeq(Path input, Path clustersIn, Path output) - throws IOException { - Collection clusters = Lists.newArrayList(); - Configuration conf = new Configuration(); - for (ClusterWritable clusterWritable : new SequenceFileDirValueIterable( - clustersIn, PathType.LIST, PathFilters.logsCRCFilter(), conf)) { - MeanShiftCanopy cluster = (MeanShiftCanopy) clusterWritable.getValue(); - clusters.add(cluster); - } - // iterate over all points, assigning each to the closest canopy and - // outputting that clustering - FileSystem fs = FileSystem.get(input.toUri(), conf); - FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter()); - int part = 0; - for (FileStatus s : status) { - SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path( - output, "part-m-" + part++), IntWritable.class, - WeightedVectorWritable.class); - try { - for (Pair record : new SequenceFileIterable( - s.getPath(), conf)) { - ClusterWritable clusterWritable = record.getSecond(); - MeanShiftCanopy canopy = (MeanShiftCanopy) clusterWritable.getValue(); - MeanShiftCanopy closest = MeanShiftCanopyClusterer - .findCoveringCanopy(canopy, clusters); - writer.append(new IntWritable(closest.getId()), - new WeightedVectorWritable(1, canopy.getCenter())); - } - } finally { - Closeables.closeQuietly(writer); - } - } - } - - /** - * Cluster the data using Hadoop - */ - private static void clusterDataMR(Path input, Path clustersIn, Path output) - throws IOException, InterruptedException, ClassNotFoundException { - Configuration conf = new Configuration(); - conf.set(STATE_IN_KEY, clustersIn.toString()); - Job job = new Job(conf, - "Mean Shift Driver running clusterData over input: " + input); - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(WeightedVectorWritable.class); - job.setMapperClass(MeanShiftCanopyClusterMapper.class); - - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setNumReduceTasks(0); - job.setJarByClass(MeanShiftCanopyDriver.class); - - FileInputFormat.setInputPaths(job, input); - FileOutputFormat.setOutputPath(job, output); - - if (!job.waitForCompletion(true)) { - throw new InterruptedException( - "Mean Shift Clustering failed on clustersIn " + clustersIn); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java deleted file mode 100644 index 9f1c5c02e..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java +++ /dev/null @@ -1,70 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
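The driver's static run(...) shown above chains three phases: createCanopyFromVectors (unless the input already holds canopies), buildClusters, and, when runClustering is set, clusterData. A minimal, hypothetical caller is sketched below; the paths and threshold values are made up, and EuclideanDistanceMeasure and TriangularKernelProfile are assumed to be the concrete classes available in this release:

```java
// Hypothetical invocation of the mean-shift pipeline; all values illustrative.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.kernel.TriangularKernelProfile;

public class MeanShiftExample {
  public static void main(String[] args) throws Exception {
    MeanShiftCanopyDriver.run(new Configuration(),
        new Path("input/vectors"),         // SequenceFile of VectorWritable
        new Path("output/meanshift"),
        new EuclideanDistanceMeasure(),
        new TriangularKernelProfile(),
        3.0,    // t1: outer distance threshold
        1.5,    // t2: inner distance threshold
        0.5,    // convergenceDelta
        10,     // maxIterations
        false,  // inputIsCanopies: input is raw vectors
        true,   // runClustering: also assign points to the final clusters
        false); // runSequential: use the MapReduce path
  }
}
```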
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.meanshift; - -import java.io.IOException; -import java.util.Collection; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.clustering.iterator.ClusterWritable; - -import com.google.common.collect.Lists; - -public class MeanShiftCanopyMapper extends Mapper,ClusterWritable,Text,ClusterWritable> { - - private final Collection canopies = Lists.newArrayList(); - - private MeanShiftCanopyClusterer clusterer; - -private Integer numReducers; - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - Configuration conf = context.getConfiguration(); - clusterer = new MeanShiftCanopyClusterer(conf); - numReducers = Integer.valueOf(conf.get(MeanShiftCanopyDriver.MAPRED_REDUCE_TASKS, "1")); - } - - @Override - protected void map(WritableComparable key, ClusterWritable clusterWritable, Context context) - throws IOException, InterruptedException { - MeanShiftCanopy canopy = (MeanShiftCanopy)clusterWritable.getValue(); - clusterer.mergeCanopy(canopy.shallowCopy(), canopies); - } - - @Override - protected void cleanup(Context context) throws IOException, InterruptedException { - int reducer = 0; - for (MeanShiftCanopy canopy : canopies) { - clusterer.shiftToMean(canopy); - ClusterWritable clusterWritable = new ClusterWritable(); - clusterWritable.setValue(canopy); - context.write(new Text(String.valueOf(reducer)), clusterWritable); - reducer++; - if (reducer >= numReducers) { - reducer=0; - } - } - super.cleanup(context); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java deleted file mode 100644 index 47b34a300..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
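MeanShiftCanopyMapper.cleanup() above deals its locally merged canopies out to reducers round-robin, keyed by the reducer index as text, so each reducer merges a cross-section of every mapper's canopies; buildClustersMR in the driver then shrinks the reducer count by one per iteration to mix the map sets further. A tiny self-contained demo of that key assignment (hypothetical class, same arithmetic as the mapper's increment-and-reset):

```java
// Hypothetical demo of the round-robin key assignment in MeanShiftCanopyMapper.cleanup().
public class RoundRobinDemo {
  public static void main(String[] args) {
    int numReducers = 3;  // the conf value the mapper reads via MAPRED_REDUCE_TASKS
    int reducer = 0;
    for (int canopy = 0; canopy < 7; canopy++) {
      System.out.println("canopy " + canopy + " -> reducer key \"" + reducer + "\"");
      reducer = (reducer + 1) % numReducers; // equivalent to the mapper's if-reset
    }
  }
}
```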
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.meanshift; - -import java.io.IOException; -import java.util.Collection; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.clustering.iterator.ClusterWritable; - -import com.google.common.collect.Lists; - -public class MeanShiftCanopyReducer extends Reducer { - - private final Collection canopies = Lists.newArrayList(); - private MeanShiftCanopyClusterer clusterer; - private boolean allConverged = true; - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - clusterer = new MeanShiftCanopyClusterer(context.getConfiguration()); - } - - @Override - protected void reduce(Text key, Iterable values, Context context) - throws IOException, InterruptedException { - for (ClusterWritable clusterWritable : values) { - MeanShiftCanopy canopy = (MeanShiftCanopy)clusterWritable.getValue(); - clusterer.mergeCanopy(canopy.shallowCopy(), canopies); - } - - for (MeanShiftCanopy canopy : canopies) { - boolean converged = clusterer.shiftToMean(canopy); - if (converged) { - context.getCounter("Clustering", "Converged Clusters").increment(1); - } - allConverged = converged && allConverged; - ClusterWritable clusterWritable = new ClusterWritable(); - clusterWritable.setValue(canopy); - context.write(new Text(canopy.getIdentifier()), clusterWritable); - } - - } - - @Override - protected void cleanup(Context context) throws IOException, InterruptedException { - Configuration conf = context.getConfiguration(); - if (allConverged) { - Path path = new Path(conf.get(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY)); - FileSystem.get(path.toUri(), conf).createNewFile(path); - } - super.cleanup(context); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java deleted file mode 100644 index 5cb572e4f..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java +++ /dev/null @@ -1,134 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.mahout.clustering.minhash; - -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.MurmurHash; -import org.apache.mahout.math.MurmurHash3; - -import java.util.Random; - -public final class HashFactory { - - private HashFactory() { - } - - public enum HashType { - LINEAR, POLYNOMIAL, MURMUR, MURMUR3 - } - - public static HashFunction[] createHashFunctions(HashType type, int numFunctions) { - HashFunction[] hashFunction = new HashFunction[numFunctions]; - Random seed = RandomUtils.getRandom(11); - switch (type) { - case LINEAR: - for (int i = 0; i < numFunctions; i++) { - hashFunction[i] = new LinearHash(seed.nextInt(), seed.nextInt()); - } - break; - case POLYNOMIAL: - for (int i = 0; i < numFunctions; i++) { - hashFunction[i] = new PolynomialHash(seed.nextInt(), seed.nextInt(), seed.nextInt()); - } - break; - case MURMUR: - for (int i = 0; i < numFunctions; i++) { - hashFunction[i] = new MurmurHashWrapper(seed.nextInt()); - } - break; - case MURMUR3: - for (int i = 0; i < numFunctions; i++) { - hashFunction[i] = new MurmurHash3Wrapper(seed.nextInt()); - } - break; - default: - throw new IllegalStateException("Unknown type: " + type); - } - return hashFunction; - } - - static class LinearHash implements HashFunction { - private final int seedA; - private final int seedB; - - LinearHash(int seedA, int seedB) { - this.seedA = seedA; - this.seedB = seedB; - } - - @Override - public int hash(byte[] bytes) { - long hashValue = 31; - for (long byteVal : bytes) { - hashValue *= seedA * byteVal; - hashValue += seedB; - } - return Math.abs((int) (hashValue % RandomUtils.MAX_INT_SMALLER_TWIN_PRIME)); - } - } - - static class PolynomialHash implements HashFunction { - private final int seedA; - private final int seedB; - private final int seedC; - - PolynomialHash(int seedA, int seedB, int seedC) { - this.seedA = seedA; - this.seedB = seedB; - this.seedC = seedC; - } - - @Override - public int hash(byte[] bytes) { - long hashValue = 31; - for (long byteVal : bytes) { - hashValue *= seedA * (byteVal >> 4); - hashValue += seedB * byteVal + seedC; - } - return Math - .abs((int) (hashValue % RandomUtils.MAX_INT_SMALLER_TWIN_PRIME)); - } - } - - static class MurmurHashWrapper implements HashFunction { - private final int seed; - - MurmurHashWrapper(int seed) { - this.seed = seed; - } - - @Override - public int hash(byte[] bytes) { - long hashValue = MurmurHash.hash64A(bytes, seed); - return Math.abs((int) (hashValue % RandomUtils.MAX_INT_SMALLER_TWIN_PRIME)); - } - } - - static class MurmurHash3Wrapper implements HashFunction { - private final int seed; - - MurmurHash3Wrapper(int seed) { - this.seed = seed; - } - - @Override - public int hash(byte[] bytes) { - long hashValue = MurmurHash3.murmurhash3_x86_32(bytes, 0, bytes.length, seed); - return Math.abs((int) (hashValue % RandomUtils.MAX_INT_SMALLER_TWIN_PRIME)); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/HashFunction.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/HashFunction.java deleted file mode 100644 index 88f4449e5..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/HashFunction.java +++ /dev/null @@ 
-1,23 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.clustering.minhash; - -public interface HashFunction { - - int hash(byte[] bytes); - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java deleted file mode 100644 index 5da3d8d02..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java +++ /dev/null @@ -1,132 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
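HashFunction is the single-method contract the minhash family implements; HashFactory hands back a deterministically seeded array of them. A minimal, hypothetical usage sketch, hashing one feature index with the same big-endian four-byte packing MinHashMapper (later in this diff) uses:

```java
// Hypothetical demo: build the minhash hash family and hash one feature index.
import org.apache.mahout.clustering.minhash.HashFactory;
import org.apache.mahout.clustering.minhash.HashFactory.HashType;
import org.apache.mahout.clustering.minhash.HashFunction;

public class HashDemo {
  public static void main(String[] args) {
    HashFunction[] fns = HashFactory.createHashFunctions(HashType.MURMUR, 10);
    int value = 12345; // a feature index from the vector, made up here
    byte[] bytes = {
        (byte) (value >> 24), (byte) (value >> 16),
        (byte) (value >> 8), (byte) value};
    for (HashFunction fn : fns) {
      System.out.println(fn.hash(bytes)); // non-negative, < MAX_INT_SMALLER_TWIN_PRIME
    }
  }
}
```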
- */ - -package org.apache.mahout.clustering.minhash; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.OutputFormat; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.commandline.MinhashOptionCreator; -import org.apache.mahout.math.VectorWritable; - -import java.io.IOException; - -public final class MinHashDriver extends AbstractJob { - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new MinHashDriver(), args); - } - - private void runJob(Path input, - Path output, - int minClusterSize, - int minVectorSize, - String hashType, - int numHashFunctions, - int keyGroups, - int numReduceTasks, - boolean debugOutput) throws IOException, ClassNotFoundException, InterruptedException { - Configuration conf = getConf(); - - conf.setInt(MinhashOptionCreator.MIN_CLUSTER_SIZE, minClusterSize); - conf.setInt(MinhashOptionCreator.MIN_VECTOR_SIZE, minVectorSize); - conf.set(MinhashOptionCreator.HASH_TYPE, hashType); - conf.setInt(MinhashOptionCreator.NUM_HASH_FUNCTIONS, numHashFunctions); - conf.setInt(MinhashOptionCreator.KEY_GROUPS, keyGroups); - conf.setBoolean(MinhashOptionCreator.DEBUG_OUTPUT, debugOutput); - - Class outputClass = debugOutput ? VectorWritable.class : Text.class; - Class outputFormatClass = - debugOutput ? 
SequenceFileOutputFormat.class : TextOutputFormat.class; - - Job job = new Job(conf, "MinHash Clustering"); - job.setJarByClass(MinHashDriver.class); - - FileInputFormat.setInputPaths(job, input); - FileOutputFormat.setOutputPath(job, output); - - job.setMapperClass(MinHashMapper.class); - job.setReducerClass(MinHashReducer.class); - - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(outputFormatClass); - - job.setMapOutputKeyClass(Text.class); - job.setMapOutputValueClass(outputClass); - - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(outputClass); - - job.setNumReduceTasks(numReduceTasks); - - job.waitForCompletion(true); - } - - @Override - public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException { - addInputOption(); - addOutputOption(); - addOption(MinhashOptionCreator.minClusterSizeOption().create()); - addOption(MinhashOptionCreator.minVectorSizeOption().create()); - addOption(MinhashOptionCreator.hashTypeOption().create()); - addOption(MinhashOptionCreator.numHashFunctionsOption().create()); - addOption(MinhashOptionCreator.keyGroupsOption().create()); - addOption(MinhashOptionCreator.numReducersOption().create()); - addOption(MinhashOptionCreator.debugOutputOption().create()); - addOption(DefaultOptionCreator.overwriteOption().create()); - - if (parseArguments(args) == null) { - return -1; - } - - Path input = getInputPath(); - Path output = getOutputPath(); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(getConf(), output); - } - int minClusterSize = Integer.valueOf(getOption(MinhashOptionCreator.MIN_CLUSTER_SIZE)); - int minVectorSize = Integer.valueOf(getOption(MinhashOptionCreator.MIN_VECTOR_SIZE)); - String hashType = getOption(MinhashOptionCreator.HASH_TYPE); - int numHashFunctions = Integer.valueOf(getOption(MinhashOptionCreator.NUM_HASH_FUNCTIONS)); - int keyGroups = Integer.valueOf(getOption(MinhashOptionCreator.KEY_GROUPS)); - int numReduceTasks = Integer.parseInt(getOption(MinhashOptionCreator.NUM_REDUCERS)); - boolean debugOutput = hasOption(MinhashOptionCreator.DEBUG_OUTPUT); - - runJob(input, - output, - minClusterSize, - minVectorSize, - hashType, - numHashFunctions, - keyGroups, - numReduceTasks, - debugOutput); - return 0; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java deleted file mode 100644 index e7b3dae28..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java +++ /dev/null @@ -1,117 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
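The run() method above wires the options through MinhashOptionCreator before delegating to runJob. A hypothetical launcher follows; the long option spellings are assumptions inferred from the option-creator constants used in run(), not verified flag names, and the paths are made up:

```java
// Hypothetical launcher for the minhash job; flag names are assumptions.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.minhash.MinHashDriver;

public class MinHashLaunch {
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new MinHashDriver(), new String[] {
        "--input", "vectors",            // SequenceFile of Text/VectorWritable
        "--output", "minhash-clusters",
        "--hashType", "MURMUR",          // must match a HashFactory.HashType constant
        "--numHashFunctions", "10",
        "--keyGroups", "2",
        "--minClusterSize", "5",
        "--minVectorSize", "5",
        "--numReducers", "2",
        "--overwrite"});
  }
}
```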
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.minhash; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.clustering.minhash.HashFactory.HashType; -import org.apache.mahout.common.commandline.MinhashOptionCreator; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; - -public class MinHashMapper extends Mapper<Text, VectorWritable, Text, Writable> { - - private static final Logger log = LoggerFactory.getLogger(MinHashMapper.class); - - private HashFunction[] hashFunction; - private int numHashFunctions; - private int keyGroups; - private int minVectorSize; - private boolean debugOutput; - private int[] minHashValues; - private byte[] bytesToHash; - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - Configuration conf = context.getConfiguration(); - this.numHashFunctions = conf.getInt(MinhashOptionCreator.NUM_HASH_FUNCTIONS, 10); - this.minHashValues = new int[numHashFunctions]; - this.bytesToHash = new byte[4]; - this.keyGroups = conf.getInt(MinhashOptionCreator.KEY_GROUPS, 1); - this.minVectorSize = conf.getInt(MinhashOptionCreator.MIN_VECTOR_SIZE, 5); - String htype = conf.get(MinhashOptionCreator.HASH_TYPE, "linear"); - this.debugOutput = conf.getBoolean(MinhashOptionCreator.DEBUG_OUTPUT, false); - - HashType hashType; - try { - hashType = HashType.valueOf(htype); - } catch (IllegalArgumentException iae) { - log.warn("No valid hash type found in configuration for {}, assuming type: {}", htype, HashType.LINEAR); - hashType = HashType.LINEAR; - } - hashFunction = HashFactory.createHashFunctions(hashType, numHashFunctions); - } - - /** - * Hash all items with each function and retain min. value for each iteration. We end up with X number of - * minhash signatures. - *
- * Now depending upon the number of key-groups (1 - 4) concatenate that many minhash values to form - * cluster-id as 'key' and item-id as 'value' - */ - @Override - public void map(Text item, VectorWritable features, Context context) throws IOException, InterruptedException { - Vector featureVector = features.get(); - if (featureVector.size() < minVectorSize) { - return; - } - // Initialize the minhash values to highest - for (int i = 0; i < numHashFunctions; i++) { - minHashValues[i] = Integer.MAX_VALUE; - } - - for (int i = 0; i < numHashFunctions; i++) { - for (Vector.Element ele : featureVector) { - int value = (int) ele.get(); - bytesToHash[0] = (byte) (value >> 24); - bytesToHash[1] = (byte) (value >> 16); - bytesToHash[2] = (byte) (value >> 8); - bytesToHash[3] = (byte) value; - int hashIndex = hashFunction[i].hash(bytesToHash); - //if our new hash value is less than the old one, replace the old one - if (minHashValues[i] > hashIndex) { - minHashValues[i] = hashIndex; - } - } - } - // output the cluster information - for (int i = 0; i < numHashFunctions; i++) { - StringBuilder clusterIdBuilder = new StringBuilder(); - for (int j = 0; j < keyGroups; j++) { - clusterIdBuilder.append(minHashValues[(i + j) % numHashFunctions]).append('-'); - } - //remove the last dash - clusterIdBuilder.deleteCharAt(clusterIdBuilder.length() - 1); - Text cluster = new Text(clusterIdBuilder.toString()); - Writable point; - if (debugOutput) { - point = new VectorWritable(featureVector.clone()); - } else { - point = new Text(item.toString()); - } - context.write(cluster, point); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java deleted file mode 100644 index f6bf554fb..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
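The map() above keeps, per hash function, the minimum hash value over all of an item's feature indices, then emits one candidate-cluster key per function by concatenating keyGroups consecutive signature positions, wrapping around modulo numHashFunctions. Under the standard minhash argument (a property of the technique, not something this source states), two items agree on any single signature position with probability equal to their Jaccard similarity J, so a g-component key collides with probability roughly J^g. A small trace of the key construction with a made-up signature:

```java
// Hypothetical trace of the cluster-key construction in MinHashMapper.map().
public class BandingDemo {
  public static void main(String[] args) {
    int[] minHashValues = {7, 3, 9, 1}; // made-up signature, numHashFunctions = 4
    int keyGroups = 2;
    int n = minHashValues.length;
    for (int i = 0; i < n; i++) {
      StringBuilder key = new StringBuilder();
      for (int j = 0; j < keyGroups; j++) {
        key.append(minHashValues[(i + j) % n]).append('-');
      }
      key.deleteCharAt(key.length() - 1); // strip the trailing dash
      System.out.println(key);            // prints 7-3, 3-9, 9-1, 1-7
    }
  }
}
```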
- */ - -package org.apache.mahout.clustering.minhash; - -import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.common.commandline.MinhashOptionCreator; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -import java.io.IOException; -import java.util.Collection; - -public class MinHashReducer extends Reducer { - - private int minClusterSize; - private boolean debugOutput; - - enum Clusters { - ACCEPTED, - DISCARDED - } - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - Configuration conf = context.getConfiguration(); - this.minClusterSize = conf.getInt(MinhashOptionCreator.MIN_CLUSTER_SIZE, 5); - this.debugOutput = conf.getBoolean(MinhashOptionCreator.DEBUG_OUTPUT, false); - } - - /** - * output the items clustered - */ - @Override - protected void reduce(Text cluster, Iterable points, Context context) - throws IOException, InterruptedException { - Collection pointList = Lists.newArrayList(); - for (Writable point : points) { - if (debugOutput) { - Vector pointVector = ((VectorWritable) point).get().clone(); - Writable writablePointVector = new VectorWritable(pointVector); - pointList.add(writablePointVector); - } else { - Writable pointText = new Text(point.toString()); - pointList.add(pointText); - } - } - if (pointList.size() >= minClusterSize) { - context.getCounter(Clusters.ACCEPTED).increment(1); - for (Writable point : pointList) { - context.write(cluster, point); - } - } else { - context.getCounter(Clusters.DISCARDED).increment(1); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/package-info.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/package-info.java deleted file mode 100644 index 1a1b14c02..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/package-info.java +++ /dev/null @@ -1,13 +0,0 @@ -/** - *
This package provides several clustering algorithm implementations. Clustering usually groups a set of - * objects into groups of similar items. The definition of similarity is usually up to you - for text documents, - * cosine-distance/-similarity is recommended. Mahout also features other types of distance measure like - * Euclidean distance. - * - * Input of each clustering algorithm is a set of vectors representing your items. For texts in general these are - * TFIDF or - * Bag of words representations of the documents. - * - * Output of each clustering algorithm is either a hard or soft assignment of items to clusters.
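The package docs above describe the input as vectors in sequence files. A minimal, hypothetical writer for such input (paths, ids, and weights are illustrative, not from the source):

```java
// Hypothetical: write clustering input as a SequenceFile of Text/VectorWritable,
// the format the drivers in this package consume.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.VectorWritable;

public class WriteVectors {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path("input/vectors/part-m-00000");
    SequenceFile.Writer writer =
        new SequenceFile.Writer(fs, conf, path, Text.class, VectorWritable.class);
    try {
      RandomAccessSparseVector v = new RandomAccessSparseVector(100);
      v.setQuick(3, 1.0);  // e.g. a TF-IDF weight for term id 3
      v.setQuick(17, 0.5);
      writer.append(new Text("doc-1"), new VectorWritable(v));
    } finally {
      writer.close();
    }
  }
}
```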
- */ -package org.apache.mahout.clustering; \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/AffinityMatrixInputJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/AffinityMatrixInputJob.java deleted file mode 100644 index d1e4be88c..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/AffinityMatrixInputJob.java +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.spectral.common; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.mahout.clustering.spectral.eigencuts.EigencutsKeys; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.hadoop.DistributedRowMatrix; - -public final class AffinityMatrixInputJob { - - private AffinityMatrixInputJob() { - } - - /** - * Initializes and executes the job of reading the documents containing - * the data of the affinity matrix in (x_i, x_j, value) format. 
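Concretely, the job reads plain text lines of i,j,value triples, one graph edge per line, as the mapper's javadoc later in this diff spells out. A hypothetical caller with a made-up three-point affinity (indices 0-based; for a sensible affinity matrix the entries should be symmetric):

```java
// Hypothetical caller; the commented input file contents are made up.
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.spectral.common.AffinityMatrixInputJob;

public class AffinityExample {
  public static void main(String[] args) throws Exception {
    // input/affinity.txt:
    //   0,1,0.8
    //   1,0,0.8
    //   0,2,0.1
    //   2,0,0.1
    //   1,2,0.4
    //   2,1,0.4
    AffinityMatrixInputJob.runJob(new Path("input"), new Path("output/affinity"), 3, 3);
  }
}
```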
- */ - public static void runJob(Path input, Path output, int rows, int cols) - throws IOException, InterruptedException, ClassNotFoundException { - Configuration conf = new Configuration(); - HadoopUtil.delete(conf, output); - - conf.setInt(EigencutsKeys.AFFINITY_DIMENSIONS, rows); - Job job = new Job(conf, "AffinityMatrixInputJob: " + input + " -> M/R -> " + output); - - job.setMapOutputKeyClass(IntWritable.class); - job.setMapOutputValueClass(DistributedRowMatrix.MatrixEntryWritable.class); - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(VectorWritable.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setMapperClass(AffinityMatrixInputMapper.class); - job.setReducerClass(AffinityMatrixInputReducer.class); - - FileInputFormat.addInputPath(job, input); - FileOutputFormat.setOutputPath(job, output); - - job.setJarByClass(AffinityMatrixInputJob.class); - - boolean succeeded = job.waitForCompletion(true); - if (!succeeded) { - throw new IllegalStateException("Job failed!"); - } - } - - /** - * A transparent wrapper for the above method which handles the tedious tasks - * of setting and retrieving system Paths. Hands back a fully-populated - * and initialized DistributedRowMatrix. - */ - public static DistributedRowMatrix runJob(Path input, Path output, int dimensions) - throws IOException, InterruptedException, ClassNotFoundException { - Path seqFiles = new Path(output, "seqfiles-" + (System.nanoTime() & 0xFF)); - runJob(input, seqFiles, dimensions, dimensions); - DistributedRowMatrix a = new DistributedRowMatrix(seqFiles, - new Path(seqFiles, "seqtmp-" + (System.nanoTime() & 0xFF)), - dimensions, dimensions); - a.setConf(new Configuration()); - return a; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/AffinityMatrixInputMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/AffinityMatrixInputMapper.java deleted file mode 100644 index 30906b6af..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/AffinityMatrixInputMapper.java +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.spectral.common; - -import java.io.IOException; -import java.util.regex.Pattern; - -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.math.hadoop.DistributedRowMatrix; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - *
Handles reading the files representing the affinity matrix. Since the affinity - * matrix is representative of a graph, each line in all the files should - * take the form: - * - * {@code i,j,value} - * - *
where {@code i} and {@code j} are the {@code i}th and - * {@code j} data points in the entire set, and {@code value} - * represents some measurement of their relative absolute magnitudes. This - * is, simply, a method for representing a graph textually. - */ -public class AffinityMatrixInputMapper - extends Mapper { - - private static final Logger log = LoggerFactory.getLogger(AffinityMatrixInputMapper.class); - - private static final Pattern COMMA_PATTERN = Pattern.compile(","); - - @Override - protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { - - String[] elements = COMMA_PATTERN.split(value.toString()); - log.debug("(DEBUG - MAP) Key[{}], Value[{}]", key.get(), value); - - // enforce well-formed textual representation of the graph - if (elements.length != 3) { - throw new IOException("Expected input of length 3, received " - + elements.length + ". Please make sure you adhere to " - + "the structure of (i,j,value) for representing a graph in text. " - + "Input line was: '" + value + "'."); - } - if (elements[0].isEmpty() || elements[1].isEmpty() || elements[2].isEmpty()) { - throw new IOException("Found an element of 0 length. Please be sure you adhere to the structure of " - + "(i,j,value) for representing a graph in text."); - } - - // parse the line of text into a DistributedRowMatrix entry, - // making the row (elements[0]) the key to the Reducer, and - // setting the column (elements[1]) in the entry itself - DistributedRowMatrix.MatrixEntryWritable toAdd = new DistributedRowMatrix.MatrixEntryWritable(); - IntWritable row = new IntWritable(Integer.valueOf(elements[0])); - toAdd.setRow(-1); // already set as the Reducer's key - toAdd.setCol(Integer.valueOf(elements[1])); - toAdd.setVal(Double.valueOf(elements[2])); - context.write(row, toAdd); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/AffinityMatrixInputReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/AffinityMatrixInputReducer.java deleted file mode 100644 index 1bd807cdb..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/AffinityMatrixInputReducer.java +++ /dev/null @@ -1,60 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.spectral.common; - -import java.io.IOException; - -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.clustering.spectral.eigencuts.EigencutsKeys; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.SequentialAccessSparseVector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.hadoop.DistributedRowMatrix; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Tasked with taking each DistributedRowMatrix entry and collecting them - * into vectors corresponding to rows. The input and output keys are the same, - * corresponding to the row in the ensuing matrix. The matrix entries are - * entered into a vector according to the column to which they belong, and - * the vector is then given the key corresponding to its row. - */ -public class AffinityMatrixInputReducer - extends Reducer { - - private static final Logger log = LoggerFactory.getLogger(AffinityMatrixInputReducer.class); - - @Override - protected void reduce(IntWritable row, Iterable values, Context context) - throws IOException, InterruptedException { - int size = context.getConfiguration().getInt(EigencutsKeys.AFFINITY_DIMENSIONS, Integer.MAX_VALUE); - RandomAccessSparseVector out = new RandomAccessSparseVector(size, 100); - - for (DistributedRowMatrix.MatrixEntryWritable element : values) { - out.setQuick(element.getCol(), element.getVal()); - if (log.isDebugEnabled()) { - log.debug("(DEBUG - REDUCE) Row[{}], Column[{}], Value[{}]", - new Object[] {row.get(), element.getCol(), element.getVal()}); - } - } - SequentialAccessSparseVector output = new SequentialAccessSparseVector(out); - context.write(row, new VectorWritable(output)); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/IntDoublePairWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/IntDoublePairWritable.java deleted file mode 100644 index 59d2ed80f..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/IntDoublePairWritable.java +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
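The reducer above scatters each (column, value) entry for a row into a RandomAccessSparseVector sized from EigencutsKeys.AFFINITY_DIMENSIONS, then copies it to a SequentialAccessSparseVector before writing, a form that favors the downstream row-wise reads. A tiny standalone sketch of that assembly for one row (values made up, matching the sample input earlier):

```java
// Hypothetical in-memory analogue of AffinityMatrixInputReducer for row 0.
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;

public class RowAssemblyDemo {
  public static void main(String[] args) {
    RandomAccessSparseVector out = new RandomAccessSparseVector(3, 100);
    out.setQuick(1, 0.8); // from input line "0,1,0.8"
    out.setQuick(2, 0.1); // from input line "0,2,0.1"
    // Copy to sequential-access form, as the reducer does before writing.
    System.out.println(new SequentialAccessSparseVector(out));
  }
}
```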
- */ - -package org.apache.mahout.clustering.spectral.common; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.Writable; - -/** - * This class is a Writable implementation of the mahout.common.Pair - * generic class. Since the generic types would also themselves have to - * implement Writable, it made more sense to create a more specialized - * version of the class altogether. - * - * In essence, this can be treated as a single Vector Element. - */ -public class IntDoublePairWritable implements Writable { - - private int key; - private double value; - - public IntDoublePairWritable() { - } - - public IntDoublePairWritable(int k, double v) { - this.key = k; - this.value = v; - } - - public void setKey(int k) { - this.key = k; - } - - public void setValue(double v) { - this.value = v; - } - - @Override - public void readFields(DataInput in) throws IOException { - this.key = in.readInt(); - this.value = in.readDouble(); - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(key); - out.writeDouble(value); - } - - public int getKey() { - return key; - } - - public double getValue() { - return value; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/MatrixDiagonalizeJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/MatrixDiagonalizeJob.java deleted file mode 100644 index ae94b4f54..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/MatrixDiagonalizeJob.java +++ /dev/null @@ -1,110 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
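A hypothetical round-trip test (not in the original source) of the write()/readFields() contract above, assuming the IntDoublePairWritable class is on the classpath:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public final class PairRoundTripDemo {
  public static void main(String[] args) throws IOException {
    IntDoublePairWritable written = new IntDoublePairWritable(7, 0.125);
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    written.write(new DataOutputStream(bytes));   // serializes a 4-byte int, then an 8-byte double
    IntDoublePairWritable read = new IntDoublePairWritable();
    read.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
    System.out.println(read.getKey() + " -> " + read.getValue()); // 7 -> 0.125
  }
}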
- */ - -package org.apache.mahout.clustering.spectral.common; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.mahout.clustering.spectral.eigencuts.EigencutsKeys; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -/** - * Given a matrix, this job returns a vector whose i_th element is the - * sum of all the elements in the i_th row of the original matrix. - */ -public final class MatrixDiagonalizeJob { - - private MatrixDiagonalizeJob() { - } - - public static Vector runJob(Path affInput, int dimensions) - throws IOException, ClassNotFoundException, InterruptedException { - - // set up all the job tasks - Configuration conf = new Configuration(); - Path diagOutput = new Path(affInput.getParent(), "diagonal"); - HadoopUtil.delete(conf, diagOutput); - conf.setInt(EigencutsKeys.AFFINITY_DIMENSIONS, dimensions); - Job job = new Job(conf, "MatrixDiagonalizeJob"); - - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setMapOutputKeyClass(NullWritable.class); - job.setMapOutputValueClass(IntDoublePairWritable.class); - job.setOutputKeyClass(NullWritable.class); - job.setOutputValueClass(VectorWritable.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setMapperClass(MatrixDiagonalizeMapper.class); - job.setReducerClass(MatrixDiagonalizeReducer.class); - - FileInputFormat.addInputPath(job, affInput); - FileOutputFormat.setOutputPath(job, diagOutput); - - job.setJarByClass(MatrixDiagonalizeJob.class); - - boolean succeeded = job.waitForCompletion(true); - if (!succeeded) { - throw new IllegalStateException("Job failed!"); - } - - // read the results back from the path - return VectorCache.load(conf, new Path(diagOutput, "part-r-00000")); - } - - public static class MatrixDiagonalizeMapper - extends Mapper<IntWritable, VectorWritable, NullWritable, IntDoublePairWritable> { - - @Override - protected void map(IntWritable key, VectorWritable row, Context context) - throws IOException, InterruptedException { - // store the sum - IntDoublePairWritable store = new IntDoublePairWritable(key.get(), row.get().zSum()); - context.write(NullWritable.get(), store); - } - } - - public static class MatrixDiagonalizeReducer - extends Reducer<NullWritable, IntDoublePairWritable, NullWritable, VectorWritable> { - - @Override - protected void reduce(NullWritable key, Iterable<IntDoublePairWritable> values, - Context context) throws IOException, InterruptedException { - // create the return vector - Vector retval = new DenseVector(context.getConfiguration().getInt( - EigencutsKeys.AFFINITY_DIMENSIONS, Integer.MAX_VALUE)); - // put everything in its correct spot - for (IntDoublePairWritable e : values) { - retval.setQuick(e.getKey(), e.getValue()); - } - // write it out - context.write(key, new VectorWritable(retval)); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/UnitVectorizerJob.java
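The same row-sum computation as above on a tiny in-memory matrix, as a sketch (class name and data are made up):

public final class RowSumDemo {
  public static void main(String[] args) {
    double[][] affinity = {
        {0.00, 0.85, 0.10},
        {0.85, 0.00, 0.20},
        {0.10, 0.20, 0.00}
    };
    double[] diagonal = new double[affinity.length];
    for (int i = 0; i < affinity.length; i++) {
      for (double a : affinity[i]) {
        diagonal[i] += a;  // d_i = sum_j a_ij, the mapper's zSum() per row
      }
    }
    System.out.println(java.util.Arrays.toString(diagonal)); // [0.95, 1.05, 0.3]
  }
}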
b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/UnitVectorizerJob.java deleted file mode 100644 index f61102b36..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/UnitVectorizerJob.java +++ /dev/null @@ -1,102 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.spectral.common; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.function.Functions; - -/** - *

<p>Given a DistributedRowMatrix, this job normalizes each row to unit - * vector length. If the input is a matrix U, and the output is a matrix - * W, the job follows:</p> - * - * <p>{@code v_ij = u_ij / sqrt(sum_j(u_ij * u_ij))}</p>
- */ -public final class UnitVectorizerJob { - - private UnitVectorizerJob() { - } - - public static void runJob(Path input, Path output) - throws IOException, InterruptedException, ClassNotFoundException { - - Configuration conf = new Configuration(); - Job job = new Job(conf, "UnitVectorizerJob"); - - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(VectorWritable.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setMapperClass(UnitVectorizerMapper.class); - job.setNumReduceTasks(0); - - FileInputFormat.addInputPath(job, input); - FileOutputFormat.setOutputPath(job, output); - - job.setJarByClass(UnitVectorizerJob.class); - - boolean succeeded = job.waitForCompletion(true); - if (!succeeded) { - throw new IllegalStateException("Job failed!"); - } - } - - public static class UnitVectorizerMapper - extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> { - - @Override - protected void map(IntWritable row, VectorWritable vector, Context context) - throws IOException, InterruptedException { - - // set up the return value and perform the computations - double norm = vectorNorm(vector.get()); - Vector w = vector.get().assign(Functions.div(norm)); - RandomAccessSparseVector out = new RandomAccessSparseVector(w); - - // finally write the output - context.write(row, new VectorWritable(out)); - } - - /** - * Sums the squares of all elements together, then takes the square root - * of that sum. - * @param u the vector whose norm is computed - * @return the Euclidean (L2) norm of {@code u} - */ - private static double vectorNorm(Iterable<Vector.Element> u) { - double retval = 0.0; - for (Vector.Element e : u) { - retval += Functions.POW.apply(e.get(), 2); - } - return Functions.SQRT.apply(retval); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/VectorCache.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/VectorCache.java deleted file mode 100644 index c39ad1c43..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/VectorCache.java +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
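A sketch of the normalization formula from the class comment above, on plain arrays (made-up data):

public final class UnitNormDemo {
  public static void main(String[] args) {
    double[] u = {3.0, 4.0};
    double norm = 0.0;
    for (double x : u) {
      norm += x * x;            // sum_j(u_ij * u_ij)
    }
    norm = Math.sqrt(norm);     // 5.0 for this row
    double[] w = new double[u.length];
    for (int j = 0; j < u.length; j++) {
      w[j] = u[j] / norm;       // v_ij = u_ij / sqrt(sum_j(u_ij * u_ij))
    }
    System.out.println(java.util.Arrays.toString(w)); // [0.6, 0.8]
  }
}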
- */ - -package org.apache.mahout.clustering.spectral.common; - -import java.io.IOException; -import java.net.URI; -import java.util.Arrays; - -import com.google.common.io.Closeables; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.filecache.DistributedCache; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - - -/** - * This class handles reading and writing vectors to the Hadoop - * distributed cache. Created as a result of Eigencuts' liberal use - * of such functionality, but available to any algorithm requiring it. - */ -public final class VectorCache { - - private static final Logger log = LoggerFactory.getLogger(VectorCache.class); - - private VectorCache() { - } - - /** - * - * @param key SequenceFile key - * @param vector Vector to save, to be wrapped as VectorWritable - */ - public static void save(Writable key, - Vector vector, - Path output, - Configuration conf, - boolean overwritePath, - boolean deleteOnExit) throws IOException { - - FileSystem fs = FileSystem.get(output.toUri(), conf); - output = fs.makeQualified(output); - if (overwritePath) { - HadoopUtil.delete(conf, output); - } - - // set the cache - DistributedCache.setCacheFiles(new URI[] {output.toUri()}, conf); - - // set up the writer - SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, - IntWritable.class, VectorWritable.class); - try { - writer.append(key, new VectorWritable(vector)); - } finally { - Closeables.closeQuietly(writer); - } - - if (deleteOnExit) { - fs.deleteOnExit(output); - } - } - - /** - * Calls the save() method, setting the cache to overwrite any previous - * Path and to delete the path after exiting - */ - public static void save(Writable key, Vector vector, Path output, Configuration conf) throws IOException { - save(key, vector, output, conf, true, true); - } - - /** - * Loads the vector from {@link DistributedCache}. Returns null if no vector exists. - */ - public static Vector load(Configuration conf) throws IOException { - URI[] files = DistributedCache.getCacheFiles(conf); - if (files == null || files.length < 1) { - return null; - } - log.info("Files are: {}", Arrays.toString(files)); - return load(conf, new Path(files[0].getPath())); - } - - /** - * Loads a Vector from the specified path. Returns null if no vector exists. 
- */ - public static Vector load(Configuration conf, Path input) throws IOException { - log.info("Loading vector from: {}", input); - SequenceFileValueIterator<VectorWritable> iterator = - new SequenceFileValueIterator<VectorWritable>(input, true, conf); - try { - return iterator.next().get(); - } finally { - Closeables.closeQuietly(iterator); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/VectorMatrixMultiplicationJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/VectorMatrixMultiplicationJob.java deleted file mode 100644 index ff6540163..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/VectorMatrixMultiplicationJob.java +++ /dev/null @@ -1,140 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.spectral.common; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.mahout.clustering.spectral.eigencuts.EigencutsKeys; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.function.Functions; -import org.apache.mahout.math.hadoop.DistributedRowMatrix; - -/** - *
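A usage sketch for VectorCache (not in the original source; the path is hypothetical): save() both writes the SequenceFile and registers it in the DistributedCache, so a later load(conf) needs no explicit path.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.mahout.clustering.spectral.common.VectorCache;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public final class VectorCacheDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Vector diagonal = new DenseVector(new double[] {0.95, 1.05, 0.30});
    // the 4-arg overload overwrites any prior file and marks it delete-on-exit
    VectorCache.save(new IntWritable(1), diagonal, new Path("/tmp/diagonal"), conf);
    Vector restored = VectorCache.load(conf);  // null if nothing was cached
    System.out.println(restored);
  }
}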

<p>This class handles the three-way multiplication of the diagonal matrix - * and the Markov transition matrix inherent in the Eigencuts algorithm. - * The equation takes the form:</p> - * - * {@code W = D^(1/2) * M * D^(1/2)} - * - *
Since the diagonal matrix D has only n non-zero elements, it is represented - * as a dense vector in this job, rather than a full n-by-n matrix. This job - * performs the multiplications and returns the new DRM. - */ -public final class VectorMatrixMultiplicationJob { - - private VectorMatrixMultiplicationJob() { - } - - /** - * Invokes the job. - * @param markovPath Path to the markov DRM's sequence files - */ - public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath) - throws IOException, ClassNotFoundException, InterruptedException { - - return runJob(markovPath, diag, outputPath, new Path(outputPath, "tmp")); - } - - public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath, Path tmpPath) - throws IOException, ClassNotFoundException, InterruptedException { - - // set up the serialization of the diagonal vector - Configuration conf = new Configuration(); - FileSystem fs = FileSystem.get(markovPath.toUri(), conf); - markovPath = fs.makeQualified(markovPath); - outputPath = fs.makeQualified(outputPath); - Path vectorOutputPath = new Path(outputPath.getParent(), "vector"); - VectorCache.save(new IntWritable(EigencutsKeys.DIAGONAL_CACHE_INDEX), diag, vectorOutputPath, conf); - - // set up the job itself - Job job = new Job(conf, "VectorMatrixMultiplication"); - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(VectorWritable.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setMapperClass(VectorMatrixMultiplicationMapper.class); - job.setNumReduceTasks(0); - - FileInputFormat.addInputPath(job, markovPath); - FileOutputFormat.setOutputPath(job, outputPath); - - job.setJarByClass(VectorMatrixMultiplicationJob.class); - - boolean succeeded = job.waitForCompletion(true); - if (!succeeded) { - throw new IllegalStateException("Job failed!"); - } - - // build the resulting DRM from the results - return new DistributedRowMatrix(outputPath, tmpPath, - diag.size(), diag.size()); - } - - public static class VectorMatrixMultiplicationMapper - extends Mapper { - - private Vector diagonal; - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - // read in the diagonal vector from the distributed cache - super.setup(context); - Configuration config = context.getConfiguration(); - diagonal = VectorCache.load(config); - if (diagonal == null) { - throw new IOException("No vector loaded from cache!"); - } - if (!(diagonal instanceof DenseVector)) { - diagonal = new DenseVector(diagonal); - } - } - - @Override - protected void map(IntWritable key, VectorWritable row, Context ctx) - throws IOException, InterruptedException { - - for (Vector.Element e : row.get()) { - double dii = Functions.SQRT.apply(diagonal.get(key.get())); - double djj = Functions.SQRT.apply(diagonal.get(e.index())); - double mij = e.get(); - e.set(dii * mij * djj); - } - ctx.write(key, row); - } - - /** - * Performs the setup of the Mapper. Used by unit tests. 
- * @param diag - */ - void setup(Vector diag) { - this.diagonal = diag; - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/VertexWritable.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/VertexWritable.java deleted file mode 100644 index 76aefb71f..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/common/VertexWritable.java +++ /dev/null @@ -1,101 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.spectral.common; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.Writable; - -/** - * Represents a vertex within the affinity graph for Eigencuts. 
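A numeric sketch of the element-wise form of W = D^(1/2) * M * D^(1/2) used in the map() method above, on plain arrays (made-up data):

public final class DiagonalScalingDemo {
  public static void main(String[] args) {
    double[] d = {0.95, 1.05};                  // diagonal entries of D
    double[][] m = {{0.0, 0.4}, {0.4, 0.0}};    // Markov transition matrix
    double[][] w = new double[2][2];
    for (int i = 0; i < 2; i++) {
      for (int j = 0; j < 2; j++) {
        // w_ij = sqrt(d_i) * m_ij * sqrt(d_j)
        w[i][j] = Math.sqrt(d[i]) * m[i][j] * Math.sqrt(d[j]);
      }
    }
    System.out.println(java.util.Arrays.deepToString(w));
  }
}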
- */ -public class VertexWritable implements Writable { - - /** the row */ - private int i; - - /** the column */ - private int j; - - /** the value at this vertex */ - private double value; - - /** an extra type delimiter, can probably be null */ - private String type; - - public VertexWritable() { - } - - public VertexWritable(int i, int j, double v, String t) { - this.i = i; - this.j = j; - this.value = v; - this.type = t; - } - - public int getRow() { - return i; - } - - public void setRow(int i) { - this.i = i; - } - - public int getCol() { - return j; - } - - public void setCol(int j) { - this.j = j; - } - - public double getValue() { - return value; - } - - public void setValue(double v) { - this.value = v; - } - - public String getType() { - return type; - } - - public void setType(String t) { - this.type = t; - } - - @Override - public void readFields(DataInput arg0) throws IOException { - this.i = arg0.readInt(); - this.j = arg0.readInt(); - this.value = arg0.readDouble(); - this.type = arg0.readUTF(); - } - - @Override - public void write(DataOutput arg0) throws IOException { - arg0.writeInt(i); - arg0.writeInt(j); - arg0.writeDouble(value); - arg0.writeUTF(type); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsAffinityCutsJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsAffinityCutsJob.java deleted file mode 100644 index cd534bff4..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsAffinityCutsJob.java +++ /dev/null @@ -1,214 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -package org.apache.mahout.clustering.spectral.eigencuts; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.mahout.clustering.spectral.common.VertexWritable; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class EigencutsAffinityCutsJob { - - private static final Logger log = LoggerFactory.getLogger(EigencutsAffinityCutsJob.class); - - private EigencutsAffinityCutsJob() { - } - - enum CUTSCOUNTER { - NUM_CUTS - } - - /** - * Runs a single iteration of defining cluster boundaries, based on - * previous calculations and the formation of the "cut matrix". - * - * @param currentAffinity Path to the current affinity matrix. - * @param cutMatrix Path to the sensitivity matrix. - * @param nextAffinity Output path for the new affinity matrix. - */ - public static long runjob(Path currentAffinity, Path cutMatrix, Path nextAffinity, Configuration conf) - throws IOException, ClassNotFoundException, InterruptedException { - - // these options allow us to differentiate between the two vectors - // in the mapper and reducer - we'll know from the working path - // which SequenceFile we're accessing - conf.set(EigencutsKeys.AFFINITY_PATH, currentAffinity.getName()); - conf.set(EigencutsKeys.CUTMATRIX_PATH, cutMatrix.getName()); - - Job job = new Job(conf, "EigencutsAffinityCutsJob"); - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setMapOutputKeyClass(Text.class); - job.setMapOutputValueClass(VertexWritable.class); - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(VectorWritable.class); - job.setMapperClass(EigencutsAffinityCutsMapper.class); - job.setCombinerClass(EigencutsAffinityCutsCombiner.class); - job.setReducerClass(EigencutsAffinityCutsReducer.class); - - //FileInputFormat.addInputPath(job, currentAffinity); - FileInputFormat.addInputPath(job, cutMatrix); - FileOutputFormat.setOutputPath(job, nextAffinity); - - boolean succeeded = job.waitForCompletion(true); - if (!succeeded) { - throw new IllegalStateException("Job failed!"); - } - - return job.getCounters().findCounter(CUTSCOUNTER.NUM_CUTS).getValue(); - } - - public static class EigencutsAffinityCutsMapper - extends Mapper { - - @Override - protected void map(IntWritable key, VectorWritable row, Context context) - throws IOException, InterruptedException { - - // all this method does is construct a bunch of vertices, mapping those - // together which have the same *combination* of indices; for example, - // (1, 3) will have the same key as (3, 1) but a different key from (1, 1) - // and (3, 3) (which, incidentally, will also not be grouped together) - String type = context.getWorkingDirectory().getName(); - Vector vector = row.get(); - for (Vector.Element e : vector) { - String newkey = Math.max(key.get(), e.index()) + "_" 
+ Math.min(key.get(), e.index()); - context.write(new Text(newkey), new VertexWritable(key.get(), e.index(), e.get(), type)); - } - } - } - - public static class EigencutsAffinityCutsCombiner - extends Reducer { - - @Override - protected void reduce(Text t, Iterable vertices, - Context context) throws IOException, InterruptedException { - // there should be exactly 4 items in the iterable; two from the - // first Path source, and two from the second with matching (i, j) indices - - // the idea here is that we want the two vertices of the "cut" matrix, - // and if either of them has a non-zero value, we want to: - // - // 1) zero out the two affinity vertices, and - // 2) add their former values to the (i, i) and (j, j) coordinates - // - // though obviously we want to perform these steps in reverse order - Configuration conf = context.getConfiguration(); - log.debug("{}", t); - boolean zero = false; - int i = -1; - int j = -1; - double k = 0; - int count = 0; - for (VertexWritable v : vertices) { - count++; - if (v.getType().equals(conf.get(EigencutsKeys.AFFINITY_PATH))) { - i = v.getRow(); - j = v.getCol(); - k = v.getValue(); - } else if (v.getValue() != 0.0) { - zero = true; - } - } - // if there are only two vertices, we have a diagonal - // we want to preserve whatever is currently in the diagonal, - // since this is acting as a running sum of all other values - // that have been "cut" so far - simply return this element as is - if (count == 2) { - VertexWritable vw = new VertexWritable(i, j, k, "unimportant"); - context.write(new Text(String.valueOf(i)), vw); - return; - } - - // do we zero out the values? - VertexWritable outI = new VertexWritable(); - VertexWritable outJ = new VertexWritable(); - if (zero) { - // increment the cut counter - context.getCounter(CUTSCOUNTER.NUM_CUTS).increment(1); - - // we want the values to exist on the diagonal - outI.setCol(i); - outJ.setCol(j); - - // also, set the old values to zero - VertexWritable zeroI = new VertexWritable(); - VertexWritable zeroJ = new VertexWritable(); - zeroI.setCol(j); - zeroI.setValue(0); - zeroJ.setCol(i); - zeroJ.setValue(0); - zeroI.setType("unimportant"); - zeroJ.setType("unimportant"); - context.write(new Text(String.valueOf(i)), zeroI); - context.write(new Text(String.valueOf(j)), zeroJ); - } else { - outI.setCol(j); - outJ.setCol(i); - } - - // set the values and write them - outI.setValue(k); - outJ.setValue(k); - outI.setType("unimportant"); - outJ.setType("unimportant"); - context.write(new Text(String.valueOf(i)), outI); - context.write(new Text(String.valueOf(j)), outJ); - } - } - - public static class EigencutsAffinityCutsReducer - extends Reducer { - - @Override - protected void reduce(Text row, Iterable entries, - Context context) throws IOException, InterruptedException { - // now to assemble the vectors - RandomAccessSparseVector output = new RandomAccessSparseVector( - context.getConfiguration().getInt(EigencutsKeys.AFFINITY_DIMENSIONS, Integer.MAX_VALUE), 100); - int rownum = Integer.parseInt(row.toString()); - for (VertexWritable e : entries) { - // first, are we setting a diagonal? 
- if (e.getCol() == rownum) { - // add to what's already present - output.setQuick(e.getCol(), output.getQuick(e.getCol()) + e.getValue()); - } else { - // simply set the value - output.setQuick(e.getCol(), e.getValue()); - } - } - context.write(new IntWritable(rownum), new VectorWritable(output)); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java deleted file mode 100644 index 9a837aab1..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java +++ /dev/null @@ -1,225 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
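A sketch of a single "cut" as performed by the combiner above: the off-diagonal weights (i, j) and (j, i) are zeroed and their mass folded into the diagonal (class name and data are made up):

public final class AffinityCutDemo {
  public static void main(String[] args) {
    double[][] a = {{0.0, 0.85}, {0.85, 0.0}};
    int i = 0;
    int j = 1;
    double weight = a[i][j];
    a[i][j] = 0.0;          // zero out the edge in both directions
    a[j][i] = 0.0;
    a[i][i] += weight;      // the diagonal acts as a running sum of cut weight
    a[j][j] += weight;
    System.out.println(java.util.Arrays.deepToString(a));
  }
}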
- */ - -package org.apache.mahout.clustering.spectral.eigencuts; - -import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.clustering.spectral.common.AffinityMatrixInputJob; -import org.apache.mahout.clustering.spectral.common.MatrixDiagonalizeJob; -import org.apache.mahout.clustering.spectral.common.VectorMatrixMultiplicationJob; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.decomposer.lanczos.LanczosState; -import org.apache.mahout.math.hadoop.DistributedRowMatrix; -import org.apache.mahout.math.hadoop.decomposer.DistributedLanczosSolver; -import org.apache.mahout.math.hadoop.decomposer.EigenVerificationJob; -import org.apache.mahout.math.stats.OnlineSummarizer; - -import java.io.IOException; -import java.util.Collection; -import java.util.List; -import java.util.Map; - -public class EigencutsDriver extends AbstractJob { - - public static final double EPSILON_DEFAULT = 0.25; - - public static final double TAU_DEFAULT = -0.1; - - public static final double OVERSHOOT_MULTIPLIER = 1.5; - - public static void main(String[] args) throws Exception { - ToolRunner.run(new EigencutsDriver(), args); - } - - @Override - public int run(String[] arg0) throws Exception { - - // set up command line arguments - addOption("half-life", "b", "Minimal half-life threshold", true); - addOption("dimensions", "d", "Square dimensions of affinity matrix", true); - addOption("epsilon", "e", "Half-life threshold coefficient", Double.toString(EPSILON_DEFAULT)); - addOption("tau", "t", "Threshold for cutting affinities", Double.toString(TAU_DEFAULT)); - addOption("eigenrank", "k", "Number of top eigenvectors to use", true); - addOption(DefaultOptionCreator.inputOption().create()); - addOption(DefaultOptionCreator.outputOption().create()); - addOption(DefaultOptionCreator.overwriteOption().create()); - Map> parsedArgs = parseArguments(arg0); - if (parsedArgs == null) { - return 0; - } - - // read in the command line values - Path input = getInputPath(); - Path output = getOutputPath(); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(getConf(), output); - } - int dimensions = Integer.parseInt(getOption("dimensions")); - double halflife = Double.parseDouble(getOption("half-life")); - double epsilon = Double.parseDouble(getOption("epsilon")); - double tau = Double.parseDouble(getOption("tau")); - int eigenrank = Integer.parseInt(getOption("eigenrank")); - - run(getConf(), input, output, eigenrank, dimensions, halflife, epsilon, tau); - - return 0; - } - - /** - * Run the Eigencuts clustering algorithm using the supplied arguments - * - * @param conf the Configuration to use - * @param input the Path to the directory containing input affinity tuples - * @param output the Path to the output directory - * @param eigenrank The number of top eigenvectors/eigenvalues to use - * @param dimensions the int number of dimensions of the square affinity matrix - * @param halflife the double minimum half-life threshold - * @param epsilon the double coefficient for setting minimum half-life threshold - * @param tau the double tau threshold for cutting links in the affinity graph - */ - public static void run(Configuration conf, - Path input, - Path 
output, - int dimensions, - int eigenrank, - double halflife, - double epsilon, - double tau) - throws IOException, InterruptedException, ClassNotFoundException { - // set the instance variables - // create a few new Paths for temp files and transformations - Path outputCalc = new Path(output, "calculations"); - Path outputTmp = new Path(output, "temporary"); - - DistributedRowMatrix A = AffinityMatrixInputJob.runJob(input, outputCalc, dimensions); - Vector D = MatrixDiagonalizeJob.runJob(A.getRowPath(), dimensions); - - long numCuts; - do { - // first three steps are the same as spectral k-means: - // 1) calculate D from A - // 2) calculate L = D^-0.5 * A * D^-0.5 - // 3) calculate eigenvectors of L - - DistributedRowMatrix L = - VectorMatrixMultiplicationJob.runJob(A.getRowPath(), D, - new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF))); - L.setConf(new Configuration(conf)); - - // eigendecomposition (step 3) - int overshoot = (int) ((double) eigenrank * OVERSHOOT_MULTIPLIER); - LanczosState state = new LanczosState(L, eigenrank, - new DistributedLanczosSolver().getInitialVector(L)); - - DistributedRowMatrix U = performEigenDecomposition(conf, L, state, eigenrank, overshoot, outputCalc); - U.setConf(new Configuration(conf)); - List eigenValues = Lists.newArrayList(); - for (int i=0; i 0) { - // recalculate A - A = new DistributedRowMatrix(input, - new Path(outputTmp, Long.toString(System.nanoTime())), dimensions, dimensions); - A.setConf(new Configuration()); - } - } while (numCuts > 0); - - // TODO: MAHOUT-517: Eigencuts needs an output format - } - - /** - * Does most of the heavy lifting in setting up Paths, configuring return - * values, and generally performing the tedious administrative tasks involved - * in an eigen-decomposition and running the verifier - */ - public static DistributedRowMatrix performEigenDecomposition(Configuration conf, - DistributedRowMatrix input, - LanczosState state, - int numEigenVectors, - int overshoot, - Path tmp) throws IOException { - DistributedLanczosSolver solver = new DistributedLanczosSolver(); - Path seqFiles = new Path(tmp, "eigendecomp-" + (System.nanoTime() & 0xFF)); - solver.runJob(conf, - state, - overshoot, - true, - seqFiles.toString()); - - // now run the verifier to trim down the number of eigenvectors - EigenVerificationJob verifier = new EigenVerificationJob(); - Path verifiedEigens = new Path(tmp, "verifiedeigens"); - verifier.runJob(conf, seqFiles, input.getRowPath(), verifiedEigens, false, 1.0, numEigenVectors); - Path cleanedEigens = verifier.getCleanedEigensPath(); - return new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), numEigenVectors, input.numRows()); - } - - /** - * A quick and dirty hack to compute the median of a vector... 
- * @param v the vector of values to summarize - * @return the median for vectors of 100 or more elements; the mean for shorter ones - */ - private static double median(Vector v) { - OnlineSummarizer med = new OnlineSummarizer(); - if (v.size() < 100) { - return v.zSum() / v.size(); - } - for (Vector.Element e : v) { - med.add(e.get()); - } - return med.getMedian(); - } - - /** - * Iteratively loops through the list, converting it to a Vector of double - * primitives worthy of other Mahout operations - */ - private static Vector listToVector(Collection<Double> list) { - Vector retval = new DenseVector(list.size()); - int index = 0; - for (Double d : list) { - retval.setQuick(index++, d); - } - return retval; - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsKeys.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsKeys.java deleted file mode 100644 index a161fdbce..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsKeys.java +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.spectral.eigencuts; - -/** - * Configuration keys for the Eigencuts algorithm (analogous to KMeansConfigKeys) - */ -public interface EigencutsKeys { - - /** - * B_0, or the user-specified minimum eigenflow half-life threshold - * for an eigenvector/eigenvalue pair to be considered. Increasing - * B_0 equates to fewer clusters - */ - String BETA = "org.apache.mahout.clustering.spectral.beta"; - - /** - * Tau, or the user-specified threshold for making cuts (setting edge - * affinities to 0) after performing non-maximal suppression on edge weight - * sensitivities. Increasing tau equates to more edge cuts - */ - String TAU = "org.apache.mahout.clustering.spectral.tau"; - - /** - * The normalization factor for computing the cut threshold - */ - String DELTA = "org.apache.mahout.clustering.spectral.delta"; - - /** - * Epsilon, or the user-specified coefficient that works in tandem with - * MINIMUM_HALF_LIFE to determine which eigenvector/eigenvalue pairs to use. - * Increasing epsilon equates to fewer eigenvector/eigenvalue pairs - */ - String EPSILON = "org.apache.mahout.clustering.spectral.epsilon"; - - /** - * Base path to the location on HDFS where the diagonal matrix (a vector) - * and the list of eigenvalues will be stored for one of the map/reduce - * jobs in Eigencuts.
- */ - String VECTOR_CACHE_BASE = "org.apache.mahout.clustering.spectral.eigencuts.vectorcache"; - - /** - * Refers to the dimensions of the raw affinity matrix input. Since this - * matrix is symmetrical, it is a square matrix, hence all its dimensions - * are equal. - */ - String AFFINITY_DIMENSIONS = "org.apache.mahout.clustering.spectral.eigencuts.affinitydimensions"; - - /** - * Refers to the Path to the SequenceFile representing the affinity matrix - */ - String AFFINITY_PATH = "org.apache.mahout.clustering.spectral.eigencuts.affinitypath"; - - /** - * Refers to the Path to the SequenceFile representing the cut matrix - */ - String CUTMATRIX_PATH = "org.apache.mahout.clustering.spectral.eigencuts.cutmatrixpath"; - - /** - * Sets the SequenceFile index for the list of eigenvalues. - */ - int EIGENVALUES_CACHE_INDEX = 0; - - /** - * Sets the SequenceFile index for the diagonal matrix. - */ - int DIAGONAL_CACHE_INDEX = 1; -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsSensitivityJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsSensitivityJob.java deleted file mode 100644 index c6037e859..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsSensitivityJob.java +++ /dev/null @@ -1,127 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.spectral.eigencuts; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.mahout.clustering.spectral.common.VectorCache; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -/** - *

<p>There are quite a few operations bundled within this mapper. Gather 'round - * and listen, all of ye.</p> - * - * <p>The input to this job is eight items:</p> - * <ol> - *   <li>B0, which is a command-line parameter fed through the Configuration object</li> - *   <li>diagonal matrix, a constant vector fed through the Hadoop cache</li> - *   <li>list of eigenvalues, a constant vector fed through the Hadoop cache</li> - *   <li>eigenvector, the input value to the mapper</li> - *   <li>epsilon</li> - *   <li>delta</li> - *   <li>tau</li> - *   <li>output, the Path to the output matrix of sensitivities</li> - * </ol> - * - * <p>The first three items are constant and are used in all of the map - * tasks. The row index indicates which eigenvalue from the list to use, and - * also serves as the output identifier. The diagonal matrix and the - * eigenvector are both of equal length and are iterated through twice - * within each map task, unfortunately lending each task to a runtime of - * n^2. This is unavoidable.</p> - * - * <p>For each (i, j) combination of elements within the eigenvector, a complex - * equation is run that explicitly computes the sensitivity to perturbation of - * the flow of probability within the specific edge of the graph. Each - * sensitivity, as it is computed, is simultaneously applied to a non-maximal - * suppression step: for a given sensitivity S_ij, it must be suppressed if - * any other S_in or S_mj has a more negative value. Thus, only the most - * negative S_ij within its row i or its column j is stored in the return - * array, leading to an output (per eigenvector!) with maximum length n, - * minimum length 1.</p> - * - * <p>Overall, this creates an n-by-n (possibly sparse) matrix with a maximum - * of n^2 non-zero elements, minimum of n non-zero elements.</p>
- */ -public final class EigencutsSensitivityJob { - - private EigencutsSensitivityJob() { - } - - /** - * Initializes the configuration tasks, loads the needed data into - * the HDFS cache, and executes the job. - * - * @param eigenvalues Vector of eigenvalues - * @param diagonal Vector representing the diagonal matrix - * @param eigenvectors Path to the DRM of eigenvectors - * @param output Path to the output matrix (will have between n and full-rank - * non-zero elements) - */ - public static void runJob(Vector eigenvalues, - Vector diagonal, - Path eigenvectors, - double beta, - double tau, - double delta, - double epsilon, - Path output) - throws IOException, ClassNotFoundException, InterruptedException { - - // save the two vectors to the distributed cache - Configuration jobConfig = new Configuration(); - Path eigenOutputPath = new Path(output.getParent(), "eigenvalues"); - Path diagOutputPath = new Path(output.getParent(), "diagonal"); - jobConfig.set(EigencutsKeys.VECTOR_CACHE_BASE, output.getParent().getName()); - VectorCache.save(new IntWritable(EigencutsKeys.EIGENVALUES_CACHE_INDEX), - eigenvalues, eigenOutputPath, jobConfig); - VectorCache.save(new IntWritable(EigencutsKeys.DIAGONAL_CACHE_INDEX), - diagonal, diagOutputPath, jobConfig); - - // set up the rest of the job - jobConfig.set(EigencutsKeys.BETA, Double.toString(beta)); - jobConfig.set(EigencutsKeys.EPSILON, Double.toString(epsilon)); - jobConfig.set(EigencutsKeys.DELTA, Double.toString(delta)); - jobConfig.set(EigencutsKeys.TAU, Double.toString(tau)); - - Job job = new Job(jobConfig, "EigencutsSensitivityJob"); - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setMapOutputKeyClass(IntWritable.class); - job.setMapOutputValueClass(EigencutsSensitivityNode.class); - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(VectorWritable.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setMapperClass(EigencutsSensitivityMapper.class); - job.setReducerClass(EigencutsSensitivityReducer.class); - - FileInputFormat.addInputPath(job, eigenvectors); - FileOutputFormat.setOutputPath(job, output); - - boolean succeeded = job.waitForCompletion(true); - if (!succeeded) { - throw new IllegalStateException("Job failed!"); - } - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsSensitivityMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsSensitivityMapper.java deleted file mode 100644 index c9cf445ee..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsSensitivityMapper.java +++ /dev/null @@ -1,142 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.spectral.eigencuts; - -import java.io.IOException; -import java.util.Map; - -import com.google.common.collect.Maps; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.clustering.spectral.common.VectorCache; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.SequentialAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.function.Functions; - -public class EigencutsSensitivityMapper extends - Mapper { - - private Vector eigenvalues; - private Vector diagonal; - private double beta0; - private double epsilon; - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - Configuration config = context.getConfiguration(); - beta0 = Double.parseDouble(config.get(EigencutsKeys.BETA)); - epsilon = Double.parseDouble(config.get(EigencutsKeys.EPSILON)); - - // read in the two vectors from the cache - eigenvalues = VectorCache.load(config); - diagonal = VectorCache.load(config); - if (!(eigenvalues instanceof SequentialAccessSparseVector || eigenvalues instanceof DenseVector)) { - eigenvalues = new SequentialAccessSparseVector(eigenvalues); - } - if (!(diagonal instanceof SequentialAccessSparseVector || diagonal instanceof DenseVector)) { - diagonal = new SequentialAccessSparseVector(diagonal); - } - } - - @Override - protected void map(IntWritable row, VectorWritable vw, Context context) - throws IOException, InterruptedException { - - // first, does this particular eigenvector even pass the required threshold? - double eigenvalue = Math.abs(eigenvalues.get(row.get())); - double betak = -Functions.LOGARITHM.apply(2) / Functions.LOGARITHM.apply(eigenvalue); - if (eigenvalue >= 1.0 || betak <= epsilon * beta0) { - // doesn't pass the threshold! quit - return; - } - - // go through the vector, performing the calculations - // sadly, no way to get around n^2 computations - Map columns = Maps.newHashMap(); - Vector ev = vw.get(); - for (int i = 0; i < ev.size(); i++) { - double minsij = Double.MAX_VALUE; - int minInd = -1; - for (int j = 0; j < ev.size(); j++) { - double sij = performSensitivityCalculation(eigenvalue, ev.get(i), - ev.get(j), diagonal.get(i), diagonal.get(j)); - - // perform non-maximal suppression - // is this the smallest value in the row? - if (sij < minsij) { - minsij = sij; - minInd = j; - } - } - - // is this the smallest value in the column? 
- Integer column = minInd; - EigencutsSensitivityNode value = new EigencutsSensitivityNode(i, minInd, minsij); - if (!columns.containsKey(column)) { - columns.put(column, value); - } else if (columns.get(column).getSensitivity() > minsij) { - columns.remove(column); - columns.put(column, value); - } - } - - // write whatever values made it through - - for (EigencutsSensitivityNode e : columns.values()) { - context.write(new IntWritable(e.getRow()), e); - } - } - - /** - * Helper method, performs the actual calculation. Looks something like this: - * - * (log(2) / lambda_k * log(lambda_k) * log(lambda_k^beta0 / 2)) * [ - * - (((u_i / sqrt(d_i)) - (u_j / sqrt(d_j)))^2 + (1 - lambda) * - * ((u_i^2 / d_i) + (u_j^2 / d_j))) ] - */ - private double performSensitivityCalculation(double eigenvalue, - double evi, - double evj, - double diagi, - double diagj) { - - double firsthalf = Functions.LOGARITHM.apply(2) - / (eigenvalue * Functions.LOGARITHM.apply(eigenvalue) - * Functions.LOGARITHM.apply(Functions.POW.apply(eigenvalue, beta0) / 2)); - - double secondhalf = - -Functions.POW.apply(evi / Functions.SQRT.apply(diagi) - evj / Functions.SQRT.apply(diagj), 2) - + (1.0 - eigenvalue) * (Functions.POW.apply(evi, 2) / diagi + Functions.POW.apply(evj, 2) / diagj); - - return firsthalf * secondhalf; - } - - /** - * Utility helper method, used for unit testing. - */ - void setup(double beta0, double epsilon, Vector eigenvalues, Vector diagonal) { - this.beta0 = beta0; - this.epsilon = epsilon; - this.eigenvalues = eigenvalues; - this.diagonal = diagonal; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsSensitivityNode.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsSensitivityNode.java deleted file mode 100644 index 5c26cd099..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsSensitivityNode.java +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.spectral.eigencuts; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.Writable; - -/** - * This class allows the storage of computed sensitivities in an - * unordered fashion, instead having each sensitivity track its - * own (i, j) coordinate. 
Thus these objects can be stored as elements - * in any list or, in particular, Writable array. - */ -public class EigencutsSensitivityNode implements Writable { - - private int row; - private int column; - private double sensitivity; - - public EigencutsSensitivityNode(int i, int j, double s) { - row = i; - column = j; - sensitivity = s; - } - - @Override - public void readFields(DataInput in) throws IOException { - this.row = in.readInt(); - this.column = in.readInt(); - this.sensitivity = in.readDouble(); - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeInt(row); - out.writeInt(column); - out.writeDouble(sensitivity); - } - - public int getRow() { - return row; - } - - public int getColumn() { - return column; - } - - public double getSensitivity() { - return sensitivity; - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsSensitivityReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsSensitivityReducer.java deleted file mode 100644 index a8f9fd437..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsSensitivityReducer.java +++ /dev/null @@ -1,53 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.spectral.eigencuts; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -/** - *

This class collects the per-row arrays of sensitivities - * and assembles them into a single matrix. Since many values may, - * according to their (i, j) coordinates, overlap in the matrix, - * the "winner" at each coordinate is whichever value is smaller.
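- * For example (illustrative values): if two nodes for coordinate (2, 5) arrive - * with sensitivities -0.7 and -1.3, the assembled matrix keeps -1.3 at (2, 5), - * provided it also clears the tau/delta threshold applied in reduce().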

- */ -public class EigencutsSensitivityReducer extends - Reducer { - - @Override - protected void reduce(IntWritable key, Iterable arr, Context context) - throws IOException, InterruptedException { - Configuration conf = context.getConfiguration(); - Vector v = new RandomAccessSparseVector(conf.getInt(EigencutsKeys.AFFINITY_DIMENSIONS, Integer.MAX_VALUE), 100); - double threshold = Double.parseDouble(conf.get(EigencutsKeys.TAU)) - / Double.parseDouble(conf.get(EigencutsKeys.DELTA)); - - for (EigencutsSensitivityNode n : arr) { - if (n.getSensitivity() < threshold && n.getSensitivity() < v.getQuick(n.getColumn())) { - v.setQuick(n.getColumn(), n.getSensitivity()); - } - } - context.write(key, new VectorWritable(v)); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java deleted file mode 100644 index 7b00cc73e..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java +++ /dev/null @@ -1,193 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.spectral.kmeans; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.kmeans.KMeansDriver; -import org.apache.mahout.clustering.kmeans.RandomSeedGenerator; -import org.apache.mahout.clustering.spectral.common.AffinityMatrixInputJob; -import org.apache.mahout.clustering.spectral.common.MatrixDiagonalizeJob; -import org.apache.mahout.clustering.spectral.common.UnitVectorizerJob; -import org.apache.mahout.clustering.spectral.common.VectorMatrixMultiplicationJob; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.decomposer.lanczos.LanczosState; -import org.apache.mahout.math.hadoop.DistributedRowMatrix; -import org.apache.mahout.math.hadoop.decomposer.DistributedLanczosSolver; -import org.apache.mahout.math.hadoop.decomposer.EigenVerificationJob; - -import java.io.IOException; -import java.util.List; -import java.util.Map; - -/** - * Implementation of the EigenCuts spectral clustering algorithm. - */ -public class SpectralKMeansDriver extends AbstractJob { - - public static final double OVERSHOOT_MULTIPLIER = 2.0; - - public static void main(String[] args) throws Exception { - ToolRunner.run(new SpectralKMeansDriver(), args); - } - - @Override - public int run(String[] arg0) - throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException, InterruptedException { - // set up command line options - Configuration conf = getConf(); - addInputOption(); - addOutputOption(); - addOption("dimensions", "d", "Square dimensions of affinity matrix", true); - addOption("clusters", "k", "Number of clusters and top eigenvectors", true); - addOption(DefaultOptionCreator.distanceMeasureOption().create()); - addOption(DefaultOptionCreator.convergenceOption().create()); - addOption(DefaultOptionCreator.maxIterationsOption().create()); - addOption(DefaultOptionCreator.overwriteOption().create()); - Map> parsedArgs = parseArguments(arg0); - if (parsedArgs == null) { - return 0; - } - - Path input = getInputPath(); - Path output = getOutputPath(); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(conf, output); - } - int numDims = Integer.parseInt(getOption("dimensions")); - int clusters = Integer.parseInt(getOption("clusters")); - String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); - DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); - double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); - int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); - - run(conf, input, output, numDims, clusters, measure, convergenceDelta, maxIterations); - - return 0; - } - - /** - * Run the Spectral KMeans clustering on the supplied arguments - * - * @param conf the Configuration to be used - * @param input the Path to the input tuples directory - * @param output the Path to the output directory - * @param numDims the int number of dimensions of the affinity matrix - * @param clusters the int number of eigenvectors and thus clusters to produce - * @param 
measure the DistanceMeasure for the k-Means calculations - * @param convergenceDelta the double convergence delta for the k-Means calculations - * @param maxIterations the int maximum number of iterations for the k-Means calculations - */ - public static void run(Configuration conf, - Path input, - Path output, - int numDims, - int clusters, - DistanceMeasure measure, - double convergenceDelta, - int maxIterations) - throws IOException, InterruptedException, ClassNotFoundException { - // create a few new Paths for temp files and transformations - Path outputCalc = new Path(output, "calculations"); - Path outputTmp = new Path(output, "temporary"); - - // Take in the raw CSV text file and split it ourselves, - // creating our own SequenceFiles for the matrices to read later - // (similar to the style of syntheticcontrol.canopy.InputMapper) - Path affSeqFiles = new Path(outputCalc, "seqfile-" + (System.nanoTime() & 0xFF)); - AffinityMatrixInputJob.runJob(input, affSeqFiles, numDims, numDims); - - // Next step: construct the affinity matrix using the newly-created - // sequence files - DistributedRowMatrix A = new DistributedRowMatrix(affSeqFiles, - new Path(outputTmp, "afftmp-" + (System.nanoTime() & 0xFF)), - numDims, - numDims); - Configuration depConf = new Configuration(conf); - A.setConf(depConf); - - // Next step: construct the diagonal matrix D (represented as a vector) - // and calculate the normalized Laplacian of the form: - // L = D^(-0.5)AD^(-0.5) - Vector D = MatrixDiagonalizeJob.runJob(affSeqFiles, numDims); - DistributedRowMatrix L = - VectorMatrixMultiplicationJob.runJob(affSeqFiles, D, - new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF)), new Path(outputCalc, "laplacian-tmp-" + (System.nanoTime() & 0xFF))); - L.setConf(depConf); - - // Next step: perform eigen-decomposition using LanczosSolver - // since some of the eigen-output is spurious and will be eliminated - // upon verification, we have to aim to overshoot and then discard - // unnecessary vectors later - int overshoot = (int) ((double) clusters * OVERSHOOT_MULTIPLIER); - DistributedLanczosSolver solver = new DistributedLanczosSolver(); - LanczosState state = new LanczosState(L, clusters, solver.getInitialVector(L)); - Path lanczosSeqFiles = new Path(outputCalc, "eigenvectors-" + (System.nanoTime() & 0xFF)); - solver.runJob(conf, - state, - overshoot, - true, - lanczosSeqFiles.toString()); - - // perform a verification - EigenVerificationJob verifier = new EigenVerificationJob(); - Path verifiedEigensPath = new Path(outputCalc, "eigenverifier"); - verifier.runJob(conf, lanczosSeqFiles, L.getRowPath(), verifiedEigensPath, true, 1.0, clusters); - Path cleanedEigens = verifier.getCleanedEigensPath(); - DistributedRowMatrix W = new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), clusters, numDims); - W.setConf(depConf); - DistributedRowMatrix Wtrans = W.transpose(); - // DistributedRowMatrix Wt = W.transpose(); - - // next step: normalize the rows of Wt to unit length - Path unitVectors = new Path(outputCalc, "unitvectors-" + (System.nanoTime() & 0xFF)); - UnitVectorizerJob.runJob(Wtrans.getRowPath(), unitVectors); - DistributedRowMatrix Wt = new DistributedRowMatrix(unitVectors, new Path(unitVectors, "tmp"), clusters, numDims); - Wt.setConf(depConf); - - // Finally, perform k-means clustering on the rows of L (or W) - // generate random initial clusters - Path initialclusters = RandomSeedGenerator.buildRandom(conf, - Wt.getRowPath(), - new Path(output, Cluster.INITIAL_CLUSTERS_DIR), - 
clusters, - measure); - - // The output format is the same as the K-means output format. - // TODO: Perhaps a conversion of the output format from points and clusters - // in eigenspace to the original dataset. Currently, the user has to perform - // the association step after this job finishes on their own. - KMeansDriver.run(conf, - Wt.getRowPath(), - initialclusters, - output, - measure, - convergenceDelta, - maxIterations, - true, - 0.0, - false); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/PathDirectory.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/PathDirectory.java deleted file mode 100644 index ae23b0cdc..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/PathDirectory.java +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.topdown; - -import java.io.File; - -import org.apache.hadoop.fs.Path; - -/** - * Contains list of all internal paths used in top down clustering. - */ -public final class PathDirectory { - - public static final String TOP_LEVEL_CLUSTER_DIRECTORY = "topLevelCluster"; - public static final String POST_PROCESS_DIRECTORY = "clusterPostProcessed"; - public static final String CLUSTERED_POINTS_DIRECTORY = "clusteredPoints"; - public static final String BOTTOM_LEVEL_CLUSTER_DIRECTORY = "bottomLevelCluster"; - - private PathDirectory() { - } - - /** - * All output of top level clustering is stored in output directory/topLevelCluster. - * - * @param output - * the output path of clustering. - * @return The top level Cluster Directory. - */ - public static Path getTopLevelClusterPath(Path output) { - return new Path(output + File.separator + TOP_LEVEL_CLUSTER_DIRECTORY); - } - - /** - * The output of top level clusters is post processed and kept in this path. - * - * @param outputPathProvidedByUser - * the output path of clustering. - * @return the path where the output of top level cluster post processor is kept. - */ - public static Path getClusterPostProcessorOutputDirectory(Path outputPathProvidedByUser) { - return new Path(outputPathProvidedByUser + File.separator + POST_PROCESS_DIRECTORY); - } - - /** - * The top level clustered points before post processing is generated here. - * - * @param output - * the output path of clustering. 
- * @return the clustered points directory - */ - public static Path getClusterOutputClusteredPoints(Path output) { - return new Path(output + File.separator + CLUSTERED_POINTS_DIRECTORY + File.separator, "*"); - } - - /** - * Each cluster produced by top level clustering is processed in output/"bottomLevelCluster"/clusterId. - * - * @param output - * @param clusterId - * @return the bottom level clustering path. - */ - public static Path getBottomLevelClusterPath(Path output, String clusterId) { - return new Path(output + File.separator + BOTTOM_LEVEL_CLUSTER_DIRECTORY + File.separator + clusterId); - } - - /** - * Each clusters path name is its clusterId. The vectors reside in separate files inside it. - * - * @param clusterPostProcessorOutput - * the path of cluster post processor output. - * @param clusterId - * the id of the cluster. - * @return the cluster path for cluster id. - */ - public static Path getClusterPathForClusterId(Path clusterPostProcessorOutput, String clusterId) { - return new Path(clusterPostProcessorOutput + File.separator + clusterId); - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReader.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReader.java deleted file mode 100644 index 11c4d881e..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReader.java +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.topdown.postprocessor; - -import java.io.IOException; -import java.util.Iterator; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator; - -/** - * Reads the number of clusters produced by the clustering algorithm. - */ -public final class ClusterCountReader { - - private ClusterCountReader() { - } - - /** - * Reads the number of clusters present by reading the clusters-*-final file. - * - * @param clusterOutputPath - * The output path provided to the clustering algorithm. 
- * @param conf - * The hadoop configuration. - * @return the number of final clusters. - */ - public static int getNumberOfClusters(Path clusterOutputPath, Configuration conf) throws IOException { - FileSystem fileSystem = clusterOutputPath.getFileSystem(conf); - FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter()); - int numberOfClusters = 0; - Iterator it = new SequenceFileDirValueIterator(clusterFiles[0].getPath(), - PathType.LIST, - PathFilters.partFilter(), - null, - true, - conf); - while (it.hasNext()) { - it.next(); - numberOfClusters++; - } - return numberOfClusters; - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessor.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessor.java deleted file mode 100644 index 56e990fdd..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessor.java +++ /dev/null @@ -1,145 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.topdown.postprocessor; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.SequenceFile.Writer; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.clustering.classify.WeightedVectorWritable; -import org.apache.mahout.clustering.topdown.PathDirectory; -import org.apache.mahout.common.IOUtils; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import org.apache.mahout.math.VectorWritable; - -/** - * This class reads the output of any clustering algorithm, and, creates separate directories for different - * clusters. Each cluster directory's name is its clusterId. Each and every point is written in the cluster - * directory associated with that point. - * - * This class incorporates a sequential algorithm and is appropriate for use for data which has been clustered - * sequentially. 
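- *
- * A minimal sequential usage sketch (the paths are illustrative, not part of
- * the original source):
- *   Configuration conf = new Configuration();
- *   ClusterOutputPostProcessor processor = new ClusterOutputPostProcessor(
- *       new Path("kmeans-output"), new Path("kmeans-output/clusterPostProcessed"), conf);
- *   processor.process();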
- * - * The sequential and non sequential version, both are being used from {@link ClusterOutputPostProcessorDriver}. - */ -public final class ClusterOutputPostProcessor { - - private Path clusteredPoints; - private final FileSystem fileSystem; - private final Configuration conf; - private final Path clusterPostProcessorOutput; - private final Map postProcessedClusterDirectories = new HashMap(); - private long uniqueVectorId = 0L; - private final Map writersForClusters; - - public ClusterOutputPostProcessor(Path clusterOutputToBeProcessed, - Path output, - Configuration hadoopConfiguration) throws IOException { - this.clusterPostProcessorOutput = output; - this.clusteredPoints = PathDirectory.getClusterOutputClusteredPoints(clusterOutputToBeProcessed); - this.conf = hadoopConfiguration; - this.writersForClusters = new HashMap(); - fileSystem = clusteredPoints.getFileSystem(conf); - } - - /** - * This method takes the clustered points output by the clustering algorithms as input and writes them into - * their respective clusters. - */ - public void process() throws IOException { - createPostProcessDirectory(); - for (Pair record : - new SequenceFileDirIterable(clusteredPoints, - PathType.GLOB, - PathFilters.partFilter(), - null, - false, - conf)) { - String clusterId = record.getFirst().toString().trim(); - putVectorInRespectiveCluster(clusterId, record.getSecond()); - } - IOUtils.close(writersForClusters.values()); - writersForClusters.clear(); - } - - /** - * Creates the directory to put post processed clusters. - */ - private void createPostProcessDirectory() throws IOException { - if (!fileSystem.exists(clusterPostProcessorOutput)) { - if (!fileSystem.mkdirs(clusterPostProcessorOutput)) { - throw new IOException("Error creating cluster post processor directory"); - } - } - } - - /** - * - * Finds out the cluster directory of the vector and writes it into the specified cluster. - */ - private void putVectorInRespectiveCluster(String clusterId, WeightedVectorWritable point) throws IOException { - Writer writer = findWriterForVector(clusterId); - postProcessedClusterDirectories.put(clusterId, - PathDirectory.getClusterPathForClusterId(clusterPostProcessorOutput, clusterId)); - writeVectorToCluster(writer, point); - } - - /** - * Finds out the path in cluster where the point is supposed to be written. - */ - private Writer findWriterForVector(String clusterId) throws IOException { - Path clusterDirectory = PathDirectory.getClusterPathForClusterId(clusterPostProcessorOutput, clusterId); - Writer writer = writersForClusters.get(clusterId); - if (writer == null) { - Path pathToWrite = new Path(clusterDirectory, new Path("part-m-0")); - writer = new Writer(fileSystem, conf, pathToWrite, LongWritable.class, VectorWritable.class); - writersForClusters.put(clusterId, writer); - } - return writer; - } - - /** - * Writes vector to the cluster directory. - */ - private void writeVectorToCluster(Writer writer, WeightedVectorWritable point) throws IOException { - writer.append(new LongWritable(uniqueVectorId++), new VectorWritable(point.getVector())); - writer.sync(); - } - - /** - * @return the set of all post processed cluster paths. 
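- * The map is keyed by clusterId; entries are added as vectors are routed in - * putVectorInRespectiveCluster().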
- */ - public Map getPostProcessedClusterDirectories() { - return postProcessedClusterDirectories; - } - - public void setClusteredPoints(Path clusteredPoints) { - this.clusteredPoints = clusteredPoints; - } - -} \ No newline at end of file diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorDriver.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorDriver.java deleted file mode 100644 index c4d62c55f..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorDriver.java +++ /dev/null @@ -1,194 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.topdown.postprocessor; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator; -import org.apache.mahout.math.VectorWritable; - -/** - * Post processes the output of clustering algorithms and groups them into respective clusters. Ideal to be - * used for top down clustering. It can also be used if the clustering output needs to be grouped into their - * respective clusters. - */ -public class ClusterOutputPostProcessorDriver extends AbstractJob { - - /** - * CLI to run clustering post processor. The input to post processor is the ouput path specified to the - * clustering. 
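- * A hedged invocation sketch ("clusterpp" is the alias this driver is assumed
- * to have in Mahout's driver registry; -i, -o and -xm mirror the input, output
- * and method options added in run() below):
- *   bin/mahout clusterpp -i kmeans-output -o kmeans-output/clusterPostProcessed -xm sequential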
- */ - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.methodOption().create()); - - if (parseArguments(args) == null) { - return -1; - } - - Path input = getInputPath(); - Path output = getOutputPath(); - - if (getConf() == null) { - setConf(new Configuration()); - } - boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase( - DefaultOptionCreator.SEQUENTIAL_METHOD); - run(input, output, runSequential); - return 0; - - } - - /** - * Constructor to be used by the ToolRunner. - */ - private ClusterOutputPostProcessorDriver() {} - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new ClusterOutputPostProcessorDriver(), args); - } - - /** - * Post processes the output of clustering algorithms and groups them into respective clusters. Each - * cluster's vectors are written into a directory named after its clusterId. - * - * @param input - * The output path provided to the clustering algorithm, whose would be post processed. Hint : The - * path of the directory containing clusters-*-final and clusteredPoints. - * @param output - * The post processed data would be stored at this path. - * @param runSequential - * If set to true, post processes it sequentially, else, uses. MapReduce. Hint : If the clustering - * was done sequentially, make it sequential, else vice versa. - */ - public static void run(Path input, Path output, boolean runSequential) throws IOException, - InterruptedException, - ClassNotFoundException { - if (runSequential) { - postProcessSeq(input, output); - } else { - Configuration conf = new Configuration(); - postProcessMR(conf, input, output); - movePartFilesToRespectiveDirectories(conf, output); - } - - } - - /** - * Process Sequentially. Reads the vectors one by one, and puts them into respective directory, named after - * their clusterId. - * - * @param input - * The output path provided to the clustering algorithm, whose would be post processed. Hint : The - * path of the directory containing clusters-*-final and clusteredPoints. - * @param output - * The post processed data would be stored at this path. - */ - private static void postProcessSeq(Path input, Path output) throws IOException { - ClusterOutputPostProcessor clusterOutputPostProcessor = new ClusterOutputPostProcessor(input, output, - new Configuration()); - clusterOutputPostProcessor.process(); - } - - /** - * Process as a map reduce job. The numberOfReduceTasks is set to the number of clusters present in the - * output. So that each cluster's vector is written in its own part file. - * - * @param conf - * The hadoop configuration. - * @param input - * The output path provided to the clustering algorithm, whose would be post processed. Hint : The - * path of the directory containing clusters-*-final and clusteredPoints. - * @param output - * The post processed data would be stored at this path. 
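- * Note: the job sets numReduceTasks to the cluster count obtained from
- * ClusterCountReader, so each cluster's vectors land in their own part file;
- * movePartFilesToRespectiveDirectories() then moves each part file into a
- * directory named after its clusterId.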
- */ - private static void postProcessMR(Configuration conf, Path input, Path output) throws IOException, - InterruptedException, - ClassNotFoundException { - Job job = new Job(conf, "ClusterOutputPostProcessor Driver running over input: " + input); - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setMapperClass(ClusterOutputPostProcessorMapper.class); - job.setMapOutputKeyClass(Text.class); - job.setMapOutputValueClass(VectorWritable.class); - job.setReducerClass(ClusterOutputPostProcessorReducer.class); - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(VectorWritable.class); - int numberOfClusters = ClusterCountReader.getNumberOfClusters(input, conf); - job.setNumReduceTasks(numberOfClusters); - job.setJarByClass(ClusterOutputPostProcessorDriver.class); - - FileInputFormat.addInputPath(job, new Path(input, new Path("clusteredPoints"))); - FileOutputFormat.setOutputPath(job, output); - if (!job.waitForCompletion(true)) { - throw new InterruptedException("ClusterOutputPostProcessor Job failed processing " + input); - } - } - - /** - * The mapreduce version of the post processor writes different clusters into different part files. This - * method reads the part files and moves them into directories named after their clusterIds. - * - * @param conf - * The hadoop configuration. - * @param output - * The post processed data would be stored at this path. - */ - private static void movePartFilesToRespectiveDirectories(Configuration conf, Path output) throws IOException { - FileSystem fileSystem = output.getFileSystem(conf); - for (FileStatus fileStatus : fileSystem.listStatus(output, PathFilters.partFilter())) { - SequenceFileIterator it = - new SequenceFileIterator(fileStatus.getPath(), true, conf); - if (it.hasNext()) { - renameFile(it.next().getFirst(), fileStatus, conf); - } - it.close(); - } - } - - /** - * Using @FileSystem rename method to move the file. - */ - private static void renameFile(Writable key, FileStatus fileStatus, Configuration conf) throws IOException { - Path path = fileStatus.getPath(); - FileSystem fileSystem = path.getFileSystem(conf); - Path subDir = new Path(key.toString()); - Path renameTo = new Path(path.getParent(), subDir); - fileSystem.mkdirs(renameTo); - fileSystem.rename(path, renameTo); - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorMapper.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorMapper.java deleted file mode 100644 index 96e74c864..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorMapper.java +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.topdown.postprocessor; - -import java.io.IOException; - -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.clustering.classify.WeightedVectorWritable; -import org.apache.mahout.math.VectorWritable; - -/** - * Mapper for post processing cluster output. - */ -public class ClusterOutputPostProcessorMapper extends - Mapper { - - /** - * The key is the cluster id and the value is the vector. - */ - @Override - protected void map(IntWritable key, WeightedVectorWritable vector, Context context) throws IOException, - InterruptedException { - context.write(new Text(key.toString().trim()), new VectorWritable(vector.getVector())); - } -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorReducer.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorReducer.java deleted file mode 100644 index 54936186a..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorReducer.java +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.topdown.postprocessor; - -import java.io.IOException; - -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.math.VectorWritable; - -/** - * Reducer for post processing cluster output. - */ -public class ClusterOutputPostProcessorReducer extends Reducer { - /** - * The key is the cluster id and the values contains the points in that cluster. 
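- * The reduce itself is an identity pass: with one reduce task per cluster (see - * the driver's setNumReduceTasks call), the shuffle has already grouped each - * cluster's points into its own part file.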
- */ - @Override - protected void reduce(Text key, Iterable values, Context context) throws IOException, - InterruptedException { - for (VectorWritable value : values) { - context.write(key, value); - } - } - -} diff --git a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/common/AbstractJob.java b/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/common/AbstractJob.java deleted file mode 100644 index 5baf03328..000000000 --- a/common/mahout-distribution-0.7-hadoop1/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/core/src/main/java/org/apache/mahout/common/AbstractJob.java +++ /dev/null @@ -1,622 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.common; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; -import java.util.concurrent.atomic.AtomicInteger; - -import com.google.common.io.Closeables; -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configured; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.InputFormat; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.OutputFormat; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.util.Tool; -import org.apache.lucene.analysis.Analyzer; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.vectorizer.DefaultAnalyzer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - *

Superclass of many Mahout Hadoop "jobs". A job drives configuration and launch of one or
- * more maps and reduces in order to accomplish some task.
- *
- * Command line arguments available to all subclasses are:
- *
- *   • --tempDir (path): Specifies a directory where the job may place temp files (default "temp")
- *   • --help: Show help message
- *
- * In addition, note some key command line parameters that are parsed by Hadoop, which jobs
- * may need to set:
- *
- *   • -Dmapred.job.name=(name): Sets the Hadoop task names. It will be suffixed by
- *     the mapper and reducer class names
- *   • -Dmapred.output.compress={true,false}: Compress final output (default true)
- *   • -Dmapred.input.dir=(path): input file, or directory containing input files (required)
- *   • -Dmapred.output.dir=(path): path to write output files (required)
- *
- * Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other
- * arguments.
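- *
- * A minimal subclass sketch (hedged: the class name and body are illustrative,
- * but every helper shown is defined by AbstractJob itself):
- *   public class ExampleJob extends AbstractJob {
- *     @Override
- *     public int run(String[] args) throws Exception {
- *       addInputOption();
- *       addOutputOption();
- *       if (parseArguments(args) == null) {
- *         return -1;
- *       }
- *       // inputPath and outputPath are populated by parseArguments()
- *       return 0;
- *     }
- *     public static void main(String[] args) throws Exception {
- *       org.apache.hadoop.util.ToolRunner.run(new ExampleJob(), args);
- *     }
- *   }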

- */ -public abstract class AbstractJob extends Configured implements Tool { - - private static final Logger log = LoggerFactory.getLogger(AbstractJob.class); - - /** option used to specify the input path */ - private Option inputOption; - - /** option used to specify the output path */ - private Option outputOption; - - /** input path, populated by {@link #parseArguments(String[])} */ - protected Path inputPath; - protected File inputFile; //the input represented as a file - - /** output path, populated by {@link #parseArguments(String[]) */ - protected Path outputPath; - protected File outputFile; //the output represented as a file - - /** temp path, populated by {@link #parseArguments(String[]) */ - protected Path tempPath; - - protected Map> argMap; - - /** internal list of options that have been added */ - private final List