
Merge branch 'mesos'

2 parents df9ae8a + 548856a · commit 97e242067b9d75abd88543c759d8fc0aebd9eb8c · @haitaoyao committed Jan 24, 2013
Showing with 14,401 additions and 2,297 deletions.
  1. +2 −0 .gitignore
  2. +11 −0 bagel/pom.xml
  3. +2 −2 bagel/src/test/resources/log4j.properties
  4. +16 −1 core/pom.xml
  5. +32 −9 core/src/main/scala/spark/Accumulators.scala
  6. +0 −118 core/src/main/scala/spark/BoundedMemoryCache.scala
  7. +65 −0 core/src/main/scala/spark/CacheManager.scala
  8. +0 −238 core/src/main/scala/spark/CacheTracker.scala
  9. +0 −18 core/src/main/scala/spark/DaemonThreadFactory.scala
  10. +3 −5 core/src/main/scala/spark/HttpFileServer.scala
  11. +8 −1 core/src/main/scala/spark/HttpServer.scala
  12. +72 −138 core/src/main/scala/spark/KryoSerializer.scala
  13. +1 −2 core/src/main/scala/spark/Logging.scala
  14. +42 −18 core/src/main/scala/spark/MapOutputTracker.scala
  15. +55 −31 core/src/main/scala/spark/PairRDDFunctions.scala
  16. +15 −9 core/src/main/scala/spark/ParallelCollection.scala
  17. +4 −0 core/src/main/scala/spark/Partitioner.scala
  18. +159 −38 core/src/main/scala/spark/RDD.scala
  19. +105 −0 core/src/main/scala/spark/RDDCheckpointData.scala
  20. +7 −1 core/src/main/scala/spark/SequenceFileRDDFunctions.scala
  21. +10 −3 core/src/main/scala/spark/SizeEstimator.scala
  22. +100 −57 core/src/main/scala/spark/SparkContext.scala
  23. +23 −16 core/src/main/scala/spark/SparkEnv.scala
  24. +25 −0 core/src/main/scala/spark/SparkFiles.java
  25. +1 −2 core/src/main/scala/spark/TaskContext.scala
  26. +66 −59 core/src/main/scala/spark/Utils.scala
  27. +10 −0 core/src/main/scala/spark/api/java/JavaPairRDD.scala
  28. +33 −0 core/src/main/scala/spark/api/java/JavaRDDLike.scala
  29. +97 −8 core/src/main/scala/spark/api/java/JavaSparkContext.scala
  30. +11 −0 core/src/main/scala/spark/api/java/StorageLevels.java
  31. +48 −0 core/src/main/scala/spark/api/python/PythonPartitioner.scala
  32. +293 −0 core/src/main/scala/spark/api/python/PythonRDD.scala
  33. +1 −1 core/src/main/scala/spark/broadcast/Broadcast.scala
  34. +25 −1 core/src/main/scala/spark/broadcast/HttpBroadcast.scala
  35. +2 −2 core/src/main/scala/spark/deploy/DeployMessage.scala
  36. +2 −1 core/src/main/scala/spark/deploy/JobDescription.scala
  37. +78 −0 core/src/main/scala/spark/deploy/JsonProtocol.scala
  38. +1 −1 core/src/main/scala/spark/deploy/client/TestClient.scala
  39. +9 −6 core/src/main/scala/spark/deploy/master/Master.scala
  40. +42 −16 core/src/main/scala/spark/deploy/master/MasterWebUI.scala
  41. +5 −1 core/src/main/scala/spark/deploy/master/WorkerInfo.scala
  42. +7 −0 core/src/main/scala/spark/deploy/master/WorkerState.scala
  43. +0 −5 core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala
  44. +2 −2 core/src/main/scala/spark/deploy/worker/Worker.scala
  45. +19 −3 core/src/main/scala/spark/deploy/worker/WorkerArguments.scala
  46. +15 −4 core/src/main/scala/spark/deploy/worker/WorkerWebUI.scala
  47. +18 −16 core/src/main/scala/spark/executor/Executor.scala
  48. +0 −3 core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala
  49. +5 −2 core/src/main/scala/spark/network/Connection.scala
  50. +8 −9 core/src/main/scala/spark/network/ConnectionManager.scala
  51. +16 −8 core/src/main/scala/spark/network/ConnectionManagerTest.scala
  52. +9 −11 core/src/main/scala/spark/rdd/BlockRDD.scala
  53. +36 −11 core/src/main/scala/spark/rdd/CartesianRDD.scala
  54. +128 −0 core/src/main/scala/spark/rdd/CheckpointRDD.scala
  55. +48 −22 core/src/main/scala/spark/rdd/CoGroupedRDD.scala
  56. +35 −12 core/src/main/scala/spark/rdd/CoalescedRDD.scala
  57. +11 −6 core/src/main/scala/spark/rdd/FilteredRDD.scala
  58. +5 −5 core/src/main/scala/spark/rdd/FlatMappedRDD.scala
  59. +7 −7 core/src/main/scala/spark/rdd/GlommedRDD.scala
  60. +8 −7 core/src/main/scala/spark/rdd/HadoopRDD.scala
  61. +8 −6 core/src/main/scala/spark/rdd/MapPartitionsRDD.scala
  62. +8 −6 core/src/main/scala/spark/rdd/MapPartitionsWithSplitRDD.scala
  63. +6 −5 core/src/main/scala/spark/rdd/MappedRDD.scala
  64. +8 −11 core/src/main/scala/spark/rdd/NewHadoopRDD.scala
  65. +9 −9 core/src/main/scala/spark/rdd/PipedRDD.scala
  66. +15 −14 core/src/main/scala/spark/rdd/SampledRDD.scala
  67. +14 −14 core/src/main/scala/spark/rdd/ShuffledRDD.scala
  68. +28 −17 core/src/main/scala/spark/rdd/UnionRDD.scala
  69. +36 −24 core/src/main/scala/spark/rdd/ZippedRDD.scala
  70. +67 −23 core/src/main/scala/spark/scheduler/DAGScheduler.scala
  71. +1 −1 core/src/main/scala/spark/scheduler/MapStatus.scala
  72. +94 −8 core/src/main/scala/spark/scheduler/ResultTask.scala
  73. +15 −9 core/src/main/scala/spark/scheduler/ShuffleMapTask.scala
  74. +18 −13 core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
  75. +2 −1 core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala
  76. +5 −1 core/src/main/scala/spark/scheduler/cluster/TaskSetManager.scala
  77. +25 −19 core/src/main/scala/spark/scheduler/local/LocalScheduler.scala
  78. +6 −10 core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala
  79. +3 −7 core/src/main/scala/spark/scheduler/mesos/MesosSchedulerBackend.scala
  80. +117 −114 core/src/main/scala/spark/storage/BlockManager.scala
  81. +70 −0 core/src/main/scala/spark/storage/BlockManagerId.scala
  82. +109 −618 core/src/main/scala/spark/storage/BlockManagerMaster.scala
  83. +401 −0 core/src/main/scala/spark/storage/BlockManagerMasterActor.scala
  84. +100 −0 core/src/main/scala/spark/storage/BlockManagerMessages.scala
  85. +16 −0 core/src/main/scala/spark/storage/BlockManagerSlaveActor.scala
  86. +1 −1 core/src/main/scala/spark/storage/BlockMessage.scala
  87. +6 −1 core/src/main/scala/spark/storage/BlockStore.scala
  88. +4 −1 core/src/main/scala/spark/storage/DiskStore.scala
  89. +3 −3 core/src/main/scala/spark/storage/MemoryStore.scala
  90. +60 −20 core/src/main/scala/spark/storage/StorageLevel.scala
  91. +9 −4 core/src/main/scala/spark/storage/ThreadingTest.scala
  92. +1 −0 core/src/main/scala/spark/util/AkkaUtils.scala
  93. +14 −0 core/src/main/scala/spark/util/IdGenerator.scala
  94. +44 −0 core/src/main/scala/spark/util/MetadataCleaner.scala
  95. +62 −0 core/src/main/scala/spark/util/RateLimitedOutputStream.scala
  96. +93 −0 core/src/main/scala/spark/util/TimeStampedHashMap.scala
  97. +69 −0 core/src/main/scala/spark/util/TimeStampedHashSet.scala
  98. +1 −0 core/src/main/twirl/spark/deploy/master/worker_row.scala.html
  99. +1 −0 core/src/main/twirl/spark/deploy/master/worker_table.scala.html
  100. +2 −2 core/src/test/resources/log4j.properties
  101. +0 −58 core/src/test/scala/spark/BoundedMemoryCacheSuite.scala
  102. +0 −131 core/src/test/scala/spark/CacheTrackerSuite.scala
  103. +357 −0 core/src/test/scala/spark/CheckpointSuite.scala
  104. +2 −0 core/src/test/scala/spark/ClosureCleanerSuite.scala
  105. +69 −0 core/src/test/scala/spark/DistributedSuite.scala
  106. +31 −0 core/src/test/scala/spark/DriverSuite.scala
  107. +8 −5 core/src/test/scala/spark/FileServerSuite.scala
  108. +98 −0 core/src/test/scala/spark/JavaAPISuite.java
  109. +46 −10 core/src/test/scala/spark/MapOutputTrackerSuite.scala
  110. +26 −0 core/src/test/scala/spark/PartitioningSuite.scala
  111. +51 −9 core/src/test/scala/spark/RDDSuite.scala
  112. +7 −0 core/src/test/scala/spark/ShuffleSuite.scala
  113. +26 −22 core/src/test/scala/spark/SizeEstimatorSuite.scala
  114. +42 −0 core/src/test/scala/spark/scheduler/TaskContextSuite.scala
  115. +123 −45 core/src/test/scala/spark/storage/BlockManagerSuite.scala
  116. +23 −0 core/src/test/scala/spark/util/RateLimitedOutputStreamSuite.scala
  117. +5 −3 docs/README.md
  118. +10 −1 docs/_layouts/global.html
  119. +19 −2 docs/_plugins/copy_api_dirs.rb
  120. +4 −2 docs/api.md
  121. +27 −0 docs/configuration.md
  122. +3 −1 docs/ec2-scripts.md
  123. +12 −5 docs/index.md
  124. +2 −1 docs/java-programming-guide.md
  125. +110 −0 docs/python-programming-guide.md
  126. +49 −1 docs/quick-start.md
  127. +2 −1 docs/scala-programming-guide.md
  128. +21 −22 docs/spark-standalone.md
  129. +313 −0 docs/streaming-programming-guide.md
  130. +16 −14 docs/tuning.md
  131. +28 −0 examples/pom.xml
  132. +1 −1 examples/src/main/scala/spark/examples/LocalLR.scala
  133. +20 −39 examples/src/main/scala/spark/examples/SparkALS.scala
  134. +43 −0 examples/src/main/scala/spark/streaming/examples/FlumeEventCount.scala
  135. +36 −0 examples/src/main/scala/spark/streaming/examples/HdfsWordCount.scala
  136. +50 −0 examples/src/main/scala/spark/streaming/examples/JavaFlumeEventCount.java
  137. +62 −0 examples/src/main/scala/spark/streaming/examples/JavaNetworkWordCount.java
  138. +62 −0 examples/src/main/scala/spark/streaming/examples/JavaQueueStream.java
  139. +69 −0 examples/src/main/scala/spark/streaming/examples/KafkaWordCount.scala
  140. +36 −0 examples/src/main/scala/spark/streaming/examples/NetworkWordCount.scala
  141. +39 −0 examples/src/main/scala/spark/streaming/examples/QueueStream.scala
  142. +46 −0 examples/src/main/scala/spark/streaming/examples/RawNetworkGrep.scala
  143. +85 −0 examples/src/main/scala/spark/streaming/examples/clickstream/PageViewGenerator.scala
  144. +84 −0 examples/src/main/scala/spark/streaming/examples/clickstream/PageViewStream.scala
  145. +60 −0 examples/src/main/scala/spark/streaming/examples/twitter/TwitterBasic.scala
  146. +71 −0 examples/src/main/scala/spark/streaming/examples/twitter/TwitterInputDStream.scala
  147. +43 −2 pom.xml
  148. +24 −8 project/SparkBuild.scala
  149. +39 −0 pyspark
  150. +2 −0 python/.gitignore
  151. +19 −0 python/epydoc.conf
  152. +71 −0 python/examples/als.py
  153. +54 −0 python/examples/kmeans.py
  154. +57 −0 python/examples/logistic_regression.py
  155. +21 −0 python/examples/pi.py
  156. +50 −0 python/examples/transitive_closure.py
  157. +19 −0 python/examples/wordcount.py
  158. +27 −0 python/lib/PY4J_LICENSE.txt
  159. +1 −0 python/lib/PY4J_VERSION.txt
  160. BIN python/lib/py4j0.7.egg
  161. BIN python/lib/py4j0.7.jar
  162. +27 −0 python/pyspark/__init__.py
  163. +187 −0 python/pyspark/accumulators.py
  164. +48 −0 python/pyspark/broadcast.py
  165. +974 −0 python/pyspark/cloudpickle.py
  166. +258 −0 python/pyspark/context.py
  167. +38 −0 python/pyspark/files.py
  168. +38 −0 python/pyspark/java_gateway.py
  169. +92 −0 python/pyspark/join.py
  170. +761 −0 python/pyspark/rdd.py
  171. +83 −0 python/pyspark/serializers.py
  172. +17 −0 python/pyspark/shell.py
  173. +112 −0 python/pyspark/tests.py
  174. +52 −0 python/pyspark/worker.py
  175. +35 −0 python/run-tests
  176. +1 −0 python/test_support/hello.txt
  177. +7 −0 python/test_support/userlibrary.py
  178. +14 −2 repl-bin/pom.xml
  179. +2 −2 repl-bin/src/deb/control/control
  180. +35 −0 repl/pom.xml
  181. +2 −2 repl/src/test/resources/log4j.properties
  182. +18 −9 run
  183. +3 −1 run2.cmd
  184. BIN streaming/lib/org/apache/kafka/kafka/0.7.2-spark/kafka-0.7.2-spark.jar
  185. +1 −0 streaming/lib/org/apache/kafka/kafka/0.7.2-spark/kafka-0.7.2-spark.jar.md5
  186. +1 −0 streaming/lib/org/apache/kafka/kafka/0.7.2-spark/kafka-0.7.2-spark.jar.sha1
  187. +9 −0 streaming/lib/org/apache/kafka/kafka/0.7.2-spark/kafka-0.7.2-spark.pom
  188. +1 −0 streaming/lib/org/apache/kafka/kafka/0.7.2-spark/kafka-0.7.2-spark.pom.md5
  189. +1 −0 streaming/lib/org/apache/kafka/kafka/0.7.2-spark/kafka-0.7.2-spark.pom.sha1
  190. +12 −0 streaming/lib/org/apache/kafka/kafka/maven-metadata-local.xml
  191. +1 −0 streaming/lib/org/apache/kafka/kafka/maven-metadata-local.xml.md5
  192. +1 −0 streaming/lib/org/apache/kafka/kafka/maven-metadata-local.xml.sha1
  193. +155 −0 streaming/pom.xml
  194. +118 −0 streaming/src/main/scala/spark/streaming/Checkpoint.scala
  195. +657 −0 streaming/src/main/scala/spark/streaming/DStream.scala
  196. +134 −0 streaming/src/main/scala/spark/streaming/DStreamGraph.scala
  197. +62 −0 streaming/src/main/scala/spark/streaming/Duration.scala
  198. +41 −0 streaming/src/main/scala/spark/streaming/Interval.scala
  199. +24 −0 streaming/src/main/scala/spark/streaming/Job.scala
  200. +33 −0 streaming/src/main/scala/spark/streaming/JobManager.scala
  201. +151 −0 streaming/src/main/scala/spark/streaming/NetworkInputTracker.scala
  202. +562 −0 streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala
  203. +77 −0 streaming/src/main/scala/spark/streaming/Scheduler.scala
  204. +411 −0 streaming/src/main/scala/spark/streaming/StreamingContext.scala
  205. +42 −0 streaming/src/main/scala/spark/streaming/Time.scala
  206. +91 −0 streaming/src/main/scala/spark/streaming/api/java/JavaDStream.scala
  207. +183 −0 streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala
  208. +638 −0 streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala
  209. +346 −0 streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala
  210. +40 −0 streaming/src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala
  211. +19 −0 streaming/src/main/scala/spark/streaming/dstream/ConstantInputDStream.scala
  212. +102 −0 streaming/src/main/scala/spark/streaming/dstream/FileInputDStream.scala
  213. +21 −0 streaming/src/main/scala/spark/streaming/dstream/FilteredDStream.scala
  214. +20 −0 streaming/src/main/scala/spark/streaming/dstream/FlatMapValuedDStream.scala
  215. +20 −0 streaming/src/main/scala/spark/streaming/dstream/FlatMappedDStream.scala
  216. +137 −0 streaming/src/main/scala/spark/streaming/dstream/FlumeInputDStream.scala
  217. +28 −0 streaming/src/main/scala/spark/streaming/dstream/ForEachDStream.scala
  218. +17 −0 streaming/src/main/scala/spark/streaming/dstream/GlommedDStream.scala
  219. +19 −0 streaming/src/main/scala/spark/streaming/dstream/InputDStream.scala
  220. +200 −0 streaming/src/main/scala/spark/streaming/dstream/KafkaInputDStream.scala
  221. +21 −0 streaming/src/main/scala/spark/streaming/dstream/MapPartitionedDStream.scala
  222. +21 −0 streaming/src/main/scala/spark/streaming/dstream/MapValuedDStream.scala
  223. +20 −0 streaming/src/main/scala/spark/streaming/dstream/MappedDStream.scala
  224. +254 −0 streaming/src/main/scala/spark/streaming/dstream/NetworkInputDStream.scala
  225. +41 −0 streaming/src/main/scala/spark/streaming/dstream/QueueInputDStream.scala
  226. +91 −0 streaming/src/main/scala/spark/streaming/dstream/RawInputDStream.scala
  227. +149 −0 streaming/src/main/scala/spark/streaming/dstream/ReducedWindowedDStream.scala
  228. +27 −0 streaming/src/main/scala/spark/streaming/dstream/ShuffledDStream.scala
  229. +103 −0 streaming/src/main/scala/spark/streaming/dstream/SocketInputDStream.scala
  230. +84 −0 streaming/src/main/scala/spark/streaming/dstream/StateDStream.scala
  231. +19 −0 streaming/src/main/scala/spark/streaming/dstream/TransformedDStream.scala
  232. +40 −0 streaming/src/main/scala/spark/streaming/dstream/UnionDStream.scala
  233. +40 −0 streaming/src/main/scala/spark/streaming/dstream/WindowedDStream.scala
  234. +84 −0 streaming/src/main/scala/spark/streaming/util/Clock.scala
  235. +98 −0 streaming/src/main/scala/spark/streaming/util/RawTextHelper.scala
Note: the diff was too large to display in full; the per-file hunks below cover only part of the change set.
2 .gitignore
@@ -12,6 +12,7 @@ third_party/libmesos.so
third_party/libmesos.dylib
conf/java-opts
conf/spark-env.sh
+conf/streaming-env.sh
conf/log4j.properties
docs/_site
docs/api
@@ -31,6 +32,7 @@ project/plugins/src_managed/
logs/
log/
spark-tests.log
+streaming-tests.log
dependency-reduced-pom.xml
.ensime
.ensime_lucene
11 bagel/pom.xml
@@ -45,6 +45,11 @@
<profiles>
<profile>
<id>hadoop1</id>
+ <activation>
+ <property>
+ <name>!hadoopVersion</name>
+ </property>
+ </activation>
<dependencies>
<dependency>
<groupId>org.spark-project</groupId>
@@ -72,6 +77,12 @@
</profile>
<profile>
<id>hadoop2</id>
+ <activation>
+ <property>
+ <name>hadoopVersion</name>
+ <value>2</value>
+ </property>
+ </activation>
<dependencies>
<dependency>
<groupId>org.spark-project</groupId>
4 bagel/src/test/resources/log4j.properties
@@ -1,8 +1,8 @@
-# Set everything to be logged to the console
+# Set everything to be logged to the file bagel/target/unit-tests.log
log4j.rootCategory=INFO, file
log4j.appender.file=org.apache.log4j.FileAppender
log4j.appender.file.append=false
-log4j.appender.file.file=spark-tests.log
+log4j.appender.file.file=bagel/target/unit-tests.log
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n
17 core/pom.xml
@@ -72,6 +72,10 @@
<artifactId>spray-server</artifactId>
</dependency>
<dependency>
+ <groupId>cc.spray</groupId>
+ <artifactId>spray-json_${scala.version}</artifactId>
+ </dependency>
+ <dependency>
<groupId>org.tomdz.twirl</groupId>
<artifactId>twirl-api</artifactId>
</dependency>
@@ -159,6 +163,11 @@
<profiles>
<profile>
<id>hadoop1</id>
+ <activation>
+ <property>
+ <name>!hadoopVersion</name>
+ </property>
+ </activation>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
@@ -211,6 +220,12 @@
</profile>
<profile>
<id>hadoop2</id>
+ <activation>
+ <property>
+ <name>hadoopVersion</name>
+ <value>2</value>
+ </property>
+ </activation>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
@@ -267,4 +282,4 @@
</build>
</profile>
</profiles>
-</project>
+</project>
41 core/src/main/scala/spark/Accumulators.scala
@@ -25,8 +25,7 @@ class Accumulable[R, T] (
extends Serializable {
val id = Accumulators.newId
- @transient
- private var value_ = initialValue // Current value on master
+ @transient private var value_ = initialValue // Current value on master
val zero = param.zero(initialValue) // Zero value to be passed to workers
var deserialized = false
@@ -39,19 +38,36 @@ class Accumulable[R, T] (
def += (term: T) { value_ = param.addAccumulator(value_, term) }
/**
+ * Add more data to this accumulator / accumulable
+ * @param term the data to add
+ */
+ def add(term: T) { value_ = param.addAccumulator(value_, term) }
+
+ /**
* Merge two accumulable objects together
- *
+ *
* Normally, a user will not want to use this version, but will instead call `+=`.
- * @param term the other Accumulable that will get merged with this
+ * @param term the other `R` that will get merged with this
*/
def ++= (term: R) { value_ = param.addInPlace(value_, term)}
/**
+ * Merge two accumulable objects together
+ *
+ * Normally, a user will not want to use this version, but will instead call `add`.
+ * @param term the other `R` that will get merged with this
+ */
+ def merge(term: R) { value_ = param.addInPlace(value_, term)}
+
+ /**
* Access the accumulator's current value; only allowed on master.
*/
- def value = {
- if (!deserialized) value_
- else throw new UnsupportedOperationException("Can't read accumulator value in task")
+ def value: R = {
+ if (!deserialized) {
+ value_
+ } else {
+ throw new UnsupportedOperationException("Can't read accumulator value in task")
+ }
}
/**
@@ -68,10 +84,17 @@ class Accumulable[R, T] (
/**
* Set the accumulator's value; only allowed on master.
*/
- def value_= (r: R) {
- if (!deserialized) value_ = r
+ def value_= (newValue: R) {
+ if (!deserialized) value_ = newValue
else throw new UnsupportedOperationException("Can't assign accumulator value in task")
}
+
+ /**
+ * Set the accumulator's value; only allowed on master
+ */
+ def setValue(newValue: R) {
+ this.value = newValue
+ }
// Called by Java when deserializing an object
private def readObject(in: ObjectInputStream) {
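The hunk above adds Java-friendly `add`/`merge`/`setValue` aliases alongside the existing `+=`/`++=` operators, and types the `value` getter. A minimal usage sketch under assumptions (app name and local master are hypothetical; `sc.accumulator` and its implicit `AccumulatorParam` as in Spark releases of this era):

```scala
import spark.SparkContext
import spark.SparkContext._   // brings the implicit IntAccumulatorParam into scope

val sc = new SparkContext("local", "AccumulatorExample")   // hypothetical app
val sum = sc.accumulator(0)                                 // Accumulator[Int]

sc.parallelize(1 to 100).foreach(x => sum += x)             // tasks may only add
// sum.add(x) is the new alias that is also callable from Java code

println(sum.value)  // reading .value is only allowed on the master/driver
```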
118 core/src/main/scala/spark/BoundedMemoryCache.scala
@@ -1,118 +0,0 @@
-package spark
-
-import java.util.LinkedHashMap
-
-/**
- * An implementation of Cache that estimates the sizes of its entries and attempts to limit its
- * total memory usage to a fraction of the JVM heap. Objects' sizes are estimated using
- * SizeEstimator, which has limitations; most notably, we will overestimate total memory used if
- * some cache entries have pointers to a shared object. Nonetheless, this Cache should work well
- * when most of the space is used by arrays of primitives or of simple classes.
- */
-private[spark] class BoundedMemoryCache(maxBytes: Long) extends Cache with Logging {
- logInfo("BoundedMemoryCache.maxBytes = " + maxBytes)
-
- def this() {
- this(BoundedMemoryCache.getMaxBytes)
- }
-
- private var currentBytes = 0L
- private val map = new LinkedHashMap[(Any, Int), Entry](32, 0.75f, true)
-
- override def get(datasetId: Any, partition: Int): Any = {
- synchronized {
- val entry = map.get((datasetId, partition))
- if (entry != null) {
- entry.value
- } else {
- null
- }
- }
- }
-
- override def put(datasetId: Any, partition: Int, value: Any): CachePutResponse = {
- val key = (datasetId, partition)
- logInfo("Asked to add key " + key)
- val size = estimateValueSize(key, value)
- synchronized {
- if (size > getCapacity) {
- return CachePutFailure()
- } else if (ensureFreeSpace(datasetId, size)) {
- logInfo("Adding key " + key)
- map.put(key, new Entry(value, size))
- currentBytes += size
- logInfo("Number of entries is now " + map.size)
- return CachePutSuccess(size)
- } else {
- logInfo("Didn't add key " + key + " because we would have evicted part of same dataset")
- return CachePutFailure()
- }
- }
- }
-
- override def getCapacity: Long = maxBytes
-
- /**
- * Estimate sizeOf 'value'
- */
- private def estimateValueSize(key: (Any, Int), value: Any) = {
- val startTime = System.currentTimeMillis
- val size = SizeEstimator.estimate(value.asInstanceOf[AnyRef])
- val timeTaken = System.currentTimeMillis - startTime
- logInfo("Estimated size for key %s is %d".format(key, size))
- logInfo("Size estimation for key %s took %d ms".format(key, timeTaken))
- size
- }
-
- /**
- * Remove least recently used entries from the map until at least space bytes are free, in order
- * to make space for a partition from the given dataset ID. If this cannot be done without
- * evicting other data from the same dataset, returns false; otherwise, returns true. Assumes
- * that a lock is held on the BoundedMemoryCache.
- */
- private def ensureFreeSpace(datasetId: Any, space: Long): Boolean = {
- logInfo("ensureFreeSpace(%s, %d) called with curBytes=%d, maxBytes=%d".format(
- datasetId, space, currentBytes, maxBytes))
- val iter = map.entrySet.iterator // Will give entries in LRU order
- while (maxBytes - currentBytes < space && iter.hasNext) {
- val mapEntry = iter.next()
- val (entryDatasetId, entryPartition) = mapEntry.getKey
- if (entryDatasetId == datasetId) {
- // Cannot make space without removing part of the same dataset, or a more recently used one
- return false
- }
- reportEntryDropped(entryDatasetId, entryPartition, mapEntry.getValue)
- currentBytes -= mapEntry.getValue.size
- iter.remove()
- }
- return true
- }
-
- protected def reportEntryDropped(datasetId: Any, partition: Int, entry: Entry) {
- logInfo("Dropping key (%s, %d) of size %d to make space".format(datasetId, partition, entry.size))
- // TODO: remove BoundedMemoryCache
-
- val (keySpaceId, innerDatasetId) = datasetId.asInstanceOf[(Any, Any)]
- innerDatasetId match {
- case rddId: Int =>
- SparkEnv.get.cacheTracker.dropEntry(rddId, partition)
- case broadcastUUID: java.util.UUID =>
- // TODO: Maybe something should be done if the broadcasted variable falls out of cache
- case _ =>
- }
- }
-}
-
-// An entry in our map; stores a cached object and its size in bytes
-private[spark] case class Entry(value: Any, size: Long)
-
-private[spark] object BoundedMemoryCache {
- /**
- * Get maximum cache capacity from system configuration
- */
- def getMaxBytes: Long = {
- val memoryFractionToUse = System.getProperty("spark.boundedMemoryCache.memoryFraction", "0.66").toDouble
- (Runtime.getRuntime.maxMemory * memoryFractionToUse).toLong
- }
-}
-
65 core/src/main/scala/spark/CacheManager.scala
@@ -0,0 +1,65 @@
+package spark
+
+import scala.collection.mutable.{ArrayBuffer, HashSet}
+import spark.storage.{BlockManager, StorageLevel}
+
+
+/** Spark class responsible for passing RDDs split contents to the BlockManager and making
+ sure a node doesn't load two copies of an RDD at once.
+ */
+private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
+ private val loading = new HashSet[String]
+
+ /** Gets or computes an RDD split. Used by RDD.iterator() when a RDD is cached. */
+ def getOrCompute[T](rdd: RDD[T], split: Split, context: TaskContext, storageLevel: StorageLevel)
+ : Iterator[T] = {
+ val key = "rdd_%d_%d".format(rdd.id, split.index)
+ logInfo("Cache key is " + key)
+ blockManager.get(key) match {
+ case Some(cachedValues) =>
+ // Split is in cache, so just return its values
+ logInfo("Found partition in cache!")
+ return cachedValues.asInstanceOf[Iterator[T]]
+
+ case None =>
+ // Mark the split as loading (unless someone else marks it first)
+ loading.synchronized {
+ if (loading.contains(key)) {
+ logInfo("Loading contains " + key + ", waiting...")
+ while (loading.contains(key)) {
+ try {loading.wait()} catch {case _ =>}
+ }
+ logInfo("Loading no longer contains " + key + ", so returning cached result")
+ // See whether someone else has successfully loaded it. The main way this would fail
+ // is for the RDD-level cache eviction policy if someone else has loaded the same RDD
+ // partition but we didn't want to make space for it. However, that case is unlikely
+ // because it's unlikely that two threads would work on the same RDD partition. One
+ // downside of the current code is that threads wait serially if this does happen.
+ blockManager.get(key) match {
+ case Some(values) =>
+ return values.asInstanceOf[Iterator[T]]
+ case None =>
+ logInfo("Whoever was loading " + key + " failed; we'll try it ourselves")
+ loading.add(key)
+ }
+ } else {
+ loading.add(key)
+ }
+ }
+ try {
+ // If we got here, we have to load the split
+ val elements = new ArrayBuffer[Any]
+ logInfo("Computing partition " + split)
+ elements ++= rdd.compute(split, context)
+ // Try to put this block in the blockManager
+ blockManager.put(key, elements, storageLevel, true)
+ return elements.iterator.asInstanceOf[Iterator[T]]
+ } finally {
+ loading.synchronized {
+ loading.remove(key)
+ loading.notifyAll()
+ }
+ }
+ }
+ }
+}
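The new `CacheManager` takes over the get-or-compute path that previously lived in `CacheTracker` (removed below), storing each split under the block key `rdd_<rddId>_<splitIndex>` and guarding recomputation with a `loading` set so two threads on one node never compute the same split at once. A rough user-facing sketch of what drives it (app name is hypothetical; the storage-level name is assumed from contemporary releases):

```scala
import spark.SparkContext
import spark.storage.StorageLevel

val sc = new SparkContext("local", "CacheExample")   // hypothetical app
val doubled = sc.parallelize(1 to 1000).map(_ * 2)

doubled.persist(StorageLevel.MEMORY_ONLY)  // iterator() now routes through CacheManager.getOrCompute
doubled.count()  // first action: each split is computed, buffered, and put into the BlockManager
doubled.count()  // second action: splits are served from blocks named "rdd_<id>_<split>"
```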
238 core/src/main/scala/spark/CacheTracker.scala
@@ -1,238 +0,0 @@
-package spark
-
-import scala.collection.mutable.ArrayBuffer
-import scala.collection.mutable.HashMap
-import scala.collection.mutable.HashSet
-
-import akka.actor._
-import akka.dispatch._
-import akka.pattern.ask
-import akka.remote._
-import akka.util.Duration
-import akka.util.Timeout
-import akka.util.duration._
-
-import spark.storage.BlockManager
-import spark.storage.StorageLevel
-
-private[spark] sealed trait CacheTrackerMessage
-
-private[spark] case class AddedToCache(rddId: Int, partition: Int, host: String, size: Long = 0L)
- extends CacheTrackerMessage
-private[spark] case class DroppedFromCache(rddId: Int, partition: Int, host: String, size: Long = 0L)
- extends CacheTrackerMessage
-private[spark] case class MemoryCacheLost(host: String) extends CacheTrackerMessage
-private[spark] case class RegisterRDD(rddId: Int, numPartitions: Int) extends CacheTrackerMessage
-private[spark] case class SlaveCacheStarted(host: String, size: Long) extends CacheTrackerMessage
-private[spark] case object GetCacheStatus extends CacheTrackerMessage
-private[spark] case object GetCacheLocations extends CacheTrackerMessage
-private[spark] case object StopCacheTracker extends CacheTrackerMessage
-
-private[spark] class CacheTrackerActor extends Actor with Logging {
- // TODO: Should probably store (String, CacheType) tuples
- private val locs = new HashMap[Int, Array[List[String]]]
-
- /**
- * A map from the slave's host name to its cache size.
- */
- private val slaveCapacity = new HashMap[String, Long]
- private val slaveUsage = new HashMap[String, Long]
-
- private def getCacheUsage(host: String): Long = slaveUsage.getOrElse(host, 0L)
- private def getCacheCapacity(host: String): Long = slaveCapacity.getOrElse(host, 0L)
- private def getCacheAvailable(host: String): Long = getCacheCapacity(host) - getCacheUsage(host)
-
- def receive = {
- case SlaveCacheStarted(host: String, size: Long) =>
- slaveCapacity.put(host, size)
- slaveUsage.put(host, 0)
- sender ! true
-
- case RegisterRDD(rddId: Int, numPartitions: Int) =>
- logInfo("Registering RDD " + rddId + " with " + numPartitions + " partitions")
- locs(rddId) = Array.fill[List[String]](numPartitions)(Nil)
- sender ! true
-
- case AddedToCache(rddId, partition, host, size) =>
- slaveUsage.put(host, getCacheUsage(host) + size)
- locs(rddId)(partition) = host :: locs(rddId)(partition)
- sender ! true
-
- case DroppedFromCache(rddId, partition, host, size) =>
- slaveUsage.put(host, getCacheUsage(host) - size)
- // Do a sanity check to make sure usage is greater than 0.
- locs(rddId)(partition) = locs(rddId)(partition).filterNot(_ == host)
- sender ! true
-
- case MemoryCacheLost(host) =>
- logInfo("Memory cache lost on " + host)
- for ((id, locations) <- locs) {
- for (i <- 0 until locations.length) {
- locations(i) = locations(i).filterNot(_ == host)
- }
- }
- sender ! true
-
- case GetCacheLocations =>
- logInfo("Asked for current cache locations")
- sender ! locs.map{case (rrdId, array) => (rrdId -> array.clone())}
-
- case GetCacheStatus =>
- val status = slaveCapacity.map { case (host, capacity) =>
- (host, capacity, getCacheUsage(host))
- }.toSeq
- sender ! status
-
- case StopCacheTracker =>
- logInfo("Stopping CacheTrackerActor")
- sender ! true
- context.stop(self)
- }
-}
-
-private[spark] class CacheTracker(actorSystem: ActorSystem, isMaster: Boolean, blockManager: BlockManager)
- extends Logging {
-
- // Tracker actor on the master, or remote reference to it on workers
- val ip: String = System.getProperty("spark.master.host", "localhost")
- val port: Int = System.getProperty("spark.master.port", "7077").toInt
- val actorName: String = "CacheTracker"
-
- val timeout = 10.seconds
-
- var trackerActor: ActorRef = if (isMaster) {
- val actor = actorSystem.actorOf(Props[CacheTrackerActor], name = actorName)
- logInfo("Registered CacheTrackerActor actor")
- actor
- } else {
- val url = "akka://spark@%s:%s/user/%s".format(ip, port, actorName)
- actorSystem.actorFor(url)
- }
-
- val registeredRddIds = new HashSet[Int]
-
- // Remembers which splits are currently being loaded (on worker nodes)
- val loading = new HashSet[String]
-
- // Send a message to the trackerActor and get its result within a default timeout, or
- // throw a SparkException if this fails.
- def askTracker(message: Any): Any = {
- try {
- val future = trackerActor.ask(message)(timeout)
- return Await.result(future, timeout)
- } catch {
- case e: Exception =>
- throw new SparkException("Error communicating with CacheTracker", e)
- }
- }
-
- // Send a one-way message to the trackerActor, to which we expect it to reply with true.
- def communicate(message: Any) {
- if (askTracker(message) != true) {
- throw new SparkException("Error reply received from CacheTracker")
- }
- }
-
- // Registers an RDD (on master only)
- def registerRDD(rddId: Int, numPartitions: Int) {
- registeredRddIds.synchronized {
- if (!registeredRddIds.contains(rddId)) {
- logInfo("Registering RDD ID " + rddId + " with cache")
- registeredRddIds += rddId
- communicate(RegisterRDD(rddId, numPartitions))
- }
- }
- }
-
- // For BlockManager.scala only
- def cacheLost(host: String) {
- communicate(MemoryCacheLost(host))
- logInfo("CacheTracker successfully removed entries on " + host)
- }
-
- // Get the usage status of slave caches. Each tuple in the returned sequence
- // is in the form of (host name, capacity, usage).
- def getCacheStatus(): Seq[(String, Long, Long)] = {
- askTracker(GetCacheStatus).asInstanceOf[Seq[(String, Long, Long)]]
- }
-
- // For BlockManager.scala only
- def notifyFromBlockManager(t: AddedToCache) {
- communicate(t)
- }
-
- // Get a snapshot of the currently known locations
- def getLocationsSnapshot(): HashMap[Int, Array[List[String]]] = {
- askTracker(GetCacheLocations).asInstanceOf[HashMap[Int, Array[List[String]]]]
- }
-
- // Gets or computes an RDD split
- def getOrCompute[T](rdd: RDD[T], split: Split, context: TaskContext, storageLevel: StorageLevel)
- : Iterator[T] = {
- val key = "rdd_%d_%d".format(rdd.id, split.index)
- logInfo("Cache key is " + key)
- blockManager.get(key) match {
- case Some(cachedValues) =>
- // Split is in cache, so just return its values
- logInfo("Found partition in cache!")
- return cachedValues.asInstanceOf[Iterator[T]]
-
- case None =>
- // Mark the split as loading (unless someone else marks it first)
- loading.synchronized {
- if (loading.contains(key)) {
- logInfo("Loading contains " + key + ", waiting...")
- while (loading.contains(key)) {
- try {loading.wait()} catch {case _ =>}
- }
- logInfo("Loading no longer contains " + key + ", so returning cached result")
- // See whether someone else has successfully loaded it. The main way this would fail
- // is for the RDD-level cache eviction policy if someone else has loaded the same RDD
- // partition but we didn't want to make space for it. However, that case is unlikely
- // because it's unlikely that two threads would work on the same RDD partition. One
- // downside of the current code is that threads wait serially if this does happen.
- blockManager.get(key) match {
- case Some(values) =>
- return values.asInstanceOf[Iterator[T]]
- case None =>
- logInfo("Whoever was loading " + key + " failed; we'll try it ourselves")
- loading.add(key)
- }
- } else {
- loading.add(key)
- }
- }
- // If we got here, we have to load the split
- // Tell the master that we're doing so
- //val host = System.getProperty("spark.hostname", Utils.localHostName)
- //val future = trackerActor !! AddedToCache(rdd.id, split.index, host)
- // TODO: fetch any remote copy of the split that may be available
- // TODO: also register a listener for when it unloads
- logInfo("Computing partition " + split)
- val elements = new ArrayBuffer[Any]
- elements ++= rdd.compute(split, context)
- try {
- // Try to put this block in the blockManager
- blockManager.put(key, elements, storageLevel, true)
- //future.apply() // Wait for the reply from the cache tracker
- } finally {
- loading.synchronized {
- loading.remove(key)
- loading.notifyAll()
- }
- }
- return elements.iterator.asInstanceOf[Iterator[T]]
- }
- }
-
- // Called by the Cache to report that an entry has been dropped from it
- def dropEntry(rddId: Int, partition: Int) {
- communicate(DroppedFromCache(rddId, partition, Utils.localHostName()))
- }
-
- def stop() {
- communicate(StopCacheTracker)
- registeredRddIds.clear()
- trackerActor = null
- }
-}
18 core/src/main/scala/spark/DaemonThreadFactory.scala
@@ -1,18 +0,0 @@
-package spark
-
-import java.util.concurrent.ThreadFactory
-
-/**
- * A ThreadFactory that creates daemon threads
- */
-private object DaemonThreadFactory extends ThreadFactory {
- override def newThread(r: Runnable): Thread = new DaemonThread(r)
-}
-
-private class DaemonThread(r: Runnable = null) extends Thread {
- override def run() {
- if (r != null) {
- r.run()
- }
- }
-}
8 core/src/main/scala/spark/HttpFileServer.scala
@@ -1,9 +1,7 @@
package spark
-import java.io.{File, PrintWriter}
-import java.net.URL
-import scala.collection.mutable.HashMap
-import org.apache.hadoop.fs.FileUtil
+import java.io.{File}
+import com.google.common.io.Files
private[spark] class HttpFileServer extends Logging {
@@ -40,7 +38,7 @@ private[spark] class HttpFileServer extends Logging {
}
def addFileToDir(file: File, dir: File) : String = {
- Utils.copyFile(file, new File(dir, file.getName))
+ Files.copy(file, new File(dir, file.getName))
return dir + "/" + file.getName
}
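The file server now copies files with Guava instead of the removed `Utils.copyFile` helper. For reference, the Guava call being introduced (paths are hypothetical):

```scala
import java.io.File
import com.google.common.io.Files

val src = new File("/tmp/source.txt")          // hypothetical source file
val dst = new File("/tmp/served/source.txt")   // destination inside the server's base directory
Files.copy(src, dst)                            // copies the file contents, overwriting dst if present
```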
9 core/src/main/scala/spark/HttpServer.scala
@@ -4,6 +4,7 @@ import java.io.File
import java.net.InetAddress
import org.eclipse.jetty.server.Server
+import org.eclipse.jetty.server.bio.SocketConnector
import org.eclipse.jetty.server.handler.DefaultHandler
import org.eclipse.jetty.server.handler.HandlerList
import org.eclipse.jetty.server.handler.ResourceHandler
@@ -27,7 +28,13 @@ private[spark] class HttpServer(resourceBase: File) extends Logging {
if (server != null) {
throw new ServerStateException("Server is already started")
} else {
- server = new Server(0)
+ server = new Server()
+ val connector = new SocketConnector
+ connector.setMaxIdleTime(60*1000)
+ connector.setSoLingerTime(-1)
+ connector.setPort(0)
+ server.addConnector(connector)
+
val threadPool = new QueuedThreadPool
threadPool.setDaemon(true)
server.setThreadPool(threadPool)
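Instead of `new Server(0)`, the server now builds an explicit `SocketConnector` so it can tune idle and linger settings while still binding an ephemeral port. A minimal sketch of the same pattern, assuming the Jetty 7-era API used in the hunk (including `getLocalPort` to recover the port the OS chose):

```scala
import org.eclipse.jetty.server.Server
import org.eclipse.jetty.server.bio.SocketConnector
import org.eclipse.jetty.util.thread.QueuedThreadPool

val server = new Server()
val connector = new SocketConnector
connector.setMaxIdleTime(60 * 1000)   // drop idle connections after a minute
connector.setSoLingerTime(-1)         // disable SO_LINGER
connector.setPort(0)                  // 0 = let the OS pick a free port
server.addConnector(connector)

val pool = new QueuedThreadPool
pool.setDaemon(true)                  // don't keep the JVM alive just for this server
server.setThreadPool(pool)

server.start()
val boundPort = connector.getLocalPort  // the ephemeral port actually bound
```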
210 core/src/main/scala/spark/KryoSerializer.scala
@@ -9,153 +9,80 @@ import scala.collection.mutable
import com.esotericsoftware.kryo._
import com.esotericsoftware.kryo.{Serializer => KSerializer}
-import com.esotericsoftware.kryo.serialize.ClassSerializer
-import com.esotericsoftware.kryo.serialize.SerializableSerializer
+import com.esotericsoftware.kryo.io.{Input => KryoInput, Output => KryoOutput}
+import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer}
import de.javakaffee.kryoserializers.KryoReflectionFactorySupport
import serializer.{SerializerInstance, DeserializationStream, SerializationStream}
import spark.broadcast._
import spark.storage._
-/**
- * Zig-zag encoder used to write object sizes to serialization streams.
- * Based on Kryo's integer encoder.
- */
-private[spark] object ZigZag {
- def writeInt(n: Int, out: OutputStream) {
- var value = n
- if ((value & ~0x7F) == 0) {
- out.write(value)
- return
- }
- out.write(((value & 0x7F) | 0x80))
- value >>>= 7
- if ((value & ~0x7F) == 0) {
- out.write(value)
- return
- }
- out.write(((value & 0x7F) | 0x80))
- value >>>= 7
- if ((value & ~0x7F) == 0) {
- out.write(value)
- return
- }
- out.write(((value & 0x7F) | 0x80))
- value >>>= 7
- if ((value & ~0x7F) == 0) {
- out.write(value)
- return
- }
- out.write(((value & 0x7F) | 0x80))
- value >>>= 7
- out.write(value)
- }
+private[spark]
+class KryoSerializationStream(kryo: Kryo, outStream: OutputStream) extends SerializationStream {
- def readInt(in: InputStream): Int = {
- var offset = 0
- var result = 0
- while (offset < 32) {
- val b = in.read()
- if (b == -1) {
- throw new EOFException("End of stream")
- }
- result |= ((b & 0x7F) << offset)
- if ((b & 0x80) == 0) {
- return result
- }
- offset += 7
- }
- throw new SparkException("Malformed zigzag-encoded integer")
- }
-}
-
-private[spark]
-class KryoSerializationStream(kryo: Kryo, threadBuffer: ByteBuffer, out: OutputStream)
-extends SerializationStream {
- val channel = Channels.newChannel(out)
+ val output = new KryoOutput(outStream)
def writeObject[T](t: T): SerializationStream = {
- kryo.writeClassAndObject(threadBuffer, t)
- ZigZag.writeInt(threadBuffer.position(), out)
- threadBuffer.flip()
- channel.write(threadBuffer)
- threadBuffer.clear()
+ kryo.writeClassAndObject(output, t)
this
}
- def flush() { out.flush() }
- def close() { out.close() }
+ def flush() { output.flush() }
+ def close() { output.close() }
}
-private[spark]
-class KryoDeserializationStream(objectBuffer: ObjectBuffer, in: InputStream)
-extends DeserializationStream {
+private[spark]
+class KryoDeserializationStream(kryo: Kryo, inStream: InputStream) extends DeserializationStream {
+
+ val input = new KryoInput(inStream)
+
def readObject[T](): T = {
- val len = ZigZag.readInt(in)
- objectBuffer.readClassAndObject(in, len).asInstanceOf[T]
+ try {
+ kryo.readClassAndObject(input).asInstanceOf[T]
+ } catch {
+ // DeserializationStream uses the EOF exception to indicate stopping condition.
+ case e: com.esotericsoftware.kryo.KryoException => throw new java.io.EOFException
+ }
}
- def close() { in.close() }
+ def close() {
+ // Kryo's Input automatically closes the input stream it is using.
+ input.close()
+ }
}
private[spark] class KryoSerializerInstance(ks: KryoSerializer) extends SerializerInstance {
- val kryo = ks.kryo
- val threadBuffer = ks.threadBuffer.get()
- val objectBuffer = ks.objectBuffer.get()
+
+ val kryo = ks.kryo.get()
+ val output = ks.output.get()
+ val input = ks.input.get()
def serialize[T](t: T): ByteBuffer = {
- // Write it to our thread-local scratch buffer first to figure out the size, then return a new
- // ByteBuffer of the appropriate size
- threadBuffer.clear()
- kryo.writeClassAndObject(threadBuffer, t)
- val newBuf = ByteBuffer.allocate(threadBuffer.position)
- threadBuffer.flip()
- newBuf.put(threadBuffer)
- newBuf.flip()
- newBuf
+ output.clear()
+ kryo.writeClassAndObject(output, t)
+ ByteBuffer.wrap(output.toBytes)
}
def deserialize[T](bytes: ByteBuffer): T = {
- kryo.readClassAndObject(bytes).asInstanceOf[T]
+ input.setBuffer(bytes.array)
+ kryo.readClassAndObject(input).asInstanceOf[T]
}
def deserialize[T](bytes: ByteBuffer, loader: ClassLoader): T = {
val oldClassLoader = kryo.getClassLoader
kryo.setClassLoader(loader)
- val obj = kryo.readClassAndObject(bytes).asInstanceOf[T]
+ input.setBuffer(bytes.array)
+ val obj = kryo.readClassAndObject(input).asInstanceOf[T]
kryo.setClassLoader(oldClassLoader)
obj
}
def serializeStream(s: OutputStream): SerializationStream = {
- threadBuffer.clear()
- new KryoSerializationStream(kryo, threadBuffer, s)
+ new KryoSerializationStream(kryo, s)
}
def deserializeStream(s: InputStream): DeserializationStream = {
- new KryoDeserializationStream(objectBuffer, s)
- }
-
- override def serializeMany[T](iterator: Iterator[T]): ByteBuffer = {
- threadBuffer.clear()
- while (iterator.hasNext) {
- val element = iterator.next()
- // TODO: Do we also want to write the object's size? Doesn't seem necessary.
- kryo.writeClassAndObject(threadBuffer, element)
- }
- val newBuf = ByteBuffer.allocate(threadBuffer.position)
- threadBuffer.flip()
- newBuf.put(threadBuffer)
- newBuf.flip()
- newBuf
- }
-
- override def deserializeMany(buffer: ByteBuffer): Iterator[Any] = {
- buffer.rewind()
- new Iterator[Any] {
- override def hasNext: Boolean = buffer.remaining > 0
- override def next(): Any = kryo.readClassAndObject(buffer)
- }
+ new KryoDeserializationStream(kryo, s)
}
}
@@ -171,18 +98,19 @@ trait KryoRegistrator {
* A Spark serializer that uses the [[http://code.google.com/p/kryo/wiki/V1Documentation Kryo 1.x library]].
*/
class KryoSerializer extends spark.serializer.Serializer with Logging {
- // Make this lazy so that it only gets called once we receive our first task on each executor,
- // so we can pull out any custom Kryo registrator from the user's JARs.
- lazy val kryo = createKryo()
- val bufferSize = System.getProperty("spark.kryoserializer.buffer.mb", "32").toInt * 1024 * 1024
+ val bufferSize = System.getProperty("spark.kryoserializer.buffer.mb", "2").toInt * 1024 * 1024
- val objectBuffer = new ThreadLocal[ObjectBuffer] {
- override def initialValue = new ObjectBuffer(kryo, bufferSize)
+ val kryo = new ThreadLocal[Kryo] {
+ override def initialValue = createKryo()
}
- val threadBuffer = new ThreadLocal[ByteBuffer] {
- override def initialValue = ByteBuffer.allocate(bufferSize)
+ val output = new ThreadLocal[KryoOutput] {
+ override def initialValue = new KryoOutput(bufferSize)
+ }
+
+ val input = new ThreadLocal[KryoInput] {
+ override def initialValue = new KryoInput(bufferSize)
}
def createKryo(): Kryo = {
@@ -213,41 +141,44 @@ class KryoSerializer extends spark.serializer.Serializer with Logging {
kryo.register(obj.getClass)
}
- // Register the following classes for passing closures.
- kryo.register(classOf[Class[_]], new ClassSerializer(kryo))
- kryo.setRegistrationOptional(true)
-
// Allow sending SerializableWritable
- kryo.register(classOf[SerializableWritable[_]], new SerializableSerializer())
- kryo.register(classOf[HttpBroadcast[_]], new SerializableSerializer())
+ kryo.register(classOf[SerializableWritable[_]], new KryoJavaSerializer())
+ kryo.register(classOf[HttpBroadcast[_]], new KryoJavaSerializer())
// Register some commonly used Scala singleton objects. Because these
// are singletons, we must return the exact same local object when we
// deserialize rather than returning a clone as FieldSerializer would.
- class SingletonSerializer(obj: AnyRef) extends KSerializer {
- override def writeObjectData(buf: ByteBuffer, obj: AnyRef) {}
- override def readObjectData[T](buf: ByteBuffer, cls: Class[T]): T = obj.asInstanceOf[T]
+ class SingletonSerializer[T](obj: T) extends KSerializer[T] {
+ override def write(kryo: Kryo, output: KryoOutput, obj: T) {}
+ override def read(kryo: Kryo, input: KryoInput, cls: java.lang.Class[T]): T = obj
}
- kryo.register(None.getClass, new SingletonSerializer(None))
- kryo.register(Nil.getClass, new SingletonSerializer(Nil))
+ kryo.register(None.getClass, new SingletonSerializer[AnyRef](None))
+ kryo.register(Nil.getClass, new SingletonSerializer[AnyRef](Nil))
// Register maps with a special serializer since they have complex internal structure
class ScalaMapSerializer(buildMap: Array[(Any, Any)] => scala.collection.Map[Any, Any])
- extends KSerializer {
- override def writeObjectData(buf: ByteBuffer, obj: AnyRef) {
+ extends KSerializer[Array[(Any, Any)] => scala.collection.Map[Any, Any]] {
+ override def write(
+ kryo: Kryo,
+ output: KryoOutput,
+ obj: Array[(Any, Any)] => scala.collection.Map[Any, Any]) {
val map = obj.asInstanceOf[scala.collection.Map[Any, Any]]
- kryo.writeObject(buf, map.size.asInstanceOf[java.lang.Integer])
+ kryo.writeObject(output, map.size.asInstanceOf[java.lang.Integer])
for ((k, v) <- map) {
- kryo.writeClassAndObject(buf, k)
- kryo.writeClassAndObject(buf, v)
+ kryo.writeClassAndObject(output, k)
+ kryo.writeClassAndObject(output, v)
}
}
- override def readObjectData[T](buf: ByteBuffer, cls: Class[T]): T = {
- val size = kryo.readObject(buf, classOf[java.lang.Integer]).intValue
+ override def read (
+ kryo: Kryo,
+ input: KryoInput,
+ cls: Class[Array[(Any, Any)] => scala.collection.Map[Any, Any]])
+ : Array[(Any, Any)] => scala.collection.Map[Any, Any] = {
+ val size = kryo.readObject(input, classOf[java.lang.Integer]).intValue
val elems = new Array[(Any, Any)](size)
for (i <- 0 until size)
- elems(i) = (kryo.readClassAndObject(buf), kryo.readClassAndObject(buf))
- buildMap(elems).asInstanceOf[T]
+ elems(i) = (kryo.readClassAndObject(input), kryo.readClassAndObject(input))
+ buildMap(elems).asInstanceOf[Array[(Any, Any)] => scala.collection.Map[Any, Any]]
}
}
kryo.register(mutable.HashMap().getClass, new ScalaMapSerializer(mutable.HashMap() ++ _))
@@ -275,5 +206,8 @@ class KryoSerializer extends spark.serializer.Serializer with Logging {
kryo
}
- def newInstance(): SerializerInstance = new KryoSerializerInstance(this)
+ def newInstance(): SerializerInstance = {
+ this.kryo.get().setClassLoader(Thread.currentThread().getContextClassLoader)
+ new KryoSerializerInstance(this)
+ }
}
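The serializer is rewritten against the Kryo 2.x `Input`/`Output` API, with one `Kryo`, `KryoInput`, and `KryoOutput` instance per thread and a smaller 2 MB default buffer. User code still opts in through system properties and an optional `KryoRegistrator`; a sketch, assuming the `spark.serializer` and `spark.kryo.registrator` property names from the tuning docs of this era (`MyRecord` and `MyRegistrator` are hypothetical):

```scala
import com.esotericsoftware.kryo.Kryo
import spark.KryoRegistrator

case class MyRecord(id: Int, name: String)   // hypothetical user class

class MyRegistrator extends KryoRegistrator {
  override def registerClasses(kryo: Kryo) {
    kryo.register(classOf[MyRecord])
  }
}

// Set before creating the SparkContext:
System.setProperty("spark.serializer", "spark.KryoSerializer")
System.setProperty("spark.kryo.registrator", "MyRegistrator")
System.setProperty("spark.kryoserializer.buffer.mb", "2")   // per-thread buffer size, the new default
```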
3 core/src/main/scala/spark/Logging.scala
@@ -11,8 +11,7 @@ import org.slf4j.LoggerFactory
trait Logging {
// Make the log field transient so that objects with Logging can
// be serialized and used on another machine
- @transient
- private var log_ : Logger = null
+ @transient private var log_ : Logger = null
// Method to get or create the logger for this object
protected def log: Logger = {
60 core/src/main/scala/spark/MapOutputTracker.scala
@@ -17,6 +17,7 @@ import akka.util.duration._
import spark.scheduler.MapStatus
import spark.storage.BlockManagerId
+import spark.util.{MetadataCleaner, TimeStampedHashMap}
private[spark] sealed trait MapOutputTrackerMessage
@@ -44,7 +45,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
val timeout = 10.seconds
- var mapStatuses = new ConcurrentHashMap[Int, Array[MapStatus]]
+ var mapStatuses = new TimeStampedHashMap[Int, Array[MapStatus]]
// Incremented every time a fetch fails so that client nodes know to clear
// their cache of map output locations if this happens.
@@ -53,7 +54,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
// Cache a serialized version of the output statuses for each shuffle to send them out faster
var cacheGeneration = generation
- val cachedSerializedStatuses = new HashMap[Int, Array[Byte]]
+ val cachedSerializedStatuses = new TimeStampedHashMap[Int, Array[Byte]]
var trackerActor: ActorRef = if (isMaster) {
val actor = actorSystem.actorOf(Props(new MapOutputTrackerActor(this)), name = actorName)
@@ -64,6 +65,8 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
actorSystem.actorFor(url)
}
+ val metadataCleaner = new MetadataCleaner("MapOutputTracker", this.cleanup)
+
// Send a message to the trackerActor and get its result within a default timeout, or
// throw a SparkException if this fails.
def askTracker(message: Any): Any = {
@@ -84,14 +87,14 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
}
def registerShuffle(shuffleId: Int, numMaps: Int) {
- if (mapStatuses.get(shuffleId) != null) {
+ if (mapStatuses.get(shuffleId) != None) {
throw new IllegalArgumentException("Shuffle ID " + shuffleId + " registered twice")
}
mapStatuses.put(shuffleId, new Array[MapStatus](numMaps))
}
def registerMapOutput(shuffleId: Int, mapId: Int, status: MapStatus) {
- var array = mapStatuses.get(shuffleId)
+ var array = mapStatuses(shuffleId)
array.synchronized {
array(mapId) = status
}
@@ -108,7 +111,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
}
def unregisterMapOutput(shuffleId: Int, mapId: Int, bmAddress: BlockManagerId) {
- var array = mapStatuses.get(shuffleId)
+ var array = mapStatuses(shuffleId)
if (array != null) {
array.synchronized {
if (array(mapId) != null && array(mapId).address == bmAddress) {
@@ -126,7 +129,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
// Called on possibly remote nodes to get the server URIs and output sizes for a given shuffle
def getServerStatuses(shuffleId: Int, reduceId: Int): Array[(BlockManagerId, Long)] = {
- val statuses = mapStatuses.get(shuffleId)
+ val statuses = mapStatuses.get(shuffleId).orNull
if (statuses == null) {
logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them")
fetching.synchronized {
@@ -139,8 +142,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
case e: InterruptedException =>
}
}
- return mapStatuses.get(shuffleId).map(status =>
- (status.address, MapOutputTracker.decompressSize(status.compressedSizes(reduceId))))
+ return MapOutputTracker.convertMapStatuses(shuffleId, reduceId, mapStatuses(shuffleId))
} else {
fetching += shuffleId
}
@@ -156,27 +158,27 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
fetchedStatuses = deserializeStatuses(fetchedBytes)
logInfo("Got the output locations")
mapStatuses.put(shuffleId, fetchedStatuses)
- if (fetchedStatuses.contains(null)) {
- throw new FetchFailedException(null, shuffleId, -1, reduceId,
- new Exception("Missing an output location for shuffle " + shuffleId))
- }
} finally {
fetching.synchronized {
fetching -= shuffleId
fetching.notifyAll()
}
}
- return fetchedStatuses.map(s =>
- (s.address, MapOutputTracker.decompressSize(s.compressedSizes(reduceId))))
+ return MapOutputTracker.convertMapStatuses(shuffleId, reduceId, fetchedStatuses)
} else {
- return statuses.map(s =>
- (s.address, MapOutputTracker.decompressSize(s.compressedSizes(reduceId))))
+ return MapOutputTracker.convertMapStatuses(shuffleId, reduceId, statuses)
}
}
+ def cleanup(cleanupTime: Long) {
+ mapStatuses.clearOldValues(cleanupTime)
+ cachedSerializedStatuses.clearOldValues(cleanupTime)
+ }
+
def stop() {
communicate(StopMapOutputTracker)
mapStatuses.clear()
+ metadataCleaner.cancel()
trackerActor = null
}
@@ -202,7 +204,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
generationLock.synchronized {
if (newGen > generation) {
logInfo("Updating generation to " + newGen + " and clearing cache")
- mapStatuses = new ConcurrentHashMap[Int, Array[MapStatus]]
+ mapStatuses = new TimeStampedHashMap[Int, Array[MapStatus]]
generation = newGen
}
}
@@ -220,7 +222,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
case Some(bytes) =>
return bytes
case None =>
- statuses = mapStatuses.get(shuffleId)
+ statuses = mapStatuses(shuffleId)
generationGotten = generation
}
}
@@ -258,6 +260,28 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
private[spark] object MapOutputTracker {
private val LOG_BASE = 1.1
+ // Convert an array of MapStatuses to locations and sizes for a given reduce ID. If
+ // any of the statuses is null (indicating a missing location due to a failed mapper),
+ // throw a FetchFailedException.
+ def convertMapStatuses(
+ shuffleId: Int,
+ reduceId: Int,
+ statuses: Array[MapStatus]): Array[(BlockManagerId, Long)] = {
+ if (statuses == null) {
+ throw new FetchFailedException(null, shuffleId, -1, reduceId,
+ new Exception("Missing all output locations for shuffle " + shuffleId))
+ }
+ statuses.map {
+ status =>
+ if (status == null) {
+ throw new FetchFailedException(null, shuffleId, -1, reduceId,
+ new Exception("Missing an output location for shuffle " + shuffleId))
+ } else {
+ (status.address, decompressSize(status.compressedSizes(reduceId)))
+ }
+ }
+ }
+
/**
* Compress a size in bytes to 8 bits for efficient reporting of map output sizes.
* We do this by encoding the log base 1.1 of the size as an integer, which can support
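The hunk is truncated just as it reaches the size-compression helpers, but the surviving comment describes the scheme: each map output size is reported as a single byte holding its log base 1.1, trading roughly 10% precision for an 8-bit encoding. A self-contained sketch of that scheme as described (not necessarily the exact Spark implementation):

```scala
object SizeCodec {
  private val LOG_BASE = 1.1

  /** Encode a size in bytes into one byte: ceil(log_1.1(size)), capped at 255. */
  def compressSize(size: Long): Byte = {
    if (size == 0) {
      0
    } else if (size <= 1L) {
      1  // keep size 1 distinct from the "empty" code 0
    } else {
      math.min(255, math.ceil(math.log(size) / math.log(LOG_BASE)).toInt).toByte
    }
  }

  /** Decode back to an (over-)estimate of the original size. */
  def decompressSize(compressed: Byte): Long = {
    if (compressed == 0) 0L else math.pow(LOG_BASE, compressed & 0xFF).toLong
  }
}

// e.g. compressSize(1000000) == 145; decompressSize(145) is about 1.0e6, within ~10%
```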
86 core/src/main/scala/spark/PairRDDFunctions.scala
@@ -52,6 +52,14 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
mergeCombiners: (C, C) => C,
partitioner: Partitioner,
mapSideCombine: Boolean = true): RDD[(K, C)] = {
+ if (getKeyClass().isArray) {
+ if (mapSideCombine) {
+ throw new SparkException("Cannot use map-side combining with array keys.")
+ }
+ if (partitioner.isInstanceOf[HashPartitioner]) {
+ throw new SparkException("Default partitioner cannot partition array keys.")
+ }
+ }
val aggregator =
new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners)
if (mapSideCombine) {
@@ -92,6 +100,11 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
* before sending results to a reducer, similarly to a "combiner" in MapReduce.
*/
def reduceByKeyLocally(func: (V, V) => V): Map[K, V] = {
+
+ if (getKeyClass().isArray) {
+ throw new SparkException("reduceByKeyLocally() does not support array keys")
+ }
+
def reducePartition(iter: Iterator[(K, V)]): Iterator[JHashMap[K, V]] = {
val map = new JHashMap[K, V]
for ((k, v) <- iter) {
@@ -165,6 +178,14 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
* be set to true.
*/
def partitionBy(partitioner: Partitioner, mapSideCombine: Boolean = false): RDD[(K, V)] = {
+ if (getKeyClass().isArray) {
+ if (mapSideCombine) {
+ throw new SparkException("Cannot use map-side combining with array keys.")
+ }
+ if (partitioner.isInstanceOf[HashPartitioner]) {
+ throw new SparkException("Default partitioner cannot partition array keys.")
+ }
+ }
if (mapSideCombine) {
def createCombiner(v: V) = ArrayBuffer(v)
def mergeValue(buf: ArrayBuffer[V], v: V) = buf += v
@@ -178,9 +199,9 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
}
/**
- * Merge the values for each key using an associative reduce function. This will also perform
- * the merging locally on each mapper before sending results to a reducer, similarly to a
- * "combiner" in MapReduce.
+ * Return an RDD containing all pairs of elements with matching keys in `this` and `other`. Each
+ * pair of elements will be returned as a (k, (v1, v2)) tuple, where (k, v1) is in `this` and
+ * (k, v2) is in `other`. Uses the given Partitioner to partition the output RDD.
*/
def join[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, W))] = {
this.cogroup(other, partitioner).flatMapValues {
@@ -336,6 +357,9 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
* list of values for that key in `this` as well as `other`.
*/
def cogroup[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (Seq[V], Seq[W]))] = {
+ if (partitioner.isInstanceOf[HashPartitioner] && getKeyClass().isArray) {
+ throw new SparkException("Default partitioner cannot partition array keys.")
+ }
val cg = new CoGroupedRDD[K](
Seq(self.asInstanceOf[RDD[(_, _)]], other.asInstanceOf[RDD[(_, _)]]),
partitioner)
@@ -352,6 +376,9 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
*/
def cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)], partitioner: Partitioner)
: RDD[(K, (Seq[V], Seq[W1], Seq[W2]))] = {
+ if (partitioner.isInstanceOf[HashPartitioner] && getKeyClass().isArray) {
+ throw new SparkException("Default partitioner cannot partition array keys.")
+ }
val cg = new CoGroupedRDD[K](
Seq(self.asInstanceOf[RDD[(_, _)]],
other1.asInstanceOf[RDD[(_, _)]],
@@ -438,7 +465,7 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
val res = self.context.runJob(self, process _, Array(index), false)
res(0)
case None =>
- throw new UnsupportedOperationException("lookup() called on an RDD without a partitioner")
+ self.filter(_._1 == key).map(_._2).collect
}
}
@@ -466,20 +493,8 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
path: String,
keyClass: Class[_],
valueClass: Class[_],
- outputFormatClass: Class[_ <: NewOutputFormat[_, _]]) {
- saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass, new Configuration)
- }
-
- /**
- * Output the RDD to any Hadoop-supported file system, using a new Hadoop API `OutputFormat`
- * (mapreduce.OutputFormat) object supporting the key and value types K and V in this RDD.
- */
- def saveAsNewAPIHadoopFile(
- path: String,
- keyClass: Class[_],
- valueClass: Class[_],
outputFormatClass: Class[_ <: NewOutputFormat[_, _]],
- conf: Configuration) {
+ conf: Configuration = self.context.hadoopConfiguration) {
val job = new NewAPIHadoopJob(conf)
job.setOutputKeyClass(keyClass)
job.setOutputValueClass(valueClass)
@@ -530,7 +545,7 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
keyClass: Class[_],
valueClass: Class[_],
outputFormatClass: Class[_ <: OutputFormat[_, _]],
- conf: JobConf = new JobConf) {
+ conf: JobConf = new JobConf(self.context.hadoopConfiguration)) {
conf.setOutputKeyClass(keyClass)
conf.setOutputValueClass(valueClass)
// conf.setOutputFormat(outputFormatClass) // Doesn't work in Scala 2.9 due to what may be a generics bug
@@ -588,6 +603,16 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
writer.cleanup()
}
+ /**
+ * Return an RDD with the keys of each tuple.
+ */
+ def keys: RDD[K] = self.map(_._1)
+
+ /**
+ * Return an RDD with the values of each tuple.
+ */
+ def values: RDD[V] = self.map(_._2)
+
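A quick usage sketch for the new `keys` and `values` helpers; the local SparkContext and sample pairs are assumptions made for illustration:

```scala
import spark.SparkContext
import spark.SparkContext._   // pair-RDD operations, including keys and values

val sc = new SparkContext("local", "keys-values-example")
val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3)))

pairs.keys.collect()     // Array(a, b, c)
pairs.values.collect()   // Array(1, 2, 3)
```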
private[spark] def getKeyClass() = implicitly[ClassManifest[K]].erasure
private[spark] def getValueClass() = implicitly[ClassManifest[V]].erasure
@@ -624,24 +649,23 @@ class OrderedRDDFunctions[K <% Ordered[K]: ClassManifest, V: ClassManifest](
}
private[spark]
-class MappedValuesRDD[K, V, U](prev: RDD[(K, V)], f: V => U) extends RDD[(K, U)](prev.context) {
- override def splits = prev.splits
- override val dependencies = List(new OneToOneDependency(prev))
- override val partitioner = prev.partitioner
- override def compute(split: Split, taskContext: TaskContext) =
- prev.iterator(split, taskContext).map{case (k, v) => (k, f(v))}
+class MappedValuesRDD[K, V, U](prev: RDD[(K, V)], f: V => U)
+ extends RDD[(K, U)](prev) {
+
+ override def getSplits = firstParent[(K, V)].splits
+ override val partitioner = firstParent[(K, V)].partitioner
+ override def compute(split: Split, context: TaskContext) =
+ firstParent[(K, V)].iterator(split, context).map{ case (k, v) => (k, f(v)) }
}
private[spark]
class FlatMappedValuesRDD[K, V, U](prev: RDD[(K, V)], f: V => TraversableOnce[U])
- extends RDD[(K, U)](prev.context) {
-
- override def splits = prev.splits
- override val dependencies = List(new OneToOneDependency(prev))
- override val partitioner = prev.partitioner
+ extends RDD[(K, U)](prev) {
- override def compute(split: Split, taskContext: TaskContext) = {
- prev.iterator(split, taskContext).flatMap { case (k, v) => f(v).map(x => (k, x)) }
+ override def getSplits = firstParent[(K, V)].splits
+ override val partitioner = firstParent[(K, V)].partitioner
+ override def compute(split: Split, context: TaskContext) = {
+ firstParent[(K, V)].iterator(split, context).flatMap { case (k, v) => f(v).map(x => (k, x)) }
}
}
24 core/src/main/scala/spark/ParallelCollection.scala
@@ -2,6 +2,7 @@ package spark
import scala.collection.immutable.NumericRange
import scala.collection.mutable.ArrayBuffer
+import scala.collection.Map
private[spark] class ParallelCollectionSplit[T: ClassManifest](
val rddId: Long,
@@ -22,28 +23,33 @@ private[spark] class ParallelCollectionSplit[T: ClassManifest](
}
private[spark] class ParallelCollection[T: ClassManifest](
- sc: SparkContext,
+ @transient sc: SparkContext,
@transient data: Seq[T],
- numSlices: Int)
- extends RDD[T](sc) {
+ numSlices: Int,
+ locationPrefs: Map[Int,Seq[String]])
+ extends RDD[T](sc, Nil) {
// TODO: Right now, each split sends along its full data, even if later down the RDD chain it gets
// cached. It might be worthwhile to write the data to a file in the DFS and read it in the split
// instead.
+ // UPDATE: A parallel collection can be checkpointed to HDFS, which achieves this goal.
- @transient
- val splits_ = {
+ @transient var splits_ : Array[Split] = {
val slices = ParallelCollection.slice(data, numSlices).toArray
slices.indices.map(i => new ParallelCollectionSplit(id, i, slices(i))).toArray
}
- override def splits = splits_.asInstanceOf[Array[Split]]
+ override def getSplits = splits_
- override def compute(s: Split, taskContext: TaskContext) =
+ override def compute(s: Split, context: TaskContext) =
s.asInstanceOf[ParallelCollectionSplit[T]].iterator
- override def preferredLocations(s: Split): Seq[String] = Nil
+ override def getPreferredLocations(s: Split): Seq[String] = {
+ locationPrefs.getOrElse(s.index, Nil)
+ }
- override val dependencies: List[Dependency[_]] = Nil
+ override def clearDependencies() {
+ splits_ = null
+ }
}
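The new `locationPrefs` parameter maps a split index to the hosts that split would prefer to run on, and `getPreferredLocations` falls back to `Nil` when no entry exists. A minimal sketch of that lookup, with made-up host names (`ParallelCollection` itself is `private[spark]`, so this only mirrors the data shape):

```scala
// Shape of the new locationPrefs argument: split index -> preferred host names.
// The host names are made up for illustration.
val locationPrefs: Map[Int, Seq[String]] = Map(
  0 -> Seq("host-a", "host-b"),   // split 0 prefers either of these hosts
  2 -> Seq("host-c")              // split 2 prefers host-c; split 1 has no entry
)

// Mirrors getPreferredLocations above: fall back to Nil for splits with no preference.
def preferredLocationsFor(splitIndex: Int): Seq[String] =
  locationPrefs.getOrElse(splitIndex, Nil)

preferredLocationsFor(1)   // Seq() -- no preference recorded
preferredLocationsFor(2)   // Seq(host-c)
```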
private object ParallelCollection {
4 core/src/main/scala/spark/Partitioner.scala
@@ -11,6 +11,10 @@ abstract class Partitioner extends Serializable {
/**
* A [[spark.Partitioner]] that implements hash-based partitioning using Java's `Object.hashCode`.
+ *
+ * Java arrays have hashCodes that are based on the arrays' identities rather than their contents,
+ * so attempting to partition an RDD[Array[_]] or RDD[(Array[_], _)] using a HashPartitioner will
+ * produce an unexpected or incorrect result.
*/
class HashPartitioner(partitions: Int) extends Partitioner {
def numPartitions = partitions
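A short snippet showing why the warning above matters: two arrays with identical contents hash differently, so hash-based partitioning (approximated below by a simplified `partitionOf`, not the real `HashPartitioner.getPartition`) can send "equal" keys to different partitions:

```scala
// Two arrays with identical contents...
val a = Array(1, 2, 3)
val b = Array(1, 2, 3)

a.sameElements(b)            // true  -- same contents
a.hashCode == b.hashCode     // almost certainly false -- Java arrays hash by identity

// Simplified stand-in for hash-based partitioning (not the real HashPartitioner.getPartition).
val numPartitions = 4
def partitionOf(key: AnyRef): Int = math.abs(key.hashCode % numPartitions)

// "Equal" array keys may land in different partitions, breaking joins, groupBys and lookups.
partitionOf(a) == partitionOf(b)   // not guaranteed to be true
```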
197 core/src/main/scala/spark/RDD.scala
@@ -1,10 +1,8 @@
package spark
-import java.io.EOFException
-import java.io.ObjectInputStream
+import java.io.{ObjectOutputStream, IOException, EOFException, ObjectInputStream}
import java.net.URL
-import java.util.Random
-import java.util.Date
+import java.util.{Date, Random}
import java.util.{HashMap => JHashMap}
import java.util.concurrent.atomic.AtomicLong
@@ -13,6 +11,7 @@ import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
+import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.BytesWritable
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.io.Text
@@ -73,41 +72,42 @@ import SparkContext._
* [[http://www.cs.berkeley.edu/~matei/papers/2012/nsdi_spark.pdf Spark paper]] for more details
* on RDD internals.
*/
-abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serializable {
+abstract class RDD[T: ClassManifest](
+ @transient var sc: SparkContext,
+ var dependencies_ : List[Dependency[_]]
+ ) extends Serializable with Logging {
- // Methods that must be implemented by subclasses:
- /** Set of partitions in this RDD. */
- def splits: Array[Split]
+ def this(@transient oneParent: RDD[_]) =
+ this(oneParent.context, List(new OneToOneDependency(oneParent)))
+
+ // =======================================================================
+ // Methods that should be implemented by subclasses of RDD
+ // =======================================================================
/** Function for computing a given partition. */
def compute(split: Split, context: TaskContext): Iterator[T]
- /** How this RDD depends on any parent RDDs. */
- @transient val dependencies: List[Dependency[_]]
+ /** Set of partitions in this RDD. */
+ protected def getSplits(): Array[Split]
- // Methods available on all RDDs:
+ /** How this RDD depends on any parent RDDs. */
+ protected def getDependencies(): List[Dependency[_]] = dependencies_
- /** Record user function generating this RDD. */
- private[spark] val origin = Utils.getSparkCallSite
+ /** Optionally overridden by subclasses to specify placement preferences. */
+ protected def getPreferredLocations(split: Split): Seq[String] = Nil
/** Optionally overridden by subclasses to specify how they are partitioned. */
val partitioner: Option[Partitioner] = None
- /** Optionally overridden by subclasses to specify placement preferences. */
- def preferredLocations(split: Split): Seq[String] = Nil
-
- /** The [[spark.SparkContext]] that this RDD was created on. */
- def context = sc
- private[spark] def elementClassManifest: ClassManifest[T] = classManifest[T]
+ // =======================================================================
+ // Methods and fields available on all RDDs
+ // =======================================================================
/** A unique ID for this RDD (within its SparkContext). */
val id = sc.newRddId()
- // Variables relating to persistence
- private var storageLevel: StorageLevel = StorageLevel.NONE
-
/**
* Set this RDD's storage level to persist its values across operations after the first time
* it is computed. Can only be called once on each RDD.
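Under the reorganized contract above, a subclass overrides `getSplits` and `compute` (and optionally `getPreferredLocations` or `getDependencies`), while `splits`, `dependencies`, and `preferredLocations` become checkpoint-aware wrappers. A minimal, hypothetical subclass might look like the sketch below; `DoubledRDD` is not part of Spark, and it keeps a direct `prev` reference rather than the `firstParent` pattern the real subclasses in this diff use to stay checkpoint-friendly:

```scala
import spark.{RDD, Split, TaskContext}

// Minimal, hypothetical subclass under the new contract: implement compute() and
// getSplits(); the one-parent constructor wires up a OneToOneDependency for us.
class DoubledRDD(prev: RDD[Int]) extends RDD[Int](prev) {

  // Reuse the parent's partitioning unchanged.
  override def getSplits: Array[Split] = prev.splits

  // Produce this RDD's elements for one split by transforming the parent's iterator.
  override def compute(split: Split, context: TaskContext): Iterator[Int] =
    prev.iterator(split, context).map(_ * 2)
}

// Usage: new DoubledRDD(sc.parallelize(1 to 5)).collect() returns Array(2, 4, 6, 8, 10)
```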
@@ -131,22 +131,39 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
/** Get the RDD's current storage level, or StorageLevel.NONE if none is set. */
def getStorageLevel = storageLevel
- private[spark] def checkpoint(level: StorageLevel = StorageLevel.MEMORY_AND_DISK_2): RDD[T] = {
- if (!level.useDisk && level.replication < 2) {
- throw new Exception("Cannot checkpoint without using disk or replication (level requested was " + level + ")")
+ /**
+ * Get the preferred location of a split, taking into account whether the
+ * RDD is checkpointed or not.
+ */
+ final def preferredLocations(split: Split): Seq[String] = {
+ if (isCheckpointed) {
+ checkpointData.get.getPreferredLocations(split)
+ } else {
+ getPreferredLocations(split)
}
+ }
- // This is a hack. Ideally this should re-use the code used by the CacheTracker
- // to generate the key.
- def getSplitKey(split: Split) = "rdd_%d_%d".format(this.id, split.index)
-
- persist(level)
- sc.runJob(this, (iter: Iterator[T]) => {} )
-
- val p = this.partitioner
+ /**
+ * Get the array of splits of this RDD, taking into account whether the
+ * RDD is checkpointed or not.
+ */
+ final def splits: Array[Split] = {
+ if (isCheckpointed) {
+ checkpointData.get.getSplits
+ } else {
+ getSplits
+ }
+ }
- new BlockRDD[T](sc, splits.map(getSplitKey).toArray) {
- override val partitioner = p
+ /**
+ * Get the list of dependencies of this RDD, taking into account whether the
+ * RDD is checkpointed or not.
+ */
+ final def dependencies: List[Dependency[_]] = {
+ if (isCheckpointed) {
+ dependencies_
+ } else {
+ getDependencies
}
}
@@ -156,8 +173,10 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
* subclasses of RDD.
*/
final def iterator(split: Split, context: TaskContext): Iterator[T] = {
- if (storageLevel != StorageLevel.NONE) {
- SparkEnv.get.cacheTracker.getOrCompute[T](this, split, context, storageLevel)
+ if (isCheckpointed) {
+ checkpointData.get.iterator(split, context)
+ } else if (storageLevel != StorageLevel.NONE) {
+ SparkEnv.get.cacheManager.getOrCompute(this, split, context, storageLevel)
} else {
compute(split, context)
}
@@ -185,9 +204,11 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
/**
* Return a new RDD containing the distinct elements in this RDD.
*/
- def distinct(numSplits: Int = splits.size): RDD[T] =
+ def distinct(numSplits: Int): RDD[T] =
map(x => (x, null)).reduceByKey((x, y) => x, numSplits).map(_._1)
+ def distinct(): RDD[T] = distinct(splits.size)
+
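The split of `distinct` into a one-argument and a no-argument form keeps the same implementation idea: map each element to a key, merge duplicates with `reduceByKey`, and drop the dummy value. A small sketch, using `1` as the dummy value instead of `null` to keep it simple (local SparkContext assumed):

```scala
import spark.SparkContext
import spark.SparkContext._   // reduceByKey comes from the pair-RDD conversions

val sc = new SparkContext("local", "distinct-example")
val xs = sc.parallelize(Seq(1, 2, 2, 3, 3, 3))

xs.distinct().collect().sorted   // Array(1, 2, 3)

// Hand-written equivalent of the implementation above, with 1 standing in for null.
xs.map(x => (x, 1)).reduceByKey((x, y) => x).map(_._1).collect().sorted   // Array(1, 2, 3)
```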
/**
* Return a sampled subset of this RDD.
*/
@@ -328,6 +349,13 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
def toArray(): Array[T] = collect()
/**
+ * Return an RDD containing only the elements on which the partial function `f` is defined, with `f` applied to each of them.
+ */
+ def collect[U: ClassManifest](f: PartialFunction[T, U]): RDD[U] = {
+ filter(f.isDefinedAt).map(f)
+ }
+
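The new `collect` overload filters by `isDefinedAt` and then maps, so it behaves like Scala's `collect` on ordinary collections. A small illustrative sketch (local SparkContext and sample data assumed):

```scala
import spark.SparkContext

val sc = new SparkContext("local", "collect-pf-example")
val mixed = sc.parallelize(Seq(1, -2, 3, -4, 5))

// Keep only the elements the partial function is defined on, transformed by it,
// i.e. filter(f.isDefinedAt).map(f) as in the implementation above.
val squaresOfPositives = mixed.collect { case x if x > 0 => x * x }
squaresOfPositives.collect()   // Array(1, 9, 25)
```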
+ /**
* Reduces the elements of this RDD using the specified associative binary operator.
*/
def reduce(f: (T, T) => T): T = {
@@ -415,6 +443,9 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
* combine step happens locally on the master, equivalent to running a single reduce task.
*/
def countByValue(): Map[T, Long] = {
+ if (elementClassManifest.erasure.isArray) {
+ throw new SparkException("countByValue() does not support arrays")
+ }
// TODO: This should perhaps be distributed by default.
def countPartition(iter: Iterator[T]): Iterator[OLMap[T]] = {
val map = new OLMap[T]
@@ -443,6 +474,9 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
timeout: Long,
confidence: Double = 0.95
): PartialResult[Map[T, BoundedDouble]] = {
+ if (elementClassManifest.erasure.isArray) {
+ throw new SparkException("countByValueApprox() does not support arrays")
+ }
val countPartition: (TaskContext, Iterator[T]) => OLMap[T] = { (ctx, iter) =>
val map = new OLMap[T]
while (iter.hasNext) {
@@ -502,8 +536,95 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
.saveAsSequenceFile(path)
}
+ /**
+ * Creates tuples of the elements in this RDD by applying `f` to each element to compute its key.
+ */
+ def keyBy[K](f: T => K): RDD[(K, T)] = {
+ map(x => (f(x), x))
+ }
+
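`keyBy` is a thin wrapper over `map`, pairing each element with a derived key. A quick sketch (local SparkContext and sample words assumed):

```scala
import spark.SparkContext

val sc = new SparkContext("local", "keyby-example")
val words = sc.parallelize(Seq("spark", "mesos", "rdd"))

// keyBy pairs each element with a derived key; here, the word length.
val byLength = words.keyBy(_.length)
byLength.collect()   // Array((5,spark), (5,mesos), (3,rdd))
```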
/** A private method for tests, to look at the contents of each partition */
private[spark] def collectPartitions(): Array[Array[T]] = {
sc.runJob(this, (iter: Iterator[T]) => iter.toArray)
}
+
+ /**
+ * Mark this RDD for checkpointing. It will be saved to a file inside the checkpoint
+ * directory set with SparkContext.setCheckpointDir() and all references to its parent
+ * RDDs will be removed. This function must be called before any job has been
+ * executed on this RDD. It is strongly recommended that this RDD is persisted in
+ * memory; otherwise, saving it to a file will require recomputation.
+ */
+ def checkpoint() {
+ if (context.checkpointDir.isEmpty) {
+ throw new Exception("Checkpoint directory has not been set in the SparkContext")
+ } else if (checkpointData.isEmpty) {
+ checkpointData = Some(new RDDCheckpointData(this))
+ checkpointData.get.markForCheckpoint()
+ }
+ }
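Putting the `checkpoint` contract above into a short, hypothetical driver: the checkpoint directory is set first, the RDD is persisted as recommended, `checkpoint()` is called before any job runs on the RDD, and a job is then run so the marked RDD can actually be written. The directory path, the single-argument form of `setCheckpointDir`, and the use of `cache()` are assumptions for the sketch:

```scala
import spark.SparkContext

val sc = new SparkContext("local", "checkpoint-example")

// The checkpoint directory must be set before checkpoint() is called;
// the path here is an arbitrary assumption.
sc.setCheckpointDir("/tmp/spark-checkpoints")

val rdd = sc.parallelize(1 to 1000).map(_ * 2)
rdd.cache()        // recommended above, so the checkpoint write does not recompute the lineage
rdd.checkpoint()   // must happen before any job has been executed on this RDD
rdd.count()        // running a job lets the marked RDD actually be saved
```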