Merge branch 'mesos'

2 parents 8587844 + 7151e1e · commit 7c129388fbdc90cb6abb99470545dba8a2e90adf · haitaoyao committed Feb 19, 2013
Showing with 836 additions and 749 deletions.
  1. +10 −10 bagel/src/main/scala/spark/bagel/Bagel.scala
  2. +3 −3 bagel/src/main/scala/spark/bagel/examples/WikipediaPageRank.scala
  3. +1 −1 bagel/src/main/scala/spark/bagel/examples/WikipediaPageRankStandalone.scala
  4. +2 −2 core/src/main/scala/spark/CacheManager.scala
  5. +2 −2 core/src/main/scala/spark/DoubleRDDFunctions.scala
  6. +32 −30 core/src/main/scala/spark/PairRDDFunctions.scala
  7. +1 −1 core/src/main/scala/spark/{Split.scala → Partition.scala}
  8. +43 −33 core/src/main/scala/spark/RDD.scala
  9. +6 −6 core/src/main/scala/spark/RDDCheckpointData.scala
  10. +14 −14 core/src/main/scala/spark/SparkContext.scala
  11. +3 −3 core/src/main/scala/spark/api/java/JavaDoubleRDD.scala
  12. +22 −22 core/src/main/scala/spark/api/java/JavaPairRDD.scala
  13. +3 −3 core/src/main/scala/spark/api/java/JavaRDD.scala
  14. +6 −6 core/src/main/scala/spark/api/java/JavaRDDLike.scala
  15. +11 −11 core/src/main/scala/spark/api/java/JavaSparkContext.scala
  16. +1 −1 core/src/main/scala/spark/api/python/PythonPartitioner.scala
  17. +5 −5 core/src/main/scala/spark/api/python/PythonRDD.scala
  18. +2 −2 core/src/main/scala/spark/deploy/{JobDescription.scala → ApplicationDescription.scala}
  19. +10 −9 core/src/main/scala/spark/deploy/DeployMessage.scala
  20. +9 −9 core/src/main/scala/spark/deploy/JsonProtocol.scala
  21. +11 −11 core/src/main/scala/spark/deploy/client/Client.scala
  22. +1 −1 core/src/main/scala/spark/deploy/client/ClientListener.scala
  23. +3 −3 core/src/main/scala/spark/deploy/client/TestClient.scala
  24. +5 −5 core/src/main/scala/spark/deploy/master/{JobInfo.scala → ApplicationInfo.scala}
  25. +11 −0 core/src/main/scala/spark/deploy/master/ApplicationState.scala
  26. +2 −2 core/src/main/scala/spark/deploy/master/ExecutorInfo.scala
  27. +0 −9 core/src/main/scala/spark/deploy/master/JobState.scala
  28. +87 −87 core/src/main/scala/spark/deploy/master/Master.scala
  29. +11 −11 core/src/main/scala/spark/deploy/master/MasterWebUI.scala
  30. +2 −2 core/src/main/scala/spark/deploy/master/WorkerInfo.scala
  31. +13 −13 core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala
  32. +10 −10 core/src/main/scala/spark/deploy/worker/Worker.scala
  33. +1 −1 core/src/main/scala/spark/deploy/worker/WorkerArguments.scala
  34. +2 −2 core/src/main/scala/spark/deploy/worker/WorkerWebUI.scala
  35. +3 −2 core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala
  36. +1 −1 core/src/main/scala/spark/partial/ApproximateActionListener.scala
  37. +8 −8 core/src/main/scala/spark/rdd/BlockRDD.scala
  38. +18 −18 core/src/main/scala/spark/rdd/CartesianRDD.scala
  39. +10 −10 core/src/main/scala/spark/rdd/CheckpointRDD.scala
  40. +13 −13 core/src/main/scala/spark/rdd/CoGroupedRDD.scala
  41. +12 −12 core/src/main/scala/spark/rdd/CoalescedRDD.scala
  42. +3 −3 core/src/main/scala/spark/rdd/FilteredRDD.scala
  43. +3 −3 core/src/main/scala/spark/rdd/FlatMappedRDD.scala
  44. +3 −3 core/src/main/scala/spark/rdd/GlommedRDD.scala
  45. +10 −10 core/src/main/scala/spark/rdd/HadoopRDD.scala
  46. +4 −4 core/src/main/scala/spark/rdd/MapPartitionsRDD.scala
  47. +6 −6 core/src/main/scala/spark/rdd/{MapPartitionsWithSplitRDD.scala → MapPartitionsWithIndexRDD.scala}
  48. +3 −3 core/src/main/scala/spark/rdd/MappedRDD.scala
  49. +10 −10 core/src/main/scala/spark/rdd/NewHadoopRDD.scala
  50. +13 −18 core/src/main/scala/spark/{ParallelCollection.scala → rdd/ParallelCollectionRDD.scala}
  51. +8 −8 core/src/main/scala/spark/rdd/PartitionPruningRDD.scala
  52. +3 −3 core/src/main/scala/spark/rdd/PipedRDD.scala
  53. +8 −8 core/src/main/scala/spark/rdd/SampledRDD.scala
  54. +5 −5 core/src/main/scala/spark/rdd/ShuffledRDD.scala
  55. +15 −15 core/src/main/scala/spark/rdd/UnionRDD.scala
  56. +16 −16 core/src/main/scala/spark/rdd/ZippedRDD.scala
  57. +7 −7 core/src/main/scala/spark/scheduler/DAGScheduler.scala
  58. +3 −3 core/src/main/scala/spark/scheduler/ResultTask.scala
  59. +3 −3 core/src/main/scala/spark/scheduler/ShuffleMapTask.scala
  60. +1 −1 core/src/main/scala/spark/scheduler/Stage.scala
  61. +8 −7 core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala
  62. +2 −2 core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala
  63. +2 −2 core/src/main/scala/spark/scheduler/mesos/MesosSchedulerBackend.scala
  64. +1 −1 core/src/main/scala/spark/storage/BlockManager.scala
  65. +1 −1 core/src/main/scala/spark/storage/StorageUtils.scala
  66. +40 −0 core/src/main/twirl/spark/deploy/master/app_details.scala.html
  67. +20 −0 core/src/main/twirl/spark/deploy/master/app_row.scala.html
  68. +4 −4 core/src/main/twirl/spark/deploy/master/{job_table.scala.html → app_table.scala.html}
  69. +3 −3 core/src/main/twirl/spark/deploy/master/executor_row.scala.html
  70. +8 −8 core/src/main/twirl/spark/deploy/master/index.scala.html
  71. +0 −40 core/src/main/twirl/spark/deploy/master/job_details.scala.html
  72. +0 −20 core/src/main/twirl/spark/deploy/master/job_row.scala.html
  73. +5 −5 core/src/main/twirl/spark/deploy/worker/executor_row.scala.html
  74. +56 −56 core/src/test/scala/spark/CheckpointSuite.scala
  75. +9 −4 core/src/test/scala/spark/RDDSuite.scala
  76. +24 −1 core/src/test/scala/spark/ShuffleSuite.scala
  77. +5 −5 core/src/test/scala/spark/SortingSuite.scala
  78. +20 −20 core/src/test/scala/spark/{ → rdd}/ParallelCollectionSplitSuite.scala
  79. +11 −11 core/src/test/scala/spark/scheduler/DAGSchedulerSuite.scala
  80. +5 −5 core/src/test/scala/spark/scheduler/TaskContextSuite.scala
  81. +11 −0 ec2/deploy.generic/root/spark-ec2/ec2-variables.sh
  82. +55 −19 ec2/spark_ec2.py
  83. +1 −1 streaming/src/main/scala/spark/streaming/Checkpoint.scala
  84. +1 −1 streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala
  85. +5 −5 streaming/src/main/scala/spark/streaming/StreamingContext.scala
  86. +3 −3 streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala
  87. +1 −1 streaming/src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala
  88. +1 −1 streaming/src/main/scala/spark/streaming/dstream/ReducedWindowedDStream.scala
@@ -14,11 +14,11 @@ object Bagel extends Logging {
combiner: Combiner[M, C],
aggregator: Option[Aggregator[V, A]],
partitioner: Partitioner,
- numSplits: Int
+ numPartitions: Int
)(
compute: (V, Option[C], Option[A], Int) => (V, Array[M])
): RDD[(K, V)] = {
- val splits = if (numSplits != 0) numSplits else sc.defaultParallelism
+ val splits = if (numPartitions != 0) numPartitions else sc.defaultParallelism
var superstep = 0
var verts = vertices
@@ -56,12 +56,12 @@ object Bagel extends Logging {
messages: RDD[(K, M)],
combiner: Combiner[M, C],
partitioner: Partitioner,
- numSplits: Int
+ numPartitions: Int
)(
compute: (V, Option[C], Int) => (V, Array[M])
): RDD[(K, V)] = {
run[K, V, M, C, Nothing](
- sc, vertices, messages, combiner, None, partitioner, numSplits)(
+ sc, vertices, messages, combiner, None, partitioner, numPartitions)(
addAggregatorArg[K, V, M, C](compute))
}
@@ -70,27 +70,27 @@ object Bagel extends Logging {
vertices: RDD[(K, V)],
messages: RDD[(K, M)],
combiner: Combiner[M, C],
- numSplits: Int
+ numPartitions: Int
)(
compute: (V, Option[C], Int) => (V, Array[M])
): RDD[(K, V)] = {
- val part = new HashPartitioner(numSplits)
+ val part = new HashPartitioner(numPartitions)
run[K, V, M, C, Nothing](
- sc, vertices, messages, combiner, None, part, numSplits)(
+ sc, vertices, messages, combiner, None, part, numPartitions)(
addAggregatorArg[K, V, M, C](compute))
}
def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest](
sc: SparkContext,
vertices: RDD[(K, V)],
messages: RDD[(K, M)],
- numSplits: Int
+ numPartitions: Int
)(
compute: (V, Option[Array[M]], Int) => (V, Array[M])
): RDD[(K, V)] = {
- val part = new HashPartitioner(numSplits)
+ val part = new HashPartitioner(numPartitions)
run[K, V, M, Array[M], Nothing](
- sc, vertices, messages, new DefaultCombiner(), None, part, numSplits)(
+ sc, vertices, messages, new DefaultCombiner(), None, part, numPartitions)(
addAggregatorArg[K, V, M, Array[M]](compute))
}
@@ -16,7 +16,7 @@ import scala.xml.{XML,NodeSeq}
object WikipediaPageRank {
def main(args: Array[String]) {
if (args.length < 5) {
- System.err.println("Usage: WikipediaPageRank <inputFile> <threshold> <numSplits> <host> <usePartitioner>")
+ System.err.println("Usage: WikipediaPageRank <inputFile> <threshold> <numPartitions> <host> <usePartitioner>")
System.exit(-1)
}
@@ -25,7 +25,7 @@ object WikipediaPageRank {
val inputFile = args(0)
val threshold = args(1).toDouble
- val numSplits = args(2).toInt
+ val numPartitions = args(2).toInt
val host = args(3)
val usePartitioner = args(4).toBoolean
val sc = new SparkContext(host, "WikipediaPageRank")
@@ -69,7 +69,7 @@ object WikipediaPageRank {
val result =
Bagel.run(
sc, vertices, messages, combiner = new PRCombiner(),
- numSplits = numSplits)(
+ numPartitions = numPartitions)(
utils.computeWithCombiner(numVertices, epsilon))
// Print the result
@@ -88,7 +88,7 @@ object WikipediaPageRankStandalone {
n: Long,
partitioner: Partitioner,
usePartitioner: Boolean,
- numSplits: Int
+ numPartitions: Int
): RDD[(String, Double)] = {
var ranks = links.mapValues { edges => defaultRank }
for (i <- 1 to numIterations) {
@@ -11,13 +11,13 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
private val loading = new HashSet[String]
/** Gets or computes an RDD split. Used by RDD.iterator() when an RDD is cached. */
- def getOrCompute[T](rdd: RDD[T], split: Split, context: TaskContext, storageLevel: StorageLevel)
+ def getOrCompute[T](rdd: RDD[T], split: Partition, context: TaskContext, storageLevel: StorageLevel)
: Iterator[T] = {
val key = "rdd_%d_%d".format(rdd.id, split.index)
logInfo("Cache key is " + key)
blockManager.get(key) match {
case Some(cachedValues) =>
- // Split is in cache, so just return its values
+ // Partition is in cache, so just return its values
logInfo("Found partition in cache!")
return cachedValues.asInstanceOf[Iterator[T]]
@@ -42,14 +42,14 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable {
/** (Experimental) Approximate operation to return the mean within a timeout. */
def meanApprox(timeout: Long, confidence: Double = 0.95): PartialResult[BoundedDouble] = {
val processPartition = (ctx: TaskContext, ns: Iterator[Double]) => StatCounter(ns)
- val evaluator = new MeanEvaluator(self.splits.size, confidence)
+ val evaluator = new MeanEvaluator(self.partitions.size, confidence)
self.context.runApproximateJob(self, processPartition, evaluator, timeout)
}
/** (Experimental) Approximate operation to return the sum within a timeout. */
def sumApprox(timeout: Long, confidence: Double = 0.95): PartialResult[BoundedDouble] = {
val processPartition = (ctx: TaskContext, ns: Iterator[Double]) => StatCounter(ns)
- val evaluator = new SumEvaluator(self.splits.size, confidence)
+ val evaluator = new SumEvaluator(self.partitions.size, confidence)
self.context.runApproximateJob(self, processPartition, evaluator, timeout)
}
}
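The approximate mean and sum evaluators are now sized from `partitions.size` rather than `splits.size`; the calls themselves are unchanged. A minimal sketch (a local context named `sc` is assumed here and reused in the sketches below):

    import spark.SparkContext
    import spark.SparkContext._  // brings the RDD[Double] => DoubleRDDFunctions conversion into scope

    val sc = new SparkContext("local", "PartitionRenameExamples")
    val nums = sc.parallelize(1 to 100000).map(_.toDouble)

    // Each evaluator is now constructed with nums.partitions.size internally.
    val approxMean = nums.meanApprox(timeout = 1000L, confidence = 0.95)
    val approxSum  = nums.sumApprox(timeout = 1000L)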
@@ -62,7 +62,9 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
}
val aggregator =
new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners)
- if (mapSideCombine) {
+ if (self.partitioner == Some(partitioner)) {
+ self.mapPartitions(aggregator.combineValuesByKey(_), true)
+ } else if (mapSideCombine) {
val mapSideCombined = self.mapPartitions(aggregator.combineValuesByKey(_), true)
val partitioned = new ShuffledRDD[K, C](mapSideCombined, partitioner)
partitioned.mapPartitions(aggregator.combineCombinersByKey(_), true)
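The new first branch short-circuits the shuffle when the input RDD is already partitioned by the target partitioner. A sketch of a case where that branch applies, reusing `sc` and the imports from the sketch above (and assuming `partitionBy` is called with a single Partitioner argument):

    import spark.HashPartitioner

    val part = new HashPartitioner(8)

    // partitionBy shuffles once and records `part` as the RDD's partitioner.
    val prePartitioned = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3))).partitionBy(part)

    // prePartitioned.partitioner == Some(part), so this combineByKey-based call
    // takes the new branch: values are combined per partition, with no second shuffle.
    val sums = prePartitioned.reduceByKey(part, _ + _)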
@@ -81,8 +83,8 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
def combineByKey[C](createCombiner: V => C,
mergeValue: (C, V) => C,
mergeCombiners: (C, C) => C,
- numSplits: Int): RDD[(K, C)] = {
- combineByKey(createCombiner, mergeValue, mergeCombiners, new HashPartitioner(numSplits))
+ numPartitions: Int): RDD[(K, C)] = {
+ combineByKey(createCombiner, mergeValue, mergeCombiners, new HashPartitioner(numPartitions))
}
/**
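The `numSplits` overload of `combineByKey` becomes `numPartitions`. A small sketch computing per-key averages across four hash partitions:

    val scores = sc.parallelize(Seq(("a", 10.0), ("a", 20.0), ("b", 5.0)))

    // Accumulate (sum, count) per key, then divide to get the average.
    val avgs = scores.combineByKey(
      (v: Double) => (v, 1),                                              // createCombiner
      (acc: (Double, Int), v: Double) => (acc._1 + v, acc._2 + 1),        // mergeValue
      (a: (Double, Int), b: (Double, Int)) => (a._1 + b._1, a._2 + b._2), // mergeCombiners
      numPartitions = 4
    ).mapValues { case (sum, count) => sum / count }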
@@ -143,10 +145,10 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
/**
* Merge the values for each key using an associative reduce function. This will also perform
* the merging locally on each mapper before sending results to a reducer, similarly to a
- * "combiner" in MapReduce. Output will be hash-partitioned with numSplits splits.
+ * "combiner" in MapReduce. Output will be hash-partitioned with numPartitions partitions.
*/
- def reduceByKey(func: (V, V) => V, numSplits: Int): RDD[(K, V)] = {
- reduceByKey(new HashPartitioner(numSplits), func)
+ def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)] = {
+ reduceByKey(new HashPartitioner(numPartitions), func)
}
/**
@@ -164,10 +166,10 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
/**
* Group the values for each key in the RDD into a single sequence. Hash-partitions the
- * resulting RDD with into `numSplits` partitions.
+ * resulting RDD with into `numPartitions` partitions.
*/
- def groupByKey(numSplits: Int): RDD[(K, Seq[V])] = {
- groupByKey(new HashPartitioner(numSplits))
+ def groupByKey(numPartitions: Int): RDD[(K, Seq[V])] = {
+ groupByKey(new HashPartitioner(numPartitions))
}
/**
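Illustrative calls to the renamed `reduceByKey` and `groupByKey` overloads, again reusing `sc` from above (only the parameter name changes, not the behaviour):

    val pairs = sc.parallelize(Seq("spark", "mesos", "spark")).map(w => (w, 1))

    // Formerly reduceByKey(_ + _, numSplits); the output is hash-partitioned into 4 partitions.
    val counts  = pairs.reduceByKey(_ + _, 4)

    // Likewise for groupByKey.
    val grouped = pairs.groupByKey(numPartitions = 4)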
@@ -285,8 +287,8 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
* pair of elements will be returned as a (k, (v1, v2)) tuple, where (k, v1) is in `this` and
* (k, v2) is in `other`. Performs a hash join across the cluster.
*/
- def join[W](other: RDD[(K, W)], numSplits: Int): RDD[(K, (V, W))] = {
- join(other, new HashPartitioner(numSplits))
+ def join[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (V, W))] = {
+ join(other, new HashPartitioner(numPartitions))
}
/**
@@ -303,10 +305,10 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
* Perform a left outer join of `this` and `other`. For each element (k, v) in `this`, the
* resulting RDD will either contain all pairs (k, (v, Some(w))) for w in `other`, or the
* pair (k, (v, None)) if no elements in `other` have key k. Hash-partitions the output
- * into `numSplits` partitions.
+ * into `numPartitions` partitions.
*/
- def leftOuterJoin[W](other: RDD[(K, W)], numSplits: Int): RDD[(K, (V, Option[W]))] = {
- leftOuterJoin(other, new HashPartitioner(numSplits))
+ def leftOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (V, Option[W]))] = {
+ leftOuterJoin(other, new HashPartitioner(numPartitions))
}
/**
@@ -325,8 +327,8 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
* pair (k, (None, w)) if no elements in `this` have key k. Hash-partitions the resulting
* RDD into the given number of partitions.
*/
- def rightOuterJoin[W](other: RDD[(K, W)], numSplits: Int): RDD[(K, (Option[V], W))] = {
- rightOuterJoin(other, new HashPartitioner(numSplits))
+ def rightOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Option[V], W))] = {
+ rightOuterJoin(other, new HashPartitioner(numPartitions))
}
/**
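The three join variants keep their semantics; only the partition-count parameter is renamed. A brief sketch:

    val ages   = sc.parallelize(Seq(("alice", 30), ("bob", 25)))
    val cities = sc.parallelize(Seq(("alice", "SF")))

    val joined = ages.join(cities, numPartitions = 4)            // RDD[(String, (Int, String))]
    val left   = ages.leftOuterJoin(cities, numPartitions = 4)   // RDD[(String, (Int, Option[String]))]
    val right  = ages.rightOuterJoin(cities, numPartitions = 4)  // RDD[(String, (Option[Int], String))]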
@@ -361,7 +363,7 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
throw new SparkException("Default partitioner cannot partition array keys.")
}
val cg = new CoGroupedRDD[K](
- Seq(self.asInstanceOf[RDD[(_, _)]], other.asInstanceOf[RDD[(_, _)]]),
+ Seq(self.asInstanceOf[RDD[(K, _)]], other.asInstanceOf[RDD[(K, _)]]),
partitioner)
val prfs = new PairRDDFunctions[K, Seq[Seq[_]]](cg)(classManifest[K], Manifests.seqSeqManifest)
prfs.mapValues {
@@ -380,9 +382,9 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
throw new SparkException("Default partitioner cannot partition array keys.")
}
val cg = new CoGroupedRDD[K](
- Seq(self.asInstanceOf[RDD[(_, _)]],
- other1.asInstanceOf[RDD[(_, _)]],
- other2.asInstanceOf[RDD[(_, _)]]),
+ Seq(self.asInstanceOf[RDD[(K, _)]],
+ other1.asInstanceOf[RDD[(K, _)]],
+ other2.asInstanceOf[RDD[(K, _)]]),
partitioner)
val prfs = new PairRDDFunctions[K, Seq[Seq[_]]](cg)(classManifest[K], Manifests.seqSeqManifest)
prfs.mapValues {
@@ -412,17 +414,17 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
* For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the
* list of values for that key in `this` as well as `other`.
*/
- def cogroup[W](other: RDD[(K, W)], numSplits: Int): RDD[(K, (Seq[V], Seq[W]))] = {
- cogroup(other, new HashPartitioner(numSplits))
+ def cogroup[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Seq[V], Seq[W]))] = {
+ cogroup(other, new HashPartitioner(numPartitions))
}
/**
* For each key k in `this` or `other1` or `other2`, return a resulting RDD that contains a
* tuple with the list of values for that key in `this`, `other1` and `other2`.
*/
- def cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)], numSplits: Int)
+ def cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)], numPartitions: Int)
: RDD[(K, (Seq[V], Seq[W1], Seq[W2]))] = {
- cogroup(other1, other2, new HashPartitioner(numSplits))
+ cogroup(other1, other2, new HashPartitioner(numPartitions))
}
/** Alias for cogroup. */
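The corresponding `cogroup` overloads, continuing with `ages` and `cities` from the join sketch:

    // All values from both RDDs grouped by key into (Seq[V], Seq[W]) pairs,
    // hash-partitioned into 4 partitions.
    val cogrouped2 = ages.cogroup(cities, numPartitions = 4)

    val extras     = sc.parallelize(Seq(("bob", true)))
    val cogrouped3 = ages.cogroup(cities, extras, numPartitions = 4)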
@@ -634,9 +636,9 @@ class OrderedRDDFunctions[K <% Ordered[K]: ClassManifest, V: ClassManifest](
* (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in
* order of the keys).
*/
- def sortByKey(ascending: Boolean = true, numSplits: Int = self.splits.size): RDD[(K,V)] = {
+ def sortByKey(ascending: Boolean = true, numPartitions: Int = self.partitions.size): RDD[(K,V)] = {
val shuffled =
- new ShuffledRDD[K, V](self, new RangePartitioner(numSplits, self, ascending))
+ new ShuffledRDD[K, V](self, new RangePartitioner(numPartitions, self, ascending))
shuffled.mapPartitions(iter => {
val buf = iter.toArray
if (ascending) {
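`sortByKey` now defaults its partition count to `self.partitions.size`. A sketch of both forms, reusing `sc` from above:

    val ranked = sc.parallelize(Seq((3, "c"), (1, "a"), (2, "b")))

    val ascendingSort  = ranked.sortByKey()                          // defaults to ranked.partitions.size partitions
    val descendingSort = ranked.sortByKey(false, numPartitions = 2)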
@@ -650,19 +652,19 @@ class OrderedRDDFunctions[K <% Ordered[K]: ClassManifest, V: ClassManifest](
private[spark]
class MappedValuesRDD[K, V, U](prev: RDD[(K, V)], f: V => U) extends RDD[(K, U)](prev) {
- override def getSplits = firstParent[(K, V)].splits
+ override def getPartitions = firstParent[(K, V)].partitions
override val partitioner = firstParent[(K, V)].partitioner
- override def compute(split: Split, context: TaskContext) =
+ override def compute(split: Partition, context: TaskContext) =
firstParent[(K, V)].iterator(split, context).map{ case (k, v) => (k, f(v)) }
}
private[spark]
class FlatMappedValuesRDD[K, V, U](prev: RDD[(K, V)], f: V => TraversableOnce[U])
extends RDD[(K, U)](prev) {
- override def getSplits = firstParent[(K, V)].splits
+ override def getPartitions = firstParent[(K, V)].partitions
override val partitioner = firstParent[(K, V)].partitioner
- override def compute(split: Split, context: TaskContext) = {
+ override def compute(split: Partition, context: TaskContext) = {
firstParent[(K, V)].iterator(split, context).flatMap { case (k, v) => f(v).map(x => (k, x)) }
}
}
@@ -3,7 +3,7 @@ package spark
/**
* A partition of an RDD.
*/
-trait Split extends Serializable {
+trait Partition extends Serializable {
/**
* Get the split's index within its parent RDD
*/
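For RDD implementors the rename is `Split` → `Partition`, `getSplits` → `getPartitions`, and `compute(split: Split, ...)` → `compute(split: Partition, ...)`. A minimal custom RDD against the renamed API (an illustrative sketch, not part of this commit):

    import spark.{Partition, RDD, TaskContext}

    // Doubles every element of the parent RDD, reusing the parent's partitions one-to-one.
    class DoubledRDD(prev: RDD[Int]) extends RDD[Int](prev) {
      override def getPartitions: Array[Partition] = prev.partitions

      override def compute(split: Partition, context: TaskContext): Iterator[Int] =
        prev.iterator(split, context).map(_ * 2)
    }

    val doubled = new DoubledRDD(sc.parallelize(1 to 10)).collect()  // Array(2, 4, ..., 20)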