Permalink
Browse files

Single import and ScoobiApp

  • Loading branch information...
1 parent d16444b commit f6eefe1eb4da674657000a30f3c314e2497426c0 @espringe espringe committed with blever Mar 8, 2012
View
@@ -134,31 +134,22 @@ complete application for word count looks like this:
```scala
import com.nicta.scoobi.Scoobi._
- import com.nicta.scoobi.DList._
- import com.nicta.scoobi.io.text.TextInput._
- import com.nicta.scoobi.io.text.TextOutput._
- object WordCount {
- def main(allArgs: Array[String]) = withHadoopArgs(allArgs) { args =>
+ object WordCount extends ScoobiApp {
+ val lines: DList[String] = fromTextFile(args(0))
- val lines: DList[String] = fromTextFile(args(0))
+ val counts: DList[(String, Int)] = lines.flatMap(_.split(" "))
+ .map(word => (word, 1))
+ .groupByKey
+ .combine(_+_)
- val counts: DList[(String, Int)] = lines.flatMap(_.split(" "))
- .map(word => (word, 1))
- .groupByKey
- .combine(_+_)
-
- persist(toTextFile(counts, args(1)))
- }
+ persist(toTextFile(counts, args(1)))
}
```
-Our word count example is implemented by the object `WordCount`. First, there are few
-imports to specify for brining in `DList` and text I/O. The guts of the implementation
-are within the enclosing braces of the `withHadoopArgs` control structure. The purpose of
-`withHadoopArgs` is to parse the [generic Hadoop command line options](http://hadoop.apache.org/common/docs/current/commands_manual.html#Generic+Options)
-before passing the remaining arguments to the guts of the implementation. This implementation
-uses the remaining two arguments for the input (`args(0)`) and output directory (`args(1)`).
+Our word count example is implemented by the object `WordCount`, wich extends a `ScoobiApp`. This
+is a convience in Scoobi to avoid having to write a `main` function, as well as automatically
+handling arguments intended for hadoop. The remaining arguments are available as `args`
Within the implementation guts, the first task is to construct a `DList` representing the data
located at the input directory. In this situation, because the input data are simple text files,
View
@@ -31,6 +31,8 @@ libraryDependencies ++= Seq(
publishArtifact in packageDoc := false
+compileOrder := CompileOrder.ScalaThenJava
+
resolvers += "Cloudera Maven Repository" at "https://repository.cloudera.com/content/repositories/releases/"
resolvers += "Packaged Avro" at "http://nicta.github.com/scoobi/releases/"
@@ -15,64 +15,65 @@
*/
package com.nicta.scoobi.examples
-import com.nicta.scoobi._
import com.nicta.scoobi.Scoobi._
-import com.nicta.scoobi.WireFormat._
-import com.nicta.scoobi.io.text.TextInput._
-import com.nicta.scoobi.io.text.TextOutput._
import java.io._
/*
* This example takes a list of names and ages in the form: <id>, <firstName>, <secondName>, <age>
* then gets the average age for each first name.
*/
-object AverageAge {
+object AverageAge extends ScoobiApp {
- def main(args: Array[String]) = withHadoopArgs(args) { _ =>
-
- if (!new File("output-dir").mkdir) {
- sys.error("Could not make output-dir for results. Perhaps it already exists (and you should delete/rename the old one)")
- }
+ if (!new File("output-dir").mkdir) {
+ sys.error("Could not make output-dir for results. Perhaps it already exists (and you should delete/rename the old one)")
+ }
- val fileName = "output-dir/names.txt"
+ val fileName = "output-dir/names.txt"
- // write some names to a file (so this example has no external requirements)
- generateNames(fileName)
+ // write some names to a file (so this example has no external requirements)
+ generateNames(fileName)
- case class Person(val id: Long,
- val secondName: String,
- val firstName: String,
- val age: Int)
+ case class Person(val id: Long,
+ val secondName: String,
+ val firstName: String,
+ val age: Int)
- // With this implicit conversion, we let Scoobi know the apply and unapply function, which it uses
- // to construct and deconstruct Person objects. Now it can very efficiently serialize them (i.e. no overhead)
- implicit val PersonFmt = mkCaseWireFormat(Person, Person.unapply _)
+ // With this implicit conversion, we let Scoobi know the apply and unapply function, which it uses
+ // to construct and deconstruct Person objects. Now it can very efficiently serialize them (i.e. no overhead)
+ implicit val PersonFmt = mkCaseWireFormat(Person, Person.unapply _)
- // Read in lines of the form: 234242, Bob, Smith, 31.
- val persons : DList[Person] = fromDelimitedTextFile(fileName, ",") {
- case Long(id) :: fN :: sN :: Int(age) :: _ => Person(id, sN, fN, age)
- }
+ // Read in lines of the form: 234242, Bob, Smith, 31.
+ val persons : DList[Person] = fromDelimitedTextFile(fileName, ",") {
+ case Long(id) :: fN :: sN :: Int(age) :: _ => Person(id, sN, fN, age)
+ }
- // The only thing we're interested in, is the firstName and age
- val nameAndAge: DList[(String, Int)] = persons.map { p => (p.firstName, p.age) }
+ // The only thing we're interested in, is the firstName and age
+ val nameAndAge: DList[(String, Int)] = persons.map { p => (p.firstName, p.age) }
- // Let's group everyone with the same name together
- val grouped: DList[(String, Iterable[Int])] = nameAndAge groupByKey
+ // Let's group everyone with the same name together
+ val grouped: DList[(String, Iterable[Int])] = nameAndAge groupByKey
- // And for every name, we will average all the avages
- val avgAgeForName: DList[(String, Int)] = grouped map { case (n, ages) => (n, average(ages)) }
+ // And for every name, we will average all the avages
+ val avgAgeForName: DList[(String, Int)] = grouped map { case (n, ages) => (n, average(ages)) }
- // Execute everything, and throw it into a directory
- DList.persist (toTextFile(avgAgeForName, "output-dir/avg-age"))
- }
+ // Execute everything, and throw it into a directory
+ DList.persist (toTextFile(avgAgeForName, "output-dir/avg-age"))
private def average[A](values: Iterable[A])(implicit ev: Numeric[A]) = {
import ev._
- toInt(values.sum) / values.size
- }
+ var value: Int = 0
+ var count = 0
+
+ for (i <- values) {
+ value = value + toInt(i)
+ count = count + 1
+ }
+
+ value / count
+ }
private def generateNames(filename: String) {
val fstream = new FileWriter(filename)
@@ -15,12 +15,7 @@
*/
package com.nicta.scoobi.examples
-import com.nicta.scoobi._
import com.nicta.scoobi.Scoobi._
-import com.nicta.scoobi.WireFormat._
-import com.nicta.scoobi.io.text.TextInput._
-import com.nicta.scoobi.io.text.TextOutput._
-import com.nicta.scoobi.lib.Join._
import java.io._
/*
@@ -15,39 +15,29 @@
*/
package com.nicta.scoobi.examples
-import com.nicta.scoobi._
import com.nicta.scoobi.Scoobi._
-import com.nicta.scoobi.io.text._
import java.io._
-object NumberPartitioner {
+object NumberPartitioner extends ScoobiApp {
+ val fileName = "output-dir/all-ints.txt"
- def main(args: Array[String]) = withHadoopArgs(args) { _ =>
+ // Write 50 (new line seperated) ints to a file. We do this to make the example self contained
+ generateInts(fileName, 50)
- if (!new File("output-dir").mkdir) {
- sys.error("Could not make output-dir for results. Perhaps it already exists (and you should delete/rename the old one)")
- }
+ // fromTextFile creates a list of Strings, where each String is a line
+ val data : DList[String] = fromTextFile(fileName);
- val fileName = "output-dir/all-ints.txt"
+ // since they're numbers, we can easily parse them
+ val intData : DList[Int] = data.map(_.toInt)
- // Write 50 (new line seperated) ints to a file. We do this to make the example self contained
- generateInts(fileName, 50)
+ // Now we can parition this data into two lists, one where they're even one where they're odd
+ val (evens, odds) = intData.partition(_ % 2 == 0)
- // fromTextFile creates a list of Strings, where each String is a line
- val data : DList[String] = TextInput.fromTextFile(fileName);
-
- // since they're numbers, we can easily parse them
- val intData : DList[Int] = data.map(_.toInt)
-
- // Now we can parition this data into two lists, one where they're even one where they're odd
- val (evens, odds) = intData.partition(_ % 2 == 0)
-
- DList.persist (
- TextOutput.toTextFile(evens, "output-dir/evens"),
- TextOutput.toTextFile(odds, "output-dir/odds")
- )
- }
+ DList.persist (
+ TextOutput.toTextFile(evens, "output-dir/evens"),
+ TextOutput.toTextFile(odds, "output-dir/odds")
+ )
private def generateInts(filename: String, count: Int) {
val fstream = new FileWriter(filename)
Oops, something went wrong.

0 comments on commit f6eefe1

Please sign in to comment.