In [2]:
import org.apache.spark.sql.functions.avg
import org.apache.spark.sql.SparkSession


import org.apache.spark.sql.functions.avg
import org.apache.spark.sql.SparkSession


In [3]:
// Build the SparkSession for this notebook (application name "Chapter6");
// it is the entry point used by every DataFrame/Dataset call below
val spark = SparkSession.builder().appName("Chapter6").getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@492a6e56


In [4]:
// A distributed Dataset[Bloggers] needs a Scala case class whose fields
// describe the individual fields of each record
case class Bloggers(
  id: BigInt,
  first: String,
  last: String,
  url: String,
  published: String,
  hits: BigInt,
  campaigns: Array[String]
)

// Location of the JSON data source
val bloggers = "C:/Users/mariajose.chinchilla/OneDrive - Bosonit/Escritorio/Bosonit/Spark/datarepositorio/databricks-datasets/learning-spark-v2/blogs.json"

// Read the JSON file and convert the result into a typed Dataset[Bloggers]
val bloggersDS = spark.read
  .format("json")
  .load(bloggers)
  .as[Bloggers]

defined class Bloggers
bloggers: String = C:/Users/mariajose.chinchilla/OneDrive - Bosonit/Escritorio/Bosonit/Spark/datarepositorio/databricks-datasets/learning-spark-v2/blogs.json
bloggersDS: org.apache.spark.sql.Dataset[Bloggers] = [Campaigns: array<string>, First: string ... 5 more fields]


In [5]:
// Generate sample data as Scala objects with three fields:
//   uid   - unique ID for a user
//   uname - randomly generated username string
//   usage - minutes of server or service usage

import scala.util.Random._

// Our case class for the Dataset
case class Usage(uid: Int, uname: String, usage: Int)

// Seeded generator, so the generated names and usage values are the same on every run
val r = new scala.util.Random(42)

// Create 1000 instances of the Usage class (uid 0 to 999), generated on the fly.
// `until` excludes the upper bound, so this yields exactly 1000 records
// (`0 to 1000` is inclusive and would yield 1001).
val data = for (i <- 0 until 1000)
  yield Usage(i, "user-" + r.alphanumeric.take(5).mkString(""), r.nextInt(1000))

// Create a Dataset of Usage typed data and preview the first 10 rows
val dsUsage = spark.createDataset(data)
dsUsage.show(10)

+---+----------+-----+
|uid|     uname|usage|
+---+----------+-----+
|  0|user-Gpi2C|  525|
|  1|user-DgXDi|  502|
|  2|user-M66yO|  170|
|  3|user-xTOn6|  913|
|  4|user-3xGSz|  246|
|  5|user-2aWRN|  727|
|  6|user-EzZY1|   65|
|  7|user-ZlZMZ|  935|
|  8|user-VjxeG|  756|
|  9|user-iqf1P|    3|
+---+----------+-----+
only showing top 10 rows



import scala.util.Random._
defined class Usage
r: scala.util.Random = scala.util.Random@1a3c2840
data: scala.collection.immutable.IndexedSeq[Usage] = Vector(Usage(0,user-Gpi2C,525), Usage(1,user-DgXDi,502), Usage(2,user-M66yO,170), Usage(3,user-xTOn6,913), Usage(4,user-3xGSz,246), Usage(5,user-2aWRN,727), Usage(6,user-EzZY1,65), Usage(7,user-ZlZMZ,935), Usage(8,user-VjxeG,756), Usage(9,user-iqf1P,3), Usage(10,user-91S1q,794), Usage(11,user-qHNj0,501), Usage(12,user-7hb94,460), Usage(13,user-bz0WF,142), Usage(14,user-71nwy,479), Usage(15,user-7GZz1,823), Usage(16,user-1CSk6,140), Usage(17,user-WPzlL,246), Usage(18,user-VaEit,451), Usage(19,user-PSaRq,679), Usage(20,user-0Kkzu,332), Usage(21,user-UN3MG,172), Usage(22,user-KwwER,442), Usage(23,user-ZnltJ,923), Usage(24,user-IRA17,741), ...


In [None]:
// Goal: return all the users in the dsUsage Dataset whose usage exceeds 900 minutes.

// First approach: hand a functional expression (a lambda) straight to filter()
import org.apache.spark.sql.functions._
dsUsage
  .filter((d: Usage) => d.usage > 900)
  .orderBy(col("usage").desc)
  .show(5, truncate = false)

// Here the lambda d => d.usage > 900 is the predicate argument of filter()

In [None]:
// Second approach: define a named predicate and supply it as the argument to filter()

/** Returns true when the user's usage exceeds 900 minutes. */
def filterWithUsage(u: Usage): Boolean = u.usage > 900

dsUsage
  .filter((u: Usage) => filterWithUsage(u))
  .orderBy(desc("usage"))
  .show(5)

// With either approach, filter() applies the predicate to each Usage object
// in the distributed Dataset

In [None]:
// Goal: compute the usage cost for each user, applying a special price per minute
// to users whose usage is over a certain threshold

// Inline if-then-else lambda: 0.15 per minute above 750 minutes, 0.50 otherwise
dsUsage
  .map { u =>
    val rate = if (u.usage > 750) 0.15 else 0.50
    u.usage * rate
  }
  .show(5, truncate = false)
/** Computes the cost of `usage` minutes.
  *
  * @param usage minutes of usage
  * @return `usage` multiplied by 0.15 when it is above 750, otherwise by 0.50
  */
def computeCostUsage(usage: Int): Double = {
  val ratePerMinute = if (usage > 750) 0.15 else 0.50
  usage * ratePerMinute
}
// Same computation, but passing the named function to map() instead of an inline lambda
dsUsage
  .map(u => computeCostUsage(u.usage))
  .show(5, truncate = false)

In [None]:
//Although we have computed values for the cost of usage, we don’t know which users the computed values are associated with. How do we get this information?

//1. Create a Scala case class or JavaBean class, UsageCost, with an additional field or column named cost.
//2. Define a function to compute the cost and use it in the map() method.

In [None]:
// Same fields as Usage, plus the computed cost
case class UsageCost(uid: Int, uname: String, usage: Int, cost: Double)

/** Builds a UsageCost from a Usage record.
  *
  * @param u the usage record to price
  * @return a UsageCost carrying u's fields plus the cost: usage * 0.15 when
  *         usage is above 750, otherwise usage * 0.50
  */
def computeUserCostUsage(u: Usage): UsageCost = {
  val rate = if (u.usage > 750) 0.15 else 0.50
  UsageCost(uid = u.uid, uname = u.uname, usage = u.usage, cost = u.usage * rate)
}

// Apply the function to every record of the original Dataset
dsUsage.map(computeUserCostUsage(_)).show(5)

// The result is a transformed Dataset that keeps all the original columns and adds
// a new column, cost, computed by the function used in the map() transformation

In [None]:
// Converting an existing DataFrame to a Dataset of type SomeCaseClass is done with
// .as[SomeCaseClass]; here the JSON file is read and converted to Dataset[Bloggers]
val bloggersDS = spark.read
  .format("json")
  .load("/FileStore/tables/blogs-1.json")
  .as[Bloggers]

In [None]:
// Case class describing the records of a Dataset of type Person
case class Person(
  id: Integer,
  firstName: String,
  middleName: String,
  lastName: String,
  gender: String,
  birthDate: String,
  ssn: String,
  salary: String
)


In [None]:
//Examine a case where we compose a query inefficiently:

/*

import java.util.Calendar
val earliestYear = Calendar.getInstance.get(Calendar.YEAR) - 40
personDS
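 // Everyone above 40: lambda-1
 // NOTE(review): `> earliestYear` keeps people born AFTER that year (i.e. younger than 40) - confirm the intended direction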
 .filter(x => x.birthDate.split("-")(0).toInt > earliestYear)
 
 // Everyone earning more than 80K
 .filter($"salary" > 80000)

// Last name starts with J: lambda-2
 .filter(x => x.lastName.startsWith("J"))
 
 // First name starts with D
 .filter($"firstName".startsWith("D"))
 .count()

*/

In [None]:
//The following query uses only DSL and no lambdas
/*
personDS
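 // NOTE(review): `> earliestYear` keeps people born AFTER that year (i.e. younger than 40) - confirm the intended direction
 .filter(year($"birthDate") > earliestYear) // Everyone above 40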
 .filter($"salary" > 80000) // Everyone earning more than 80K
 .filter($"lastName".startsWith("J")) // Last name starts with J
 .filter($"firstName".startsWith("D")) // First name starts with D
 .count()

*/