In [None]:
// Bare expression: evaluates the `spark` value, which is not defined in this
// file (presumably the SparkSession injected by the notebook kernel — confirm).
spark

In [None]:
// Find the first Spark configuration value that contains "/proxy/"
// (presumably the proxied application UI address — confirm for your cluster).
// collectFirst avoids the ArrayIndexOutOfBoundsException that indexing an
// empty filter result with (0) would raise when no such entry exists.
sc.getConf.getAll
  .collectFirst { case (_, value) if value.contains("/proxy/") => value }
  .getOrElse("no Spark configuration value contains \"/proxy/\"")

In [None]:
/** Returns a printable class name for `o`.
  *
  * @param o any value, including `null`
  * @return the canonical class name of `o`; the binary class name when the
  *         class has no canonical name (anonymous or local classes); or the
  *         string "null" when `o` is `null`
  */
def getType(o: Any): String =
  Option(o).fold("null") { value =>
    val cls = value.getClass
    // getCanonicalName returns null for anonymous/local classes.
    Option(cls.getCanonicalName).getOrElse(cls.getName)
  }

In [None]:
// Name of the operating system the driver JVM runs on.
val os_name = System.getProperty("os.name")

// Per-user directory "/user/<name>". The name is the third "/"-separated
// segment of $HOME (e.g. "/home/alice" -> "alice"). When HOME is unset or has
// fewer segments (e.g. "/root"), fall back to the JVM's user.name property
// instead of failing with a NullPointerException / index error.
val hdfs_home = {
  val nameFromHome = Option(System.getenv("HOME"))
    .map(_.split("/"))
    .flatMap(_.lift(2))
    .filter(_.nonEmpty)
  val userName = nameFromHome.getOrElse(System.getProperty("user.name"))
  "/user/" + userName
}

In [None]:
// Input and output locations, all rooted at hdfs_home.
// NOTE(review): the inputs are assumed to live in one sub-directory per format
// (csv/, json/, parquet/, orc/) under flight-data; the previous paths pointed
// every format at csv/, which looks like a copy-paste slip — confirm against
// the actual directory layout.
val flight_data_home = hdfs_home + "/dataSets/spark-guide/flight-data"
val temp_home = hdfs_home + "/temp"

val csv_read = flight_data_home + "/csv/2010-summary.csv"
// The CSV output previously carried a ".tsv" extension.
val csv_write = temp_home + "/my-csv-file.csv"
val tsv_write = temp_home + "/my-tsv-file.tsv"

val json_read = flight_data_home + "/json/2010-summary.json"
val json_write = temp_home + "/my-json-file.json"

val par_read = flight_data_home + "/parquet/2010-summary.parquet"
val par_write = temp_home + "/my-parquet-file.parquet"

val orc_read = flight_data_home + "/orc/2010-summary.orc"
val orc_write = temp_home + "/my-orc-file.orc"

In [None]:
// COMMAND ----------

// Load the CSV input with a header row, an inferred schema, and FAILFAST
// parse mode.
val csv_df = spark.read
  .format("csv")
  .options(Map(
    "header" -> "true",
    "mode" -> "FAILFAST",
    "inferSchema" -> "true"))
  .load(csv_read)

In [None]:
// Print the first three rows of the CSV DataFrame.
csv_df.show(numRows = 3)

In [None]:
// COMMAND ----------

// Write csv_df back out as CSV, replacing any previous output.
// The save mode must be set with DataFrameWriter.mode(...); passing it as
// .option("mode", ...) does not change the save mode, so the write would
// still fail when the target path already exists.
csv_df.write
  .format("csv")
  .mode("overwrite")
  .option("dateFormat", "yyyy-MM-dd")
  .option("path", csv_write)
  .save()

In [None]:
// COMMAND ----------

import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}

// Explicit schema for the flight summary: two nullable string columns and a
// non-nullable long column.
val myManualSchema = StructType(Seq(
  StructField("DEST_COUNTRY_NAME", StringType, nullable = true),
  StructField("ORIGIN_COUNTRY_NAME", StringType, nullable = true),
  StructField("count", LongType, nullable = false)
))

// Read the CSV input with that schema (no inference) and print five rows.
spark.read
  .format("csv")
  .options(Map("header" -> "true", "mode" -> "FAILFAST"))
  .schema(myManualSchema)
  .load(csv_read)
  .show(5)

In [None]:
// COMMAND ----------

// Deliberately mismatched schema: the two country-name columns are declared as
// LongType. It gets its own name so that it no longer replaces the valid
// myManualSchema that the later FAILFAST reads (csvFile, json_df) depend on.
val mismatchedSchema = new StructType(Array(
  new StructField("DEST_COUNTRY_NAME", LongType, true),
  new StructField("ORIGIN_COUNTRY_NAME", LongType, true),
  new StructField("count", LongType, false)
))

// Kept commented out: running this read fails with the exception quoted below.
// spark.read.format("csv")
//   .option("header", "true")
//   .option("mode", "FAILFAST")
//   .schema(mismatchedSchema)
//   .load(csv_read)
//   .take(5)

//org.apache.spark.SparkException: Malformed records are detected in record parsing. Parse Mode: FAILFAST.

In [None]:
// COMMAND ----------

// CSV DataFrame (explicit schema, FAILFAST) reused by the write examples
// further down.
val csvFile = spark.read
  .format("csv")
  .options(Map("header" -> "true", "mode" -> "FAILFAST"))
  .schema(myManualSchema)
  .load(csv_read)

In [None]:
// COMMAND ----------

// Write csvFile as tab-separated text, replacing any previous output.
csvFile.write
  .mode("overwrite")
  .option("sep", "\t")
  .csv(tsv_write)

In [None]:
// COMMAND ----------

//spark.read.format("json")


// COMMAND ----------

// Read the JSON input with the explicit schema in FAILFAST mode and print
// five rows.
val json_df = spark.read.format("json").option("mode", "FAILFAST").schema(myManualSchema)
  .load(json_read)
// Previously called show on `json`, a name that is never defined.
json_df.show(5)

In [None]:
// COMMAND ----------

// Write json_df as JSON, replacing any previous output.
json_df.write
  .mode("overwrite")
  .json(json_write)

In [None]:
// COMMAND ----------

// Read the Parquet input and print five rows.
val par_df = spark.read.parquet(par_read)
par_df.show(numRows = 5)

In [None]:
// COMMAND ----------

// Write par_df as Parquet, replacing any previous output.
par_df.write
  .mode("overwrite")
  .parquet(par_write)

In [None]:
// COMMAND ----------

// Read the ORC input and print five rows.
val orc_df = spark.read.orc(orc_read)
orc_df.show(numRows = 5)

In [None]:
// COMMAND ----------

// Write orc_df as ORC, replacing any previous output.
orc_df.write
  .mode("overwrite")
  .orc(orc_write)

In [None]:
// COMMAND ----------

// Connection settings for the SQLite examples below.
val driver: String = "org.sqlite.JDBC"
val path: String = "/data/flight-data/jdbc/my-sqlite.db"
// JDBC URL built from the database file path.
val url: String = "jdbc:sqlite:/" + path
val tablename: String = "flight_info"


// COMMAND ----------

import java.sql.DriverManager

// Smoke-test the JDBC URL: open a connection, query its state, and always
// close it — even if the state check throws — so the handle is not leaked.
val connection = DriverManager.getConnection(url)
try connection.isClosed()
finally connection.close()


// COMMAND ----------

// DataFrame over the whole SQLite table, read through the JDBC source.
val dbDataFrame = spark.read
  .format("jdbc")
  .options(Map(
    "url" -> url,
    "dbtable" -> tablename,
    "driver" -> driver))
  .load()

In [None]:
// COMMAND ----------

// JDBC read against PostgreSQL.
// NOTE(review): server, table, user and password look like placeholders —
// replace them before running this cell.
val pgDF = spark.read
  .format("jdbc")
  .options(Map(
    "driver" -> "org.postgresql.Driver",
    "url" -> "jdbc:postgresql://database_server",
    "dbtable" -> "schema.tablename",
    "user" -> "username",
    "password" -> "my-secret-password"))
  .load()


// COMMAND ----------

// Show five distinct destination countries from the JDBC-backed DataFrame.
dbDataFrame
  .select("DEST_COUNTRY_NAME")
  .distinct()
  .show(5)


// COMMAND ----------

// Print the plan of the same distinct query.
dbDataFrame
  .select("DEST_COUNTRY_NAME")
  .distinct()
  .explain()


// COMMAND ----------

// Print the plan of a filter on two destination countries.
dbDataFrame
  .filter("DEST_COUNTRY_NAME in ('Anguilla', 'Sweden')")
  .explain()


// COMMAND ----------

// A parenthesised sub-query with an alias, passed where a table name is
// expected.
val pushdownQuery = "(SELECT DISTINCT(DEST_COUNTRY_NAME) FROM flight_info)\n  AS flight_info"

// Rebind dbDataFrame to the result of that sub-query.
val dbDataFrame = spark.read
  .format("jdbc")
  .options(Map(
    "url" -> url,
    "dbtable" -> pushdownQuery,
    "driver" -> driver))
  .load()

In [None]:
// COMMAND ----------

// Print the (non-extended) plan of the current dbDataFrame.
dbDataFrame.explain(extended = false)

In [None]:
// COMMAND ----------

// Re-read the table through JDBC with the numPartitions option set to 10.
val dbDataFrame = spark.read
  .format("jdbc")
  .options(Map(
    "url" -> url,
    "dbtable" -> tablename,
    "driver" -> driver,
    "numPartitions" -> "10"))
  .load()

In [None]:
// COMMAND ----------

// Show the distinct destination countries (default row limit of show()).
dbDataFrame
  .select("DEST_COUNTRY_NAME")
  .distinct()
  .show()

In [None]:
// COMMAND ----------

// JDBC connection properties carrying only the driver class.
val props = new java.util.Properties()
props.setProperty("driver", "org.sqlite.JDBC")

// One WHERE-clause predicate per country: rows where the country is either
// the destination or the origin.
val predicates = Array("Sweden", "Anguilla").map { country =>
  s"DEST_COUNTRY_NAME = '$country' OR ORIGIN_COUNTRY_NAME = '$country'"
}

In [None]:
// Build the predicate-partitioned JDBC DataFrame once and reuse it, instead of
// constructing the same relation twice.
val predicateDF = spark.read.jdbc(url, tablename, predicates, props)
predicateDF.show()
// Number of partitions of the resulting RDD.
predicateDF.rdd.getNumPartitions // 2

In [None]:
// COMMAND ----------

// Fresh JDBC connection properties carrying only the driver class.
val props = new java.util.Properties()
props.setProperty(
  "driver",
  "org.sqlite.JDBC")

In [None]:
// Predicates that are NOT disjoint: a row involving neither country satisfies
// both of them.
val predicates = Array("Sweden", "Anguilla").map { country =>
  s"DEST_COUNTRY_NAME != '$country' OR ORIGIN_COUNTRY_NAME != '$country'"
}
// Count the rows returned when reading with those predicates.
spark.read
  .jdbc(url, tablename, predicates, props)
  .count() // 510

In [None]:
// COMMAND ----------

// Parameters for the column-range-partitioned JDBC read in the next cell.
val colName: String = "count"
val lowerBound: Long = 0L
// this is the max count in our database
val upperBound: Long = 348113L
val numPartitions: Int = 10

In [None]:
// COMMAND ----------

// Count the table rows using a read partitioned on colName between the
// given bounds.
spark.read
  .jdbc(
    url,
    tablename,
    colName,
    lowerBound,
    upperBound,
    numPartitions,
    props)
  .count() // 255

In [None]:
// COMMAND ----------

// JDBC URL of a second SQLite database that the write examples target.
val newPath = "jdbc:sqlite://tmp/my-sqlite.db"

// Write csvFile into that database, replacing the table contents.
csvFile.write
  .mode("overwrite")
  .jdbc(newPath, tablename, props)

In [None]:
// COMMAND ----------

// Row count of the table after the overwrite.
spark.read
  .jdbc(newPath, tablename, props)
  .count() // 255

In [None]:
// COMMAND ----------

// Append csvFile's rows to the existing table.
csvFile.write
  .mode("append")
  .jdbc(newPath, tablename, props)

In [None]:
// COMMAND ----------

// Row count of the table after the append.
spark.read
  .jdbc(newPath, tablename, props)
  .count() // 765

In [None]:
// COMMAND ----------

// Read the CSV input as plain text lines and split each line on commas.
// Uses csv_read (rooted at hdfs_home) like the other reads in this notebook,
// instead of a separate hard-coded "/data/..." path.
// NOTE(review): assumes the old path was meant to be the same file — confirm.
spark.read.textFile(csv_read)
  .selectExpr("split(value, ',') as rows").show()

In [None]:
// COMMAND ----------

// Write the single string column as plain text. The save mode is set to
// overwrite, as in the other write examples, so that re-running the cell does
// not fail because the output path already exists.
csvFile.select("DEST_COUNTRY_NAME")
  .write
  .mode("overwrite")
  .text("/tmp/simple-text-file.txt")

In [None]:
// COMMAND ----------

// Write ten rows as text, partitioned by the "count" column. The save mode is
// set to overwrite, as in the other write examples, so that re-running the
// cell does not fail because the output path already exists.
csvFile.limit(10).select("DEST_COUNTRY_NAME", "count")
  .write
  .mode("overwrite")
  .partitionBy("count")
  .text("/tmp/five-csv-files2.csv")

In [None]:
// COMMAND ----------

// Save ten rows partitioned by destination country, replacing any previous
// output. No format is given, so the session's default data source is used.
csvFile
  .limit(10)
  .write
  .mode("overwrite")
  .partitionBy("DEST_COUNTRY_NAME")
  .save("/tmp/partitioned-files.parquet")

In [None]:
// COMMAND ----------

// Bucketing parameters: number of buckets and the column to bucket on.
val numberBuckets: Int = 10
val columnToBucketBy: String = "count"

// Save csvFile as the Parquet table "bucketedFiles", bucketed as configured
// above and replacing any existing table data.
csvFile.write
  .format("parquet")
  .mode("overwrite")
  .bucketBy(numberBuckets, columnToBucketBy)
  .saveAsTable("bucketedFiles")


// COMMAND ----------