In [1]:
spark

Waiting for a Spark session to start...

In [2]:
sc.getConf.getAll.filter(_._2.contains("/proxy/"))(0)._2

Waiting for a Spark session to start...

http://rm01.itversity.com:19288/proxy/application_1540458187951_76586

In [3]:
def getType(o: Any) = o.getClass.getCanonicalName

getType: (o: Any)String


In [4]:
val os_name = System.getProperty("os.name")
val hdfs_home = "/user/" + System.getenv("HOME").split("/")(2)

os_name = Linux
hdfs_home = /user/kranthidr


/user/kranthidr

In [5]:
val path = hdfs_home+"/dataSets/spark-guide/retail-data/all/*.csv"

path = /user/kranthidr/dataSets/spark-guide/retail-data/all/*.csv


/user/kranthidr/dataSets/spark-guide/retail-data/all/*.csv

In [6]:
// in Scala
val df = spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load(path)
  .coalesce(5)

df = [InvoiceNo: string, StockCode: string ... 6 more fields]


[InvoiceNo: string, StockCode: string ... 6 more fields]

In [7]:
df.cache()
df.createOrReplaceTempView("dfTable")

In [8]:
// COMMAND ----------

df.count() == 541909

true

In [9]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.count
df.select(count("StockCode")).show() // 541909

+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+



In [10]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.countDistinct
df.select(countDistinct("StockCode")).show() // 4070

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



In [11]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show() // 3364

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



In [12]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.{first, last}
df.select(first("StockCode"), last("StockCode")).show()

+-----------------------+----------------------+
|first(StockCode, false)|last(StockCode, false)|
+-----------------------+----------------------+
|                  21544|                85049D|
+-----------------------+----------------------+



In [13]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.{min, max}
df.select(min("Quantity"), max("Quantity")).show()

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



In [14]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.sum
df.select(sum("Quantity")).show() // 5176450

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



In [15]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.sumDistinct
df.select(sumDistinct("Quantity")).show() // 29310

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



In [16]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.{sum, count, avg, expr}

df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()

+--------------------------------------+----------------+----------------+
|(total_purchases / total_transactions)|   avg_purchases|  mean_purchases|
+--------------------------------------+----------------+----------------+
|                      9.55224954743324|9.55224954743324|9.55224954743324|
+--------------------------------------+----------------+----------------+



In [17]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.{var_pop, stddev_pop}
import org.apache.spark.sql.functions.{var_samp, stddev_samp}
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()

+------------------+------------------+--------------------+---------------------+
| var_pop(Quantity)|var_samp(Quantity)|stddev_pop(Quantity)|stddev_samp(Quantity)|
+------------------+------------------+--------------------+---------------------+
|47559.303646609354| 47559.39140929905|  218.08095663447864|   218.08115785023486|
+------------------+------------------+--------------------+---------------------+



In [18]:
// COMMAND ----------

import org.apache.spark.sql.functions.{skewness, kurtosis}
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

+--------------------+------------------+
|  skewness(Quantity)|kurtosis(Quantity)|
+--------------------+------------------+
|-0.26407557610527843|119768.05495536518|
+--------------------+------------------+



In [19]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.{corr, covar_pop, covar_samp}
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity")).show()

+-------------------------+-------------------------------+------------------------------+
|corr(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|
+-------------------------+-------------------------------+------------------------------+
|     4.912186085637639E-4|             1052.7280543913773|            1052.7260778752732|
+-------------------------+-------------------------------+------------------------------+



In [20]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.{collect_set, collect_list}
df.agg(collect_set("Country"), collect_list("Country")).show()

+--------------------+---------------------+
|collect_set(Country)|collect_list(Country)|
+--------------------+---------------------+
|[Portugal, Italy,...| [United Kingdom, ...|
+--------------------+---------------------+



In [21]:
// COMMAND ----------

df.groupBy("InvoiceNo", "CustomerId").count().show()

+---------+----------+-----+
|InvoiceNo|CustomerId|count|
+---------+----------+-----+
|   563017|     13198|   32|
|   563214|     16370|   71|
|   563372|     15653|    4|
|   563714|     14565|   31|
|   564666|     12492|    9|
|  C565313|     14527|    1|
|   565318|     12921|   23|
|  C565962|     14410|   11|
|   566023|     12955|   18|
|   566079|     17593|   37|
|   566904|     15660|   19|
|   567476|     14859|   32|
|  C567643|     12409|    1|
|   568330|     16468|    2|
|   568509|     16422|    4|
|  C568911|     15050|    1|
|   569105|     16729|   49|
|   569211|     15774|   25|
|   569388|     14031|    1|
|   570179|     17509|   54|
+---------+----------+-----+
only showing top 20 rows



In [22]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.count

df.groupBy("InvoiceNo").agg(
  count("Quantity").alias("quan"),
  expr("count(Quantity)")).show()

+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   536596|   6|              6|
|   536938|  14|             14|
|   537252|   1|              1|
|   537691|  20|             20|
|   538041|   1|              1|
|   538184|  26|             26|
|   538517|  53|             53|
|   538879|  19|             19|
|   539275|   6|              6|
|   539630|  12|             12|
|   540499|  24|             24|
|   540540|  22|             22|
|  C540850|   1|              1|
|   540976|  48|             48|
|   541432|   4|              4|
|   541518| 101|            101|
|   541783|  35|             35|
|   542026|   9|              9|
|   542375|   6|              6|
|  C542604|   8|              8|
+---------+----+---------------+
only showing top 20 rows



In [23]:
// COMMAND ----------

// in Scala
df.groupBy("InvoiceNo").agg("Quantity"->"avg", "Quantity"->"stddev_pop").show()

+---------+------------------+--------------------+
|InvoiceNo|     avg(Quantity)|stddev_pop(Quantity)|
+---------+------------------+--------------------+
|   563020|16.041666666666668|  10.212244148841895|
|   565747|              10.3|   3.163858403911275|
|   566248|               9.0|   8.703447592764606|
|   566431| 14.11111111111111|   6.911254017815104|
|   567163|              14.5|  11.280514172678478|
|   567695|             -42.0|                 0.0|
|   567879|10.206896551724139|  6.4774372877616235|
|   568222|10.636363636363637|   2.705977466570403|
|   568711|             15.25|   9.575359001102779|
|   569020|11.541666666666666|   8.467974800518848|
|   569560|            10.625|  3.7893765978060294|
|   569823|1.4782608695652173|  0.9869980199409517|
|   570234|3.5833333333333335|   2.542691050398726|
|   570264|             -22.0|                 0.0|
|   570281|              48.0|                 0.0|
|   570592| 7.438356164383562|  11.910610328345681|
|   571010| 

In [24]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.{col, to_date}
val dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"),
  "MM/d/yyyy H:mm"))

dfWithDate = [InvoiceNo: string, StockCode: string ... 7 more fields]


[InvoiceNo: string, StockCode: string ... 7 more fields]

In [25]:
dfWithDate.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|      date|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|2010-12-01|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|2010-12-01|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|U

In [26]:
dfWithDate.createOrReplaceTempView("dfWithDate")

In [27]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.col
val windowSpec = Window
  .partitionBy("CustomerId", "date")
  .orderBy(col("Quantity").desc)
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)

windowSpec = org.apache.spark.sql.expressions.WindowSpec@1e336cf8


org.apache.spark.sql.expressions.WindowSpec@1e336cf8

In [28]:
// COMMAND ----------

import org.apache.spark.sql.functions.max
val maxPurchaseQuantity = max(col("Quantity")).over(windowSpec)

maxPurchaseQuantity = max(Quantity) OVER (PARTITION BY CustomerId, date ORDER BY Quantity DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)


max(Quantity) OVER (PARTITION BY CustomerId, date ORDER BY Quantity DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)

In [29]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.{dense_rank, rank}
val purchaseDenseRank = dense_rank().over(windowSpec)
val purchaseRank = rank().over(windowSpec)

purchaseDenseRank = DENSE_RANK() OVER (PARTITION BY CustomerId, date ORDER BY Quantity DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
purchaseRank = RANK() OVER (PARTITION BY CustomerId, date ORDER BY Quantity DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)


RANK() OVER (PARTITION BY CustomerId, date ORDER BY Quantity DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)

In [30]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.col

dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")
  .select(
    col("CustomerId"),
    col("date"),
    col("Quantity"),
    purchaseRank.alias("quantityRank"),
    purchaseDenseRank.alias("quantityDenseRank"),
    maxPurchaseQuantity.alias("maxPurchaseQuantity")).show()

+----------+----------+--------+------------+-----------------+-------------------+
|CustomerId|      date|Quantity|quantityRank|quantityDenseRank|maxPurchaseQuantity|
+----------+----------+--------+------------+-----------------+-------------------+
|     12346|2011-01-18|   74215|           1|                1|              74215|
|     12346|2011-01-18|  -74215|           2|                2|              74215|
|     12347|2010-12-07|      36|           1|                1|                 36|
|     12347|2010-12-07|      30|           2|                2|                 36|
|     12347|2010-12-07|      24|           3|                3|                 36|
|     12347|2010-12-07|      12|           4|                4|                 36|
|     12347|2010-12-07|      12|           4|                4|                 36|
|     12347|2010-12-07|      12|           4|                4|                 36|
|     12347|2010-12-07|      12|           4|                4|             

In [31]:
// COMMAND ----------

// in Scala
val dfNoNull = dfWithDate.drop()
dfNoNull.createOrReplaceTempView("dfNoNull")

dfNoNull = [InvoiceNo: string, StockCode: string ... 7 more fields]


[InvoiceNo: string, StockCode: string ... 7 more fields]

In [32]:
// COMMAND ----------

val rolledUpDF = dfNoNull.rollup("Date", "Country").agg(sum("Quantity"))
  .selectExpr("Date", "Country", "`sum(Quantity)` as total_quantity")
  .orderBy("Date")
rolledUpDF.show()

+----------+--------------+--------------+
|      Date|       Country|total_quantity|
+----------+--------------+--------------+
|      null|          null|       5176450|
|2010-12-01|        France|           449|
|2010-12-01|          EIRE|           243|
|2010-12-01|     Australia|           107|
|2010-12-01|       Germany|           117|
|2010-12-01|United Kingdom|         23949|
|2010-12-01|        Norway|          1852|
|2010-12-01|   Netherlands|            97|
|2010-12-01|          null|         26814|
|2010-12-02|          EIRE|             4|
|2010-12-02|          null|         21023|
|2010-12-02|       Germany|           146|
|2010-12-02|United Kingdom|         20873|
|2010-12-03|      Portugal|            65|
|2010-12-03|        Poland|           140|
|2010-12-03|         Italy|           164|
|2010-12-03|       Belgium|           528|
|2010-12-03|         Spain|           400|
|2010-12-03|        France|           239|
|2010-12-03|          null|         14830|
+----------

rolledUpDF = [Date: date, Country: string ... 1 more field]


[Date: date, Country: string ... 1 more field]

In [33]:
// COMMAND ----------

rolledUpDF.where("Country IS NULL").show()

+----------+-------+--------------+
|      Date|Country|total_quantity|
+----------+-------+--------------+
|      null|   null|       5176450|
|2010-12-01|   null|         26814|
|2010-12-02|   null|         21023|
|2010-12-03|   null|         14830|
|2010-12-05|   null|         16395|
|2010-12-06|   null|         21419|
|2010-12-07|   null|         24995|
|2010-12-08|   null|         22741|
|2010-12-09|   null|         18431|
|2010-12-10|   null|         20297|
|2010-12-12|   null|         10565|
|2010-12-13|   null|         17623|
|2010-12-14|   null|         20098|
|2010-12-15|   null|         18229|
|2010-12-16|   null|         29632|
|2010-12-17|   null|         16069|
|2010-12-19|   null|          3795|
|2010-12-20|   null|         14965|
|2010-12-21|   null|         15467|
|2010-12-22|   null|          3192|
+----------+-------+--------------+
only showing top 20 rows



In [34]:
// COMMAND ----------

rolledUpDF.where("Date IS NULL").show()

+----+-------+--------------+
|Date|Country|total_quantity|
+----+-------+--------------+
|null|   null|       5176450|
+----+-------+--------------+



In [35]:
// COMMAND ----------

// in Scala
dfNoNull.cube("Date", "Country").agg(sum(col("Quantity")))
  .select("Date", "Country", "sum(Quantity)").orderBy("Date").show()

+----+--------------------+-------------+
|Date|             Country|sum(Quantity)|
+----+--------------------+-------------+
|null|               Japan|        25218|
|null|            Portugal|        16180|
|null|                 RSA|          352|
|null|                null|      5176450|
|null|           Australia|        83653|
|null|         Unspecified|         3300|
|null|             Germany|       117448|
|null|              Cyprus|         6317|
|null|United Arab Emirates|          982|
|null|           Hong Kong|         4769|
|null|                 USA|         1034|
|null|           Singapore|         5234|
|null|     Channel Islands|         9479|
|null|               Spain|        26824|
|null|             Denmark|         8188|
|null|      Czech Republic|          592|
|null|             Finland|        10666|
|null|  European Community|          497|
|null|              Norway|        19247|
|null|             Lebanon|          386|
+----+--------------------+-------

In [None]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.functions.{grouping_id, sum, expr}

In [44]:
dfNoNull.cube("CustomerId", "StockCode").agg(expr("grouping_id()"), sum(col("Quantity")))
  .select("grouping_id()","CustomerId", "StockCode", "sum(Quantity)").show()

+-------------+----------+---------+-------------+
|grouping_id()|CustomerId|StockCode|sum(Quantity)|
+-------------+----------+---------+-------------+
|            0|     17809|    22961|           72|
|            0|     16098|    22726|           40|
|            0|     17548|    22553|          -24|
|            0|     14307|    22736|           10|
|            0|     17908|    21811|            1|
|            0|     14729|    21427|            2|
|            0|     12433|    22315|           24|
|            0|     14594|    21143|            3|
|            0|     16928|    22774|          138|
|            0|     17855|    22037|           12|
|            0|     18041|    22196|           30|
|            0|     18041|    21328|            2|
|            0|     16244|    22352|            6|
|            0|     14449|    22147|            8|
|            0|     16781|   82494L|            6|
|            0|     17581|    22961|           24|
|            0|     17838|   79

In [46]:
dfNoNull.cube("customerId", "stockCode").agg(expr("grouping_id()"), sum("Quantity"))
.orderBy($"grouping_id()".desc)
.show()

+----------+---------+-------------+-------------+
|customerId|stockCode|grouping_id()|sum(Quantity)|
+----------+---------+-------------+-------------+
|      null|     null|            3|      5176450|
|      null|    22275|            2|           69|
|      null|    22295|            2|         2795|
|      null|    21201|            2|          849|
|      null|    22522|            2|          795|
|      null|   84804B|            2|            3|
|      null|   84931A|            2|          135|
|      null|    22919|            2|         1745|
|      null|    22207|            2|         1259|
|      null|   51014C|            2|         2505|
|      null|    21676|            2|          257|
|      null|    23217|            2|         1309|
|      null|    23401|            2|          456|
|      null|    22265|            2|          540|
|      null|    23630|            2|            1|
|      null|   90059E|            2|           19|
|      null|    21946|         

lastException: Throwable = null


In [47]:
// COMMAND ----------

// in Scala
val pivoted = dfWithDate.groupBy("date").pivot("Country").sum()

pivoted = [date: date, Australia_sum(Quantity): bigint ... 113 more fields]


[date: date, Australia_sum(Quantity): bigint ... 113 more fields]

In [48]:
// COMMAND ----------

pivoted.where("date > '2011-12-05'").select("date" ,"`USA_sum(Quantity)`").show()

+----------+-----------------+
|      date|USA_sum(Quantity)|
+----------+-----------------+
|2011-12-06|             null|
|2011-12-09|             null|
|2011-12-08|             -196|
|2011-12-07|             null|
+----------+-----------------+



In [49]:
// COMMAND ----------

// in Scala
import org.apache.spark.sql.expressions.MutableAggregationBuffer
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

class BoolAnd extends UserDefinedAggregateFunction {
  def inputSchema: org.apache.spark.sql.types.StructType =
    StructType(StructField("value", BooleanType) :: Nil)
  def bufferSchema: StructType = StructType(
    StructField("result", BooleanType) :: Nil
  )
  def dataType: DataType = BooleanType
  def deterministic: Boolean = true
  def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = true
  }
  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getAs[Boolean](0) && input.getAs[Boolean](0)
  }
  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getAs[Boolean](0) && buffer2.getAs[Boolean](0)
  }
  def evaluate(buffer: Row): Any = {
    buffer(0)
  }
}

defined class BoolAnd


In [50]:
// COMMAND ----------

// in Scala
val ba = new BoolAnd
spark.udf.register("booland", ba)
import org.apache.spark.sql.functions._
spark.range(1)
  .selectExpr("explode(array(TRUE, TRUE, TRUE)) as t")
  .selectExpr("explode(array(TRUE, FALSE, TRUE)) as f", "t")
  .select(ba(col("t")), expr("booland(f)"))
  .show()


// COMMAND ----------

+----------+----------+
|booland(t)|booland(f)|
+----------+----------+
|      true|     false|
+----------+----------+



ba = BoolAnd@75677d8f


$line111.$read$$iw$$iw$BoolAnd@75677d8f

In [51]:
// in Scala
import org.apache.spark.sql.expressions.MutableAggregationBuffer
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

class BoolAnd extends org.apache.spark.sql.expressions.UserDefinedAggregateFunction {
    
  def inputSchema: StructType =
    StructType(StructField("value", BooleanType) :: Nil)
  def bufferSchema: StructType = StructType(
    StructField("result", BooleanType) :: Nil
  )
  def dataType: DataType = BooleanType
  def deterministic: Boolean = true
  def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = true
  }
  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getAs[Boolean](0) && input.getAs[Boolean](0)
  }
  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getAs[Boolean](0) && buffer2.getAs[Boolean](0)
  }
  def evaluate(buffer: Row): Any = {
    buffer(0)
  }
}

defined class BoolAnd


In [52]:
// COMMAND ----------
val ba = new BoolAnd()
spark.udf.register("booland", ba)

ba = BoolAnd@24548460


$line115.$read$$iw$$iw$BoolAnd@24548460

In [53]:
val df1 = spark.range(1)

df1 = [id: bigint]


[id: bigint]

In [54]:
df1.show()

+---+
| id|
+---+
|  0|
+---+



In [55]:
val df2 = df1.selectExpr("id","explode(array(TRUE, TRUE, TRUE)) as t")

df2 = [id: bigint, t: boolean]


[id: bigint, t: boolean]

In [56]:
df2.show()

+---+----+
| id|   t|
+---+----+
|  0|true|
|  0|true|
|  0|true|
+---+----+



In [57]:
val df3 = df2.selectExpr("id","t","explode(array(TRUE, FALSE, TRUE)) as f")

df3 = [id: bigint, t: boolean ... 1 more field]


[id: bigint, t: boolean ... 1 more field]

In [58]:
df3.show()

+---+----+-----+
| id|   t|    f|
+---+----+-----+
|  0|true| true|
|  0|true|false|
|  0|true| true|
|  0|true| true|
|  0|true|false|
|  0|true| true|
|  0|true| true|
|  0|true|false|
|  0|true| true|
+---+----+-----+



In [59]:
val df4 =  df3.select(ba(col("t")), expr("booland(f)"))

df4 = [booland(t): boolean, booland(f): boolean]


[booland(t): boolean, booland(f): boolean]

In [60]:
df4.show()
// COMMAND ----------

+----------+----------+
|booland(t)|booland(f)|
+----------+----------+
|      true|     false|
+----------+----------+

