In [13]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql._

val spark = SparkSession.builder()
.appName("types")
.getOrCreate()

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql._
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@438165ae


## Movies Data

In [14]:
val mv = spark.read
    .option("inferSchema", "true")
    .json("./dataset/movies.json")

mv.show(2)

+-------------+--------+-----------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+------+--------------------+------------+--------+---------------+
|Creative_Type|Director|Distributor|IMDB_Rating|IMDB_Votes|MPAA_Rating|Major_Genre|Production_Budget|Release_Date|Rotten_Tomatoes_Rating|Running_Time_min|Source|               Title|US_DVD_Sales|US_Gross|Worldwide_Gross|
+-------------+--------+-----------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+------+--------------------+------------+--------+---------------+
|         NULL|    NULL|   Gramercy|        6.1|      1071|          R|       NULL|          8000000|   12-Jun-98|                  NULL|            NULL|  NULL|      The Land Girls|        NULL|  146083|         146083|
|         NULL|    NULL|     Strand|        6.9|       207|          R|      Drama|           300000|    7-Aug-98|  

mv: org.apache.spark.sql.DataFrame = [Creative_Type: string, Director: string ... 14 more fields]


In [16]:
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")
val mv1 = mv.select(col("Title"), to_date(col("Release_Date"), "dd-MMM-yy").as("Actual_Release"))

mv1: org.apache.spark.sql.DataFrame = [Title: string, Actual_Release: date]


In [18]:
val mv2 = mv1
    .withColumn("Today", current_date())
    .withColumn("Now", current_timestamp())
    .withColumn("yearDiff", round(datediff(col("Today"), col("Actual_Release"))/365, 2))

mv2.show()

+--------------------+--------------+----------+--------------------+--------+
|               Title|Actual_Release|     Today|                 Now|yearDiff|
+--------------------+--------------+----------+--------------------+--------+
|      The Land Girls|    1998-06-12|2024-01-30|2024-01-30 05:27:...|   25.65|
|First Love, Last ...|    1998-08-07|2024-01-30|2024-01-30 05:27:...|    25.5|
|I Married a Stran...|    1998-08-28|2024-01-30|2024-01-30 05:27:...|   25.44|
|Let's Talk About Sex|    1998-09-11|2024-01-30|2024-01-30 05:27:...|    25.4|
|                Slam|    1998-10-09|2024-01-30|2024-01-30 05:27:...|   25.33|
| Mississippi Mermaid|    1999-01-15|2024-01-30|2024-01-30 05:27:...|   25.06|
|           Following|    1999-04-04|2024-01-30|2024-01-30 05:27:...|   24.84|
|             Foolish|    1999-04-09|2024-01-30|2024-01-30 05:27:...|   24.83|
|             Pirates|    1986-07-01|2024-01-30|2024-01-30 05:27:...|   37.61|
|     Duel in the Sun|    1946-12-31|2024-01-30|2024

mv2: org.apache.spark.sql.DataFrame = [Title: string, Actual_Release: date ... 3 more fields]


## Stock Data

In [22]:
val stock = spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("./dataset/stocks.csv")

stock: org.apache.spark.sql.DataFrame = [symbol: string, date: string ... 1 more field]


In [23]:
stock.show()

+------+----------+-----+
|symbol|      date|price|
+------+----------+-----+
|  MSFT|Jan 1 2000|39.81|
|  MSFT|Feb 1 2000|36.35|
|  MSFT|Mar 1 2000|43.22|
|  MSFT|Apr 1 2000|28.37|
|  MSFT|May 1 2000|25.45|
|  MSFT|Jun 1 2000|32.54|
|  MSFT|Jul 1 2000| 28.4|
|  MSFT|Aug 1 2000| 28.4|
|  MSFT|Sep 1 2000|24.53|
|  MSFT|Oct 1 2000|28.02|
|  MSFT|Nov 1 2000|23.34|
|  MSFT|Dec 1 2000|17.65|
|  MSFT|Jan 1 2001|24.84|
|  MSFT|Feb 1 2001| 24.0|
|  MSFT|Mar 1 2001|22.25|
|  MSFT|Apr 1 2001|27.56|
|  MSFT|May 1 2001|28.14|
|  MSFT|Jun 1 2001| 29.7|
|  MSFT|Jul 1 2001|26.93|
|  MSFT|Aug 1 2001|23.21|
+------+----------+-----+
only showing top 20 rows



In [25]:
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")
val st1 = stock.withColumn("to_date", to_date(col("date"), "MMM dd yyyy"))
st1.show(5)

+------+----------+-----+----------+
|symbol|      date|price|   to_date|
+------+----------+-----+----------+
|  MSFT|Jan 1 2000|39.81|2000-01-01|
|  MSFT|Feb 1 2000|36.35|2000-02-01|
|  MSFT|Mar 1 2000|43.22|2000-03-01|
|  MSFT|Apr 1 2000|28.37|2000-04-01|
|  MSFT|May 1 2000|25.45|2000-05-01|
+------+----------+-----+----------+
only showing top 5 rows



st1: org.apache.spark.sql.DataFrame = [symbol: string, date: string ... 2 more fields]


In [31]:
// struct

val st2 = st1.withColumn("struct", struct(col("symbol"), col("price"), col("to_date")))
st2.show(5, false)

+------+----------+-----+----------+-------------------------+
|symbol|date      |price|to_date   |struct                   |
+------+----------+-----+----------+-------------------------+
|MSFT  |Jan 1 2000|39.81|2000-01-01|{MSFT, 39.81, 2000-01-01}|
|MSFT  |Feb 1 2000|36.35|2000-02-01|{MSFT, 36.35, 2000-02-01}|
|MSFT  |Mar 1 2000|43.22|2000-03-01|{MSFT, 43.22, 2000-03-01}|
|MSFT  |Apr 1 2000|28.37|2000-04-01|{MSFT, 28.37, 2000-04-01}|
|MSFT  |May 1 2000|25.45|2000-05-01|{MSFT, 25.45, 2000-05-01}|
+------+----------+-----+----------+-------------------------+
only showing top 5 rows



st2: org.apache.spark.sql.DataFrame = [symbol: string, date: string ... 3 more fields]


In [32]:
val st3 = st2.withColumn("struct_price", col("struct").getField("price"))
st3.show(5, false)

+------+----------+-----+----------+-------------------------+------------+
|symbol|date      |price|to_date   |struct                   |struct_price|
+------+----------+-----+----------+-------------------------+------------+
|MSFT  |Jan 1 2000|39.81|2000-01-01|{MSFT, 39.81, 2000-01-01}|39.81       |
|MSFT  |Feb 1 2000|36.35|2000-02-01|{MSFT, 36.35, 2000-02-01}|36.35       |
|MSFT  |Mar 1 2000|43.22|2000-03-01|{MSFT, 43.22, 2000-03-01}|43.22       |
|MSFT  |Apr 1 2000|28.37|2000-04-01|{MSFT, 28.37, 2000-04-01}|28.37       |
|MSFT  |May 1 2000|25.45|2000-05-01|{MSFT, 25.45, 2000-05-01}|25.45       |
+------+----------+-----+----------+-------------------------+------------+
only showing top 5 rows



st3: org.apache.spark.sql.DataFrame = [symbol: string, date: string ... 4 more fields]


In [37]:
val st4 = st1.selectExpr("symbol", "price", "(symbol, price) as struct")

st4.show(5)

+------+-----+-------------+
|symbol|price|       struct|
+------+-----+-------------+
|  MSFT|39.81|{MSFT, 39.81}|
|  MSFT|36.35|{MSFT, 36.35}|
|  MSFT|43.22|{MSFT, 43.22}|
|  MSFT|28.37|{MSFT, 28.37}|
|  MSFT|25.45|{MSFT, 25.45}|
+------+-----+-------------+
only showing top 5 rows



st4: org.apache.spark.sql.DataFrame = [symbol: string, price: double ... 1 more field]


In [39]:
st4.selectExpr("struct", "struct.symbol as ss").show(5) 

+-------------+----+
|       struct|  ss|
+-------------+----+
|{MSFT, 39.81}|MSFT|
|{MSFT, 36.35}|MSFT|
|{MSFT, 43.22}|MSFT|
|{MSFT, 28.37}|MSFT|
|{MSFT, 25.45}|MSFT|
+-------------+----+
only showing top 5 rows



In [41]:
mv.show(5)

+--------------------+--------+-----------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+-------------------+--------------------+------------+--------+---------------+
|       Creative_Type|Director|Distributor|IMDB_Rating|IMDB_Votes|MPAA_Rating|Major_Genre|Production_Budget|Release_Date|Rotten_Tomatoes_Rating|Running_Time_min|             Source|               Title|US_DVD_Sales|US_Gross|Worldwide_Gross|
+--------------------+--------+-----------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+-------------------+--------------------+------------+--------+---------------+
|                NULL|    NULL|   Gramercy|        6.1|      1071|          R|       NULL|          8000000|   12-Jun-98|                  NULL|            NULL|               NULL|      The Land Girls|        NULL|  146083|         146083|
|                NULL|    NULL|     

In [45]:
val nameSplitter = split(col("Title"), " |,").as("Title_List")

val smv = mv.select(col("Title"), nameSplitter)

smv.show(5, false)

+--------------------------+--------------------------------+
|Title                     |Title_List                      |
+--------------------------+--------------------------------+
|The Land Girls            |[The, Land, Girls]              |
|First Love, Last Rites    |[First, Love, , Last, Rites]    |
|I Married a Strange Person|[I, Married, a, Strange, Person]|
|Let's Talk About Sex      |[Let's, Talk, About, Sex]       |
|Slam                      |[Slam]                          |
+--------------------------+--------------------------------+
only showing top 5 rows



nameSplitter: org.apache.spark.sql.Column = split(Title,  |,, -1) AS Title_List
smv: org.apache.spark.sql.DataFrame = [Title: string, Title_List: array<string>]


In [48]:
smv.select(
    col("*"),
    expr("Title_List[0] as first"),
    col("Title_List").getItem(1).as("second"),
    size(col("Title_List")).as("size"),
    array_contains(col("Title_List"), "Girls").as("contains")
).show()

+--------------------+--------------------+-----------+-------+----+--------+
|               Title|          Title_List|      first| second|size|contains|
+--------------------+--------------------+-----------+-------+----+--------+
|      The Land Girls|  [The, Land, Girls]|        The|   Land|   3|    true|
|First Love, Last ...|[First, Love, , L...|      First|   Love|   5|   false|
|I Married a Stran...|[I, Married, a, S...|          I|Married|   5|   false|
|Let's Talk About Sex|[Let's, Talk, Abo...|      Let's|   Talk|   4|   false|
|                Slam|              [Slam]|       Slam|   NULL|   1|   false|
| Mississippi Mermaid|[Mississippi, Mer...|Mississippi|Mermaid|   2|   false|
|           Following|         [Following]|  Following|   NULL|   1|   false|
|             Foolish|           [Foolish]|    Foolish|   NULL|   1|   false|
|             Pirates|           [Pirates]|    Pirates|   NULL|   1|   false|
|     Duel in the Sun|[Duel, in, the, Sun]|       Duel|     in| 