In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructField, StructType, StringType, LongType, DoubleType, FloatType

# Creating a SparkSession

In [2]:
# Build the session, or reuse the one already running in this kernel.
spark = (
    SparkSession.builder
    .appName("basic_app")
    .getOrCreate()
)

23/11/26 19:22:00 WARN Utils: Your hostname, luan-Dell-G15-5520 resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp0s20f3)
23/11/26 19:22:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/26 19:22:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Loading Data

In [3]:
# Read one day of retail data; the header row supplies the column names and
# the column types are inferred from the values.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../../../spark_data_examples/retail-data/by-day/2010-12-01.csv")
)
df.printSchema()
# Expose the DataFrame to the SQL examples under the name `dfTable`.
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



# Checking data head

## F.lit: converts a type in another language to its corresponding Spark representation

In [4]:
# Peek at the first three rows.
df.show(n=3)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 3 rows



# Converting to Spark Types

In [5]:
# One literal column per Python type: int, str and float.
literal_columns = [F.lit(5), F.lit("five"), F.lit(5.0)]
df.select(*literal_columns).show(n=3)

+---+----+---+
|  5|five|5.0|
+---+----+---+
|  5|five|5.0|
|  5|five|5.0|
|  5|five|5.0|
+---+----+---+
only showing top 3 rows



In [6]:
# In SQL, bare literals become typed columns without any helper function.
literals_query = """SELECT 5, "five", 5.0"""
spark.sql(literals_query).show()

+---+----+---+
|  5|five|5.0|
+---+----+---+
|  5|five|5.0|
+---+----+---+



# Working with Booleans

## Example 1

In [7]:
# Verbose form: filter with a column expression, then project two columns.
(
    df.filter(F.col("InvoiceNo") == 536365)
    .select("InvoiceNo", "Description")
    .show(n=5, truncate=False)
)

+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



In [8]:
# Cleaner form: the same filter and projection as one SQL statement.
invoice_query = "SELECT InvoiceNo, Description FROM dfTable WHERE InvoiceNo == 536365"
spark.sql(invoice_query).show(n=5, truncate=False)

+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



In [9]:
# Another alternative: bracket indexing picks the columns and the predicate
# is given as a SQL expression string.
invoice_columns = df[['InvoiceNo', 'Description']]
invoice_columns.filter("InvoiceNo = 536365").show(n=5, truncate=False)

+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



## Example 2

In [10]:
# Two Boolean column expressions; they are OR-ed together in the second filter.
priceFilter = F.col("UnitPrice") > 600
descripFilter = F.instr(df.Description, "POSTAGE") >= 1
(
    df.filter(df.StockCode.isin("DOT"))
    .filter(priceFilter | descripFilter)
    .show()
)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [11]:
# A similar (not identical) query in pure SQL: it keeps rows whose Description
# contains "POSTAGE" AND whose StockCode contains "DOT". Unlike the DataFrame
# version, there is no `UnitPrice > 600` alternative and StockCode is matched
# with LIKE instead of an exact `isin("DOT")`.
spark.sql("""SELECT * FROM dfTable WHERE (Description LIKE "%POSTAGE%") AND (StockCode LIKE "%DOT%")""").show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [12]:
# Spark SQL functions such as instr() can be called inside the query text.
dot_query = """SELECT * FROM dfTable WHERE StockCode in ("DOT") AND(UnitPrice > 600 OR
instr(Description, "POSTAGE") >= 1)"""
spark.sql(dot_query).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



## Example 3

In [13]:
# DataFrame API only: materialise the Boolean flag as a column, then filter on it.
DOTCodeFilter = F.col("StockCode") == "DOT"
priceFilter = F.col("UnitPrice") > 600
descripFilter = F.instr(F.col("Description"), "POSTAGE") >= 1
flagged = df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))
flagged.filter("isExpensive").select("unitPrice", "isExpensive").show(n=5)

+---------+-----------+
|unitPrice|isExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+



In [14]:
# SQL version: the same Boolean expression is both selected as a column and
# used as the WHERE predicate.
is_expensive_query = """SELECT UnitPrice, (StockCode = 'DOT' AND
(UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1)) as isExpensive
FROM dfTable
WHERE (StockCode = 'DOT' AND
(UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1))"""
spark.sql(is_expensive_query).show()

+---------+-----------+
|UnitPrice|isExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+



In [15]:
# Same flag written with LIKE instead of instr(). The pattern needs a wildcard
# on BOTH sides ('%POSTAGE%') to mean "contains", which is what
# instr(Description, "POSTAGE") >= 1 tests; '%POSTAGE' alone would only match
# descriptions that END with POSTAGE.
spark.sql("""SELECT UnitPrice, (StockCode = 'DOT' AND
(UnitPrice > 600 OR Description LIKE "%POSTAGE%")) as isExpensive
FROM dfTable
WHERE (StockCode = 'DOT' AND
(UnitPrice > 600 OR Description LIKE "%POSTAGE%"))""").show()

+---------+-----------+
|UnitPrice|isExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+



## Example 4

In [16]:
# The flag is written as a SQL expression string and parsed by F.expr.
expensive_flag = F.expr("NOT UnitPrice <= 250")
(
    df.withColumn("isExpensive", expensive_flag)
    .filter("isExpensive")
    .select("Description", "UnitPrice")
    .show(n=5)
)

+--------------+---------+
|   Description|UnitPrice|
+--------------+---------+
|DOTCOM POSTAGE|   569.77|
|DOTCOM POSTAGE|   607.49|
+--------------+---------+



## Note

One “gotcha” that can come up is working with null data
when creating Boolean expressions. If there is a null in your data,
you’ll need to treat things a bit differently. Here’s how you can
perform a null-safe equality test:
`df.where(F.col("Description").eqNullSafe("hello")).show()`


# Working with numbers

## Basic mathematical operations

In [17]:
# (Quantity * UnitPrice) squared, plus 5, built from column expressions.
line_total = F.col("Quantity") * F.col("UnitPrice")
fabricatedQuantity = F.pow(line_total, 2) + 5
df.select(
    F.expr("CustomerId"),
    fabricatedQuantity.alias("realQuantity"),
).show(n=2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [18]:
# The same arithmetic in SQL, using POWER().
real_quantity_query = """SELECT customerId, (POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity
FROM dfTable"""
spark.sql(real_quantity_query).show(n=2)

+----------+------------------+
|customerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



## Rounding

In [19]:
# round() rounds halves up, bround() rounds halves to the nearest even number.
# Use a numeric literal: F.lit("2.5") is a *string* column and only works
# because Spark implicitly casts it to a number before rounding.
df.select(F.round(F.lit(2.5)), F.bround(F.lit(2.5))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [20]:
# SQL counterpart of round() / bround() on the literal 2.5.
rounding_query = """SELECT round(2.5), bround(2.5)"""
spark.sql(rounding_query).show()

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|            3|             2|
+-------------+--------------+



## Creating ids

In [21]:
# Generate an id column and show the first two values.
row_id = F.monotonically_increasing_id()
df.select(row_id).show(n=2)

+-----------------------------+
|monotonically_increasing_id()|
+-----------------------------+
|                            0|
|                            1|
+-----------------------------+
only showing top 2 rows



## Correlation and [Statistics](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.html)

In [22]:
# df.stat.corr returns a plain Python value rather than a DataFrame. Print it:
# only the last expression of a cell is displayed, so without print() this
# result was computed and then silently discarded.
print(df.stat.corr("Quantity", "UnitPrice"))
# The aggregate-function form yields a one-row DataFrame instead.
df.select(F.corr("Quantity", "UnitPrice")).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



In [23]:
# Correlation computed through the SQL corr() aggregate.
corr_query = "SELECT corr(Quantity, UnitPrice) FROM dfTable"
spark.sql(corr_query).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



## Getting descriptive statistics

In [24]:
# Summary rows (count, mean, stddev, min, max) for the described columns.
summary_stats = df.describe()
summary_stats.show()

23/11/26 19:22:11 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

## Quantiles

In [25]:
colName = "UnitPrice"
quantileProbs = [0.25, 0.5, 0.75]  # the three quartiles
relError = 0.05  # relative error passed to the approximation
df.stat.approxQuantile(
    colName,
    quantileProbs,
    relError,
)  # the median came out as 2.51 in the recorded run

[1.65, 2.51, 4.21]

## Crosstab

In [26]:
# Contingency table of StockCode against Quantity, displayed via pandas.
stock_by_quantity = df.stat.crosstab("StockCode", "Quantity")
stock_by_quantity.toPandas()

Unnamed: 0,StockCode_Quantity,-1,-10,-12,-2,-24,-3,-4,-5,-6,...,60,600,64,7,70,72,8,80,9,96
0,21259,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21894,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,21452,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,22728,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,21889,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1346,22492,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1347,17164B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1348,20774,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1349,85226A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Frequent Items

In [27]:
# Frequent items for two columns; each result column holds an array of values.
frequent_columns = ["StockCode", "Quantity"]
df.stat.freqItems(frequent_columns).show()

+--------------------+--------------------+
| StockCode_freqItems|  Quantity_freqItems|
+--------------------+--------------------+
|[22086, 21705, 72...|[200, 128, 23, 50...|
+--------------------+--------------------+



## Frequent items + descriptive statistics

In [28]:
# Append a "most_frequent" row (taken from freqItems) to the describe() summary.
df_desc = df.describe()
# Every describe() column except the leading "summary" label column.
df_freq = df.stat.freqItems(df_desc.columns[1:])

# freqItems names its outputs "<column>_freqItems". Strip exactly that suffix:
# splitting on the first "_" would mangle any source column whose own name
# contains an underscore.
FREQ_SUFFIX = "_freqItems"
new_cols = [
    name[: -len(FREQ_SUFFIX)] if name.endswith(FREQ_SUFFIX) else name
    for name in df_freq.columns
]
df_freq = df_freq.toDF(*new_cols)
df_freq = df_freq.withColumn("summary", F.lit("most_frequent"))

# union() matches columns by position, so put "summary" first like describe()
# does, and cast everything to string so the array columns can be stacked
# under the summary columns. The explicit alias keeps the original names.
df_freq = df_freq.select(
    [F.col(name).cast(StringType()).alias(name) for name in ["summary"] + new_cols]
)
df_desc = df_desc.union(df_freq)

In [29]:
# Convert the combined summary to pandas so the notebook renders it as a table.
df_desc.toPandas()

Unnamed: 0,summary,InvoiceNo,StockCode,Description,Quantity,UnitPrice,CustomerID,Country
0,count,3108,3108,3098,3108,3108,1968,3108
1,mean,536516.684944841,27834.304044117645,,8.627413127413128,4.151946589446603,15661.388719512195,
2,stddev,72.89447869788873,17407.897548583845,,26.371821677029203,15.638659854603892,1854.4496996893627,
3,min,536365,10002,4 PURPLE FLOCK DINNER CANDLES,-24,0.0,12431.0,Australia
4,max,C536548,POST,ZINC WILLIE WINKIE CANDLE STICK,600,607.49,18229.0,United Kingdom
5,most_frequent,"[536385, 536544, 536551, 536596, 536437, 53657...","[22086, 21705, 72803A, 84970L, 84926A, 90181B,...","[BLUE PAISLEY POCKET BOOK, NOEL WOODEN BLOCK L...","[200, 128, 23, 50, 32, 600, 17, 8, -1, 80, -10...","[1.95, 14.95, 0.43, 0.19, 9.34, 8.5, 3.35, 2.1...","[12662.0, 12868.0, 15165.0, 14142.0, 16098.0, ...","[United Kingdom, Netherlands, Germany, Norway,..."


# Working with strings

## initcap

In [30]:
# Show the original description next to its initcap() version.
description = F.col("Description")
df.select(
    description.alias('before'),
    F.initcap(description).alias('after'),
).show(n=5)

+--------------------+--------------------+
|              before|               after|
+--------------------+--------------------+
|WHITE HANGING HEA...|White Hanging Hea...|
| WHITE METAL LANTERN| White Metal Lantern|
|CREAM CUPID HEART...|Cream Cupid Heart...|
|KNITTED UNION FLA...|Knitted Union Fla...|
|RED WOOLLY HOTTIE...|Red Woolly Hottie...|
+--------------------+--------------------+
only showing top 5 rows



In [31]:
# initcap() through SQL.
initcap_query = "SELECT Description AS before, initcap(Description) AS after FROM dfTable"
spark.sql(initcap_query).show(n=5)

+--------------------+--------------------+
|              before|               after|
+--------------------+--------------------+
|WHITE HANGING HEA...|White Hanging Hea...|
| WHITE METAL LANTERN| White Metal Lantern|
|CREAM CUPID HEART...|Cream Cupid Heart...|
|KNITTED UNION FLA...|Knitted Union Fla...|
|RED WOOLLY HOTTIE...|Red Woolly Hottie...|
+--------------------+--------------------+
only showing top 5 rows



## lower and upper

In [32]:
# Original text, its lower-cased form, and that form upper-cased again.
description = F.col("Description")
lowered = F.lower(description)
df.select(description, lowered, F.upper(lowered)).show(n=2)

+--------------------+--------------------+-------------------------+
|         Description|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
+--------------------+--------------------+-------------------------+
only showing top 2 rows



In [33]:
# lower() / upper() through SQL.
case_query = "SELECT Description, lower(Description), Upper(lower(Description)) FROM dfTable"
spark.sql(case_query).show(n=2)

+--------------------+--------------------+-------------------------+
|         Description|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
+--------------------+--------------------+-------------------------+
only showing top 2 rows



## lpad, ltrim, rpad, rtrim, trim

In [34]:
# Trimming and padding on literals. trim() is given the same padded literal as
# ltrim()/rtrim(); applied to "HELLO" (no surrounding spaces) it would show
# nothing. lpad() with a target length shorter than the string truncates it.
padded = F.lit("    HELLO    ")
df.select(
F.ltrim(padded).alias("ltrim"),
F.rtrim(padded).alias("rtrim"),
F.trim(padded).alias("trim"),
F.lpad(F.lit("HELLO"), 2, " ").alias("lp"),
F.rpad(F.lit("HELLO"), 10, " ").alias("rp")).show(2)

+---------+---------+-----+---+----------+
|    ltrim|    rtrim| trim| lp|        rp|
+---------+---------+-----+---+----------+
|HELLO    |    HELLO|HELLO| HE|HELLO     |
|HELLO    |    HELLO|HELLO| HE|HELLO     |
+---------+---------+-----+---+----------+
only showing top 2 rows



In [35]:
# SQL version of the trimming/padding demo. trim() gets the padded literal too;
# trimming 'HELLO' (no surrounding spaces) would not demonstrate anything.
spark.sql("""SELECT ltrim('    HELLO    '),
                    rtrim('    HELLO    '),
                    trim('    HELLO    '),
                    lpad('HELLO', 2, ' '),
                    rpad('HELLO', 10, ' ')
                    FROM dfTable""").show(2)

+--------------------+--------------------+-----------+-----------------+------------------+
|ltrim(    HELLO    )|rtrim(    HELLO    )|trim(HELLO)|lpad(HELLO, 2,  )|rpad(HELLO, 10,  )|
+--------------------+--------------------+-----------+-----------------+------------------+
|           HELLO    |               HELLO|      HELLO|               HE|        HELLO     |
|           HELLO    |               HELLO|      HELLO|               HE|        HELLO     |
+--------------------+--------------------+-----------+-----------------+------------------+
only showing top 2 rows



## Regex: regexp_replace

In [36]:
# Replace any of the listed colour words with the token COLOR.
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
color_clean = F.regexp_replace(F.col("Description"), regex_string, "COLOR").alias("color_clean")
df.select(color_clean, F.col("Description")).show(n=2)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



In [37]:
# regexp_replace() through SQL.
color_replace_query = """SELECT
                regexp_replace(Description, 'BLACK|WHITE|RED|GREEN|BLUE', 'COLOR') as
                color_clean, Description
            FROM dfTable"""
spark.sql(color_replace_query).show(n=2)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



## Translating

In [38]:
# Character-by-character substitution: L->1, E->3, T->7.
leet_description = F.translate(F.col("Description"), "LEET", "1337")
df.select(leet_description, F.col("Description")).show(n=2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



In [39]:
# translate() through SQL.
translate_query = "SELECT translate(Description, 'LEET', '1337'), Description FROM dfTable"
spark.sql(translate_query).show(n=2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



## More regex: regexp_extract

In [40]:
# Pull out capture group 1: the colour word matched in the description.
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
first_color = F.regexp_extract(F.col("Description"), extract_str, 1).alias("color_clean")
df.select(first_color, F.col("Description")).show(n=2)

+-----------+--------------------+
|color_clean|         Description|
+-----------+--------------------+
|      WHITE|WHITE HANGING HEA...|
|      WHITE| WHITE METAL LANTERN|
+-----------+--------------------+
only showing top 2 rows



In [41]:
# regexp_extract() through SQL.
color_extract_query = """SELECT regexp_extract(Description,'(BLACK|WHITE|RED|GREEN|BLUE)',1) AS color_clean,
                    Description
             FROM dfTable"""
spark.sql(color_extract_query).show(n=2)

+-----------+--------------------+
|color_clean|         Description|
+-----------+--------------------+
|      WHITE|WHITE HANGING HEA...|
|      WHITE| WHITE METAL LANTERN|
+-----------+--------------------+
only showing top 2 rows



## instr

In [42]:
# A description "has a simple colour" when BLACK or WHITE occurs in it.
containsBlack = F.instr(F.col("Description"), "BLACK") >= 1
containsWhite = F.instr(F.col("Description"), "WHITE") >= 1
(
    df.withColumn("hasSimpleColor", containsBlack | containsWhite)
    .filter("hasSimpleColor")
    .select("Description")
    .show(n=3, truncate=False)
)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



In [43]:
# The same BLACK-or-WHITE test in SQL.
simple_color_query = """SELECT Description FROM dfTable
WHERE instr(Description, 'BLACK') >= 1 OR instr(Description, 'WHITE') >= 1"""
spark.sql(simple_color_query).show(n=3, truncate=False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



## Other example

In [44]:
simpleColors = ["black", "white", "red", "green", "blue"]


def color_locator(column, color_string):
    """Return a Boolean column named ``is_<color_string>``.

    The upper-cased colour is searched for in ``column`` with ``F.locate``;
    the resulting position is cast to boolean and aliased.
    """
    position = F.locate(color_string.upper(), column)
    flag = position.cast("boolean")
    return flag.alias("is_" + color_string)


# One flag column per colour, followed by every original column ("*").
selectedColumns = [color_locator(df.Description, c) for c in simpleColors] + [F.expr("*")]

In [45]:
# Add the flag columns, then keep rows flagged as white or red.
with_color_flags = df.select(*selectedColumns)
(
    with_color_flags.filter(F.expr("is_white OR is_red"))
    .select("Description")
    .show(n=3, truncate=False)
)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



# Working with Dates and Timestamps

## Creating a Date DataFrame

In [46]:
# Single-row DataFrame carrying the current date and the current timestamp.
dateDF = (
    spark.range(1)
    .withColumn("today", F.current_date())
    .withColumn("now", F.current_timestamp())
)

In [47]:
# Display the single row; the values depend on when the cell is executed.
dateDF.show()

+---+----------+--------------------+
| id|     today|                 now|
+---+----------+--------------------+
|  0|2023-11-26|2023-11-26 19:22:...|
+---+----------+--------------------+



In [48]:
# Inspect the column types, then expose the frame to SQL as `dateTable`.
# (The two steps are independent of each other.)
dateDF.printSchema()
dateDF.createOrReplaceTempView("dateTable")

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



## Operations with dates - date_add and date_sub

In [49]:
# Five days before and five days after today.
today_col = F.col("today")
dateDF.select(F.date_sub(today_col, 5), F.date_add(today_col, 5)).show(n=1)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2023-11-21|        2023-12-01|
+------------------+------------------+



## Difference between dates

In [50]:
# datediff(end, start): with week_ago as the first argument the result is -7.
week_ago = F.date_sub(F.col("today"), 7)
(
    dateDF.withColumn("week_ago", week_ago)
    .select(F.datediff(F.col("week_ago"), F.col("today")))
    .show(n=1)
)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+



In [51]:
# Parse two date literals, then compute the months between them.
date_range = dateDF.select(
    F.to_date(F.lit("2016-01-01")).alias("start"),
    F.to_date(F.lit("2017-05-22")).alias("end"),
)
date_range.select(F.months_between(F.col("end"), F.col("start"))).show(n=1)

+--------------------------------+
|months_between(end, start, true)|
+--------------------------------+
|                     16.67741935|
+--------------------------------+



## Converting string to date

In [52]:
# Start from a string column and convert it with to_date().
string_dates = spark.range(1).withColumn("date", F.lit("2017-01-01"))
string_dates.select(F.to_date(F.col("date"))).show(n=1)

+-------------+
|to_date(date)|
+-------------+
|   2017-01-01|
+-------------+



## When parsing fails, Spark returns null

In [53]:
# No format is given: in the recorded run the first literal (month "20")
# came back as null while the second one parsed.
dateDF.select(
    F.to_date(F.lit("2016-20-12")).alias('wrong_date'),
    F.to_date(F.lit("2017-12-11")).alias('right_date'),
).show(n=1)

+----------+----------+
|wrong_date|right_date|
+----------+----------+
|      null|2017-12-11|
+----------+----------+



## Fixing silent date-parsing problems with to_date

In [54]:
# Coerce an explicit pattern: year-day-month.
dateFormat = "yyyy-dd-MM"
parsed_dates = [
    F.to_date(F.lit("2017-12-11"), dateFormat).alias("date"),
    F.to_date(F.lit("2017-20-12"), dateFormat).alias("date2"),
]
cleanDateDF = spark.range(1).select(*parsed_dates)
cleanDateDF.createOrReplaceTempView("dateTable2")

In [55]:
# to_date() with and without an explicit pattern, through SQL.
to_date_query = """SELECT to_date(date, 'yyyy-dd-MM') AS right_date, to_date(date2, 'yyyy-dd-MM') AS wrong_date, to_date(date) AS coerced_date
FROM dateTable2"""
spark.sql(to_date_query).show()

+----------+----------+------------+
|right_date|wrong_date|coerced_date|
+----------+----------+------------+
|2017-11-12|2017-12-20|  2017-11-12|
+----------+----------+------------+



## to_timestamp

In [56]:
# to_timestamp() with the same explicit pattern.
coerced = F.to_timestamp(F.col("date"), dateFormat).alias('coerced_format')
cleanDateDF.select(coerced).show()

+-------------------+
|     coerced_format|
+-------------------+
|2017-11-12 00:00:00|
+-------------------+



In [57]:
# to_timestamp() through SQL.
to_timestamp_query = """SELECT to_timestamp(date, 'yyyy-dd-MM') AS right_timestamp, to_timestamp(date2, 'yyyy-dd-MM') AS coerced_timestamp
FROM dateTable2"""
spark.sql(to_timestamp_query).show()

+-------------------+-------------------+
|    right_timestamp|  coerced_timestamp|
+-------------------+-------------------+
|2017-11-12 00:00:00|2017-12-20 00:00:00|
+-------------------+-------------------+



## Alternative in SQL

In [58]:
# SQL alternative: parse to a date first, then cast the date to a timestamp.
casting_query = """SELECT cast(to_date("2017-01-01", "yyyy-dd-MM") as timestamp) AS sql_casting"""
spark.sql(casting_query).show()

+-------------------+
|        sql_casting|
+-------------------+
|2017-01-01 00:00:00|
+-------------------+



## Dates comparison

In [59]:
# Compare the date column against a string literal column.
cutoff = F.lit("2017-12-12")
cleanDateDF.where(F.col("date2") > cutoff).show()

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



In [60]:
# Column-to-column date comparison in SQL.
date_compare_query = """SELECT * 
FROM dateTable2 WHERE date2>date"""
spark.sql(date_compare_query).show()

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



In [61]:
# Pitfall demo: the Python string itself contains single quotes, so the value
# compared against date2 is the text '2017-12-12' *including the quotes*.
# In the recorded run this comparison matched no rows, with no error raised.
# NOTE(review): presumably kept on purpose to illustrate implicit casting.
cleanDateDF.filter(F.col("date2") > "'2017-12-12'").show()

+----+-----+
|date|date2|
+----+-----+
+----+-----+



Warning: implicit type casting is an easy way to shoot yourself in the foot,
especially when dealing with null values or dates in different time
zones or formats. We recommend that you parse them explicitly
instead of relying on implicit conversions.

## Working with nulls in data

## Coalesce

### Basic coalesce example

In [62]:
# Small frame with nulls scattered across three columns.
rows = [(None, None, None), (1, None, 2), (None, 2, None), (None, None, 3)]
cDf = spark.createDataFrame(rows, ("a", "b", "c"))
cDf.show()

                                                                                

+----+----+----+
|   a|   b|   c|
+----+----+----+
|null|null|null|
|   1|null|   2|
|null|   2|null|
|null|null|   3|
+----+----+----+



In [63]:
# coalesce() returns, per row, the first non-null value among a, b and c.
abc_columns = [cDf[name] for name in ("a", "b", "c")]
cDf.select(F.coalesce(*abc_columns)).show()

+-----------------+
|coalesce(a, b, c)|
+-----------------+
|             null|
|                1|
|                2|
|                3|
+-----------------+



### Coalesce working like a fillna

In [64]:
def fillna(cDf, fill_dict):
    """Return ``cDf`` with every column coalesced against its fill value.

    For each name in ``cDf.columns`` the result contains
    ``coalesce(column, lit(fill_dict.get(name)))``. A name that is missing
    from ``fill_dict`` therefore gets a ``None`` literal as its fallback.
    The output columns are not aliased, so they carry the generated
    ``coalesce(...)`` names.
    """
    filled = []
    for name in cDf.columns:
        fallback = F.lit(fill_dict.get(name))
        filled.append(F.coalesce(cDf[name], fallback))
    return cDf.select(*filled)

In [65]:
fillna(cDf,{'a':20,'b':30,'c':40}).show()

+---------------+---------------+---------------+
|coalesce(a, 20)|coalesce(b, 30)|coalesce(c, 40)|
+---------------+---------------+---------------+
|             20|             30|             40|
|              1|             30|              2|
|             20|              2|             40|
|             20|             30|              3|
+---------------+---------------+---------------+



# A database example

In [66]:
# coalesce returns the first non-null argument per row. The rows displayed below
# all have a Description, so that is the value shown for each of them.
df.select(F.coalesce(F.col("Description"), F.col("CustomerId"))).show()

+---------------------------------+
|coalesce(Description, CustomerId)|
+---------------------------------+
|             WHITE HANGING HEA...|
|              WHITE METAL LANTERN|
|             CREAM CUPID HEART...|
|             KNITTED UNION FLA...|
|             RED WOOLLY HOTTIE...|
|             SET 7 BABUSHKA NE...|
|             GLASS STAR FROSTE...|
|             HAND WARMER UNION...|
|             HAND WARMER RED P...|
|             ASSORTED COLOUR B...|
|             POPPY'S PLAYHOUSE...|
|             POPPY'S PLAYHOUSE...|
|             FELTCRAFT PRINCES...|
|             IVORY KNITTED MUG...|
|             BOX OF 6 ASSORTED...|
|             BOX OF VINTAGE JI...|
|             BOX OF VINTAGE AL...|
|             HOME BUILDING BLO...|
|             LOVE BUILDING BLO...|
|             RECIPE BOX WITH M...|
+---------------------------------+
only showing top 20 rows



### ifnull and nullif

In [67]:
spark.sql("""
SELECT
ifnull(null, 'return_value') AS ifnull_example,
nullif('value', 'value') AS nullif_example
FROM dfTable LIMIT 1
""").show()

+--------------+--------------+
|ifnull_example|nullif_example|
+--------------+--------------+
|  return_value|          null|
+--------------+--------------+



### nvl and nvl2

In [68]:
# nvl returns its second argument when the first is null; nvl2 returns its second
# argument when the first is NOT null and its third otherwise.
# All string literals are single-quoted: double quotes are only read as strings
# while double-quoted identifiers are disabled.
spark.sql("""
SELECT
nvl(null, 'return_value') AS nvl_example,
nvl2('not_null', 'return_value', 'else_value') AS nvl2_example
FROM dfTable LIMIT 1
""").show()

+------------+------------+
| nvl_example|nvl2_example|
+------------+------------+
|return_value|return_value|
+------------+------------+



# drop 

## basic example

In [69]:
df_ex = spark.createDataFrame([(None,None), (None, 1),(None,2),(1,2)], ("a", "b"))
df_ex.show()

+----+----+
|   a|   b|
+----+----+
|null|null|
|null|   1|
|null|   2|
|   1|   2|
+----+----+



## SQL example

In [70]:
df_ex.createOrReplaceTempView("example")

In [71]:
# Equivalent of na.drop("any"): keep a row only when every column is non-null,
# so both a and b have to be checked.
spark.sql("SELECT * FROM example WHERE a IS NOT NULL AND b IS NOT NULL").show()

+---+---+
|  a|  b|
+---+---+
|  1|  2|
+---+---+



In [72]:
# Equivalent of na.drop("all"): a row is dropped only when every column is null,
# i.e. it is kept as soon as at least one column is non-null.
spark.sql("SELECT * FROM example WHERE (a IS NOT NULL) OR (b IS NOT NULL)").show()

+----+---+
|   a|  b|
+----+---+
|null|  1|
|null|  2|
|   1|  2|
+----+---+



In [73]:
df_ex.na.drop().show()

+---+---+
|  a|  b|
+---+---+
|  1|  2|
+---+---+



## drop: "all" vs. "any"

In [74]:
df_ex.na.drop("all").show()

+----+---+
|   a|  b|
+----+---+
|null|  1|
|null|  2|
|   1|  2|
+----+---+



In [75]:
df_ex.na.drop("any").show()

+---+---+
|  a|  b|
+---+---+
|  1|  2|
+---+---+



# fill

## before

In [76]:
df_ex.show()

+----+----+
|   a|   b|
+----+----+
|null|null|
|null|   1|
|null|   2|
|   1|   2|
+----+----+



## after

In [77]:
df_ex.na.fill(0).show()

+---+---+
|  a|  b|
+---+---+
|  0|  0|
|  0|  1|
|  0|  2|
|  1|  2|
+---+---+



## filling subset

In [78]:
df_ex = spark.createDataFrame([(None,1,None), (1, None,None),(0,None,1),(1,2,3)], ("a", "b","c"))
df_ex.show()

+----+----+----+
|   a|   b|   c|
+----+----+----+
|null|   1|null|
|   1|null|null|
|   0|null|   1|
|   1|   2|   3|
+----+----+----+



In [79]:
df_ex.na.fill(0, subset=["a","b"]).show()

+---+---+----+
|  a|  b|   c|
+---+---+----+
|  0|  1|null|
|  1|  0|null|
|  0|  0|   1|
|  1|  2|   3|
+---+---+----+



# replace

In [80]:
df_ex.na.replace(to_replace = 1, value=100, subset=["a","c"]).show()

+----+----+----+
|   a|   b|   c|
+----+----+----+
|null|   1|null|
| 100|null|null|
|   0|null| 100|
| 100|   2|   3|
+----+----+----+



# Working with complex types

In [81]:
df.selectExpr("(Description, InvoiceNo) as complex", "*").show()

+--------------------+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|             complex|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+--------------------+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|{WHITE HANGING HE...|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|{WHITE METAL LANT...|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|{CREAM CUPID HEAR...|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|{KNITTED UNION FL...|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|{RED WOOLLY HOTTI...|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     

In [82]:
spark.sql("SELECT (Description, InvoiceNo) as complex,*  FROM dfTable").show()

+--------------------+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|             complex|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+--------------------+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|{WHITE HANGING HE...|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|{WHITE METAL LANT...|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|{CREAM CUPID HEAR...|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|{KNITTED UNION FL...|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|{RED WOOLLY HOTTI...|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     

In [83]:
# Pack Description and InvoiceNo into one struct column named "complex" and
# expose the result to SQL as the "complexDF" view.
complexDF = df.select(
    F.struct("Description", "InvoiceNo").alias("complex")
)
complexDF.createOrReplaceTempView("complexDF")

## Structs

In [84]:
complexDF = df.select(F.struct("Description", "InvoiceNo").alias("complex"))

In [85]:
complexDF.createOrReplaceTempView("complexDF")

In [86]:
complexDF.select("complex.Description").show(2)

+--------------------+
|         Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
+--------------------+
only showing top 2 rows



In [87]:
complexDF.select("complex.InvoiceNo").show(2)

+---------+
|InvoiceNo|
+---------+
|   536365|
|   536365|
+---------+
only showing top 2 rows



## Using pyspark api

In [88]:
# Column-API way to read a struct field: getField("Description") on the
# "complex" column. The resulting column is named "complex.Description".
complexDF.select(F.col("complex").getField("Description")).show(2)

+--------------------+
| complex.Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
+--------------------+
only showing top 2 rows



## Selecting all fields of a complex type

In [89]:
complexDF.select("complex.*").show(2)

+--------------------+---------+
|         Description|InvoiceNo|
+--------------------+---------+
|WHITE HANGING HEA...|   536365|
| WHITE METAL LANTERN|   536365|
+--------------------+---------+
only showing top 2 rows



In [90]:
spark.sql("SELECT complex.* FROM complexDF").show(2)

+--------------------+---------+
|         Description|InvoiceNo|
+--------------------+---------+
|WHITE HANGING HEA...|   536365|
| WHITE METAL LANTERN|   536365|
+--------------------+---------+
only showing top 2 rows



In [91]:
complexDF.select("complex.*").show(2)

+--------------------+---------+
|         Description|InvoiceNo|
+--------------------+---------+
|WHITE HANGING HEA...|   536365|
| WHITE METAL LANTERN|   536365|
+--------------------+---------+
only showing top 2 rows



# Arrays

## split

In [92]:
df.select(F.split(F.col("Description"), " ")).show(2)

+-------------------------+
|split(Description,  , -1)|
+-------------------------+
|     [WHITE, HANGING, ...|
|     [WHITE, METAL, LA...|
+-------------------------+
only showing top 2 rows



In [93]:
spark.sql("SELECT split(Description, ' ') FROM dfTable").show(2)

+-------------------------+
|split(Description,  , -1)|
+-------------------------+
|     [WHITE, HANGING, ...|
|     [WHITE, METAL, LA...|
+-------------------------+
only showing top 2 rows



## Accessing indexes

In [94]:
# Split Description on spaces, then pick the first element of the resulting array.
(
    df.select(F.split(F.col("Description"), " ").alias("array_col"))
    .selectExpr("array_col[0]")
    .show(2)
)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



In [95]:
spark.sql("SELECT split(Description, ' ')[0] AS `array_col[0]` FROM dfTable").show(2)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



## Array length - size function

In [96]:
df.select(F.size(F.split(F.col("Description"), " "))).show(2)

+-------------------------------+
|size(split(Description,  , -1))|
+-------------------------------+
|                              5|
|                              3|
+-------------------------------+
only showing top 2 rows



In [97]:
# Number of space-separated tokens in Description. The delimiter is a
# single-quoted SQL string literal, like in the other split() queries.
spark.sql("SELECT size(split(Description, ' ')) FROM dfTable").show(2)

+-------------------------------+
|size(split(Description,  , -1))|
+-------------------------------+
|                              5|
|                              3|
+-------------------------------+
only showing top 2 rows



## array_contains

In [98]:
df.select(F.array_contains(F.split(F.col("Description"), " "), "WHITE")).show(2)

+------------------------------------------------+
|array_contains(split(Description,  , -1), WHITE)|
+------------------------------------------------+
|                                            true|
|                                            true|
+------------------------------------------------+
only showing top 2 rows



In [99]:
spark.sql("SELECT array_contains(split(Description, ' '), 'WHITE') FROM dfTable").show(2)

+------------------------------------------------+
|array_contains(split(Description,  , -1), WHITE)|
+------------------------------------------------+
|                                            true|
|                                            true|
+------------------------------------------------+
only showing top 2 rows



In [100]:
spark.sql("SELECT Description FROM dfTable WHERE array_contains(split(Description, ' '), 'GREY')").show(2)

+--------------------+
|         Description|
+--------------------+
|CHICK GREY HOT WA...|
|CHICK GREY HOT WA...|
+--------------------+
only showing top 2 rows



## explode

In [101]:
# explode emits one output row per element of the "splitted" array; the other
# selected columns are repeated on every emitted row.
(
    df.withColumn("splitted", F.split(F.col("Description"), " "))
    .withColumn("exploded", F.explode(F.col("splitted")))
    .select("Description", "InvoiceNo", "exploded")
    .show(2)
)

+--------------------+---------+--------+
|         Description|InvoiceNo|exploded|
+--------------------+---------+--------+
|WHITE HANGING HEA...|   536365|   WHITE|
|WHITE HANGING HEA...|   536365| HANGING|
+--------------------+---------+--------+
only showing top 2 rows



In [102]:
# SQL counterpart of explode: LATERAL VIEW emits one row per array element.
# The split delimiter is a single-quoted SQL string literal; double quotes are
# only read as strings while double-quoted identifiers are disabled.
spark.sql("""SELECT Description, InvoiceNo, exploded
FROM (SELECT *, split(Description, ' ') as splitted FROM dfTable)
LATERAL VIEW explode(splitted) as exploded""").show(2)

+--------------------+---------+--------+
|         Description|InvoiceNo|exploded|
+--------------------+---------+--------+
|WHITE HANGING HEA...|   536365|   WHITE|
|WHITE HANGING HEA...|   536365| HANGING|
+--------------------+---------+--------+
only showing top 2 rows



# Maps

In [103]:
# Build a one-entry map per row: Description -> InvoiceNo.
# Map keys cannot be null, so rows without a Description are filtered out first,
# matching the SQL version of this example (WHERE Description IS NOT NULL).
(
    df.where(F.col("Description").isNotNull())
    .select(F.create_map(F.col("Description"), F.col("InvoiceNo")).alias("complex_map"))
    .show(2)
)

+--------------------+
|         complex_map|
+--------------------+
|{WHITE HANGING HE...|
|{WHITE METAL LANT...|
+--------------------+
only showing top 2 rows



In [104]:
spark.sql("""SELECT map(Description, InvoiceNo) as complex_map FROM dfTable
WHERE Description IS NOT NULL""").show(2)

+--------------------+
|         complex_map|
+--------------------+
|{WHITE HANGING HE...|
|{WHITE METAL LANT...|
+--------------------+
only showing top 2 rows



In [105]:
# Look up one key in each row's map; rows whose map lacks the key yield null.
# Map keys cannot be null, so rows without a Description are filtered out first.
(
    df.where(F.col("Description").isNotNull())
    .select(F.create_map(F.col("Description"), F.col("InvoiceNo")).alias("complex_map"))
    .selectExpr("complex_map['WHITE METAL LANTERN']")
    .show(2)
)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
+--------------------------------+
only showing top 2 rows



## Exploding maps

In [106]:
# Exploding a map yields one row per entry, split into "key" and "value" columns.
# Map keys cannot be null, so rows without a Description are filtered out first.
(
    df.where(F.col("Description").isNotNull())
    .select(F.create_map(F.col("Description"), F.col("InvoiceNo")).alias("complex_map"))
    .selectExpr("explode(complex_map)")
    .show(2)
)

+--------------------+------+
|                 key| value|
+--------------------+------+
|WHITE HANGING HEA...|536365|
| WHITE METAL LANTERN|536365|
+--------------------+------+
only showing top 2 rows



# Working with JSON

## A basic example

In [107]:
jsonDF = spark.range(1).selectExpr("""
'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")

In [108]:
# get_json_object extracts a value by JSON path; json_tuple extracts the value
# stored under a top-level key.
jsonDF.select(
    F.get_json_object(F.col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("get_json_object"),
    F.json_tuple(F.col("jsonString"), "myJSONKey").alias("json_tuple"),
).show()

+---------------+--------------------+
|get_json_object|          json_tuple|
+---------------+--------------------+
|              2|{"myJSONValue":[1...|
+---------------+--------------------+



## SQL alternative 1

In [109]:
jsonDF.selectExpr(
                    "get_json_object(jsonString, '$.myJSONKey.myJSONValue[1]') as `get_json_object`",
                    "json_tuple(jsonString,'myJSONKey') AS json_tuple").show(2)

+---------------+--------------------+
|get_json_object|          json_tuple|
+---------------+--------------------+
|              2|{"myJSONValue":[1...|
+---------------+--------------------+



## SQL alternative 2

In [110]:
jsonDF.createOrReplaceTempView("jsonDF")
spark.sql("""SELECT get_json_object(jsonString, '$.myJSONKey.myJSONValue[1]') as `get_json_object`,
             json_tuple(jsonString,'myJSONKey') AS json_tuple FROM jsonDF""").show()

+---------------+--------------------+
|get_json_object|          json_tuple|
+---------------+--------------------+
|              2|{"myJSONValue":[1...|
+---------------+--------------------+



## Converting a struct into a JSON

### Defining struct implicitly

In [111]:
# Build a struct with the SQL tuple syntax, then serialise it to a JSON string.
(
    df.selectExpr("(InvoiceNo, Description) as myStruct")
    .select(F.to_json(F.col("myStruct")))
    .show(2)
)

+--------------------+
|   to_json(myStruct)|
+--------------------+
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
+--------------------+
only showing top 2 rows



### Defining struct explicitly

In [112]:
df.select(F.struct("InvoiceNo", "Description").alias("myStruct")).select(F.to_json(F.col("myStruct"))).show(2)

+--------------------+
|   to_json(myStruct)|
+--------------------+
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
+--------------------+
only showing top 2 rows



### In SQL

In [113]:
# As a subquery
spark.sql("""SELECT to_json (myStruct) FROM (SELECT struct(InvoiceNo,Description) AS myStruct FROM dfTable) """).show(2)

+--------------------+
|   to_json(myStruct)|
+--------------------+
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
+--------------------+
only showing top 2 rows



# from_json

In [114]:
# Schema handed to from_json: two nullable string fields.
# The fields are passed as a list, the form StructType documents for `fields`
# (a tuple cannot be extended later through StructType.add).
parseSchema = StructType([
    StructField("InvoiceNo", StringType(), True),
    StructField("Description", StringType(), True),
])

In [115]:
# Round trip: struct -> JSON string (to_json) -> struct again (from_json with
# parseSchema), shown side by side with the intermediate JSON.
(
    df.selectExpr("(InvoiceNo, Description) as myStruct")
    .select(F.to_json(F.col("myStruct")).alias("newJSON"))
    .select(F.from_json(F.col("newJSON"), parseSchema), F.col("newJSON"))
    .show(2)
)

+--------------------+--------------------+
|  from_json(newJSON)|             newJSON|
+--------------------+--------------------+
|{536365, WHITE HA...|{"InvoiceNo":"536...|
|{536365, WHITE ME...|{"InvoiceNo":"536...|
+--------------------+--------------------+
only showing top 2 rows



# User Defined Functions (UDFs)

## Defining a mock function

In [116]:
def power3(double_value):
    """Return ``double_value`` raised to the third power."""
    cubed = pow(double_value, 3)
    return cubed

In [117]:
power3(2.0)

8.0

In [118]:
udfExampleDF = spark.range(5).toDF("num")

## Registering the function

In [119]:
# Wrap the plain Python function as a Spark UDF usable in DataFrame expressions.
# NOTE(review): no return type is passed here; PySpark documents StringType as
# the default, so the result column is presumably a string. Confirm with
# printSchema().
power3udf = F.udf(power3)

In [120]:
udfExampleDF.select(power3udf(F.col("num"))).show(2)

                                                                                

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
+-----------+
only showing top 2 rows



## In SQL

In [121]:
udfExampleDF.createOrReplaceTempView('udfExampleDF')

In [122]:
spark.udf.register("power3udf", power3udf)

<function __main__.power3(double_value)>

In [123]:
# A lambda can be registered directly as a SQL function.
# It is better to write such functions in Scala/Java where possible.
spark.udf.register("lambda_power3udf", lambda x: x**3)

<function __main__.<lambda>(x)>

In [124]:
spark.sql("SELECT power3udf(num) FROM udfExampleDF").show()

+--------------+
|power3udf(num)|
+--------------+
|             0|
|             1|
|             8|
|            27|
|            64|
+--------------+



In [125]:
spark.sql("SELECT lambda_power3udf(num) FROM udfExampleDF").show()

+---------------------+
|lambda_power3udf(num)|
+---------------------+
|                    0|
|                    1|
|                    8|
|                   27|
|                   64|
+---------------------+



## Registering the function with a prescribed output type

In [126]:
spark.udf.register("power3py", power3,LongType())

<function __main__.power3(double_value)>

In [127]:
spark.sql("SELECT power3py(num) FROM udfExampleDF").show()

+-------------+
|power3py(num)|
+-------------+
|            0|
|            1|
|            8|
|           27|
|           64|
+-------------+



# Creating UDF via Hive

In [128]:
# Request a session with Hive support enabled. A session already exists at this
# point, so getOrCreate() hands it back: the warning printed below reports that
# only runtime SQL configurations take effect.
spark = (
    SparkSession.builder
    .appName("basic_app")
    .enableHiveSupport()
    .getOrCreate()
)

23/11/26 19:22:32 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


## Saving a function for further use (needs to be written in Scala or Java)

In [None]:
# Registers a temporary SQL function named myFunc backed by the given class.
# NOTE(review): the class name looks like a placeholder. Replace it with a real
# UDF class available on the classpath before running this cell.
spark.sql("CREATE TEMPORARY FUNCTION myFunc AS 'com.organization.hive.udf.FunctionName'")